From 8085678f5af19291d14ab3d48411b0ae43ca77f4 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Fri, 10 Jan 2014 00:28:52 -0800
Subject: [PATCH] PACK/UNPACK.

---
Review notes (this patch was rebuilt from a copy whose newlines had been
collapsed; indentation and column alignment of context/removed lines are
best-effort and may need --ignore-whitespace to apply):
 * ivm_intcode.cc: restored the header name on the #include line. The added
   code calls DirectX::PackedVector::XMConvertFloatToHalf/HalfToFloat, which
   are declared in DirectXPackedVector.h.
 * ivm_intcode.cc, first hunk: "// TODO(benvanik): make a compile time flag?"
   is treated as a context line so the hunk matches its @@ -21,6 +21,10 @@
   header (four added lines).
 * IntCode_PACK_SHORT_2 / IntCode_UNPACK_SHORT_2: the two int16_t temporaries
   were members of a single anonymous union and therefore shared storage;
   they are now separate locals. The packed halves are also narrowed to
   16 bits before being OR'd together. Added-line counts are unchanged, so
   hunk headers and the diffstat below still hold.
 * NOTE(review): PACK_FLOAT16_2/_4 store lane x in the high 16 bits of each
   word, while UNPACK_FLOAT16_2/_4 read lane 0 from the low 16 bits. Confirm
   which lane order is intended; it is left unchanged here.
 * NOTE(review): IntCode_PACK_FLOAT16_4 does not clear dest.ix/dest.iy, unlike
   the other PACK variants which zero their unused lanes. Confirm intended.

 src/alloy/backend/ivm/ivm_intcode.cc       | 128 ++++++++
 src/alloy/frontend/ppc/ppc_emit_altivec.cc | 331 ++++++++++-----------
 src/alloy/hir/hir_builder.cc               |  22 ++
 src/alloy/hir/hir_builder.h                |   3 +-
 src/alloy/hir/opcodes.h                    |   8 +
 src/alloy/hir/opcodes.inl                  |  12 +
 6 files changed, 321 insertions(+), 183 deletions(-)

diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc
index 04249b95e..40fae05a3 100644
--- a/src/alloy/backend/ivm/ivm_intcode.cc
+++ b/src/alloy/backend/ivm/ivm_intcode.cc
@@ -21,6 +21,10 @@ using namespace alloy::hir;
 using namespace alloy::runtime;
 
 
+// TODO(benvanik): reimplement packing functions
+#include <DirectXPackedVector.h>
+
+
 // TODO(benvanik): make a compile time flag?
 //#define DYNAMIC_REGISTER_ACCESS_CHECK(address) false
 #define DYNAMIC_REGISTER_ACCESS_CHECK(address) ((address & 0xFF000000) == 0x7F000000)
@@ -3726,6 +3730,128 @@ int Translate_SWIZZLE(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->src1.value->type]);
 }
 
+uint32_t IntCode_PACK_D3DCOLOR(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  // RGBA (XYZW) -> ARGB (WXYZ)
+  dest.ix = dest.iy = dest.iz = 0;
+  float r = roundf(((src1.x < 0) ? 0 : ((1 < src1.x) ? 1 : src1.x)) * 255);
+  float g = roundf(((src1.y < 0) ? 0 : ((1 < src1.y) ? 1 : src1.y)) * 255);
+  float b = roundf(((src1.z < 0) ? 0 : ((1 < src1.z) ? 1 : src1.z)) * 255);
+  float a = roundf(((src1.w < 0) ? 0 : ((1 < src1.w) ? 1 : src1.w)) * 255);
+  dest.iw = ((uint32_t)a << 24) |
+            ((uint32_t)r << 16) |
+            ((uint32_t)g << 8) |
+            ((uint32_t)b);
+  return IA_NEXT;
+}
+uint32_t IntCode_PACK_FLOAT16_2(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  dest.ix = dest.iy = dest.iz = 0;
+  dest.iw =
+      ((uint32_t)DirectX::PackedVector::XMConvertFloatToHalf(src1.x) << 16) |
+      DirectX::PackedVector::XMConvertFloatToHalf(src1.y);
+  return IA_NEXT;
+}
+uint32_t IntCode_PACK_FLOAT16_4(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  dest.iz =
+      ((uint32_t)DirectX::PackedVector::XMConvertFloatToHalf(src1.x) << 16) |
+      DirectX::PackedVector::XMConvertFloatToHalf(src1.y);
+  dest.iw =
+      ((uint32_t)DirectX::PackedVector::XMConvertFloatToHalf(src1.z) << 16) |
+      DirectX::PackedVector::XMConvertFloatToHalf(src1.w);
+  return IA_NEXT;
+}
+uint32_t IntCode_PACK_SHORT_2(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  // sx = 3 + (x / 1<<22)
+  // x = (sx - 3) * 1<<22
+  float sx = src1.x;
+  float sy = src1.y;
+  // dx/dy need separate storage: as members of one anonymous union the
+  // second store overwrote the first, so both halves packed the y lane.
+  int16_t dx = (int16_t)((sx - 3.0f) * (float)(1 << 22));
+  int16_t dy = (int16_t)((sy - 3.0f) * (float)(1 << 22));
+  dest.ix = dest.iy = dest.iz = 0;
+  // Narrow to uint16_t before widening so a negative dy cannot sign-extend
+  // into (and clobber) the upper half that holds dx.
+  dest.iw = ((uint32_t)(uint16_t)dx << 16) | (uint16_t)dy;
+  return IA_NEXT;
+}
+int Translate_PACK(TranslationContext& ctx, Instr* i) {
+  static IntCodeFn fns[] = {
+    IntCode_PACK_D3DCOLOR,
+    IntCode_PACK_FLOAT16_2,
+    IntCode_PACK_FLOAT16_4,
+    IntCode_PACK_SHORT_2,
+  };
+  return DispatchToC(ctx, i, fns[i->flags]);
+}
+
+uint32_t IntCode_UNPACK_D3DCOLOR(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  // ARGB (WXYZ) -> RGBA (XYZW)
+  // XMLoadColor
+  int32_t src = (int32_t)src1.iw;
+  dest.f4[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f);
+  dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f);
+  dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f);
+  dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f);
+  return IA_NEXT;
+}
+uint32_t IntCode_UNPACK_FLOAT16_2(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  uint32_t src = src1.iw;
+  for (int n = 0; n < 2; n++) {
+    dest.f4[n] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src);
+    src >>= 16;
+  }
+  dest.f4[2] = 0.0f;
+  dest.f4[3] = 1.0f;
+  return IA_NEXT;
+}
+uint32_t IntCode_UNPACK_FLOAT16_4(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  uint64_t src = src1.iz | ((uint64_t)src1.iw << 32);
+  for (int n = 0; n < 4; n++) {
+    dest.f4[n] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src);
+    src >>= 16;
+  }
+  return IA_NEXT;
+}
+uint32_t IntCode_UNPACK_SHORT_2(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  // XMLoadShortN2
+  // sx/sy need separate storage: as members of one anonymous union the
+  // second assignment overwrote the first, so both output lanes decoded
+  // the low 16 bits of the packed word.
+  // High half carries x, low half carries y (mirrors PACK_SHORT_2).
+  int16_t sx = (int16_t)(src1.iw >> 16);
+  int16_t sy = (int16_t)src1.iw;
+  dest.f4[0] = 3.0f + ((float)sx / (float)(1 << 22));
+  dest.f4[1] = 3.0f + ((float)sy / (float)(1 << 22));
+  dest.f4[2] = 0.0f;
+  dest.f4[3] = 1.0f; // 3?
+  return IA_NEXT;
+}
+int Translate_UNPACK(TranslationContext& ctx, Instr* i) {
+  static IntCodeFn fns[] = {
+    IntCode_UNPACK_D3DCOLOR,
+    IntCode_UNPACK_FLOAT16_2,
+    IntCode_UNPACK_FLOAT16_4,
+    IntCode_UNPACK_SHORT_2,
+  };
+  return DispatchToC(ctx, i, fns[i->flags]);
+}
+
 uint32_t IntCode_ATOMIC_EXCHANGE_I32(IntCodeState& ics, const IntCode* i) {
   auto address = (uint8_t*)ics.rf[i->src1_reg].u64;
   auto new_value = ics.rf[i->src2_reg].u32;
@@ -3860,6 +3986,8 @@ static const TranslateFn dispatch_table[] = {
   Translate_SPLAT,
   Translate_PERMUTE,
   Translate_SWIZZLE,
+  Translate_PACK,
+  Translate_UNPACK,
 
   TranslateInvalid, //Translate_COMPARE_EXCHANGE,
   Translate_ATOMIC_EXCHANGE,
diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
index 57ec71c7a..416005aef 100644
--- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
@@ -1128,88 +1128,6 @@ XEEMITTER(vpermwi128, VX128_P(6, 528), VX128_P)(PPCHIRBuilder& f, InstrData
   return 0;
 }
 
-XEEMITTER(vpkpx,          0x1000030E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkshss,        0x1000018E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkshss128,     VX128(5, 512),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkswss,        0x100001CE, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkswss128,     VX128(5, 640),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkswus,        0x1000014E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkswus128,     VX128(5, 704),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkuhum,        0x1000000E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkuhum128,     VX128(5, 768),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkuhus,        0x1000008E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkuhus128,     VX128(5, 832),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkshus,        0x1000010E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkshus128,     VX128(5, 576),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkuwum,        0x1000004E, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkuwum128,     VX128(5, 896),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkuwus,        0x100000CE, VX  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-XEEMITTER(vpkuwus128,     VX128(5, 960),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vpkd3d128,      VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
 int InstrEmit_vrefp_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb) {
   // (VD) <- 1/(VB)
   vec128_t one = { 1, 1, 1, 1 };
@@ -1696,6 +1614,84 @@ XEEMITTER(vsum4ubs, 0x10000608, VX )(PPCHIRBuilder& f, InstrData& i) {
   return 1;
 }
 
+XEEMITTER(vpkpx,          0x1000030E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkshss,        0x1000018E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkshss128,     VX128(5, 512),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkswss,        0x100001CE, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkswss128,     VX128(5, 640),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkswus,        0x1000014E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkswus128,     VX128(5, 704),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkuhum,        0x1000000E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkuhum128,     VX128(5, 768),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkuhus,        0x1000008E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkuhus128,     VX128(5, 832),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkshus,        0x1000010E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkshus128,     VX128(5, 576),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkuwum,        0x1000004E, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkuwum128,     VX128(5, 896),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+XEEMITTER(vpkuwus,        0x100000CE, VX  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+XEEMITTER(vpkuwus128,     VX128(5, 960),    VX128  )(PPCHIRBuilder& f, InstrData& i) {
+  XEINSTRNOTIMPLEMENTED();
+  return 1;
+}
+
+
 XEEMITTER(vupkhpx,        0x1000034E, VX  )(PPCHIRBuilder& f, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1734,31 +1730,69 @@ XEEMITTER(vupklsh, 0x100002CE, VX )(PPCHIRBuilder& f, InstrData& i) {
   return 1;
 }
 
-// __m128 half_to_float5_SSE2(__m128i h) {
-// #define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
-// #define SSE_CONST(name) *(const __m128i *)&name
-// #define SSE_CONSTF(name) *(const __m128 *)&name
-//   SSE_CONST4(mask_nosign, 0x7fff);
-//   SSE_CONST4(magic, (254 - 15) << 23);
-//   SSE_CONST4(was_infnan, 0x7bff);
-//   SSE_CONST4(exp_infnan, 255 << 23);
-//   __m128i mnosign = SSE_CONST(mask_nosign);
-//   __m128i expmant = _mm_and_si128(mnosign, h);
-//   __m128i justsign = _mm_xor_si128(h, expmant);
-//   __m128i expmant2 = expmant; // copy (just here for counting purposes)
-//   __m128i shifted = _mm_slli_epi32(expmant, 13);
-//   __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
-//   __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, SSE_CONST(was_infnan));
-//   __m128i sign = _mm_slli_epi32(justsign, 16);
-//   __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), SSE_CONSTF(exp_infnan));
-//   __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
-//   __m128 final = _mm_or_ps(scaled, sign_inf);
-//   // ~11 SSE2 ops.
-//   return final;
-// #undef SSE_CONST4
-// #undef CONST
-// #undef CONSTF
-// }
+XEEMITTER(vpkd3d128,      VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, InstrData& i) {
+  const uint32_t vd = i.VX128_4.VD128l | (i.VX128_4.VD128h << 5);
+  const uint32_t vb = i.VX128_4.VB128l | (i.VX128_4.VB128h << 5);
+  uint32_t type = i.VX128_4.IMM >> 2;
+  uint32_t shift = i.VX128_4.IMM & 0x3;
+  uint32_t pack = i.VX128_4.z;
+  Value* v = f.LoadVR(vb);
+  switch (type) {
+  case 0: // VPACK_D3DCOLOR
+    v = f.Pack(v, PACK_TYPE_D3DCOLOR);
+    break;
+  case 1: // VPACK_NORMSHORT2
+    v = f.Pack(v, PACK_TYPE_SHORT_2);
+    break;
+  case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
+    v = f.Pack(v, PACK_TYPE_FLOAT16_2);
+    break;
+  case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
+    v = f.Pack(v, PACK_TYPE_FLOAT16_4);
+    break;
+  default:
+    XEASSERTALWAYS();
+    return 1;
+  }
+  // http://hlssmod.net/he_code/public/pixelwriter.h
+  // control = prev:0123 | new:4567
+  uint32_t control = 0x00010203; // original
+  uint32_t src = _rotl(0x04050607, shift * 8);
+  uint32_t mask = 0;
+  switch (pack) {
+  case 1: // VPACK_32
+    // VPACK_32 & shift = 3 puts lower 32 bits in x (leftmost slot).
+    mask = 0x000000FF << (shift * 8);
+    control = (control & ~mask) | (src & mask);
+    break;
+  case 2: // 64bit
+    if (shift < 3) {
+      mask = 0x0000FFFF << (shift * 8);
+    } else {
+      // w
+      src = 0x00000007;
+      mask = 0x000000FF;
+    }
+    control = (control & ~mask) | (src & mask);
+    break;
+  case 3: // 64bit
+    if (shift < 3) {
+      mask = 0x0000FFFF << (shift * 8);
+    } else {
+      // z
+      src = 0x00000006;
+      mask = 0x000000FF;
+    }
+    control = (control & ~mask) | (src & mask);
+    break;
+  default:
+    XEASSERTALWAYS();
+    return 1;
+  }
+  v = f.Permute(f.LoadConstant(control), f.LoadVR(vd), v, INT32_TYPE);
+  f.StoreVR(vd, v);
+  return 0;
+}
 
 XEEMITTER(vupkd3d128,     VX128_3(6, 2032), VX128_3)(PPCHIRBuilder& f, InstrData& i) {
   // Can't find many docs on this. Best reference is
@@ -1768,86 +1802,19 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(PPCHIRBuilder& f, InstrData
   const uint32_t vd = i.VX128_3.VD128l | (i.VX128_3.VD128h << 5);
   const uint32_t vb = i.VX128_3.VB128l | (i.VX128_3.VB128h << 5);
   const uint32_t type = i.VX128_3.IMM >> 2;
-  Value* v;
+  Value* v = f.LoadVR(vb);
   switch (type) {
   case 0: // VPACK_D3DCOLOR
-    {
-      // http://hlssmod.net/he_code/public/pixelwriter.h
-      // ARGB (WXYZ) -> RGBA (XYZW)
-      // zzzzZZZZzzzzARGB
-      v = f.LoadVR(vb);
-      // 0zzzZZZZzzzzARGB
-      v = f.Insert(v, 0ull, f.LoadConstant((int8_t)0));
-      // 000R000G000B000A
-      vec128_t shuf_v = { 0 };
-      shuf_v.b16[3] = 13;
-      shuf_v.b16[7] = 14;
-      shuf_v.b16[11] = 15;
-      shuf_v.b16[15] = 12;
-      Value* shuf = f.LoadConstant(shuf_v);
-      v = f.Permute(shuf, v, v, INT8_TYPE);
-      // {256*R.0, 256*G.0, 256*B.0, 256*A.0}
-      v = f.VectorConvertI2F(v);
-      // {R.0, G.0, B.0 A.0}
-      // 1/256 = 0.00390625 = 0x3B800000
-      v = f.Mul(
-          v,
-          f.Splat(f.LoadConstant((uint32_t)0x3B800000), VEC128_TYPE));
-    }
+    v = f.Unpack(v, PACK_TYPE_D3DCOLOR);
     break;
   case 1: // VPACK_NORMSHORT2
-    {
-      // (VD.x) = 3.0 + (VB.x)*2^-22
-      // (VD.y) = 3.0 + (VB.y)*2^-22
-      // (VD.z) = 0.0
-      // (VD.w) = 1.0
-      // v = VB.x|VB.y|0|0
-      v = f.Permute(
-          f.LoadConstant(PERMUTE_XY_ZW),
-          f.LoadVR(vb),
-          f.LoadZero(VEC128_TYPE),
-          INT32_TYPE);
-      // *= 2^-22 + {3.0, 3.0, 0, 1.0}
-      vec128_t v3301 = { 3.0, 3.0, 0, 1.0 };
-      v = f.MulAdd(
-          v,
-          f.Splat(f.LoadConstant(0x34800000), VEC128_TYPE),
-          f.LoadConstant(v3301));
-    }
+    v = f.Unpack(v, PACK_TYPE_SHORT_2);
     break;
-  case 3: // VPACK_... 2 FLOAT16s
-    {
-      // (VD.x) = fixed_16_to_32(VB.x (low))
-      // (VD.y) = fixed_16_to_32(VB.x (high))
-      // (VD.z) = 0.0
-      // (VD.w) = 1.0
-      v = f.LoadZero(VEC128_TYPE);
-      f.DebugBreak();
-      // 1 bit sign, 5 bit exponent, 10 bit mantissa
-      // D3D10 half float format
-      // TODO(benvanik): fixed_16_to_32 in SSE?
-      // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
-      // Use _mm_cvtph_ps -- requires very modern processors (SSE5+)
-      // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
-      // Packing half floats: https://gist.github.com/rygorous/2156668
-      // Load source, move from tight pack of X16Y16.... to X16...Y16...
-      // Also zero out the high end.
-      //c.int3();
-      //c.movaps(vt, f.LoadVR(vb));
-      //c.save(vt);
-      //c.lea(gt, vt.m128());
-      //X86CompilerFuncCall* call = c.call(half_to_float5_SSE2);
-      //uint32_t args[] = {kX86VarTypeGpq};
-      //call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
-      //call->setArgument(0, gt);
-      //call->setReturn(v);
-      //// Select XY00.
-      //c.xorps(vt, vt);
-      //c.shufps(v, vt, imm(0x04));
-      //// {0.0, 0.0, 0.0, 1.0}
-      //c.mov(gt, imm(0x3F800000));
-      //c.pinsrd(v, gt.r32(), imm(3));
-    }
+  case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
+    v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
+    break;
+  case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
+    v = f.Unpack(v, PACK_TYPE_FLOAT16_4);
     break;
   default:
     XEASSERTALWAYS();
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index 481347635..1e937d234 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -1677,6 +1677,28 @@ Value* HIRBuilder::Swizzle(
   return i->dest;
 }
 
+Value* HIRBuilder::Pack(Value* value, uint32_t pack_flags) {
+  ASSERT_VECTOR_TYPE(value);
+  Instr* i = AppendInstr(
+      OPCODE_PACK_info, pack_flags,
+      AllocValue(VEC128_TYPE));
+  i->set_src1(value);
+  i->src2.value = i->src3.value = NULL;
+  return i->dest;
+}
+
+Value* HIRBuilder::Unpack(Value* value, uint32_t pack_flags) {
+  ASSERT_VECTOR_TYPE(value);
+  // TODO(benvanik): check if this is a constant - sometimes this is just used
+  //     to initialize registers.
+  Instr* i = AppendInstr(
+      OPCODE_UNPACK_info, pack_flags,
+      AllocValue(VEC128_TYPE));
+  i->set_src1(value);
+  i->src2.value = i->src3.value = NULL;
+  return i->dest;
+}
+
 Value* HIRBuilder::CompareExchange(
     Value* address, Value* compare_value, Value* exchange_value) {
   ASSERT_ADDRESS_TYPE(address);
diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h
index c59e4d50d..11ea2b0f6 100644
--- a/src/alloy/hir/hir_builder.h
+++ b/src/alloy/hir/hir_builder.h
@@ -194,7 +194,8 @@ public:
                  TypeName part_type);
   Value* Swizzle(Value* value, TypeName part_type, uint32_t swizzle_mask);
   // SelectBits(cond, value1, value2)
-  // pack/unpack/etc
+  Value* Pack(Value* value, uint32_t pack_flags = 0);
+  Value* Unpack(Value* value, uint32_t pack_flags = 0);
 
   Value* CompareExchange(Value* address,
                          Value* compare_value, Value* exchange_value);
diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h
index 114ee5a1c..282a9600a 100644
--- a/src/alloy/hir/opcodes.h
+++ b/src/alloy/hir/opcodes.h
@@ -66,6 +66,12 @@ enum Swizzles {
   SWIZZLE_XYZW_TO_ZWXY = SWIZZLE_MASK(2, 3, 0, 1),
   SWIZZLE_XYZW_TO_WXYZ = SWIZZLE_MASK(3, 0, 1, 2),
 };
+enum PackType {
+  PACK_TYPE_D3DCOLOR = 0,
+  PACK_TYPE_FLOAT16_2 = 1,
+  PACK_TYPE_FLOAT16_4 = 2,
+  PACK_TYPE_SHORT_2 = 3,
+};
 
 
 enum Opcode {
@@ -173,6 +179,8 @@ enum Opcode {
   OPCODE_SPLAT,
   OPCODE_PERMUTE,
   OPCODE_SWIZZLE,
+  OPCODE_PACK,
+  OPCODE_UNPACK,
 
   OPCODE_COMPARE_EXCHANGE,
   OPCODE_ATOMIC_EXCHANGE,
diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl
index 97ca4ff3b..42449ca17 100644
--- a/src/alloy/hir/opcodes.inl
+++ b/src/alloy/hir/opcodes.inl
@@ -539,6 +539,18 @@ DEFINE_OPCODE(
     OPCODE_SIG_V_V_O,
     0);
 
+DEFINE_OPCODE(
+    OPCODE_PACK,
+    "pack",
+    OPCODE_SIG_V_V,
+    0);
+
+DEFINE_OPCODE(
+    OPCODE_UNPACK,
+    "unpack",
+    OPCODE_SIG_V_V,
+    0);
+
 DEFINE_OPCODE(
     OPCODE_COMPARE_EXCHANGE,
     "compare_exchange",