PACK to 2101010.

This commit is contained in:
Ben Vanik 2015-06-07 19:44:07 -07:00
parent 98e14fcb1b
commit cd62d4e461
4 changed files with 100 additions and 4 deletions

View File

@ -5884,6 +5884,9 @@ EMITTER(PACK, MATCH(I<OPCODE_PACK, V128<>, V128<>, V128<>>)) {
case PACK_TYPE_SHORT_2:
EmitSHORT_2(e, i);
break;
case PACK_TYPE_UINT_2101010:
EmitUINT_2101010(e, i);
break;
case PACK_TYPE_8_IN_16:
Emit8_IN_16(e, i, i.instr->flags);
break;
@ -5973,6 +5976,60 @@ EMITTER(PACK, MATCH(I<OPCODE_PACK, V128<>, V128<>, V128<>>)) {
// Pack.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2));
}
// Native helper for the UINT_2101010 pack: collapses the four 32-bit lanes of
// src1 into a single 2:10:10:10 (W:Z:Y:X) word. The word is written to
// element 3 of the returned vector; elements 0-2 are zero. The leading void*
// argument is unused.
// Format reference:
// https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
static __m128i EmulatePackUINT_2101010(void*, __m128i src1) {
  // Saturation bounds, compared against the raw lane bits as signed integers.
  static const int32_t kLowestXYZ = 0x403FFE01;   // 3 - 1FF / (1 << 22)
  static const int32_t kHighestXYZ = 0x404001FF;  // 3 + 1FF / (1 << 22)
  static const int32_t kLowestW = 0x40400000;     // 3
  static const int32_t kHighestW = 0x40400003;    // 3 + 3 / (1 << 22)

  // Two views over the same 16 bytes: signed for ordering, unsigned for masks.
  union {
    alignas(16) int32_t as_signed[4];
    alignas(16) uint32_t as_bits[4];
  } lanes;
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes.as_bits), src1);

  uint32_t packed = 0;

  // X, Y, Z: 10-bit signed fields, saturated; X lands in the lowest bits.
  for (int lane = 0; lane < 3; ++lane) {
    const uint32_t raw = lanes.as_bits[lane];
    const int32_t ordered = lanes.as_signed[lane];
    // All-ones exponent with a non-zero fraction (a NaN bit pattern).
    const bool is_nan =
        (((raw >> 23) & 0xFF) == 0xFF) && ((raw & 0x007FFFFF) != 0);
    uint32_t field;
    if (is_nan) {
      field = 0x200;
    } else if (ordered > kHighestXYZ) {
      field = 0x1FF;  // INT_MAX
    } else if (ordered < kLowestXYZ) {
      field = 0x201;  // -INT_MAX
    } else {
      field = raw & 0x3FF;
    }
    packed |= (field & 0x3FF) << (10 * lane);
  }

  // W: 2-bit unsigned field, saturated; occupies the top two bits.
  const uint32_t w_raw = lanes.as_bits[3];
  const int32_t w_ordered = lanes.as_signed[3];
  const bool w_is_nan =
      (((w_raw >> 23) & 0xFF) == 0xFF) && ((w_raw & 0x007FFFFF) != 0);
  uint32_t w_field;
  if (w_is_nan || w_ordered < kLowestW) {
    // NaN patterns and anything under the lower bound both clamp to zero.
    w_field = 0x0;
  } else if (w_ordered > kHighestW) {
    w_field = 0x3;
  } else {
    w_field = w_raw & 0x3;
  }
  packed |= (w_field & 0x3) << 30;

  // Only element 3 carries data.
  alignas(16) uint32_t result[4] = {0, 0, 0, packed};
  return _mm_load_si128(reinterpret_cast<__m128i*>(result));
}
// Emits the PACK sequence for PACK_TYPE_UINT_2101010 by calling out to the
// native EmulatePackUINT_2101010 helper.
// dest = [(b2(src1.w), b10(src1.z), b10(src1.y), b10(src1.x)), 0, 0, 0]
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
  // This pack form only consumes src1; src2 must be a constant zero.
  assert_true(i.src2.value->IsConstantZero());
  // TODO(benvanik): optimized version.
  // Stash src1 in slot 0 and hand its address to the helper through r8.
  auto stashed_src1 = e.StashXmm(0, i.src1);
  e.lea(e.r8, stashed_src1);
  void* native_fn = reinterpret_cast<void*>(EmulatePackUINT_2101010);
  e.CallNativeSafe(native_fn);
  // The packed result is read back from xmm0.
  e.vmovaps(i.dest, e.xmm0);
}
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
__m128i src2) {
alignas(16) uint16_t a[8];
@ -6111,6 +6168,9 @@ EMITTER(UNPACK, MATCH(I<OPCODE_UNPACK, V128<>, V128<>>)) {
case PACK_TYPE_SHORT_2:
EmitSHORT_2(e, i);
break;
case PACK_TYPE_UINT_2101010:
EmitUINT_2101010(e, i);
break;
case PACK_TYPE_8_IN_16:
Emit8_IN_16(e, i, i.instr->flags);
break;
@ -6245,6 +6305,9 @@ EMITTER(UNPACK, MATCH(I<OPCODE_UNPACK, V128<>, V128<>>)) {
// Add 3,3,0,1.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
}
// UNPACK counterpart for PACK_TYPE_UINT_2101010. This is a stub: it emits no
// code and only raises an assertion, so the unpack direction is not yet
// supported for this pack type.
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
// NOTE(review): presumably assert_always fails unconditionally with this
// message - confirm against the assert macro definitions.
assert_always("not implemented");
}
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
assert_false(IsPackOutSaturate(flags));
if (IsPackToLo(flags)) {

View File

@ -2027,6 +2027,9 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f,
case 1: // VPACK_NORMSHORT2
v = f.Pack(v, PACK_TYPE_SHORT_2);
break;
case 2: // VPACK_... 2_10_10_10 w_z_y_x
v = f.Pack(v, PACK_TYPE_UINT_2101010);
break;
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
v = f.Pack(v, PACK_TYPE_FLOAT16_2);
break;
@ -2125,6 +2128,9 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(PPCHIRBuilder& f,
case 1: // VPACK_NORMSHORT2
v = f.Unpack(v, PACK_TYPE_SHORT_2);
break;
case 2: // VPACK_... 2_10_10_10 w_z_y_x
v = f.Unpack(v, PACK_TYPE_UINT_2101010);
break;
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
break;

View File

@ -177,6 +177,33 @@ test_vpkd3d128_short2_2:
#_ REGISTER_OUT v3 [4040FFFE, 403FF333, 42A23EC8, 403DB757]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 7FFFF333]
test_vpkd3d128_uint_2101010_0:
#_ REGISTER_IN v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
# vpkd3d128 v4, v3, 2, 1, 0
.long 0x18891E10
blr
#_ REGISTER_OUT v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, E0180601]
test_vpkd3d128_uint_2101010_1:
#_ REGISTER_IN v3 [42C80000, C2C80000, 40400000, 3F800000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
# vpkd3d128 v4, v3, 2, 1, 0
.long 0x18891E10
blr
#_ REGISTER_OUT v3 [42C80000, C2C80000, 40400000, 3F800000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 000805FF]
test_vpkd3d128_uint_2101010_2:
#_ REGISTER_IN v3 [3F000000, BF000000, 3F800000, 00000000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
# vpkd3d128 v4, v3, 2, 1, 0
.long 0x18891E10
blr
#_ REGISTER_OUT v3 [3F000000, BF000000, 3F800000, 00000000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 20180601]
test_vpkd3d128_float16_2_invalid_0:
#_ REGISTER_IN v3 [3FC00000, BFC00000, 42A23EC8, 403DB757]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
@ -185,7 +212,6 @@ test_vpkd3d128_float16_2_invalid_0:
blr
#_ REGISTER_OUT v3 [3FC00000, BFC00000, 42A23EC8, 403DB757]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3E00BE00]
test_vpkd3d128_float16_2_0:
#_ REGISTER_IN v3 [3F000000, BF000000, 00000000, 00000000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
@ -195,6 +221,7 @@ test_vpkd3d128_float16_2_0:
#_ REGISTER_OUT v3 [3F000000, BF000000, 00000000, 00000000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800]
test_vpkd3d128_float16_4_invalid_0:
#_ REGISTER_IN v3 [3FC00000, BFC00000, 3FC00000, BFC00000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
@ -203,7 +230,6 @@ test_vpkd3d128_float16_4_invalid_0:
blr
#_ REGISTER_OUT v3 [3FC00000, BFC00000, 3FC00000, BFC00000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 3E00BE00, 3E00BE00]
test_vpkd3d128_float16_4_0:
#_ REGISTER_IN v3 [3F000000, BF000000, 3F000000, BF000000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]

View File

@ -71,10 +71,11 @@ enum PackType : uint16_t {
PACK_TYPE_FLOAT16_2 = 1,
PACK_TYPE_FLOAT16_4 = 2,
PACK_TYPE_SHORT_2 = 3,
PACK_TYPE_UINT_2101010 = 4,
// Types which use the bitmasks below for configuration:
PACK_TYPE_8_IN_16 = 4,
PACK_TYPE_16_IN_32 = 5,
PACK_TYPE_8_IN_16 = 5,
PACK_TYPE_16_IN_32 = 6,
PACK_TYPE_MODE = 0x000F, // just to get the mode