From cd62d4e46172d7fed4604ed9deedcb1acf4c4b21 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 7 Jun 2015 19:44:07 -0700 Subject: [PATCH] PACK to 2101010. --- src/xenia/cpu/backend/x64/x64_sequences.cc | 63 +++++++++++++++++++ src/xenia/cpu/frontend/ppc_emit_altivec.cc | 6 ++ src/xenia/cpu/frontend/test/instr_vpkd3d128.s | 30 ++++++++- src/xenia/cpu/hir/opcodes.h | 5 +- 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 0eb943d4b..489af879d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -5884,6 +5884,9 @@ EMITTER(PACK, MATCH(I<OPCODE_PACK, V128<>, V128<>, V128<>>)) { case PACK_TYPE_SHORT_2: EmitSHORT_2(e, i); break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; case PACK_TYPE_8_IN_16: Emit8_IN_16(e, i, i.instr->flags); break; @@ -5973,6 +5976,60 @@ EMITTER(PACK, MATCH(I<OPCODE_PACK, V128<>, V128<>, V128<>>)) { // Pack. e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); } + static __m128i EmulatePackUINT_2101010(void*, __m128i src1) { + // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt + union { + alignas(16) int32_t a_i[4]; + alignas(16) uint32_t a_u[4]; + alignas(16) float a_f[4]; + }; + alignas(16) uint32_t b[4]; + alignas(16) uint32_t c[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a_u), src1); + // XYZ are 10 bits, signed and saturated. + for (int i = 0; i < 3; ++i) { + static const int32_t kMinValueXYZ = 0x403FFE01; // 3 - 1FF / (1 << 22) + static const int32_t kMaxValueXYZ = 0x404001FF; // 3 + 1FF / (1 << 22) + uint32_t exponent = (a_u[i] >> 23) & 0xFF; + uint32_t fractional = a_u[i] & 0x007FFFFF; + if ((exponent == 0xFF) && fractional) { + b[i] = 0x200; + } else if (a_i[i] > kMaxValueXYZ) { + b[i] = 0x1FF; // INT_MAX + } else if (a_i[i] < kMinValueXYZ) { + b[i] = 0x201; // -INT_MAX + } else { + b[i] = a_u[i] & 0x3FF; + } + } + // W is 2 bits, unsigned and saturated. 
+ static const int32_t kMinValueW = 0x40400000; // 3 + static const int32_t kMaxValueW = 0x40400003; // 3 + 3 / (1 << 22) + uint32_t w_exponent = (a_u[3] >> 23) & 0xFF; + uint32_t w_fractional = a_u[3] & 0x007FFFFF; + if ((w_exponent == 0xFF) && w_fractional) { + b[3] = 0x0; + } else if (a_i[3] > kMaxValueW) { + b[3] = 0x3; + } else if (a_i[3] < kMinValueW) { + b[3] = 0x0; + } else { + b[3] = a_u[3] & 0x3; + } + // Combine in 2101010 WZYX. + c[0] = c[1] = c[2] = 0; + c[3] = ((b[3] & 0x3) << 30) | ((b[2] & 0x3FF) << 20) | + ((b[1] & 0x3FF) << 10) | ((b[0] & 0x3FF)); + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } + static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // dest = [(b2(src1.w), b10(src1.z), b10(src1.y), b10(src1.x)), 0, 0, 0] + // TODO(benvanik): optimized version. + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast<void*>(EmulatePackUINT_2101010)); + e.vmovaps(i.dest, e.xmm0); + } static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, __m128i src2) { alignas(16) uint16_t a[8]; @@ -6111,6 +6168,9 @@ EMITTER(UNPACK, MATCH(I<OPCODE_UNPACK, V128<>, V128<>>)) { case PACK_TYPE_SHORT_2: EmitSHORT_2(e, i); break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; case PACK_TYPE_8_IN_16: Emit8_IN_16(e, i, i.instr->flags); break; @@ -6245,6 +6305,9 @@ EMITTER(UNPACK, MATCH(I<OPCODE_UNPACK, V128<>, V128<>>)) { // Add 3,3,0,1. 
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); } + static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { + assert_always("not implemented"); + } static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { assert_false(IsPackOutSaturate(flags)); if (IsPackToLo(flags)) { diff --git a/src/xenia/cpu/frontend/ppc_emit_altivec.cc b/src/xenia/cpu/frontend/ppc_emit_altivec.cc index c309975df..f5c6fef73 100644 --- a/src/xenia/cpu/frontend/ppc_emit_altivec.cc +++ b/src/xenia/cpu/frontend/ppc_emit_altivec.cc @@ -2027,6 +2027,9 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, case 1: // VPACK_NORMSHORT2 v = f.Pack(v, PACK_TYPE_SHORT_2); break; + case 2: // VPACK_... 2_10_10_10 w_z_y_x + v = f.Pack(v, PACK_TYPE_UINT_2101010); + break; case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT v = f.Pack(v, PACK_TYPE_FLOAT16_2); break; @@ -2125,6 +2128,9 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(PPCHIRBuilder& f, case 1: // VPACK_NORMSHORT2 v = f.Unpack(v, PACK_TYPE_SHORT_2); break; + case 2: // VPACK_... 2_10_10_10 w_z_y_x + v = f.Unpack(v, PACK_TYPE_UINT_2101010); + break; case 3: // VPACK_... 
2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_2); break; diff --git a/src/xenia/cpu/frontend/test/instr_vpkd3d128.s b/src/xenia/cpu/frontend/test/instr_vpkd3d128.s index babff5853..9268b2a8b 100644 --- a/src/xenia/cpu/frontend/test/instr_vpkd3d128.s +++ b/src/xenia/cpu/frontend/test/instr_vpkd3d128.s @@ -177,6 +177,33 @@ test_vpkd3d128_short2_2: #_ REGISTER_OUT v3 [4040FFFE, 403FF333, 42A23EC8, 403DB757] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 7FFFF333] + +test_vpkd3d128_uint_2101010_0: + #_ REGISTER_IN v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A] + #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] + # vpkd3d128 v4, v3, 2, 1, 0 + .long 0x18891E10 + blr + #_ REGISTER_OUT v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A] + #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, E0180601] +test_vpkd3d128_uint_2101010_1: + #_ REGISTER_IN v3 [42C80000, C2C80000, 40400000, 3F800000] + #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] + # vpkd3d128 v4, v3, 2, 1, 0 + .long 0x18891E10 + blr + #_ REGISTER_OUT v3 [42C80000, C2C80000, 40400000, 3F800000] + #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 000805FF] +test_vpkd3d128_uint_2101010_2: + #_ REGISTER_IN v3 [3F000000, BF000000, 3F800000, 00000000] + #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] + # vpkd3d128 v4, v3, 2, 1, 0 + .long 0x18891E10 + blr + #_ REGISTER_OUT v3 [3F000000, BF000000, 3F800000, 00000000] + #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 20180601] + + test_vpkd3d128_float16_2_invalid_0: #_ REGISTER_IN v3 [3FC00000, BFC00000, 42A23EC8, 403DB757] #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] @@ -185,7 +212,6 @@ test_vpkd3d128_float16_2_invalid_0: blr #_ REGISTER_OUT v3 [3FC00000, BFC00000, 42A23EC8, 403DB757] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3E00BE00] - test_vpkd3d128_float16_2_0: #_ REGISTER_IN v3 [3F000000, BF000000, 00000000, 00000000] #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] @@ -195,6 +221,7 @@ 
test_vpkd3d128_float16_2_0: #_ REGISTER_OUT v3 [3F000000, BF000000, 00000000, 00000000] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800] + test_vpkd3d128_float16_4_invalid_0: #_ REGISTER_IN v3 [3FC00000, BFC00000, 3FC00000, BFC00000] #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] @@ -203,7 +230,6 @@ test_vpkd3d128_float16_4_invalid_0: blr #_ REGISTER_OUT v3 [3FC00000, BFC00000, 3FC00000, BFC00000] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 3E00BE00, 3E00BE00] - test_vpkd3d128_float16_4_0: #_ REGISTER_IN v3 [3F000000, BF000000, 3F000000, BF000000] #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 5af5fa7ac..6ff5f8a36 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -71,10 +71,11 @@ enum PackType : uint16_t { PACK_TYPE_FLOAT16_2 = 1, PACK_TYPE_FLOAT16_4 = 2, PACK_TYPE_SHORT_2 = 3, + PACK_TYPE_UINT_2101010 = 4, // Types which use the bitmasks below for configuration: - PACK_TYPE_8_IN_16 = 4, - PACK_TYPE_16_IN_32 = 5, + PACK_TYPE_8_IN_16 = 5, + PACK_TYPE_16_IN_32 = 6, PACK_TYPE_MODE = 0x000F, // just to get the mode