diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index a41f507bf..c3c2babbb 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -5767,14 +5767,24 @@ struct VECTOR_AVERAGE case INT32_TYPE: // No 32bit averages in AVX. if (is_unsigned) { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); e.CallNativeSafe( reinterpret_cast(EmulateVectorAverageUnsignedI32)); e.vmovaps(i.dest, e.xmm0); } else { + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); e.CallNativeSafe( reinterpret_cast(EmulateVectorAverageSignedI32)); e.vmovaps(i.dest, e.xmm0); @@ -6576,6 +6586,18 @@ struct PACK : Sequence> { } return _mm_load_si128(reinterpret_cast<__m128i*>(c)); } + static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t a[16]; + alignas(16) uint8_t b[16]; + alignas(16) uint8_t c[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (int i = 0; i < 8; ++i) { + c[i] = a[i * 2]; + c[i + 8] = b[i * 2]; + } + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { // TODO(benvanik): handle src2 (or src1) being constant zero if (IsPackInUnsigned(flags)) { @@ -6595,7 +6617,11 @@ struct PACK : Sequence> { e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); } else { // unsigned -> unsigned - assert_always(); + e.lea(e.r9, e.StashXmm(1, i.src2)); + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulatePack8_IN_16_UN_UN)); + e.vmovaps(i.dest, e.xmm0); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); } } else { if (IsPackOutSaturate(flags)) { @@ -6630,32 +6656,51 @@ struct PACK : Sequence> { } } } + static __m128i EmulatePack16_IN_32_UN_UN_SAT(void*, __m128i src1, + __m128i src2) { + alignas(16) uint32_t a[4]; + alignas(16) uint32_t b[4]; + alignas(16) uint16_t c[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (int i = 0; i < 4; ++i) { + c[i] = uint16_t(std::min(65535u, a[i])); + c[i + 4] = uint16_t(std::min(65535u, b[i])); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(c)); + } static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, uint32_t flags) { // TODO(benvanik): handle src2 (or src1) being constant zero if (IsPackInUnsigned(flags)) { if (IsPackOutUnsigned(flags)) { if (IsPackOutSaturate(flags)) { - // TODO(gibbed): check if this is actually correct, it's a duplicate - // of the signed -> unsigned + saturate code, but seems to work. // unsigned -> unsigned + saturate - // PACKUSDW - // TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0]; - // DEST[15:0] <- (DEST[31:0] > FFFFH) ? FFFFH : TMP[15:0]; Xmm src2; - if (!i.src2.is_constant) { - src2 = i.src2; - } else { - assert_false(i.src1 == e.xmm0); + if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - src2 = e.xmm0; + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); } - e.vpackusdw(i.dest, i.src1, src2); - e.vpshuflw(i.dest, i.dest, 0b10110001); - e.vpshufhw(i.dest, i.dest, 0b10110001); + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulatePack16_IN_32_UN_UN_SAT)); + e.vmovaps(i.dest, e.xmm0); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); } else { // unsigned -> unsigned - assert_always(); + e.vmovaps(e.xmm0, i.src1); + e.vpshuflw(e.xmm0, e.xmm0, 0b00100010); + e.vpshufhw(e.xmm0, e.xmm0, 0b00100010); + e.vpshufd(e.xmm0, e.xmm0, 0b00001000); + + e.vmovaps(i.dest, i.src2); + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + + e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111); } } else { if (IsPackOutSaturate(flags)) { @@ -6795,7 +6840,15 @@ struct UNPACK : Sequence> { e.vpshufd(i.dest, i.dest, B10100100); e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); } else { - e.lea(e.r8, e.StashXmm(0, i.src1)); + Xmm src; + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + src = e.xmm0; + } else { + src = i.src1; + } + + e.lea(e.r8, e.StashXmm(0, src)); e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); e.vmovaps(i.dest, e.xmm0); } diff --git a/src/xenia/cpu/frontend/testing/instr_vpkuhum.s b/src/xenia/cpu/frontend/testing/instr_vpkuhum.s index de1aa6450..3eebaef8c 100644 --- a/src/xenia/cpu/frontend/testing/instr_vpkuhum.s +++ b/src/xenia/cpu/frontend/testing/instr_vpkuhum.s @@ -1,39 +1,38 @@ -#vpkuhum isn't implemented yet -#test_vpkuhum_1: -# # {0, 1, 2, 3, 4, 5, 6, 7} -# #_ REGISTER_IN v3 [00000001, 00020003, 00040005, 00060007] -# # {8, 9, 10, 11, 12, 13, 14, 15} -# #_ REGISTER_IN v4 [00080009, 000A000B, 000C000D, 000E000F] -# vpkuhum v5, v3, v4 -# blr -# #_ REGISTER_OUT v3 [00000001, 00020003, 00040005, 00060007] -# #_ REGISTER_OUT v4 [00080009, 000A000B, 000C000D, 000E000F] -# # {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} -# #_ REGISTER_OUT v5 [00010203, 04050607, 08090A0B, 0C0D0E0F] -# blr +test_vpkuhum_1: + # {0, 1, 2, 3, 4, 5, 6, 7} + #_ REGISTER_IN v3 [00000001, 00020003, 00040005, 00060007] + # {8, 9, 10, 11, 12, 13, 14, 15} + #_ REGISTER_IN v4 [00080009, 000A000B, 000C000D, 000E000F] + vpkuhum v5, v3, v4 + blr + #_ REGISTER_OUT v3 [00000001, 00020003, 00040005, 00060007] + #_ REGISTER_OUT v4 [00080009, 000A000B, 000C000D, 000E000F] + # {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} + #_ REGISTER_OUT v5 [00010203, 04050607, 08090A0B, 0C0D0E0F] + blr -#test_vpkuhum_2: -# # {-8, -7, -6, -5, -4, -3, -2, -1} -# #_ REGISTER_IN v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF] -# # {0, 1, 2, 3, 4, 5, 6, 7} -# #_ REGISTER_IN v4 [00000001, 00020003, 00040005, 00060007] -# vpkuhum v5, v3, v4 -# blr -# #_ REGISTER_OUT v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF] -# #_ REGISTER_OUT v4 [00000001, 00020003, 00040005, 00060007] -# # {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7} -# #_ REGISTER_OUT v5 [F8F9FAFB, FCFDFEFF, 00010203, 04050607] -# blr +test_vpkuhum_2: + # {-8, -7, -6, -5, -4, -3, -2, -1} + #_ REGISTER_IN v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF] + # {0, 1, 2, 3, 4, 5, 6, 7} + #_ REGISTER_IN v4 [00000001, 00020003, 00040005, 00060007] + vpkuhum v5, v3, v4 + blr + #_ REGISTER_OUT v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF] + #_ REGISTER_OUT v4 [00000001, 00020003, 00040005, 00060007] + # {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7} + #_ REGISTER_OUT v5 [F8F9FAFB, FCFDFEFF, 00010203, 04050607] + blr -#test_vpkuhum_3: -# # {0, 65535, 65535, 0, 0, 0, 65535, 0} -# #_ REGISTER_IN v3 [0000FFFF, FFFF0000, 00000000, FFFF0000] -# # {65535, 0, 0, 65535, 65535, 65535, 0, 65535} -# #_ REGISTER_IN v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF] -# vpkuhum v5, v3, v4 -# blr -# #_ REGISTER_OUT v3 [0000FFFF, FFFF0000, 00000000, FFFF0000] -# #_ REGISTER_OUT v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF] -# # {0, 255, 255, 0, 0, 0, 255, 0, 255, 0, 0, 255, 255, 255, 0, 255} -# #_ REGISTER_OUT v5 [00FFFF00, 0000FF00, FF0000FF, FFFF00FF] -# blr +test_vpkuhum_3: + # {0, 65535, 65535, 0, 0, 0, 65535, 0} + #_ REGISTER_IN v3 [0000FFFF, FFFF0000, 00000000, FFFF0000] + # {65535, 0, 0, 65535, 65535, 65535, 0, 65535} + #_ REGISTER_IN v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF] + vpkuhum v5, v3, v4 + blr + #_ REGISTER_OUT v3 [0000FFFF, FFFF0000, 00000000, FFFF0000] + #_ REGISTER_OUT v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF] + # {0, 255, 255, 0, 0, 0, 255, 0, 255, 0, 0, 255, 255, 255, 0, 255} + #_ REGISTER_OUT v5 [00FFFF00, 0000FF00, FF0000FF, FFFF00FF] + blr diff --git a/src/xenia/cpu/frontend/testing/instr_vpkuwum.s b/src/xenia/cpu/frontend/testing/instr_vpkuwum.s index 28841cfe0..66116f7a9 100644 --- a/src/xenia/cpu/frontend/testing/instr_vpkuwum.s +++ b/src/xenia/cpu/frontend/testing/instr_vpkuwum.s @@ -1,36 +1,35 @@ -#vpkuwum isn't implemented yet -#test_vpkuwum_1: -# # {0, 1, 2, 3} -# #_ REGISTER_IN v3 [00000000, 00000001, 00000002, 00000003] -# # {4, 5, 6, 7} -# #_ REGISTER_IN v4 [00000004, 00000005, 00000006, 00000007] -# vpkuwum v5, v3, v4 -# blr -# #_ REGISTER_OUT v3 [00000000, 00000001, 00000002, 00000003] -# #_ REGISTER_OUT v4 [00000004, 00000005, 00000006, 00000007] -# # {0, 1, 2, 3, 4, 5, 6, 7} -# #_ REGISTER_OUT v5 [00000001, 00020003, 00040005, 00060007] +test_vpkuwum_1: + # {0, 1, 2, 3} + #_ REGISTER_IN v3 [00000000, 00000001, 00000002, 00000003] + # {4, 5, 6, 7} + #_ REGISTER_IN v4 [00000004, 00000005, 00000006, 00000007] + vpkuwum v5, v3, v4 + blr + #_ REGISTER_OUT v3 [00000000, 00000001, 00000002, 00000003] + #_ REGISTER_OUT v4 [00000004, 00000005, 00000006, 00000007] + # {0, 1, 2, 3, 4, 5, 6, 7} + #_ REGISTER_OUT v5 [00000001, 00020003, 00040005, 00060007] -#test_vpkuwum_2: -# # {-4, -3, -2, -1} -# #_ REGISTER_IN v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF] -# # {0, 1, 2, 3} -# #_ REGISTER_IN v4 [00000000, 00000001, 00000002, 00000003] -# vpkuwum v5, v3, v4 -# blr -# #_ REGISTER_OUT v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF] -# #_ REGISTER_OUT v4 [00000000, 00000001, 00000002, 00000003] -# # {-4, -3, -2, -1, 0, 1, 2, 3} -# #_ REGISTER_OUT v5 [FFFCFFFD, FFFEFFFF, 00000001, 00020003] +test_vpkuwum_2: + # {-4, -3, -2, -1} + #_ REGISTER_IN v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF] + # {0, 1, 2, 3} + #_ REGISTER_IN v4 [00000000, 00000001, 00000002, 00000003] + vpkuwum v5, v3, v4 + blr + #_ REGISTER_OUT v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF] + #_ REGISTER_OUT v4 [00000000, 00000001, 00000002, 00000003] + # {-4, -3, -2, -1, 0, 1, 2, 3} + #_ REGISTER_OUT v5 [FFFCFFFD, FFFEFFFF, 00000001, 00020003] -#test_vpkuwum_3: -# # {0, 4294967295, 4294967295, 4294967295} -# #_ REGISTER_IN v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF] -# # {4294967295, 0, 0, 0} -# #_ REGISTER_IN v4 [FFFFFFFF, 00000000, 00000000, 00000000] -# vpkuwum v5, v3, v4 -# blr -# #_ REGISTER_OUT v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF] -# #_ REGISTER_OUT v4 [FFFFFFFF, 00000000, 00000000, 00000000] -# # {0, 65535, 65535, 65535, 65535, 0, 0, 0} -# #_ REGISTER_OUT v5 [0000FFFF, FFFFFFFF, FFFF0000, 00000000] +test_vpkuwum_3: + # {0, 4294967295, 4294967295, 4294967295} + #_ REGISTER_IN v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF] + # {4294967295, 0, 0, 0} + #_ REGISTER_IN v4 [FFFFFFFF, 00000000, 00000000, 00000000] + vpkuwum v5, v3, v4 + blr + #_ REGISTER_OUT v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF] + #_ REGISTER_OUT v4 [FFFFFFFF, 00000000, 00000000, 00000000] + # {0, 65535, 65535, 65535, 65535, 0, 0, 0} + #_ REGISTER_OUT v5 [0000FFFF, FFFFFFFF, FFFF0000, 00000000] diff --git a/src/xenia/cpu/frontend/testing/instr_vpkuwus.s b/src/xenia/cpu/frontend/testing/instr_vpkuwus.s index ee1cfa37c..7f117c1eb 100644 --- a/src/xenia/cpu/frontend/testing/instr_vpkuwus.s +++ b/src/xenia/cpu/frontend/testing/instr_vpkuwus.s @@ -9,3 +9,15 @@ test_vpkuwus_1: #_ REGISTER_OUT v4 [00000002, 00010002, 00000003, 00010003] # {0, 65535, 1, 65535, 2, 65535, 3, 65535} #_ REGISTER_OUT v5 [0000FFFF, 0001FFFF, 0002FFFF, 0003FFFF] + +test_vpkuwus_2: + # {2147483648, 2147483647, 2, 3} + #_ REGISTER_IN v3 [80000000, 7FFFFFFF, 00000002, 00000003] + # {4294967295, 65538, 4294967294, 16} + #_ REGISTER_IN v4 [FFFFFFFF, 00010002, FFFFFFFE, 00000010] + vpkuwus v5, v3, v4 + blr + #_ REGISTER_OUT v3 [80000000, 7FFFFFFF, 00000002, 00000003] + #_ REGISTER_OUT v4 [FFFFFFFF, 00010002, FFFFFFFE, 00000010] + # {65535, 65535, 2, 3, 65535, 65535, 65535, 16} + #_ REGISTER_OUT v5 [FFFFFFFF, 00020003, FFFFFFFF, FFFF0010]