Merge pull request #452 from DrChat/vpk_instrs
Fix vpkuwus / Implemented vpkuwum/vpkuhum
This commit is contained in:
commit
1ffd25c91b
|
@ -5767,14 +5767,24 @@ struct VECTOR_AVERAGE
|
||||||
case INT32_TYPE:
|
case INT32_TYPE:
|
||||||
// No 32bit averages in AVX.
|
// No 32bit averages in AVX.
|
||||||
if (is_unsigned) {
|
if (is_unsigned) {
|
||||||
e.lea(e.r8, e.StashXmm(0, i.src1));
|
if (i.src2.is_constant) {
|
||||||
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
|
e.lea(e.r9, e.StashXmm(1, e.xmm0));
|
||||||
|
} else {
|
||||||
e.lea(e.r9, e.StashXmm(1, i.src2));
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
|
}
|
||||||
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
e.CallNativeSafe(
|
e.CallNativeSafe(
|
||||||
reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
|
reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
|
||||||
e.vmovaps(i.dest, e.xmm0);
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
} else {
|
} else {
|
||||||
e.lea(e.r8, e.StashXmm(0, i.src1));
|
if (i.src2.is_constant) {
|
||||||
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
|
e.lea(e.r9, e.StashXmm(1, e.xmm0));
|
||||||
|
} else {
|
||||||
e.lea(e.r9, e.StashXmm(1, i.src2));
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
|
}
|
||||||
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
e.CallNativeSafe(
|
e.CallNativeSafe(
|
||||||
reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
|
reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
|
||||||
e.vmovaps(i.dest, e.xmm0);
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
|
@ -6576,6 +6586,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
}
|
}
|
||||||
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
|
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
|
||||||
}
|
}
|
||||||
|
// Software fallback for the unsigned -> unsigned, non-saturating pack of
// 16-bit lanes into 8-bit lanes.
//
// Both operands are spilled to 16-byte aligned byte arrays. Every
// even-indexed byte of src1 (bytes 0, 2, ..., 14) is copied into result
// bytes 0..7 and every even-indexed byte of src2 into result bytes 8..15;
// the odd-indexed bytes are discarded, so no clamping takes place.
//
// The leading void* argument is unused by this helper.
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) {
  alignas(16) uint8_t lhs[16];
  alignas(16) uint8_t rhs[16];
  alignas(16) uint8_t packed[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(lhs), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(rhs), src2);

  // Walk both inputs two bytes at a time, filling the two result halves in
  // parallel.
  uint8_t* lower_half = packed;
  uint8_t* upper_half = packed + 8;
  for (int n = 0; n != 16; n += 2) {
    *lower_half++ = lhs[n];
    *upper_half++ = rhs[n];
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(packed));
}
|
||||||
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
|
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
|
||||||
// TODO(benvanik): handle src2 (or src1) being constant zero
|
// TODO(benvanik): handle src2 (or src1) being constant zero
|
||||||
if (IsPackInUnsigned(flags)) {
|
if (IsPackInUnsigned(flags)) {
|
||||||
|
@ -6595,7 +6617,11 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
||||||
} else {
|
} else {
|
||||||
// unsigned -> unsigned
|
// unsigned -> unsigned
|
||||||
assert_always();
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
|
e.CallNativeSafe(reinterpret_cast<void*>(EmulatePack8_IN_16_UN_UN));
|
||||||
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
|
@ -6630,32 +6656,51 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Software fallback for the unsigned -> unsigned saturating pack of 32-bit
// lanes into 16-bit lanes.
//
// Each 32-bit element is treated as unsigned and clamped to 65535 before
// being narrowed. The four elements of src1 fill result lanes 0..3 and the
// four elements of src2 fill result lanes 4..7.
//
// The leading void* argument is unused by this helper.
static __m128i EmulatePack16_IN_32_UN_UN_SAT(void*, __m128i src1,
                                             __m128i src2) {
  alignas(16) uint32_t lhs[4];
  alignas(16) uint32_t rhs[4];
  alignas(16) uint16_t packed[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(lhs), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(rhs), src2);

  // Unsigned clamp to the largest value a 16-bit lane can hold.
  const auto saturate = [](uint32_t value) -> uint16_t {
    return uint16_t(value < 65535u ? value : 65535u);
  };

  for (int n = 0; n < 4; ++n) {
    packed[n] = saturate(lhs[n]);
    packed[n + 4] = saturate(rhs[n]);
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(packed));
}
|
||||||
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
|
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
|
||||||
uint32_t flags) {
|
uint32_t flags) {
|
||||||
// TODO(benvanik): handle src2 (or src1) being constant zero
|
// TODO(benvanik): handle src2 (or src1) being constant zero
|
||||||
if (IsPackInUnsigned(flags)) {
|
if (IsPackInUnsigned(flags)) {
|
||||||
if (IsPackOutUnsigned(flags)) {
|
if (IsPackOutUnsigned(flags)) {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
// TODO(gibbed): check if this is actually correct, it's a duplicate
|
|
||||||
// of the signed -> unsigned + saturate code, but seems to work.
|
|
||||||
// unsigned -> unsigned + saturate
|
// unsigned -> unsigned + saturate
|
||||||
// PACKUSDW
|
|
||||||
// TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0];
|
|
||||||
// DEST[15:0] <- (DEST[31:0] > FFFFH) ? FFFFH : TMP[15:0];
|
|
||||||
Xmm src2;
|
Xmm src2;
|
||||||
if (!i.src2.is_constant) {
|
if (i.src2.is_constant) {
|
||||||
src2 = i.src2;
|
|
||||||
} else {
|
|
||||||
assert_false(i.src1 == e.xmm0);
|
|
||||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
src2 = e.xmm0;
|
e.lea(e.r9, e.StashXmm(1, e.xmm0));
|
||||||
|
} else {
|
||||||
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
}
|
}
|
||||||
e.vpackusdw(i.dest, i.src1, src2);
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
e.vpshuflw(i.dest, i.dest, 0b10110001);
|
e.CallNativeSafe(
|
||||||
e.vpshufhw(i.dest, i.dest, 0b10110001);
|
reinterpret_cast<void*>(EmulatePack16_IN_32_UN_UN_SAT));
|
||||||
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
||||||
} else {
|
} else {
|
||||||
// unsigned -> unsigned
|
// unsigned -> unsigned
|
||||||
assert_always();
|
e.vmovaps(e.xmm0, i.src1);
|
||||||
|
e.vpshuflw(e.xmm0, e.xmm0, 0b00100010);
|
||||||
|
e.vpshufhw(e.xmm0, e.xmm0, 0b00100010);
|
||||||
|
e.vpshufd(e.xmm0, e.xmm0, 0b00001000);
|
||||||
|
|
||||||
|
e.vmovaps(i.dest, i.src2);
|
||||||
|
e.vpshuflw(i.dest, i.dest, 0b00100010);
|
||||||
|
e.vpshufhw(i.dest, i.dest, 0b00100010);
|
||||||
|
e.vpshufd(i.dest, i.dest, 0b10000000);
|
||||||
|
|
||||||
|
e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
|
@ -6795,7 +6840,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||||
e.vpshufd(i.dest, i.dest, B10100100);
|
e.vpshufd(i.dest, i.dest, B10100100);
|
||||||
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
|
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
|
||||||
} else {
|
} else {
|
||||||
e.lea(e.r8, e.StashXmm(0, i.src1));
|
Xmm src;
|
||||||
|
if (i.src1.is_constant) {
|
||||||
|
e.LoadConstantXmm(e.xmm0, i.src1.constant());
|
||||||
|
src = e.xmm0;
|
||||||
|
} else {
|
||||||
|
src = i.src1;
|
||||||
|
}
|
||||||
|
|
||||||
|
e.lea(e.r8, e.StashXmm(0, src));
|
||||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||||
e.vmovaps(i.dest, e.xmm0);
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,39 +1,38 @@
|
||||||
#vpkuhum isn't implemented yet
|
test_vpkuhum_1:
|
||||||
#test_vpkuhum_1:
|
# {0, 1, 2, 3, 4, 5, 6, 7}
|
||||||
# # {0, 1, 2, 3, 4, 5, 6, 7}
|
#_ REGISTER_IN v3 [00000001, 00020003, 00040005, 00060007]
|
||||||
# #_ REGISTER_IN v3 [00000001, 00020003, 00040005, 00060007]
|
# {8, 9, 10, 11, 12, 13, 14, 15}
|
||||||
# # {8, 9, 10, 11, 12, 13, 14, 15}
|
#_ REGISTER_IN v4 [00080009, 000A000B, 000C000D, 000E000F]
|
||||||
# #_ REGISTER_IN v4 [00080009, 000A000B, 000C000D, 000E000F]
|
vpkuhum v5, v3, v4
|
||||||
# vpkuhum v5, v3, v4
|
blr
|
||||||
# blr
|
#_ REGISTER_OUT v3 [00000001, 00020003, 00040005, 00060007]
|
||||||
# #_ REGISTER_OUT v3 [00000001, 00020003, 00040005, 00060007]
|
#_ REGISTER_OUT v4 [00080009, 000A000B, 000C000D, 000E000F]
|
||||||
# #_ REGISTER_OUT v4 [00080009, 000A000B, 000C000D, 000E000F]
|
# {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
||||||
# # {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
#_ REGISTER_OUT v5 [00010203, 04050607, 08090A0B, 0C0D0E0F]
|
||||||
# #_ REGISTER_OUT v5 [00010203, 04050607, 08090A0B, 0C0D0E0F]
|
blr
|
||||||
# blr
|
|
||||||
|
|
||||||
#test_vpkuhum_2:
|
test_vpkuhum_2:
|
||||||
# # {-8, -7, -6, -5, -4, -3, -2, -1}
|
# {-8, -7, -6, -5, -4, -3, -2, -1}
|
||||||
# #_ REGISTER_IN v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF]
|
#_ REGISTER_IN v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF]
|
||||||
# # {0, 1, 2, 3, 4, 5, 6, 7}
|
# {0, 1, 2, 3, 4, 5, 6, 7}
|
||||||
# #_ REGISTER_IN v4 [00000001, 00020003, 00040005, 00060007]
|
#_ REGISTER_IN v4 [00000001, 00020003, 00040005, 00060007]
|
||||||
# vpkuhum v5, v3, v4
|
vpkuhum v5, v3, v4
|
||||||
# blr
|
blr
|
||||||
# #_ REGISTER_OUT v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF]
|
#_ REGISTER_OUT v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF]
|
||||||
# #_ REGISTER_OUT v4 [00000001, 00020003, 00040005, 00060007]
|
#_ REGISTER_OUT v4 [00000001, 00020003, 00040005, 00060007]
|
||||||
# # {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}
|
# {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}
|
||||||
# #_ REGISTER_OUT v5 [F8F9FAFB, FCFDFEFF, 00010203, 04050607]
|
#_ REGISTER_OUT v5 [F8F9FAFB, FCFDFEFF, 00010203, 04050607]
|
||||||
# blr
|
blr
|
||||||
|
|
||||||
#test_vpkuhum_3:
|
test_vpkuhum_3:
|
||||||
# # {0, 65535, 65535, 0, 0, 0, 65535, 0}
|
# {0, 65535, 65535, 0, 0, 0, 65535, 0}
|
||||||
# #_ REGISTER_IN v3 [0000FFFF, FFFF0000, 00000000, FFFF0000]
|
#_ REGISTER_IN v3 [0000FFFF, FFFF0000, 00000000, FFFF0000]
|
||||||
# # {65535, 0, 0, 65535, 65535, 65535, 0, 65535}
|
# {65535, 0, 0, 65535, 65535, 65535, 0, 65535}
|
||||||
# #_ REGISTER_IN v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF]
|
#_ REGISTER_IN v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF]
|
||||||
# vpkuhum v5, v3, v4
|
vpkuhum v5, v3, v4
|
||||||
# blr
|
blr
|
||||||
# #_ REGISTER_OUT v3 [0000FFFF, FFFF0000, 00000000, FFFF0000]
|
#_ REGISTER_OUT v3 [0000FFFF, FFFF0000, 00000000, FFFF0000]
|
||||||
# #_ REGISTER_OUT v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF]
|
#_ REGISTER_OUT v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF]
|
||||||
# # {0, 255, 255, 0, 0, 0, 255, 0, 255, 0, 0, 255, 255, 255, 0, 255}
|
# {0, 255, 255, 0, 0, 0, 255, 0, 255, 0, 0, 255, 255, 255, 0, 255}
|
||||||
# #_ REGISTER_OUT v5 [00FFFF00, 0000FF00, FF0000FF, FFFF00FF]
|
#_ REGISTER_OUT v5 [00FFFF00, 0000FF00, FF0000FF, FFFF00FF]
|
||||||
# blr
|
blr
|
||||||
|
|
|
@ -1,36 +1,35 @@
|
||||||
#vpkuwum isn't implemented yet
|
test_vpkuwum_1:
|
||||||
#test_vpkuwum_1:
|
# {0, 1, 2, 3}
|
||||||
# # {0, 1, 2, 3}
|
#_ REGISTER_IN v3 [00000000, 00000001, 00000002, 00000003]
|
||||||
# #_ REGISTER_IN v3 [00000000, 00000001, 00000002, 00000003]
|
# {4, 5, 6, 7}
|
||||||
# # {4, 5, 6, 7}
|
#_ REGISTER_IN v4 [00000004, 00000005, 00000006, 00000007]
|
||||||
# #_ REGISTER_IN v4 [00000004, 00000005, 00000006, 00000007]
|
vpkuwum v5, v3, v4
|
||||||
# vpkuwum v5, v3, v4
|
blr
|
||||||
# blr
|
#_ REGISTER_OUT v3 [00000000, 00000001, 00000002, 00000003]
|
||||||
# #_ REGISTER_OUT v3 [00000000, 00000001, 00000002, 00000003]
|
#_ REGISTER_OUT v4 [00000004, 00000005, 00000006, 00000007]
|
||||||
# #_ REGISTER_OUT v4 [00000004, 00000005, 00000006, 00000007]
|
# {0, 1, 2, 3, 4, 5, 6, 7}
|
||||||
# # {0, 1, 2, 3, 4, 5, 6, 7}
|
#_ REGISTER_OUT v5 [00000001, 00020003, 00040005, 00060007]
|
||||||
# #_ REGISTER_OUT v5 [00000001, 00020003, 00040005, 00060007]
|
|
||||||
|
|
||||||
#test_vpkuwum_2:
|
test_vpkuwum_2:
|
||||||
# # {-4, -3, -2, -1}
|
# {-4, -3, -2, -1}
|
||||||
# #_ REGISTER_IN v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF]
|
#_ REGISTER_IN v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF]
|
||||||
# # {0, 1, 2, 3}
|
# {0, 1, 2, 3}
|
||||||
# #_ REGISTER_IN v4 [00000000, 00000001, 00000002, 00000003]
|
#_ REGISTER_IN v4 [00000000, 00000001, 00000002, 00000003]
|
||||||
# vpkuwum v5, v3, v4
|
vpkuwum v5, v3, v4
|
||||||
# blr
|
blr
|
||||||
# #_ REGISTER_OUT v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF]
|
#_ REGISTER_OUT v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF]
|
||||||
# #_ REGISTER_OUT v4 [00000000, 00000001, 00000002, 00000003]
|
#_ REGISTER_OUT v4 [00000000, 00000001, 00000002, 00000003]
|
||||||
# # {-4, -3, -2, -1, 0, 1, 2, 3}
|
# {-4, -3, -2, -1, 0, 1, 2, 3}
|
||||||
# #_ REGISTER_OUT v5 [FFFCFFFD, FFFEFFFF, 00000001, 00020003]
|
#_ REGISTER_OUT v5 [FFFCFFFD, FFFEFFFF, 00000001, 00020003]
|
||||||
|
|
||||||
#test_vpkuwum_3:
|
test_vpkuwum_3:
|
||||||
# # {0, 4294967295, 4294967295, 4294967295}
|
# {0, 4294967295, 4294967295, 4294967295}
|
||||||
# #_ REGISTER_IN v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF]
|
#_ REGISTER_IN v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF]
|
||||||
# # {4294967295, 0, 0, 0}
|
# {4294967295, 0, 0, 0}
|
||||||
# #_ REGISTER_IN v4 [FFFFFFFF, 00000000, 00000000, 00000000]
|
#_ REGISTER_IN v4 [FFFFFFFF, 00000000, 00000000, 00000000]
|
||||||
# vpkuwum v5, v3, v4
|
vpkuwum v5, v3, v4
|
||||||
# blr
|
blr
|
||||||
# #_ REGISTER_OUT v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF]
|
#_ REGISTER_OUT v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF]
|
||||||
# #_ REGISTER_OUT v4 [FFFFFFFF, 00000000, 00000000, 00000000]
|
#_ REGISTER_OUT v4 [FFFFFFFF, 00000000, 00000000, 00000000]
|
||||||
# # {0, 65535, 65535, 65535, 65535, 0, 0, 0}
|
# {0, 65535, 65535, 65535, 65535, 0, 0, 0}
|
||||||
# #_ REGISTER_OUT v5 [0000FFFF, FFFFFFFF, FFFF0000, 00000000]
|
#_ REGISTER_OUT v5 [0000FFFF, FFFFFFFF, FFFF0000, 00000000]
|
||||||
|
|
|
@ -9,3 +9,15 @@ test_vpkuwus_1:
|
||||||
#_ REGISTER_OUT v4 [00000002, 00010002, 00000003, 00010003]
|
#_ REGISTER_OUT v4 [00000002, 00010002, 00000003, 00010003]
|
||||||
# {0, 65535, 1, 65535, 2, 65535, 3, 65535}
|
# {0, 65535, 1, 65535, 2, 65535, 3, 65535}
|
||||||
#_ REGISTER_OUT v5 [0000FFFF, 0001FFFF, 0002FFFF, 0003FFFF]
|
#_ REGISTER_OUT v5 [0000FFFF, 0001FFFF, 0002FFFF, 0003FFFF]
|
||||||
|
|
||||||
|
test_vpkuwus_2:
# Saturation case: every input word above 0xFFFF (0x80000000, 0x7FFFFFFF,
# 0xFFFFFFFF, 0x00010002, 0xFFFFFFFE) is expected to come out as 0xFFFF,
# while the small words (2, 3, 0x10) are expected to pass through unchanged.
# v3 supplies the first four result halfwords and v4 the last four.
|
||||||
|
# {2147483648, 2147483647, 2, 3}
|
||||||
|
#_ REGISTER_IN v3 [80000000, 7FFFFFFF, 00000002, 00000003]
|
||||||
|
# {4294967295, 65538, 4294967294, 16}
|
||||||
|
#_ REGISTER_IN v4 [FFFFFFFF, 00010002, FFFFFFFE, 00000010]
|
||||||
|
vpkuwus v5, v3, v4
|
||||||
|
blr
|
||||||
|
#_ REGISTER_OUT v3 [80000000, 7FFFFFFF, 00000002, 00000003]
|
||||||
|
#_ REGISTER_OUT v4 [FFFFFFFF, 00010002, FFFFFFFE, 00000010]
|
||||||
|
# {65535, 65535, 2, 3, 65535, 65535, 65535, 16}
|
||||||
|
#_ REGISTER_OUT v5 [FFFFFFFF, 00020003, FFFFFFFF, FFFF0010]
|
||||||
|
|
Loading…
Reference in New Issue