Merge pull request #452 from DrChat/vpk_instrs

Fix vpkuwus / Implemented vpkuwum/vpkuhum
This commit is contained in:
Ben Vanik 2015-11-06 13:43:05 -08:00
commit 1ffd25c91b
4 changed files with 152 additions and 89 deletions

View File

@ -5767,14 +5767,24 @@ struct VECTOR_AVERAGE
case INT32_TYPE: case INT32_TYPE:
// No 32bit averages in AVX. // No 32bit averages in AVX.
if (is_unsigned) { if (is_unsigned) {
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.lea(e.r9, e.StashXmm(1, e.xmm0));
} else {
e.lea(e.r9, e.StashXmm(1, i.src2));
}
e.lea(e.r8, e.StashXmm(0, i.src1)); e.lea(e.r8, e.StashXmm(0, i.src1));
e.lea(e.r9, e.StashXmm(1, i.src2));
e.CallNativeSafe( e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32)); reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
} else { } else {
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.lea(e.r9, e.StashXmm(1, e.xmm0));
} else {
e.lea(e.r9, e.StashXmm(1, i.src2));
}
e.lea(e.r8, e.StashXmm(0, i.src1)); e.lea(e.r8, e.StashXmm(0, i.src1));
e.lea(e.r9, e.StashXmm(1, i.src2));
e.CallNativeSafe( e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorAverageSignedI32)); reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
@ -6576,6 +6586,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
} }
return _mm_load_si128(reinterpret_cast<__m128i*>(c)); return _mm_load_si128(reinterpret_cast<__m128i*>(c));
} }
// Native fallback for the unsigned -> unsigned (non-saturating) 16-to-8 pack.
// Both operands are spilled to aligned byte arrays and every even-indexed
// byte is kept (the low byte of each 16-bit lane as laid out in x86 memory).
// Result bytes 0-7 come from src1 and bytes 8-15 come from src2.
// The leading void* argument is unnamed and therefore never read.
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) {
  alignas(16) uint8_t first[16];
  alignas(16) uint8_t second[16];
  alignas(16) uint8_t packed[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(first), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(second), src2);
  uint8_t* from_first = packed;       // destination for src1's bytes
  uint8_t* from_second = packed + 8;  // destination for src2's bytes
  for (int lane = 0; lane != 8; ++lane) {
    const int byte_index = lane << 1;
    from_first[lane] = first[byte_index];
    from_second[lane] = second[byte_index];
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(packed));
}
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
// TODO(benvanik): handle src2 (or src1) being constant zero // TODO(benvanik): handle src2 (or src1) being constant zero
if (IsPackInUnsigned(flags)) { if (IsPackInUnsigned(flags)) {
@ -6595,7 +6617,11 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
} else { } else {
// unsigned -> unsigned // unsigned -> unsigned
assert_always(); e.lea(e.r9, e.StashXmm(1, i.src2));
e.lea(e.r8, e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulatePack8_IN_16_UN_UN));
e.vmovaps(i.dest, e.xmm0);
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
} }
} else { } else {
if (IsPackOutSaturate(flags)) { if (IsPackOutSaturate(flags)) {
@ -6630,32 +6656,51 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
} }
} }
} }
// Native fallback for the unsigned -> unsigned saturating 32-to-16 pack.
// Each 32-bit unsigned lane is clamped to 65535 and narrowed to 16 bits.
// Result lanes 0-3 come from src1 and lanes 4-7 come from src2.
// The leading void* argument is unnamed and therefore never read.
static __m128i EmulatePack16_IN_32_UN_UN_SAT(void*, __m128i src1,
                                             __m128i src2) {
  constexpr uint32_t kMaxU16 = 0xFFFFu;
  alignas(16) uint32_t first[4];
  alignas(16) uint32_t second[4];
  alignas(16) uint16_t packed[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(first), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(second), src2);
  for (int lane = 0; lane != 4; ++lane) {
    const uint32_t lo_word = first[lane];
    const uint32_t hi_word = second[lane];
    // Saturate: anything above the 16-bit range collapses to 0xFFFF.
    packed[lane] = static_cast<uint16_t>(lo_word > kMaxU16 ? kMaxU16 : lo_word);
    packed[lane + 4] =
        static_cast<uint16_t>(hi_word > kMaxU16 ? kMaxU16 : hi_word);
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(packed));
}
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
uint32_t flags) { uint32_t flags) {
// TODO(benvanik): handle src2 (or src1) being constant zero // TODO(benvanik): handle src2 (or src1) being constant zero
if (IsPackInUnsigned(flags)) { if (IsPackInUnsigned(flags)) {
if (IsPackOutUnsigned(flags)) { if (IsPackOutUnsigned(flags)) {
if (IsPackOutSaturate(flags)) { if (IsPackOutSaturate(flags)) {
// TODO(gibbed): check if this is actually correct, it's a duplicate
// of the signed -> unsigned + saturate code, but seems to work.
// unsigned -> unsigned + saturate // unsigned -> unsigned + saturate
// PACKUSDW
// TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0];
// DEST[15:0] <- (DEST[31:0] > FFFFH) ? FFFFH : TMP[15:0];
Xmm src2; Xmm src2;
if (!i.src2.is_constant) { if (i.src2.is_constant) {
src2 = i.src2;
} else {
assert_false(i.src1 == e.xmm0);
e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.LoadConstantXmm(e.xmm0, i.src2.constant());
src2 = e.xmm0; e.lea(e.r9, e.StashXmm(1, e.xmm0));
} else {
e.lea(e.r9, e.StashXmm(1, i.src2));
} }
e.vpackusdw(i.dest, i.src1, src2); e.lea(e.r8, e.StashXmm(0, i.src1));
e.vpshuflw(i.dest, i.dest, 0b10110001); e.CallNativeSafe(
e.vpshufhw(i.dest, i.dest, 0b10110001); reinterpret_cast<void*>(EmulatePack16_IN_32_UN_UN_SAT));
e.vmovaps(i.dest, e.xmm0);
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
} else { } else {
// unsigned -> unsigned // unsigned -> unsigned
assert_always(); e.vmovaps(e.xmm0, i.src1);
e.vpshuflw(e.xmm0, e.xmm0, 0b00100010);
e.vpshufhw(e.xmm0, e.xmm0, 0b00100010);
e.vpshufd(e.xmm0, e.xmm0, 0b00001000);
e.vmovaps(i.dest, i.src2);
e.vpshuflw(i.dest, i.dest, 0b00100010);
e.vpshufhw(i.dest, i.dest, 0b00100010);
e.vpshufd(i.dest, i.dest, 0b10000000);
e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111);
} }
} else { } else {
if (IsPackOutSaturate(flags)) { if (IsPackOutSaturate(flags)) {
@ -6795,7 +6840,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
e.vpshufd(i.dest, i.dest, B10100100); e.vpshufd(i.dest, i.dest, B10100100);
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
} else { } else {
e.lea(e.r8, e.StashXmm(0, i.src1)); Xmm src;
if (i.src1.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src1.constant());
src = e.xmm0;
} else {
src = i.src1;
}
e.lea(e.r8, e.StashXmm(0, src));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2)); e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
} }

View File

@ -1,39 +1,38 @@
#vpkuhum isn't implemented yet test_vpkuhum_1:
#test_vpkuhum_1: # {0, 1, 2, 3, 4, 5, 6, 7}
# # {0, 1, 2, 3, 4, 5, 6, 7} #_ REGISTER_IN v3 [00000001, 00020003, 00040005, 00060007]
# #_ REGISTER_IN v3 [00000001, 00020003, 00040005, 00060007] # {8, 9, 10, 11, 12, 13, 14, 15}
# # {8, 9, 10, 11, 12, 13, 14, 15} #_ REGISTER_IN v4 [00080009, 000A000B, 000C000D, 000E000F]
# #_ REGISTER_IN v4 [00080009, 000A000B, 000C000D, 000E000F] vpkuhum v5, v3, v4
# vpkuhum v5, v3, v4 blr
# blr #_ REGISTER_OUT v3 [00000001, 00020003, 00040005, 00060007]
# #_ REGISTER_OUT v3 [00000001, 00020003, 00040005, 00060007] #_ REGISTER_OUT v4 [00080009, 000A000B, 000C000D, 000E000F]
# #_ REGISTER_OUT v4 [00080009, 000A000B, 000C000D, 000E000F] # {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
# # {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} #_ REGISTER_OUT v5 [00010203, 04050607, 08090A0B, 0C0D0E0F]
# #_ REGISTER_OUT v5 [00010203, 04050607, 08090A0B, 0C0D0E0F] blr
# blr
#test_vpkuhum_2: test_vpkuhum_2:
# # {-8, -7, -6, -5, -4, -3, -2, -1} # {-8, -7, -6, -5, -4, -3, -2, -1}
# #_ REGISTER_IN v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF] #_ REGISTER_IN v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF]
# # {0, 1, 2, 3, 4, 5, 6, 7} # {0, 1, 2, 3, 4, 5, 6, 7}
# #_ REGISTER_IN v4 [00000001, 00020003, 00040005, 00060007] #_ REGISTER_IN v4 [00000001, 00020003, 00040005, 00060007]
# vpkuhum v5, v3, v4 vpkuhum v5, v3, v4
# blr blr
# #_ REGISTER_OUT v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF] #_ REGISTER_OUT v3 [FFF8FFF9, FFFAFFFB, FFFCFFFD, FFFEFFFF]
# #_ REGISTER_OUT v4 [00000001, 00020003, 00040005, 00060007] #_ REGISTER_OUT v4 [00000001, 00020003, 00040005, 00060007]
# # {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7} # {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}
# #_ REGISTER_OUT v5 [F8F9FAFB, FCFDFEFF, 00010203, 04050607] #_ REGISTER_OUT v5 [F8F9FAFB, FCFDFEFF, 00010203, 04050607]
# blr blr
#test_vpkuhum_3: test_vpkuhum_3:
# # {0, 65535, 65535, 0, 0, 0, 65535, 0} # {0, 65535, 65535, 0, 0, 0, 65535, 0}
# #_ REGISTER_IN v3 [0000FFFF, FFFF0000, 00000000, FFFF0000] #_ REGISTER_IN v3 [0000FFFF, FFFF0000, 00000000, FFFF0000]
# # {65535, 0, 0, 65535, 65535, 65535, 0, 65535} # {65535, 0, 0, 65535, 65535, 65535, 0, 65535}
# #_ REGISTER_IN v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF] #_ REGISTER_IN v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF]
# vpkuhum v5, v3, v4 vpkuhum v5, v3, v4
# blr blr
# #_ REGISTER_OUT v3 [0000FFFF, FFFF0000, 00000000, FFFF0000] #_ REGISTER_OUT v3 [0000FFFF, FFFF0000, 00000000, FFFF0000]
# #_ REGISTER_OUT v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF] #_ REGISTER_OUT v4 [FFFF0000, 0000FFFF, FFFFFFFF, 0000FFFF]
# # {0, 255, 255, 0, 0, 0, 255, 0, 255, 0, 0, 255, 255, 255, 0, 255} # {0, 255, 255, 0, 0, 0, 255, 0, 255, 0, 0, 255, 255, 255, 0, 255}
# #_ REGISTER_OUT v5 [00FFFF00, 0000FF00, FF0000FF, FFFF00FF] #_ REGISTER_OUT v5 [00FFFF00, 0000FF00, FF0000FF, FFFF00FF]
# blr blr

View File

@ -1,36 +1,35 @@
#vpkuwum isn't implemented yet test_vpkuwum_1:
#test_vpkuwum_1: # {0, 1, 2, 3}
# # {0, 1, 2, 3} #_ REGISTER_IN v3 [00000000, 00000001, 00000002, 00000003]
# #_ REGISTER_IN v3 [00000000, 00000001, 00000002, 00000003] # {4, 5, 6, 7}
# # {4, 5, 6, 7} #_ REGISTER_IN v4 [00000004, 00000005, 00000006, 00000007]
# #_ REGISTER_IN v4 [00000004, 00000005, 00000006, 00000007] vpkuwum v5, v3, v4
# vpkuwum v5, v3, v4 blr
# blr #_ REGISTER_OUT v3 [00000000, 00000001, 00000002, 00000003]
# #_ REGISTER_OUT v3 [00000000, 00000001, 00000002, 00000003] #_ REGISTER_OUT v4 [00000004, 00000005, 00000006, 00000007]
# #_ REGISTER_OUT v4 [00000004, 00000005, 00000006, 00000007] # {0, 1, 2, 3, 4, 5, 6, 7}
# # {0, 1, 2, 3, 4, 5, 6, 7} #_ REGISTER_OUT v5 [00000001, 00020003, 00040005, 00060007]
# #_ REGISTER_OUT v5 [00000001, 00020003, 00040005, 00060007]
#test_vpkuwum_2: test_vpkuwum_2:
# # {-4, -3, -2, -1} # {-4, -3, -2, -1}
# #_ REGISTER_IN v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF] #_ REGISTER_IN v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF]
# # {0, 1, 2, 3} # {0, 1, 2, 3}
# #_ REGISTER_IN v4 [00000000, 00000001, 00000002, 00000003] #_ REGISTER_IN v4 [00000000, 00000001, 00000002, 00000003]
# vpkuwum v5, v3, v4 vpkuwum v5, v3, v4
# blr blr
# #_ REGISTER_OUT v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF] #_ REGISTER_OUT v3 [FFFFFFFC, FFFFFFFD, FFFFFFFE, FFFFFFFF]
# #_ REGISTER_OUT v4 [00000000, 00000001, 00000002, 00000003] #_ REGISTER_OUT v4 [00000000, 00000001, 00000002, 00000003]
# # {-4, -3, -2, -1, 0, 1, 2, 3} # {-4, -3, -2, -1, 0, 1, 2, 3}
# #_ REGISTER_OUT v5 [FFFCFFFD, FFFEFFFF, 00000001, 00020003] #_ REGISTER_OUT v5 [FFFCFFFD, FFFEFFFF, 00000001, 00020003]
#test_vpkuwum_3: test_vpkuwum_3:
# # {0, 4294967295, 4294967295, 4294967295} # {0, 4294967295, 4294967295, 4294967295}
# #_ REGISTER_IN v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF] #_ REGISTER_IN v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF]
# # {4294967295, 0, 0, 0} # {4294967295, 0, 0, 0}
# #_ REGISTER_IN v4 [FFFFFFFF, 00000000, 00000000, 00000000] #_ REGISTER_IN v4 [FFFFFFFF, 00000000, 00000000, 00000000]
# vpkuwum v5, v3, v4 vpkuwum v5, v3, v4
# blr blr
# #_ REGISTER_OUT v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF] #_ REGISTER_OUT v3 [00000000, FFFFFFFF, FFFFFFFF, FFFFFFFF]
# #_ REGISTER_OUT v4 [FFFFFFFF, 00000000, 00000000, 00000000] #_ REGISTER_OUT v4 [FFFFFFFF, 00000000, 00000000, 00000000]
# # {0, 65535, 65535, 65535, 65535, 0, 0, 0} # {0, 65535, 65535, 65535, 65535, 0, 0, 0}
# #_ REGISTER_OUT v5 [0000FFFF, FFFFFFFF, FFFF0000, 00000000] #_ REGISTER_OUT v5 [0000FFFF, FFFFFFFF, FFFF0000, 00000000]

View File

@ -9,3 +9,15 @@ test_vpkuwus_1:
#_ REGISTER_OUT v4 [00000002, 00010002, 00000003, 00010003] #_ REGISTER_OUT v4 [00000002, 00010002, 00000003, 00010003]
# {0, 65535, 1, 65535, 2, 65535, 3, 65535} # {0, 65535, 1, 65535, 2, 65535, 3, 65535}
#_ REGISTER_OUT v5 [0000FFFF, 0001FFFF, 0002FFFF, 0003FFFF] #_ REGISTER_OUT v5 [0000FFFF, 0001FFFF, 0002FFFF, 0003FFFF]
test_vpkuwus_2:
# {2147483648, 2147483647, 2, 3}
#_ REGISTER_IN v3 [80000000, 7FFFFFFF, 00000002, 00000003]
# {4294967295, 65538, 4294967294, 16}
#_ REGISTER_IN v4 [FFFFFFFF, 00010002, FFFFFFFE, 00000010]
vpkuwus v5, v3, v4
blr
#_ REGISTER_OUT v3 [80000000, 7FFFFFFF, 00000002, 00000003]
#_ REGISTER_OUT v4 [FFFFFFFF, 00010002, FFFFFFFE, 00000010]
# {65535, 65535, 2, 3, 65535, 65535, 65535, 16}
#_ REGISTER_OUT v5 [FFFFFFFF, 00020003, FFFFFFFF, FFFF0010]