Implement vpkuwum/vpkuhum
Fixed vpkuwus with signed numbers Implement some constant sources for vec average/unpack
This commit is contained in:
parent
64b0cc9d5e
commit
9985187b23
|
@ -5767,14 +5767,24 @@ struct VECTOR_AVERAGE
|
||||||
case INT32_TYPE:
|
case INT32_TYPE:
|
||||||
// No 32bit averages in AVX.
|
// No 32bit averages in AVX.
|
||||||
if (is_unsigned) {
|
if (is_unsigned) {
|
||||||
e.lea(e.r8, e.StashXmm(0, i.src1));
|
if (i.src2.is_constant) {
|
||||||
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
|
e.lea(e.r9, e.StashXmm(1, e.xmm0));
|
||||||
|
} else {
|
||||||
e.lea(e.r9, e.StashXmm(1, i.src2));
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
|
}
|
||||||
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
e.CallNativeSafe(
|
e.CallNativeSafe(
|
||||||
reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
|
reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
|
||||||
e.vmovaps(i.dest, e.xmm0);
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
} else {
|
} else {
|
||||||
e.lea(e.r8, e.StashXmm(0, i.src1));
|
if (i.src2.is_constant) {
|
||||||
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
|
e.lea(e.r9, e.StashXmm(1, e.xmm0));
|
||||||
|
} else {
|
||||||
e.lea(e.r9, e.StashXmm(1, i.src2));
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
|
}
|
||||||
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
e.CallNativeSafe(
|
e.CallNativeSafe(
|
||||||
reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
|
reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
|
||||||
e.vmovaps(i.dest, e.xmm0);
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
|
@ -6576,6 +6586,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
}
|
}
|
||||||
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
|
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
|
||||||
}
|
}
|
||||||
|
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) {
|
||||||
|
alignas(16) uint8_t a[16];
|
||||||
|
alignas(16) uint8_t b[16];
|
||||||
|
alignas(16) uint8_t c[16];
|
||||||
|
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||||
|
_mm_store_si128(reinterpret_cast<__m128i*>(b), src2);
|
||||||
|
for (int i = 0; i < 8; ++i) {
|
||||||
|
c[i] = a[i * 2];
|
||||||
|
c[i + 8] = b[i * 2];
|
||||||
|
}
|
||||||
|
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
|
||||||
|
}
|
||||||
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
|
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
|
||||||
// TODO(benvanik): handle src2 (or src1) being constant zero
|
// TODO(benvanik): handle src2 (or src1) being constant zero
|
||||||
if (IsPackInUnsigned(flags)) {
|
if (IsPackInUnsigned(flags)) {
|
||||||
|
@ -6595,7 +6617,11 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
||||||
} else {
|
} else {
|
||||||
// unsigned -> unsigned
|
// unsigned -> unsigned
|
||||||
assert_always();
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
|
e.CallNativeSafe(reinterpret_cast<void*>(EmulatePack8_IN_16_UN_UN));
|
||||||
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
|
@ -6630,32 +6656,51 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
static __m128i EmulatePack16_IN_32_UN_UN_SAT(void*, __m128i src1,
|
||||||
|
__m128i src2) {
|
||||||
|
alignas(16) uint32_t a[4];
|
||||||
|
alignas(16) uint32_t b[4];
|
||||||
|
alignas(16) uint16_t c[8];
|
||||||
|
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
|
||||||
|
_mm_store_si128(reinterpret_cast<__m128i*>(b), src2);
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
c[i] = uint16_t(std::min(65535u, a[i]));
|
||||||
|
c[i + 4] = uint16_t(std::min(65535u, b[i]));
|
||||||
|
}
|
||||||
|
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
|
||||||
|
}
|
||||||
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
|
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
|
||||||
uint32_t flags) {
|
uint32_t flags) {
|
||||||
// TODO(benvanik): handle src2 (or src1) being constant zero
|
// TODO(benvanik): handle src2 (or src1) being constant zero
|
||||||
if (IsPackInUnsigned(flags)) {
|
if (IsPackInUnsigned(flags)) {
|
||||||
if (IsPackOutUnsigned(flags)) {
|
if (IsPackOutUnsigned(flags)) {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
// TODO(gibbed): check if this is actually correct, it's a duplicate
|
|
||||||
// of the signed -> unsigned + saturate code, but seems to work.
|
|
||||||
// unsigned -> unsigned + saturate
|
// unsigned -> unsigned + saturate
|
||||||
// PACKUSDW
|
|
||||||
// TMP[15:0] <- (DEST[31:0] < 0) ? 0 : DEST[15:0];
|
|
||||||
// DEST[15:0] <- (DEST[31:0] > FFFFH) ? FFFFH : TMP[15:0];
|
|
||||||
Xmm src2;
|
Xmm src2;
|
||||||
if (!i.src2.is_constant) {
|
if (i.src2.is_constant) {
|
||||||
src2 = i.src2;
|
|
||||||
} else {
|
|
||||||
assert_false(i.src1 == e.xmm0);
|
|
||||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
src2 = e.xmm0;
|
e.lea(e.r9, e.StashXmm(1, e.xmm0));
|
||||||
|
} else {
|
||||||
|
e.lea(e.r9, e.StashXmm(1, i.src2));
|
||||||
}
|
}
|
||||||
e.vpackusdw(i.dest, i.src1, src2);
|
e.lea(e.r8, e.StashXmm(0, i.src1));
|
||||||
e.vpshuflw(i.dest, i.dest, 0b10110001);
|
e.CallNativeSafe(
|
||||||
e.vpshufhw(i.dest, i.dest, 0b10110001);
|
reinterpret_cast<void*>(EmulatePack16_IN_32_UN_UN_SAT));
|
||||||
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask));
|
||||||
} else {
|
} else {
|
||||||
// unsigned -> unsigned
|
// unsigned -> unsigned
|
||||||
assert_always();
|
e.vmovaps(e.xmm0, i.src1);
|
||||||
|
e.vpshuflw(e.xmm0, e.xmm0, 0b00100010);
|
||||||
|
e.vpshufhw(e.xmm0, e.xmm0, 0b00100010);
|
||||||
|
e.vpshufd(e.xmm0, e.xmm0, 0b00001000);
|
||||||
|
|
||||||
|
e.vmovaps(i.dest, i.src2);
|
||||||
|
e.vpshuflw(i.dest, i.dest, 0b00100010);
|
||||||
|
e.vpshufhw(i.dest, i.dest, 0b00100010);
|
||||||
|
e.vpshufd(i.dest, i.dest, 0b10000000);
|
||||||
|
|
||||||
|
e.vpblendw(i.dest, i.dest, e.xmm0, 0b00001111);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
|
@ -6795,7 +6840,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||||
e.vpshufd(i.dest, i.dest, B10100100);
|
e.vpshufd(i.dest, i.dest, B10100100);
|
||||||
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
|
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
|
||||||
} else {
|
} else {
|
||||||
e.lea(e.r8, e.StashXmm(0, i.src1));
|
Xmm src;
|
||||||
|
if (i.src1.is_constant) {
|
||||||
|
e.LoadConstantXmm(e.xmm0, i.src1.constant());
|
||||||
|
src = e.xmm0;
|
||||||
|
} else {
|
||||||
|
src = i.src1;
|
||||||
|
}
|
||||||
|
|
||||||
|
e.lea(e.r8, e.StashXmm(0, src));
|
||||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
|
||||||
e.vmovaps(i.dest, e.xmm0);
|
e.vmovaps(i.dest, e.xmm0);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue