Use vpminud to saturate instead of bitwise ops (saves 6 instructions per saturated 16-in-32 pack)
This commit is contained in:
parent
2d55b12cc9
commit
8427acfada
|
@ -588,6 +588,7 @@ struct Sequence {
|
||||||
e.LoadConstantXmm(e.xmm0, i.src1.constant());
|
e.LoadConstantXmm(e.xmm0, i.src1.constant());
|
||||||
fn(e, i.dest, e.xmm0, i.src2);
|
fn(e, i.dest, e.xmm0, i.src2);
|
||||||
} else if (i.src2.is_constant) {
|
} else if (i.src2.is_constant) {
|
||||||
|
assert_true(!i.src1.is_constant);
|
||||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||||
fn(e, i.dest, i.src1, e.xmm0);
|
fn(e, i.dest, i.src1, e.xmm0);
|
||||||
} else {
|
} else {
|
||||||
|
@ -7070,6 +7071,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Pack 2 32-bit vectors into a 16-bit vector.
|
||||||
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
|
static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i,
|
||||||
uint32_t flags) {
|
uint32_t flags) {
|
||||||
// TODO(benvanik): handle src2 (or src1) being constant zero
|
// TODO(benvanik): handle src2 (or src1) being constant zero
|
||||||
|
@ -7077,26 +7079,34 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
if (IsPackOutUnsigned(flags)) {
|
if (IsPackOutUnsigned(flags)) {
|
||||||
if (IsPackOutSaturate(flags)) {
|
if (IsPackOutSaturate(flags)) {
|
||||||
// unsigned -> unsigned + saturate
|
// unsigned -> unsigned + saturate
|
||||||
// Construct a saturation mask
|
// Construct a saturation max value
|
||||||
e.mov(e.eax, ~0xFFFFu);
|
e.mov(e.eax, 0xFFFFu);
|
||||||
e.vmovd(e.xmm0, e.eax);
|
e.vmovd(e.xmm0, e.eax);
|
||||||
e.vpshufd(e.xmm0, e.xmm0, 0b00000000);
|
e.vpshufd(e.xmm0, e.xmm0, 0b00000000);
|
||||||
|
|
||||||
e.vandps(e.xmm1, e.xmm0, i.src1); // src1 & 0xFFFF0000
|
if (!i.src1.is_constant) {
|
||||||
e.vpcmpeqd(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMZero));
|
e.vpminud(e.xmm1, i.src1, e.xmm0); // Saturate src1
|
||||||
e.vpxor(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMFFFF));
|
e.vpshuflw(e.xmm1, e.xmm1, 0b00100010);
|
||||||
e.vpor(e.xmm1, e.xmm1, i.src1); // Saturate src1
|
e.vpshufhw(e.xmm1, e.xmm1, 0b00100010);
|
||||||
e.vpshuflw(e.xmm1, e.xmm1, 0b00100010);
|
e.vpshufd(e.xmm1, e.xmm1, 0b00001000);
|
||||||
e.vpshufhw(e.xmm1, e.xmm1, 0b00100010);
|
} else {
|
||||||
e.vpshufd(e.xmm1, e.xmm1, 0b00001000);
|
// TODO(DrChat): Non-zero constants
|
||||||
|
assert_true(i.src1.constant().u64[0] == 0 &&
|
||||||
|
i.src1.constant().u64[1] == 0);
|
||||||
|
e.vpxor(e.xmm1, e.xmm1);
|
||||||
|
}
|
||||||
|
|
||||||
e.vandps(e.xmm0, e.xmm0, i.src2); // src2 & 0xFFFF0000
|
if (!i.src2.is_constant) {
|
||||||
e.vpcmpeqd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
|
e.vpminud(i.dest, i.src2, e.xmm0); // Saturate src2
|
||||||
e.vpxor(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFFFF));
|
e.vpshuflw(i.dest, i.dest, 0b00100010);
|
||||||
e.vpor(i.dest, e.xmm0, i.src2); // Saturate src2
|
e.vpshufhw(i.dest, i.dest, 0b00100010);
|
||||||
e.vpshuflw(i.dest, i.dest, 0b00100010);
|
e.vpshufd(i.dest, i.dest, 0b10000000);
|
||||||
e.vpshufhw(i.dest, i.dest, 0b00100010);
|
} else {
|
||||||
e.vpshufd(i.dest, i.dest, 0b10000000);
|
// TODO(DrChat): Non-zero constants
|
||||||
|
assert_true(i.src2.constant().u64[0] == 0 &&
|
||||||
|
i.src2.constant().u64[1] == 0);
|
||||||
|
e.vpxor(i.dest, i.dest);
|
||||||
|
}
|
||||||
|
|
||||||
e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111);
|
e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111);
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in New Issue