From 8427acfada26cd51cb73507bc277f4658c7fee6c Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 23 Jul 2016 19:19:40 -0500 Subject: [PATCH] Use vpminud to saturate rather than bitwise ops (shaves 6 instructions per 16 in 32 saturated pack) --- src/xenia/cpu/backend/x64/x64_sequences.cc | 42 +++++++++++++--------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index e12eaf1da..87f0b3553 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -588,6 +588,7 @@ struct Sequence { e.LoadConstantXmm(e.xmm0, i.src1.constant()); fn(e, i.dest, e.xmm0, i.src2); } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); e.LoadConstantXmm(e.xmm0, i.src2.constant()); fn(e, i.dest, i.src1, e.xmm0); } else { @@ -7070,6 +7071,7 @@ struct PACK : Sequence> { } } } + // Pack 2 32-bit vectors into a 16-bit vector. static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, uint32_t flags) { // TODO(benvanik): handle src2 (or src1) being constant zero @@ -7077,26 +7079,34 @@ struct PACK : Sequence> { if (IsPackOutUnsigned(flags)) { if (IsPackOutSaturate(flags)) { // unsigned -> unsigned + saturate - // Construct a saturation mask - e.mov(e.eax, ~0xFFFFu); + // Construct a saturation max value + e.mov(e.eax, 0xFFFFu); e.vmovd(e.xmm0, e.eax); e.vpshufd(e.xmm0, e.xmm0, 0b00000000); - e.vandps(e.xmm1, e.xmm0, i.src1); // src1 & 0xFFFF0000 - e.vpcmpeqd(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMZero)); - e.vpxor(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMFFFF)); - e.vpor(e.xmm1, e.xmm1, i.src1); // Saturate src1 - e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); - e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); - e.vpshufd(e.xmm1, e.xmm1, 0b00001000); + if (!i.src1.is_constant) { + e.vpminud(e.xmm1, i.src1, e.xmm0); // Saturate src1 + e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufd(e.xmm1, e.xmm1, 0b00001000); + } else { + // TODO(DrChat): Non-zero constants + assert_true(i.src1.constant().u64[0] == 0 && + i.src1.constant().u64[1] == 0); + e.vpxor(e.xmm1, e.xmm1); + } - e.vandps(e.xmm0, e.xmm0, i.src2); // src2 & 0xFFFF0000 - e.vpcmpeqd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); - e.vpxor(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFFFF)); - e.vpor(i.dest, e.xmm0, i.src2); // Saturate src2 - e.vpshuflw(i.dest, i.dest, 0b00100010); - e.vpshufhw(i.dest, i.dest, 0b00100010); - e.vpshufd(i.dest, i.dest, 0b10000000); + if (!i.src2.is_constant) { + e.vpminud(i.dest, i.src2, e.xmm0); // Saturate src2 + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + } else { + // TODO(DrChat): Non-zero constants + assert_true(i.src2.constant().u64[0] == 0 && + i.src2.constant().u64[1] == 0); + e.vpxor(i.dest, i.dest); + } e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); } else {