diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc
index a30ccf053..b9850b80f 100644
--- a/src/alloy/backend/x64/x64_emitter.cc
+++ b/src/alloy/backend/x64/x64_emitter.cc
@@ -539,6 +539,10 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) {
                                             0xFFFFFF0Cu, 0xFFFFFF0Fu),
       /* XMMOneOver255          */ vec128f(1.0f / 255.0f, 1.0f / 255.0f,
                                            1.0f / 255.0f, 1.0f / 255.0f),
+      /* XMMMaskEvenPI16        */ vec128i(0x0000FFFFu, 0x0000FFFFu,
+                                           0x0000FFFFu, 0x0000FFFFu),
+      /* XMMShiftMaskEvenPI16   */ vec128i(0x0000000Fu, 0x0000000Fu,
+                                           0x0000000Fu, 0x0000000Fu),
       /* XMMShiftMaskPS         */ vec128i(0x0000001Fu, 0x0000001Fu,
                                            0x0000001Fu, 0x0000001Fu),
       /* XMMShiftByteMask       */ vec128i(0x000000FFu, 0x000000FFu,
diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h
index 086a1d689..414a94899 100644
--- a/src/alloy/backend/x64/x64_emitter.h
+++ b/src/alloy/backend/x64/x64_emitter.h
@@ -54,6 +54,8 @@ enum XmmConst {
   XMMPackD3DCOLOR,
   XMMUnpackD3DCOLOR,
   XMMOneOver255,
+  XMMMaskEvenPI16,
+  XMMShiftMaskEvenPI16,
   XMMShiftMaskPS,
   XMMShiftByteMask,
   XMMUnsignedDwordMax,
diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc
index 52d4df79c..fb6e80d18 100644
--- a/src/alloy/backend/x64/x64_sequences.cc
+++ b/src/alloy/backend/x64/x64_sequences.cc
@@ -4150,7 +4150,23 @@ EMITTER(VECTOR_SHL_V128, MATCH(I<OPCODE_VECTOR_SHL, V128<>, V128<>, V128<>>)) {
       }
     } else {
       // Fully variable shift.
-      assert_always();
+      // TODO(benvanik): find a better sequence.
+      Xmm temp = i.dest;
+      if (i.dest == i.src1 || i.dest == i.src2) {
+        temp = e.xmm2;
+      }
+      // Even halfwords (low 16 bits of each dword): shift in place, then mask.
+      e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskEvenPI16));
+      e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+      e.vpand(e.xmm1, e.GetXmmConstPtr(XMMMaskEvenPI16));
+      // Odd halfwords (high 16 bits): shift down, shift, then move back up.
+      e.vpsrld(e.xmm0, i.src2, 16);
+      e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskEvenPI16));
+      e.vpsrld(i.dest, i.src1, 16);
+      e.vpsllvd(i.dest, i.dest, e.xmm0);
+      e.vpslld(i.dest, 16);
+      // Merge:
+      e.vpor(i.dest, e.xmm1);
     }
   }
   static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
@@ -4308,6 +4324,22 @@ EMITTER_OPCODE_TABLE(
 EMITTER(VECTOR_SHA_V128, MATCH(I<OPCODE_VECTOR_SHA, V128<>, V128<>, V128<>>)) {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     switch (i.instr->flags) {
+      case INT16_TYPE:
+        // Even halfwords (low 16 bits): sign-extend, shift, then mask.
+        e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskEvenPI16));
+        e.vpslld(e.xmm1, i.src1, 16);
+        e.vpsrad(e.xmm1, 16);
+        e.vpsravd(e.xmm1, e.xmm1, e.xmm0);
+        e.vpand(e.xmm1, e.GetXmmConstPtr(XMMMaskEvenPI16));
+        // Odd halfwords (high 16 bits): sign-extend, shift, then move back up.
+        e.vpsrld(e.xmm0, i.src2, 16);
+        e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskEvenPI16));
+        e.vpsrad(i.dest, i.src1, 16);
+        e.vpsravd(i.dest, i.dest, e.xmm0);
+        e.vpslld(i.dest, 16);
+        // Merge:
+        e.vpor(i.dest, e.xmm1);
+        break;
       case INT32_TYPE:
         // src shift mask may have values >31, and x86 sets to zero when
         // that happens so we mask.
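
The VECTOR_SHL change emulates a per-lane variable shift of eight 16-bit halfwords on top of AVX2's 32-bit variable shift (vpsllvd): the even (low) halfword of each doubleword is shifted in place and masked, the odd (high) halfword is shifted down, shifted, and moved back up, and the two results are merged. Masking the counts with 0x0000000F keeps only the low four bits of each count, matching the modulo-16 count behavior of the AltiVec halfword shifts. A minimal intrinsics sketch of the same idea, assuming an AVX2-capable host; the helper name shl_epi16_var is illustrative and not part of the tree:

    #include <immintrin.h>

    // Variable left shift of eight 16-bit lanes, emulated with the AVX2
    // 32-bit variable shift (vpsllvd). Mirrors the XMMShiftMaskEvenPI16 /
    // XMMMaskEvenPI16 constants added in x64_emitter.cc.
    static __m128i shl_epi16_var(__m128i data, __m128i counts) {
      const __m128i shift_mask = _mm_set1_epi32(0x0000000F);  // count mod 16
      const __m128i even_mask = _mm_set1_epi32(0x0000FFFF);   // low halfword
      // Even (low) halfwords: shift each dword by the low halfword's count,
      // then drop whatever spilled into the high halfword.
      __m128i even_counts = _mm_and_si128(counts, shift_mask);
      __m128i even = _mm_and_si128(_mm_sllv_epi32(data, even_counts), even_mask);
      // Odd (high) halfwords: move data and counts down 16 bits, shift, then
      // move the 16-bit results back up into the high halfwords.
      __m128i odd_counts = _mm_and_si128(_mm_srli_epi32(counts, 16), shift_mask);
      __m128i odd = _mm_slli_epi32(
          _mm_sllv_epi32(_mm_srli_epi32(data, 16), odd_counts), 16);
      return _mm_or_si128(even, odd);
    }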
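
The VECTOR_SHA case uses the same even/odd split, but the shift is arithmetic, so each halfword has to be sign-extended into a full doubleword before vpsravd shifts it. A sketch under the same assumptions (sha_epi16_var is an illustrative name):

    // Variable arithmetic right shift of eight 16-bit lanes via vpsravd.
    static __m128i sha_epi16_var(__m128i data, __m128i counts) {
      const __m128i shift_mask = _mm_set1_epi32(0x0000000F);
      const __m128i even_mask = _mm_set1_epi32(0x0000FFFF);
      // Even (low) halfwords: sign-extend into the full dword (shift left 16,
      // arithmetic shift right 16), shift, then keep only the low halfword.
      __m128i even_counts = _mm_and_si128(counts, shift_mask);
      __m128i even = _mm_srav_epi32(
          _mm_srai_epi32(_mm_slli_epi32(data, 16), 16), even_counts);
      even = _mm_and_si128(even, even_mask);
      // Odd (high) halfwords: an arithmetic right shift by 16 sign-extends
      // them into the low half; shift, then move the result back up.
      __m128i odd_counts = _mm_and_si128(_mm_srli_epi32(counts, 16), shift_mask);
      __m128i odd = _mm_slli_epi32(
          _mm_srav_epi32(_mm_srai_epi32(data, 16), odd_counts), 16);
      return _mm_or_si128(even, odd);
    }

The final merge relies on each half arriving with its unused halfword zeroed, which is why the even result is masked with 0x0000FFFF and the odd result is repositioned with a plain left shift by 16.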