diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc
index 8f3b2599f..45bbaa32e 100644
--- a/src/alloy/backend/x64/lowering/lowering_sequences.cc
+++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc
@@ -2456,11 +2456,11 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) {
   } else if (IsVecType(i->dest->type)) {
     XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
       // dest_src ^= 0xFFFF...
-      e.cmpeqps(e.xmm0, e.xmm0);
       if (dest != src) {
         e.movaps(dest, src);
       }
-      e.pxor(dest, e.xmm0);
+      e.mov(e.rax, XMMCONSTBASE);
+      e.pxor(dest, XMMCONST(e.rax, XMMOne));
     });
   } else {
     ASSERT_INVALID_TYPE();
@@ -2937,14 +2937,64 @@ table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) {
     }
   } else if (i->src1.value->type == VEC128_TYPE) {
     // Permute bytes between src2 and src3.
-    // TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
-    Xmm dest, control, src2, src3;
-    e.BeginOp(i->dest, dest, REG_DEST,
-              i->src1.value, control, 0,
-              i->src2.value, src2, 0,
-              i->src3.value, src3, 0);
-    UNIMPLEMENTED_SEQ();
-    e.EndOp(dest, control, src2, src3);
+    if (i->src3.value->IsConstantZero()) {
+      // Permuting with src2/zero, so just shuffle/mask.
+      Xmm dest, control, src2;
+      e.BeginOp(i->dest, dest, REG_DEST,
+                i->src1.value, control, 0,
+                i->src2.value, src2, 0);
+      if (i->src2.value->IsConstantZero()) {
+        e.vpxor(dest, src2, src2);
+      } else {
+        if (i->src2.value->IsConstant()) {
+          LoadXmmConstant(e, src2, i->src2.value->constant.v128);
+        }
+        // Control mask needs to be shuffled.
+        e.mov(e.rax, XMMCONSTBASE);
+        e.vpshufb(e.xmm0, control, XMMCONST(e.rax, XMMByteSwapMask));
+        e.vpshufb(dest, src2, e.xmm0);
+        // Build a mask with values in src2 having 0 and values in src3 having 1.
+        e.vpcmpgtb(e.xmm0, e.xmm0, XMMCONST(e.rax, XMMPermuteControl15));
+        e.vpandn(dest, e.xmm0, dest);
+      }
+      e.EndOp(dest, control, src2);
+    } else {
+      // General permute.
+      Xmm dest, control, src2, src3;
+      e.BeginOp(i->dest, dest, REG_DEST,
+                i->src1.value, control, 0,
+                i->src2.value, src2, 0,
+                i->src3.value, src3, 0);
+      e.mov(e.rax, XMMCONSTBASE);
+      // Control mask needs to be shuffled.
+      e.vpshufb(e.xmm1, control, XMMCONST(e.rax, XMMByteSwapMask));
+      // Build a mask with values in src2 having 0 and values in src3 having 1.
+      e.vpcmpgtb(dest, e.xmm1, XMMCONST(e.rax, XMMPermuteControl15));
+      Xmm src2_shuf, src3_shuf;
+      if (i->src2.value->IsConstantZero()) {
+        e.vpxor(src2, src2);
+        src2_shuf = src2;
+      } else {
+        if (i->src2.value->IsConstant()) {
+          LoadXmmConstant(e, src2, i->src2.value->constant.v128);
+        }
+        src2_shuf = e.xmm0;
+        e.vpshufb(src2_shuf, src2, e.xmm1);
+      }
+      if (i->src3.value->IsConstantZero()) {
+        e.vpxor(src3, src3);
+        src3_shuf = src3;
+      } else {
+        if (i->src3.value->IsConstant()) {
+          LoadXmmConstant(e, src3, i->src3.value->constant.v128);
+        }
+        // NOTE: reusing xmm1 here.
+        src3_shuf = e.xmm1;
+        e.vpshufb(src3_shuf, src3, e.xmm1);
+      }
+      e.vpblendvb(dest, src2_shuf, src3_shuf, dest);
+      e.EndOp(dest, control, src2, src3);
+    }
   } else {
     ASSERT_INVALID_TYPE();
   }
diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl
index 94aaaef72..3e0ed6789 100644
--- a/src/alloy/backend/x64/lowering/op_utils.inl
+++ b/src/alloy/backend/x64/lowering/op_utils.inl
@@ -52,9 +52,18 @@ Address Stash(X64Emitter& e, const Xmm& r) {
 }
 
 void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) {
-  e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
-  e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
-  e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
+  if (!v.low && !v.high) {
+    // zero
+    e.vpxor(dest, dest);
+  //} else if (v.low == ~0ull && v.high == ~0ull) {
+    // one
+    // TODO(benvanik): XMMCONST?
+  } else {
+    // TODO(benvanik): more efficient loading of partial values?
+    e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
+    e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
+    e.vmovaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
+  }
 }
 
 // Moves a 64bit immediate into memory.
@@ -539,8 +548,14 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn,
         vv_fn(e, *i, dest, Ntx);
       }
     } else {
-      e.mov(dest, src2);
-      vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
+      if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) {
+        e.mov(dest, src2);
+        vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
+      } else {
+        // Need a cv_fn. Or a better way to do all of this.
+        e.mov(dest, (uint32_t)src1->get_constant(CT()));
+        vv_fn(e, *i, dest, src2);
+      }
     }
   } else {
     // 64-bit.
diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc
index 4d441673f..5aee9fb59 100644
--- a/src/alloy/backend/x64/x64_emitter.cc
+++ b/src/alloy/backend/x64/x64_emitter.cc
@@ -102,21 +102,21 @@ void* X64Emitter::Emplace(size_t stack_size) {
 
 int X64Emitter::Emit(HIRBuilder* builder) {
   // These are the registers we will not be using. All others are fare game.
   const uint32_t reserved_regs =
-      GetRegBit(rax) |
-      GetRegBit(rcx) |
-      GetRegBit(rdx) |
-      GetRegBit(rsp) |
-      GetRegBit(rbp) |
-      GetRegBit(rsi) |
-      GetRegBit(rdi) |
-      GetRegBit(xmm0) |
+      GetRegBit(rax) |   // scratch
+      GetRegBit(rcx) |   // arg
+      GetRegBit(rdx) |   // arg/clobbered
+      GetRegBit(rsp) |
+      GetRegBit(rbp) |
+      GetRegBit(rsi) |
+      GetRegBit(rdi) |
+      GetRegBit(r8) |    // arg/clobbered
+      GetRegBit(xmm0) |  // scratch
+      GetRegBit(xmm1) |  // sometimes used for scratch, could be fixed
       // TODO(benvanik): save so that we can use these.
-      GetRegBit(r8) |
-      GetRegBit(r9) |
-      GetRegBit(r10) |
-      GetRegBit(r11) |
-      GetRegBit(xmm1) |
+      GetRegBit(r9) |
+      GetRegBit(r10) |
+      GetRegBit(r11) |
       GetRegBit(xmm2) |
       GetRegBit(xmm3) |
       GetRegBit(xmm4) |
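Note on the new OPCODE_PERMUTE lowering: for a VEC128 control vector it performs vperm-style byte selection, where each control byte picks a byte from the concatenation of src2 and src3. The scalar sketch below is not part of the patch (vec16b and PermuteBytes are hypothetical names, used only for illustration); it models the intended selection, assuming control bytes 0-15 select from src2 and 16-31 select from src3. The emitted sequence additionally byte-swaps the control vector through XMMByteSwapMask for endianness, and does the per-byte choice in SIMD with vpcmpgtb against XMMPermuteControl15 feeding vpblendvb.

#include <cstddef>
#include <cstdint>

// Hypothetical 16-byte vector type for illustration only.
struct vec16b {
  uint8_t b[16];
};

// Scalar reference: control byte c selects byte c of src2 when c < 16,
// or byte c - 16 of src3 when 16 <= c < 32.
vec16b PermuteBytes(const vec16b& control, const vec16b& src2,
                    const vec16b& src3) {
  vec16b dest;
  for (size_t i = 0; i < 16; ++i) {
    uint8_t c = control.b[i] & 0x1F;  // only the low 5 bits participate
    // "c > 15" is the per-byte condition that vpcmpgtb against
    // XMMPermuteControl15 computes so vpblendvb can pick src3's shuffle;
    // "c & 0xF" mirrors how vpshufb indexes with the low 4 bits.
    dest.b[i] = (c > 15) ? src3.b[c - 16] : src2.b[c & 0xF];
  }
  return dest;
}

When src3 is constant zero, the patch skips the blend entirely: the same greater-than-15 mask is applied with vpandn, so bytes that would have come from src3 are simply zeroed.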