diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc
index 2607c4495..8f3b2599f 100644
--- a/src/alloy/backend/x64/lowering/lowering_sequences.cc
+++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc
@@ -63,6 +63,7 @@ enum XmmConst {
   XMMSignMaskPS = 8,
   XMMSignMaskPD = 9,
   XMMByteSwapMask = 10,
+  XMMPermuteControl15 = 11,
 };
 static const vec128_t xmm_consts[] = {
   /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f),
@@ -76,6 +77,7 @@ static const vec128_t xmm_consts[] = {
   /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
   /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u),
   /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
+  /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15),
 };
 // Use consts by first loading the base register then accessing memory:
 // e.mov(e.rax, XMMCONSTBASE)
@@ -84,6 +86,45 @@ static const vec128_t xmm_consts[] = {
 #define XMMCONSTBASE (uint64_t)&xmm_consts[0]
 #define XMMCONST(base_reg, name) e.ptr[base_reg + name * 16]
 
+static vec128_t lvsl_table[17] = {
+  vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
+  vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16),
+  vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
+  vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18),
+  vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19),
+  vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
+  vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21),
+  vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22),
+  vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23),
+  vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24),
+  vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25),
+  vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26),
+  vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27),
+  vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28),
+  vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29),
+  vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30),
+  vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+};
+static vec128_t lvsr_table[17] = {
+  vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+  vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30),
+  vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29),
+  vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28),
+  vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27),
+  vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26),
+  vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25),
+  vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24),
+  vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23),
+  vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22),
+  vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21),
+  vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
+  vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19),
+  vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18),
+  vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
+  vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16),
+  vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
+};
+
 // A note about vectors:
 // Alloy represents vectors as xyzw pairs, with indices 0123.
 // XMM registers are xyzw pairs with indices 3210, making them more like wzyx.
@@ -792,14 +833,56 @@ table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) {
 
 table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) {
   XEASSERT(i->dest->type == VEC128_TYPE);
-  UNIMPLEMENTED_SEQ();
+  if (i->src1.value->IsConstant()) {
+    Xmm dest;
+    e.BeginOp(i->dest, dest, REG_DEST);
+    auto sh = MIN(16, i->src1.value->AsUint32());
+    e.mov(e.rax, (uintptr_t)&lvsl_table[sh]);
+    e.movaps(dest, e.ptr[e.rax]);
+    e.EndOp(dest);
+  } else {
+    Xmm dest;
+    Reg8 src;
+    e.BeginOp(i->dest, dest, REG_DEST,
+              i->src1.value, src, 0);
+    // TODO(benvanik): probably a way to do this with addressing.
+    e.mov(TEMP_REG, 16);
+    e.movzx(e.rax, src);
+    e.cmp(src, 16);
+    e.cmovb(TEMP_REG, e.rax);
+    e.shl(TEMP_REG, 4);
+    e.mov(e.rax, (uintptr_t)lvsl_table);
+    e.movaps(dest, e.ptr[e.rax + TEMP_REG]);
+    e.EndOp(dest, src);
+  }
   i = e.Advance(i);
   return true;
 });
 
 table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) {
   XEASSERT(i->dest->type == VEC128_TYPE);
-  UNIMPLEMENTED_SEQ();
+  if (i->src1.value->IsConstant()) {
+    Xmm dest;
+    e.BeginOp(i->dest, dest, REG_DEST);
+    auto sh = MIN(16, i->src1.value->AsUint32());
+    e.mov(e.rax, (uintptr_t)&lvsr_table[sh]);
+    e.movaps(dest, e.ptr[e.rax]);
+    e.EndOp(dest);
+  } else {
+    Xmm dest;
+    Reg8 src;
+    e.BeginOp(i->dest, dest, REG_DEST,
+              i->src1.value, src, 0);
+    // TODO(benvanik): probably a way to do this with addressing.
+    e.mov(TEMP_REG, 16);
+    e.movzx(e.rax, src);
+    e.cmp(src, 16);
+    e.cmovb(TEMP_REG, e.rax);
+    e.shl(TEMP_REG, 4);
+    e.mov(e.rax, (uintptr_t)lvsr_table);
+    e.movaps(dest, e.ptr[e.rax + TEMP_REG]);
+    e.EndOp(dest, src);
+  }
   i = e.Advance(i);
   return true;
 });
diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl
index da28030c5..94aaaef72 100644
--- a/src/alloy/backend/x64/lowering/op_utils.inl
+++ b/src/alloy/backend/x64/lowering/op_utils.inl
@@ -51,6 +51,12 @@ Address Stash(X64Emitter& e, const Xmm& r) {
   return addr;
 }
 
+void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) {
+  e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
+  e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
+  e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
+}
+
 // Moves a 64bit immediate into memory.
 void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) {
   if ((v & ~0x7FFFFFFF) == 0) {
diff --git a/src/alloy/core.h b/src/alloy/core.h
index cd7a32204..aef7e57c2 100644
--- a/src/alloy/core.h
+++ b/src/alloy/core.h
@@ -55,6 +55,18 @@ XEFORCEINLINE vec128_t vec128f(float x, float y, float z, float w) {
   v.f4[0] = x; v.f4[1] = y; v.f4[2] = z; v.f4[3] = w;
   return v;
 }
+XEFORCEINLINE vec128_t vec128b(
+    uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3,
+    uint8_t y0, uint8_t y1, uint8_t y2, uint8_t y3,
+    uint8_t z0, uint8_t z1, uint8_t z2, uint8_t z3,
+    uint8_t w0, uint8_t w1, uint8_t w2, uint8_t w3) {
+  vec128_t v;
+  v.b16[0] = x3; v.b16[1] = x2; v.b16[2] = x1; v.b16[3] = x0;
+  v.b16[4] = y3; v.b16[5] = y2; v.b16[6] = y1; v.b16[7] = y0;
+  v.b16[8] = z3; v.b16[9] = z2; v.b16[10] = z1; v.b16[11] = z0;
+  v.b16[12] = w3; v.b16[13] = w2; v.b16[14] = w1; v.b16[15] = w0;
+  return v;
+}
 
 } // namespace alloy
 
diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h
index 37a0a4a5a..4fa957932 100644
--- a/src/alloy/hir/value.h
+++ b/src/alloy/hir/value.h
@@ -184,25 +184,26 @@ public:
   }
   bool IsConstantTrue() const {
     if (type == VEC128_TYPE) {
-      return false;
+      XEASSERTALWAYS();
     }
     return (flags & VALUE_IS_CONSTANT) && !!constant.i64;
   }
   bool IsConstantFalse() const {
     if (type == VEC128_TYPE) {
-      return false;
+      XEASSERTALWAYS();
     }
     return (flags & VALUE_IS_CONSTANT) && !constant.i64;
   }
   bool IsConstantZero() const {
     if (type == VEC128_TYPE) {
-      return false;
+      return (flags & VALUE_IS_CONSTANT) &&
+             !constant.v128.low && !constant.v128.high;
     }
     return (flags & VALUE_IS_CONSTANT) && !constant.i64;
   }
   bool IsConstantEQ(Value* other) const {
     if (type == VEC128_TYPE) {
-      return false;
+      XEASSERTALWAYS();
     }
     return (flags & VALUE_IS_CONSTANT) &&
            (other->flags & VALUE_IS_CONSTANT) &&
@@ -210,7 +211,7 @@ public:
   }
   bool IsConstantNE(Value* other) const {
     if (type == VEC128_TYPE) {
-      return false;
+      XEASSERTALWAYS();
     }
     return (flags & VALUE_IS_CONSTANT) &&
            (other->flags & VALUE_IS_CONSTANT) &&
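For reference, both LOAD_VECTOR_SHL/SHR paths in the patch clamp the shift amount to 16 and then scale it by 16 bytes (the shl by 4) to index the 17-entry permute tables. A minimal C++ sketch of the equivalent computation follows; the helper names are hypothetical and not part of the patch, they only model what the tables encode and what the cmp/cmovb clamp computes:

#include <algorithm>
#include <array>
#include <cstdint>

// lvsl_table[sh][i] == sh + i and lvsr_table[sh][i] == (16 - sh) + i,
// with sh clamped to [0, 16] the same way the emitted cmp/cmovb pair does.
std::array<uint8_t, 16> LoadVectorShlReference(uint32_t sh) {  // hypothetical name
  sh = std::min<uint32_t>(sh, 16);
  std::array<uint8_t, 16> v;
  for (uint32_t i = 0; i < 16; ++i) v[i] = static_cast<uint8_t>(sh + i);
  return v;
}

std::array<uint8_t, 16> LoadVectorShrReference(uint32_t sh) {  // hypothetical name
  sh = std::min<uint32_t>(sh, 16);
  std::array<uint8_t, 16> v;
  for (uint32_t i = 0; i < 16; ++i) v[i] = static_cast<uint8_t>(16 - sh + i);
  return v;
}

The vec128b helper added to core.h writes each group of four bytes in reverse order, which appears intended to let table entries be written in AltiVec (big-endian) byte order while landing in the backend's little-endian XMM lane layout described by the "note about vectors" comment.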