More SSE work.
This commit is contained in:
parent
ac4360913f
commit
e5cf47a0d8
|
@ -35,6 +35,39 @@ namespace {
|
|||
|
||||
#define SHUFPS_SWAP_DWORDS 0x1B
|
||||
|
||||
enum XmmConst {
|
||||
XMMZero = 0,
|
||||
XMMOne = 1,
|
||||
XMMNegativeOne = 2,
|
||||
XMMMaskX16Y16 = 3,
|
||||
XMMFlipX16Y16 = 4,
|
||||
XMMFixX16Y16 = 5,
|
||||
XMMNormalizeX16Y16 = 6,
|
||||
XMM3301 = 7,
|
||||
XMMSignMaskPS = 8,
|
||||
XMMSignMaskPD = 9,
|
||||
XMMByteSwapMask = 10,
|
||||
};
|
||||
static const vec128_t xmm_consts[] = {
|
||||
/* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f),
|
||||
/* XMMOne */ vec128f(1.0f, 1.0f, 1.0f, 1.0f),
|
||||
/* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
|
||||
/* XMMMaskX16Y16 */ vec128i(0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000),
|
||||
/* XMMFlipX16Y16 */ vec128i(0x00008000, 0x00000000, 0x00000000, 0x00000000),
|
||||
/* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f),
|
||||
/* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f),
|
||||
/* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f),
|
||||
/* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
|
||||
/* XMMSignMaskPD */ vec128i(0x80000000u, 0x00000000u, 0x80000000u, 0x00000000u),
|
||||
/* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
|
||||
};
|
||||
// Use consts by first loading the base register then accessing memory:
|
||||
// e.mov(e.rax, XMMCONSTBASE)
|
||||
// e.andps(reg, XMMCONST(XMM3303))
|
||||
// TODO(benvanik): find a way to do this without the base register.
|
||||
#define XMMCONSTBASE (uint64_t)&xmm_consts[0]
|
||||
#define XMMCONST(base_reg, name) e.ptr[base_reg + name * 16]
|
||||
|
||||
// A note about vectors:
|
||||
// Alloy represents vectors as xyzw pairs, with indices 0123.
|
||||
// XMM registers are xyzw pairs with indices 3210, making them more like wzyx.
|
||||
|
@ -339,10 +372,50 @@ table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) {
|
|||
});
|
||||
|
||||
table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
|
||||
if (IsIntType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
if (i->dest->type == INT32_TYPE) {
|
||||
if (i->src1.value->type == FLOAT32_TYPE) {
|
||||
Reg32 dest;
|
||||
Xmm src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pextrd(dest, src, 0);
|
||||
e.EndOp(dest, src);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
} else if (i->dest->type == INT64_TYPE) {
|
||||
if (i->src1.value->type == FLOAT64_TYPE) {
|
||||
Reg64 dest;
|
||||
Xmm src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pextrq(dest, src, 0);
|
||||
e.EndOp(dest, src);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
} else if (i->dest->type == FLOAT32_TYPE) {
|
||||
if (i->src1.value->type == INT32_TYPE) {
|
||||
Xmm dest;
|
||||
Reg32 src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pinsrd(dest, src, 0);
|
||||
e.EndOp(dest, src);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
} else if (i->dest->type == FLOAT64_TYPE) {
|
||||
if (i->src1.value->type == INT64_TYPE) {
|
||||
Xmm dest;
|
||||
Reg64 src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pinsrq(dest, src, 0);
|
||||
e.EndOp(dest, src);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else {
|
||||
|
@ -625,15 +698,32 @@ table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) {
|
|||
});
|
||||
|
||||
table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) {
|
||||
// flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED
|
||||
UNIMPLEMENTED_SEQ();
|
||||
// flags = ARITHMETIC_UNSIGNED
|
||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
// TODO(benvanik): are these really the same? VC++ thinks so.
|
||||
if (i.flags & ARITHMETIC_UNSIGNED) {
|
||||
e.cvtdq2ps(dest, src);
|
||||
} else {
|
||||
e.cvtdq2ps(dest, src);
|
||||
}
|
||||
});
|
||||
i = e.Advance(i);
|
||||
return true;
|
||||
});
|
||||
|
||||
table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) {
|
||||
// flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
// TODO(benvanik): are these really the same? VC++ thinks so.
|
||||
if (i.flags & ARITHMETIC_UNSIGNED) {
|
||||
e.cvttps2dq(dest, src);
|
||||
} else {
|
||||
e.cvttps2dq(dest, src);
|
||||
}
|
||||
if (i.flags & ARITHMETIC_SATURATE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
});
|
||||
i = e.Advance(i);
|
||||
return true;
|
||||
});
|
||||
|
@ -991,7 +1081,6 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
|
|||
// TODO(benvanik): we should try to stick to movaps if possible.
|
||||
e.movups(dest, e.ptr[addr]);
|
||||
e.EndOp(dest);
|
||||
e.db(0xCC);
|
||||
#if DTRACE
|
||||
e.lea(e.rdx, e.ptr[addr]);
|
||||
e.lea(e.r8, Stash(e, dest));
|
||||
|
@ -1168,7 +1257,6 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
|
|||
// TODO(benvanik): we should try to stick to movaps if possible.
|
||||
e.movups(e.ptr[addr], src);
|
||||
e.EndOp(src);
|
||||
e.db(0xCC);
|
||||
#if DTRACE
|
||||
e.lea(e.rdx, e.ptr[addr]);
|
||||
e.lea(e.r8, Stash(e, src));
|
||||
|
@ -1208,9 +1296,17 @@ table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) {
|
|||
if (IsIntType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
if (i.src1.value->type == FLOAT32_TYPE) {
|
||||
e.maxss(dest_src, src);
|
||||
} else {
|
||||
e.maxsd(dest_src, src);
|
||||
}
|
||||
});
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
e.maxps(dest_src, src);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -1222,9 +1318,17 @@ table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) {
|
|||
if (IsIntType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
if (i.src1.value->type == FLOAT32_TYPE) {
|
||||
e.minss(dest_src, src);
|
||||
} else {
|
||||
e.minsd(dest_src, src);
|
||||
}
|
||||
});
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
e.minps(dest_src, src);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -1233,12 +1337,22 @@ table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) {
|
|||
});
|
||||
|
||||
table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) {
|
||||
CheckBoolean(e, i->src1.value);
|
||||
if (IsIntType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type) || IsVecType(i->dest->type)) {
|
||||
Xmm dest, src2, src3;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src2.value, src2, 0,
|
||||
i->src3.value, src3, 0);
|
||||
// TODO(benvanik): find a way to do this without branches.
|
||||
e.inLocalLabel();
|
||||
e.movaps(dest, src3);
|
||||
e.jz(".skip");
|
||||
e.movaps(dest, src2);
|
||||
e.L(".skip");
|
||||
e.outLocalLabel();
|
||||
e.EndOp(dest, src2, src3);
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -1707,9 +1821,17 @@ table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) {
|
|||
if (IsIntType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) {
|
||||
if (i.dest->type == FLOAT32_TYPE) {
|
||||
e.vfmadd132ss(dest_src, src3, src2);
|
||||
} else {
|
||||
e.vfmadd132sd(dest_src, src3, src2);
|
||||
}
|
||||
});
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) {
|
||||
e.vfmadd132ps(dest_src, src3, src2);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -1721,9 +1843,17 @@ table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) {
|
|||
if (IsIntType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (IsFloatType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) {
|
||||
if (i.dest->type == FLOAT32_TYPE) {
|
||||
e.vfmsub132ss(dest_src, src3, src2);
|
||||
} else {
|
||||
e.vfmsub132sd(dest_src, src3, src2);
|
||||
}
|
||||
});
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) {
|
||||
e.vfmsub132ps(dest_src, src3, src2);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -1739,14 +1869,17 @@ table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) {
|
|||
} else if (IsFloatType(i->dest->type)) {
|
||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
if (i.src1.value->type == FLOAT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPS));
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPD));
|
||||
}
|
||||
});
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPS));
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
|
@ -1761,14 +1894,20 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) {
|
|||
} else if (IsFloatType(i->dest->type)) {
|
||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
if (i.src1.value->type == FLOAT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPS));
|
||||
e.vpandn(dest, e.xmm0, src);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPD));;
|
||||
e.vpandn(dest, e.xmm0, src);
|
||||
}
|
||||
});
|
||||
} else if (IsVecType(i->dest->type)) {
|
||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPS));;
|
||||
e.vpandn(dest, e.xmm0, src);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
|
@ -1848,7 +1987,6 @@ table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) {
|
|||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
// http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
||||
// TODO(benvanik): verify ordering
|
||||
e.db(0xCC);
|
||||
e.dpps(dest_src, src, B01110001);
|
||||
});
|
||||
} else {
|
||||
|
@ -1863,7 +2001,6 @@ table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) {
|
|||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
// http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
||||
// TODO(benvanik): verify ordering
|
||||
e.db(0xCC);
|
||||
e.dpps(dest_src, src, B11110001);
|
||||
});
|
||||
} else {
|
||||
|
@ -2020,7 +2157,16 @@ table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) {
|
|||
} else if (i->flags == INT16_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (i->flags == INT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
// src shift mask may have values >31, and x86 sets to zero when
|
||||
// that happens so we mask.
|
||||
e.db(0xCC);
|
||||
e.mov(e.eax, 0x1F);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vpbroadcastd(e.xmm0, e.xmm0);
|
||||
e.vandps(e.xmm0, src, e.xmm0);
|
||||
e.vpsllvd(dest_src, dest_src, e.xmm0);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -2038,7 +2184,15 @@ table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) {
|
|||
} else if (i->flags == INT16_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (i->flags == INT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
// src shift mask may have values >31, and x86 sets to zero when
|
||||
// that happens so we mask.
|
||||
e.mov(e.eax, 0x1F);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vpbroadcastd(e.xmm0, e.xmm0);
|
||||
e.vandps(e.xmm0, src, e.xmm0);
|
||||
e.vpsrlvd(dest_src, dest_src, src);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -2056,7 +2210,15 @@ table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) {
|
|||
} else if (i->flags == INT16_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (i->flags == INT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
|
||||
// src shift mask may have values >31, and x86 sets to zero when
|
||||
// that happens so we mask.
|
||||
e.mov(e.eax, 0x1F);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vpbroadcastd(e.xmm0, e.xmm0);
|
||||
e.vandps(e.xmm0, src, e.xmm0);
|
||||
e.vpsravd(dest_src, dest_src, src);
|
||||
});
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -2088,7 +2250,7 @@ table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) {
|
|||
|
||||
table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) {
|
||||
if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) {
|
||||
Reg16 d, s1;
|
||||
Reg16 dest, src1;
|
||||
// TODO(benvanik): fix register allocator to put the value in ABCD
|
||||
//e.BeginOp(i->dest, d, REG_DEST | REG_ABCD,
|
||||
// i->src1.value, s1, 0);
|
||||
|
@ -2098,45 +2260,42 @@ table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) {
|
|||
//} else {
|
||||
// e.xchg(d.cvt8(), Reg8(d.getIdx() + 4));
|
||||
//}
|
||||
e.BeginOp(i->dest, d, REG_DEST,
|
||||
i->src1.value, s1, 0);
|
||||
e.mov(e.ax, s1);
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
e.mov(e.ax, src1);
|
||||
e.xchg(e.ah, e.al);
|
||||
e.mov(d, e.ax);
|
||||
e.EndOp(d, s1);
|
||||
e.mov(dest, e.ax);
|
||||
e.EndOp(dest, src1);
|
||||
} else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) {
|
||||
Reg32 d, s1;
|
||||
e.BeginOp(i->dest, d, REG_DEST,
|
||||
i->src1.value, s1, 0);
|
||||
if (d != s1) {
|
||||
e.mov(d, s1);
|
||||
e.bswap(d);
|
||||
Reg32 dest, src1;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
if (dest != src1) {
|
||||
e.mov(dest, src1);
|
||||
e.bswap(dest);
|
||||
} else {
|
||||
e.bswap(d);
|
||||
e.bswap(dest);
|
||||
}
|
||||
e.EndOp(d, s1);
|
||||
e.EndOp(dest, src1);
|
||||
} else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) {
|
||||
Reg64 d, s1;
|
||||
e.BeginOp(i->dest, d, REG_DEST,
|
||||
i->src1.value, s1, 0);
|
||||
if (d != s1) {
|
||||
e.mov(d, s1);
|
||||
e.bswap(d);
|
||||
Reg64 dest, src1;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
if (dest != src1) {
|
||||
e.mov(dest, src1);
|
||||
e.bswap(dest);
|
||||
} else {
|
||||
e.bswap(d);
|
||||
e.bswap(dest);
|
||||
}
|
||||
e.EndOp(d, s1);
|
||||
e.EndOp(dest, src1);
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128)) {
|
||||
Xmm d, s1;
|
||||
e.db(0xCC);
|
||||
e.BeginOp(i->dest, d, REG_DEST,
|
||||
i->src1.value, s1, 0);
|
||||
if (d != s1) {
|
||||
e.shufps(d, s1, SHUFPS_SWAP_DWORDS);
|
||||
} else {
|
||||
e.shufps(d, d, SHUFPS_SWAP_DWORDS);
|
||||
}
|
||||
e.EndOp(d, s1);
|
||||
Xmm dest, src1;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
// TODO(benvanik): find a way to do this without the memory load.
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
e.vpshufb(dest, src1, XMMCONST(e.rax, XMMByteSwapMask));
|
||||
e.EndOp(dest, src1);
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -2278,36 +2437,67 @@ table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) {
|
|||
|
||||
table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) {
|
||||
if (IsVecType(i->dest->type)) {
|
||||
if (i->src1.value->type == INT8_TYPE) {
|
||||
if (i->Match(SIG_TYPE_V128, SIG_TYPE_I8)) {
|
||||
Xmm dest;
|
||||
Reg8 src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pinsrb(e.xmm0, src, 0);
|
||||
e.vmovd(e.xmm0, src.cvt32());
|
||||
e.vpbroadcastb(dest, e.xmm0);
|
||||
e.EndOp(dest, src);
|
||||
} else if (i->src1.value->type == INT16_TYPE) {
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I8C)) {
|
||||
Xmm dest;
|
||||
e.BeginOp(i->dest, dest, REG_DEST);
|
||||
// TODO(benvanik): faster constant splats.
|
||||
e.mov(e.eax, i->src1.value->constant.i8);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vpbroadcastb(dest, e.xmm0);
|
||||
e.EndOp(dest);
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I16)) {
|
||||
Xmm dest;
|
||||
Reg16 src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pinsrw(e.xmm0, src, 0);
|
||||
e.vmovd(e.xmm0, src.cvt32());
|
||||
e.vpbroadcastw(dest, e.xmm0);
|
||||
e.EndOp(dest, src);
|
||||
} else if (i->src1.value->type == INT32_TYPE) {
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I16C)) {
|
||||
Xmm dest;
|
||||
e.BeginOp(i->dest, dest, REG_DEST);
|
||||
// TODO(benvanik): faster constant splats.
|
||||
e.mov(e.eax, i->src1.value->constant.i16);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vpbroadcastw(dest, e.xmm0);
|
||||
e.EndOp(dest);
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I32)) {
|
||||
Xmm dest;
|
||||
Reg32 src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.pinsrd(e.xmm0, src, 0);
|
||||
e.vmovd(e.xmm0, src);
|
||||
e.vpbroadcastd(dest, e.xmm0);
|
||||
e.EndOp(dest, src);
|
||||
} else if (i->src1.value->type == FLOAT32_TYPE) {
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I32C)) {
|
||||
Xmm dest;
|
||||
e.BeginOp(i->dest, dest, REG_DEST);
|
||||
// TODO(benvanik): faster constant splats.
|
||||
e.mov(e.eax, i->src1.value->constant.i32);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vpbroadcastd(dest, e.xmm0);
|
||||
e.EndOp(dest);
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_F32)) {
|
||||
Xmm dest, src;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src, 0);
|
||||
e.vbroadcastss(dest, src);
|
||||
e.EndOp(dest, src);
|
||||
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_F32C)) {
|
||||
Xmm dest;
|
||||
e.BeginOp(i->dest, dest, REG_DEST);
|
||||
e.mov(e.eax, i->src1.value->constant.i32);
|
||||
e.vmovd(e.xmm0, e.eax);
|
||||
e.vbroadcastss(dest, e.xmm0);
|
||||
e.EndOp(dest);
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -2321,9 +2511,57 @@ table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) {
|
|||
table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) {
|
||||
if (IsVecType(i->dest->type)) {
|
||||
if (i->src1.value->type == INT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
// Permute words between src2 and src3.
|
||||
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
|
||||
if (i->src1.value->IsConstant()) {
|
||||
uint32_t control = i->src1.value->AsUint32();
|
||||
Xmm dest, src2, src3;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src2.value, src2, 0,
|
||||
i->src3.value, src3, 0);
|
||||
// Shuffle things into the right places in dest & xmm0,
|
||||
// then we blend them together.
|
||||
uint32_t src_control =
|
||||
(((control >> 24) & 0x3) << 0) |
|
||||
(((control >> 16) & 0x3) << 2) |
|
||||
(((control >> 8) & 0x3) << 4) |
|
||||
(((control >> 0) & 0x3) << 6);
|
||||
uint32_t blend_control =
|
||||
(((control >> 26) & 0x1) << 0) |
|
||||
(((control >> 18) & 0x1) << 1) |
|
||||
(((control >> 10) & 0x1) << 2) |
|
||||
(((control >> 2) & 0x1) << 3);
|
||||
if (dest != src3) {
|
||||
e.pshufd(dest, src2, src_control);
|
||||
e.pshufd(e.xmm0, src3, src_control);
|
||||
e.blendps(dest, e.xmm0, blend_control);
|
||||
} else {
|
||||
e.movaps(e.xmm0, src3);
|
||||
e.pshufd(dest, src2, src_control);
|
||||
e.pshufd(e.xmm0, e.xmm0, src_control);
|
||||
e.blendps(dest, e.xmm0, blend_control);
|
||||
}
|
||||
e.EndOp(dest, src2, src3);
|
||||
} else {
|
||||
Reg32 control;
|
||||
Xmm dest, src2, src3;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, control, 0,
|
||||
i->src2.value, src2, 0,
|
||||
i->src3.value, src3, 0);
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.EndOp(dest, control, src2, src3);
|
||||
}
|
||||
} else if (i->src1.value->type == VEC128_TYPE) {
|
||||
// Permute bytes between src2 and src3.
|
||||
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
|
||||
Xmm dest, control, src2, src3;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, control, 0,
|
||||
i->src2.value, src2, 0,
|
||||
i->src3.value, src3, 0);
|
||||
UNIMPLEMENTED_SEQ();
|
||||
e.EndOp(dest, control, src2, src3);
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -2339,7 +2577,11 @@ table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) {
|
|||
// Defined by SWIZZLE_MASK()
|
||||
if (i->flags == INT32_TYPE || i->flags == FLOAT32_TYPE) {
|
||||
uint8_t swizzle_mask = (uint8_t)i->src2.offset;
|
||||
e.db(0xCC);
|
||||
swizzle_mask =
|
||||
(((swizzle_mask >> 6) & 0x3) << 0) |
|
||||
(((swizzle_mask >> 4) & 0x3) << 2) |
|
||||
(((swizzle_mask >> 2) & 0x3) << 4) |
|
||||
(((swizzle_mask >> 0) & 0x3) << 6);
|
||||
Xmm dest, src1;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
|
@ -2392,7 +2634,7 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) {
|
|||
// Load source, move from tight pack of X16Y16.... to X16...Y16...
|
||||
// Also zero out the high end.
|
||||
// TODO(benvanik): special case constant unpacks that just get 0/1/etc.
|
||||
XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm&, const Xmm& src) {
|
||||
XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
// sx = src.iw >> 16;
|
||||
// sy = src.iw & 0xFFFF;
|
||||
// dest = { 3.0 + (sx / float(1 << 22)),
|
||||
|
@ -2410,11 +2652,31 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) {
|
|||
// Could be shared with FLOAT16_2.
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (i->flags == PACK_TYPE_SHORT_2) {
|
||||
// (VD.x) = 3.0 + (VB.x)*2^-22
|
||||
// (VD.y) = 3.0 + (VB.y)*2^-22
|
||||
// (VD.x) = 3.0 + (VB.x>>16)*2^-22
|
||||
// (VD.y) = 3.0 + (VB.x)*2^-22
|
||||
// (VD.z) = 0.0
|
||||
// (VD.w) = 3.0
|
||||
UNIMPLEMENTED_SEQ();
|
||||
// (VD.w) = 1.0
|
||||
XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||
// XMLoadShortN2 plus 3,3,0,3 (for some reason)
|
||||
// src is (xx,xx,xx,VALUE)
|
||||
e.mov(e.rax, XMMCONSTBASE);
|
||||
// (VALUE,VALUE,VALUE,VALUE)
|
||||
e.vbroadcastss(dest, src);
|
||||
// (VALUE&0xFFFF,VALUE&0xFFFF0000,0,0)
|
||||
e.andps(dest, XMMCONST(e.rax, XMMMaskX16Y16));
|
||||
// Sign extend.
|
||||
e.xorps(dest, XMMCONST(e.rax, XMMFlipX16Y16));
|
||||
// Convert int->float.
|
||||
e.cvtpi2ps(dest, Stash(e, dest));
|
||||
// 0x8000 to undo sign.
|
||||
e.addps(dest, XMMCONST(e.rax, XMMFixX16Y16));
|
||||
// Normalize.
|
||||
e.mulps(dest, XMMCONST(e.rax, XMMNormalizeX16Y16));
|
||||
// Clamp.
|
||||
e.maxps(dest, XMMCONST(e.rax, XMMNegativeOne));
|
||||
// Add 3,3,0,1.
|
||||
e.addps(dest, XMMCONST(e.rax, XMM3301));
|
||||
});
|
||||
} else if (i->flags == PACK_TYPE_S8_IN_16_LO) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
} else if (i->flags == PACK_TYPE_S8_IN_16_HI) {
|
||||
|
|
|
@ -18,8 +18,9 @@ namespace {
|
|||
#define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false)
|
||||
|
||||
Address Stash(X64Emitter& e, const Xmm& r) {
|
||||
auto addr = e.ptr[e.rsp + 40];
|
||||
e.movaps(addr, r);
|
||||
// TODO(benvanik): ensure aligned.
|
||||
auto addr = e.ptr[e.rsp + 48];
|
||||
e.movups(addr, r);
|
||||
return addr;
|
||||
}
|
||||
|
||||
|
@ -65,11 +66,22 @@ void CheckBoolean(X64Emitter& e, Value* v) {
|
|||
e.test(src, src);
|
||||
e.EndOp(src);
|
||||
} else if (v->type == FLOAT32_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
// TODO(benvanik): mask?
|
||||
Xmm src;
|
||||
e.BeginOp(v, src, 0);
|
||||
e.ptest(src, src);
|
||||
e.EndOp(src);
|
||||
} else if (v->type == FLOAT64_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
// TODO(benvanik): mask?
|
||||
Xmm src;
|
||||
e.BeginOp(v, src, 0);
|
||||
e.ptest(src, src);
|
||||
e.EndOp(src);
|
||||
} else if (v->type == VEC128_TYPE) {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
Xmm src;
|
||||
e.BeginOp(v, src, 0);
|
||||
e.ptest(src, src);
|
||||
e.EndOp(src);
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
|
@ -180,6 +192,52 @@ void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest,
|
|||
e.cmp(src2, e.rax);
|
||||
set_fn(e, dest, true);
|
||||
e.EndOp(dest, src2);
|
||||
} else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32)) {
|
||||
Reg8 dest;
|
||||
Xmm src1, src2;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0,
|
||||
i->src2.value, src2, 0);
|
||||
e.comiss(src1, src2);
|
||||
set_fn(e, dest, false);
|
||||
e.EndOp(dest, src1, src2);
|
||||
} else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32C)) {
|
||||
Reg8 dest;
|
||||
Xmm src1;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
if (i->src2.value->IsConstantZero()) {
|
||||
e.pxor(e.xmm0, e.xmm0);
|
||||
} else {
|
||||
e.mov(e.eax, (uint32_t)i->src2.value->constant.i32);
|
||||
e.pinsrd(e.xmm0, e.eax, 0);
|
||||
}
|
||||
e.comiss(src1, e.xmm0);
|
||||
set_fn(e, dest, false);
|
||||
e.EndOp(dest, src1);
|
||||
} else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64)) {
|
||||
Reg8 dest;
|
||||
Xmm src1, src2;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0,
|
||||
i->src2.value, src2, 0);
|
||||
e.comisd(src1, src2);
|
||||
set_fn(e, dest, false);
|
||||
e.EndOp(dest, src1, src2);
|
||||
} else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64C)) {
|
||||
Reg8 dest;
|
||||
Xmm src1;
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0);
|
||||
if (i->src2.value->IsConstantZero()) {
|
||||
e.pxor(e.xmm0, e.xmm0);
|
||||
} else {
|
||||
e.mov(e.rax, (uint64_t)i->src2.value->constant.i64);
|
||||
e.pinsrq(e.xmm0, e.rax, 0);
|
||||
}
|
||||
e.comisd(src1, e.xmm0);
|
||||
set_fn(e, dest, false);
|
||||
e.EndOp(dest, src1);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
|
@ -894,6 +952,52 @@ void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) {
|
|||
}
|
||||
};
|
||||
|
||||
typedef void(xmm_vvv_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3);
|
||||
void XmmTernaryOpVVV(X64Emitter& e, Instr*& i, xmm_vvv_fn vvv_fn,
|
||||
Xmm& dest, Xmm& src1, Xmm& src2, Xmm& src3) {
|
||||
e.BeginOp(i->dest, dest, REG_DEST,
|
||||
i->src1.value, src1, 0,
|
||||
i->src2.value, src2, 0,
|
||||
i->src3.value, src3, 0);
|
||||
if (dest == src1) {
|
||||
vvv_fn(e, *i, dest, src2, src3);
|
||||
} else if (dest == src2) {
|
||||
if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) {
|
||||
vvv_fn(e, *i, dest, src1, src3);
|
||||
} else {
|
||||
// Eww.
|
||||
e.movaps(e.xmm0, src1);
|
||||
vvv_fn(e, *i, e.xmm0, src2, src3);
|
||||
e.movaps(dest, e.xmm0);
|
||||
}
|
||||
} else if (dest == src3) {
|
||||
if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) {
|
||||
vvv_fn(e, *i, dest, src1, src2);
|
||||
} else {
|
||||
UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
} else {
|
||||
e.movaps(dest, src1);
|
||||
vvv_fn(e, *i, dest, src2, src3);
|
||||
}
|
||||
e.EndOp(dest, src1, src2, src3);
|
||||
}
|
||||
void XmmTernaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vvv_fn vvv_fn) {
|
||||
// TODO(benvanik): table lookup. This linear scan is slow.
|
||||
if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant() &&
|
||||
!i->src3.value->IsConstant()) {
|
||||
Xmm dest, src1, src2, src3;
|
||||
XmmTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3);
|
||||
} else {
|
||||
ASSERT_INVALID_TYPE();
|
||||
}
|
||||
if (flags & ARITHMETIC_SET_CARRY) {
|
||||
// EFLAGS should have CA set?
|
||||
// (so long as we don't fuck with it)
|
||||
// UNIMPLEMENTED_SEQ();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_
|
||||
|
|
|
@ -24,15 +24,15 @@ namespace x64 {
|
|||
namespace lowering {
|
||||
|
||||
|
||||
#define IPRINT
|
||||
#define IFLUSH()
|
||||
#define DPRINT
|
||||
#define IPRINT
|
||||
#define DFLUSH()
|
||||
#define DPRINT
|
||||
|
||||
#define IPRINT if (thread_state->thread_id() == 1) printf
|
||||
#define IFLUSH() fflush(stdout)
|
||||
#define DPRINT if (thread_state->thread_id() == 1) printf
|
||||
#define IPRINT if (thread_state->thread_id() == 1) printf
|
||||
#define DFLUSH() fflush(stdout)
|
||||
#define DPRINT DFLUSH(); if (thread_state->thread_id() == 1) printf
|
||||
|
||||
|
||||
void TraceString(void* raw_context, const char* str) {
|
||||
|
|
|
@ -45,6 +45,16 @@ typedef struct XECACHEALIGN vec128_s {
|
|||
};
|
||||
};
|
||||
} vec128_t;
|
||||
XEFORCEINLINE vec128_t vec128i(uint32_t x, uint32_t y, uint32_t z, uint32_t w) {
|
||||
vec128_t v;
|
||||
v.i4[0] = x; v.i4[1] = y; v.i4[2] = z; v.i4[3] = w;
|
||||
return v;
|
||||
}
|
||||
XEFORCEINLINE vec128_t vec128f(float x, float y, float z, float w) {
|
||||
vec128_t v;
|
||||
v.f4[0] = x; v.f4[1] = y; v.f4[2] = z; v.f4[3] = w;
|
||||
return v;
|
||||
}
|
||||
|
||||
} // namespace alloy
|
||||
|
||||
|
|
Loading…
Reference in New Issue