PERMUTE by V128 and fixing some ops.
This commit is contained in:
parent
44c29a6691
commit
14d6855b6d
|
@ -2456,11 +2456,11 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) {
|
||||||
} else if (IsVecType(i->dest->type)) {
|
} else if (IsVecType(i->dest->type)) {
|
||||||
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
|
||||||
// dest_src ^= 0xFFFF...
|
// dest_src ^= 0xFFFF...
|
||||||
e.cmpeqps(e.xmm0, e.xmm0);
|
|
||||||
if (dest != src) {
|
if (dest != src) {
|
||||||
e.movaps(dest, src);
|
e.movaps(dest, src);
|
||||||
}
|
}
|
||||||
e.pxor(dest, e.xmm0);
|
e.mov(e.rax, XMMCONSTBASE);
|
||||||
|
e.pxor(dest, XMMCONST(e.rax, XMMOne));
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
ASSERT_INVALID_TYPE();
|
ASSERT_INVALID_TYPE();
|
||||||
|
@ -2937,14 +2937,64 @@ table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) {
|
||||||
}
|
}
|
||||||
} else if (i->src1.value->type == VEC128_TYPE) {
|
} else if (i->src1.value->type == VEC128_TYPE) {
|
||||||
// Permute bytes between src2 and src3.
|
// Permute bytes between src2 and src3.
|
||||||
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
|
if (i->src3.value->IsConstantZero()) {
|
||||||
Xmm dest, control, src2, src3;
|
// Permuting with src2/zero, so just shuffle/mask.
|
||||||
e.BeginOp(i->dest, dest, REG_DEST,
|
Xmm dest, control, src2;
|
||||||
i->src1.value, control, 0,
|
e.BeginOp(i->dest, dest, REG_DEST,
|
||||||
i->src2.value, src2, 0,
|
i->src1.value, control, 0,
|
||||||
i->src3.value, src3, 0);
|
i->src2.value, src2, 0);
|
||||||
UNIMPLEMENTED_SEQ();
|
if (i->src2.value->IsConstantZero()) {
|
||||||
e.EndOp(dest, control, src2, src3);
|
e.vpxor(dest, src2, src2);
|
||||||
|
} else {
|
||||||
|
if (i->src2.value->IsConstant()) {
|
||||||
|
LoadXmmConstant(e, src2, i->src2.value->constant.v128);
|
||||||
|
}
|
||||||
|
// Control mask needs to be shuffled.
|
||||||
|
e.mov(e.rax, XMMCONSTBASE);
|
||||||
|
e.vpshufb(e.xmm0, control, XMMCONST(e.rax, XMMByteSwapMask));
|
||||||
|
e.vpshufb(dest, src2, e.xmm0);
|
||||||
|
// Build a mask with values in src2 having 0 and values in src3 having 1.
|
||||||
|
e.vpcmpgtb(e.xmm0, e.xmm0, XMMCONST(e.rax, XMMPermuteControl15));
|
||||||
|
e.vpandn(dest, e.xmm0, dest);
|
||||||
|
}
|
||||||
|
e.EndOp(dest, control, src2);
|
||||||
|
} else {
|
||||||
|
// General permute.
|
||||||
|
Xmm dest, control, src2, src3;
|
||||||
|
e.BeginOp(i->dest, dest, REG_DEST,
|
||||||
|
i->src1.value, control, 0,
|
||||||
|
i->src2.value, src2, 0,
|
||||||
|
i->src3.value, src3, 0);
|
||||||
|
e.mov(e.rax, XMMCONSTBASE);
|
||||||
|
// Control mask needs to be shuffled.
|
||||||
|
e.vpshufb(e.xmm1, control, XMMCONST(e.rax, XMMByteSwapMask));
|
||||||
|
// Build a mask with values in src2 having 0 and values in src3 having 1.
|
||||||
|
e.vpcmpgtb(dest, e.xmm1, XMMCONST(e.rax, XMMPermuteControl15));
|
||||||
|
Xmm src2_shuf, src3_shuf;
|
||||||
|
if (i->src2.value->IsConstantZero()) {
|
||||||
|
e.vpxor(src2, src2);
|
||||||
|
src2_shuf = src2;
|
||||||
|
} else {
|
||||||
|
if (i->src2.value->IsConstant()) {
|
||||||
|
LoadXmmConstant(e, src2, i->src2.value->constant.v128);
|
||||||
|
}
|
||||||
|
src2_shuf = e.xmm0;
|
||||||
|
e.vpshufb(src2_shuf, src2, e.xmm1);
|
||||||
|
}
|
||||||
|
if (i->src3.value->IsConstantZero()) {
|
||||||
|
e.vpxor(src3, src3);
|
||||||
|
src3_shuf = src3;
|
||||||
|
} else {
|
||||||
|
if (i->src3.value->IsConstant()) {
|
||||||
|
LoadXmmConstant(e, src3, i->src3.value->constant.v128);
|
||||||
|
}
|
||||||
|
// NOTE: reusing xmm1 here.
|
||||||
|
src3_shuf = e.xmm1;
|
||||||
|
e.vpshufb(src3_shuf, src3, e.xmm1);
|
||||||
|
}
|
||||||
|
e.vpblendvb(dest, src2_shuf, src3_shuf, dest);
|
||||||
|
e.EndOp(dest, control, src2, src3);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ASSERT_INVALID_TYPE();
|
ASSERT_INVALID_TYPE();
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,9 +52,18 @@ Address Stash(X64Emitter& e, const Xmm& r) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) {
|
void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) {
|
||||||
e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
|
if (!v.low && !v.high) {
|
||||||
e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
|
// zero
|
||||||
e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
|
e.vpxor(dest, dest);
|
||||||
|
//} else if (v.low == ~0ull && v.high == ~0ull) {
|
||||||
|
// one
|
||||||
|
// TODO(benvanik): XMMCONST?
|
||||||
|
} else {
|
||||||
|
// TODO(benvanik): more efficient loading of partial values?
|
||||||
|
e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
|
||||||
|
e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
|
||||||
|
e.vmovaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Moves a 64bit immediate into memory.
|
// Moves a 64bit immediate into memory.
|
||||||
|
@ -539,8 +548,14 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn,
|
||||||
vv_fn(e, *i, dest, Ntx);
|
vv_fn(e, *i, dest, Ntx);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
e.mov(dest, src2);
|
if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) {
|
||||||
vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
|
e.mov(dest, src2);
|
||||||
|
vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
|
||||||
|
} else {
|
||||||
|
// Need a cv_fn. Or a better way to do all of this.
|
||||||
|
e.mov(dest, (uint32_t)src1->get_constant(CT()));
|
||||||
|
vv_fn(e, *i, dest, src2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// 64-bit.
|
// 64-bit.
|
||||||
|
|
|
@ -102,21 +102,21 @@ void* X64Emitter::Emplace(size_t stack_size) {
|
||||||
int X64Emitter::Emit(HIRBuilder* builder) {
|
int X64Emitter::Emit(HIRBuilder* builder) {
|
||||||
// These are the registers we will not be using. All others are fair game.
|
// These are the registers we will not be using. All others are fair game.
|
||||||
const uint32_t reserved_regs =
|
const uint32_t reserved_regs =
|
||||||
GetRegBit(rax) |
|
GetRegBit(rax) | // scratch
|
||||||
GetRegBit(rcx) |
|
GetRegBit(rcx) | // arg
|
||||||
GetRegBit(rdx) |
|
GetRegBit(rdx) | // arg/clobbered
|
||||||
GetRegBit(rsp) |
|
GetRegBit(rsp) |
|
||||||
GetRegBit(rbp) |
|
GetRegBit(rbp) |
|
||||||
GetRegBit(rsi) |
|
GetRegBit(rsi) |
|
||||||
GetRegBit(rdi) |
|
GetRegBit(rdi) |
|
||||||
GetRegBit(xmm0) |
|
GetRegBit(r8) | // arg/clobbered
|
||||||
|
GetRegBit(xmm0) | // scratch
|
||||||
|
GetRegBit(xmm1) | // sometimes used for scratch, could be fixed
|
||||||
|
|
||||||
// TODO(benvanik): save so that we can use these.
|
// TODO(benvanik): save so that we can use these.
|
||||||
GetRegBit(r8) |
|
GetRegBit(r9) |
|
||||||
GetRegBit(r9) |
|
GetRegBit(r10) |
|
||||||
GetRegBit(r10) |
|
GetRegBit(r11) |
|
||||||
GetRegBit(r11) |
|
|
||||||
GetRegBit(xmm1) |
|
|
||||||
GetRegBit(xmm2) |
|
GetRegBit(xmm2) |
|
||||||
GetRegBit(xmm3) |
|
GetRegBit(xmm3) |
|
||||||
GetRegBit(xmm4) |
|
GetRegBit(xmm4) |
|
||||||
|
|
Loading…
Reference in New Issue