PERMUTE by V128 and fix some ops.

This commit is contained in:
Ben Vanik 2014-02-02 11:23:03 -08:00
parent 44c29a6691
commit 14d6855b6d
3 changed files with 93 additions and 28 deletions

View File

@ -2456,11 +2456,11 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) {
} else if (IsVecType(i->dest->type)) {
XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
// dest_src ^= 0xFFFF...
e.cmpeqps(e.xmm0, e.xmm0);
if (dest != src) {
e.movaps(dest, src);
}
e.pxor(dest, e.xmm0);
e.mov(e.rax, XMMCONSTBASE);
e.pxor(dest, XMMCONST(e.rax, XMMOne));
});
} else {
ASSERT_INVALID_TYPE();
@ -2937,14 +2937,64 @@ table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) {
}
} else if (i->src1.value->type == VEC128_TYPE) {
// Permute bytes between src2 and src3.
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
Xmm dest, control, src2, src3;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, control, 0,
i->src2.value, src2, 0,
i->src3.value, src3, 0);
UNIMPLEMENTED_SEQ();
e.EndOp(dest, control, src2, src3);
if (i->src3.value->IsConstantZero()) {
// Permuting with src2/zero, so just shuffle/mask.
Xmm dest, control, src2;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, control, 0,
i->src2.value, src2, 0);
if (i->src2.value->IsConstantZero()) {
e.vpxor(dest, src2, src2);
} else {
if (i->src2.value->IsConstant()) {
LoadXmmConstant(e, src2, i->src2.value->constant.v128);
}
// Control mask needs to be shuffled.
e.mov(e.rax, XMMCONSTBASE);
e.vpshufb(e.xmm0, control, XMMCONST(e.rax, XMMByteSwapMask));
e.vpshufb(dest, src2, e.xmm0);
// Build a mask with values in src2 having 0 and values in src3 having 1.
e.vpcmpgtb(e.xmm0, e.xmm0, XMMCONST(e.rax, XMMPermuteControl15));
e.vpandn(dest, e.xmm0, dest);
}
e.EndOp(dest, control, src2);
} else {
// General permute.
Xmm dest, control, src2, src3;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, control, 0,
i->src2.value, src2, 0,
i->src3.value, src3, 0);
e.mov(e.rax, XMMCONSTBASE);
// Control mask needs to be shuffled.
e.vpshufb(e.xmm1, control, XMMCONST(e.rax, XMMByteSwapMask));
// Build a mask with values in src2 having 0 and values in src3 having 1.
e.vpcmpgtb(dest, e.xmm1, XMMCONST(e.rax, XMMPermuteControl15));
Xmm src2_shuf, src3_shuf;
if (i->src2.value->IsConstantZero()) {
e.vpxor(src2, src2);
src2_shuf = src2;
} else {
if (i->src2.value->IsConstant()) {
LoadXmmConstant(e, src2, i->src2.value->constant.v128);
}
src2_shuf = e.xmm0;
e.vpshufb(src2_shuf, src2, e.xmm1);
}
if (i->src3.value->IsConstantZero()) {
e.vpxor(src3, src3);
src3_shuf = src3;
} else {
if (i->src3.value->IsConstant()) {
LoadXmmConstant(e, src3, i->src3.value->constant.v128);
}
// NOTE: reusing xmm1 here.
src3_shuf = e.xmm1;
e.vpshufb(src3_shuf, src3, e.xmm1);
}
e.vpblendvb(dest, src2_shuf, src3_shuf, dest);
e.EndOp(dest, control, src2, src3);
}
} else {
ASSERT_INVALID_TYPE();
}

View File

@ -52,9 +52,18 @@ Address Stash(X64Emitter& e, const Xmm& r) {
}
void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) {
e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
if (!v.low && !v.high) {
// zero
e.vpxor(dest, dest);
//} else if (v.low == ~0ull && v.high == ~0ull) {
// one
// TODO(benvanik): XMMCONST?
} else {
// TODO(benvanik): more efficient loading of partial values?
e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
e.vmovaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
}
}
// Moves a 64bit immediate into memory.
@ -539,8 +548,14 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn,
vv_fn(e, *i, dest, Ntx);
}
} else {
e.mov(dest, src2);
vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) {
e.mov(dest, src2);
vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
} else {
// Need a cv_fn. Or a better way to do all of this.
e.mov(dest, (uint32_t)src1->get_constant(CT()));
vv_fn(e, *i, dest, src2);
}
}
} else {
// 64-bit.

View File

@ -102,21 +102,21 @@ void* X64Emitter::Emplace(size_t stack_size) {
int X64Emitter::Emit(HIRBuilder* builder) {
// These are the registers we will not be using. All others are fair game.
const uint32_t reserved_regs =
GetRegBit(rax) |
GetRegBit(rcx) |
GetRegBit(rdx) |
GetRegBit(rsp) |
GetRegBit(rbp) |
GetRegBit(rsi) |
GetRegBit(rdi) |
GetRegBit(xmm0) |
GetRegBit(rax) | // scratch
GetRegBit(rcx) | // arg
GetRegBit(rdx) | // arg/clobbered
GetRegBit(rsp) |
GetRegBit(rbp) |
GetRegBit(rsi) |
GetRegBit(rdi) |
GetRegBit(r8) | // arg/clobbered
GetRegBit(xmm0) | // scratch
GetRegBit(xmm1) | // sometimes used for scratch, could be fixed
// TODO(benvanik): save so that we can use these.
GetRegBit(r8) |
GetRegBit(r9) |
GetRegBit(r10) |
GetRegBit(r11) |
GetRegBit(xmm1) |
GetRegBit(r9) |
GetRegBit(r10) |
GetRegBit(r11) |
GetRegBit(xmm2) |
GetRegBit(xmm3) |
GetRegBit(xmm4) |