PERMUTE by V128 and fixing some ops.
parent 44c29a6691
commit 14d6855b6d
@@ -2456,11 +2456,11 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) {
  } else if (IsVecType(i->dest->type)) {
    XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
      // dest_src ^= 0xFFFF...
      e.cmpeqps(e.xmm0, e.xmm0);
      if (dest != src) {
        e.movaps(dest, src);
      }
      e.pxor(dest, e.xmm0);
      e.mov(e.rax, XMMCONSTBASE);
      e.pxor(dest, XMMCONST(e.rax, XMMOne));
    });
  } else {
    ASSERT_INVALID_TYPE();
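A note on the OPCODE_NOT change above: the comment spells out the idea, dest = src XOR an all-ones mask. Below is a standalone sketch of that technique with SSE intrinsics, not part of the commit; the new lines in the hunk instead fetch their operand from the constant table via XMMCONSTBASE/XMMCONST rather than materializing it with cmpeqps.

#include <emmintrin.h>  // SSE2

// dest = ~src, computed as src ^ 0xFFFF...FFFF.
__m128i VectorNot(__m128i src) {
  // Integer compare-equal of a register with itself sets every bit
  // unconditionally (the removed cmpeqps line builds the same mask for
  // non-NaN float contents).
  __m128i all_ones = _mm_cmpeq_epi32(src, src);
  return _mm_xor_si128(src, all_ones);
}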
@@ -2937,14 +2937,64 @@ table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) {
    }
  } else if (i->src1.value->type == VEC128_TYPE) {
    // Permute bytes between src2 and src3.
    // TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
    Xmm dest, control, src2, src3;
    e.BeginOp(i->dest, dest, REG_DEST,
              i->src1.value, control, 0,
              i->src2.value, src2, 0,
              i->src3.value, src3, 0);
    UNIMPLEMENTED_SEQ();
    e.EndOp(dest, control, src2, src3);
    if (i->src3.value->IsConstantZero()) {
      // Permuting with src2/zero, so just shuffle/mask.
      Xmm dest, control, src2;
      e.BeginOp(i->dest, dest, REG_DEST,
                i->src1.value, control, 0,
                i->src2.value, src2, 0);
      if (i->src2.value->IsConstantZero()) {
        e.vpxor(dest, src2, src2);
      } else {
        if (i->src2.value->IsConstant()) {
          LoadXmmConstant(e, src2, i->src2.value->constant.v128);
        }
        // Control mask needs to be shuffled.
        e.mov(e.rax, XMMCONSTBASE);
        e.vpshufb(e.xmm0, control, XMMCONST(e.rax, XMMByteSwapMask));
        e.vpshufb(dest, src2, e.xmm0);
        // Build a mask with values in src2 having 0 and values in src3 having 1.
        e.vpcmpgtb(e.xmm0, e.xmm0, XMMCONST(e.rax, XMMPermuteControl15));
        e.vpandn(dest, e.xmm0, dest);
      }
      e.EndOp(dest, control, src2);
    } else {
      // General permute.
      Xmm dest, control, src2, src3;
      e.BeginOp(i->dest, dest, REG_DEST,
                i->src1.value, control, 0,
                i->src2.value, src2, 0,
                i->src3.value, src3, 0);
      e.mov(e.rax, XMMCONSTBASE);
      // Control mask needs to be shuffled.
      e.vpshufb(e.xmm1, control, XMMCONST(e.rax, XMMByteSwapMask));
      // Build a mask with values in src2 having 0 and values in src3 having 1.
      e.vpcmpgtb(dest, e.xmm1, XMMCONST(e.rax, XMMPermuteControl15));
      Xmm src2_shuf, src3_shuf;
      if (i->src2.value->IsConstantZero()) {
        e.vpxor(src2, src2);
        src2_shuf = src2;
      } else {
        if (i->src2.value->IsConstant()) {
          LoadXmmConstant(e, src2, i->src2.value->constant.v128);
        }
        src2_shuf = e.xmm0;
        e.vpshufb(src2_shuf, src2, e.xmm1);
      }
      if (i->src3.value->IsConstantZero()) {
        e.vpxor(src3, src3);
        src3_shuf = src3;
      } else {
        if (i->src3.value->IsConstant()) {
          LoadXmmConstant(e, src3, i->src3.value->constant.v128);
        }
        // NOTE: reusing xmm1 here.
        src3_shuf = e.xmm1;
        e.vpshufb(src3_shuf, src3, e.xmm1);
      }
      e.vpblendvb(dest, src2_shuf, src3_shuf, dest);
      e.EndOp(dest, control, src2, src3);
    }
  } else {
    ASSERT_INVALID_TYPE();
  }
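The general-permute path above is a select-between-two-vectors pattern: shuffle both sources by the low nibble of each control byte, derive a per-byte selector by comparing the control against 15, then blend. Below is a standalone sketch with SSE intrinsics; it leaves out the byte-swap of the control that the emitted code performs via XMMByteSwapMask, and the function and variable names are illustrative, not from the source.

#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8
#include <smmintrin.h>  // SSE4.1: _mm_blendv_epi8

// Select bytes from two 16-byte sources using a control vector whose byte
// values are 0-31: 0-15 pick from a, 16-31 pick from b.
__m128i PermuteBytes(__m128i control, __m128i a, __m128i b) {
  // pshufb only uses the low 4 bits of each control byte (plus the sign bit
  // for zeroing), so the same control works for both shuffles.
  __m128i a_shuf = _mm_shuffle_epi8(a, control);
  __m128i b_shuf = _mm_shuffle_epi8(b, control);
  // Control bytes greater than 15 should take from b; the signed byte
  // compare yields 0xFF there and 0x00 elsewhere.
  __m128i take_b = _mm_cmpgt_epi8(control, _mm_set1_epi8(15));
  // blendv picks the b_shuf byte wherever the mask's high bit is set.
  return _mm_blendv_epi8(a_shuf, b_shuf, take_b);
}

The src3-is-zero path in the hunk is the degenerate case of the same idea: only one shuffle is needed, and the compare result is used as an AND-NOT mask instead of a blend selector.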
@@ -52,9 +52,18 @@ Address Stash(X64Emitter& e, const Xmm& r) {
}

void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) {
  e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
  e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
  e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
  if (!v.low && !v.high) {
    // zero
    e.vpxor(dest, dest);
  //} else if (v.low == ~0ull && v.high == ~0ull) {
    // one
    // TODO(benvanik): XMMCONST?
  } else {
    // TODO(benvanik): more efficient loading of partial values?
    e.mov(e.qword[e.rsp + STASH_OFFSET], v.low);
    e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high);
    e.vmovaps(dest, e.ptr[e.rsp + STASH_OFFSET]);
  }
}

// Moves a 64bit immediate into memory.
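LoadXmmConstant above special-cases an all-zero constant and otherwise writes the two 64-bit halves into the stack stash slot and reloads them as one 128-bit value. Here is a plain C++ sketch of the same idea; the low/high members of vec128_t are taken from the diff, the rest is illustrative.

#include <cstdint>
#include <emmintrin.h>

struct vec128_t {
  uint64_t low;
  uint64_t high;
};

// Zero gets a cheap register xor; anything else goes through a 16-byte
// temporary, standing in for the [rsp + STASH_OFFSET] slot in the diff.
__m128i LoadVec128Constant(const vec128_t& v) {
  if (!v.low && !v.high) {
    return _mm_setzero_si128();  // same effect as the vpxor(dest, dest) path
  }
  alignas(16) uint64_t stash[2] = {v.low, v.high};
  return _mm_load_si128(reinterpret_cast<const __m128i*>(stash));
}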
@@ -539,8 +548,14 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn,
      vv_fn(e, *i, dest, Ntx);
    }
  } else {
    e.mov(dest, src2);
    vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
    if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) {
      e.mov(dest, src2);
      vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT()));
    } else {
      // Need a cv_fn. Or a better way to do all of this.
      e.mov(dest, (uint32_t)src1->get_constant(CT()));
      vv_fn(e, *i, dest, src2);
    }
  }
} else {
  // 64-bit.
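The IntBinaryOpCV branch above handles a constant left operand: for a commutative op the constant can be folded directly into the register operand, while a non-commutative op has to move the constant into dest first and then apply the register operand to it. A sketch of that distinction, using ordinary integer add and subtract as stand-ins for the emitted vv_fn/vc_fn callbacks.

#include <cstdint>

// Commutative case: dest starts as the register operand, the constant is
// folded in afterwards (mirrors the mov + vc_fn pair in the hunk).
uint32_t AddConstantLhs(uint32_t constant, uint32_t src2) {
  uint32_t dest = src2;      // e.mov(dest, src2)
  dest += constant;          // vc_fn(dest, constant)
  return dest;
}

// Non-commutative case: operand order matters, so the constant has to end
// up in dest first and the register operand is applied to it (the
// else-branch of the hunk).
uint32_t SubConstantLhs(uint32_t constant, uint32_t src2) {
  uint32_t dest = constant;  // e.mov(dest, constant)
  dest -= src2;              // vv_fn(dest, src2)
  return dest;
}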
@@ -102,21 +102,21 @@ void* X64Emitter::Emplace(size_t stack_size) {
int X64Emitter::Emit(HIRBuilder* builder) {
  // These are the registers we will not be using. All others are fair game.
  const uint32_t reserved_regs =
      GetRegBit(rax) |
      GetRegBit(rcx) |
      GetRegBit(rdx) |
      GetRegBit(rsp) |
      GetRegBit(rbp) |
      GetRegBit(rsi) |
      GetRegBit(rdi) |
      GetRegBit(xmm0) |
      GetRegBit(rax) | // scratch
      GetRegBit(rcx) | // arg
      GetRegBit(rdx) | // arg/clobbered
      GetRegBit(rsp) |
      GetRegBit(rbp) |
      GetRegBit(rsi) |
      GetRegBit(rdi) |
      GetRegBit(r8) | // arg/clobbered
      GetRegBit(xmm0) | // scratch
      GetRegBit(xmm1) | // sometimes used for scratch, could be fixed

      // TODO(benvanik): save so that we can use these.
      GetRegBit(r8) |
      GetRegBit(r9) |
      GetRegBit(r10) |
      GetRegBit(r11) |
      GetRegBit(xmm1) |
      GetRegBit(r9) |
      GetRegBit(r10) |
      GetRegBit(r11) |
      GetRegBit(xmm2) |
      GetRegBit(xmm3) |
      GetRegBit(xmm4) |
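For context on the register list above: reserved_regs is a bitmask with one bit per register, OR'ed together so that those registers are never handed out; everything outside the mask is, as the comment says, fair game. A toy sketch of the pattern follows; the enum values and helper names are invented for illustration, and only the register roles in the comments come from the diff.

#include <cstdint>

// One bit per register id. An allocator checks the mask before giving a
// register out.
enum RegId : uint32_t { kRax, kRcx, kRdx, kRsp, kRbp, kRsi, kRdi, kR8, kXmm0, kXmm1 };

constexpr uint32_t RegBit(RegId r) { return 1u << r; }

constexpr uint32_t kReservedRegs =
    RegBit(kRax) |   // scratch
    RegBit(kRcx) |   // arg
    RegBit(kRdx) |   // arg/clobbered
    RegBit(kRsp) | RegBit(kRbp) | RegBit(kRsi) | RegBit(kRdi) |
    RegBit(kR8) |    // arg/clobbered
    RegBit(kXmm0) |  // scratch
    RegBit(kXmm1);   // sometimes used for scratch

constexpr bool IsReserved(RegId r) { return (kReservedRegs & RegBit(r)) != 0; }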