Juicy optimization goodness

chss95cs@gmail.com 2020-01-23 20:00:18 -08:00 committed by illusion
parent 27d12f1130
commit 364937e836
2 changed files with 70 additions and 17 deletions


@@ -802,6 +802,13 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// 1111...
vpcmpeqb(dest, dest);
} else {
for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i] == v) {
vmovapd(dest, GetXmmConstPtr((XmmConst)i));
return;
}
}
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
MovMem64(rsp + kStashOffset, v.low);
@@ -817,13 +824,25 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
} x = {v};
if (!v) {
// 0
vpxor(dest, dest);
vxorps(dest, dest);
} else if (x.i == ~0U) {
// 1111...
vpcmpeqb(dest, dest);
vcmpeqss(dest, dest);
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
unsigned raw_bits = x.i;
for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i].u32[0] == raw_bits) {
vmovss(dest, GetXmmConstPtr((XmmConst)i));
return;
}
}
mov(eax, x.i);
vmovd(dest, eax);
}
@@ -836,13 +855,24 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
} x = {v};
if (!v) {
// 0
vpxor(dest, dest);
vxorpd(dest, dest);
} else if (x.i == ~0ULL) {
// 1111...
vpcmpeqb(dest, dest);
vcmpeqpd(dest, dest);
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
uint64_t raw_bits = x.i;
for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i].u64[0] == raw_bits) {
vmovsd(dest, GetXmmConstPtr((XmmConst)i));
return;
}
}
mov(rax, x.i);
vmovq(dest, rax);
}

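The hunks above add a lookup over the existing XMM constant table before falling back to materializing a constant through a GPR. A minimal standalone sketch of that lookup, with invented names (Vec128, kXmmConsts, FindFloatConstIndex) standing in for the emitter's vec128_t, xmm_consts, and kConstDataSize:

#include <cstdint>
#include <cstring>

// Stand-in for the emitter's 128-bit constant table (names are illustrative).
struct Vec128 {
  uint32_t u32[4];
};

static const Vec128 kXmmConsts[] = {
    {{0x3F800000u, 0x3F800000u, 0x3F800000u, 0x3F800000u}},  // 1.0f splat
    {{0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u}},  // sign-bit mask
};

// Return the index of a table entry whose low lane matches the float's raw
// bits, or -1 if there is none. On a hit the emitter can vmovss straight from
// the table; on a miss it falls back to mov eax, imm32 + vmovd.
int FindFloatConstIndex(float v) {
  uint32_t raw_bits;
  std::memcpy(&raw_bits, &v, sizeof(raw_bits));
  for (unsigned i = 0; i < sizeof(kXmmConsts) / sizeof(kXmmConsts[0]); ++i) {
    if (kXmmConsts[i].u32[0] == raw_bits) {
      return static_cast<int>(i);
    }
  }
  return -1;
}

The vec128_t and double overloads in the diff work the same way, comparing the full 128 bits or the low 64-bit lane respectively.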

@@ -317,14 +317,12 @@ struct CONVERT_I32_F64
struct CONVERT_I64_F64
: Sequence<CONVERT_I64_F64, I<OPCODE_CONVERT, I64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.xor_(e.eax, e.eax);
e.xor_ (e.eax, e.eax);
e.vcomisd(i.src1, e.GetXmmConstPtr(XmmConst::XMMZero));
if (i.instr->flags == ROUND_TO_ZERO) {
e.vcvttsd2si(i.dest, i.src1);
}
else {
} else {
e.vcvtsd2si(i.dest, i.src1);
}
// cf set if less than
@@ -332,7 +330,7 @@ struct CONVERT_I64_F64
e.cmp(i.dest, -1LL);
// if dest == 0x8000000000000000 and not inp < 0 then dest = 0x7FFFFFFFFFFFFFFF
e.seto(e.al);
e.and_ (e.al, e.cl);
e.and_(e.al, e.cl);
e.sub(i.dest, e.rax);
}
};
@@ -1204,6 +1202,9 @@ template <typename SEQ, typename REG, typename ARGS>
void EmitAddCarryXX(X64Emitter& e, const ARGS& i) {
// TODO(benvanik): faster setting? we could probably do some fun math tricks
// here to get the carry flag set.
// chrisps: faster carry setting now, but I think the i.src3.is_constant
// check is dead code
if (i.src3.is_constant) {
if (i.src3.constant()) {
e.stc();
@@ -1211,15 +1212,9 @@ void EmitAddCarryXX(X64Emitter& e, const ARGS& i) {
e.clc();
}
} else {
if (i.src3.reg().getIdx() <= 4) {
// Can move from A/B/C/DX to AH.
e.mov(e.ah, i.src3.reg().cvt8());
} else {
e.mov(e.al, i.src3);
e.mov(e.ah, e.al);
}
e.sahf();
e.bt(i.src3.reg().cvt32(), 0);
}
SEQ::EmitCommutativeBinaryOp(
e, i,
[](X64Emitter& e, const REG& dest_src, const REG& src) {
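The change above swaps the mov-into-AH plus sahf sequence for a single bt of the carry operand's low bit. A semantics-only sketch in plain C++ (function name invented) of what the emitted sequence computes:

#include <cstdint>

// bt reg32, 0 copies bit 0 of the carry operand into CF, and the
// add-with-carry that the helper emits afterwards consumes that flag.
uint64_t AddWithCarrySemantics(uint64_t src1, uint64_t src2, uint64_t src3) {
  uint64_t carry_in = src3 & 1;   // what "bt src3, 0" leaves in CF
  return src1 + src2 + carry_in;  // what the subsequent adc computes
}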
@@ -2696,6 +2691,34 @@ void EmitAndXX(X64Emitter& e, const ARGS& i) {
e.and_(dest_src, src);
},
[](X64Emitter& e, const REG& dest_src, int32_t constant) {
if (constant == 0xFF) {
if (dest_src.getBit() == 16 || dest_src.getBit() == 32) {
e.movzx(dest_src, dest_src.cvt8());
return;
} else if (dest_src.getBit() == 64) {
// take advantage of automatic zeroing of upper 32 bits
e.movzx(dest_src.cvt32(), dest_src.cvt8());
return;
}
} else if (constant == 0xFFFF) {
if (dest_src.getBit() == 32) {
e.movzx(dest_src, dest_src.cvt16());
return;
} else if (dest_src.getBit() == 64) {
e.movzx(dest_src.cvt32(), dest_src.cvt16());
return;
}
} else if (constant == -1) {
if (dest_src.getBit() == 64) {
// todo: verify that mov eax, eax properly zeroes the upper 32 bits
}
} else if (dest_src.getBit() == 64 && constant > 0) {
// Do a 32-bit and instead of the full 64-bit op: the upper 32 bits of the
// mask are zero and a 32-bit op clears the upper half anyway, which saves
// immediate bytes and the REX.W prefix.
e.and_(dest_src.cvt32(), constant);
return;
}
e.and_(dest_src, constant);
});
}
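
The new cases in EmitAndXX above pick cheaper encodings for common masks. A rough standalone sketch of that decision, with invented enum and function names (the real emitter does this inline on dest_src.getBit() and the constant):

#include <cstdint>

enum class AndLowering {
  kMovzxByte,  // movzx from the low byte: masks with 0xFF and zero-extends
  kMovzxWord,  // movzx from the low word: masks with 0xFFFF and zero-extends
  kAnd32,      // 32-bit and on a 64-bit register; the write clears bits 63:32
  kAndFull,    // plain and at the original operand width
};

AndLowering ChooseAndLowering(int reg_bits, int32_t mask) {
  if (mask == 0xFF && (reg_bits == 16 || reg_bits == 32 || reg_bits == 64)) {
    return AndLowering::kMovzxByte;
  }
  if (mask == 0xFFFF && (reg_bits == 32 || reg_bits == 64)) {
    return AndLowering::kMovzxWord;
  }
  if (reg_bits == 64 && mask > 0) {
    // A positive imm32 sign-extends to the same value it zero-extends to, so
    // the 32-bit form is equivalent and skips the REX.W prefix.
    return AndLowering::kAnd32;
  }
  return AndLowering::kAndFull;
}

The constant == -1 case stays on the plain and until the mov dest32, dest32 idea in the TODO above is wired up.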