Juicy optimization goodness
This commit is contained in:
parent
27d12f1130
commit
364937e836
|
@ -802,6 +802,13 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
|
|||
// 1111...
|
||||
vpcmpeqb(dest, dest);
|
||||
} else {
|
||||
|
||||
for(unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
|
||||
if(xmm_consts[i] == v) {
|
||||
vmovapd(dest, GetXmmConstPtr((XmmConst)i));
|
||||
return;
|
||||
}
|
||||
}
|
||||
// TODO(benvanik): see what other common values are.
|
||||
// TODO(benvanik): build constant table - 99% are reused.
|
||||
MovMem64(rsp + kStashOffset, v.low);
|
||||
|
@ -817,13 +824,25 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
|
|||
} x = {v};
|
||||
if (!v) {
|
||||
// 0
|
||||
vpxor(dest, dest);
|
||||
vxorps(dest, dest);
|
||||
} else if (x.i == ~0U) {
|
||||
// 1111...
|
||||
vpcmpeqb(dest, dest);
|
||||
vcmpeqss(dest, dest);
|
||||
} else {
|
||||
// TODO(benvanik): see what other common values are.
|
||||
// TODO(benvanik): build constant table - 99% are reused.
|
||||
|
||||
|
||||
unsigned raw_bits =*reinterpret_cast<unsigned*>(&v);
|
||||
|
||||
for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
|
||||
|
||||
if(xmm_consts[i].u32[0] == raw_bits) {
|
||||
vmovss(dest, GetXmmConstPtr((XmmConst)i));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
mov(eax, x.i);
|
||||
vmovd(dest, eax);
|
||||
}
|
||||
|
@ -836,13 +855,24 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
|
|||
} x = {v};
|
||||
if (!v) {
|
||||
// 0
|
||||
vpxor(dest, dest);
|
||||
vxorpd(dest, dest);
|
||||
} else if (x.i == ~0ULL) {
|
||||
// 1111...
|
||||
vpcmpeqb(dest, dest);
|
||||
vcmpeqpd(dest, dest);
|
||||
|
||||
} else {
|
||||
// TODO(benvanik): see what other common values are.
|
||||
// TODO(benvanik): build constant table - 99% are reused.
|
||||
|
||||
uint64_t raw_bits = *reinterpret_cast<uint64_t*>(&v);
|
||||
|
||||
for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
|
||||
|
||||
if(xmm_consts[i].u64[0] == raw_bits) {
|
||||
vmovsd(dest, GetXmmConstPtr((XmmConst)i));
|
||||
return;
|
||||
}
|
||||
}
|
||||
mov(rax, x.i);
|
||||
vmovq(dest, rax);
|
||||
}
|
||||
|
|
|
@ -317,14 +317,12 @@ struct CONVERT_I32_F64
|
|||
struct CONVERT_I64_F64
|
||||
: Sequence<CONVERT_I64_F64, I<OPCODE_CONVERT, I64Op, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
||||
e.xor_(e.eax, e.eax);
|
||||
|
||||
e.vcomisd(i.src1, e.GetXmmConstPtr(XmmConst::XMMZero));
|
||||
if (i.instr->flags == ROUND_TO_ZERO) {
|
||||
e.vcvttsd2si(i.dest, i.src1);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
e.vcvtsd2si(i.dest, i.src1);
|
||||
}
|
||||
// cf set if less than
|
||||
|
@ -1204,6 +1202,9 @@ template <typename SEQ, typename REG, typename ARGS>
|
|||
void EmitAddCarryXX(X64Emitter& e, const ARGS& i) {
|
||||
// TODO(benvanik): faster setting? we could probably do some fun math tricks
|
||||
// here to get the carry flag set.
|
||||
// chrisps: faster setting now, but I think the i.src3.is_constant check is
|
||||
// dead code
|
||||
|
||||
if (i.src3.is_constant) {
|
||||
if (i.src3.constant()) {
|
||||
e.stc();
|
||||
|
@ -1211,15 +1212,9 @@ void EmitAddCarryXX(X64Emitter& e, const ARGS& i) {
|
|||
e.clc();
|
||||
}
|
||||
} else {
|
||||
if (i.src3.reg().getIdx() <= 4) {
|
||||
// Can move from A/B/C/DX to AH.
|
||||
e.mov(e.ah, i.src3.reg().cvt8());
|
||||
} else {
|
||||
e.mov(e.al, i.src3);
|
||||
e.mov(e.ah, e.al);
|
||||
}
|
||||
e.sahf();
|
||||
e.bt(i.src3.reg().cvt32(), 0);
|
||||
}
|
||||
|
||||
SEQ::EmitCommutativeBinaryOp(
|
||||
e, i,
|
||||
[](X64Emitter& e, const REG& dest_src, const REG& src) {
|
||||
|
@ -2696,6 +2691,34 @@ void EmitAndXX(X64Emitter& e, const ARGS& i) {
|
|||
e.and_(dest_src, src);
|
||||
},
|
||||
[](X64Emitter& e, const REG& dest_src, int32_t constant) {
|
||||
if (constant == 0xFF) {
|
||||
if (dest_src.getBit() == 16 || dest_src.getBit() == 32) {
|
||||
e.movzx(dest_src, dest_src.cvt8());
|
||||
return;
|
||||
} else if (dest_src.getBit() == 64) {
|
||||
// take advantage of automatic zeroing of upper 32 bits
|
||||
e.movzx(dest_src.cvt32(), dest_src.cvt8());
|
||||
return;
|
||||
}
|
||||
} else if (constant == 0xFFFF) {
|
||||
if (dest_src.getBit() == 32) {
|
||||
e.movzx(dest_src, dest_src.cvt16());
|
||||
return;
|
||||
} else if (dest_src.getBit() == 64) {
|
||||
e.movzx(dest_src.cvt32(), dest_src.cvt16());
|
||||
return;
|
||||
}
|
||||
} else if (constant == -1) {
|
||||
if (dest_src.getBit() == 64) {
|
||||
// todo: verify that mov eax, eax will properly zero upper 32 bits
|
||||
}
|
||||
} else if (dest_src.getBit() == 64 && constant > 0) {
|
||||
// do 32 bit and, not the full 64, because the upper 32 of the mask
|
||||
// are zero and the 32 bit op will auto clear the top, save space on
|
||||
// the immediate and avoid a rex prefix
|
||||
e.and_(dest_src.cvt32(), constant);
|
||||
return;
|
||||
}
|
||||
e.and_(dest_src, constant);
|
||||
});
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue