Fixed poor precision in RECIP, made multiplication codegen less wasteful
This commit is contained in:
parent 364937e836
commit bfc31f8706
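RECIP_F64 and RSQRT_F64 used to downconvert to single precision and go through the ~12-bit vrcpss/vrsqrtss approximations; they now compute 1.0 / x and 1.0 / sqrt(x) with full-width vdivsd (and vsqrtsd), fed by a new XMMOneDouble table constant. Integer multiplies with one small constant operand now emit a single imul-with-immediate instead of the mulx path, and the 8/16-bit zero-extends write through the 32-bit register alias.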
@@ -57,13 +57,13 @@ static const size_t kStashOffset = 32;
 // static const size_t kStashOffsetHigh = 32 + 32;

 const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = {
     Xbyak::Operand::RBX, Xbyak::Operand::R10, Xbyak::Operand::R11,
     Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14,
     Xbyak::Operand::R15,
 };

 const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = {
     4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 };

 X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
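For orientation: these maps list the registers the allocator may hand out. The XMM map starting at 4 keeps xmm0 through xmm3 out of allocation, which is presumably what lets sequences below (including the new RECIP/RSQRT code) clobber xmm0 freely as scratch.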
@@ -212,9 +212,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

     // Record call history value into slot (guest addr in RDX).
     mov(dword[Xbyak::RegExp(uint32_t(uint64_t(
             low_address(&trace_header->function_caller_history)))) +
-             rax * 4],
-        edx);
+             rax * 4], edx);

     // Calling thread. Load ax with thread ID.
     EmitGetCurrentThreadId();
@@ -739,9 +738,8 @@ static const vec128_t xmm_consts[] = {
     /* XMMIntMax */ vec128i(INT_MAX),
     /* XMMIntMaxPD */ vec128d(INT_MAX),
     /* XMMPosIntMinPS */ vec128f((float)0x80000000u),
     /* XMMQNaN */ vec128i(0x7FC00000u),
-    /* XMMSelectTableBase */ vec128i(0),
-    /* XMMSelectTableLast */ vec128i(-1)
+    /* XMMOneDouble */ vec128d(1.0)
 };

 // First location to try and place constants.
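XMMOneDouble, a vec128d(1.0), is the numerator constant for the new exact RECIP/RSQRT paths further down. It takes the slots of the XMMSelectTableBase/XMMSelectTableLast sentinels, which are removed both here and in the enum below; table entries and enum XmmConst are matched purely by position, so the two must change together.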
@@ -780,17 +778,14 @@ void X64Emitter::FreeConstData(uintptr_t data) {
   memory::DeallocFixed(reinterpret_cast<void*>(data), 0,
                        memory::DeallocationType::kRelease);
 }
-uintptr_t X64Emitter::GetXmmRawAddress(XmmConst id) {
-  return backend_->emitter_data() + sizeof(vec128_t) * id;
-}
 Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be sign-extended.
-  return ptr[GetXmmRawAddress(id)];
+  return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
+                                     sizeof(vec128_t) * id)];
 }
-


 // Implies possible StashXmm(0, ...)!
 void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
   // https://www.agner.org/optimize/optimizing_assembly.pdf
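Removing GetXmmRawAddress folds the table addressing back into GetXmmConstPtr. A standalone sketch of that arithmetic, with a made-up base address (the real base comes from PlaceConstData and is kept below 2 GiB so the absolute displacement encodes cleanly, per the sign-extension comment above):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uintptr_t emitter_data = 0x20000000;  // hypothetical table base
      const size_t kVecSize = 16;                 // sizeof(vec128_t)
      // Entry address for constant id: base + 16 * id, which is exactly
      // what the inlined expression in GetXmmConstPtr computes.
      for (unsigned id = 0; id < 4; ++id) {
        std::printf("const %u -> %#lx\n", id,
                    (unsigned long)(emitter_data + kVecSize * id));
      }
      return 0;
    }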
@@ -804,10 +799,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
   } else {

     for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
       if (xmm_consts[i] == v) {
         vmovapd(dest, GetXmmConstPtr((XmmConst)i));
         return;
       }
     }
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
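The loop above is the whole constant-deduplication scheme: compare the requested vector bit-for-bit against the fixed table and reuse a slot on a hit, only materializing through GPRs on a miss. A standalone sketch of the idea, with a stand-in vec128 type (names hypothetical):

    #include <cstdint>
    #include <cstring>

    struct Vec128 { uint64_t lo, hi; };

    // Returns the table index of a bit-identical constant, or -1 on a miss
    // (the emitter then falls back to building the value in registers).
    int FindConst(const Vec128* table, size_t count, const Vec128& v) {
      for (size_t i = 0; i < count; ++i) {
        if (std::memcmp(&table[i], &v, sizeof(v)) == 0) return (int)i;
      }
      return -1;
    }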
@@ -832,15 +827,13 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
   // TODO(benvanik): see what other common values are.
   // TODO(benvanik): build constant table - 99% are reused.


   unsigned raw_bits = *reinterpret_cast<unsigned*>(&v);

   for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
-
     if (xmm_consts[i].u32[0] == raw_bits) {
       vmovss(dest, GetXmmConstPtr((XmmConst)i));
       return;
     }
-
   }
   mov(eax, x.i);
@@ -867,11 +860,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
   uint64_t raw_bits = *reinterpret_cast<uint64_t*>(&v);

   for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
-
     if (xmm_consts[i].u64[0] == raw_bits) {
       vmovsd(dest, GetXmmConstPtr((XmmConst)i));
       return;
     }
   }
   mov(rax, x.i);
   vmovq(dest, rax);
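Both scalar overloads key the lookup on the value's raw bit pattern, and since vmovss/vmovsd load only 4/8 bytes, matching the low lane (u32[0]/u64[0]) of any table entry is sufficient. The diff grabs the bits with a reinterpret_cast pun; a strictly portable equivalent, shown for illustration only (not in the commit), is:

    #include <cstdint>
    #include <cstring>

    inline uint32_t FloatBits(float v) {
      uint32_t bits;
      std::memcpy(&bits, &v, sizeof(bits));  // same bytes, no aliasing UB
      return bits;
    }

    inline uint64_t DoubleBits(double v) {
      uint64_t bits;
      std::memcpy(&bits, &v, sizeof(bits));
      return bits;
    }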
@@ -114,8 +114,7 @@ enum XmmConst {
   XMMIntMaxPD,
   XMMPosIntMinPS,
   XMMQNaN,
-  XMMSelectTableBase,
-  XMMSelectTableLast,
+  XMMOneDouble
 };

 // Unfortunately due to the design of xbyak we have to pass this to the ctor.
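Since LoadConstantXmm casts table indices straight to XmmConst, this enum (presumably x64_emitter.h) and the xmm_consts[] table have to stay index-for-index in sync. A hypothetical guard, not part of this commit:

    // Would catch the enum and the table drifting apart, assuming
    // XMMOneDouble remains the last enumerator.
    static_assert(sizeof(xmm_consts) / sizeof(xmm_consts[0]) ==
                      XMMOneDouble + 1,
                  "xmm_consts[] must mirror enum XmmConst");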
@@ -212,7 +211,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
   void MovMem64(const Xbyak::RegExp& addr, uint64_t v);

   Xbyak::Address GetXmmConstPtr(XmmConst id);
-  uintptr_t GetXmmRawAddress(XmmConst id);
   void LoadConstantXmm(Xbyak::Xmm dest, float v);
   void LoadConstantXmm(Xbyak::Xmm dest, double v);
   void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
@@ -175,7 +175,7 @@ struct ZERO_EXTEND_I32_I8
 struct ZERO_EXTEND_I64_I8
     : Sequence<ZERO_EXTEND_I64_I8, I<OPCODE_ZERO_EXTEND, I64Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.movzx(i.dest, i.src1);
+    e.movzx(i.dest.reg().cvt32(), i.src1);
   }
 };
 struct ZERO_EXTEND_I32_I16
@@ -187,7 +187,7 @@ struct ZERO_EXTEND_I32_I16
 struct ZERO_EXTEND_I64_I16
     : Sequence<ZERO_EXTEND_I64_I16, I<OPCODE_ZERO_EXTEND, I64Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.movzx(i.dest, i.src1);
+    e.movzx(i.dest.reg().cvt32(), i.src1);
   }
 };
 struct ZERO_EXTEND_I64_I32
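The cvt32() change leans on an x86-64 rule: any write to a 32-bit register clears bits 63:32, so movzx into the 32-bit alias already produces the full 64-bit zero-extension and drops the REX.W prefix a 64-bit movzx would need. The C-level semantics, for reference:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t src = 0xFF;
      // 8 -> 32 zero-extend; the upper 32 bits of the destination are zero,
      // mirroring what a 32-bit register write does on x86-64.
      uint64_t dest = uint32_t{src};
      assert(dest == 0xFF);
      return 0;
    }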
@@ -1323,6 +1323,19 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32,
 // We exploit mulx here to avoid creating too much register pressure.
 struct MUL_I8 : Sequence<MUL_I8, I<OPCODE_MUL, I8Op, I8Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3

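When either source is a compile-time constant that fits in 32 bits, this fast path folds it into a single three-operand imul (dest = src * imm32) instead of staging it in a register and tying up EDX/RDX as mulx's implicit operand. The same block is stamped into MUL_I16, MUL_I32, and MUL_I64 below. One caveat: the x86 three-operand imul only exists for 16/32/64-bit operands, so the 8-bit instance depends on how the allocator and xbyak surface I8 values; a true 8-bit destination register cannot encode this form.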
@@ -1364,6 +1377,19 @@ struct MUL_I8 : Sequence<MUL_I8, I<OPCODE_MUL, I8Op, I8Op, I8Op>> {
 };
 struct MUL_I16 : Sequence<MUL_I16, I<OPCODE_MUL, I16Op, I16Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3

@@ -1412,6 +1438,20 @@ struct MUL_I32 : Sequence<MUL_I32, I<OPCODE_MUL, I32Op, I32Op, I32Op>> {
         return;
       }
     }
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3

@@ -1462,6 +1502,21 @@ struct MUL_I64 : Sequence<MUL_I64, I<OPCODE_MUL, I64Op, I64Op, I64Op>> {
         return;
       }
     }
+
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = RDX * $3

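One subtlety in the 64-bit case above: the three-operand imul sign-extends its 32-bit immediate, so a constant in [2^31, 2^32) reaches the multiplier as a negative value after the (int)cval cast and the product's high bits come out wrong. A tighter guard such as cval < (1ull << 31) would sidestep that; this is an observation about the emitted encoding, not a change in this commit. The 32-bit and narrower cases are unaffected since only the truncated result is kept.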
@@ -2470,9 +2525,14 @@ struct RSQRT_F32 : Sequence<RSQRT_F32, I<OPCODE_RSQRT, F32Op, F32Op>> {
 };
 struct RSQRT_F64 : Sequence<RSQRT_F64, I<OPCODE_RSQRT, F64Op, F64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vcvtsd2ss(i.dest, i.src1);
-    e.vrsqrtss(i.dest, i.dest);
-    e.vcvtss2sd(i.dest, i.dest);
+    /*e.vcvtsd2ss(i.dest, i.src1);
+    e.vrsqrtss(i.dest, i.dest);
+    e.vcvtss2sd(i.dest, i.dest);*/
+
+    e.vmovsd(e.xmm0, e.GetXmmConstPtr(XmmConst::XMMOneDouble));
+    e.vsqrtsd(i.dest, i.src1);
+    e.vdivsd(i.dest, e.xmm0, i.dest);
+
   }
 };
 struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {
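Why RSQRT_F64 changed: vrsqrtss guarantees only about 12 bits of precision (relative error at most 1.5 * 2^-12 per Intel's documentation), far too coarse to stand in for a double result, while vsqrtsd followed by vdivsd is correctly rounded at each step. A standalone comparison sketch, a hypothetical harness rather than emitter code (_mm_rsqrt_ss is the intrinsic spelling of vrsqrtss):

    #include <cmath>
    #include <cstdio>
    #include <xmmintrin.h>

    int main() {
      double x = 3.0;
      float approx;
      _mm_store_ss(&approx, _mm_rsqrt_ss(_mm_set_ss((float)x)));
      double old_path = (double)approx;      // ~12-bit approximation
      double new_path = 1.0 / std::sqrt(x);  // correctly rounded sqrt + div
      std::printf("old: %.17g\nnew: %.17g\n", old_path, new_path);
      return 0;
    }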
@@ -2492,9 +2552,11 @@ struct RECIP_F32 : Sequence<RECIP_F32, I<OPCODE_RECIP, F32Op, F32Op>> {
 };
 struct RECIP_F64 : Sequence<RECIP_F64, I<OPCODE_RECIP, F64Op, F64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vcvtsd2ss(i.dest, i.src1);
-    e.vrcpss(i.dest, i.dest);
-    e.vcvtss2sd(i.dest, i.dest);
+    /*e.vcvtsd2ss(i.dest, i.src1);
+    e.vrcpss(i.dest, i.dest);
+    e.vcvtss2sd(i.dest, i.dest);*/
+    e.vmovsd(e.xmm0, e.GetXmmConstPtr(XmmConst::XMMOneDouble));
+    e.vdivsd(i.dest, e.xmm0, i.src1);
   }
 };
 struct RECIP_V128 : Sequence<RECIP_V128, I<OPCODE_RECIP, V128Op, V128Op>> {
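Same reasoning as RSQRT_F64: vrcpss is a ~12-bit reciprocal approximation (relative error up to 1.5 * 2^-12), so round-tripping a double through it threw away most of the mantissa. A single vdivsd of XMMOneDouble by the source yields the correctly rounded 1.0 / x, with xmm0 serving only as scratch for the constant.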