Adding --enable_haswell_instructions=false to disable modern instructions.

This commit is contained in:
Ben Vanik 2015-05-11 21:42:10 -07:00
parent fc02a0c404
commit ad7efa964f
3 changed files with 60 additions and 68 deletions

View File

@ -9,6 +9,8 @@
#include "xenia/cpu/backend/x64/x64_emitter.h" #include "xenia/cpu/backend/x64/x64_emitter.h"
#include <gflags/gflags.h>
#include "xenia/base/assert.h" #include "xenia/base/assert.h"
#include "xenia/base/atomic.h" #include "xenia/base/atomic.h"
#include "xenia/base/logging.h" #include "xenia/base/logging.h"
@ -28,6 +30,10 @@
#include "xenia/cpu/thread_state.h" #include "xenia/cpu/thread_state.h"
#include "xenia/profiling.h" #include "xenia/profiling.h"
// Command-line switch (default: true) controlling whether the emitter may use
// optional host instruction-set extensions. The X64Emitter constructor only
// populates its feature flag mask when this is true; when it is false the mask
// stays 0, so every IsFeatureEnabled() check fails and the non-extension code
// paths are emitted regardless of what the host CPU reports.
DEFINE_bool(
enable_haswell_instructions, true,
"Uses the AVX2/FMA/etc instructions on Haswell processors, if available.");
namespace xe { namespace xe {
namespace cpu { namespace cpu {
namespace backend { namespace backend {
@ -65,11 +71,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
backend_(backend), backend_(backend),
code_cache_(backend->code_cache()), code_cache_(backend->code_cache()),
allocator_(allocator), allocator_(allocator),
feature_flags_(0),
current_instr_(0), current_instr_(0),
debug_info_(nullptr), debug_info_(nullptr),
debug_info_flags_(0), debug_info_flags_(0),
source_map_count_(0), source_map_count_(0),
stack_size_(0) {} stack_size_(0) {
if (FLAGS_enable_haswell_instructions) {
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tAVX2) ? kX64EmitAVX2 : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tFMA) ? kX64EmitFMA : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tLZCNT) ? kX64EmitLZCNT : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tBMI2) ? kX64EmitBMI2 : 0;
}
}
X64Emitter::~X64Emitter() = default; X64Emitter::~X64Emitter() = default;

View File

@ -97,6 +97,13 @@ class XbyakAllocator : public Xbyak::Allocator {
virtual bool useProtect() const { return false; } virtual bool useProtect() const { return false; }
}; };
// Bit mask values naming the optional host instruction-set extensions the
// emitter is allowed to use. The X64Emitter constructor ORs these into its
// feature flag mask, and code generators test them through
// X64Emitter::IsFeatureEnabled(). Bit 0 is currently unassigned.
//
// NOTE(review): the emitters in this change gate mulx/shlx/shrx on
// kX64EmitAVX2 but sarx on kX64EmitBMI2. Presumably all four belong to the
// BMI2 extension -- confirm against the Intel SDM and gate them consistently.
enum X64EmitterFeatureFlags {
  kX64EmitAVX2 = 0x02,   // bit 1
  kX64EmitFMA = 0x04,    // bit 2
  kX64EmitLZCNT = 0x08,  // bit 3
  kX64EmitBMI2 = 0x10,   // bit 4
};
class X64Emitter : public Xbyak::CodeGenerator { class X64Emitter : public Xbyak::CodeGenerator {
public: public:
X64Emitter(X64Backend* backend, XbyakAllocator* allocator); X64Emitter(X64Backend* backend, XbyakAllocator* allocator);
@ -104,7 +111,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
Processor* processor() const { return processor_; } Processor* processor() const { return processor_; }
X64Backend* backend() const { return backend_; } X64Backend* backend() const { return backend_; }
const Xbyak::util::Cpu* cpu() const { return &cpu_; }
bool Emit(hir::HIRBuilder* builder, uint32_t debug_info_flags, bool Emit(hir::HIRBuilder* builder, uint32_t debug_info_flags,
DebugInfo* debug_info, void*& out_code_address, DebugInfo* debug_info, void*& out_code_address,
@ -177,6 +183,10 @@ class X64Emitter : public Xbyak::CodeGenerator {
void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v); void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
Xbyak::Address StashXmm(int index, const Xbyak::Xmm& r); Xbyak::Address StashXmm(int index, const Xbyak::Xmm& r);
// Returns true when at least one bit of |feature_flag| (a mask built from
// X64EmitterFeatureFlags values) is set in this emitter's feature mask.
bool IsFeatureEnabled(uint32_t feature_flag) const {
  const uint32_t matched_bits = feature_flags_ & feature_flag;
  return matched_bits != 0;
}
DebugInfo* debug_info() const { return debug_info_; } DebugInfo* debug_info() const { return debug_info_; }
size_t stack_size() const { return stack_size_; } size_t stack_size() const { return stack_size_; }
@ -192,7 +202,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
X64Backend* backend_; X64Backend* backend_;
X64CodeCache* code_cache_; X64CodeCache* code_cache_;
XbyakAllocator* allocator_; XbyakAllocator* allocator_;
Xbyak::util::Cpu cpu_; // Host CPU info Xbyak::util::Cpu cpu_;
uint32_t feature_flags_;
hir::Instr* current_instr_; hir::Instr* current_instr_;

View File

@ -3070,7 +3070,7 @@ EMITTER(MUL_I8, MATCH(I<OPCODE_MUL, I8<>, I8<>, I8<>>)) {
// dest hi, dest low = src * edx // dest hi, dest low = src * edx
// TODO(justin): Find a way to shorten this has call // TODO(justin): Find a way to shorten this has call
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src2 in edx? // TODO(benvanik): place src2 in edx?
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); assert_true(!i.src2.is_constant);
@ -3088,17 +3088,13 @@ EMITTER(MUL_I8, MATCH(I<OPCODE_MUL, I8<>, I8<>, I8<>>)) {
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX <- EAX * $1; // EDX:EAX <- EAX * $1;
//e.DebugBreak();
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); assert_true(!i.src2.is_constant);
e.mov(e.eax, i.src1); e.mov(e.eax, i.src1);
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.eax); e.mov(i.dest, e.eax);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); assert_true(!i.src1.is_constant);
e.mov(e.eax, i.src2); e.mov(e.eax, i.src2);
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.eax); e.mov(i.dest, e.eax);
@ -3116,7 +3112,7 @@ EMITTER(MUL_I16, MATCH(I<OPCODE_MUL, I16<>, I16<>, I16<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// dest hi, dest low = src * edx // dest hi, dest low = src * edx
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src2 in edx? // TODO(benvanik): place src2 in edx?
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); assert_true(!i.src2.is_constant);
@ -3134,17 +3130,13 @@ EMITTER(MUL_I16, MATCH(I<OPCODE_MUL, I16<>, I16<>, I16<>>)) {
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX <- EAX * REG; // EDX:EAX <- EAX * REG;
//e.DebugBreak();
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); assert_true(!i.src2.is_constant);
e.mov(e.eax, i.src1.constant()); e.mov(e.eax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.eax); e.mov(i.dest, e.eax);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); assert_true(!i.src1.is_constant);
e.mov(e.eax, i.src2.constant()); e.mov(e.eax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.eax); e.mov(i.dest, e.eax);
@ -3163,7 +3155,7 @@ EMITTER(MUL_I32, MATCH(I<OPCODE_MUL, I32<>, I32<>, I32<>>)) {
// dest hi, dest low = src * edx // dest hi, dest low = src * edx
// mulx: edx src, 1st op high half, 2nd op low half, 3rd op src2 // mulx: edx src, 1st op high half, 2nd op low half, 3rd op src2
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src2 in edx? // TODO(benvanik): place src2 in edx?
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); assert_true(!i.src2.is_constant);
@ -3181,18 +3173,13 @@ EMITTER(MUL_I32, MATCH(I<OPCODE_MUL, I32<>, I32<>, I32<>>)) {
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX < EAX * REG(op1); // EDX:EAX < EAX * REG(op1);
//e.DebugBreak();
// is_constant AKA not a register
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); // can't multiply 2 constants assert_true(!i.src2.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src1.constant()); e.mov(e.eax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.eax); e.mov(i.dest, e.eax);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); // can't multiply 2 constants assert_true(!i.src1.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src2.constant()); e.mov(e.eax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.eax); e.mov(i.dest, e.eax);
@ -3210,7 +3197,7 @@ EMITTER(MUL_I64, MATCH(I<OPCODE_MUL, I64<>, I64<>, I64<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// dest hi, dest low = src * rdx // dest hi, dest low = src * rdx
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// mulx: edx src, 1st op high half, 2nd op low half, 3rd op src2 // mulx: edx src, 1st op high half, 2nd op low half, 3rd op src2
// TODO(benvanik): place src2 in edx? // TODO(benvanik): place src2 in edx?
@ -3230,17 +3217,13 @@ EMITTER(MUL_I64, MATCH(I<OPCODE_MUL, I64<>, I64<>, I64<>>)) {
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX < EAX * REG(op1); // EDX:EAX < EAX * REG(op1);
//e.DebugBreak();
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); // can't multiply 2 constants assert_true(!i.src2.is_constant); // can't multiply 2 constants
e.mov(e.rax, i.src1.constant()); e.mov(e.rax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.rax); e.mov(i.dest, e.rax);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); // can't multiply 2 constants assert_true(!i.src1.is_constant); // can't multiply 2 constants
e.mov(e.rax, i.src2.constant()); e.mov(e.rax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.rax); e.mov(i.dest, e.rax);
@ -3302,24 +3285,20 @@ EMITTER(MUL_HI_I8, MATCH(I<OPCODE_MUL_HI, I8<>, I8<>, I8<>>)) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) { if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call // TODO(justin): Find a way to shorten this has call
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend // TODO(benvanik): place src1 in eax? still need to sign extend
e.movzx(e.edx, i.src1); e.movzx(e.edx, i.src1);
e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32());
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX < EAX * REG(op1); // EDX:EAX < EAX * REG(op1);
// is_constant AKA not a register
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); // can't multiply 2 constants assert_true(!i.src2.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src1.constant()); e.mov(e.eax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.edx); e.mov(i.dest, e.edx);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); // can't multiply 2 constants assert_true(!i.src1.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src2.constant()); e.mov(e.eax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.edx); e.mov(i.dest, e.edx);
@ -3346,24 +3325,20 @@ EMITTER(MUL_HI_I16, MATCH(I<OPCODE_MUL_HI, I16<>, I16<>, I16<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) { if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call // TODO(justin): Find a way to shorten this has call
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend // TODO(benvanik): place src1 in eax? still need to sign extend
e.movzx(e.edx, i.src1); e.movzx(e.edx, i.src1);
e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32());
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX < EAX * REG(op1); // EDX:EAX < EAX * REG(op1);
// is_constant AKA not a register
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); // can't multiply 2 constants assert_true(!i.src2.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src1.constant()); e.mov(e.eax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.edx); e.mov(i.dest, e.edx);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); // can't multiply 2 constants assert_true(!i.src1.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src2.constant()); e.mov(e.eax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.edx); e.mov(i.dest, e.edx);
@ -3390,7 +3365,7 @@ EMITTER(MUL_HI_I32, MATCH(I<OPCODE_MUL_HI, I32<>, I32<>, I32<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) { if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call // TODO(justin): Find a way to shorten this has call
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend // TODO(benvanik): place src1 in eax? still need to sign extend
e.mov(e.edx, i.src1); e.mov(e.edx, i.src1);
if (i.src2.is_constant) { if (i.src2.is_constant) {
@ -3402,17 +3377,13 @@ EMITTER(MUL_HI_I32, MATCH(I<OPCODE_MUL_HI, I32<>, I32<>, I32<>>)) {
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX < EAX * REG(op1); // EDX:EAX < EAX * REG(op1);
// is_constant AKA not a register
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); // can't multiply 2 constants assert_true(!i.src2.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src1.constant()); e.mov(e.eax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.edx); e.mov(i.dest, e.edx);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); // can't multiply 2 constants assert_true(!i.src1.is_constant); // can't multiply 2 constants
e.mov(e.eax, i.src2.constant()); e.mov(e.eax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.edx); e.mov(i.dest, e.edx);
@ -3439,7 +3410,7 @@ EMITTER(MUL_HI_I64, MATCH(I<OPCODE_MUL_HI, I64<>, I64<>, I64<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) { if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call // TODO(justin): Find a way to shorten this has call
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend // TODO(benvanik): place src1 in eax? still need to sign extend
e.mov(e.rdx, i.src1); e.mov(e.rdx, i.src1);
if (i.src2.is_constant) { if (i.src2.is_constant) {
@ -3451,17 +3422,13 @@ EMITTER(MUL_HI_I64, MATCH(I<OPCODE_MUL_HI, I64<>, I64<>, I64<>>)) {
} else { } else {
// x86 mul instruction // x86 mul instruction
// EDX:EAX < EAX * REG(op1); // EDX:EAX < EAX * REG(op1);
// is_constant AKA not a register
if (i.src1.is_constant) { if (i.src1.is_constant) {
assert_true(!i.src2.is_constant); // can't multiply 2 constants assert_true(!i.src2.is_constant); // can't multiply 2 constants
e.mov(e.rax, i.src1.constant()); e.mov(e.rax, i.src1.constant());
e.mul(i.src2); e.mul(i.src2);
e.mov(i.dest, e.rdx); e.mov(i.dest, e.rdx);
} else if (i.src2.is_constant) { } else if (i.src2.is_constant) {
assert_true(!i.src1.is_constant); // can't multiply 2 constants assert_true(!i.src1.is_constant); // can't multiply 2 constants
e.mov(e.rax, i.src2.constant()); e.mov(e.rax, i.src2.constant());
e.mul(i.src1); e.mul(i.src1);
e.mov(i.dest, e.rdx); e.mov(i.dest, e.rdx);
@ -3772,7 +3739,7 @@ EMITTER_OPCODE_TABLE(
EMITTER(MUL_ADD_F32, MATCH(I<OPCODE_MUL_ADD, F32<>, F32<>, F32<>, F32<>>)) { EMITTER(MUL_ADD_F32, MATCH(I<OPCODE_MUL_ADD, F32<>, F32<>, F32<>, F32<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// FMA extension // FMA extension
if (e.cpu()->has(Xbyak::util::Cpu::tFMA)) { if (e.IsFeatureEnabled(kX64EmitFMA)) {
if (i.dest == i.src1) { if (i.dest == i.src1) {
e.vfmadd213ss(i.dest, i.src2, i.src3); e.vfmadd213ss(i.dest, i.src2, i.src3);
} else { } else {
@ -3801,7 +3768,7 @@ EMITTER(MUL_ADD_F32, MATCH(I<OPCODE_MUL_ADD, F32<>, F32<>, F32<>, F32<>>)) {
EMITTER(MUL_ADD_F64, MATCH(I<OPCODE_MUL_ADD, F64<>, F64<>, F64<>, F64<>>)) { EMITTER(MUL_ADD_F64, MATCH(I<OPCODE_MUL_ADD, F64<>, F64<>, F64<>, F64<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// FMA extension // FMA extension
if (e.cpu()->has(Xbyak::util::Cpu::tFMA)) { if (e.IsFeatureEnabled(kX64EmitFMA)) {
if (i.dest == i.src1) { if (i.dest == i.src1) {
e.vfmadd213sd(i.dest, i.src2, i.src3); e.vfmadd213sd(i.dest, i.src2, i.src3);
} else { } else {
@ -3830,7 +3797,7 @@ EMITTER(MUL_ADD_F64, MATCH(I<OPCODE_MUL_ADD, F64<>, F64<>, F64<>, F64<>>)) {
EMITTER(MUL_ADD_V128, MATCH(I<OPCODE_MUL_ADD, V128<>, V128<>, V128<>, V128<>>)) { EMITTER(MUL_ADD_V128, MATCH(I<OPCODE_MUL_ADD, V128<>, V128<>, V128<>, V128<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// FMA extension // FMA extension
if (e.cpu()->has(Xbyak::util::Cpu::tFMA)) { if (e.IsFeatureEnabled(kX64EmitFMA)) {
if (i.dest == i.src1) { if (i.dest == i.src1) {
e.vfmadd213ps(i.dest, i.src2, i.src3); e.vfmadd213ps(i.dest, i.src2, i.src3);
} else { } else {
@ -3877,7 +3844,7 @@ EMITTER_OPCODE_TABLE(
EMITTER(MUL_SUB_F32, MATCH(I<OPCODE_MUL_SUB, F32<>, F32<>, F32<>, F32<>>)) { EMITTER(MUL_SUB_F32, MATCH(I<OPCODE_MUL_SUB, F32<>, F32<>, F32<>, F32<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// FMA extension // FMA extension
if (e.cpu()->has(Xbyak::util::Cpu::tFMA)) { if (e.IsFeatureEnabled(kX64EmitFMA)) {
if (i.dest == i.src1) { if (i.dest == i.src1) {
e.vfmsub213ss(i.dest, i.src2, i.src3); e.vfmsub213ss(i.dest, i.src2, i.src3);
} else { } else {
@ -3909,7 +3876,7 @@ EMITTER(MUL_SUB_F32, MATCH(I<OPCODE_MUL_SUB, F32<>, F32<>, F32<>, F32<>>)) {
EMITTER(MUL_SUB_F64, MATCH(I<OPCODE_MUL_SUB, F64<>, F64<>, F64<>, F64<>>)) { EMITTER(MUL_SUB_F64, MATCH(I<OPCODE_MUL_SUB, F64<>, F64<>, F64<>, F64<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// FMA extension // FMA extension
if (e.cpu()->has(Xbyak::util::Cpu::tFMA)) { if (e.IsFeatureEnabled(kX64EmitFMA)) {
if (i.dest == i.src1) { if (i.dest == i.src1) {
e.vfmsub213sd(i.dest, i.src2, i.src3); e.vfmsub213sd(i.dest, i.src2, i.src3);
} else { } else {
@ -3941,7 +3908,7 @@ EMITTER(MUL_SUB_F64, MATCH(I<OPCODE_MUL_SUB, F64<>, F64<>, F64<>, F64<>>)) {
EMITTER(MUL_SUB_V128, MATCH(I<OPCODE_MUL_SUB, V128<>, V128<>, V128<>, V128<>>)) { EMITTER(MUL_SUB_V128, MATCH(I<OPCODE_MUL_SUB, V128<>, V128<>, V128<>, V128<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// FMA extension // FMA extension
if (e.cpu()->has(Xbyak::util::Cpu::tFMA)) { if (e.IsFeatureEnabled(kX64EmitFMA)) {
if (i.dest == i.src1) { if (i.dest == i.src1) {
e.vfmsub213ps(i.dest, i.src2, i.src3); e.vfmsub213ps(i.dest, i.src2, i.src3);
} else { } else {
@ -4458,7 +4425,7 @@ void EmitShlXX(X64Emitter& e, const ARGS& i) {
[](X64Emitter& e, const REG& dest_src, const Reg8& src) { [](X64Emitter& e, const REG& dest_src, const Reg8& src) {
// shlx: $1 = $2 << $3 // shlx: $1 = $2 << $3
// shl: $1 = $1 << $2 // shl: $1 = $1 << $2
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (dest_src.getBit() == 64) { if (dest_src.getBit() == 64) {
e.shlx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); e.shlx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64());
} else { } else {
@ -4512,7 +4479,7 @@ void EmitShrXX(X64Emitter& e, const ARGS& i) {
[](X64Emitter& e, const REG& dest_src, const Reg8& src) { [](X64Emitter& e, const REG& dest_src, const Reg8& src) {
// shrx: op1 dest, op2 src, op3 count // shrx: op1 dest, op2 src, op3 count
// shr: op1 src/dest, op2 count // shr: op1 src/dest, op2 count
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (dest_src.getBit() == 64) { if (dest_src.getBit() == 64) {
e.shrx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); e.shrx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64());
} else if (dest_src.getBit() == 32) { } else if (dest_src.getBit() == 32) {
@ -4594,7 +4561,7 @@ void EmitSarXX(X64Emitter& e, const ARGS& i) {
SEQ::EmitAssociativeBinaryOp( SEQ::EmitAssociativeBinaryOp(
e, i, e, i,
[](X64Emitter& e, const REG& dest_src, const Reg8& src) { [](X64Emitter& e, const REG& dest_src, const Reg8& src) {
if (e.cpu()->has(Xbyak::util::Cpu::tBMI2)) { if (e.IsFeatureEnabled(kX64EmitBMI2)) {
if (dest_src.getBit() == 64) { if (dest_src.getBit() == 64) {
e.sarx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); e.sarx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64());
} else if (dest_src.getBit() == 32) { } else if (dest_src.getBit() == 32) {
@ -4730,7 +4697,7 @@ EMITTER(VECTOR_SHL_V128, MATCH(I<OPCODE_VECTOR_SHL, V128<>, V128<>, V128<>>)) {
return _mm_load_si128(reinterpret_cast<__m128i*>(value)); return _mm_load_si128(reinterpret_cast<__m128i*>(value));
} }
static void EmitInt32(X64Emitter& e, const EmitArgType& i) { static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (i.src2.is_constant) { if (i.src2.is_constant) {
const auto& shamt = i.src2.constant(); const auto& shamt = i.src2.constant();
bool all_same = true; bool all_same = true;
@ -4882,7 +4849,7 @@ EMITTER(VECTOR_SHR_V128, MATCH(I<OPCODE_VECTOR_SHR, V128<>, V128<>, V128<>>)) {
e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F); e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F);
return; return;
} else { } else {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// Counts differ, so pre-mask and load constant. // Counts differ, so pre-mask and load constant.
vec128_t masked = i.src2.constant(); vec128_t masked = i.src2.constant();
for (size_t n = 0; n < 4; ++n) { for (size_t n = 0; n < 4; ++n) {
@ -4894,7 +4861,7 @@ EMITTER(VECTOR_SHR_V128, MATCH(I<OPCODE_VECTOR_SHR, V128<>, V128<>, V128<>>)) {
} }
} }
} else { } else {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// Fully variable shift. // Fully variable shift.
// src shift mask may have values >31, and x86 sets to zero when // src shift mask may have values >31, and x86 sets to zero when
// that happens so we mask. // that happens so we mask.
@ -4983,7 +4950,7 @@ EMITTER(VECTOR_SHA_V128, MATCH(I<OPCODE_VECTOR_SHA, V128<>, V128<>, V128<>>)) {
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
break; break;
case INT32_TYPE: case INT32_TYPE:
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// src shift mask may have values >31, and x86 sets to zero when // src shift mask may have values >31, and x86 sets to zero when
// that happens so we mask. // that happens so we mask.
if (i.src2.is_constant) { if (i.src2.is_constant) {
@ -5130,7 +5097,7 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
break; break;
case INT32_TYPE: { case INT32_TYPE: {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
Xmm temp = i.dest; Xmm temp = i.dest;
if (i.dest == i.src1 || i.dest == i.src2) { if (i.dest == i.src1 || i.dest == i.src2) {
temp = e.xmm2; temp = e.xmm2;
@ -5286,7 +5253,7 @@ EMITTER_OPCODE_TABLE(
// ============================================================================ // ============================================================================
EMITTER(CNTLZ_I8, MATCH(I<OPCODE_CNTLZ, I8<>, I8<>>)) { EMITTER(CNTLZ_I8, MATCH(I<OPCODE_CNTLZ, I8<>, I8<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tLZCNT)) { if (e.IsFeatureEnabled(kX64EmitLZCNT)) {
// No 8bit lzcnt, so do 16 and sub 8. // No 8bit lzcnt, so do 16 and sub 8.
e.movzx(i.dest.reg().cvt16(), i.src1); e.movzx(i.dest.reg().cvt16(), i.src1);
e.lzcnt(i.dest.reg().cvt16(), i.dest.reg().cvt16()); e.lzcnt(i.dest.reg().cvt16(), i.dest.reg().cvt16());
@ -5317,7 +5284,7 @@ EMITTER(CNTLZ_I8, MATCH(I<OPCODE_CNTLZ, I8<>, I8<>>)) {
}; };
EMITTER(CNTLZ_I16, MATCH(I<OPCODE_CNTLZ, I8<>, I16<>>)) { EMITTER(CNTLZ_I16, MATCH(I<OPCODE_CNTLZ, I8<>, I16<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tLZCNT)) { if (e.IsFeatureEnabled(kX64EmitLZCNT)) {
// LZCNT: searches $2 until MSB 1 found, stores idx (from last bit) in $1 // LZCNT: searches $2 until MSB 1 found, stores idx (from last bit) in $1
e.lzcnt(i.dest.reg().cvt32(), i.src1); e.lzcnt(i.dest.reg().cvt32(), i.src1);
} else { } else {
@ -5346,7 +5313,7 @@ EMITTER(CNTLZ_I16, MATCH(I<OPCODE_CNTLZ, I8<>, I16<>>)) {
}; };
EMITTER(CNTLZ_I32, MATCH(I<OPCODE_CNTLZ, I8<>, I32<>>)) { EMITTER(CNTLZ_I32, MATCH(I<OPCODE_CNTLZ, I8<>, I32<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tLZCNT)) { if (e.IsFeatureEnabled(kX64EmitLZCNT)) {
e.lzcnt(i.dest.reg().cvt32(), i.src1); e.lzcnt(i.dest.reg().cvt32(), i.src1);
} else { } else {
e.inLocalLabel(); e.inLocalLabel();
@ -5374,7 +5341,7 @@ EMITTER(CNTLZ_I32, MATCH(I<OPCODE_CNTLZ, I8<>, I32<>>)) {
}; };
EMITTER(CNTLZ_I64, MATCH(I<OPCODE_CNTLZ, I8<>, I64<>>)) { EMITTER(CNTLZ_I64, MATCH(I<OPCODE_CNTLZ, I8<>, I64<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tLZCNT)) { if (e.IsFeatureEnabled(kX64EmitLZCNT)) {
e.lzcnt(i.dest.reg().cvt64(), i.src1); e.lzcnt(i.dest.reg().cvt64(), i.src1);
} else { } else {
e.inLocalLabel(); e.inLocalLabel();
@ -5524,7 +5491,7 @@ EMITTER_OPCODE_TABLE(
// Copy a value into all elements of a vector // Copy a value into all elements of a vector
EMITTER(SPLAT_I8, MATCH(I<OPCODE_SPLAT, V128<>, I8<>>)) { EMITTER(SPLAT_I8, MATCH(I<OPCODE_SPLAT, V128<>, I8<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (i.src1.is_constant) { if (i.src1.is_constant) {
// TODO(benvanik): faster constant splats. // TODO(benvanik): faster constant splats.
e.mov(e.al, i.src1.constant()); e.mov(e.al, i.src1.constant());
@ -5551,7 +5518,7 @@ EMITTER(SPLAT_I8, MATCH(I<OPCODE_SPLAT, V128<>, I8<>>)) {
}; };
EMITTER(SPLAT_I16, MATCH(I<OPCODE_SPLAT, V128<>, I16<>>)) { EMITTER(SPLAT_I16, MATCH(I<OPCODE_SPLAT, V128<>, I16<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (i.src1.is_constant) { if (i.src1.is_constant) {
// TODO(benvanik): faster constant splats. // TODO(benvanik): faster constant splats.
e.mov(e.ax, i.src1.constant()); e.mov(e.ax, i.src1.constant());
@ -5577,7 +5544,7 @@ EMITTER(SPLAT_I16, MATCH(I<OPCODE_SPLAT, V128<>, I16<>>)) {
}; };
EMITTER(SPLAT_I32, MATCH(I<OPCODE_SPLAT, V128<>, I32<>>)) { EMITTER(SPLAT_I32, MATCH(I<OPCODE_SPLAT, V128<>, I32<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (i.src1.is_constant) { if (i.src1.is_constant) {
// TODO(benvanik): faster constant splats. // TODO(benvanik): faster constant splats.
e.mov(e.eax, i.src1.constant()); e.mov(e.eax, i.src1.constant());
@ -5601,7 +5568,7 @@ EMITTER(SPLAT_I32, MATCH(I<OPCODE_SPLAT, V128<>, I32<>>)) {
}; };
EMITTER(SPLAT_F32, MATCH(I<OPCODE_SPLAT, V128<>, F32<>>)) { EMITTER(SPLAT_F32, MATCH(I<OPCODE_SPLAT, V128<>, F32<>>)) {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (i.src1.is_constant) { if (i.src1.is_constant) {
// TODO(benvanik): faster constant splats. // TODO(benvanik): faster constant splats.
e.mov(e.eax, i.src1.value->constant.i32); e.mov(e.eax, i.src1.value->constant.i32);
@ -5649,7 +5616,7 @@ EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
(((control >> 0) & 0x3) << 0); (((control >> 0) & 0x3) << 0);
uint32_t blend_control = 0; uint32_t blend_control = 0;
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// Blender for vpblendd // Blender for vpblendd
blend_control = blend_control =
(((control >> 26) & 0x1) << 3) | (((control >> 26) & 0x1) << 3) |
@ -5690,7 +5657,7 @@ EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
e.vpshufd(e.xmm0, e.xmm0, src_control); e.vpshufd(e.xmm0, e.xmm0, src_control);
} }
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) { if (e.IsFeatureEnabled(kX64EmitAVX2)) {
e.vpblendd(i.dest, e.xmm0, blend_control); // $0 = $1 <blend> $2 e.vpblendd(i.dest, e.xmm0, blend_control); // $0 = $1 <blend> $2
} else { } else {
e.vpblendw(i.dest, e.xmm0, blend_control); // $0 = $1 <blend> $2 e.vpblendw(i.dest, e.xmm0, blend_control); // $0 = $1 <blend> $2