mirror of https://github.com/PCSX2/pcsx2.git
3rdparty: Update xbyak to 6.73
Updates xbyak from 6.00 to 6.73.
This commit is contained in:
parent
b644957cee
commit
bca4f15f9d
|
@ -77,7 +77,11 @@
|
|||
#endif
|
||||
#include <windows.h>
|
||||
#include <malloc.h>
|
||||
#define XBYAK_TLS __declspec(thread)
|
||||
#ifdef _MSC_VER
|
||||
#define XBYAK_TLS __declspec(thread)
|
||||
#else
|
||||
#define XBYAK_TLS __thread
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
|
@ -95,7 +99,9 @@
|
|||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#if !defined(MFD_CLOEXEC) // defined only linux 3.17 or later
|
||||
// MFD_CLOEXEC defined only linux 3.17 or later.
|
||||
// Android wraps the memfd_create syscall from API version 30.
|
||||
#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30)
|
||||
#undef XBYAK_USE_MEMFD
|
||||
#endif
|
||||
|
||||
|
@ -112,7 +118,7 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800)
|
||||
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900)
|
||||
#undef XBYAK_TLS
|
||||
#define XBYAK_TLS thread_local
|
||||
#define XBYAK_VARIADIC_TEMPLATE
|
||||
|
@ -138,11 +144,18 @@
|
|||
#pragma warning(disable : 4127) /* constant expresison */
|
||||
#endif
|
||||
|
||||
// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Warray-bounds"
|
||||
#endif
|
||||
|
||||
namespace Xbyak {
|
||||
|
||||
enum {
|
||||
DEFAULT_MAX_CODE_SIZE = 4096,
|
||||
VERSION = 0x6000 /* 0xABCD = A.BC(D) */
|
||||
VERSION = 0x6730 /* 0xABCD = A.BC(.D) */
|
||||
};
|
||||
|
||||
#ifndef MIE_INTEGER_TYPE_DEFINED
|
||||
|
@ -291,10 +304,10 @@ inline void SetError(int err) {
|
|||
inline void ClearError() {
|
||||
local::GetErrorRef() = 0;
|
||||
}
|
||||
inline int GetError() { return local::GetErrorRef(); }
|
||||
inline int GetError() { return Xbyak::local::GetErrorRef(); }
|
||||
|
||||
#define XBYAK_THROW(err) { local::SetError(err); return; }
|
||||
#define XBYAK_THROW_RET(err, r) { local::SetError(err); return r; }
|
||||
#define XBYAK_THROW(err) { Xbyak::local::SetError(err); return; }
|
||||
#define XBYAK_THROW_RET(err, r) { Xbyak::local::SetError(err); return r; }
|
||||
|
||||
#else
|
||||
class Error : public std::exception {
|
||||
|
@ -358,14 +371,36 @@ inline const To CastTo(From p) XBYAK_NOEXCEPT
|
|||
}
|
||||
namespace inner {
|
||||
|
||||
static const size_t ALIGN_PAGE_SIZE = 4096;
|
||||
#ifdef _WIN32
|
||||
struct SystemInfo {
|
||||
SYSTEM_INFO info;
|
||||
SystemInfo()
|
||||
{
|
||||
GetSystemInfo(&info);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
//static const size_t ALIGN_PAGE_SIZE = 4096;
|
||||
inline size_t getPageSize()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
static const SystemInfo si;
|
||||
return si.info.dwPageSize;
|
||||
#elif defined(__GNUC__)
|
||||
static const long pageSize = sysconf(_SC_PAGESIZE);
|
||||
if (pageSize > 0) {
|
||||
return (size_t)pageSize;
|
||||
}
|
||||
#endif
|
||||
return 4096;
|
||||
}
|
||||
|
||||
inline bool IsInDisp8(uint32_t x) { return 0xFFFFFF80 <= x || x <= 0x7F; }
|
||||
inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0x7FFFFFFFU; }
|
||||
|
||||
inline uint32_t VerifyInInt32(uint64_t x)
|
||||
{
|
||||
#ifdef XBYAK64
|
||||
#if defined(XBYAK64) && !defined(__ILP32__)
|
||||
if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0)
|
||||
#endif
|
||||
return static_cast<uint32_t>(x);
|
||||
|
@ -383,7 +418,8 @@ enum LabelMode {
|
|||
custom allocator
|
||||
*/
|
||||
struct Allocator {
|
||||
virtual uint8_t *alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, inner::ALIGN_PAGE_SIZE)); }
|
||||
explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator
|
||||
virtual uint8_t *alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, inner::getPageSize())); }
|
||||
virtual void free(uint8_t *p) { AlignedFree(p); }
|
||||
virtual ~Allocator() {}
|
||||
/* override to return false if you call protect() manually */
|
||||
|
@ -414,13 +450,24 @@ inline int getMacOsVersion()
|
|||
|
||||
} // util
|
||||
#endif
|
||||
class MmapAllocator : Allocator {
|
||||
typedef XBYAK_STD_UNORDERED_MAP<uintptr_t, size_t> SizeList;
|
||||
SizeList sizeList_;
|
||||
class MmapAllocator : public Allocator {
|
||||
struct Allocation {
|
||||
size_t size;
|
||||
#if defined(XBYAK_USE_MEMFD)
|
||||
// fd_ is only used with XBYAK_USE_MEMFD. We keep the file open
|
||||
// during the lifetime of each allocation in order to support
|
||||
// checkpoint/restore by unprivileged users.
|
||||
int fd;
|
||||
#endif
|
||||
};
|
||||
const std::string name_; // only used with XBYAK_USE_MEMFD
|
||||
typedef XBYAK_STD_UNORDERED_MAP<uintptr_t, Allocation> AllocationList;
|
||||
AllocationList allocList_;
|
||||
public:
|
||||
explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {}
|
||||
uint8_t *alloc(size_t size)
|
||||
{
|
||||
const size_t alignedSizeM1 = inner::ALIGN_PAGE_SIZE - 1;
|
||||
const size_t alignedSizeM1 = inner::getPageSize() - 1;
|
||||
size = (size + alignedSizeM1) & ~alignedSizeM1;
|
||||
#if defined(MAP_ANONYMOUS)
|
||||
int mode = MAP_PRIVATE | MAP_ANONYMOUS;
|
||||
|
@ -435,30 +482,42 @@ public:
|
|||
#endif
|
||||
int fd = -1;
|
||||
#if defined(XBYAK_USE_MEMFD)
|
||||
fd = memfd_create("xbyak", MFD_CLOEXEC);
|
||||
fd = memfd_create(name_.c_str(), MFD_CLOEXEC);
|
||||
if (fd != -1) {
|
||||
mode = MAP_SHARED;
|
||||
if (ftruncate(fd, size) != 0) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
|
||||
if (ftruncate(fd, size) != 0) {
|
||||
close(fd);
|
||||
XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
|
||||
}
|
||||
}
|
||||
#endif
|
||||
void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0);
|
||||
#if defined(XBYAK_USE_MEMFD)
|
||||
if (fd != -1) close(fd);
|
||||
#endif
|
||||
if (p == MAP_FAILED) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
|
||||
if (p == MAP_FAILED) {
|
||||
if (fd != -1) close(fd);
|
||||
XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
|
||||
}
|
||||
assert(p);
|
||||
sizeList_[(uintptr_t)p] = size;
|
||||
Allocation &alloc = allocList_[(uintptr_t)p];
|
||||
alloc.size = size;
|
||||
#if defined(XBYAK_USE_MEMFD)
|
||||
alloc.fd = fd;
|
||||
#endif
|
||||
return (uint8_t*)p;
|
||||
}
|
||||
void free(uint8_t *p)
|
||||
{
|
||||
if (p == 0) return;
|
||||
SizeList::iterator i = sizeList_.find((uintptr_t)p);
|
||||
if (i == sizeList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER)
|
||||
if (munmap((void*)i->first, i->second) < 0) XBYAK_THROW(ERR_MUNMAP)
|
||||
sizeList_.erase(i);
|
||||
AllocationList::iterator i = allocList_.find((uintptr_t)p);
|
||||
if (i == allocList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER)
|
||||
if (munmap((void*)i->first, i->second.size) < 0) XBYAK_THROW(ERR_MUNMAP)
|
||||
#if defined(XBYAK_USE_MEMFD)
|
||||
if (i->second.fd != -1) close(i->second.fd);
|
||||
#endif
|
||||
allocList_.erase(i);
|
||||
}
|
||||
};
|
||||
#else
|
||||
typedef Allocator MmapAllocator;
|
||||
#endif
|
||||
|
||||
class Address;
|
||||
|
@ -1176,9 +1235,6 @@ public:
|
|||
size_t pageSize = sysconf(_SC_PAGESIZE);
|
||||
size_t iaddr = reinterpret_cast<size_t>(addr);
|
||||
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
|
||||
#ifndef NDEBUG
|
||||
if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. not tested enough.\n", pageSize);
|
||||
#endif
|
||||
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
|
||||
#else
|
||||
return true;
|
||||
|
@ -1448,7 +1504,6 @@ public:
|
|||
clabelDefList_.clear();
|
||||
clabelUndefList_.clear();
|
||||
resetLabelPtrList();
|
||||
ClearError();
|
||||
}
|
||||
void enterLocal()
|
||||
{
|
||||
|
@ -1574,6 +1629,7 @@ public:
|
|||
enum LabelType {
|
||||
T_SHORT,
|
||||
T_NEAR,
|
||||
T_FAR, // far jump
|
||||
T_AUTO // T_SHORT if possible
|
||||
};
|
||||
private:
|
||||
|
@ -1622,6 +1678,11 @@ private:
|
|||
{
|
||||
return op1.isREG(i32e) && ((op2.isREG(i32e) && op1.getBit() == op2.getBit()) || op2.isMEM());
|
||||
}
|
||||
static inline bool isValidSSE(const Operand& op1)
|
||||
{
|
||||
// SSE instructions do not support XMM16 - XMM31
|
||||
return !(op1.isXMM() && op1.getIdx() >= 16);
|
||||
}
|
||||
void rex(const Operand& op1, const Operand& op2 = Operand())
|
||||
{
|
||||
uint8_t rex = 0;
|
||||
|
@ -1784,9 +1845,16 @@ private:
|
|||
void setSIB(const RegExp& e, int reg, int disp8N = 0)
|
||||
{
|
||||
uint64_t disp64 = e.getDisp();
|
||||
#ifdef XBYAK64
|
||||
#if defined(XBYAK64) && !defined(__ILP32__)
|
||||
#ifdef XBYAK_OLD_DISP_CHECK
|
||||
// treat 0xffffffff as 0xffffffffffffffff
|
||||
uint64_t high = disp64 >> 32;
|
||||
if (high != 0 && high != 0xFFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
|
||||
#else
|
||||
// displacement should be a signed 32-bit value, so also check sign bit
|
||||
uint64_t high = disp64 >> 31;
|
||||
if (high != 0 && high != 0x1FFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
|
||||
#endif
|
||||
#endif
|
||||
uint32_t disp = static_cast<uint32_t>(disp64);
|
||||
const Reg& base = e.getBase();
|
||||
|
@ -1887,6 +1955,7 @@ private:
|
|||
template<class T>
|
||||
void opJmp(T& label, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref)
|
||||
{
|
||||
if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory(); /* avoid splitting code of jmp */
|
||||
size_t offset = 0;
|
||||
if (labelMgr_.getOffset(&offset, label)) { /* label exists */
|
||||
|
@ -1907,6 +1976,7 @@ private:
|
|||
}
|
||||
void opJmpAbs(const void *addr, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref = 0)
|
||||
{
|
||||
if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
if (isAutoGrow()) {
|
||||
if (!isNEAR(type)) XBYAK_THROW(ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW)
|
||||
if (size_ + 16 >= maxSize_) growMemory();
|
||||
|
@ -1919,6 +1989,16 @@ private:
|
|||
}
|
||||
|
||||
}
|
||||
void opJmpOp(const Operand& op, LabelType type, int ext)
|
||||
{
|
||||
const int bit = 16|i32e;
|
||||
if (type == T_FAR) {
|
||||
if (!op.isMEM(bit)) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
opR_ModM(op, bit, ext + 1, 0xFF, NONE, NONE, false);
|
||||
} else {
|
||||
opR_ModM(op, bit, ext, 0xFF, NONE, NONE, true);
|
||||
}
|
||||
}
|
||||
// reg is reg field of ModRM
|
||||
// immSize is the size for immediate value
|
||||
// disp8N = 0(normal), disp8N = 1(force disp32), disp8N = {2, 4, 8} ; compressed displacement
|
||||
|
@ -1945,6 +2025,7 @@ private:
|
|||
void opGen(const Operand& reg, const Operand& op, int code, int pref, bool isValid(const Operand&, const Operand&), int imm8 = NONE, int preCode = NONE)
|
||||
{
|
||||
if (isValid && !isValid(reg, op)) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||
if (!isValidSSE(reg) || !isValidSSE(op)) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
if (pref != NONE) db(pref);
|
||||
if (op.isMEM()) {
|
||||
opModM(op.getAddress(), reg.getReg(), 0x0F, preCode, code, (imm8 != NONE) ? 1 : 0);
|
||||
|
@ -1955,6 +2036,7 @@ private:
|
|||
}
|
||||
void opMMX_IMM(const Mmx& mmx, int imm8, int code, int ext)
|
||||
{
|
||||
if (!isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
if (mmx.isXMM()) db(0x66);
|
||||
opModR(Reg32(ext), mmx, 0x0F, code);
|
||||
db(imm8);
|
||||
|
@ -1965,6 +2047,7 @@ private:
|
|||
}
|
||||
void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref)
|
||||
{
|
||||
if (!isValidSSE(op1) || !isValidSSE(op2)) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
if (pref != NONE) db(pref);
|
||||
if (op1.isXMM() && op2.isMEM()) {
|
||||
opModM(op2.getAddress(), op1.getReg(), 0x0F, code);
|
||||
|
@ -1976,6 +2059,7 @@ private:
|
|||
}
|
||||
void opExt(const Operand& op, const Mmx& mmx, int code, int imm, bool hasMMX2 = false)
|
||||
{
|
||||
if (!isValidSSE(op) || !isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED)
|
||||
if (hasMMX2 && op.isREG(i32e)) { /* pextrw is special */
|
||||
if (mmx.isXMM()) db(0x66);
|
||||
opModR(op.getReg(), mmx, 0x0F, 0xC5); db(imm);
|
||||
|
@ -2132,9 +2216,6 @@ private:
|
|||
{
|
||||
if (op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||
int w = op.isBit(16);
|
||||
#ifdef XBYAK64
|
||||
if (op.isHigh8bit()) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||
#endif
|
||||
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
|
||||
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
|
||||
}
|
||||
|
@ -2356,18 +2437,21 @@ private:
|
|||
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
|
||||
opVex(x, 0, addr, type, code);
|
||||
}
|
||||
void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
|
||||
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
|
||||
{
|
||||
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0);
|
||||
}
|
||||
int orEvexIf(PreferredEncoding encoding) {
|
||||
if (encoding == DefaultEncoding) {
|
||||
encoding = EvexEncoding;
|
||||
encoding = defaultEncoding_;
|
||||
}
|
||||
if (encoding == EvexEncoding) {
|
||||
#ifdef XBYAK_DISABLE_AVX512
|
||||
XBYAK_THROW(ERR_EVEX_IS_INVALID)
|
||||
#endif
|
||||
type |= T_MUST_EVEX;
|
||||
return T_MUST_EVEX;
|
||||
}
|
||||
opAVX_X_X_XM(x1, x2, op, type, code0);
|
||||
return 0;
|
||||
}
|
||||
void opInOut(const Reg& a, const Reg& d, uint8_t code)
|
||||
{
|
||||
|
@ -2452,6 +2536,7 @@ public:
|
|||
#endif
|
||||
private:
|
||||
bool isDefaultJmpNEAR_;
|
||||
PreferredEncoding defaultEncoding_;
|
||||
public:
|
||||
void L(const std::string& label) { labelMgr_.defineSlabel(label); }
|
||||
void L(Label& label) { labelMgr_.defineClabel(label); }
|
||||
|
@ -2474,13 +2559,13 @@ public:
|
|||
|
||||
// set default type of `jmp` of undefined label to T_NEAR
|
||||
void setDefaultJmpNEAR(bool isNear) { isDefaultJmpNEAR_ = isNear; }
|
||||
void jmp(const Operand& op) { opR_ModM(op, BIT, 4, 0xFF, NONE, NONE, true); }
|
||||
void jmp(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 4); }
|
||||
void jmp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); }
|
||||
void jmp(const char *label, LabelType type = T_AUTO) { jmp(std::string(label), type); }
|
||||
void jmp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); }
|
||||
void jmp(const void *addr, LabelType type = T_AUTO) { opJmpAbs(addr, type, 0xEB, 0xE9); }
|
||||
|
||||
void call(const Operand& op) { opR_ModM(op, 16 | i32e, 2, 0xFF, NONE, NONE, true); }
|
||||
void call(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 2); }
|
||||
// call(string label), not const std::string&
|
||||
void call(std::string label) { opJmp(label, T_NEAR, 0, 0xE8, 0); }
|
||||
void call(const char *label) { call(std::string(label)); }
|
||||
|
@ -2731,11 +2816,13 @@ public:
|
|||
, es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs)
|
||||
#endif
|
||||
, isDefaultJmpNEAR_(false)
|
||||
, defaultEncoding_(EvexEncoding)
|
||||
{
|
||||
labelMgr_.set(this);
|
||||
}
|
||||
void reset()
|
||||
{
|
||||
ClearError();
|
||||
resetSize();
|
||||
labelMgr_.reset();
|
||||
labelMgr_.set(this);
|
||||
|
@ -2767,6 +2854,9 @@ public:
|
|||
#undef jnl
|
||||
#endif
|
||||
|
||||
// set default encoding to select Vex or Evex
|
||||
void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
|
||||
|
||||
/*
|
||||
use single byte nop if useMultiByteNop = false
|
||||
*/
|
||||
|
@ -2813,7 +2903,7 @@ public:
|
|||
{
|
||||
if (x == 1) return;
|
||||
if (x < 1 || (x & (x - 1))) XBYAK_THROW(ERR_BAD_ALIGN)
|
||||
if (isAutoGrow() && x > inner::ALIGN_PAGE_SIZE) fprintf(stderr, "warning:autoGrow mode does not support %d align\n", (int)x);
|
||||
if (isAutoGrow()) XBYAK_THROW(ERR_BAD_ALIGN)
|
||||
size_t remain = size_t(getCurr()) % x;
|
||||
if (remain) {
|
||||
nop(x - remain, useMultiByteNop);
|
||||
|
@ -2871,6 +2961,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen
|
|||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
} // end of namespace
|
||||
|
||||
#endif // XBYAK_XBYAK_H_
|
||||
|
|
|
@ -0,0 +1,258 @@
|
|||
enum {
|
||||
B00000000= 0,
|
||||
B00000001= 1,
|
||||
B00000010= 2,
|
||||
B00000011= 3,
|
||||
B00000100= 4,
|
||||
B00000101= 5,
|
||||
B00000110= 6,
|
||||
B00000111= 7,
|
||||
B00001000= 8,
|
||||
B00001001= 9,
|
||||
B00001010= 10,
|
||||
B00001011= 11,
|
||||
B00001100= 12,
|
||||
B00001101= 13,
|
||||
B00001110= 14,
|
||||
B00001111= 15,
|
||||
B00010000= 16,
|
||||
B00010001= 17,
|
||||
B00010010= 18,
|
||||
B00010011= 19,
|
||||
B00010100= 20,
|
||||
B00010101= 21,
|
||||
B00010110= 22,
|
||||
B00010111= 23,
|
||||
B00011000= 24,
|
||||
B00011001= 25,
|
||||
B00011010= 26,
|
||||
B00011011= 27,
|
||||
B00011100= 28,
|
||||
B00011101= 29,
|
||||
B00011110= 30,
|
||||
B00011111= 31,
|
||||
B00100000= 32,
|
||||
B00100001= 33,
|
||||
B00100010= 34,
|
||||
B00100011= 35,
|
||||
B00100100= 36,
|
||||
B00100101= 37,
|
||||
B00100110= 38,
|
||||
B00100111= 39,
|
||||
B00101000= 40,
|
||||
B00101001= 41,
|
||||
B00101010= 42,
|
||||
B00101011= 43,
|
||||
B00101100= 44,
|
||||
B00101101= 45,
|
||||
B00101110= 46,
|
||||
B00101111= 47,
|
||||
B00110000= 48,
|
||||
B00110001= 49,
|
||||
B00110010= 50,
|
||||
B00110011= 51,
|
||||
B00110100= 52,
|
||||
B00110101= 53,
|
||||
B00110110= 54,
|
||||
B00110111= 55,
|
||||
B00111000= 56,
|
||||
B00111001= 57,
|
||||
B00111010= 58,
|
||||
B00111011= 59,
|
||||
B00111100= 60,
|
||||
B00111101= 61,
|
||||
B00111110= 62,
|
||||
B00111111= 63,
|
||||
B01000000= 64,
|
||||
B01000001= 65,
|
||||
B01000010= 66,
|
||||
B01000011= 67,
|
||||
B01000100= 68,
|
||||
B01000101= 69,
|
||||
B01000110= 70,
|
||||
B01000111= 71,
|
||||
B01001000= 72,
|
||||
B01001001= 73,
|
||||
B01001010= 74,
|
||||
B01001011= 75,
|
||||
B01001100= 76,
|
||||
B01001101= 77,
|
||||
B01001110= 78,
|
||||
B01001111= 79,
|
||||
B01010000= 80,
|
||||
B01010001= 81,
|
||||
B01010010= 82,
|
||||
B01010011= 83,
|
||||
B01010100= 84,
|
||||
B01010101= 85,
|
||||
B01010110= 86,
|
||||
B01010111= 87,
|
||||
B01011000= 88,
|
||||
B01011001= 89,
|
||||
B01011010= 90,
|
||||
B01011011= 91,
|
||||
B01011100= 92,
|
||||
B01011101= 93,
|
||||
B01011110= 94,
|
||||
B01011111= 95,
|
||||
B01100000= 96,
|
||||
B01100001= 97,
|
||||
B01100010= 98,
|
||||
B01100011= 99,
|
||||
B01100100= 100,
|
||||
B01100101= 101,
|
||||
B01100110= 102,
|
||||
B01100111= 103,
|
||||
B01101000= 104,
|
||||
B01101001= 105,
|
||||
B01101010= 106,
|
||||
B01101011= 107,
|
||||
B01101100= 108,
|
||||
B01101101= 109,
|
||||
B01101110= 110,
|
||||
B01101111= 111,
|
||||
B01110000= 112,
|
||||
B01110001= 113,
|
||||
B01110010= 114,
|
||||
B01110011= 115,
|
||||
B01110100= 116,
|
||||
B01110101= 117,
|
||||
B01110110= 118,
|
||||
B01110111= 119,
|
||||
B01111000= 120,
|
||||
B01111001= 121,
|
||||
B01111010= 122,
|
||||
B01111011= 123,
|
||||
B01111100= 124,
|
||||
B01111101= 125,
|
||||
B01111110= 126,
|
||||
B01111111= 127,
|
||||
B10000000= 128,
|
||||
B10000001= 129,
|
||||
B10000010= 130,
|
||||
B10000011= 131,
|
||||
B10000100= 132,
|
||||
B10000101= 133,
|
||||
B10000110= 134,
|
||||
B10000111= 135,
|
||||
B10001000= 136,
|
||||
B10001001= 137,
|
||||
B10001010= 138,
|
||||
B10001011= 139,
|
||||
B10001100= 140,
|
||||
B10001101= 141,
|
||||
B10001110= 142,
|
||||
B10001111= 143,
|
||||
B10010000= 144,
|
||||
B10010001= 145,
|
||||
B10010010= 146,
|
||||
B10010011= 147,
|
||||
B10010100= 148,
|
||||
B10010101= 149,
|
||||
B10010110= 150,
|
||||
B10010111= 151,
|
||||
B10011000= 152,
|
||||
B10011001= 153,
|
||||
B10011010= 154,
|
||||
B10011011= 155,
|
||||
B10011100= 156,
|
||||
B10011101= 157,
|
||||
B10011110= 158,
|
||||
B10011111= 159,
|
||||
B10100000= 160,
|
||||
B10100001= 161,
|
||||
B10100010= 162,
|
||||
B10100011= 163,
|
||||
B10100100= 164,
|
||||
B10100101= 165,
|
||||
B10100110= 166,
|
||||
B10100111= 167,
|
||||
B10101000= 168,
|
||||
B10101001= 169,
|
||||
B10101010= 170,
|
||||
B10101011= 171,
|
||||
B10101100= 172,
|
||||
B10101101= 173,
|
||||
B10101110= 174,
|
||||
B10101111= 175,
|
||||
B10110000= 176,
|
||||
B10110001= 177,
|
||||
B10110010= 178,
|
||||
B10110011= 179,
|
||||
B10110100= 180,
|
||||
B10110101= 181,
|
||||
B10110110= 182,
|
||||
B10110111= 183,
|
||||
B10111000= 184,
|
||||
B10111001= 185,
|
||||
B10111010= 186,
|
||||
B10111011= 187,
|
||||
B10111100= 188,
|
||||
B10111101= 189,
|
||||
B10111110= 190,
|
||||
B10111111= 191,
|
||||
B11000000= 192,
|
||||
B11000001= 193,
|
||||
B11000010= 194,
|
||||
B11000011= 195,
|
||||
B11000100= 196,
|
||||
B11000101= 197,
|
||||
B11000110= 198,
|
||||
B11000111= 199,
|
||||
B11001000= 200,
|
||||
B11001001= 201,
|
||||
B11001010= 202,
|
||||
B11001011= 203,
|
||||
B11001100= 204,
|
||||
B11001101= 205,
|
||||
B11001110= 206,
|
||||
B11001111= 207,
|
||||
B11010000= 208,
|
||||
B11010001= 209,
|
||||
B11010010= 210,
|
||||
B11010011= 211,
|
||||
B11010100= 212,
|
||||
B11010101= 213,
|
||||
B11010110= 214,
|
||||
B11010111= 215,
|
||||
B11011000= 216,
|
||||
B11011001= 217,
|
||||
B11011010= 218,
|
||||
B11011011= 219,
|
||||
B11011100= 220,
|
||||
B11011101= 221,
|
||||
B11011110= 222,
|
||||
B11011111= 223,
|
||||
B11100000= 224,
|
||||
B11100001= 225,
|
||||
B11100010= 226,
|
||||
B11100011= 227,
|
||||
B11100100= 228,
|
||||
B11100101= 229,
|
||||
B11100110= 230,
|
||||
B11100111= 231,
|
||||
B11101000= 232,
|
||||
B11101001= 233,
|
||||
B11101010= 234,
|
||||
B11101011= 235,
|
||||
B11101100= 236,
|
||||
B11101101= 237,
|
||||
B11101110= 238,
|
||||
B11101111= 239,
|
||||
B11110000= 240,
|
||||
B11110001= 241,
|
||||
B11110010= 242,
|
||||
B11110011= 243,
|
||||
B11110100= 244,
|
||||
B11110101= 245,
|
||||
B11110110= 246,
|
||||
B11110111= 247,
|
||||
B11111000= 248,
|
||||
B11111001= 249,
|
||||
B11111010= 250,
|
||||
B11111011= 251,
|
||||
B11111100= 252,
|
||||
B11111101= 253,
|
||||
B11111110= 254,
|
||||
B11111111= 255
|
||||
};
|
|
@ -1,4 +1,6 @@
|
|||
const char *getVersionString() const { return "6.00"; }
|
||||
const char *getVersionString() const { return "6.73"; }
|
||||
void aadd(const Address& addr, const Reg32e ®) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
|
||||
void aand(const Address& addr, const Reg32e ®) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
|
||||
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
|
||||
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
|
||||
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
|
||||
|
@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM
|
|||
void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); }
|
||||
void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
|
||||
void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
|
||||
void aor(const Address& addr, const Reg32e ®) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
|
||||
void axor(const Address& addr, const Reg32e ®) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
|
||||
void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); }
|
||||
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
|
||||
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
|
||||
|
@ -57,9 +61,11 @@ void cbw() { db(0x66); db(0x98); }
|
|||
void cdq() { db(0x99); }
|
||||
void clc() { db(0xF8); }
|
||||
void cld() { db(0xFC); }
|
||||
void cldemote(const Address& addr) { opMIB(addr, eax, 0x0F, 0x1C); }
|
||||
void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
|
||||
void clflushopt(const Address& addr) { db(0x66); opModM(addr, Reg32(7), 0x0F, 0xAE); }
|
||||
void cli() { db(0xFA); }
|
||||
void clwb(const Address& addr) { db(0x66); opMIB(addr, esi, 0x0F, 0xAE); }
|
||||
void clzero() { db(0x0F); db(0x01); db(0xFC); }
|
||||
void cmc() { db(0xF5); }
|
||||
void cmova(const Reg& reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 7); }//-V524
|
||||
|
@ -323,6 +329,7 @@ void gf2p8affineqb(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op,
|
|||
void gf2p8mulb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
|
||||
void haddpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0x66, isXMM_XMMorMEM); }
|
||||
void haddps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0xF2, isXMM_XMMorMEM); }
|
||||
void hlt() { db(0xF4); }
|
||||
void hsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0x66, isXMM_XMMorMEM); }
|
||||
void hsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0xF2, isXMM_XMMorMEM); }
|
||||
void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
|
||||
|
@ -500,6 +507,8 @@ void movd(const Mmx& mmx, const Address& addr) { if (mmx.isXMM()) db(0x66); opMo
|
|||
void movd(const Mmx& mmx, const Reg32& reg) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x6E); }
|
||||
void movd(const Reg32& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
|
||||
void movddup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF2, isXMM_XMMorMEM, NONE, NONE); }
|
||||
void movdir64b(const Reg& reg, const Address& addr) { db(0x66); opModM(addr, reg.cvt32(), 0x0F, 0x38, 0xF8); }
|
||||
void movdiri(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF9); }
|
||||
void movdq2q(const Mmx& mmx, const Xmm& xmm) { db(0xF2); opModR(mmx, xmm, 0x0F, 0xD6); }
|
||||
void movdqa(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x7F); }
|
||||
void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0x66); }
|
||||
|
@ -580,9 +589,9 @@ void pavgb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE0); }
|
|||
void pavgw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE3); }
|
||||
void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
|
||||
void pblendw(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0E, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
|
||||
void pclmulhqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); }
|
||||
void pclmulhqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); }
|
||||
void pclmulhqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x01); }
|
||||
void pclmullqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); }
|
||||
void pclmullqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); }
|
||||
void pclmullqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x00); }
|
||||
void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
|
||||
void pcmpeqb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x74); }
|
||||
|
@ -649,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
|
|||
void popcnt(const Reg®, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); }
|
||||
void popf() { db(0x9D); }
|
||||
void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
|
||||
void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); }
|
||||
void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); }
|
||||
void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); }
|
||||
void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
|
||||
void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
|
||||
|
@ -719,6 +730,7 @@ void repne() { db(0xF2); }
|
|||
void repnz() { db(0xF2); }
|
||||
void repz() { db(0xF3); }
|
||||
void ret(int imm = 0) { if (imm) { db(0xC2); dw(imm); } else { db(0xC3); } }
|
||||
void retf(int imm = 0) { if (imm) { db(0xCA); dw(imm); } else { db(0xCB); } }
|
||||
void rol(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 0); }
|
||||
void rol(const Operand& op, int imm) { opShift(op, imm, 0); }
|
||||
void ror(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 1); }
|
||||
|
@ -741,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
|
|||
void scasb() { db(0xAE); }
|
||||
void scasd() { db(0xAF); }
|
||||
void scasw() { db(0x66); db(0xAF); }
|
||||
void serialize() { db(0x0F); db(0x01); db(0xE8); }
|
||||
void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524
|
||||
void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524
|
||||
void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524
|
||||
|
@ -811,10 +824,13 @@ void subsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF2, isXMM
|
|||
void subss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF3, isXMM_XMMorMEM); }
|
||||
void sysenter() { db(0x0F); db(0x34); }
|
||||
void sysexit() { db(0x0F); db(0x35); }
|
||||
void tpause(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
|
||||
void tzcnt(const Reg®, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBC); }
|
||||
void ucomisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x66, isXMM_XMMorMEM); }
|
||||
void ucomiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x100, isXMM_XMMorMEM); }
|
||||
void ud2() { db(0x0F); db(0x0B); }
|
||||
void umonitor(const Reg& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit(); if (BIT != bit) { if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) { db(0x67); } else { XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) } } db(0xF3); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
|
||||
void umwait(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
|
||||
void unpckhpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM); }
|
||||
void unpckhps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x100, isXMM_XMMorMEM); }
|
||||
void unpcklpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM); }
|
||||
|
@ -835,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
|
|||
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); }
|
||||
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); }
|
||||
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); }
|
||||
void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
|
||||
void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
|
||||
void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); }
|
||||
void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); }
|
||||
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); }
|
||||
|
@ -979,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T
|
|||
void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); }
|
||||
void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); }
|
||||
void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); }
|
||||
void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); }
|
||||
void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); }
|
||||
void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); }
|
||||
void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); }
|
||||
void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }
|
||||
void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }
|
||||
void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }
|
||||
void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }
|
||||
|
@ -1169,6 +1192,10 @@ void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isME
|
|||
void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x58); }
|
||||
void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX, 0x59); }
|
||||
void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x79); }
|
||||
void vpclmulhqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x11); }
|
||||
void vpclmulhqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x01); }
|
||||
void vpclmullqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x10); }
|
||||
void vpclmullqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x00); }
|
||||
void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM | T_EVEX, 0x44, imm); }
|
||||
void vpcmpeqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x74); }
|
||||
void vpcmpeqd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x76); }
|
||||
|
@ -1182,10 +1209,22 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
|
|||
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
|
||||
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
|
||||
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
|
||||
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
|
||||
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
|
||||
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
|
||||
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
|
||||
void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); }
|
||||
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); }
|
||||
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); }
|
||||
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); }
|
||||
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
|
||||
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
|
||||
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); }
|
||||
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); }
|
||||
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
|
||||
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
|
||||
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD2); }
|
||||
void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD3); }
|
||||
void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD2); }
|
||||
void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD3); }
|
||||
void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD2); }
|
||||
void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD3); }
|
||||
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
|
||||
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
|
||||
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); }
|
||||
|
@ -1217,6 +1256,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if
|
|||
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); }
|
||||
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); }
|
||||
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); }
|
||||
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); }
|
||||
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); }
|
||||
void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); }
|
||||
void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); }
|
||||
void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); }
|
||||
|
@ -1313,8 +1354,16 @@ void vroundsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { op
|
|||
void vroundss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x0A, imm); }
|
||||
void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x52); }
|
||||
void vrsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F, 0x52); }
|
||||
void vsha512msg1(const Ymm& y, const Xmm& x) { if (!(y.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y, 0, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCC); }
|
||||
void vsha512msg2(const Ymm& y1, const Ymm& y2) { if (!(y1.isYMM() && y2.isYMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, 0, y2, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCD); }
|
||||
void vsha512rnds2(const Ymm& y1, const Ymm& y2, const Xmm& x) { if (!(y1.isYMM() && y2.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, &y2, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCB); }
|
||||
void vshufpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xC6, imm); }
|
||||
void vshufps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xC6, imm); }
|
||||
void vsm3msg1(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
|
||||
void vsm3msg2(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
|
||||
void vsm3rnds2(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0xDE, imm); }
|
||||
void vsm4key4(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
|
||||
void vsm4rnds4(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
|
||||
void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x51); }
|
||||
void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x51); }
|
||||
void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX | T_ER_X, 0x51); }
|
||||
|
@ -1339,7 +1388,10 @@ void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
|
|||
void wait() { db(0x9B); }
|
||||
void wbinvd() { db(0x0F); db(0x09); }
|
||||
void wrmsr() { db(0x0F); db(0x30); }
|
||||
void xabort(uint8_t imm) { db(0xC6); db(0xF8); db(imm); }
|
||||
void xadd(const Operand& op, const Reg& reg) { opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F, 0xC0 | (reg.isBit(8) ? 0 : 1)); }
|
||||
void xbegin(uint32_t rel) { db(0xC7); db(0xF8); dd(rel); }
|
||||
void xend() { db(0x0F); db(0x01); db(0xD5); }
|
||||
void xgetbv() { db(0x0F); db(0x01); db(0xD0); }
|
||||
void xlatb() { db(0xD7); }
|
||||
void xor_(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x30, 6); }
|
||||
|
@ -1620,6 +1672,10 @@ void scasq() { db(0x48); db(0xAF); }
|
|||
void stosq() { db(0x48); db(0xAB); }
|
||||
void syscall() { db(0x0F); db(0x05); }
|
||||
void sysret() { db(0x0F); db(0x07); }
|
||||
void clui() { db(0xF3); db(0x0F); db(0x01); db(0xEE); }
|
||||
void stui() { db(0xF3); db(0x0F); db(0x01); db(0xEF); }
|
||||
void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); }
|
||||
void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); }
|
||||
void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); }
|
||||
void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); }
|
||||
void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
|
||||
|
@ -1627,12 +1683,29 @@ void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR(
|
|||
void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }
|
||||
void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }
|
||||
void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }
|
||||
void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }
|
||||
void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }
|
||||
void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }
|
||||
void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D); }
|
||||
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); }
|
||||
void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }
|
||||
void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }
|
||||
void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); }
|
||||
void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); }
|
||||
void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); }
|
||||
void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); }
|
||||
void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); }
|
||||
void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); }
|
||||
void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); }
|
||||
void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); }
|
||||
void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); }
|
||||
void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); }
|
||||
void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); }
|
||||
void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); }
|
||||
void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); }
|
||||
void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); }
|
||||
void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); }
|
||||
void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); }
|
||||
void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
|
||||
void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
|
||||
void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
|
||||
|
@ -1644,6 +1717,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T
|
|||
void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
|
||||
void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
|
||||
void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
|
||||
void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
|
||||
void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
|
||||
#else
|
||||
void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
|
||||
|
@ -1898,7 +1972,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 |
|
|||
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
|
||||
void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); }
|
||||
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
|
||||
void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
|
||||
void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); }
|
||||
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
|
||||
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
|
||||
|
@ -2132,38 +2205,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T
|
|||
void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); }
|
||||
void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); }
|
||||
void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); }
|
||||
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); }
|
||||
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); }
|
||||
void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); }
|
||||
void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); }
|
||||
void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); }
|
||||
void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); }
|
||||
void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); }
|
||||
void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); }
|
||||
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); }
|
||||
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); }
|
||||
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); }
|
||||
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); }
|
||||
void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); }
|
||||
void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); }
|
||||
void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); }
|
||||
void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); }
|
||||
void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); }
|
||||
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); }
|
||||
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); }
|
||||
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); }
|
||||
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); }
|
||||
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); }
|
||||
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); }
|
||||
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); }
|
||||
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); }
|
||||
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); }
|
||||
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); }
|
||||
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); }
|
||||
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); }
|
||||
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); }
|
||||
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); }
|
||||
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); }
|
||||
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); }
|
||||
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); }
|
||||
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); }
|
||||
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); }
|
||||
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); }
|
||||
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); }
|
||||
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); }
|
||||
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); }
|
||||
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); }
|
||||
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); }
|
||||
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); }
|
||||
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); }
|
||||
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); }
|
||||
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); }
|
||||
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); }
|
||||
void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); }
|
||||
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); }
|
||||
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); }
|
||||
void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
|
||||
void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); }
|
||||
void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); }
|
||||
|
|
|
@ -4,12 +4,18 @@
|
|||
#ifdef XBYAK_ONLY_CLASS_CPU
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#ifndef XBYAK_THROW
|
||||
#define XBYAK_THROW(x) ;
|
||||
#define XBYAK_THROW_RET(x, y) return y;
|
||||
#endif
|
||||
#ifndef XBYAK_CONSTEXPR
|
||||
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
|
||||
#define XBYAK_CONSTEXPR constexpr
|
||||
#else
|
||||
#define XBYAK_CONSTEXPR
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#include <string.h>
|
||||
|
||||
|
@ -17,7 +23,6 @@
|
|||
utility class and functions for Xbyak
|
||||
Xbyak::util::Clock ; rdtsc timer
|
||||
Xbyak::util::Cpu ; detect CPU
|
||||
@note this header is UNDER CONSTRUCTION!
|
||||
*/
|
||||
#include "xbyak.h"
|
||||
#endif // XBYAK_ONLY_CLASS_CPU
|
||||
|
@ -27,8 +32,8 @@
|
|||
#endif
|
||||
|
||||
#ifdef XBYAK_INTEL_CPU_SPECIFIC
|
||||
#ifdef _MSC_VER
|
||||
#if (_MSC_VER < 1400) && defined(XBYAK32)
|
||||
#ifdef _WIN32
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1400) && defined(XBYAK32)
|
||||
static inline __declspec(naked) void __cpuid(int[4], int)
|
||||
{
|
||||
__asm {
|
||||
|
@ -88,32 +93,69 @@ typedef enum {
|
|||
CoreLevel = 2
|
||||
} IntelCpuTopologyLevel;
|
||||
|
||||
namespace local {
|
||||
|
||||
template<uint64_t L, uint64_t H = 0>
|
||||
struct TypeT {
|
||||
};
|
||||
|
||||
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
|
||||
XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
|
||||
|
||||
template<typename T>
|
||||
inline T max_(T x, T y) { return x >= y ? x : y; }
|
||||
template<typename T>
|
||||
inline T min_(T x, T y) { return x < y ? x : y; }
|
||||
|
||||
} // local
|
||||
|
||||
/**
|
||||
CPU detection class
|
||||
@note static inline const member is supported by c++17 or later, so use template hack
|
||||
*/
|
||||
class Cpu {
|
||||
uint64_t type_;
|
||||
public:
|
||||
class Type {
|
||||
uint64_t L;
|
||||
uint64_t H;
|
||||
public:
|
||||
Type(uint64_t L = 0, uint64_t H = 0) : L(L), H(H) { }
|
||||
template<uint64_t L_, uint64_t H_>
|
||||
Type(local::TypeT<L_, H_>) : L(L_), H(H_) {}
|
||||
Type& operator&=(const Type& rhs) { L &= rhs.L; H &= rhs.H; return *this; }
|
||||
Type& operator|=(const Type& rhs) { L |= rhs.L; H |= rhs.H; return *this; }
|
||||
Type operator&(const Type& rhs) const { Type t = *this; t &= rhs; return t; }
|
||||
Type operator|(const Type& rhs) const { Type t = *this; t |= rhs; return t; }
|
||||
bool operator==(const Type& rhs) const { return H == rhs.H && L == rhs.L; }
|
||||
bool operator!=(const Type& rhs) const { return !operator==(rhs); }
|
||||
// without explicit because backward compatilibity
|
||||
operator bool() const { return (H | L) != 0; }
|
||||
uint64_t getL() const { return L; }
|
||||
uint64_t getH() const { return H; }
|
||||
};
|
||||
private:
|
||||
Type type_;
|
||||
//system topology
|
||||
bool x2APIC_supported_;
|
||||
static const size_t maxTopologyLevels = 2;
|
||||
unsigned int numCores_[maxTopologyLevels];
|
||||
uint32_t numCores_[maxTopologyLevels];
|
||||
|
||||
static const unsigned int maxNumberCacheLevels = 10;
|
||||
unsigned int dataCacheSize_[maxNumberCacheLevels];
|
||||
unsigned int coresSharignDataCache_[maxNumberCacheLevels];
|
||||
unsigned int dataCacheLevels_;
|
||||
static const uint32_t maxNumberCacheLevels = 10;
|
||||
uint32_t dataCacheSize_[maxNumberCacheLevels];
|
||||
uint32_t coresSharignDataCache_[maxNumberCacheLevels];
|
||||
uint32_t dataCacheLevels_;
|
||||
|
||||
unsigned int get32bitAsBE(const char *x) const
|
||||
uint32_t get32bitAsBE(const char *x) const
|
||||
{
|
||||
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
|
||||
}
|
||||
unsigned int mask(int n) const
|
||||
uint32_t mask(int n) const
|
||||
{
|
||||
return (1U << n) - 1;
|
||||
}
|
||||
void setFamily()
|
||||
{
|
||||
unsigned int data[4] = {};
|
||||
uint32_t data[4] = {};
|
||||
getCpuid(1, data);
|
||||
stepping = data[0] & mask(4);
|
||||
model = (data[0] >> 4) & mask(4);
|
||||
|
@ -132,17 +174,15 @@ class Cpu {
|
|||
displayModel = model;
|
||||
}
|
||||
}
|
||||
unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
|
||||
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end)
|
||||
{
|
||||
return (val >> base) & ((1u << (end - base)) - 1);
|
||||
}
|
||||
void setNumCores()
|
||||
{
|
||||
if ((type_ & tINTEL) == 0) return;
|
||||
if (!has(tINTEL) && !has(tAMD)) return;
|
||||
|
||||
unsigned int data[4] = {};
|
||||
|
||||
/* CAUTION: These numbers are configuration as shipped by Intel. */
|
||||
uint32_t data[4] = {};
|
||||
getCpuidEx(0x0, 0, data);
|
||||
if (data[0] >= 0xB) {
|
||||
/*
|
||||
|
@ -152,7 +192,7 @@ class Cpu {
|
|||
leaf 0xB can be zeroed-out by a hypervisor
|
||||
*/
|
||||
x2APIC_supported_ = true;
|
||||
for (unsigned int i = 0; i < maxTopologyLevels; i++) {
|
||||
for (uint32_t i = 0; i < maxTopologyLevels; i++) {
|
||||
getCpuidEx(0xB, i, data);
|
||||
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
|
||||
if (level == SmtLevel || level == CoreLevel) {
|
||||
|
@ -162,8 +202,8 @@ class Cpu {
|
|||
/*
|
||||
Fallback values in case a hypervisor has 0xB leaf zeroed-out.
|
||||
*/
|
||||
numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
|
||||
numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
|
||||
numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
|
||||
numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
|
||||
} else {
|
||||
/*
|
||||
Failed to deremine num of cores without x2APIC support.
|
||||
|
@ -176,14 +216,55 @@ class Cpu {
|
|||
}
|
||||
void setCacheHierarchy()
|
||||
{
|
||||
if ((type_ & tINTEL) == 0) return;
|
||||
const unsigned int NO_CACHE = 0;
|
||||
const unsigned int DATA_CACHE = 1;
|
||||
// const unsigned int INSTRUCTION_CACHE = 2;
|
||||
const unsigned int UNIFIED_CACHE = 3;
|
||||
unsigned int smt_width = 0;
|
||||
unsigned int logical_cores = 0;
|
||||
unsigned int data[4] = {};
|
||||
if (!has(tINTEL) && !has(tAMD)) return;
|
||||
|
||||
// https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
|
||||
if (has(tAMD)) {
|
||||
// There are 3 Data Cache Levels (L1, L2, L3)
|
||||
dataCacheLevels_ = 3;
|
||||
const uint32_t leaf = 0x8000001D; // for modern AMD CPus
|
||||
// Sub leaf value ranges from 0 to 3
|
||||
// Sub leaf value 0 refers to L1 Data Cache
|
||||
// Sub leaf value 1 refers to L1 Instruction Cache
|
||||
// Sub leaf value 2 refers to L2 Cache
|
||||
// Sub leaf value 3 refers to L3 Cache
|
||||
// For legacy AMD CPU, use leaf 0x80000005 for L1 cache
|
||||
// and 0x80000006 for L2 and L3 cache
|
||||
int cache_index = 0;
|
||||
for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
|
||||
// Skip sub_leaf = 1 as it refers to
|
||||
// L1 Instruction Cache (not required)
|
||||
if (sub_leaf == 1) {
|
||||
continue;
|
||||
}
|
||||
uint32_t data[4] = {};
|
||||
getCpuidEx(leaf, sub_leaf, data);
|
||||
// Cache Size = Line Size * Partitions * Associativity * Cache Sets
|
||||
dataCacheSize_[cache_index] =
|
||||
(extractBit(data[1], 22, 31) + 1) // Associativity-1
|
||||
* (extractBit(data[1], 12, 21) + 1) // Partitions-1
|
||||
* (extractBit(data[1], 0, 11) + 1) // Line Size
|
||||
* (data[2] + 1);
|
||||
// Calculate the number of cores sharing the current data cache
|
||||
int smt_width = numCores_[0];
|
||||
int logical_cores = numCores_[1];
|
||||
int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
|
||||
if (logical_cores != 0) {
|
||||
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
|
||||
}
|
||||
coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1);
|
||||
++cache_index;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// intel
|
||||
const uint32_t NO_CACHE = 0;
|
||||
const uint32_t DATA_CACHE = 1;
|
||||
// const uint32_t INSTRUCTION_CACHE = 2;
|
||||
const uint32_t UNIFIED_CACHE = 3;
|
||||
uint32_t smt_width = 0;
|
||||
uint32_t logical_cores = 0;
|
||||
uint32_t data[4] = {};
|
||||
|
||||
if (x2APIC_supported_) {
|
||||
smt_width = numCores_[0];
|
||||
|
@ -201,12 +282,12 @@ class Cpu {
|
|||
*/
|
||||
for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
|
||||
getCpuidEx(0x4, i, data);
|
||||
unsigned int cacheType = extractBit(data[0], 0, 4);
|
||||
uint32_t cacheType = extractBit(data[0], 0, 4);
|
||||
if (cacheType == NO_CACHE) break;
|
||||
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
|
||||
unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
|
||||
uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
|
||||
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
|
||||
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
|
||||
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
|
||||
}
|
||||
assert(actual_logical_cores != 0);
|
||||
dataCacheSize_[dataCacheLevels_] =
|
||||
|
@ -216,7 +297,7 @@ class Cpu {
|
|||
* (data[2] + 1);
|
||||
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
|
||||
assert(smt_width != 0);
|
||||
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
|
||||
coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
|
||||
dataCacheLevels_++;
|
||||
}
|
||||
}
|
||||
|
@ -231,7 +312,7 @@ public:
|
|||
int displayFamily; // family + extFamily
|
||||
int displayModel; // model + extModel
|
||||
|
||||
unsigned int getNumCores(IntelCpuTopologyLevel level) const {
|
||||
uint32_t getNumCores(IntelCpuTopologyLevel level) const {
|
||||
if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
|
||||
switch (level) {
|
||||
case SmtLevel: return numCores_[level - 1];
|
||||
|
@ -240,13 +321,13 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
|
||||
unsigned int getCoresSharingDataCache(unsigned int i) const
|
||||
uint32_t getDataCacheLevels() const { return dataCacheLevels_; }
|
||||
uint32_t getCoresSharingDataCache(uint32_t i) const
|
||||
{
|
||||
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
|
||||
return coresSharignDataCache_[i];
|
||||
}
|
||||
unsigned int getDataCacheSize(unsigned int i) const
|
||||
uint32_t getDataCacheSize(uint32_t i) const
|
||||
{
|
||||
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
|
||||
return dataCacheSize_[i];
|
||||
|
@ -255,10 +336,10 @@ public:
|
|||
/*
|
||||
data[] = { eax, ebx, ecx, edx }
|
||||
*/
|
||||
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
|
||||
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
|
||||
{
|
||||
#ifdef XBYAK_INTEL_CPU_SPECIFIC
|
||||
#ifdef _MSC_VER
|
||||
#ifdef _WIN32
|
||||
__cpuid(reinterpret_cast<int*>(data), eaxIn);
|
||||
#else
|
||||
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
|
||||
|
@ -268,10 +349,10 @@ public:
|
|||
(void)data;
|
||||
#endif
|
||||
}
|
||||
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
|
||||
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
|
||||
{
|
||||
#ifdef XBYAK_INTEL_CPU_SPECIFIC
|
||||
#ifdef _MSC_VER
|
||||
#ifdef _WIN32
|
||||
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
|
||||
#else
|
||||
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
|
||||
|
@ -288,7 +369,7 @@ public:
|
|||
#ifdef _MSC_VER
|
||||
return _xgetbv(0);
|
||||
#else
|
||||
unsigned int eax, edx;
|
||||
uint32_t eax, edx;
|
||||
// xgetvb is not support on gcc 4.2
|
||||
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
|
||||
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
|
||||
|
@ -298,93 +379,116 @@ public:
|
|||
return 0;
|
||||
#endif
|
||||
}
|
||||
typedef uint64_t Type;
|
||||
|
||||
static const Type NONE = 0;
|
||||
static const Type tMMX = 1 << 0;
|
||||
static const Type tMMX2 = 1 << 1;
|
||||
static const Type tCMOV = 1 << 2;
|
||||
static const Type tSSE = 1 << 3;
|
||||
static const Type tSSE2 = 1 << 4;
|
||||
static const Type tSSE3 = 1 << 5;
|
||||
static const Type tSSSE3 = 1 << 6;
|
||||
static const Type tSSE41 = 1 << 7;
|
||||
static const Type tSSE42 = 1 << 8;
|
||||
static const Type tPOPCNT = 1 << 9;
|
||||
static const Type tAESNI = 1 << 10;
|
||||
static const Type tOSXSAVE = 1 << 12;
|
||||
static const Type tPCLMULQDQ = 1 << 13;
|
||||
static const Type tAVX = 1 << 14;
|
||||
static const Type tFMA = 1 << 15;
|
||||
#define XBYAK_SPLIT_ID(id) ((0 <= id && id < 64) ? (1ull << (id % 64)) : 0), (id >= 64 ? (1ull << (id % 64)) : 0)
|
||||
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && (_MSC_VER >= 1700)) /* VS2012 */
|
||||
#define XBYAK_DEFINE_TYPE(id, NAME) static const constexpr local::TypeT<XBYAK_SPLIT_ID(id)> NAME{}
|
||||
#else
|
||||
#define XBYAK_DEFINE_TYPE(id, NAME) static const local::TypeT<XBYAK_SPLIT_ID(id)> NAME
|
||||
#endif
|
||||
XBYAK_DEFINE_TYPE(0, tMMX);
|
||||
XBYAK_DEFINE_TYPE(1, tMMX2);
|
||||
XBYAK_DEFINE_TYPE(2, tCMOV);
|
||||
XBYAK_DEFINE_TYPE(3, tSSE);
|
||||
XBYAK_DEFINE_TYPE(4, tSSE2);
|
||||
XBYAK_DEFINE_TYPE(5, tSSE3);
|
||||
XBYAK_DEFINE_TYPE(6, tSSSE3);
|
||||
XBYAK_DEFINE_TYPE(7, tSSE41);
|
||||
XBYAK_DEFINE_TYPE(8, tSSE42);
|
||||
XBYAK_DEFINE_TYPE(9, tPOPCNT);
|
||||
XBYAK_DEFINE_TYPE(10, tAESNI);
|
||||
XBYAK_DEFINE_TYPE(11, tAVX512_FP16);
|
||||
XBYAK_DEFINE_TYPE(12, tOSXSAVE);
|
||||
XBYAK_DEFINE_TYPE(13, tPCLMULQDQ);
|
||||
XBYAK_DEFINE_TYPE(14, tAVX);
|
||||
XBYAK_DEFINE_TYPE(15, tFMA);
|
||||
XBYAK_DEFINE_TYPE(16, t3DN);
|
||||
XBYAK_DEFINE_TYPE(17, tE3DN);
|
||||
XBYAK_DEFINE_TYPE(18, tWAITPKG);
|
||||
XBYAK_DEFINE_TYPE(19, tRDTSCP);
|
||||
XBYAK_DEFINE_TYPE(20, tAVX2);
|
||||
XBYAK_DEFINE_TYPE(21, tBMI1); // andn, bextr, blsi, blsmsk, blsr, tzcnt
|
||||
XBYAK_DEFINE_TYPE(22, tBMI2); // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
|
||||
XBYAK_DEFINE_TYPE(23, tLZCNT);
|
||||
XBYAK_DEFINE_TYPE(24, tINTEL);
|
||||
XBYAK_DEFINE_TYPE(25, tAMD);
|
||||
XBYAK_DEFINE_TYPE(26, tENHANCED_REP); // enhanced rep movsb/stosb
|
||||
XBYAK_DEFINE_TYPE(27, tRDRAND);
|
||||
XBYAK_DEFINE_TYPE(28, tADX); // adcx, adox
|
||||
XBYAK_DEFINE_TYPE(29, tRDSEED); // rdseed
|
||||
XBYAK_DEFINE_TYPE(30, tSMAP); // stac
|
||||
XBYAK_DEFINE_TYPE(31, tHLE); // xacquire, xrelease, xtest
|
||||
XBYAK_DEFINE_TYPE(32, tRTM); // xbegin, xend, xabort
|
||||
XBYAK_DEFINE_TYPE(33, tF16C); // vcvtph2ps, vcvtps2ph
|
||||
XBYAK_DEFINE_TYPE(34, tMOVBE); // mobve
|
||||
XBYAK_DEFINE_TYPE(35, tAVX512F);
|
||||
XBYAK_DEFINE_TYPE(36, tAVX512DQ);
|
||||
XBYAK_DEFINE_TYPE(37, tAVX512_IFMA);
|
||||
XBYAK_DEFINE_TYPE(37, tAVX512IFMA);// = tAVX512_IFMA;
|
||||
XBYAK_DEFINE_TYPE(38, tAVX512PF);
|
||||
XBYAK_DEFINE_TYPE(39, tAVX512ER);
|
||||
XBYAK_DEFINE_TYPE(40, tAVX512CD);
|
||||
XBYAK_DEFINE_TYPE(41, tAVX512BW);
|
||||
XBYAK_DEFINE_TYPE(42, tAVX512VL);
|
||||
XBYAK_DEFINE_TYPE(43, tAVX512_VBMI);
|
||||
XBYAK_DEFINE_TYPE(43, tAVX512VBMI); // = tAVX512_VBMI; // changed by Intel's manual
|
||||
XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW);
|
||||
XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS);
|
||||
XBYAK_DEFINE_TYPE(46, tPREFETCHWT1);
|
||||
XBYAK_DEFINE_TYPE(47, tPREFETCHW);
|
||||
XBYAK_DEFINE_TYPE(48, tSHA);
|
||||
XBYAK_DEFINE_TYPE(49, tMPX);
|
||||
XBYAK_DEFINE_TYPE(50, tAVX512_VBMI2);
|
||||
XBYAK_DEFINE_TYPE(51, tGFNI);
|
||||
XBYAK_DEFINE_TYPE(52, tVAES);
|
||||
XBYAK_DEFINE_TYPE(53, tVPCLMULQDQ);
|
||||
XBYAK_DEFINE_TYPE(54, tAVX512_VNNI);
|
||||
XBYAK_DEFINE_TYPE(55, tAVX512_BITALG);
|
||||
XBYAK_DEFINE_TYPE(56, tAVX512_VPOPCNTDQ);
|
||||
XBYAK_DEFINE_TYPE(57, tAVX512_BF16);
|
||||
XBYAK_DEFINE_TYPE(58, tAVX512_VP2INTERSECT);
|
||||
XBYAK_DEFINE_TYPE(59, tAMX_TILE);
|
||||
XBYAK_DEFINE_TYPE(60, tAMX_INT8);
|
||||
XBYAK_DEFINE_TYPE(61, tAMX_BF16);
|
||||
XBYAK_DEFINE_TYPE(62, tAVX_VNNI);
|
||||
XBYAK_DEFINE_TYPE(63, tCLFLUSHOPT);
|
||||
XBYAK_DEFINE_TYPE(64, tCLDEMOTE);
|
||||
XBYAK_DEFINE_TYPE(65, tMOVDIRI);
|
||||
XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
|
||||
XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen
|
||||
XBYAK_DEFINE_TYPE(68, tAMX_FP16);
|
||||
XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
|
||||
XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
|
||||
XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
|
||||
XBYAK_DEFINE_TYPE(72, tRAO_INT);
|
||||
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
|
||||
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
|
||||
XBYAK_DEFINE_TYPE(75, tSERIALIZE);
|
||||
XBYAK_DEFINE_TYPE(76, tUINTR);
|
||||
XBYAK_DEFINE_TYPE(77, tXSAVE);
|
||||
XBYAK_DEFINE_TYPE(78, tSHA512);
|
||||
XBYAK_DEFINE_TYPE(79, tSM3);
|
||||
XBYAK_DEFINE_TYPE(80, tSM4);
|
||||
XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
|
||||
|
||||
static const Type t3DN = 1 << 16;
|
||||
static const Type tE3DN = 1 << 17;
|
||||
static const Type tRDTSCP = 1 << 19;
|
||||
static const Type tAVX2 = 1 << 20;
|
||||
static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
|
||||
static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
|
||||
static const Type tLZCNT = 1 << 23;
|
||||
|
||||
static const Type tINTEL = 1 << 24;
|
||||
static const Type tAMD = 1 << 25;
|
||||
|
||||
static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
|
||||
static const Type tRDRAND = 1 << 27;
|
||||
static const Type tADX = 1 << 28; // adcx, adox
|
||||
static const Type tRDSEED = 1 << 29; // rdseed
|
||||
static const Type tSMAP = 1 << 30; // stac
|
||||
static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
|
||||
static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
|
||||
static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
|
||||
static const Type tMOVBE = uint64_t(1) << 34; // mobve
|
||||
static const Type tAVX512F = uint64_t(1) << 35;
|
||||
static const Type tAVX512DQ = uint64_t(1) << 36;
|
||||
static const Type tAVX512_IFMA = uint64_t(1) << 37;
|
||||
static const Type tAVX512IFMA = tAVX512_IFMA;
|
||||
static const Type tAVX512PF = uint64_t(1) << 38;
|
||||
static const Type tAVX512ER = uint64_t(1) << 39;
|
||||
static const Type tAVX512CD = uint64_t(1) << 40;
|
||||
static const Type tAVX512BW = uint64_t(1) << 41;
|
||||
static const Type tAVX512VL = uint64_t(1) << 42;
|
||||
static const Type tAVX512_VBMI = uint64_t(1) << 43;
|
||||
static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
|
||||
static const Type tAVX512_4VNNIW = uint64_t(1) << 44;
|
||||
static const Type tAVX512_4FMAPS = uint64_t(1) << 45;
|
||||
static const Type tPREFETCHWT1 = uint64_t(1) << 46;
|
||||
static const Type tPREFETCHW = uint64_t(1) << 47;
|
||||
static const Type tSHA = uint64_t(1) << 48;
|
||||
static const Type tMPX = uint64_t(1) << 49;
|
||||
static const Type tAVX512_VBMI2 = uint64_t(1) << 50;
|
||||
static const Type tGFNI = uint64_t(1) << 51;
|
||||
static const Type tVAES = uint64_t(1) << 52;
|
||||
static const Type tVPCLMULQDQ = uint64_t(1) << 53;
|
||||
static const Type tAVX512_VNNI = uint64_t(1) << 54;
|
||||
static const Type tAVX512_BITALG = uint64_t(1) << 55;
|
||||
static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
|
||||
static const Type tAVX512_BF16 = uint64_t(1) << 57;
|
||||
static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58;
|
||||
static const Type tAMX_TILE = uint64_t(1) << 59;
|
||||
static const Type tAMX_INT8 = uint64_t(1) << 60;
|
||||
static const Type tAMX_BF16 = uint64_t(1) << 61;
|
||||
static const Type tAVX_VNNI = uint64_t(1) << 62;
|
||||
static const Type tAVX512_FP16 = uint64_t(1) << 11;
|
||||
// 18, 63
|
||||
#undef XBYAK_SPLIT_ID
|
||||
#undef XBYAK_DEFINE_TYPE
|
||||
|
||||
Cpu()
|
||||
: type_(NONE)
|
||||
: type_()
|
||||
, x2APIC_supported_(false)
|
||||
, numCores_()
|
||||
, dataCacheSize_()
|
||||
, coresSharignDataCache_()
|
||||
, dataCacheLevels_(0)
|
||||
{
|
||||
unsigned int data[4] = {};
|
||||
const unsigned int& EAX = data[0];
|
||||
const unsigned int& EBX = data[1];
|
||||
const unsigned int& ECX = data[2];
|
||||
const unsigned int& EDX = data[3];
|
||||
uint32_t data[4] = {};
|
||||
const uint32_t& EAX = data[0];
|
||||
const uint32_t& EBX = data[1];
|
||||
const uint32_t& ECX = data[2];
|
||||
const uint32_t& EDX = data[3];
|
||||
getCpuid(0, data);
|
||||
const unsigned int maxNum = EAX;
|
||||
const uint32_t maxNum = EAX;
|
||||
static const char intel[] = "ntel";
|
||||
static const char amd[] = "cAMD";
|
||||
if (ECX == get32bitAsBE(amd)) {
|
||||
|
@ -407,7 +511,8 @@ public:
|
|||
|
||||
// Extended flags information
|
||||
getCpuid(0x80000000, data);
|
||||
if (EAX >= 0x80000001) {
|
||||
const uint32_t maxExtendedNum = EAX;
|
||||
if (maxExtendedNum >= 0x80000001) {
|
||||
getCpuid(0x80000001, data);
|
||||
|
||||
if (EDX & (1U << 31)) type_ |= t3DN;
|
||||
|
@ -419,15 +524,21 @@ public:
|
|||
if (ECX & (1U << 8)) type_ |= tPREFETCHW;
|
||||
}
|
||||
|
||||
if (maxExtendedNum >= 0x80000008) {
|
||||
getCpuid(0x80000008, data);
|
||||
if (EBX & (1U << 0)) type_ |= tCLZERO;
|
||||
}
|
||||
|
||||
getCpuid(1, data);
|
||||
if (ECX & (1U << 0)) type_ |= tSSE3;
|
||||
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
|
||||
if (ECX & (1U << 9)) type_ |= tSSSE3;
|
||||
if (ECX & (1U << 19)) type_ |= tSSE41;
|
||||
if (ECX & (1U << 20)) type_ |= tSSE42;
|
||||
if (ECX & (1U << 22)) type_ |= tMOVBE;
|
||||
if (ECX & (1U << 23)) type_ |= tPOPCNT;
|
||||
if (ECX & (1U << 25)) type_ |= tAESNI;
|
||||
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
|
||||
if (ECX & (1U << 26)) type_ |= tXSAVE;
|
||||
if (ECX & (1U << 27)) type_ |= tOSXSAVE;
|
||||
if (ECX & (1U << 30)) type_ |= tRDRAND;
|
||||
if (ECX & (1U << 29)) type_ |= tF16C;
|
||||
|
@ -460,9 +571,6 @@ public:
|
|||
if (EBX & (1U << 31)) type_ |= tAVX512VL;
|
||||
if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
|
||||
if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
|
||||
if (ECX & (1U << 8)) type_ |= tGFNI;
|
||||
if (ECX & (1U << 9)) type_ |= tVAES;
|
||||
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
|
||||
if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
|
||||
if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
|
||||
if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
|
||||
|
@ -484,20 +592,41 @@ public:
|
|||
if (EBX & (1U << 18)) type_ |= tRDSEED;
|
||||
if (EBX & (1U << 19)) type_ |= tADX;
|
||||
if (EBX & (1U << 20)) type_ |= tSMAP;
|
||||
if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
|
||||
if (EBX & (1U << 4)) type_ |= tHLE;
|
||||
if (EBX & (1U << 11)) type_ |= tRTM;
|
||||
if (EBX & (1U << 14)) type_ |= tMPX;
|
||||
if (EBX & (1U << 29)) type_ |= tSHA;
|
||||
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
|
||||
if (ECX & (1U << 5)) type_ |= tWAITPKG;
|
||||
if (ECX & (1U << 8)) type_ |= tGFNI;
|
||||
if (ECX & (1U << 9)) type_ |= tVAES;
|
||||
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
|
||||
if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
|
||||
if (ECX & (1U << 27)) type_ |= tMOVDIRI;
|
||||
if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
|
||||
if (EDX & (1U << 5)) type_ |= tUINTR;
|
||||
if (EDX & (1U << 14)) type_ |= tSERIALIZE;
|
||||
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
|
||||
if (EDX & (1U << 24)) type_ |= tAMX_TILE;
|
||||
if (EDX & (1U << 25)) type_ |= tAMX_INT8;
|
||||
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
|
||||
if (maxNumSubLeaves >= 1) {
|
||||
getCpuidEx(7, 1, data);
|
||||
if (EAX & (1U << 0)) type_ |= tSHA512;
|
||||
if (EAX & (1U << 1)) type_ |= tSM3;
|
||||
if (EAX & (1U << 2)) type_ |= tSM4;
|
||||
if (EAX & (1U << 3)) type_ |= tRAO_INT;
|
||||
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
|
||||
if (type_ & tAVX512F) {
|
||||
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
|
||||
}
|
||||
if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
|
||||
if (EAX & (1U << 21)) type_ |= tAMX_FP16;
|
||||
if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
|
||||
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
|
||||
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
|
||||
if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
|
||||
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
|
||||
}
|
||||
}
|
||||
setFamily();
|
||||
|
@ -512,9 +641,9 @@ public:
|
|||
printf("display:family=%X, model=%X\n", displayFamily, displayModel);
|
||||
#endif
|
||||
}
|
||||
bool has(Type type) const
|
||||
bool has(const Type& type) const
|
||||
{
|
||||
return (type & type_) != 0;
|
||||
return (type & type_) == type;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -527,7 +656,7 @@ public:
|
|||
#ifdef _MSC_VER
|
||||
return __rdtsc();
|
||||
#else
|
||||
unsigned int eax, edx;
|
||||
uint32_t eax, edx;
|
||||
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
|
||||
return ((uint64_t)edx << 32) | eax;
|
||||
#endif
|
||||
|
@ -564,7 +693,7 @@ const int UseRDX = 1 << 7;
|
|||
|
||||
class Pack {
|
||||
static const size_t maxTblNum = 15;
|
||||
const Xbyak::Reg64 *tbl_[maxTblNum];
|
||||
Xbyak::Reg64 tbl_[maxTblNum];
|
||||
size_t n_;
|
||||
public:
|
||||
Pack() : tbl_(), n_(0) {}
|
||||
|
@ -581,32 +710,36 @@ public:
|
|||
return *this;
|
||||
}
|
||||
Pack(const Xbyak::Reg64& t0)
|
||||
{ n_ = 1; tbl_[0] = &t0; }
|
||||
{ n_ = 1; tbl_[0] = t0; }
|
||||
Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
|
||||
{ n_ = 2; tbl_[0] = t0; tbl_[1] = t1; }
|
||||
Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
|
||||
{ n_ = 3; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; }
|
||||
Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
|
||||
{ n_ = 4; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; }
|
||||
Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
|
||||
{ n_ = 5; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; }
|
||||
Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
|
||||
{ n_ = 6; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; }
|
||||
Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
|
||||
{ n_ = 7; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; }
|
||||
Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
|
||||
{ n_ = 8; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; }
|
||||
Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
|
||||
{ n_ = 9; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; }
|
||||
Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
|
||||
{ n_ = 10; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; }
|
||||
Pack(const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 11; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; }
|
||||
Pack(const Xbyak::Reg64& tb, const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||
{ n_ = 12; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; tbl_[11] = tb; }
|
||||
Pack& append(const Xbyak::Reg64& t)
|
||||
{
|
||||
if (n_ == maxTblNum) {
|
||||
fprintf(stderr, "ERR Pack::can't append\n");
|
||||
XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
|
||||
}
|
||||
tbl_[n_++] = &t;
|
||||
tbl_[n_++] = t;
|
||||
return *this;
|
||||
}
|
||||
void init(const Xbyak::Reg64 *tbl, size_t n)
|
||||
|
@ -617,7 +750,7 @@ public:
|
|||
}
|
||||
n_ = n;
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
tbl_[i] = &tbl[i];
|
||||
tbl_[i] = tbl[i];
|
||||
}
|
||||
}
|
||||
const Xbyak::Reg64& operator[](size_t n) const
|
||||
|
@ -626,7 +759,7 @@ public:
|
|||
fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
|
||||
XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
|
||||
}
|
||||
return *tbl_[n];
|
||||
return tbl_[n];
|
||||
}
|
||||
size_t size() const { return n_; }
|
||||
/*
|
||||
|
@ -649,7 +782,7 @@ public:
|
|||
void put() const
|
||||
{
|
||||
for (size_t i = 0; i < n_; i++) {
|
||||
printf("%s ", tbl_[i]->toString());
|
||||
printf("%s ", tbl_[i].toString());
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
@ -716,7 +849,7 @@ public:
|
|||
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
|
||||
if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
|
||||
const Reg64& _rsp = code->rsp;
|
||||
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
|
||||
saveNum_ = local::max_(0, allRegNum - noSaveNum);
|
||||
const int *tbl = getOrderTbl() + noSaveNum;
|
||||
for (int i = 0; i < saveNum_; i++) {
|
||||
code->push(Reg64(tbl[i]));
|
||||
|
|
Loading…
Reference in New Issue