3rdparty: Update xbyak to 6.73

Updates xbyak from 6.00 to 6.73.
This commit is contained in:
JordanTheToaster 2023-11-23 14:23:23 +00:00 committed by refractionpcsx2
parent b644957cee
commit bca4f15f9d
4 changed files with 767 additions and 211 deletions

View File

@ -77,7 +77,11 @@
#endif
#include <windows.h>
#include <malloc.h>
#define XBYAK_TLS __declspec(thread)
#ifdef _MSC_VER
#define XBYAK_TLS __declspec(thread)
#else
#define XBYAK_TLS __thread
#endif
#elif defined(__GNUC__)
#include <unistd.h>
#include <sys/mman.h>
@ -95,7 +99,9 @@
#include <stdint.h>
#endif
#if !defined(MFD_CLOEXEC) // defined only linux 3.17 or later
// MFD_CLOEXEC defined only linux 3.17 or later.
// Android wraps the memfd_create syscall from API version 30.
#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30)
#undef XBYAK_USE_MEMFD
#endif
@ -112,7 +118,7 @@
#endif
#endif
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800)
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900)
#undef XBYAK_TLS
#define XBYAK_TLS thread_local
#define XBYAK_VARIADIC_TEMPLATE
@ -138,11 +144,18 @@
#pragma warning(disable : 4127) /* constant expresison */
#endif
// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603
#if defined(__GNUC__) && !defined(__clang__)
#define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x6000 /* 0xABCD = A.BC(D) */
VERSION = 0x6730 /* 0xABCD = A.BC(.D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@ -291,10 +304,10 @@ inline void SetError(int err) {
inline void ClearError() {
local::GetErrorRef() = 0;
}
inline int GetError() { return local::GetErrorRef(); }
inline int GetError() { return Xbyak::local::GetErrorRef(); }
#define XBYAK_THROW(err) { local::SetError(err); return; }
#define XBYAK_THROW_RET(err, r) { local::SetError(err); return r; }
#define XBYAK_THROW(err) { Xbyak::local::SetError(err); return; }
#define XBYAK_THROW_RET(err, r) { Xbyak::local::SetError(err); return r; }
#else
class Error : public std::exception {
@ -358,14 +371,36 @@ inline const To CastTo(From p) XBYAK_NOEXCEPT
}
namespace inner {
static const size_t ALIGN_PAGE_SIZE = 4096;
#ifdef _WIN32
struct SystemInfo {
SYSTEM_INFO info;
SystemInfo()
{
GetSystemInfo(&info);
}
};
#endif
//static const size_t ALIGN_PAGE_SIZE = 4096;
inline size_t getPageSize()
{
#ifdef _WIN32
static const SystemInfo si;
return si.info.dwPageSize;
#elif defined(__GNUC__)
static const long pageSize = sysconf(_SC_PAGESIZE);
if (pageSize > 0) {
return (size_t)pageSize;
}
#endif
return 4096;
}
inline bool IsInDisp8(uint32_t x) { return 0xFFFFFF80 <= x || x <= 0x7F; }
inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0x7FFFFFFFU; }
inline uint32_t VerifyInInt32(uint64_t x)
{
#ifdef XBYAK64
#if defined(XBYAK64) && !defined(__ILP32__)
if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0)
#endif
return static_cast<uint32_t>(x);
@ -383,7 +418,8 @@ enum LabelMode {
custom allocator
*/
struct Allocator {
virtual uint8_t *alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, inner::ALIGN_PAGE_SIZE)); }
explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator
virtual uint8_t *alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, inner::getPageSize())); }
virtual void free(uint8_t *p) { AlignedFree(p); }
virtual ~Allocator() {}
/* override to return false if you call protect() manually */
@ -414,13 +450,24 @@ inline int getMacOsVersion()
} // util
#endif
class MmapAllocator : Allocator {
typedef XBYAK_STD_UNORDERED_MAP<uintptr_t, size_t> SizeList;
SizeList sizeList_;
class MmapAllocator : public Allocator {
struct Allocation {
size_t size;
#if defined(XBYAK_USE_MEMFD)
// fd_ is only used with XBYAK_USE_MEMFD. We keep the file open
// during the lifetime of each allocation in order to support
// checkpoint/restore by unprivileged users.
int fd;
#endif
};
const std::string name_; // only used with XBYAK_USE_MEMFD
typedef XBYAK_STD_UNORDERED_MAP<uintptr_t, Allocation> AllocationList;
AllocationList allocList_;
public:
explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {}
uint8_t *alloc(size_t size)
{
const size_t alignedSizeM1 = inner::ALIGN_PAGE_SIZE - 1;
const size_t alignedSizeM1 = inner::getPageSize() - 1;
size = (size + alignedSizeM1) & ~alignedSizeM1;
#if defined(MAP_ANONYMOUS)
int mode = MAP_PRIVATE | MAP_ANONYMOUS;
@ -435,30 +482,42 @@ public:
#endif
int fd = -1;
#if defined(XBYAK_USE_MEMFD)
fd = memfd_create("xbyak", MFD_CLOEXEC);
fd = memfd_create(name_.c_str(), MFD_CLOEXEC);
if (fd != -1) {
mode = MAP_SHARED;
if (ftruncate(fd, size) != 0) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
if (ftruncate(fd, size) != 0) {
close(fd);
XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
}
}
#endif
void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0);
#if defined(XBYAK_USE_MEMFD)
if (fd != -1) close(fd);
#endif
if (p == MAP_FAILED) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
if (p == MAP_FAILED) {
if (fd != -1) close(fd);
XBYAK_THROW_RET(ERR_CANT_ALLOC, 0)
}
assert(p);
sizeList_[(uintptr_t)p] = size;
Allocation &alloc = allocList_[(uintptr_t)p];
alloc.size = size;
#if defined(XBYAK_USE_MEMFD)
alloc.fd = fd;
#endif
return (uint8_t*)p;
}
void free(uint8_t *p)
{
if (p == 0) return;
SizeList::iterator i = sizeList_.find((uintptr_t)p);
if (i == sizeList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER)
if (munmap((void*)i->first, i->second) < 0) XBYAK_THROW(ERR_MUNMAP)
sizeList_.erase(i);
AllocationList::iterator i = allocList_.find((uintptr_t)p);
if (i == allocList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER)
if (munmap((void*)i->first, i->second.size) < 0) XBYAK_THROW(ERR_MUNMAP)
#if defined(XBYAK_USE_MEMFD)
if (i->second.fd != -1) close(i->second.fd);
#endif
allocList_.erase(i);
}
};
#else
typedef Allocator MmapAllocator;
#endif
class Address;
@ -1176,9 +1235,6 @@ public:
size_t pageSize = sysconf(_SC_PAGESIZE);
size_t iaddr = reinterpret_cast<size_t>(addr);
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
#ifndef NDEBUG
if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. not tested enough.\n", pageSize);
#endif
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
#else
return true;
@ -1448,7 +1504,6 @@ public:
clabelDefList_.clear();
clabelUndefList_.clear();
resetLabelPtrList();
ClearError();
}
void enterLocal()
{
@ -1574,6 +1629,7 @@ public:
enum LabelType {
T_SHORT,
T_NEAR,
T_FAR, // far jump
T_AUTO // T_SHORT if possible
};
private:
@ -1622,6 +1678,11 @@ private:
{
return op1.isREG(i32e) && ((op2.isREG(i32e) && op1.getBit() == op2.getBit()) || op2.isMEM());
}
static inline bool isValidSSE(const Operand& op1)
{
// SSE instructions do not support XMM16 - XMM31
return !(op1.isXMM() && op1.getIdx() >= 16);
}
void rex(const Operand& op1, const Operand& op2 = Operand())
{
uint8_t rex = 0;
@ -1784,9 +1845,16 @@ private:
void setSIB(const RegExp& e, int reg, int disp8N = 0)
{
uint64_t disp64 = e.getDisp();
#ifdef XBYAK64
#if defined(XBYAK64) && !defined(__ILP32__)
#ifdef XBYAK_OLD_DISP_CHECK
// treat 0xffffffff as 0xffffffffffffffff
uint64_t high = disp64 >> 32;
if (high != 0 && high != 0xFFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
#else
// displacement should be a signed 32-bit value, so also check sign bit
uint64_t high = disp64 >> 31;
if (high != 0 && high != 0x1FFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG)
#endif
#endif
uint32_t disp = static_cast<uint32_t>(disp64);
const Reg& base = e.getBase();
@ -1887,6 +1955,7 @@ private:
template<class T>
void opJmp(T& label, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref)
{
if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED)
if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory(); /* avoid splitting code of jmp */
size_t offset = 0;
if (labelMgr_.getOffset(&offset, label)) { /* label exists */
@ -1907,6 +1976,7 @@ private:
}
void opJmpAbs(const void *addr, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref = 0)
{
if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED)
if (isAutoGrow()) {
if (!isNEAR(type)) XBYAK_THROW(ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW)
if (size_ + 16 >= maxSize_) growMemory();
@ -1919,6 +1989,16 @@ private:
}
}
void opJmpOp(const Operand& op, LabelType type, int ext)
{
const int bit = 16|i32e;
if (type == T_FAR) {
if (!op.isMEM(bit)) XBYAK_THROW(ERR_NOT_SUPPORTED)
opR_ModM(op, bit, ext + 1, 0xFF, NONE, NONE, false);
} else {
opR_ModM(op, bit, ext, 0xFF, NONE, NONE, true);
}
}
// reg is reg field of ModRM
// immSize is the size for immediate value
// disp8N = 0(normal), disp8N = 1(force disp32), disp8N = {2, 4, 8} ; compressed displacement
@ -1945,6 +2025,7 @@ private:
void opGen(const Operand& reg, const Operand& op, int code, int pref, bool isValid(const Operand&, const Operand&), int imm8 = NONE, int preCode = NONE)
{
if (isValid && !isValid(reg, op)) XBYAK_THROW(ERR_BAD_COMBINATION)
if (!isValidSSE(reg) || !isValidSSE(op)) XBYAK_THROW(ERR_NOT_SUPPORTED)
if (pref != NONE) db(pref);
if (op.isMEM()) {
opModM(op.getAddress(), reg.getReg(), 0x0F, preCode, code, (imm8 != NONE) ? 1 : 0);
@ -1955,6 +2036,7 @@ private:
}
void opMMX_IMM(const Mmx& mmx, int imm8, int code, int ext)
{
if (!isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED)
if (mmx.isXMM()) db(0x66);
opModR(Reg32(ext), mmx, 0x0F, code);
db(imm8);
@ -1965,6 +2047,7 @@ private:
}
void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref)
{
if (!isValidSSE(op1) || !isValidSSE(op2)) XBYAK_THROW(ERR_NOT_SUPPORTED)
if (pref != NONE) db(pref);
if (op1.isXMM() && op2.isMEM()) {
opModM(op2.getAddress(), op1.getReg(), 0x0F, code);
@ -1976,6 +2059,7 @@ private:
}
void opExt(const Operand& op, const Mmx& mmx, int code, int imm, bool hasMMX2 = false)
{
if (!isValidSSE(op) || !isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED)
if (hasMMX2 && op.isREG(i32e)) { /* pextrw is special */
if (mmx.isXMM()) db(0x66);
opModR(op.getReg(), mmx, 0x0F, 0xC5); db(imm);
@ -2132,9 +2216,6 @@ private:
{
if (op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION)
int w = op.isBit(16);
#ifdef XBYAK64
if (op.isHigh8bit()) XBYAK_THROW(ERR_BAD_COMBINATION)
#endif
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
}
@ -2356,18 +2437,21 @@ private:
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
opVex(x, 0, addr, type, code);
}
void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding)
{
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0);
}
int orEvexIf(PreferredEncoding encoding) {
if (encoding == DefaultEncoding) {
encoding = EvexEncoding;
encoding = defaultEncoding_;
}
if (encoding == EvexEncoding) {
#ifdef XBYAK_DISABLE_AVX512
XBYAK_THROW(ERR_EVEX_IS_INVALID)
#endif
type |= T_MUST_EVEX;
return T_MUST_EVEX;
}
opAVX_X_X_XM(x1, x2, op, type, code0);
return 0;
}
void opInOut(const Reg& a, const Reg& d, uint8_t code)
{
@ -2452,6 +2536,7 @@ public:
#endif
private:
bool isDefaultJmpNEAR_;
PreferredEncoding defaultEncoding_;
public:
void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(Label& label) { labelMgr_.defineClabel(label); }
@ -2474,13 +2559,13 @@ public:
// set default type of `jmp` of undefined label to T_NEAR
void setDefaultJmpNEAR(bool isNear) { isDefaultJmpNEAR_ = isNear; }
void jmp(const Operand& op) { opR_ModM(op, BIT, 4, 0xFF, NONE, NONE, true); }
void jmp(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 4); }
void jmp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); }
void jmp(const char *label, LabelType type = T_AUTO) { jmp(std::string(label), type); }
void jmp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); }
void jmp(const void *addr, LabelType type = T_AUTO) { opJmpAbs(addr, type, 0xEB, 0xE9); }
void call(const Operand& op) { opR_ModM(op, 16 | i32e, 2, 0xFF, NONE, NONE, true); }
void call(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 2); }
// call(string label), not const std::string&
void call(std::string label) { opJmp(label, T_NEAR, 0, 0xE8, 0); }
void call(const char *label) { call(std::string(label)); }
@ -2731,11 +2816,13 @@ public:
, es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs)
#endif
, isDefaultJmpNEAR_(false)
, defaultEncoding_(EvexEncoding)
{
labelMgr_.set(this);
}
void reset()
{
ClearError();
resetSize();
labelMgr_.reset();
labelMgr_.set(this);
@ -2767,6 +2854,9 @@ public:
#undef jnl
#endif
// set default encoding to select Vex or Evex
void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
/*
use single byte nop if useMultiByteNop = false
*/
@ -2813,7 +2903,7 @@ public:
{
if (x == 1) return;
if (x < 1 || (x & (x - 1))) XBYAK_THROW(ERR_BAD_ALIGN)
if (isAutoGrow() && x > inner::ALIGN_PAGE_SIZE) fprintf(stderr, "warning:autoGrow mode does not support %d align\n", (int)x);
if (isAutoGrow()) XBYAK_THROW(ERR_BAD_ALIGN)
size_t remain = size_t(getCurr()) % x;
if (remain) {
nop(x - remain, useMultiByteNop);
@ -2871,6 +2961,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen
#pragma warning(pop)
#endif
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
} // end of namespace
#endif // XBYAK_XBYAK_H_

258
3rdparty/xbyak/xbyak/xbyak_bin2hex.h vendored Normal file
View File

@ -0,0 +1,258 @@
enum {
B00000000= 0,
B00000001= 1,
B00000010= 2,
B00000011= 3,
B00000100= 4,
B00000101= 5,
B00000110= 6,
B00000111= 7,
B00001000= 8,
B00001001= 9,
B00001010= 10,
B00001011= 11,
B00001100= 12,
B00001101= 13,
B00001110= 14,
B00001111= 15,
B00010000= 16,
B00010001= 17,
B00010010= 18,
B00010011= 19,
B00010100= 20,
B00010101= 21,
B00010110= 22,
B00010111= 23,
B00011000= 24,
B00011001= 25,
B00011010= 26,
B00011011= 27,
B00011100= 28,
B00011101= 29,
B00011110= 30,
B00011111= 31,
B00100000= 32,
B00100001= 33,
B00100010= 34,
B00100011= 35,
B00100100= 36,
B00100101= 37,
B00100110= 38,
B00100111= 39,
B00101000= 40,
B00101001= 41,
B00101010= 42,
B00101011= 43,
B00101100= 44,
B00101101= 45,
B00101110= 46,
B00101111= 47,
B00110000= 48,
B00110001= 49,
B00110010= 50,
B00110011= 51,
B00110100= 52,
B00110101= 53,
B00110110= 54,
B00110111= 55,
B00111000= 56,
B00111001= 57,
B00111010= 58,
B00111011= 59,
B00111100= 60,
B00111101= 61,
B00111110= 62,
B00111111= 63,
B01000000= 64,
B01000001= 65,
B01000010= 66,
B01000011= 67,
B01000100= 68,
B01000101= 69,
B01000110= 70,
B01000111= 71,
B01001000= 72,
B01001001= 73,
B01001010= 74,
B01001011= 75,
B01001100= 76,
B01001101= 77,
B01001110= 78,
B01001111= 79,
B01010000= 80,
B01010001= 81,
B01010010= 82,
B01010011= 83,
B01010100= 84,
B01010101= 85,
B01010110= 86,
B01010111= 87,
B01011000= 88,
B01011001= 89,
B01011010= 90,
B01011011= 91,
B01011100= 92,
B01011101= 93,
B01011110= 94,
B01011111= 95,
B01100000= 96,
B01100001= 97,
B01100010= 98,
B01100011= 99,
B01100100= 100,
B01100101= 101,
B01100110= 102,
B01100111= 103,
B01101000= 104,
B01101001= 105,
B01101010= 106,
B01101011= 107,
B01101100= 108,
B01101101= 109,
B01101110= 110,
B01101111= 111,
B01110000= 112,
B01110001= 113,
B01110010= 114,
B01110011= 115,
B01110100= 116,
B01110101= 117,
B01110110= 118,
B01110111= 119,
B01111000= 120,
B01111001= 121,
B01111010= 122,
B01111011= 123,
B01111100= 124,
B01111101= 125,
B01111110= 126,
B01111111= 127,
B10000000= 128,
B10000001= 129,
B10000010= 130,
B10000011= 131,
B10000100= 132,
B10000101= 133,
B10000110= 134,
B10000111= 135,
B10001000= 136,
B10001001= 137,
B10001010= 138,
B10001011= 139,
B10001100= 140,
B10001101= 141,
B10001110= 142,
B10001111= 143,
B10010000= 144,
B10010001= 145,
B10010010= 146,
B10010011= 147,
B10010100= 148,
B10010101= 149,
B10010110= 150,
B10010111= 151,
B10011000= 152,
B10011001= 153,
B10011010= 154,
B10011011= 155,
B10011100= 156,
B10011101= 157,
B10011110= 158,
B10011111= 159,
B10100000= 160,
B10100001= 161,
B10100010= 162,
B10100011= 163,
B10100100= 164,
B10100101= 165,
B10100110= 166,
B10100111= 167,
B10101000= 168,
B10101001= 169,
B10101010= 170,
B10101011= 171,
B10101100= 172,
B10101101= 173,
B10101110= 174,
B10101111= 175,
B10110000= 176,
B10110001= 177,
B10110010= 178,
B10110011= 179,
B10110100= 180,
B10110101= 181,
B10110110= 182,
B10110111= 183,
B10111000= 184,
B10111001= 185,
B10111010= 186,
B10111011= 187,
B10111100= 188,
B10111101= 189,
B10111110= 190,
B10111111= 191,
B11000000= 192,
B11000001= 193,
B11000010= 194,
B11000011= 195,
B11000100= 196,
B11000101= 197,
B11000110= 198,
B11000111= 199,
B11001000= 200,
B11001001= 201,
B11001010= 202,
B11001011= 203,
B11001100= 204,
B11001101= 205,
B11001110= 206,
B11001111= 207,
B11010000= 208,
B11010001= 209,
B11010010= 210,
B11010011= 211,
B11010100= 212,
B11010101= 213,
B11010110= 214,
B11010111= 215,
B11011000= 216,
B11011001= 217,
B11011010= 218,
B11011011= 219,
B11011100= 220,
B11011101= 221,
B11011110= 222,
B11011111= 223,
B11100000= 224,
B11100001= 225,
B11100010= 226,
B11100011= 227,
B11100100= 228,
B11100101= 229,
B11100110= 230,
B11100111= 231,
B11101000= 232,
B11101001= 233,
B11101010= 234,
B11101011= 235,
B11101100= 236,
B11101101= 237,
B11101110= 238,
B11101111= 239,
B11110000= 240,
B11110001= 241,
B11110010= 242,
B11110011= 243,
B11110100= 244,
B11110101= 245,
B11110110= 246,
B11110111= 247,
B11111000= 248,
B11111001= 249,
B11111010= 250,
B11111011= 251,
B11111100= 252,
B11111101= 253,
B11111110= 254,
B11111111= 255
};

View File

@ -1,4 +1,6 @@
const char *getVersionString() const { return "6.00"; }
const char *getVersionString() const { return "6.73"; }
void aadd(const Address& addr, const Reg32e &reg) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void aand(const Address& addr, const Reg32e &reg) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM
void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); }
void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
void aor(const Address& addr, const Reg32e &reg) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void axor(const Address& addr, const Reg32e &reg) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); }
void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); }
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
@ -57,9 +61,11 @@ void cbw() { db(0x66); db(0x98); }
void cdq() { db(0x99); }
void clc() { db(0xF8); }
void cld() { db(0xFC); }
void cldemote(const Address& addr) { opMIB(addr, eax, 0x0F, 0x1C); }
void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
void clflushopt(const Address& addr) { db(0x66); opModM(addr, Reg32(7), 0x0F, 0xAE); }
void cli() { db(0xFA); }
void clwb(const Address& addr) { db(0x66); opMIB(addr, esi, 0x0F, 0xAE); }
void clzero() { db(0x0F); db(0x01); db(0xFC); }
void cmc() { db(0xF5); }
void cmova(const Reg& reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 7); }//-V524
@ -323,6 +329,7 @@ void gf2p8affineqb(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op,
void gf2p8mulb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void haddpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0x66, isXMM_XMMorMEM); }
void haddps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0xF2, isXMM_XMMorMEM); }
void hlt() { db(0xF4); }
void hsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0x66, isXMM_XMMorMEM); }
void hsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0xF2, isXMM_XMMorMEM); }
void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
@ -500,6 +507,8 @@ void movd(const Mmx& mmx, const Address& addr) { if (mmx.isXMM()) db(0x66); opMo
void movd(const Mmx& mmx, const Reg32& reg) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x6E); }
void movd(const Reg32& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
void movddup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF2, isXMM_XMMorMEM, NONE, NONE); }
void movdir64b(const Reg& reg, const Address& addr) { db(0x66); opModM(addr, reg.cvt32(), 0x0F, 0x38, 0xF8); }
void movdiri(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF9); }
void movdq2q(const Mmx& mmx, const Xmm& xmm) { db(0xF2); opModR(mmx, xmm, 0x0F, 0xD6); }
void movdqa(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x7F); }
void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0x66); }
@ -580,9 +589,9 @@ void pavgb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE0); }
void pavgw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE3); }
void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pblendw(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0E, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
void pclmulhqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); }
void pclmulhqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); }
void pclmulhqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x01); }
void pclmullqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); }
void pclmullqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); }
void pclmullqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x00); }
void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8_t>(imm), 0x3A); }
void pcmpeqb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x74); }
@ -649,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
void popcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); }
void popf() { db(0x9D); }
void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); }
void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); }
void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); }
void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); }
void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); }
@ -719,6 +730,7 @@ void repne() { db(0xF2); }
void repnz() { db(0xF2); }
void repz() { db(0xF3); }
void ret(int imm = 0) { if (imm) { db(0xC2); dw(imm); } else { db(0xC3); } }
void retf(int imm = 0) { if (imm) { db(0xCA); dw(imm); } else { db(0xCB); } }
void rol(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 0); }
void rol(const Operand& op, int imm) { opShift(op, imm, 0); }
void ror(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 1); }
@ -741,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
void scasb() { db(0xAE); }
void scasd() { db(0xAF); }
void scasw() { db(0x66); db(0xAF); }
void serialize() { db(0x0F); db(0x01); db(0xE8); }
void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524
void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524
void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524
@ -811,10 +824,13 @@ void subsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF2, isXMM
void subss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF3, isXMM_XMMorMEM); }
void sysenter() { db(0x0F); db(0x34); }
void sysexit() { db(0x0F); db(0x35); }
void tpause(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
void tzcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBC); }
void ucomisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x66, isXMM_XMMorMEM); }
void ucomiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x100, isXMM_XMMorMEM); }
void ud2() { db(0x0F); db(0x0B); }
void umonitor(const Reg& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit(); if (BIT != bit) { if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) { db(0x67); } else { XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) } } db(0xF3); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
void umwait(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2); db(0x0F); db(0xAE); setModRM(3, 6, idx); }
void unpckhpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM); }
void unpckhps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x100, isXMM_XMMorMEM); }
void unpcklpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM); }
@ -835,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); }
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); }
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); }
void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); }
void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); }
void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); }
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); }
@ -979,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T
void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); }
void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); }
void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); }
void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); }
void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); }
void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); }
void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); }
void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); }
@ -1169,6 +1192,10 @@ void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isME
void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x58); }
void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX, 0x59); }
void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x79); }
void vpclmulhqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x11); }
void vpclmulhqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x01); }
void vpclmullqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x10); }
void vpclmullqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x00); }
void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM | T_EVEX, 0x44, imm); }
void vpcmpeqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x74); }
void vpcmpeqd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x76); }
@ -1182,10 +1209,22 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); }
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD2); }
void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD3); }
void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD2); }
void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD3); }
void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD2); }
void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD3); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); }
@ -1217,6 +1256,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); }
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); }
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); }
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); }
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); }
void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); }
void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); }
void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); }
@ -1313,8 +1354,16 @@ void vroundsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { op
void vroundss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x0A, imm); }
void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x52); }
void vrsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F, 0x52); }
void vsha512msg1(const Ymm& y, const Xmm& x) { if (!(y.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y, 0, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCC); }
void vsha512msg2(const Ymm& y1, const Ymm& y2) { if (!(y1.isYMM() && y2.isYMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, 0, y2, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCD); }
void vsha512rnds2(const Ymm& y1, const Ymm& y2, const Xmm& x) { if (!(y1.isYMM() && y2.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, &y2, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCB); }
void vshufpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xC6, imm); }
void vshufps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xC6, imm); }
void vsm3msg1(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
void vsm3msg2(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
void vsm3rnds2(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0xDE, imm); }
void vsm4key4(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
void vsm4rnds4(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); }
void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x51); }
void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x51); }
void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX | T_ER_X, 0x51); }
@ -1339,7 +1388,10 @@ void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
void wait() { db(0x9B); }
void wbinvd() { db(0x0F); db(0x09); }
void wrmsr() { db(0x0F); db(0x30); }
void xabort(uint8_t imm) { db(0xC6); db(0xF8); db(imm); }
void xadd(const Operand& op, const Reg& reg) { opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F, 0xC0 | (reg.isBit(8) ? 0 : 1)); }
void xbegin(uint32_t rel) { db(0xC7); db(0xF8); dd(rel); }
void xend() { db(0x0F); db(0x01); db(0xD5); }
void xgetbv() { db(0x0F); db(0x01); db(0xD0); }
void xlatb() { db(0xD7); }
void xor_(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x30, 6); }
@ -1620,6 +1672,10 @@ void scasq() { db(0x48); db(0xAF); }
void stosq() { db(0x48); db(0xAB); }
void syscall() { db(0x0F); db(0x05); }
void sysret() { db(0x0F); db(0x07); }
void clui() { db(0xF3); db(0x0F); db(0x01); db(0xEE); }
void stui() { db(0xF3); db(0x0F); db(0x01); db(0xEF); }
void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); }
void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); }
void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); }
void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); }
void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); }
@ -1627,12 +1683,29 @@ void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR(
void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); }
void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); }
void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); }
void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); }
void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); }
void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); }
void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D); }
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); }
void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); }
void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); }
void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); }
void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); }
void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); }
void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); }
void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); }
void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); }
void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); }
void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); }
void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); }
void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); }
void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); }
void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); }
void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); }
void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); }
void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); }
void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); }
void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); }
void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); }
void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); }
@ -1644,6 +1717,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T
void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); }
void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); }
void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); }
void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); }
void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); }
#else
void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
@ -1898,7 +1972,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 |
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@ -2132,38 +2205,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T
void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); }
void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); }
void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); }
void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); }
void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); }
void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); }
void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); }
void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); }
void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); }
void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); }
void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); }
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); }
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); }
void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); }
void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); }
void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); }
void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); }
void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); }
void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); }
void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); }
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); }
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); }
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); }
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); }
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); }
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); }
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); }
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); }
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); }
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); }
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); }
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); }
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); }
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); }
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); }
void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); }
void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); }
void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); }
void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); }
void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); }
void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); }
void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); }
void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); }
void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); }
void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); }
void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); }
void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); }
void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); }
void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); }
void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); }
void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); }
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); }
void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); }
void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); }
void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); }

View File

@ -4,12 +4,18 @@
#ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <assert.h>
#ifndef XBYAK_THROW
#define XBYAK_THROW(x) ;
#define XBYAK_THROW_RET(x, y) return y;
#endif
#ifndef XBYAK_CONSTEXPR
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
#define XBYAK_CONSTEXPR constexpr
#else
#define XBYAK_CONSTEXPR
#endif
#endif
#else
#include <string.h>
@ -17,7 +23,6 @@
utility class and functions for Xbyak
Xbyak::util::Clock ; rdtsc timer
Xbyak::util::Cpu ; detect CPU
@note this header is UNDER CONSTRUCTION!
*/
#include "xbyak.h"
#endif // XBYAK_ONLY_CLASS_CPU
@ -27,8 +32,8 @@
#endif
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#if (_MSC_VER < 1400) && defined(XBYAK32)
#ifdef _WIN32
#if defined(_MSC_VER) && (_MSC_VER < 1400) && defined(XBYAK32)
static inline __declspec(naked) void __cpuid(int[4], int)
{
__asm {
@ -88,32 +93,69 @@ typedef enum {
CoreLevel = 2
} IntelCpuTopologyLevel;
namespace local {
template<uint64_t L, uint64_t H = 0>
struct TypeT {
};
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
template<typename T>
inline T max_(T x, T y) { return x >= y ? x : y; }
template<typename T>
inline T min_(T x, T y) { return x < y ? x : y; }
} // local
/**
CPU detection class
@note static inline const member is supported by c++17 or later, so use template hack
*/
class Cpu {
uint64_t type_;
public:
class Type {
uint64_t L;
uint64_t H;
public:
Type(uint64_t L = 0, uint64_t H = 0) : L(L), H(H) { }
template<uint64_t L_, uint64_t H_>
Type(local::TypeT<L_, H_>) : L(L_), H(H_) {}
Type& operator&=(const Type& rhs) { L &= rhs.L; H &= rhs.H; return *this; }
Type& operator|=(const Type& rhs) { L |= rhs.L; H |= rhs.H; return *this; }
Type operator&(const Type& rhs) const { Type t = *this; t &= rhs; return t; }
Type operator|(const Type& rhs) const { Type t = *this; t |= rhs; return t; }
bool operator==(const Type& rhs) const { return H == rhs.H && L == rhs.L; }
bool operator!=(const Type& rhs) const { return !operator==(rhs); }
// without explicit because backward compatilibity
operator bool() const { return (H | L) != 0; }
uint64_t getL() const { return L; }
uint64_t getH() const { return H; }
};
private:
Type type_;
//system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2;
unsigned int numCores_[maxTopologyLevels];
uint32_t numCores_[maxTopologyLevels];
static const unsigned int maxNumberCacheLevels = 10;
unsigned int dataCacheSize_[maxNumberCacheLevels];
unsigned int coresSharignDataCache_[maxNumberCacheLevels];
unsigned int dataCacheLevels_;
static const uint32_t maxNumberCacheLevels = 10;
uint32_t dataCacheSize_[maxNumberCacheLevels];
uint32_t coresSharignDataCache_[maxNumberCacheLevels];
uint32_t dataCacheLevels_;
unsigned int get32bitAsBE(const char *x) const
uint32_t get32bitAsBE(const char *x) const
{
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
}
unsigned int mask(int n) const
uint32_t mask(int n) const
{
return (1U << n) - 1;
}
void setFamily()
{
unsigned int data[4] = {};
uint32_t data[4] = {};
getCpuid(1, data);
stepping = data[0] & mask(4);
model = (data[0] >> 4) & mask(4);
@ -132,17 +174,15 @@ class Cpu {
displayModel = model;
}
}
unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end)
{
return (val >> base) & ((1u << (end - base)) - 1);
}
void setNumCores()
{
if ((type_ & tINTEL) == 0) return;
if (!has(tINTEL) && !has(tAMD)) return;
unsigned int data[4] = {};
/* CAUTION: These numbers are configuration as shipped by Intel. */
uint32_t data[4] = {};
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
/*
@ -152,7 +192,7 @@ class Cpu {
leaf 0xB can be zeroed-out by a hypervisor
*/
x2APIC_supported_ = true;
for (unsigned int i = 0; i < maxTopologyLevels; i++) {
for (uint32_t i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data);
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) {
@ -162,8 +202,8 @@ class Cpu {
/*
Fallback values in case a hypervisor has 0xB leaf zeroed-out.
*/
numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
} else {
/*
Failed to deremine num of cores without x2APIC support.
@ -176,14 +216,55 @@ class Cpu {
}
void setCacheHierarchy()
{
if ((type_ & tINTEL) == 0) return;
const unsigned int NO_CACHE = 0;
const unsigned int DATA_CACHE = 1;
// const unsigned int INSTRUCTION_CACHE = 2;
const unsigned int UNIFIED_CACHE = 3;
unsigned int smt_width = 0;
unsigned int logical_cores = 0;
unsigned int data[4] = {};
if (!has(tINTEL) && !has(tAMD)) return;
// https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
if (has(tAMD)) {
// There are 3 Data Cache Levels (L1, L2, L3)
dataCacheLevels_ = 3;
const uint32_t leaf = 0x8000001D; // for modern AMD CPus
// Sub leaf value ranges from 0 to 3
// Sub leaf value 0 refers to L1 Data Cache
// Sub leaf value 1 refers to L1 Instruction Cache
// Sub leaf value 2 refers to L2 Cache
// Sub leaf value 3 refers to L3 Cache
// For legacy AMD CPU, use leaf 0x80000005 for L1 cache
// and 0x80000006 for L2 and L3 cache
int cache_index = 0;
for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
// Skip sub_leaf = 1 as it refers to
// L1 Instruction Cache (not required)
if (sub_leaf == 1) {
continue;
}
uint32_t data[4] = {};
getCpuidEx(leaf, sub_leaf, data);
// Cache Size = Line Size * Partitions * Associativity * Cache Sets
dataCacheSize_[cache_index] =
(extractBit(data[1], 22, 31) + 1) // Associativity-1
* (extractBit(data[1], 12, 21) + 1) // Partitions-1
* (extractBit(data[1], 0, 11) + 1) // Line Size
* (data[2] + 1);
// Calculate the number of cores sharing the current data cache
int smt_width = numCores_[0];
int logical_cores = numCores_[1];
int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1;
if (logical_cores != 0) {
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
}
coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1);
++cache_index;
}
return;
}
// intel
const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1;
// const uint32_t INSTRUCTION_CACHE = 2;
const uint32_t UNIFIED_CACHE = 3;
uint32_t smt_width = 0;
uint32_t logical_cores = 0;
uint32_t data[4] = {};
if (x2APIC_supported_) {
smt_width = numCores_[0];
@ -201,12 +282,12 @@ class Cpu {
*/
for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
getCpuidEx(0x4, i, data);
unsigned int cacheType = extractBit(data[0], 0, 4);
uint32_t cacheType = extractBit(data[0], 0, 4);
if (cacheType == NO_CACHE) break;
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
}
assert(actual_logical_cores != 0);
dataCacheSize_[dataCacheLevels_] =
@ -216,7 +297,7 @@ class Cpu {
* (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
@ -231,7 +312,7 @@ public:
int displayFamily; // family + extFamily
int displayModel; // model + extModel
unsigned int getNumCores(IntelCpuTopologyLevel level) const {
uint32_t getNumCores(IntelCpuTopologyLevel level) const {
if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
switch (level) {
case SmtLevel: return numCores_[level - 1];
@ -240,13 +321,13 @@ public:
}
}
unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
unsigned int getCoresSharingDataCache(unsigned int i) const
uint32_t getDataCacheLevels() const { return dataCacheLevels_; }
uint32_t getCoresSharingDataCache(uint32_t i) const
{
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return coresSharignDataCache_[i];
}
unsigned int getDataCacheSize(unsigned int i) const
uint32_t getDataCacheSize(uint32_t i) const
{
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return dataCacheSize_[i];
@ -255,10 +336,10 @@ public:
/*
data[] = { eax, ebx, ecx, edx }
*/
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#ifdef _WIN32
__cpuid(reinterpret_cast<int*>(data), eaxIn);
#else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
@ -268,10 +349,10 @@ public:
(void)data;
#endif
}
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#ifdef _WIN32
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
@ -288,7 +369,7 @@ public:
#ifdef _MSC_VER
return _xgetbv(0);
#else
unsigned int eax, edx;
uint32_t eax, edx;
// xgetvb is not support on gcc 4.2
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
@ -298,93 +379,116 @@ public:
return 0;
#endif
}
typedef uint64_t Type;
static const Type NONE = 0;
static const Type tMMX = 1 << 0;
static const Type tMMX2 = 1 << 1;
static const Type tCMOV = 1 << 2;
static const Type tSSE = 1 << 3;
static const Type tSSE2 = 1 << 4;
static const Type tSSE3 = 1 << 5;
static const Type tSSSE3 = 1 << 6;
static const Type tSSE41 = 1 << 7;
static const Type tSSE42 = 1 << 8;
static const Type tPOPCNT = 1 << 9;
static const Type tAESNI = 1 << 10;
static const Type tOSXSAVE = 1 << 12;
static const Type tPCLMULQDQ = 1 << 13;
static const Type tAVX = 1 << 14;
static const Type tFMA = 1 << 15;
#define XBYAK_SPLIT_ID(id) ((0 <= id && id < 64) ? (1ull << (id % 64)) : 0), (id >= 64 ? (1ull << (id % 64)) : 0)
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && (_MSC_VER >= 1700)) /* VS2012 */
#define XBYAK_DEFINE_TYPE(id, NAME) static const constexpr local::TypeT<XBYAK_SPLIT_ID(id)> NAME{}
#else
#define XBYAK_DEFINE_TYPE(id, NAME) static const local::TypeT<XBYAK_SPLIT_ID(id)> NAME
#endif
XBYAK_DEFINE_TYPE(0, tMMX);
XBYAK_DEFINE_TYPE(1, tMMX2);
XBYAK_DEFINE_TYPE(2, tCMOV);
XBYAK_DEFINE_TYPE(3, tSSE);
XBYAK_DEFINE_TYPE(4, tSSE2);
XBYAK_DEFINE_TYPE(5, tSSE3);
XBYAK_DEFINE_TYPE(6, tSSSE3);
XBYAK_DEFINE_TYPE(7, tSSE41);
XBYAK_DEFINE_TYPE(8, tSSE42);
XBYAK_DEFINE_TYPE(9, tPOPCNT);
XBYAK_DEFINE_TYPE(10, tAESNI);
XBYAK_DEFINE_TYPE(11, tAVX512_FP16);
XBYAK_DEFINE_TYPE(12, tOSXSAVE);
XBYAK_DEFINE_TYPE(13, tPCLMULQDQ);
XBYAK_DEFINE_TYPE(14, tAVX);
XBYAK_DEFINE_TYPE(15, tFMA);
XBYAK_DEFINE_TYPE(16, t3DN);
XBYAK_DEFINE_TYPE(17, tE3DN);
XBYAK_DEFINE_TYPE(18, tWAITPKG);
XBYAK_DEFINE_TYPE(19, tRDTSCP);
XBYAK_DEFINE_TYPE(20, tAVX2);
XBYAK_DEFINE_TYPE(21, tBMI1); // andn, bextr, blsi, blsmsk, blsr, tzcnt
XBYAK_DEFINE_TYPE(22, tBMI2); // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
XBYAK_DEFINE_TYPE(23, tLZCNT);
XBYAK_DEFINE_TYPE(24, tINTEL);
XBYAK_DEFINE_TYPE(25, tAMD);
XBYAK_DEFINE_TYPE(26, tENHANCED_REP); // enhanced rep movsb/stosb
XBYAK_DEFINE_TYPE(27, tRDRAND);
XBYAK_DEFINE_TYPE(28, tADX); // adcx, adox
XBYAK_DEFINE_TYPE(29, tRDSEED); // rdseed
XBYAK_DEFINE_TYPE(30, tSMAP); // stac
XBYAK_DEFINE_TYPE(31, tHLE); // xacquire, xrelease, xtest
XBYAK_DEFINE_TYPE(32, tRTM); // xbegin, xend, xabort
XBYAK_DEFINE_TYPE(33, tF16C); // vcvtph2ps, vcvtps2ph
XBYAK_DEFINE_TYPE(34, tMOVBE); // mobve
XBYAK_DEFINE_TYPE(35, tAVX512F);
XBYAK_DEFINE_TYPE(36, tAVX512DQ);
XBYAK_DEFINE_TYPE(37, tAVX512_IFMA);
XBYAK_DEFINE_TYPE(37, tAVX512IFMA);// = tAVX512_IFMA;
XBYAK_DEFINE_TYPE(38, tAVX512PF);
XBYAK_DEFINE_TYPE(39, tAVX512ER);
XBYAK_DEFINE_TYPE(40, tAVX512CD);
XBYAK_DEFINE_TYPE(41, tAVX512BW);
XBYAK_DEFINE_TYPE(42, tAVX512VL);
XBYAK_DEFINE_TYPE(43, tAVX512_VBMI);
XBYAK_DEFINE_TYPE(43, tAVX512VBMI); // = tAVX512_VBMI; // changed by Intel's manual
XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW);
XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS);
XBYAK_DEFINE_TYPE(46, tPREFETCHWT1);
XBYAK_DEFINE_TYPE(47, tPREFETCHW);
XBYAK_DEFINE_TYPE(48, tSHA);
XBYAK_DEFINE_TYPE(49, tMPX);
XBYAK_DEFINE_TYPE(50, tAVX512_VBMI2);
XBYAK_DEFINE_TYPE(51, tGFNI);
XBYAK_DEFINE_TYPE(52, tVAES);
XBYAK_DEFINE_TYPE(53, tVPCLMULQDQ);
XBYAK_DEFINE_TYPE(54, tAVX512_VNNI);
XBYAK_DEFINE_TYPE(55, tAVX512_BITALG);
XBYAK_DEFINE_TYPE(56, tAVX512_VPOPCNTDQ);
XBYAK_DEFINE_TYPE(57, tAVX512_BF16);
XBYAK_DEFINE_TYPE(58, tAVX512_VP2INTERSECT);
XBYAK_DEFINE_TYPE(59, tAMX_TILE);
XBYAK_DEFINE_TYPE(60, tAMX_INT8);
XBYAK_DEFINE_TYPE(61, tAMX_BF16);
XBYAK_DEFINE_TYPE(62, tAVX_VNNI);
XBYAK_DEFINE_TYPE(63, tCLFLUSHOPT);
XBYAK_DEFINE_TYPE(64, tCLDEMOTE);
XBYAK_DEFINE_TYPE(65, tMOVDIRI);
XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen
XBYAK_DEFINE_TYPE(68, tAMX_FP16);
XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
XBYAK_DEFINE_TYPE(75, tSERIALIZE);
XBYAK_DEFINE_TYPE(76, tUINTR);
XBYAK_DEFINE_TYPE(77, tXSAVE);
XBYAK_DEFINE_TYPE(78, tSHA512);
XBYAK_DEFINE_TYPE(79, tSM3);
XBYAK_DEFINE_TYPE(80, tSM4);
XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
static const Type t3DN = 1 << 16;
static const Type tE3DN = 1 << 17;
static const Type tRDTSCP = 1 << 19;
static const Type tAVX2 = 1 << 20;
static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
static const Type tLZCNT = 1 << 23;
static const Type tINTEL = 1 << 24;
static const Type tAMD = 1 << 25;
static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
static const Type tRDRAND = 1 << 27;
static const Type tADX = 1 << 28; // adcx, adox
static const Type tRDSEED = 1 << 29; // rdseed
static const Type tSMAP = 1 << 30; // stac
static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
static const Type tMOVBE = uint64_t(1) << 34; // mobve
static const Type tAVX512F = uint64_t(1) << 35;
static const Type tAVX512DQ = uint64_t(1) << 36;
static const Type tAVX512_IFMA = uint64_t(1) << 37;
static const Type tAVX512IFMA = tAVX512_IFMA;
static const Type tAVX512PF = uint64_t(1) << 38;
static const Type tAVX512ER = uint64_t(1) << 39;
static const Type tAVX512CD = uint64_t(1) << 40;
static const Type tAVX512BW = uint64_t(1) << 41;
static const Type tAVX512VL = uint64_t(1) << 42;
static const Type tAVX512_VBMI = uint64_t(1) << 43;
static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
static const Type tAVX512_4VNNIW = uint64_t(1) << 44;
static const Type tAVX512_4FMAPS = uint64_t(1) << 45;
static const Type tPREFETCHWT1 = uint64_t(1) << 46;
static const Type tPREFETCHW = uint64_t(1) << 47;
static const Type tSHA = uint64_t(1) << 48;
static const Type tMPX = uint64_t(1) << 49;
static const Type tAVX512_VBMI2 = uint64_t(1) << 50;
static const Type tGFNI = uint64_t(1) << 51;
static const Type tVAES = uint64_t(1) << 52;
static const Type tVPCLMULQDQ = uint64_t(1) << 53;
static const Type tAVX512_VNNI = uint64_t(1) << 54;
static const Type tAVX512_BITALG = uint64_t(1) << 55;
static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
static const Type tAVX512_BF16 = uint64_t(1) << 57;
static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58;
static const Type tAMX_TILE = uint64_t(1) << 59;
static const Type tAMX_INT8 = uint64_t(1) << 60;
static const Type tAMX_BF16 = uint64_t(1) << 61;
static const Type tAVX_VNNI = uint64_t(1) << 62;
static const Type tAVX512_FP16 = uint64_t(1) << 11;
// 18, 63
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
Cpu()
: type_(NONE)
: type_()
, x2APIC_supported_(false)
, numCores_()
, dataCacheSize_()
, coresSharignDataCache_()
, dataCacheLevels_(0)
{
unsigned int data[4] = {};
const unsigned int& EAX = data[0];
const unsigned int& EBX = data[1];
const unsigned int& ECX = data[2];
const unsigned int& EDX = data[3];
uint32_t data[4] = {};
const uint32_t& EAX = data[0];
const uint32_t& EBX = data[1];
const uint32_t& ECX = data[2];
const uint32_t& EDX = data[3];
getCpuid(0, data);
const unsigned int maxNum = EAX;
const uint32_t maxNum = EAX;
static const char intel[] = "ntel";
static const char amd[] = "cAMD";
if (ECX == get32bitAsBE(amd)) {
@ -407,7 +511,8 @@ public:
// Extended flags information
getCpuid(0x80000000, data);
if (EAX >= 0x80000001) {
const uint32_t maxExtendedNum = EAX;
if (maxExtendedNum >= 0x80000001) {
getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN;
@ -419,15 +524,21 @@ public:
if (ECX & (1U << 8)) type_ |= tPREFETCHW;
}
if (maxExtendedNum >= 0x80000008) {
getCpuid(0x80000008, data);
if (EBX & (1U << 0)) type_ |= tCLZERO;
}
getCpuid(1, data);
if (ECX & (1U << 0)) type_ |= tSSE3;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 9)) type_ |= tSSSE3;
if (ECX & (1U << 19)) type_ |= tSSE41;
if (ECX & (1U << 20)) type_ |= tSSE42;
if (ECX & (1U << 22)) type_ |= tMOVBE;
if (ECX & (1U << 23)) type_ |= tPOPCNT;
if (ECX & (1U << 25)) type_ |= tAESNI;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 26)) type_ |= tXSAVE;
if (ECX & (1U << 27)) type_ |= tOSXSAVE;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (ECX & (1U << 29)) type_ |= tF16C;
@ -460,9 +571,6 @@ public:
if (EBX & (1U << 31)) type_ |= tAVX512VL;
if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
if (ECX & (1U << 8)) type_ |= tGFNI;
if (ECX & (1U << 9)) type_ |= tVAES;
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
@ -484,20 +592,41 @@ public:
if (EBX & (1U << 18)) type_ |= tRDSEED;
if (EBX & (1U << 19)) type_ |= tADX;
if (EBX & (1U << 20)) type_ |= tSMAP;
if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
if (EBX & (1U << 4)) type_ |= tHLE;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 29)) type_ |= tSHA;
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
if (ECX & (1U << 5)) type_ |= tWAITPKG;
if (ECX & (1U << 8)) type_ |= tGFNI;
if (ECX & (1U << 9)) type_ |= tVAES;
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
if (ECX & (1U << 27)) type_ |= tMOVDIRI;
if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
if (EDX & (1U << 5)) type_ |= tUINTR;
if (EDX & (1U << 14)) type_ |= tSERIALIZE;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (EAX & (1U << 0)) type_ |= tSHA512;
if (EAX & (1U << 1)) type_ |= tSM3;
if (EAX & (1U << 2)) type_ |= tSM4;
if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
}
if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
if (EAX & (1U << 21)) type_ |= tAMX_FP16;
if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
}
}
setFamily();
@ -512,9 +641,9 @@ public:
printf("display:family=%X, model=%X\n", displayFamily, displayModel);
#endif
}
bool has(Type type) const
bool has(const Type& type) const
{
return (type & type_) != 0;
return (type & type_) == type;
}
};
@ -527,7 +656,7 @@ public:
#ifdef _MSC_VER
return __rdtsc();
#else
unsigned int eax, edx;
uint32_t eax, edx;
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
return ((uint64_t)edx << 32) | eax;
#endif
@ -564,7 +693,7 @@ const int UseRDX = 1 << 7;
class Pack {
static const size_t maxTblNum = 15;
const Xbyak::Reg64 *tbl_[maxTblNum];
Xbyak::Reg64 tbl_[maxTblNum];
size_t n_;
public:
Pack() : tbl_(), n_(0) {}
@ -581,32 +710,36 @@ public:
return *this;
}
Pack(const Xbyak::Reg64& t0)
{ n_ = 1; tbl_[0] = &t0; }
{ n_ = 1; tbl_[0] = t0; }
Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
{ n_ = 2; tbl_[0] = t0; tbl_[1] = t1; }
Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
{ n_ = 3; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; }
Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
{ n_ = 4; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; }
Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
{ n_ = 5; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; }
Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
{ n_ = 6; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; }
Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
{ n_ = 7; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; }
Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
{ n_ = 8; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; }
Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
{ n_ = 9; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; }
Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
{ n_ = 10; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; }
Pack(const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 11; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; }
Pack(const Xbyak::Reg64& tb, const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 12; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; tbl_[11] = tb; }
Pack& append(const Xbyak::Reg64& t)
{
if (n_ == maxTblNum) {
fprintf(stderr, "ERR Pack::can't append\n");
XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
}
tbl_[n_++] = &t;
tbl_[n_++] = t;
return *this;
}
void init(const Xbyak::Reg64 *tbl, size_t n)
@ -617,7 +750,7 @@ public:
}
n_ = n;
for (size_t i = 0; i < n; i++) {
tbl_[i] = &tbl[i];
tbl_[i] = tbl[i];
}
}
const Xbyak::Reg64& operator[](size_t n) const
@ -626,7 +759,7 @@ public:
fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
}
return *tbl_[n];
return tbl_[n];
}
size_t size() const { return n_; }
/*
@ -649,7 +782,7 @@ public:
void put() const
{
for (size_t i = 0; i < n_; i++) {
printf("%s ", tbl_[i]->toString());
printf("%s ", tbl_[i].toString());
}
printf("\n");
}
@ -716,7 +849,7 @@ public:
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
const Reg64& _rsp = code->rsp;
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
saveNum_ = local::max_(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum;
for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i]));