3rdparty: Upgrade xbyak to 6.00

This commit is contained in:
Christian Kenny 2021-11-21 21:52:19 -05:00 committed by refractionpcsx2
parent 1917f2b98a
commit 791f2a63ac
4 changed files with 2538 additions and 1393 deletions

47
3rdparty/xbyak/xbyak/COPYRIGHT vendored Normal file
View File

@ -0,0 +1,47 @@
Copyright (c) 2007 MITSUNARI Shigeo
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
Neither the name of the copyright owner nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
ソースコード形式かバイナリ形式か、変更するかしないかを問わず、以下の条件を満た
す場合に限り、再頒布および使用が許可されます。
ソースコードを再頒布する場合、上記の著作権表示、本条件一覧、および下記免責条項
を含めること。
バイナリ形式で再頒布する場合、頒布物に付属のドキュメント等の資料に、上記の著作
権表示、本条件一覧、および下記免責条項を含めること。
書面による特別の許可なしに、本ソフトウェアから派生した製品の宣伝または販売促進
に、著作権者の名前またはコントリビューターの名前を使用してはならない。
本ソフトウェアは、著作権者およびコントリビューターによって「現状のまま」提供さ
れており、明示黙示を問わず、商業的な使用可能性、および特定の目的に対する適合性
に関する暗黙の保証も含め、またそれに限定されない、いかなる保証もありません。
著作権者もコントリビューターも、事由のいかんを問わず、 損害発生の原因いかんを
問わず、かつ責任の根拠が契約であるか厳格責任であるか(過失その他の)不法行為で
あるかを問わず、仮にそのような損害が発生する可能性を知らされていたとしても、
本ソフトウェアの使用によって発生した(代替品または代用サービスの調達、使用の
喪失、データの喪失、利益の喪失、業務の中断も含め、またそれに限定されない)直接
損害、間接損害、偶発的な損害、特別損害、懲罰的損害、または結果損害について、
一切責任を負わないものとします。

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,36 +1,17 @@
/* Copyright (c) 2007 MITSUNARI Shigeo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of the copyright owner nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef XBYAK_XBYAK_UTIL_H_
#define XBYAK_XBYAK_UTIL_H_
// We want to keep this file similar to the original xbyak
// clang-format off
#ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <assert.h>
#ifndef XBYAK_THROW
#define XBYAK_THROW(x) ;
#define XBYAK_THROW_RET(x, y) return y;
#endif
#else
#include <string.h>
/**
utility class and functions for Xbyak
@ -39,7 +20,13 @@
@note this header is UNDER CONSTRUCTION!
*/
#include "xbyak.h"
#endif // XBYAK_ONLY_CLASS_CPU
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define XBYAK_INTEL_CPU_SPECIFIC
#endif
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#if (_MSC_VER < 1400) && defined(XBYAK32)
static inline __declspec(naked) void __cpuid(int[4], int)
@ -78,35 +65,44 @@
#endif
#endif
#endif
#endif
#ifdef _MSC_VER
extern "C" unsigned __int64 __xgetbv(int);
#ifdef XBYAK_USE_VTUNE
// -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
#include <jitprofiling.h>
#ifdef _MSC_VER
#pragma comment(lib, "libittnotify.lib")
#endif
#ifdef __linux__
#include <dlfcn.h>
#endif
#endif
#ifdef __linux__
#define XBYAK_USE_PERF
#endif
namespace Xbyak { namespace util {
/* GCC uses AVX/SSE4 operation to handle the uint64 type.
*
* It is quite annoying because the purpose of the code is to test the support
* of AVX/SSEn
*
* So far, we don't need other ISA on i386 so I hacked the code to limit the
* type to 32 bits. If we want to support AVX512 we might need to shuffle the
* code a bit.
*
* Extra note: it would be waste to use AVX512 on 32 bits, registers are
* limited to 8 instead of 32.
*/
/**
	Topology level selector for Cpu::getNumCores().
	The numeric values matter: they are compared against the level type read
	from CPUID leaf 0xB and (value - 1) is used as an index into Cpu::numCores_.
*/
enum IntelCpuTopologyLevel {
	SmtLevel = 1, // logical processors (SMT threads)
	CoreLevel = 2 // cores
};
/**
CPU detection class
*/
class Cpu {
#ifdef XBYAK64
uint64 type_;
#else
uint32 type_;
#endif
uint64_t type_;
//system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2;
unsigned int numCores_[maxTopologyLevels];
static const unsigned int maxNumberCacheLevels = 10;
unsigned int dataCacheSize_[maxNumberCacheLevels];
unsigned int coresSharignDataCache_[maxNumberCacheLevels];
unsigned int dataCacheLevels_;
unsigned int get32bitAsBE(const char *x) const
{
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
@ -117,7 +113,7 @@ class Cpu {
}
void setFamily()
{
unsigned int data[4];
unsigned int data[4] = {};
getCpuid(1, data);
stepping = data[0] & mask(4);
model = (data[0] >> 4) & mask(4);
@ -136,6 +132,96 @@ class Cpu {
displayModel = model;
}
}
/*
	Extract the bit field val[end:base] and return it right-aligned.
	Both bounds are inclusive bit positions (bit 0 is the LSB), matching how the
	callers in this class pass them (e.g. 8, 15 for an 8-bit field).
	@param val [in] 32-bit register value
	@param base [in] lowest bit of the field
	@param end [in] highest bit of the field
	@note requires base <= end <= 31
*/
unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end)
{
	// The field is end - base + 1 bits wide; using end - base would drop its top bit.
	const unsigned int width = end + 1 - base;
	const unsigned int shifted = val >> base;
	// A full 32-bit field needs no mask, and (1u << 32) would be undefined.
	if (width >= 32) return shifted;
	return shifted & ((1u << width) - 1);
}
void setNumCores()
{
if ((type_ & tINTEL) == 0) return;
unsigned int data[4] = {};
/* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
/*
if leaf 11 exists(x2APIC is supported),
we use it to get the number of smt cores and cores on socket
leaf 0xB can be zeroed-out by a hypervisor
*/
x2APIC_supported_ = true;
for (unsigned int i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data);
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) {
numCores_[level - 1] = extractBit(data[1], 0, 15);
}
}
/*
Fallback values in case a hypervisor has 0xB leaf zeroed-out.
*/
numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
} else {
/*
Failed to deremine num of cores without x2APIC support.
TODO: USE initial APIC ID to determine ncores.
*/
numCores_[SmtLevel - 1] = 0;
numCores_[CoreLevel - 1] = 0;
}
}
/*
	Fill dataCacheSize_[], coresSharignDataCache_[] and dataCacheLevels_ by
	walking the sub-leaves of CPUID leaf 0x4.
	Does nothing unless tINTEL is set in type_.
	Reads x2APIC_supported_ and numCores_[], so the result depends on the
	values they hold when this function is called.
*/
void setCacheHierarchy()
{
if ((type_ & tINTEL) == 0) return;
// values of the cache type field (data[0] bits from 0) that are tested below
const unsigned int NO_CACHE = 0;
const unsigned int DATA_CACHE = 1;
// const unsigned int INSTRUCTION_CACHE = 2;
const unsigned int UNIFIED_CACHE = 3;
// 0 means "not known yet" for both counters
unsigned int smt_width = 0;
unsigned int logical_cores = 0;
unsigned int data[4] = {};
if (x2APIC_supported_) {
// seed from the topology counts stored in numCores_
smt_width = numCores_[0];
logical_cores = numCores_[1];
}
/*
Assumptions:
- the first level of data cache is not shared (which is the
case for every existing architecture); this is used to
determine the SMT width for arch not supporting leaf 11.
- when leaf 4 reports a number of cores smaller than the numCores_
on socket reported by leaf 11, it is the correct number
of cores, not an upper bound.
*/
// stop at the first NO_CACHE sub-leaf or once maxNumberCacheLevels entries are recorded
for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
getCpuidEx(0x4, i, data);
unsigned int cacheType = extractBit(data[0], 0, 4);
if (cacheType == NO_CACHE) break;
// instruction caches (and any other type) are skipped without being recorded
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
}
assert(actual_logical_cores != 0);
// NOTE(review): presumably ways * partitions * line size * sets, i.e. the
// cache size in bytes per the Intel SDM layout of leaf 4 - confirm
dataCacheSize_[dataCacheLevels_] =
(extractBit(data[1], 22, 31) + 1)
* (extractBit(data[1], 12, 21) + 1)
* (extractBit(data[1], 0, 11) + 1)
* (data[2] + 1);
// the first pure data cache fixes the SMT width when it was not seeded above
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
// convert logical processors to cores, never reporting fewer than one
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
}
public:
int model;
int family;
@ -144,42 +230,76 @@ public:
int extFamily;
int displayFamily; // family + extFamily
int displayModel; // model + extModel
/**
	Number of units at the given topology level.
	@param level [in] SmtLevel or CoreLevel
	@return SmtLevel : numCores_[SmtLevel - 1]
	        CoreLevel : numCores_[CoreLevel - 1] / numCores_[SmtLevel - 1]
	@note raises ERR_X2APIC_IS_NOT_SUPPORTED through XBYAK_THROW_RET (fallback
	      value 0) when x2APIC_supported_ is false or level is any other value
*/
unsigned int getNumCores(IntelCpuTopologyLevel level) const {
	if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
	if (level == SmtLevel) {
		return numCores_[SmtLevel - 1];
	}
	if (level == CoreLevel) {
		const unsigned int perSmt = numCores_[SmtLevel - 1];
		return numCores_[CoreLevel - 1] / perSmt;
	}
	XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
}
/**
	@return number of valid entries in the per-level cache tables, i.e. the
	        exclusive upper bound for the index accepted by
	        getDataCacheSize() and getCoresSharingDataCache()
*/
unsigned int getDataCacheLevels() const
{
	return dataCacheLevels_;
}
unsigned int getCoresSharingDataCache(unsigned int i) const
{
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return coresSharignDataCache_[i];
}
unsigned int getDataCacheSize(unsigned int i) const
{
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return dataCacheSize_[i];
}
/*
data[] = { eax, ebx, ecx, edx }
*/
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuid(reinterpret_cast<int*>(data), eaxIn);
#else
#else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)data;
#endif
}
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
#else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)ecxIn;
(void)data;
#endif
}
static inline uint64 getXfeature()
static inline uint64_t getXfeature()
{
#ifdef _MSC_VER
return __xgetbv(0);
#else
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return _xgetbv(0);
#else
unsigned int eax, edx;
// xgetvb is not support on gcc 4.2
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64)edx << 32) | eax;
return ((uint64_t)edx << 32) | eax;
#endif
#else
return 0;
#endif
}
#ifdef XBYAK64
typedef uint64 Type;
#else
typedef uint32 Type;
#endif
typedef uint64_t Type;
static const Type NONE = 0;
static const Type tMMX = 1 << 0;
static const Type tMMX2 = 1 << 1;
@ -192,7 +312,6 @@ public:
static const Type tSSE42 = 1 << 8;
static const Type tPOPCNT = 1 << 9;
static const Type tAESNI = 1 << 10;
static const Type tSSE5 = 1 << 11;
static const Type tOSXSAVE = 1 << 12;
static const Type tPCLMULQDQ = 1 << 13;
static const Type tAVX = 1 << 14;
@ -200,7 +319,6 @@ public:
static const Type t3DN = 1 << 16;
static const Type tE3DN = 1 << 17;
static const Type tSSE4a = 1 << 18;
static const Type tRDTSCP = 1 << 19;
static const Type tAVX2 = 1 << 20;
static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
@ -215,111 +333,184 @@ public:
static const Type tADX = 1 << 28; // adcx, adox
static const Type tRDSEED = 1 << 29; // rdseed
static const Type tSMAP = 1 << 30; // stac
#ifdef XBYAK64
static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest
static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort
static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph
static const Type tMOVBE = uint64(1) << 34; // mobve
static const Type tAVX512F = uint64(1) << 35;
static const Type tAVX512DQ = uint64(1) << 36;
static const Type tAVX512IFMA = uint64(1) << 37;
static const Type tAVX512PF = uint64(1) << 38;
static const Type tAVX512ER = uint64(1) << 39;
static const Type tAVX512CD = uint64(1) << 40;
static const Type tAVX512BW = uint64(1) << 41;
static const Type tAVX512VL = uint64(1) << 42;
static const Type tAVX512VBMI = uint64(1) << 43;
#endif
static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest
static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort
static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph
static const Type tMOVBE = uint64_t(1) << 34; // mobve
static const Type tAVX512F = uint64_t(1) << 35;
static const Type tAVX512DQ = uint64_t(1) << 36;
static const Type tAVX512_IFMA = uint64_t(1) << 37;
static const Type tAVX512IFMA = tAVX512_IFMA;
static const Type tAVX512PF = uint64_t(1) << 38;
static const Type tAVX512ER = uint64_t(1) << 39;
static const Type tAVX512CD = uint64_t(1) << 40;
static const Type tAVX512BW = uint64_t(1) << 41;
static const Type tAVX512VL = uint64_t(1) << 42;
static const Type tAVX512_VBMI = uint64_t(1) << 43;
static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual
static const Type tAVX512_4VNNIW = uint64_t(1) << 44;
static const Type tAVX512_4FMAPS = uint64_t(1) << 45;
static const Type tPREFETCHWT1 = uint64_t(1) << 46;
static const Type tPREFETCHW = uint64_t(1) << 47;
static const Type tSHA = uint64_t(1) << 48;
static const Type tMPX = uint64_t(1) << 49;
static const Type tAVX512_VBMI2 = uint64_t(1) << 50;
static const Type tGFNI = uint64_t(1) << 51;
static const Type tVAES = uint64_t(1) << 52;
static const Type tVPCLMULQDQ = uint64_t(1) << 53;
static const Type tAVX512_VNNI = uint64_t(1) << 54;
static const Type tAVX512_BITALG = uint64_t(1) << 55;
static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56;
static const Type tAVX512_BF16 = uint64_t(1) << 57;
static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58;
static const Type tAMX_TILE = uint64_t(1) << 59;
static const Type tAMX_INT8 = uint64_t(1) << 60;
static const Type tAMX_BF16 = uint64_t(1) << 61;
static const Type tAVX_VNNI = uint64_t(1) << 62;
static const Type tAVX512_FP16 = uint64_t(1) << 11;
// 18, 63
Cpu()
: type_(NONE)
, x2APIC_supported_(false)
, numCores_()
, dataCacheSize_()
, coresSharignDataCache_()
, dataCacheLevels_(0)
{
unsigned int data[4];
unsigned int data[4] = {};
const unsigned int& EAX = data[0];
const unsigned int& EBX = data[1];
const unsigned int& ECX = data[2];
const unsigned int& EDX = data[3];
getCpuid(0, data);
const unsigned int maxNum = data[0];
const unsigned int maxNum = EAX;
static const char intel[] = "ntel";
static const char amd[] = "cAMD";
if (data[2] == get32bitAsBE(amd)) {
if (ECX == get32bitAsBE(amd)) {
type_ |= tAMD;
getCpuid(0x80000001, data);
if (data[3] & (1U << 31)) type_ |= t3DN;
if (data[3] & (1U << 15)) type_ |= tCMOV;
if (data[3] & (1U << 30)) type_ |= tE3DN;
if (data[3] & (1U << 22)) type_ |= tMMX2;
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
}
if (data[2] == get32bitAsBE(intel)) {
type_ |= tINTEL;
getCpuid(0x80000001, data);
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
if (data[2] & (1U << 5)) type_ |= tLZCNT;
}
getCpuid(1, data);
if (data[2] & (1U << 0)) type_ |= tSSE3;
if (data[2] & (1U << 9)) type_ |= tSSSE3;
if (data[2] & (1U << 19)) type_ |= tSSE41;
if (data[2] & (1U << 20)) type_ |= tSSE42;
#ifdef XBYAK64
if (data[2] & (1U << 22)) type_ |= tMOVBE;
if (data[2] & (1U << 29)) type_ |= tF16C;
#endif
if (data[2] & (1U << 23)) type_ |= tPOPCNT;
if (data[2] & (1U << 25)) type_ |= tAESNI;
if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ;
if (data[2] & (1U << 27)) type_ |= tOSXSAVE;
if (data[2] & (1U << 30)) type_ |= tRDRAND;
if (EDX & (1U << 31)) {
type_ |= t3DN;
// 3DNow! implies support for PREFETCHW on AMD
type_ |= tPREFETCHW;
}
if (data[3] & (1U << 15)) type_ |= tCMOV;
if (data[3] & (1U << 23)) type_ |= tMMX;
if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE;
if (data[3] & (1U << 26)) type_ |= tSSE2;
if (EDX & (1U << 29)) {
// Long mode implies support for PREFETCHW on AMD
type_ |= tPREFETCHW;
}
}
if (ECX == get32bitAsBE(intel)) {
type_ |= tINTEL;
}
// Extended flags information
getCpuid(0x80000000, data);
if (EAX >= 0x80000001) {
getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (ECX & (1U << 5)) type_ |= tLZCNT;
if (ECX & (1U << 8)) type_ |= tPREFETCHW;
}
getCpuid(1, data);
if (ECX & (1U << 0)) type_ |= tSSE3;
if (ECX & (1U << 9)) type_ |= tSSSE3;
if (ECX & (1U << 19)) type_ |= tSSE41;
if (ECX & (1U << 20)) type_ |= tSSE42;
if (ECX & (1U << 22)) type_ |= tMOVBE;
if (ECX & (1U << 23)) type_ |= tPOPCNT;
if (ECX & (1U << 25)) type_ |= tAESNI;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 27)) type_ |= tOSXSAVE;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (ECX & (1U << 29)) type_ |= tF16C;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 23)) type_ |= tMMX;
if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE;
if (EDX & (1U << 26)) type_ |= tSSE2;
if (type_ & tOSXSAVE) {
// check XFEATURE_ENABLED_MASK[2:1] = '11b'
uint64 bv = getXfeature();
uint64_t bv = getXfeature();
if ((bv & 6) == 6) {
if (data[2] & (1U << 28)) type_ |= tAVX;
if (data[2] & (1U << 12)) type_ |= tFMA;
#ifdef XBYAK64
if (((bv >> 5) & 7) == 7) {
getCpuid(7, data);
if (data[1] & (1U << 16)) type_ |= tAVX512F;
if (ECX & (1U << 28)) type_ |= tAVX;
if (ECX & (1U << 12)) type_ |= tFMA;
// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__)
if (((bv >> 5) & 7) == 7)
#endif
{
getCpuidEx(7, 0, data);
if (EBX & (1U << 16)) type_ |= tAVX512F;
if (type_ & tAVX512F) {
getCpuidEx(7, 0, data);
if (data[1] & (1U << 17)) type_ |= tAVX512DQ;
if (data[1] & (1U << 21)) type_ |= tAVX512IFMA;
if (data[1] & (1U << 26)) type_ |= tAVX512PF;
if (data[1] & (1U << 27)) type_ |= tAVX512ER;
if (data[1] & (1U << 28)) type_ |= tAVX512CD;
if (data[1] & (1U << 30)) type_ |= tAVX512BW;
if (data[1] & (1U << 31)) type_ |= tAVX512VL;
if (data[2] & (1U << 1)) type_ |= tAVX512VBMI;
if (EBX & (1U << 17)) type_ |= tAVX512DQ;
if (EBX & (1U << 21)) type_ |= tAVX512_IFMA;
if (EBX & (1U << 26)) type_ |= tAVX512PF;
if (EBX & (1U << 27)) type_ |= tAVX512ER;
if (EBX & (1U << 28)) type_ |= tAVX512CD;
if (EBX & (1U << 30)) type_ |= tAVX512BW;
if (EBX & (1U << 31)) type_ |= tAVX512VL;
if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
if (ECX & (1U << 8)) type_ |= tGFNI;
if (ECX & (1U << 9)) type_ |= tVAES;
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
if ((type_ & tAVX512BW) && (EDX & (1U << 23))) type_ |= tAVX512_FP16;
}
}
#endif
}
}
if (maxNum >= 7) {
getCpuidEx(7, 0, data);
if (type_ & tAVX && data[1] & 0x20) type_ |= tAVX2;
if (data[1] & (1U << 3)) type_ |= tBMI1;
if (data[1] & (1U << 8)) type_ |= tBMI2;
if (data[1] & (1U << 9)) type_ |= tENHANCED_REP;
if (data[1] & (1U << 18)) type_ |= tRDSEED;
if (data[1] & (1U << 19)) type_ |= tADX;
if (data[1] & (1U << 20)) type_ |= tSMAP;
#ifdef XBYAK64
if (data[1] & (1U << 4)) type_ |= tHLE;
if (data[1] & (1U << 11)) type_ |= tRTM;
#endif
const uint32_t maxNumSubLeaves = EAX;
if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
if (EBX & (1U << 3)) type_ |= tBMI1;
if (EBX & (1U << 8)) type_ |= tBMI2;
if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
if (EBX & (1U << 18)) type_ |= tRDSEED;
if (EBX & (1U << 19)) type_ |= tADX;
if (EBX & (1U << 20)) type_ |= tSMAP;
if (EBX & (1U << 4)) type_ |= tHLE;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 29)) type_ |= tSHA;
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
}
}
}
setFamily();
setNumCores();
setCacheHierarchy();
}
void putFamily()
void putFamily() const
{
#ifndef XBYAK_ONLY_CLASS_CPU
printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
family, model, stepping, extFamily, extModel);
printf("display:family=%X, model=%X\n", displayFamily, displayModel);
#endif
}
bool has(Type type) const
{
@ -327,16 +518,22 @@ public:
}
};
#ifndef XBYAK_ONLY_CLASS_CPU
class Clock {
public:
static inline uint64 getRdtsc()
static inline uint64_t getRdtsc()
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return __rdtsc();
#else
#else
unsigned int eax, edx;
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
return ((uint64)edx << 32) | eax;
return ((uint64_t)edx << 32) | eax;
#endif
#else
// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
return 0;
#endif
}
Clock()
@ -354,10 +551,10 @@ public:
count_++;
}
int getCount() const { return count_; }
uint64 getClock() const { return clock_; }
uint64_t getClock() const { return clock_; }
void clear() { count_ = 0; clock_ = 0; }
private:
uint64 clock_;
uint64_t clock_;
int count_;
};
@ -366,7 +563,7 @@ const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7;
class Pack {
static const size_t maxTblNum = 10;
static const size_t maxTblNum = 15;
const Xbyak::Reg64 *tbl_[maxTblNum];
size_t n_;
public:
@ -375,9 +572,14 @@ public:
Pack(const Pack& rhs)
: n_(rhs.n_)
{
if (n_ > maxTblNum) throw Error(ERR_INTERNAL);
for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
}
/**
	Copy assignment: takes over rhs's element count and its first n_ register
	pointers. The pointers are copied as-is; the Reg64 objects they refer to
	are not duplicated.
*/
Pack& operator=(const Pack& rhs)
{
	const size_t count = rhs.n_;
	size_t idx = 0;
	while (idx < count) {
		tbl_[idx] = rhs.tbl_[idx];
		++idx;
	}
	n_ = count;
	return *this;
}
Pack(const Xbyak::Reg64& t0)
{ n_ = 1; tbl_[0] = &t0; }
Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
@ -400,9 +602,9 @@ public:
{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
Pack& append(const Xbyak::Reg64& t)
{
if (n_ == 10) {
if (n_ == maxTblNum) {
fprintf(stderr, "ERR Pack::can't append\n");
throw Error(ERR_BAD_PARAMETER);
XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
}
tbl_[n_++] = &t;
return *this;
@ -411,7 +613,7 @@ public:
{
if (n > maxTblNum) {
fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
throw Error(ERR_BAD_PARAMETER);
XBYAK_THROW(ERR_BAD_PARAMETER)
}
n_ = n;
for (size_t i = 0; i < n; i++) {
@ -421,8 +623,8 @@ public:
const Xbyak::Reg64& operator[](size_t n) const
{
if (n >= n_) {
fprintf(stderr, "ERR Pack bad n=%d\n", (int)n);
throw Error(ERR_BAD_PARAMETER);
fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
}
return *tbl_[n];
}
@ -435,7 +637,7 @@ public:
if (num == size_t(-1)) num = n_ - pos;
if (pos + num > n_) {
fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
throw Error(ERR_BAD_PARAMETER);
XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack())
}
Pack pack;
pack.n_ = num;
@ -463,6 +665,7 @@ class StackFrame {
static const int rcxPos = 3;
static const int rdxPos = 2;
#endif
static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
Xbyak::CodeGenerator *code_;
int pNum_;
int tNum_;
@ -472,7 +675,7 @@ class StackFrame {
int P_;
bool makeEpilog_;
Xbyak::Reg64 pTbl_[4];
Xbyak::Reg64 tTbl_[10];
Xbyak::Reg64 tTbl_[maxRegNum];
Pack p_;
Pack t_;
StackFrame(const StackFrame&);
@ -484,7 +687,7 @@ public:
make stack frame
@param sf [in] this
@param pNum [in] num of function parameter(0 <= pNum <= 4)
@param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX)
@param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
@param stackSizeByte [in] local stack size
@param makeEpilog [in] automatically call close() if true
@ -509,29 +712,19 @@ public:
, t(t_)
{
using namespace Xbyak;
if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM)
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM);
if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
const Reg64& _rsp = code->rsp;
const AddressFrame& _ptr = code->ptr;
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum;
P_ = saveNum_ + (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment
for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i]));
}
P_ = (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
P_ *= 8;
if (P_ > 0) code->sub(_rsp, P_);
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
}
for (int i = 4; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#else
for (int i = 0; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#endif
int pos = 0;
for (int i = 0; i < pNum; i++) {
pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
@ -552,36 +745,18 @@ public:
{
using namespace Xbyak;
const Reg64& _rsp = code_->rsp;
const AddressFrame& _ptr = code_->ptr;
const int *tbl = getOrderTbl() + noSaveNum;
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
}
for (int i = 4; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#else
for (int i = 0; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#endif
if (P_ > 0) code_->add(_rsp, P_);
for (int i = 0; i < saveNum_; i++) {
code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
}
if (callRet) code_->ret();
}
~StackFrame()
{
if (!makeEpilog_) return;
try {
close();
} catch (std::exception& e) {
printf("ERR:StackFrame %s\n", e.what());
exit(1);
} catch (...) {
printf("ERR:StackFrame otherwise\n");
exit(1);
}
close();
}
private:
const int *getOrderTbl() const
@ -599,7 +774,7 @@ private:
}
int getRegIdx(int& pos) const
{
assert(pos < 14);
assert(pos < maxRegNum);
using namespace Xbyak;
const int *tbl = getOrderTbl();
int r = tbl[pos++];
@ -616,5 +791,137 @@ private:
};
#endif
} } // end of util
/**
	Reports JIT-generated functions to an external profiler.

	init() selects the backend:
	- Perf  : appends one line per function to /tmp/perf-<pid>.map
	          (only when built with XBYAK_USE_PERF)
	- VTune : notifies the JIT profiling API through iJIT_NotifyEvent()
	          (only when built with XBYAK_USE_VTUNE)
	- None  : every set() call is a no-op
	A backend that is not compiled in, or that fails to start, leaves the
	mode at None.
*/
class Profiler {
	int mode_;
	const char *suffix_;
	const void *startAddr_;
#ifdef XBYAK_USE_PERF
	FILE *fp_;
#endif
public:
	enum {
		None = 0,
		Perf = 1,
		VTune = 2
	};
	Profiler()
		: mode_(None)
		, suffix_("")
		, startAddr_(0)
#ifdef XBYAK_USE_PERF
		, fp_(0)
#endif
	{
	}
	/**
		Set the string appended to every funcName passed to set().
		Only the pointer is stored, so the string must stay valid while it is in use.
	*/
	void setNameSuffix(const char *suffix)
	{
		suffix_ = suffix;
	}
	/**
		Set the start address used by the next set(funcName, endAddr) call.
	*/
	void setStartAddr(const void *startAddr)
	{
		startAddr_ = startAddr;
	}
	/**
		Select the profiling backend.
		@param mode [in] None, Perf or VTune; any other value behaves like None
		@note the mode is reset to None first and only changed when the
		      requested backend is compiled in and could be started
	*/
	void init(int mode)
	{
		mode_ = None;
		if (mode == Perf) {
#ifdef XBYAK_USE_PERF
			// drop a map file left open by an earlier init()
			close();
			char mapPath[128];
			const int pid = getpid();
			snprintf(mapPath, sizeof(mapPath), "/tmp/perf-%d.map", pid);
			fp_ = fopen(mapPath, "a+");
			if (fp_ == 0) {
				fprintf(stderr, "can't open %s\n", mapPath);
				return;
			}
			mode_ = Perf;
#endif
			return;
		}
		if (mode == VTune) {
#ifdef XBYAK_USE_VTUNE
			dlopen("dummy", RTLD_LAZY); // force to load dlopen to enable jit profiling
			if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) {
				fprintf(stderr, "VTune profiling is not active\n");
				return;
			}
			mode_ = VTune;
#endif
			return;
		}
		// None and unknown modes: nothing more to do
	}
	~Profiler()
	{
		close();
	}
	/**
		Close the perf map file if one is open; safe to call repeatedly.
	*/
	void close()
	{
#ifdef XBYAK_USE_PERF
		if (fp_ != 0) {
			fclose(fp_);
			fp_ = 0;
		}
#endif
	}
	/**
		Report one function to the active backend.
		@param funcName [in] function name (the suffix is appended)
		@param startAddr [in] address of the first byte of the function
		@param funcSize [in] size of the function in bytes
	*/
	void set(const char *funcName, const void *startAddr, size_t funcSize) const
	{
		if (mode_ == None) return;
#if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE)
		(void)funcName;
		(void)startAddr;
		(void)funcSize;
#endif
#ifdef XBYAK_USE_PERF
		if (mode_ == Perf) {
			if (fp_ == 0) return;
			fprintf(fp_, "%llx %zx %s%s", (long long)startAddr, funcSize, funcName, suffix_);
			/*
				perf does not recognize the function name which is less than 3,
				so append '_' at the end of the name if necessary
			*/
			for (size_t len = strlen(funcName) + strlen(suffix_); len < 3; len++) {
				fprintf(fp_, "_");
			}
			fprintf(fp_, "\n");
			fflush(fp_);
		}
#endif
#ifdef XBYAK_USE_VTUNE
		if (mode_ != VTune) return;
		char emptyClass[] = "";
		char emptyFile[] = "";
		char fullName[128];
		snprintf(fullName, sizeof(fullName), "%s%s", funcName, suffix_);
		iJIT_Method_Load info = {};
		info.method_id = iJIT_GetNewMethodID();
		info.method_name = fullName;
		info.class_file_name = emptyClass;
		info.source_file_name = emptyFile;
		info.method_load_address = const_cast<void*>(startAddr);
		info.method_size = funcSize;
		info.line_number_size = 0;
		iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&info);
#endif
	}
	/*
		for continuous set
		funcSize = endAddr - <previous set endAddr>
		(the start address is the one given to setStartAddr() or the endAddr of
		the previous call; endAddr becomes the start of the next function)
	*/
	void set(const char *funcName, const void *endAddr)
	{
		const size_t funcSize = (size_t)endAddr - (size_t)startAddr_;
		set(funcName, startAddr_, funcSize);
		startAddr_ = endAddr;
	}
};
#endif // XBYAK_ONLY_CLASS_CPU
} } // end of util
#endif