3rdparty: Update xbyak to 7.06

Might fix a crash on older AMD CPUs apparently.
This commit is contained in:
JordanTheToaster 2024-06-07 10:22:18 +01:00 committed by refractionpcsx2
parent 1ec4c248fb
commit 7d530228e3
3 changed files with 200 additions and 136 deletions

View File

@ -155,7 +155,7 @@ namespace Xbyak {
enum { enum {
DEFAULT_MAX_CODE_SIZE = 4096, DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x7051 /* 0xABCD = A.BC(.D) */ VERSION = 0x7060 /* 0xABCD = A.BC(.D) */
}; };
#ifndef MIE_INTEGER_TYPE_DEFINED #ifndef MIE_INTEGER_TYPE_DEFINED

View File

@ -1,4 +1,4 @@
const char *getVersionString() const { return "7.05.1"; } const char *getVersionString() const { return "7.06"; }
void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }

View File

@ -91,7 +91,8 @@ namespace Xbyak { namespace util {
typedef enum { typedef enum {
SmtLevel = 1, SmtLevel = 1,
CoreLevel = 2 CoreLevel = 2
} IntelCpuTopologyLevel; } CpuTopologyLevel;
typedef CpuTopologyLevel IntelCpuTopologyLevel; // for backward compatibility
namespace local { namespace local {
@ -136,13 +137,12 @@ public:
private: private:
Type type_; Type type_;
//system topology //system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2; static const size_t maxTopologyLevels = 2;
uint32_t numCores_[maxTopologyLevels]; uint32_t numCores_[maxTopologyLevels];
static const uint32_t maxNumberCacheLevels = 10; static const uint32_t maxNumberCacheLevels = 10;
uint32_t dataCacheSize_[maxNumberCacheLevels]; uint32_t dataCacheSize_[maxNumberCacheLevels];
uint32_t coresSharignDataCache_[maxNumberCacheLevels]; uint32_t coresSharingDataCache_[maxNumberCacheLevels];
uint32_t dataCacheLevels_; uint32_t dataCacheLevels_;
uint32_t avx10version_; uint32_t avx10version_;
@ -154,123 +154,200 @@ private:
{ {
return (1U << n) - 1; return (1U << n) - 1;
} }
// [EBX:ECX:EDX] == s?
bool isEqualStr(uint32_t EBX, uint32_t ECX, uint32_t EDX, const char s[12]) const
{
return get32bitAsBE(&s[0]) == EBX && get32bitAsBE(&s[4]) == EDX && get32bitAsBE(&s[8]) == ECX;
}
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end) const
{
return (val >> base) & ((1u << (end + 1 - base)) - 1);
}
void setFamily() void setFamily()
{ {
uint32_t data[4] = {}; uint32_t data[4] = {};
getCpuid(1, data); getCpuid(1, data);
stepping = data[0] & mask(4); stepping = extractBit(data[0], 0, 3);
model = (data[0] >> 4) & mask(4); model = extractBit(data[0], 4, 7);
family = (data[0] >> 8) & mask(4); family = extractBit(data[0], 8, 11);
// type = (data[0] >> 12) & mask(2); //type = extractBit(data[0], 12, 13);
extModel = (data[0] >> 16) & mask(4); extModel = extractBit(data[0], 16, 19);
extFamily = (data[0] >> 20) & mask(8); extFamily = extractBit(data[0], 20, 27);
if (family == 0x0f) { if (family == 0x0f) {
displayFamily = family + extFamily; displayFamily = family + extFamily;
} else { } else {
displayFamily = family; displayFamily = family;
} }
if (family == 6 || family == 0x0f) { if ((has(tINTEL) && family == 6) || family == 0x0f) {
displayModel = (extModel << 4) + model; displayModel = (extModel << 4) + model;
} else { } else {
displayModel = model; displayModel = model;
} }
} }
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end)
{
return (val >> base) & ((1u << (end + 1 - base)) - 1);
}
void setNumCores() void setNumCores()
{ {
if (!has(tINTEL) && !has(tAMD)) return; if (!has(tINTEL) && !has(tAMD)) return;
uint32_t data[4] = {}; uint32_t data[4] = {};
getCpuidEx(0x0, 0, data); getCpuid(0x0, data);
if (data[0] >= 0xB) { if (data[0] >= 0xB) {
// Check if "Extended Topology Enumeration" is implemented.
getCpuidEx(0xB, 0, data);
if (data[0] != 0 || data[1] != 0) {
/* /*
if leaf 11 exists(x2APIC is supported), if leaf 11 exists(x2APIC is supported),
we use it to get the number of smt cores and cores on socket we use it to get the number of smt cores and cores on socket
leaf 0xB can be zeroed-out by a hypervisor leaf 0xB can be zeroed-out by a hypervisor
*/ */
x2APIC_supported_ = true;
for (uint32_t i = 0; i < maxTopologyLevels; i++) { for (uint32_t i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data); getCpuidEx(0xB, i, data);
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); CpuTopologyLevel level = (CpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) { if (level == SmtLevel || level == CoreLevel) {
numCores_[level - 1] = extractBit(data[1], 0, 15); numCores_[level - 1] = extractBit(data[1], 0, 15);
} }
} }
/* /*
Fallback values in case a hypervisor has 0xB leaf zeroed-out. Fallback values in case a hypervisor has the leaf zeroed-out.
*/ */
numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]); numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
return;
}
}
// "Extended Topology Enumeration" is not supported.
if (has(tAMD)) {
/*
AMD - Legacy Method
*/
int physicalThreadCount = 0;
getCpuid(0x1, data);
int logicalProcessorCount = extractBit(data[1], 16, 23);
int htt = extractBit(data[3], 28, 28); // Hyper-threading technology.
getCpuid(0x80000000, data);
uint32_t highestExtendedLeaf = data[0];
if (highestExtendedLeaf >= 0x80000008) {
getCpuid(0x80000008, data);
physicalThreadCount = extractBit(data[2], 0, 7) + 1;
}
if (htt == 0) {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = 1;
} else if (physicalThreadCount > 1) {
if ((displayFamily >= 0x17) && (highestExtendedLeaf >= 0x8000001E)) {
// Zen overreports its core count by a factor of two.
getCpuid(0x8000001E, data);
int threadsPerComputeUnit = extractBit(data[1], 8, 15) + 1;
physicalThreadCount /= threadsPerComputeUnit;
}
numCores_[SmtLevel - 1] = logicalProcessorCount / physicalThreadCount;
numCores_[CoreLevel - 1] = logicalProcessorCount;
} else {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = logicalProcessorCount > 1 ? logicalProcessorCount : 2;
}
} else { } else {
/* /*
Failed to deremine num of cores without x2APIC support. Intel - Legacy Method
TODO: USE initial APIC ID to determine ncores.
*/ */
numCores_[SmtLevel - 1] = 0; int physicalThreadCount = 0;
numCores_[CoreLevel - 1] = 0; getCpuid(0x1, data);
int logicalProcessorCount = extractBit(data[1], 16, 23);
int htt = extractBit(data[3], 28, 28); // Hyper-threading technology.
getCpuid(0, data);
if (data[0] >= 0x4) {
getCpuid(0x4, data);
physicalThreadCount = extractBit(data[0], 26, 31) + 1;
}
if (htt == 0) {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = 1;
} else if (physicalThreadCount > 1) {
numCores_[SmtLevel - 1] = logicalProcessorCount / physicalThreadCount;
numCores_[CoreLevel - 1] = logicalProcessorCount;
} else {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = logicalProcessorCount > 0 ? logicalProcessorCount : 1;
}
} }
} }
void setCacheHierarchy() void setCacheHierarchy()
{ {
if (!has(tINTEL) && !has(tAMD)) return;
// https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288
if (has(tAMD)) {
// There are 3 Data Cache Levels (L1, L2, L3)
dataCacheLevels_ = 3;
const uint32_t leaf = 0x8000001D; // for modern AMD CPus
// Sub leaf value ranges from 0 to 3
// Sub leaf value 0 refers to L1 Data Cache
// Sub leaf value 1 refers to L1 Instruction Cache
// Sub leaf value 2 refers to L2 Cache
// Sub leaf value 3 refers to L3 Cache
// For legacy AMD CPU, use leaf 0x80000005 for L1 cache
// and 0x80000006 for L2 and L3 cache
int cache_index = 0;
for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) {
// Skip sub_leaf = 1 as it refers to
// L1 Instruction Cache (not required)
if (sub_leaf == 1) {
continue;
}
uint32_t data[4] = {}; uint32_t data[4] = {};
getCpuidEx(leaf, sub_leaf, data); if (has(tAMD)) {
// Cache Size = Line Size * Partitions * Associativity * Cache Sets getCpuid(0x80000000, data);
dataCacheSize_[cache_index] = if (data[0] >= 0x8000001D) {
(extractBit(data[1], 22, 31) + 1) // Associativity-1 // For modern AMD CPUs.
* (extractBit(data[1], 12, 21) + 1) // Partitions-1 dataCacheLevels_ = 0;
* (extractBit(data[1], 0, 11) + 1) // Line Size for (uint32_t subLeaf = 0; dataCacheLevels_ < maxNumberCacheLevels; subLeaf++) {
* (data[2] + 1); getCpuidEx(0x8000001D, subLeaf, data);
// Calculate the number of cores sharing the current data cache int cacheType = extractBit(data[0], 0, 4);
int smt_width = numCores_[0]; /*
int logical_cores = numCores_[1]; cacheType
int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1; 00h - Null; no more caches
if (logical_cores != 0) { 01h - Data cache
actual_logical_cores = local::min_(actual_logical_cores, logical_cores); 02h - Instrution cache
03h - Unified cache
04h-1Fh - Reserved
*/
if (cacheType == 0) break; // No more caches.
if (cacheType == 0x2) continue; // Skip instruction cache.
int fullyAssociative = extractBit(data[0], 9, 9);
int numSharingCache = extractBit(data[0], 14, 25) + 1;
int cacheNumWays = extractBit(data[1], 22, 31) + 1;
int cachePhysPartitions = extractBit(data[1], 12, 21) + 1;
int cacheLineSize = extractBit(data[1], 0, 11) + 1;
int cacheNumSets = data[2] + 1;
dataCacheSize_[dataCacheLevels_] =
cacheLineSize * cachePhysPartitions * cacheNumWays;
if (fullyAssociative == 0) {
dataCacheSize_[dataCacheLevels_] *= cacheNumSets;
} }
coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1); if (subLeaf > 0) {
++cache_index; numSharingCache = local::min_(numSharingCache, (int)numCores_[1]);
numSharingCache /= local::max_(1u, coresSharingDataCache_[0]);
} }
return; coresSharingDataCache_[dataCacheLevels_] = numSharingCache;
dataCacheLevels_ += 1;
} }
// intel coresSharingDataCache_[0] = local::min_(1u, coresSharingDataCache_[0]);
} else if (data[0] >= 0x80000006) {
// For legacy AMD CPUs, use leaf 0x80000005 for L1 cache
// and 0x80000006 for L2 and L3 cache.
dataCacheLevels_ = 1;
getCpuid(0x80000005, data);
int l1dc_size = extractBit(data[2], 24, 31);
dataCacheSize_[0] = l1dc_size * 1024;
coresSharingDataCache_[0] = 1;
getCpuid(0x80000006, data);
// L2 cache
int l2_assoc = extractBit(data[2], 12, 15);
if (l2_assoc > 0) {
dataCacheLevels_ = 2;
int l2_size = extractBit(data[2], 16, 31);
dataCacheSize_[1] = l2_size * 1024;
coresSharingDataCache_[1] = 1;
}
// L3 cache
int l3_assoc = extractBit(data[3], 12, 15);
if (l3_assoc > 0) {
dataCacheLevels_ = 3;
int l3_size = extractBit(data[3], 18, 31);
dataCacheSize_[2] = l3_size * 512 * 1024;
coresSharingDataCache_[2] = numCores_[1];
}
}
} else if (has(tINTEL)) {
// Use the "Deterministic Cache Parameters" leaf is supported.
const uint32_t NO_CACHE = 0; const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1; const uint32_t DATA_CACHE = 1;
//const uint32_t INSTRUCTION_CACHE = 2; //const uint32_t INSTRUCTION_CACHE = 2;
const uint32_t UNIFIED_CACHE = 3; const uint32_t UNIFIED_CACHE = 3;
uint32_t smt_width = 0; uint32_t smt_width = 0;
uint32_t logical_cores = 0; uint32_t logical_cores = 0;
uint32_t data[4] = {};
if (x2APIC_supported_) {
smt_width = numCores_[0]; smt_width = numCores_[0];
logical_cores = numCores_[1]; logical_cores = numCores_[1];
}
/* /*
Assumptions: Assumptions:
@ -298,11 +375,12 @@ private:
* (data[2] + 1); * (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0); assert(smt_width != 0);
coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u); coresSharingDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++; dataCacheLevels_++;
} }
} }
} }
}
public: public:
int model; int model;
@ -313,8 +391,7 @@ public:
int displayFamily; // family + extFamily int displayFamily; // family + extFamily
int displayModel; // model + extModel int displayModel; // model + extModel
uint32_t getNumCores(IntelCpuTopologyLevel level) const { uint32_t getNumCores(CpuTopologyLevel level) const {
if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
switch (level) { switch (level) {
case SmtLevel: return numCores_[level - 1]; case SmtLevel: return numCores_[level - 1];
case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1]; case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
@ -326,7 +403,7 @@ public:
uint32_t getCoresSharingDataCache(uint32_t i) const uint32_t getCoresSharingDataCache(uint32_t i) const
{ {
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0) if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return coresSharignDataCache_[i]; return coresSharingDataCache_[i];
} }
uint32_t getDataCacheSize(uint32_t i) const uint32_t getDataCacheSize(uint32_t i) const
{ {
@ -337,19 +414,6 @@ public:
/* /*
data[] = { eax, ebx, ecx, edx } data[] = { eax, ebx, ecx, edx }
*/ */
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _WIN32
__cpuid(reinterpret_cast<int*>(data), eaxIn);
#else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)data;
#endif
}
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{ {
#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef XBYAK_INTEL_CPU_SPECIFIC
@ -364,6 +428,10 @@ public:
(void)data; (void)data;
#endif #endif
} }
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
{
getCpuidEx(eaxIn, 0, data);
}
static inline uint64_t getXfeature() static inline uint64_t getXfeature()
{ {
#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef XBYAK_INTEL_CPU_SPECIFIC
@ -485,10 +553,9 @@ public:
Cpu() Cpu()
: type_() : type_()
, x2APIC_supported_(false)
, numCores_() , numCores_()
, dataCacheSize_() , dataCacheSize_()
, coresSharignDataCache_() , coresSharingDataCache_()
, dataCacheLevels_(0) , dataCacheLevels_(0)
, avx10version_(0) , avx10version_(0)
{ {
@ -499,9 +566,7 @@ public:
const uint32_t& EDX = data[3]; const uint32_t& EDX = data[3];
getCpuid(0, data); getCpuid(0, data);
const uint32_t maxNum = EAX; const uint32_t maxNum = EAX;
static const char intel[] = "ntel"; if (isEqualStr(EBX, ECX, EDX, "AuthenticAMD")) {
static const char amd[] = "cAMD";
if (ECX == get32bitAsBE(amd)) {
type_ |= tAMD; type_ |= tAMD;
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (EDX & (1U << 31)) { if (EDX & (1U << 31)) {
@ -514,8 +579,7 @@ public:
// Long mode implies support for PREFETCHW on AMD // Long mode implies support for PREFETCHW on AMD
type_ |= tPREFETCHW; type_ |= tPREFETCHW;
} }
} } else if (isEqualStr(EBX, ECX, EDX, "GenuineIntel")) {
if (ECX == get32bitAsBE(intel)) {
type_ |= tINTEL; type_ |= tINTEL;
} }