Merge pull request #8586 from Techjar/d2s-no-pext

Avoid using PDEP and PEXT on AMD Zen
This commit is contained in:
Tilka 2020-01-27 03:15:43 +00:00 committed by GitHub
commit f36c735856
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 12 additions and 4 deletions

View File

@ -42,6 +42,9 @@ struct CPUInfo
bool bAVX2 = false; bool bAVX2 = false;
bool bBMI1 = false; bool bBMI1 = false;
bool bBMI2 = false; bool bBMI2 = false;
// PDEP and PEXT are ridiculously slow on AMD Zen (including Zen 2), so this flag is only set
// when BMI2 is available AND the CPU is not Zen; check it instead of bBMI2 before emitting them
bool bFastBMI2 = false;
bool bFMA = false; bool bFMA = false;
bool bFMA4 = false; bool bFMA4 = false;
bool bAES = false; bool bAES = false;
@ -54,6 +57,7 @@ struct CPUInfo
bool bLAHFSAHF64 = false; bool bLAHFSAHF64 = false;
bool bLongMode = false; bool bLongMode = false;
bool bAtom = false; bool bAtom = false;
bool bZen = false;
// ARMv8 specific // ARMv8 specific
bool bFP = false; bool bFP = false;

View File

@ -118,6 +118,9 @@ void CPUInfo::Detect()
(model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 || (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 ||
model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D)) model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
bAtom = true; bAtom = true;
// Detect AMD Zen: CPU family 23 (17h), all models, which covers Zen, Zen+ and Zen 2
if (family == 23)
bZen = true;
logical_cpu_count = (cpu_id[1] >> 16) & 0xFF; logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
ht = (cpu_id[3] >> 28) & 1; ht = (cpu_id[3] >> 28) & 1;
@ -172,6 +175,7 @@ void CPUInfo::Detect()
} }
bFlushToZero = bSSE; bFlushToZero = bSSE;
bFastBMI2 = bBMI2 && !bZen;
if (max_ex_fn >= 0x80000004) if (max_ex_fn >= 0x80000004)
{ {

View File

@ -68,7 +68,7 @@ void CommonAsmRoutines::GenConvertDoubleToSingle()
// Don't Denormalize // Don't Denormalize
if (cpu_info.bBMI2) if (cpu_info.bFastBMI2)
{ {
// Extract bits 0-1 and 5-34 // Extract bits 0-1 and 5-34
MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000)); MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000));

View File

@ -284,7 +284,7 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
// RRRRRGGG GGGBBBBB // RRRRRGGG GGGBBBBB
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
LoadAndSwap(16, scratch1, data); LoadAndSwap(16, scratch1, data);
if (cpu_info.bBMI1 && cpu_info.bBMI2) if (cpu_info.bBMI1 && cpu_info.bFastBMI2)
{ {
MOV(32, R(scratch2), Imm32(0x07C3F7C0)); MOV(32, R(scratch2), Imm32(0x07C3F7C0));
PDEP(32, scratch3, scratch1, R(scratch2)); PDEP(32, scratch3, scratch1, R(scratch2));
@ -324,7 +324,7 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
// RRRRGGGG BBBBAAAA // RRRRGGGG BBBBAAAA
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
LoadAndSwap(16, scratch1, data); LoadAndSwap(16, scratch1, data);
if (cpu_info.bBMI2) if (cpu_info.bFastBMI2)
{ {
MOV(32, R(scratch2), Imm32(0x0F0F0F0F)); MOV(32, R(scratch2), Imm32(0x0F0F0F0F));
PDEP(32, scratch1, scratch1, R(scratch2)); PDEP(32, scratch1, scratch1, R(scratch2));
@ -353,7 +353,7 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
data.AddMemOffset(-1); // subtract one from address so we can use a 32bit load and bswap data.AddMemOffset(-1); // subtract one from address so we can use a 32bit load and bswap
LoadAndSwap(32, scratch1, data); LoadAndSwap(32, scratch1, data);
if (cpu_info.bBMI2) if (cpu_info.bFastBMI2)
{ {
MOV(32, R(scratch2), Imm32(0xFCFCFCFC)); MOV(32, R(scratch2), Imm32(0xFCFCFCFC));
PDEP(32, scratch1, scratch1, R(scratch2)); PDEP(32, scratch1, scratch1, R(scratch2));