diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 6567fe5543..23c3d2647e 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -515,23 +515,65 @@ void XEmitter::INT3() {Write8(0xCC);}
 void XEmitter::RET() {Write8(0xC3);}
 void XEmitter::RET_FAST() {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret
 
-void XEmitter::NOP(int count)
+// The first sign of decadence: optimized NOPs.
+// Emits exactly `size` bytes of NOP padding, in chunks of at most 11 bytes
+// per instruction. A zero or negative size emits nothing.
+void XEmitter::NOP(int size)
 {
-	// TODO: look up the fastest nop sleds for various sizes
-	int i;
-	switch (count) {
-	case 1:
-		Write8(0x90);
-		break;
-	case 2:
-		Write8(0x66);
-		Write8(0x90);
-		break;
-	default:
-		for (i = 0; i < count; i++) {
+	// Loop only while bytes remain: with an unconditional loop a negative
+	// size would never reach a returning case and would emit NOPs forever.
+	while (size > 0)
+	{
+		switch (size)
+		{
+		case 1:
 			Write8(0x90);
+			return;
+		case 2:
+			Write8(0x66); Write8(0x90);
+			return;
+		case 3:
+			Write8(0x0F); Write8(0x1F); Write8(0x00);
+			return;
+		case 4:
+			Write8(0x0F); Write8(0x1F); Write8(0x40); Write8(0x00);
+			return;
+		case 5:
+			Write8(0x0F); Write8(0x1F); Write8(0x44); Write8(0x00);
+			Write8(0x00);
+			return;
+		case 6:
+			Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x44);
+			Write8(0x00); Write8(0x00);
+			return;
+		case 7:
+			Write8(0x0F); Write8(0x1F); Write8(0x80); Write8(0x00);
+			Write8(0x00); Write8(0x00); Write8(0x00);
+			return;
+		case 8:
+			Write8(0x0F); Write8(0x1F); Write8(0x84); Write8(0x00);
+			Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00);
+			return;
+		case 9:
+			Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x84);
+			Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00);
+			Write8(0x00);
+			return;
+		case 10:
+			Write8(0x66); Write8(0x66); Write8(0x0F); Write8(0x1F);
+			Write8(0x84); Write8(0x00); Write8(0x00); Write8(0x00);
+			Write8(0x00); Write8(0x00);
+			return;
+		default:
+			// Even though x86 instructions are allowed to be up to 15 bytes long,
+			// AMD advises against using NOPs longer than 11 bytes because they
+			// carry a performance penalty on CPUs older than AMD family 16h.
+			Write8(0x66); Write8(0x66); Write8(0x66); Write8(0x0F);
+			Write8(0x1F); Write8(0x84); Write8(0x00); Write8(0x00);
+			Write8(0x00); Write8(0x00); Write8(0x00);
+			size -= 11;
+			continue;
 		}
-		break;
 	}
 }
 
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 025afb7b18..528a4495d8 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -290,7 +290,7 @@ public:
 	void INT3();
 
 	// Do nothing
-	void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
+	void NOP(int count = 1);
 
 	// Save energy in wait-loops on P4 only. Probably not too useful.
 	void PAUSE();