From 34287b8042563eac236bb1d61ae337576d100f1a Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 26 Jul 2014 23:32:02 -0700 Subject: [PATCH] JIT: some paired singles optimizations --- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 118 +++++++----------- 1 file changed, 45 insertions(+), 73 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index ff0786cca0..f148285468 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -3,19 +3,13 @@ // Refer to the license.txt file included. #include "Common/CommonTypes.h" +#include "Common/CPUDetect.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitRegCache.h" using namespace Gen; -// TODO -// ps_madds0 -// ps_muls0 -// ps_madds1 -// cmppd, andpd, andnpd, or -// lfsx, ps_merge01 etc - static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; @@ -36,9 +30,6 @@ void Jit64::ps_mr(UGeckoInstruction inst) void Jit64::ps_sel(UGeckoInstruction inst) { - // we can't use (V)BLENDVPD here because it just looks at the sign bit - // but we need -0 = +0 - INSTRUCTION_START JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); @@ -49,16 +40,26 @@ void Jit64::ps_sel(UGeckoInstruction inst) int c = inst.FC; fpr.Lock(a, b, c, d); - MOVAPD(XMM0, fpr.R(a)); - PXOR(XMM1, R(XMM1)); - // XMM0 = XMM0 < 0 ? all 1s : all 0s - CMPPD(XMM0, R(XMM1), LT); - MOVAPD(XMM1, R(XMM0)); - PAND(XMM0, fpr.R(b)); - PANDN(XMM1, fpr.R(c)); - POR(XMM0, R(XMM1)); + + if (cpu_info.bSSE4_1) + { + PXOR(XMM0, R(XMM0)); + CMPPD(XMM0, fpr.R(a), LT); // XMM0 = 0 < a ? all 1s : all 0s + MOVAPD(XMM1, fpr.R(b)); + BLENDVPD(XMM1, fpr.R(c)); + } + else + { + MOVAPD(XMM1, fpr.R(a)); + PXOR(XMM0, R(XMM0)); + CMPPD(XMM1, R(XMM0), LT); // XMM1 = a < 0 ? 
all 1s : all 0s + MOVAPD(XMM0, R(XMM1)); + PAND(XMM1, fpr.R(b)); + PANDN(XMM0, fpr.R(c)); + POR(XMM1, R(XMM0)); + } fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); + MOVAPD(fpr.RX(d), R(XMM1)); fpr.UnlockAll(); } @@ -98,20 +99,6 @@ void Jit64::ps_sign(UGeckoInstruction inst) fpr.UnlockAll(); } -//add a, b, c - -//mov a, b -//add a, c -//we need: -/* -psq_l -psq_stu -*/ - -/* -add a,b,a -*/ - //There's still a little bit more optimization that can be squeezed out of this void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { @@ -152,7 +139,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6 MOVAPD(XMM0, fpr.R(b)); fpr.BindToRegister(d, false); MOVAPD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), Gen::R(XMM0)); + (this->*op)(fpr.RX(d), R(XMM0)); } } else @@ -204,32 +191,26 @@ void Jit64::ps_sum(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; fpr.Lock(a,b,c,d); - fpr.BindToRegister(d, d == a || d == b || d == c, true); switch (inst.SUBOP5) { case 10: - // ps_sum0, do the sum in upper subregisters, merge uppers - MOVDDUP(XMM0, fpr.R(a)); - MOVAPD(XMM1, fpr.R(b)); - ADDPD(XMM0, R(XMM1)); - UNPCKHPD(XMM0, fpr.R(c)); //merge - MOVAPD(fpr.R(d), XMM0); + MOVDDUP(XMM0, fpr.R(a)); // {a.ps0, a.ps0} + ADDPD(XMM0, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} + UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1} break; case 11: - // ps_sum1, do the sum in lower subregisters, merge lowers - MOVAPD(XMM0, fpr.R(a)); - MOVAPD(XMM1, fpr.R(b)); - SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower - ADDPD(XMM0, R(XMM1)); // sum lowers - MOVAPD(XMM1, fpr.R(c)); - UNPCKLPD(XMM1, R(XMM0)); // merge - MOVAPD(fpr.R(d), XMM1); + MOVDDUP(XMM1, fpr.R(a)); // {a.ps0, a.ps0} + ADDPD(XMM1, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} + MOVAPD(XMM0, fpr.R(c)); + SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1} break; default: PanicAlert("ps_sum WTF!!!"); } - 
ForceSinglePrecisionP(fpr.RX(d)); - SetFPRFIfNeeded(inst, fpr.RX(d)); + ForceSinglePrecisionP(XMM0); + SetFPRFIfNeeded(inst, XMM0); + fpr.BindToRegister(d, false); + MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } @@ -244,37 +225,28 @@ void Jit64::ps_muls(UGeckoInstruction inst) int a = inst.FA; int c = inst.FC; fpr.Lock(a, c, d); - fpr.BindToRegister(d, d == a || d == c, true); switch (inst.SUBOP5) { case 12: - // Single multiply scalar high - // TODO - faster version for when regs are different - MOVDDUP(XMM1, fpr.R(c)); - Force25BitPrecision(XMM1, XMM0); - MOVAPD(XMM0, fpr.R(a)); - MULPD(XMM0, R(XMM1)); - MOVAPD(fpr.R(d), XMM0); + MOVDDUP(XMM0, fpr.R(c)); break; case 13: - // TODO - faster version for when regs are different - MOVAPD(XMM1, fpr.R(c)); - Force25BitPrecision(XMM1, XMM0); - MOVAPD(XMM0, fpr.R(a)); - SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower - MULPD(XMM0, R(XMM1)); - MOVAPD(fpr.R(d), XMM0); + MOVAPD(XMM0, fpr.R(c)); + SHUFPD(XMM0, R(XMM0), 3); break; default: PanicAlert("ps_muls WTF!!!"); } - ForceSinglePrecisionP(fpr.RX(d)); - SetFPRFIfNeeded(inst, fpr.RX(d)); + Force25BitPrecision(XMM0, XMM1); + MULPD(XMM0, fpr.R(a)); + ForceSinglePrecisionP(XMM0); + SetFPRFIfNeeded(inst, XMM0); + fpr.BindToRegister(d, false); + MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } -//TODO: find easy cases and optimize them, do a breakout like ps_arith void Jit64::ps_mergeXX(UGeckoInstruction inst) { INSTRUCTION_START @@ -305,7 +277,7 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst) _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); } fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), Gen::R(XMM0)); + MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } @@ -373,8 +345,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) return; } fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), Gen::R(XMM0)); - ForceSinglePrecisionP(fpr.RX(d)); - SetFPRFIfNeeded(inst, fpr.RX(d)); + ForceSinglePrecisionP(XMM0); + SetFPRFIfNeeded(inst, XMM0); + MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); }