From a7d753922db93bdfb1ce2ad295f0f4ebc705d991 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 01/11] Interpreter: fix instruction table flags of frsp frsp overwrites both ps0 and ps1 so frD is not an input. Regardless of whether that's what the hardware does, it's what we do. --- Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index fbb88eadcf..147793f984 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -330,7 +330,7 @@ static GekkoOPTemplate table63[] = {72, Interpreter::fmrx, {"fmrx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {136, Interpreter::fnabsx, {"fnabsx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {40, Interpreter::fnegx, {"fnegx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {12, Interpreter::frspx, {"frspx", OPTYPE_DOUBLEFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {12, Interpreter::frspx, {"frspx", OPTYPE_DOUBLEFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}}, {583, Interpreter::mffsx, {"mffsx", OPTYPE_SYSTEMFP, FL_RC_BIT_F | FL_INOUT_FLOAT_D | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}}, From 9792976ee9626e42a3f0a66923f5612711e0e5fd Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 02/11] Jit64: fix ForceSinglePrecisionS/P This bug never broke anything because of how these functions are used. Fixing it should avoid some false dependencies though. --- Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 8d0f917ac5..20e42f40d3 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -645,8 +645,8 @@ void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input) // Most games don't need these. Zelda requires it though - some platforms get stuck without them. if (jit->jo.accurateSinglePrecision) { - CVTSD2SS(input, R(input)); - CVTSS2SD(output, R(input)); + CVTSD2SS(output, R(input)); + CVTSS2SD(output, R(output)); } else if (output != input) { @@ -659,8 +659,8 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input) // Most games don't need these. Zelda requires it though - some platforms get stuck without them. if (jit->jo.accurateSinglePrecision) { - CVTPD2PS(input, R(input)); - CVTPS2PD(output, R(input)); + CVTPD2PS(output, R(input)); + CVTPS2PD(output, R(output)); } else if (output != input) { From 6b8ab5993affcc03e4b1914d9d520e1f02653a6d Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 03/11] Jit64: make ForceSinglePrecision more versatile --- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 34 +++-------------- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 12 +++--- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 37 +++++++++---------- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 3 +- 4 files changed, 30 insertions(+), 56 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 349a7a269c..bf40cecbec 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -38,17 +38,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible); } if (single) - { - if (packed) - { - ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); - } - else - { - ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d)); - MOVDDUP(fpr.RX(d), fpr.R(d)); - } - } + ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -215,21 +205,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst) fpr.BindToRegister(d, !single); if (single) - { - if (packed) - { - ForceSinglePrecisionP(fpr.RX(d), XMM0); - } - else - { - ForceSinglePrecisionS(fpr.RX(d), XMM0); - MOVDDUP(fpr.RX(d), fpr.R(d)); - } - } + ForceSinglePrecision(fpr.RX(d), R(XMM0), packed, true); else - { MOVSD(fpr.RX(d), R(XMM0)); - } SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -492,11 +470,9 @@ void Jit64::frspx(UGeckoInstruction inst) int d = inst.FD; fpr.Lock(b, d); - fpr.BindToRegister(d, d == b); - if (b != d) - MOVAPD(fpr.RX(d), fpr.R(b)); - ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d)); - MOVDDUP(fpr.RX(d), fpr.R(d)); + OpArg src = fpr.R(b); + fpr.BindToRegister(d, false); + ForceSinglePrecision(fpr.RX(d), src, false, true); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 2c772a3606..ecaa1daa76 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -113,7 +113,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp) { avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible); } - ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); + ForceSinglePrecision(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -173,7 +173,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) PanicAlert("ps_sum WTF!!!"); } fpr.BindToRegister(d, false); - ForceSinglePrecisionP(fpr.RX(d), XMM0); + ForceSinglePrecision(fpr.RX(d), R(XMM0)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -205,7 +205,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) Force25BitPrecision(XMM0, R(XMM0), XMM1); MULPD(XMM0, fpr.R(a)); fpr.BindToRegister(d, false); - ForceSinglePrecisionP(fpr.RX(d), XMM0); + ForceSinglePrecision(fpr.RX(d), R(XMM0)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -264,7 +264,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst) CALL((void *)asm_routines.frsqrte); MOVLHPS(fpr.RX(d), XMM0); - ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); + ForceSinglePrecision(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); gpr.UnlockAllX(); @@ -291,7 +291,7 @@ void Jit64::ps_res(UGeckoInstruction inst) CALL((void *)asm_routines.fres); MOVLHPS(fpr.RX(d), XMM0); - ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); + ForceSinglePrecision(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); gpr.UnlockAllX(); @@ -386,7 +386,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) } fpr.BindToRegister(d, false); - ForceSinglePrecisionP(fpr.RX(d), XMM0); + ForceSinglePrecision(fpr.RX(d), R(XMM0)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 20e42f40d3..b01a9e4262 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -640,31 +640,30 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address MOV(accessSize, MRegSum(RMEM, RSCRATCH2), R(reg)); } -void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input) +void EmuCodeBlock::ForceSinglePrecision(X64Reg output, OpArg input, bool packed, bool duplicate) { // Most games don't need these. Zelda requires it though - some platforms get stuck without them. if (jit->jo.accurateSinglePrecision) { - CVTSD2SS(output, R(input)); - CVTSS2SD(output, R(output)); + if (packed) + { + CVTPD2PS(output, input); + CVTPS2PD(output, R(output)); + } + else + { + CVTSD2SS(output, input); + CVTSS2SD(output, R(output)); + if (duplicate) + MOVDDUP(output, R(output)); + } } - else if (output != input) + else if (!input.IsSimpleReg() || input.GetSimpleReg() != output) { - MOVAPD(output, R(input)); - } -} - -void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input) -{ - // Most games don't need these. Zelda requires it though - some platforms get stuck without them. - if (jit->jo.accurateSinglePrecision) - { - CVTPD2PS(output, R(input)); - CVTPS2PD(output, R(output)); - } - else if (output != input) - { - MOVAPD(output, R(input)); + if (duplicate) + MOVDDUP(output, input); + else + MOVAPD(output, input); } } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index c3175633ba..7e25131eb4 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -121,8 +121,7 @@ public: void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8), Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm); - void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input); - void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input); + void ForceSinglePrecision(Gen::X64Reg output, Gen::OpArg input, bool packed = true, bool duplicate = false); void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp); // RSCRATCH might get trashed From ece0ef4ca8de37c70d48c9f64f2a9878312497de Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 04/11] Jit64: add packed optimization to frsp --- Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index bf40cecbec..8f45a3f983 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -468,11 +468,12 @@ void Jit64::frspx(UGeckoInstruction inst) JITDISABLE(bJITFloatingPointOff); int b = inst.FB; int d = inst.FD; + bool packed = jit->js.op->fprIsDuplicated[b] && !cpu_info.bAtom; fpr.Lock(b, d); OpArg src = fpr.R(b); fpr.BindToRegister(d, false); - ForceSinglePrecision(fpr.RX(d), src, false, true); + ForceSinglePrecision(fpr.RX(d), src, packed, true); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } From 6593ba7ecc98b9590d02025a3251b3892d9d318d Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 05/11] XEmitter: add BLENDPS/BLENDPD --- Source/Core/Common/x64Emitter.cpp | 2 ++ Source/Core/Common/x64Emitter.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index 8a89e4f6c8..da5627eee1 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -1775,6 +1775,8 @@ void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} +void XEmitter::BLENDPS(X64Reg dest, OpArg arg, u8 blend) {WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend);} +void XEmitter::BLENDPD(X64Reg dest, OpArg arg, u8 blend) {WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend);} void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);} void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);} diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index dc488b1c3a..83be57062b 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -789,10 +789,12 @@ public: void PMOVZXWQ(X64Reg dest, OpArg arg); void PMOVZXDQ(X64Reg dest, OpArg arg); - // SSE4: variable blend instructions (xmm0 implicit argument) + // SSE4: blend instructions void PBLENDVB(X64Reg dest, OpArg arg); void BLENDVPS(X64Reg dest, OpArg arg); void BLENDVPD(X64Reg dest, OpArg arg); + void BLENDPS(X64Reg dest, OpArg arg, u8 blend); + void BLENDPD(X64Reg dest, OpArg arg, u8 blend); // AVX void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); From 05a55de08fc237592058f8f2b7740f6ae1e3b8b8 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 06/11] Jit64: optimize ps_sum --- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index ecaa1daa76..d440d16e90 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -156,24 +156,40 @@ void Jit64::ps_sum(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; fpr.Lock(a, b, c, d); + OpArg op_a = fpr.R(a); + fpr.BindToRegister(d, false); + X64Reg tmp = d == b || d == c ? XMM0 : fpr.RX(d); + MOVDDUP(tmp, op_a); // {a.ps0, a.ps0} + ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} switch (inst.SUBOP5) { - case 10: - MOVDDUP(XMM0, fpr.R(a)); // {a.ps0, a.ps0} - ADDPD(XMM0, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} - UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1} + case 10: // ps_sum0 + UNPCKHPD(tmp, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1} break; - case 11: - MOVDDUP(XMM1, fpr.R(a)); // {a.ps0, a.ps0} - ADDPD(XMM1, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} - MOVAPD(XMM0, fpr.R(c)); - SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1} + case 11: // ps_sum1 + // {c.ps0, a.ps0 + b.ps1} + if (fpr.R(c).IsSimpleReg()) + { + if (cpu_info.bSSE4_1) + { + BLENDPD(tmp, fpr.R(c), 1); + } + else + { + MOVAPD(XMM1, fpr.R(c)); + SHUFPD(XMM1, R(tmp), 2); + tmp = XMM1; + } + } + else + { + MOVLPD(tmp, fpr.R(c)); + } break; default: PanicAlert("ps_sum WTF!!!"); } - fpr.BindToRegister(d, false); - ForceSinglePrecision(fpr.RX(d), R(XMM0)); + ForceSinglePrecision(fpr.RX(d), R(tmp)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -192,10 +208,10 @@ void Jit64::ps_muls(UGeckoInstruction inst) fpr.Lock(a, c, d); switch (inst.SUBOP5) { - case 12: + case 12: // ps_muls0 MOVDDUP(XMM0, fpr.R(c)); break; - case 13: + case 13: // ps_muls1 avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); break; default: From dc220fa13dff7caaf02061e391f47d9ee0b6a5ed Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:36 +0200 Subject: [PATCH 07/11] Jit64: merge ps_sel into fselx --- Source/Core/Core/PowerPC/Jit64/Jit.h | 1 - .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 2 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 17 +++++++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 34 ------------------- 4 files changed, 14 insertions(+), 40 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 3fb54dce73..bbaf971191 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -193,7 +193,6 @@ public: void reg_imm(UGeckoInstruction inst); - void ps_sel(UGeckoInstruction inst); void ps_mr(UGeckoInstruction inst); void ps_sign(UGeckoInstruction inst); //aggregate void ps_arith(UGeckoInstruction inst); //aggregate diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 5b9ac6ae23..0d8cd10cf3 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -127,7 +127,7 @@ static GekkoOPTemplate table4_2[] = {18, &Jit64::ps_arith}, // ps_div {20, &Jit64::ps_arith}, // ps_sub {21, &Jit64::ps_arith}, // ps_add - {23, &Jit64::ps_sel}, // ps_sel + {23, &Jit64::fselx}, // ps_sel {24, &Jit64::ps_res}, // ps_res {25, &Jit64::ps_arith}, // ps_mul {26, &Jit64::ps_rsqrte}, // ps_rsqrte diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 8f45a3f983..3a97f5b439 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -256,13 +256,18 @@ void Jit64::fselx(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; + bool packed = inst.OPCD == 4; // ps_sel + fpr.Lock(a, b, c, d); - MOVAPD(XMM1, fpr.R(a)); PXOR(XMM0, R(XMM0)); // This condition is very tricky; there's only one right way to handle both the case of // negative/positive zero and NaN properly. // (a >= -0.0 ? c : b) transforms into (0 > a ? b : c), hence the NLE. - CMPSD(XMM0, R(XMM1), NLE); + if (packed) + CMPPD(XMM0, fpr.R(a), NLE); + else + CMPSD(XMM0, fpr.R(a), NLE); + if (cpu_info.bSSE4_1) { MOVAPD(XMM1, fpr.R(c)); @@ -275,8 +280,12 @@ void Jit64::fselx(UGeckoInstruction inst) PANDN(XMM1, fpr.R(c)); POR(XMM1, R(XMM0)); } - fpr.BindToRegister(d); - MOVSD(fpr.RX(d), R(XMM1)); + + fpr.BindToRegister(d, !packed); + if (packed) + MOVAPD(fpr.RX(d), R(XMM1)); + else + MOVSD(fpr.RX(d), R(XMM1)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index d440d16e90..bbf6e0e8e9 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -28,40 +28,6 @@ void Jit64::ps_mr(UGeckoInstruction inst) MOVAPD(fpr.RX(d), fpr.R(b)); } -void Jit64::ps_sel(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - - int d = inst.FD; - int a = inst.FA; - int b = inst.FB; - int c = inst.FC; - - fpr.Lock(a, b, c, d); - - if (cpu_info.bSSE4_1) - { - PXOR(XMM0, R(XMM0)); - CMPPD(XMM0, fpr.R(a), NLE); - MOVAPD(XMM1, fpr.R(c)); - BLENDVPD(XMM1, fpr.R(b)); - } - else - { - PXOR(XMM1, R(XMM1)); - CMPPD(XMM1, fpr.R(a), NLE); - MOVAPD(XMM0, R(XMM1)); - PAND(XMM1, fpr.R(b)); - PANDN(XMM0, fpr.R(c)); - POR(XMM1, R(XMM0)); - } - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM1)); - fpr.UnlockAll(); -} - void Jit64::ps_sign(UGeckoInstruction inst) { INSTRUCTION_START From 6d23b511a626f2ccd01cbfebe98db26a50c2636c Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:37 +0200 Subject: [PATCH 08/11] Jit64: merge tri_op into fp_tri_op --- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 +-- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 10 +++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 36 +++---------------- 3 files changed, 10 insertions(+), 40 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index bbaf971191..b468e8aaad 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -137,13 +137,11 @@ public: void MultiplyImmediate(u32 imm, int a, int d, bool overflow); - void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), - void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), - void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false); + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), bool packed = false, bool roundRHS = false); void FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 3a97f5b439..0201ed5936 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -16,7 +16,7 @@ static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFF static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), - void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS) + void (XEmitter::*sseOp)(X64Reg, OpArg), bool packed, bool roundRHS) { fpr.Lock(d, a, b); fpr.BindToRegister(d, d == a || d == b || !single); @@ -80,13 +80,13 @@ void Jit64::fp_arith(UGeckoInstruction inst) switch (inst.SUBOP5) { case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, - packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break; + packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break; case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, - packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break; + packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break; case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, - packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break; + packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break; case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, - packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break; + packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break; default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index bbf6e0e8e9..2b66c7bef1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -56,34 +56,6 @@ void Jit64::ps_sign(UGeckoInstruction inst) fpr.UnlockAll(); } -//There's still a little bit more optimization that can be squeezed out of this -void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) -{ - fpr.Lock(d, a, b); - fpr.BindToRegister(d, d == a || d == b); - - if (roundRHS) - { - if (d == a) - { - Force25BitPrecision(XMM0, fpr.R(b), XMM1); - (this->*sseOp)(fpr.RX(d), R(XMM0)); - } - else - { - Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0); - (this->*sseOp)(fpr.RX(d), fpr.R(a)); - } - } - else - { - avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible); - } - ForceSinglePrecision(fpr.RX(d), fpr.R(d)); - SetFPRFIfNeeded(fpr.RX(d)); - fpr.UnlockAll(); -} - void Jit64::ps_arith(UGeckoInstruction inst) { INSTRUCTION_START @@ -94,16 +66,16 @@ void Jit64::ps_arith(UGeckoInstruction inst) switch (inst.SUBOP5) { case 18: // div - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst); + fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VDIVPD, &XEmitter::DIVPD, true); break; case 20: // sub - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst); + fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VSUBPD, &XEmitter::SUBPD, true); break; case 21: // add - tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst); + fp_tri_op(inst.FD, inst.FA, inst.FB, true, true, &XEmitter::VADDPD, &XEmitter::ADDPD, true); break; case 25: // mul - tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input); + fp_tri_op(inst.FD, inst.FA, inst.FC, true, true, &XEmitter::VMULPD, &XEmitter::MULPD, true, round_input); break; default: _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); From c6147c5ed51bf02c0dab74d67c6d2f440f2b2292 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:37 +0200 Subject: [PATCH 09/11] Jit64: merge ps_arith into fp_arith --- Source/Core/Core/PowerPC/Jit64/Jit.h | 1 - .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 8 +++--- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 11 +++++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 27 ------------------- 4 files changed, 11 insertions(+), 36 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index b468e8aaad..0a367f8fa4 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -193,7 +193,6 @@ public: void ps_mr(UGeckoInstruction inst); void ps_sign(UGeckoInstruction inst); //aggregate - void ps_arith(UGeckoInstruction inst); //aggregate void ps_mergeXX(UGeckoInstruction inst); void ps_maddXX(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 0d8cd10cf3..8b9dc5d3c2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -124,12 +124,12 @@ static GekkoOPTemplate table4_2[] = {13, &Jit64::ps_muls}, // ps_muls1 {14, &Jit64::ps_maddXX}, // ps_madds0 {15, &Jit64::ps_maddXX}, // ps_madds1 - {18, &Jit64::ps_arith}, // ps_div - {20, &Jit64::ps_arith}, // ps_sub - {21, &Jit64::ps_arith}, // ps_add + {18, &Jit64::fp_arith}, // ps_div + {20, &Jit64::fp_arith}, // ps_sub + {21, &Jit64::fp_arith}, // ps_add {23, &Jit64::fselx}, // ps_sel {24, &Jit64::ps_res}, // ps_res - {25, &Jit64::ps_arith}, // ps_mul + {25, &Jit64::fp_arith}, // ps_mul {26, &Jit64::ps_rsqrte}, // ps_rsqrte {28, &Jit64::ps_maddXX}, // ps_msub {29, &Jit64::ps_maddXX}, // ps_madd diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 0201ed5936..5313a76da0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -67,16 +67,19 @@ void Jit64::fp_arith(UGeckoInstruction inst) int d = inst.FD; int arg2 = inst.SUBOP5 == 25 ? c : b; - bool single = inst.OPCD == 59; - bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + bool single = inst.OPCD == 4 || inst.OPCD == 59; // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by // using packed arithmetic instead. - bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2]; + bool packed = inst.OPCD == 4 || (inst.OPCD == 59 && + jit->js.op->fprIsDuplicated[a] && + jit->js.op->fprIsDuplicated[arg2]); // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case. // Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones. - if (inst.SUBOP5 == 18 || cpu_info.bAtom) + if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom)) packed = false; + bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + switch (inst.SUBOP5) { case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 2b66c7bef1..8154151be5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -56,33 +56,6 @@ void Jit64::ps_sign(UGeckoInstruction inst) fpr.UnlockAll(); } -void Jit64::ps_arith(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - - bool round_input = !jit->js.op->fprIsSingle[inst.FC]; - switch (inst.SUBOP5) - { - case 18: // div - fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VDIVPD, &XEmitter::DIVPD, true); - break; - case 20: // sub - fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VSUBPD, &XEmitter::SUBPD, true); - break; - case 21: // add - fp_tri_op(inst.FD, inst.FA, inst.FB, true, true, &XEmitter::VADDPD, &XEmitter::ADDPD, true); - break; - case 25: // mul - fp_tri_op(inst.FD, inst.FA, inst.FC, true, true, &XEmitter::VMULPD, &XEmitter::MULPD, true, round_input); - break; - default: - _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); - break; - } -} - void Jit64::ps_sum(UGeckoInstruction inst) { INSTRUCTION_START From 36d6a165590ceed0ada72e956ea2de5c87145979 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:37 +0200 Subject: [PATCH 10/11] Jit64: merge ps_maddXX into fmaddXX --- Source/Core/Core/PowerPC/Jit64/Jit.h | 1 - .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 12 +-- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 51 ++++++---- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 94 ------------------- 4 files changed, 39 insertions(+), 119 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 0a367f8fa4..f068030b61 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -194,7 +194,6 @@ public: void ps_mr(UGeckoInstruction inst); void ps_sign(UGeckoInstruction inst); //aggregate void ps_mergeXX(UGeckoInstruction inst); - void ps_maddXX(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst); void ps_rsqrte(UGeckoInstruction inst); void ps_sum(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 8b9dc5d3c2..8dd66132ca 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -122,8 +122,8 @@ static GekkoOPTemplate table4_2[] = {11, &Jit64::ps_sum}, // ps_sum1 {12, &Jit64::ps_muls}, // ps_muls0 {13, &Jit64::ps_muls}, // ps_muls1 - {14, &Jit64::ps_maddXX}, // ps_madds0 - {15, &Jit64::ps_maddXX}, // ps_madds1 + {14, &Jit64::fmaddXX}, // ps_madds0 + {15, &Jit64::fmaddXX}, // ps_madds1 {18, &Jit64::fp_arith}, // ps_div {20, &Jit64::fp_arith}, // ps_sub {21, &Jit64::fp_arith}, // ps_add @@ -131,10 +131,10 @@ static GekkoOPTemplate table4_2[] = {24, &Jit64::ps_res}, // ps_res {25, &Jit64::fp_arith}, // ps_mul {26, &Jit64::ps_rsqrte}, // ps_rsqrte - {28, &Jit64::ps_maddXX}, // ps_msub - {29, &Jit64::ps_maddXX}, // ps_madd - {30, &Jit64::ps_maddXX}, // ps_nmsub - {31, &Jit64::ps_maddXX}, // ps_nmadd + {28, &Jit64::fmaddXX}, // ps_msub + {29, &Jit64::fmaddXX}, // ps_madd + {30, &Jit64::fmaddXX}, // ps_nmsub + {31, &Jit64::fmaddXX}, // ps_nmadd }; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 5313a76da0..a848c72e6e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -105,14 +105,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; int d = inst.FD; - bool single = inst.OPCD == 59; + bool single = inst.OPCD == 4 || inst.OPCD == 59; bool round_input = single && !jit->js.op->fprIsSingle[c]; - bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c]; - if (cpu_info.bAtom) - packed = false; + bool packed = inst.OPCD == 4 || + (!cpu_info.bAtom && single && + jit->js.op->fprIsDuplicated[a] && + jit->js.op->fprIsDuplicated[b] && + jit->js.op->fprIsDuplicated[c]); fpr.Lock(a, b, c, d); + switch(inst.SUBOP5) + { + case 14: + MOVDDUP(XMM0, fpr.R(c)); + if (round_input) + Force25BitPrecision(XMM0, R(XMM0), XMM1); + break; + case 15: + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); + if (round_input) + Force25BitPrecision(XMM0, R(XMM0), XMM1); + break; + default: + bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism); + X64Reg tmp1 = special ? XMM1 : XMM0; + X64Reg tmp2 = special ? XMM0 : XMM1; + if (single && round_input) + Force25BitPrecision(tmp1, fpr.R(c), tmp2); + else + MOVAPD(tmp1, fpr.R(c)); + break; + } + // While we don't know if any games are actually affected (replays seem to work with all the usual // suspects for desyncing), netplay and other applications need absolute perfect determinism, so // be extra careful and don't use FMA, even if in theory it might be okay. @@ -121,10 +146,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // instances on different computers giving identical results. if (cpu_info.bFMA && !Core::g_want_determinism) { - if (single && round_input) - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - else - MOVAPD(XMM0, fpr.R(c)); // Statistics suggests b is a lot less likely to be unbound in practice, so // if we have to pick one of a or b to bind, let's make it b. fpr.BindToRegister(b, true, false); @@ -136,6 +157,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst) else VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); break; + case 14: //madds0 + case 15: //madds1 case 29: //madd if (packed) VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); @@ -162,11 +185,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } else if (inst.SUBOP5 == 30) //nmsub { - // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately - if (single && round_input) - Force25BitPrecision(XMM1, fpr.R(c), XMM0); - else - MOVAPD(XMM1, fpr.R(c)); + // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately. MOVAPD(XMM0, fpr.R(b)); if (packed) { @@ -181,16 +200,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } else { - if (single && round_input) - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - else - MOVAPD(XMM0, fpr.R(c)); if (packed) { MULPD(XMM0, fpr.R(a)); if (inst.SUBOP5 == 28) //msub SUBPD(XMM0, fpr.R(b)); - else //(n)madd + else //(n)madd(s[01]) ADDPD(XMM0, fpr.R(b)); } else diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 8154151be5..9a4a6186bf 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -224,100 +224,6 @@ void Jit64::ps_res(UGeckoInstruction inst) gpr.UnlockAllX(); } -//TODO: add optimized cases -void Jit64::ps_maddXX(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - - int a = inst.FA; - int b = inst.FB; - int c = inst.FC; - int d = inst.FD; - bool fma = cpu_info.bFMA && !Core::g_want_determinism; - bool round_input = !jit->js.op->fprIsSingle[c]; - fpr.Lock(a, b, c, d); - - if (fma) - fpr.BindToRegister(b, true, false); - - if (inst.SUBOP5 == 14) - { - MOVDDUP(XMM0, fpr.R(c)); - if (round_input) - Force25BitPrecision(XMM0, R(XMM0), XMM1); - } - else if (inst.SUBOP5 == 15) - { - avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); - if (round_input) - Force25BitPrecision(XMM0, R(XMM0), XMM1); - } - else - { - if (round_input) - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - else - MOVAPD(XMM0, fpr.R(c)); - } - - if (fma) - { - switch (inst.SUBOP5) - { - case 14: //madds0 - case 15: //madds1 - case 29: //madd - VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - case 28: //msub - VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - case 30: //nmsub - VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - case 31: //nmadd - VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - } - } - else - { - switch (inst.SUBOP5) - { - case 14: //madds0 - case 15: //madds1 - case 29: //madd - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 28: //msub - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - break; - case 30: //nmsub - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - PXOR(XMM0, M(psSignBits)); - break; - case 31: //nmadd - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - PXOR(XMM0, M(psSignBits)); - break; - default: - _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); - return; - } - } - - fpr.BindToRegister(d, false); - ForceSinglePrecision(fpr.RX(d), R(XMM0)); - SetFPRFIfNeeded(fpr.RX(d)); - fpr.UnlockAll(); -} - void Jit64::ps_cmpXX(UGeckoInstruction inst) { INSTRUCTION_START From df34d43936411582ac1a1553cd1cc6a32bf03dd7 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:37 +0200 Subject: [PATCH 11/11] Jit64: merge ps_sign into fsign --- Source/Core/Core/PowerPC/Jit64/Jit.h | 1 - .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 6 ++-- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 24 +++++++------- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 31 ------------------- 4 files changed, 15 insertions(+), 47 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index f068030b61..3c7e80bcec 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -192,7 +192,6 @@ public: void reg_imm(UGeckoInstruction inst); void ps_mr(UGeckoInstruction inst); - void ps_sign(UGeckoInstruction inst); //aggregate void ps_mergeXX(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst); void ps_rsqrte(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 8dd66132ca..5da1deb704 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -102,9 +102,9 @@ static GekkoOPTemplate table4[] = { //SUBOP10 {0, &Jit64::ps_cmpXX}, // ps_cmpu0 {32, &Jit64::ps_cmpXX}, // ps_cmpo0 - {40, &Jit64::ps_sign}, // ps_neg - {136, &Jit64::ps_sign}, // ps_nabs - {264, &Jit64::ps_sign}, // ps_abs + {40, &Jit64::fsign}, // ps_neg + {136, &Jit64::fsign}, // ps_nabs + {264, &Jit64::fsign}, // ps_abs {64, &Jit64::ps_cmpXX}, // ps_cmpu1 {72, &Jit64::ps_mr}, // ps_mr {96, &Jit64::ps_cmpXX}, // ps_cmpo1 diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index a848c72e6e..954b2410c1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -13,6 +13,7 @@ using namespace Gen; static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL}; static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; +static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), @@ -238,23 +239,22 @@ void Jit64::fsign(UGeckoInstruction inst) int d = inst.FD; int b = inst.FB; - fpr.Lock(b, d); - fpr.BindToRegister(d); + bool packed = inst.OPCD == 4; + + fpr.Lock(b, d); + OpArg src = fpr.R(b); + fpr.BindToRegister(d, false); - if (d != b) - MOVSD(fpr.RX(d), fpr.R(b)); switch (inst.SUBOP10) { - case 40: // fnegx - // We can cheat and not worry about clobbering the top half by using masks - // that don't modify the top half. - PXOR(fpr.RX(d), M(psSignBits)); + case 40: // neg + avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), src, M(packed ? psSignBits2 : psSignBits), packed); break; - case 264: // fabsx - PAND(fpr.RX(d), M(psAbsMask)); + case 136: // nabs + avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), src, M(packed ? psSignBits2 : psSignBits), packed); break; - case 136: // fnabs - POR(fpr.RX(d), M(psSignBits)); + case 264: // abs + avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), src, M(packed ? psAbsMask2 : psAbsMask), packed); break; default: PanicAlert("fsign bleh"); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 9a4a6186bf..c60e5c8d58 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -10,9 +10,6 @@ using namespace Gen; -static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; -static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; - void Jit64::ps_mr(UGeckoInstruction inst) { INSTRUCTION_START @@ -28,34 +25,6 @@ void Jit64::ps_mr(UGeckoInstruction inst) MOVAPD(fpr.RX(d), fpr.R(b)); } -void Jit64::ps_sign(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - - int d = inst.FD; - int b = inst.FB; - - fpr.Lock(d, b); - fpr.BindToRegister(d, d == b); - - switch (inst.SUBOP10) - { - case 40: //neg - avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M(psSignBits)); - break; - case 136: //nabs - avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M(psSignBits)); - break; - case 264: //abs - avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M(psAbsMask)); - break; - } - - fpr.UnlockAll(); -} - void Jit64::ps_sum(UGeckoInstruction inst) { INSTRUCTION_START