diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index cb3ef5b0a9..4dfbe56eb8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -140,10 +140,13 @@ public: void MultiplyImmediate(u32 imm, int a, int d, bool overflow); - void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); - void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); - void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + bool Rc = false, bool carry = false); + void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); void FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index fe321d4e63..e7086b59b9 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -14,65 +14,27 @@ static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000 static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; -void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS) +void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), + void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { fpr.Lock(d, a, b); + fpr.BindToRegister(d, d == a || d == b || !single); if (roundRHS) { if (d == a) { - fpr.BindToRegister(d, true); - MOVSD(XMM0, fpr.R(b)); - Force25BitPrecision(XMM0, XMM1); - (this->*op)(fpr.RX(d), R(XMM0)); + Force25BitPrecision(XMM0, fpr.R(b), XMM1); + (this->*sseOp)(fpr.RX(d), R(XMM0)); } else { - fpr.BindToRegister(d, d == b); - if (d != b) - MOVSD(fpr.RX(d), fpr.R(b)); - Force25BitPrecision(fpr.RX(d), XMM0); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - } - else if (d == a) - { - fpr.BindToRegister(d, true); - if (!single) - { - fpr.BindToRegister(b, true, false); - } - (this->*op)(fpr.RX(d), fpr.R(b)); - } - else if (d == b) - { - if (reversible) - { - fpr.BindToRegister(d, true); - if (!single) - { - fpr.BindToRegister(a, true, false); - } - (this->*op)(fpr.RX(d), fpr.R(a)); - } - else - { - MOVSD(XMM0, fpr.R(b)); - fpr.BindToRegister(d, !single); - MOVSD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), Gen::R(XMM0)); + Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0); + (this->*sseOp)(fpr.RX(d), fpr.R(a)); } } else { - // Sources different from d, can use rather quick solution - fpr.BindToRegister(d, !single); - if (!single) - { - fpr.BindToRegister(b, true, false); - } - MOVSD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), fpr.R(b)); + avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible); } if (single) { @@ -104,10 +66,10 @@ void Jit64::fp_arith(UGeckoInstruction inst) bool single = inst.OPCD == 59; switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul + case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div + case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub + case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add + case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } @@ -131,18 +93,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately if (inst.SUBOP5 == 30) //nmsub { - MOVSD(XMM1, fpr.R(c)); if (single_precision) - Force25BitPrecision(XMM1, XMM0); + Force25BitPrecision(XMM1, fpr.R(c), XMM0); + else + MOVSD(XMM1, fpr.R(c)); MULSD(XMM1, fpr.R(a)); MOVSD(XMM0, fpr.R(b)); SUBSD(XMM0, R(XMM1)); } else { - MOVSD(XMM0, fpr.R(c)); if (single_precision) - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + else + MOVSD(XMM0, fpr.R(c)); MULSD(XMM0, fpr.R(a)); if (inst.SUBOP5 == 28) //msub SUBSD(XMM0, fpr.R(b)); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 7df0302e64..cd069bb9fc 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -43,17 +43,15 @@ void Jit64::ps_sel(UGeckoInstruction inst) if (cpu_info.bSSE4_1) { - MOVAPD(XMM1, fpr.R(a)); PXOR(XMM0, R(XMM0)); - CMPPD(XMM0, R(XMM1), NLE); + CMPPD(XMM0, fpr.R(a), NLE); MOVAPD(XMM1, fpr.R(c)); BLENDVPD(XMM1, fpr.R(b)); } else { - MOVAPD(XMM0, fpr.R(a)); PXOR(XMM1, R(XMM1)); - CMPPD(XMM1, R(XMM0), NLE); + CMPPD(XMM1, fpr.R(a), NLE); MOVAPD(XMM0, R(XMM1)); PAND(XMM1, fpr.R(b)); PANDN(XMM0, fpr.R(c)); @@ -74,26 +72,18 @@ void Jit64::ps_sign(UGeckoInstruction inst) int b = inst.FB; fpr.Lock(d, b); - if (d != b) - { - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), fpr.R(b)); - } - else - { - fpr.BindToRegister(d, true); - } + fpr.BindToRegister(d, d == b); switch (inst.SUBOP10) { case 40: //neg - PXOR(fpr.RX(d), M((void*)&psSignBits)); + avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits)); break; case 136: //nabs - POR(fpr.RX(d), M((void*)&psSignBits)); + avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits)); break; case 264: //abs - PAND(fpr.RX(d), M((void*)&psAbsMask)); + avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M((void*)&psAbsMask)); break; } @@ -101,56 +91,29 @@ void Jit64::ps_sign(UGeckoInstruction inst) } //There's still a little bit more optimization that can be squeezed out of this -void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) +void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { fpr.Lock(d, a, b); + fpr.BindToRegister(d, d == a || d == b); if (roundRHS) { if (d == a) { - fpr.BindToRegister(d, true); - MOVAPD(XMM0, fpr.R(b)); - Force25BitPrecision(XMM0, XMM1); - (this->*op)(fpr.RX(d), R(XMM0)); + Force25BitPrecision(XMM0, fpr.R(b), XMM1); + (this->*sseOp)(fpr.RX(d), R(XMM0)); } else { - fpr.BindToRegister(d, d == b); - if (d != b) - MOVAPD(fpr.RX(d), fpr.R(b)); - Force25BitPrecision(fpr.RX(d), XMM0); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - } - else if (d == a) - { - fpr.BindToRegister(d, true); - (this->*op)(fpr.RX(d), fpr.R(b)); - } - else if (d == b) - { - if (reversible) - { - fpr.BindToRegister(d, true); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - else - { - MOVAPD(XMM0, fpr.R(b)); - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), R(XMM0)); + Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0); + (this->*sseOp)(fpr.RX(d), fpr.R(a)); } } else { - //sources different from d, can use rather quick solution - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), fpr.R(b)); + avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible); } - ForceSinglePrecisionP(fpr.RX(d)); + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -164,16 +127,16 @@ void Jit64::ps_arith(UGeckoInstruction inst) switch (inst.SUBOP5) { case 18: // div - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst); + tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst); break; case 20: // sub - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst); + tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst); break; case 21: // add - tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst); + tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst); break; case 25: // mul - tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true); + tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true); break; default: _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); @@ -208,10 +171,9 @@ void Jit64::ps_sum(UGeckoInstruction inst) default: PanicAlert("ps_sum WTF!!!"); } - ForceSinglePrecisionP(XMM0); - SetFPRFIfNeeded(inst, XMM0); fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d), XMM0); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -232,18 +194,16 @@ void Jit64::ps_muls(UGeckoInstruction inst) MOVDDUP(XMM0, fpr.R(c)); break; case 13: - MOVAPD(XMM0, fpr.R(c)); - SHUFPD(XMM0, R(XMM0), 3); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); break; default: PanicAlert("ps_muls WTF!!!"); } - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, R(XMM0), XMM1); MULPD(XMM0, fpr.R(a)); - ForceSinglePrecisionP(XMM0); - SetFPRFIfNeeded(inst, XMM0); fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d), XMM0); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -258,27 +218,25 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst) int a = inst.FA; int b = inst.FB; fpr.Lock(a,b,d); + fpr.BindToRegister(d, d == a || d == b); - MOVAPD(XMM0, fpr.R(a)); switch (inst.SUBOP10) { case 528: - UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf + avx_op(&XEmitter::VUNPCKLPD, &XEmitter::UNPCKLPD, fpr.RX(d), fpr.R(a), fpr.R(b)); break; //00 case 560: - SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 2); break; //01 case 592: - SHUFPD(XMM0, fpr.R(b), 1); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 1); break; //10 case 624: - UNPCKHPD(XMM0, fpr.R(b)); + avx_op(&XEmitter::VUNPCKHPD, &XEmitter::UNPCKHPD, fpr.RX(d), fpr.R(a), fpr.R(b)); break; //11 default: _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); } - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } @@ -303,7 +261,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst) CALL((void *)asm_routines.frsqrte); MOVLHPS(fpr.RX(d), XMM0); - ForceSinglePrecisionP(fpr.RX(d)); + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); gpr.UnlockAllX(); @@ -330,7 +288,7 @@ void Jit64::ps_res(UGeckoInstruction inst) CALL((void *)asm_routines.fres); MOVLHPS(fpr.RX(d), XMM0); - ForceSinglePrecisionP(fpr.RX(d)); + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); gpr.UnlockAllX(); @@ -352,42 +310,35 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) switch (inst.SUBOP5) { case 14: //madds0 - MOVDDUP(XMM1, fpr.R(c)); - Force25BitPrecision(XMM1, XMM0); - MOVAPD(XMM0, fpr.R(a)); - MULPD(XMM0, R(XMM1)); + MOVDDUP(XMM0, fpr.R(c)); + Force25BitPrecision(XMM0, R(XMM0), XMM1); + MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); break; case 15: //madds1 - MOVAPD(XMM1, fpr.R(c)); - SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower - Force25BitPrecision(XMM1, XMM0); - MOVAPD(XMM0, fpr.R(a)); - MULPD(XMM0, R(XMM1)); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); + Force25BitPrecision(XMM0, R(XMM0), XMM1); + MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); break; case 28: //msub - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); SUBPD(XMM0, fpr.R(b)); break; case 29: //madd - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); break; case 30: //nmsub - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); SUBPD(XMM0, fpr.R(b)); PXOR(XMM0, M((void*)&psSignBits)); break; case 31: //nmadd - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); PXOR(XMM0, M((void*)&psSignBits)); @@ -399,9 +350,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) return; } fpr.BindToRegister(d, false); - ForceSinglePrecisionP(XMM0); - SetFPRFIfNeeded(inst, XMM0); - MOVAPD(fpr.RX(d), R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d), XMM0); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 283ea6ea2e..45f37b6889 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -608,13 +608,98 @@ void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm) } } -void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) +void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input) { // Most games don't need these. Zelda requires it though - some platforms get stuck without them. if (jit->jo.accurateSinglePrecision) { - CVTPD2PS(xmm, R(xmm)); - CVTPS2PD(xmm, R(xmm)); + CVTPD2PS(input, R(input)); + CVTPS2PD(output, R(input)); + } + else if (output != input) + { + MOVAPD(output, R(input)); + } +} + +// Abstract between AVX and SSE: automatically handle 3-operand instructions +void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), + X64Reg regOp, OpArg arg1, OpArg arg2, bool packed, bool reversible) +{ + if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg()) + { + (this->*sseOp)(regOp, arg2); + } + else if (arg1.IsSimpleReg() && cpu_info.bAVX) + { + (this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2); + } + else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp) + { + if (reversible) + { + (this->*sseOp)(regOp, arg1); + } + else + { + // The ugly case: regOp == arg2 without AVX, or with arg1 == memory + if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0) + MOVAPD(XMM0, arg1); + if (cpu_info.bAVX) + { + (this->*avxOp)(regOp, XMM0, arg2); + } + else + { + (this->*sseOp)(XMM0, arg2); + if (packed) + MOVAPD(regOp, R(XMM0)); + else + MOVSD(regOp, R(XMM0)); + } + } + } + else + { + if (packed) + MOVAPD(regOp, arg1); + else + MOVSD(regOp, arg1); + (this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2); + } +} + +// Abstract between AVX and SSE: automatically handle 3-operand instructions +void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg, u8), void (XEmitter::*sseOp)(X64Reg, OpArg, u8), + X64Reg regOp, OpArg arg1, OpArg arg2, u8 imm) +{ + if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg()) + { + (this->*sseOp)(regOp, arg2, imm); + } + else if (arg1.IsSimpleReg() && cpu_info.bAVX) + { + (this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2, imm); + } + else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp) + { + // The ugly case: regOp == arg2 without AVX, or with arg1 == memory + if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0) + MOVAPD(XMM0, arg1); + if (cpu_info.bAVX) + { + (this->*avxOp)(regOp, XMM0, arg2, imm); + } + else + { + (this->*sseOp)(XMM0, arg2, imm); + MOVAPD(regOp, R(XMM0)); + } + } + else + { + MOVAPD(regOp, arg1); + (this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2, imm); } } @@ -625,15 +710,25 @@ static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000}; // a single precision multiply. To be precise, it drops the low 28 bits of the mantissa, // rounding to nearest as it does. // It needs a temp, so let the caller pass that in. -void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp) +void EmuCodeBlock::Force25BitPrecision(X64Reg output, OpArg input, X64Reg tmp) { if (jit->jo.accurateSinglePrecision) { // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1); - MOVAPD(tmp, R(xmm)); - PAND(xmm, M((void*)&psMantissaTruncate)); - PAND(tmp, M((void*)&psRoundBit)); - PADDQ(xmm, R(tmp)); + if (input.IsSimpleReg() && cpu_info.bAVX) + { + VPAND(tmp, input.GetSimpleReg(), M((void*)&psRoundBit)); + VPAND(output, input.GetSimpleReg(), M((void*)&psMantissaTruncate)); + PADDQ(output, R(tmp)); + } + else + { + if (!input.IsSimpleReg() || input.GetSimpleReg() != output) + MOVAPD(output, input); + avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M((void*)&psRoundBit), true, true); + PAND(output, M((void*)&psMantissaTruncate)); + PADDQ(output, R(tmp)); + } } } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index e46621067a..43b54debd9 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -123,9 +123,14 @@ public: void JitSetCAIf(Gen::CCFlags conditionCode); void JitClearCA(); + void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), + Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, bool packed = true, bool reversible = false); + void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8), + Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm); + void ForceSinglePrecisionS(Gen::X64Reg xmm); - void ForceSinglePrecisionP(Gen::X64Reg xmm); - void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp); + void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input); + void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp); // RSCRATCH might get trashed void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);