From c3bcc67653513b3dae7dec4df78699202363cb57 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 17 Aug 2021 19:57:06 +0200 Subject: [PATCH] PowerPC: Update FEX on FPSCR store instead of FPSCR load This is needed not only for the next commit, but also for correctly emulating float instructions that write to CR1. --- .../PowerPC/Interpreter/Interpreter_FPUtils.h | 8 +- .../Interpreter_SystemRegisters.cpp | 30 +---- Source/Core/Core/PowerPC/Jit64/Jit.h | 3 +- .../PowerPC/Jit64/Jit_SystemRegisters.cpp | 126 ++++++++++++++---- Source/Core/Core/PowerPC/JitArm64/Jit.h | 1 + .../JitArm64/JitArm64_SystemRegisters.cpp | 108 +++++++++++---- 6 files changed, 200 insertions(+), 76 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index c3e1d40d4d..b8860eabf5 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -24,6 +24,12 @@ enum class FPCC FU = 1, // ? }; +inline void UpdateFPExceptionSummary(UReg_FPSCR* fpscr) +{ + fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0; + fpscr->FEX = ((fpscr->Hex >> 22) & (fpscr->Hex & FPSCR_ANY_E)) != 0; +} + inline void SetFPException(UReg_FPSCR* fpscr, u32 mask) { if ((fpscr->Hex & mask) != mask) @@ -32,7 +38,7 @@ inline void SetFPException(UReg_FPSCR* fpscr, u32 mask) } fpscr->Hex |= mask; - fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0; + UpdateFPExceptionSummary(fpscr); } inline float ForceSingle(const UReg_FPSCR& fpscr, double value) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp index 3f7b82717a..50d586efa5 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp @@ -25,22 +25,10 @@ mffsx: 80036650 (huh?) 
*/ -static void FPSCRUpdated(UReg_FPSCR fp) +static void FPSCRUpdated(UReg_FPSCR* fpscr) { + UpdateFPExceptionSummary(fpscr); PowerPC::RoundingModeUpdated(); - - if (fp.VE || fp.OE || fp.UE || fp.ZE || fp.XE) - { - // PanicAlert("FPSCR - exceptions enabled. Please report. VE=%i OE=%i UE=%i ZE=%i XE=%i", - // fp.VE, fp.OE, fp.UE, fp.ZE, fp.XE); - // Pokemon Colosseum does this. Gah. - } -} - -static void UpdateFPSCR(UReg_FPSCR* fpscr) -{ - fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0; - fpscr->FEX = ((fpscr->Hex >> 22) & (fpscr->Hex & FPSCR_ANY_E)) != 0; } void Interpreter::mtfsb0x(UGeckoInstruction inst) @@ -48,7 +36,7 @@ void Interpreter::mtfsb0x(UGeckoInstruction inst) u32 b = 0x80000000 >> inst.CRBD; FPSCR.Hex &= ~b; - FPSCRUpdated(FPSCR); + FPSCRUpdated(&FPSCR); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -65,7 +53,7 @@ void Interpreter::mtfsb1x(UGeckoInstruction inst) else FPSCR |= b; - FPSCRUpdated(FPSCR); + FPSCRUpdated(&FPSCR); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -80,7 +68,7 @@ void Interpreter::mtfsfix(UGeckoInstruction inst) FPSCR = (FPSCR.Hex & ~mask) | (imm >> (4 * field)); - FPSCRUpdated(FPSCR); + FPSCRUpdated(&FPSCR); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -97,7 +85,7 @@ void Interpreter::mtfsfx(UGeckoInstruction inst) } FPSCR = (FPSCR.Hex & ~m) | (static_cast<u32>(rPS(inst.FB).PS0AsU64()) & m); - FPSCRUpdated(FPSCR); + FPSCRUpdated(&FPSCR); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -563,22 +551,18 @@ void Interpreter::isync(UGeckoInstruction inst) void Interpreter::mcrfs(UGeckoInstruction inst) { - UpdateFPSCR(&FPSCR); const u32 shift = 4 * (7 - inst.CRFS); const u32 fpflags = (FPSCR.Hex >> shift) & 0xF; // If any exception bits were read, clear them FPSCR.Hex &= ~((0xF << shift) & (FPSCR_FX | FPSCR_ANY_X)); + FPSCRUpdated(&FPSCR); PowerPC::ppcState.cr.SetField(inst.CRFD, fpflags); } void Interpreter::mffsx(UGeckoInstruction inst) { - // load from FPSCR - // TODO(ector): grab all overflow flags etc and set them in FPSCR - - 
UpdateFPSCR(&FPSCR); rPS(inst.FD).SetPS0(UINT64_C(0xFFF8000000000000) | FPSCR.Hex); if (inst.Rc) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 35f198dc6b..70c53bd784 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -116,11 +116,12 @@ public: void ClearCRFieldBit(int field, int bit); void SetCRFieldBit(int field, int bit); void FixGTBeforeSettingCRFieldBit(Gen::X64Reg reg); - // Generates a branch that will check if a given bit of a CR register part // is set or not. Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); + void UpdateFPExceptionSummary(Gen::X64Reg fpscr, Gen::X64Reg tmp1, Gen::X64Reg tmp2); + void SetFPRFIfNeeded(const Gen::OpArg& xmm, bool single); void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, bool duplicate = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index 34fb820274..3117ef563f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -4,7 +4,9 @@ #include "Common/BitSet.h" #include "Common/CPUDetect.h" #include "Common/CommonTypes.h" +#include "Common/MathUtil.h" #include "Common/x64Emitter.h" + #include "Core/CoreTiming.h" #include "Core/HW/ProcessorInterface.h" #include "Core/PowerPC/Jit64/Jit.h" @@ -185,6 +187,33 @@ FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set) return FixupBranch(); } +// Could be done with one temp register, but with two temp registers it's faster +void Jit64::UpdateFPExceptionSummary(X64Reg fpscr, X64Reg tmp1, X64Reg tmp2) +{ + // Kill dependency on tmp1 (not required for correctness, since SHL will shift out upper bytes) + XOR(32, R(tmp1), R(tmp1)); + + // fpscr.VX = (fpscr & FPSCR_VX_ANY) != 0 + TEST(32, R(fpscr), Imm32(FPSCR_VX_ANY)); + SETcc(CC_NZ, R(tmp1)); + SHL(32, 
R(tmp1), Imm8(IntLog2(FPSCR_VX))); + AND(32, R(fpscr), Imm32(~(FPSCR_VX | FPSCR_FEX))); + OR(32, R(fpscr), R(tmp1)); + + // fpscr.FEX = ((fpscr >> 22) & (fpscr & FPSCR_ANY_E)) != 0 + MOV(32, R(tmp1), R(fpscr)); + MOV(32, R(tmp2), R(fpscr)); + SHR(32, R(tmp1), Imm8(22)); + AND(32, R(tmp2), Imm32(FPSCR_ANY_E)); + TEST(32, R(tmp1), R(tmp2)); + // Unfortunately we eat a partial register stall below - we can't zero any of the registers before + // the TEST, and we can't use XOR right after the TEST since that would overwrite flags. However, + // there is no false dependency, since SETcc depends on TEST's flags and TEST depends on tmp1. + SETcc(CC_NZ, R(tmp1)); + SHL(32, R(tmp1), Imm8(IntLog2(FPSCR_FEX))); + OR(32, R(fpscr), R(tmp1)); +} + static void DoICacheReset() { PowerPC::ppcState.iCache.Reset(); @@ -637,6 +666,19 @@ void Jit64::mcrfs(UGeckoInstruction inst) // Only clear exception bits (but not FEX/VX). mask &= FPSCR_FX | FPSCR_ANY_X; + RCX64Reg scratch_guard; + X64Reg scratch; + if (mask != 0) + { + scratch_guard = gpr.Scratch(); + RegCache::Realize(scratch_guard); + scratch = scratch_guard; + } + else + { + scratch = RSCRATCH; + } + if (cpu_info.bBMI1) { MOV(32, R(RSCRATCH), PPCSTATE(fpscr)); @@ -652,14 +694,17 @@ void Jit64::mcrfs(UGeckoInstruction inst) SHR(32, R(RSCRATCH2), Imm8(shift)); AND(32, R(RSCRATCH2), Imm32(0xF)); } + + LEA(64, scratch, MConst(PowerPC::ConditionRegister::s_crTable)); + MOV(64, R(scratch), MComplex(scratch, RSCRATCH2, SCALE_8, 0)); + MOV(64, CROffset(inst.CRFD), R(scratch)); + if (mask != 0) { AND(32, R(RSCRATCH), Imm32(~mask)); + UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch); MOV(32, PPCSTATE(fpscr), R(RSCRATCH)); } - LEA(64, RSCRATCH, MConst(PowerPC::ConditionRegister::s_crTable)); - MOV(64, R(RSCRATCH), MComplex(RSCRATCH, RSCRATCH2, SCALE_8, 0)); - MOV(64, CROffset(inst.CRFD), R(RSCRATCH)); } void Jit64::mffsx(UGeckoInstruction inst) @@ -670,18 +715,6 @@ void Jit64::mffsx(UGeckoInstruction inst) MOV(32, R(RSCRATCH), 
PPCSTATE(fpscr)); - // FPSCR.FEX = 0 (and VX for below) - AND(32, R(RSCRATCH), Imm32(~0x60000000)); - - // FPSCR.VX = (FPSCR.Hex & FPSCR_VX_ANY) != 0; - XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - TEST(32, R(RSCRATCH), Imm32(FPSCR_VX_ANY)); - SETcc(CC_NZ, R(RSCRATCH2)); - SHL(32, R(RSCRATCH2), Imm8(31 - 2)); - OR(32, R(RSCRATCH), R(RSCRATCH2)); - - MOV(32, PPCSTATE(fpscr), R(RSCRATCH)); - int d = inst.FD; RCX64Reg Rd = fpr.Bind(d, RCMode::Write); RegCache::Realize(Rd); @@ -710,17 +743,32 @@ void Jit64::mtfsb0x(UGeckoInstruction inst) JITDISABLE(bJITSystemRegistersOff); FALLBACK_IF(inst.Rc); - u32 mask = ~(0x80000000 >> inst.CRBD); - if (inst.CRBD < 29) + const u32 mask = 0x80000000 >> inst.CRBD; + const u32 inverted_mask = ~mask; + + if (mask == FPSCR_FEX || mask == FPSCR_VX) + return; + + if (inst.CRBD < 29 && (mask & (FPSCR_ANY_X | FPSCR_ANY_E)) == 0) { - AND(32, PPCSTATE(fpscr), Imm32(mask)); + AND(32, PPCSTATE(fpscr), Imm32(inverted_mask)); } else { MOV(32, R(RSCRATCH), PPCSTATE(fpscr)); - AND(32, R(RSCRATCH), Imm32(mask)); + AND(32, R(RSCRATCH), Imm32(inverted_mask)); + + if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + { + RCX64Reg scratch = gpr.Scratch(); + RegCache::Realize(scratch); + + UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch); + } + MOV(32, PPCSTATE(fpscr), R(RSCRATCH)); - UpdateMXCSR(); + if (inst.CRBD >= 29) + UpdateMXCSR(); } } @@ -730,9 +778,13 @@ void Jit64::mtfsb1x(UGeckoInstruction inst) JITDISABLE(bJITSystemRegistersOff); FALLBACK_IF(inst.Rc); - u32 mask = 0x80000000 >> inst.CRBD; + const u32 mask = 0x80000000 >> inst.CRBD; + + if (mask == FPSCR_FEX || mask == FPSCR_VX) + return; + MOV(32, R(RSCRATCH), PPCSTATE(fpscr)); - if (mask & FPSCR_ANY_X) + if ((mask & FPSCR_ANY_X) != 0) { BTS(32, R(RSCRATCH), Imm32(31 - inst.CRBD)); FixupBranch dont_set_fx = J_CC(CC_C); @@ -743,6 +795,15 @@ void Jit64::mtfsb1x(UGeckoInstruction inst) { OR(32, R(RSCRATCH), Imm32(mask)); } + + if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + { + RCX64Reg scratch = 
gpr.Scratch(); + RegCache::Realize(scratch); + + UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch); + } + MOV(32, PPCSTATE(fpscr), R(RSCRATCH)); if (inst.CRBD >= 29) UpdateMXCSR(); @@ -755,12 +816,22 @@ void Jit64::mtfsfix(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u8 imm = (inst.hex >> (31 - 19)) & 0xF; + u32 mask = 0xF0000000 >> (4 * inst.CRFD); u32 or_mask = imm << (28 - 4 * inst.CRFD); - u32 and_mask = ~(0xF0000000 >> (4 * inst.CRFD)); + u32 and_mask = ~mask; MOV(32, R(RSCRATCH), PPCSTATE(fpscr)); AND(32, R(RSCRATCH), Imm32(and_mask)); OR(32, R(RSCRATCH), Imm32(or_mask)); + + if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + { + RCX64Reg scratch = gpr.Scratch(); + RegCache::Realize(scratch); + + UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch); + } + MOV(32, PPCSTATE(fpscr), R(RSCRATCH)); // Field 7 contains NI and RN. @@ -798,6 +869,15 @@ void Jit64::mtfsfx(UGeckoInstruction inst) AND(32, R(RSCRATCH2), Imm32(~mask)); OR(32, R(RSCRATCH), R(RSCRATCH2)); } + + if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + { + RCX64Reg scratch = gpr.Scratch(); + RegCache::Realize(scratch); + + UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch); + } + MOV(32, PPCSTATE(fpscr), R(RSCRATCH)); if (inst.FM & 1) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index b029f545cc..f19bd33d55 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -273,6 +273,7 @@ protected: Arm64Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set); void FixGTBeforeSettingCRFieldBit(Arm64Gen::ARM64Reg reg); + void UpdateFPExceptionSummary(Arm64Gen::ARM64Reg fpscr); void UpdateRoundingMode(); void ComputeRC0(Arm64Gen::ARM64Reg reg); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp index d3de831872..568d3072f3 100644 --- 
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp @@ -4,6 +4,7 @@ #include "Common/Arm64Emitter.h" #include "Common/Assert.h" #include "Common/CommonTypes.h" +#include "Common/MathUtil.h" #include "Core/Core.h" #include "Core/CoreTiming.h" @@ -49,6 +50,25 @@ void JitArm64::FixGTBeforeSettingCRFieldBit(Arm64Gen::ARM64Reg reg) gpr.Unlock(WA); } +void JitArm64::UpdateFPExceptionSummary(ARM64Reg fpscr) +{ + ARM64Reg WA = gpr.GetReg(); + + // fpscr.VX = (fpscr & FPSCR_VX_ANY) != 0 + MOVI2R(WA, FPSCR_VX_ANY); + TST(WA, fpscr); + CSET(WA, CCFlags::CC_NEQ); + BFI(fpscr, WA, IntLog2(FPSCR_VX), 1); + + // fpscr.FEX = ((fpscr >> 22) & (fpscr & FPSCR_ANY_E)) != 0 + AND(WA, fpscr, LogicalImm(FPSCR_ANY_E, 32)); + TST(WA, fpscr, ArithOption(fpscr, ShiftType::LSR, 22)); + CSET(WA, CCFlags::CC_NEQ); + BFI(fpscr, WA, IntLog2(FPSCR_FEX), 1); + + gpr.Unlock(WA); +} + void JitArm64::UpdateRoundingMode() { const BitSet32 gprs_to_save = gpr.GetCallerSavedUsed(); @@ -732,6 +752,8 @@ void JitArm64::mcrfs(UGeckoInstruction inst) { const u32 inverted_mask = ~mask; AND(WA, WA, LogicalImm(inverted_mask, 32)); + + UpdateFPExceptionSummary(WA); STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); } @@ -753,24 +775,11 @@ void JitArm64::mffsx(UGeckoInstruction inst) LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); ARM64Reg VD = fpr.RW(inst.FD, RegType::LowerPair); - ARM64Reg WB = gpr.GetReg(); - // FPSCR.FEX = 0; - // FPSCR.VX = (FPSCR.Hex & FPSCR_VX_ANY) != 0; - // (FEX is right next to VX, so we can set both using one BFI instruction) - MOVI2R(WB, FPSCR_VX_ANY); - TST(WA, WB); - CSET(WB, CCFlags::CC_NEQ); - BFI(WA, WB, 31 - 2, 2); - - STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); - - // Vd = FPSCR.Hex | 0xFFF8'0000'0000'0000; ORR(XA, XA, LogicalImm(0xFFF8'0000'0000'0000, 64)); m_float_emit.FMOV(EncodeRegToDouble(VD), XA); gpr.Unlock(WA); - gpr.Unlock(WB); } void 
JitArm64::mtfsb0x(UGeckoInstruction inst) @@ -779,12 +788,20 @@ void JitArm64::mtfsb0x(UGeckoInstruction inst) JITDISABLE(bJITSystemRegistersOff); FALLBACK_IF(inst.Rc); - u32 mask = ~(0x80000000 >> inst.CRBD); + const u32 mask = 0x80000000 >> inst.CRBD; + const u32 inverted_mask = ~mask; + + if (mask == FPSCR_FEX || mask == FPSCR_VX) + return; ARM64Reg WA = gpr.GetReg(); LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); - AND(WA, WA, LogicalImm(mask, 32)); + + AND(WA, WA, LogicalImm(inverted_mask, 32)); + + if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + UpdateFPExceptionSummary(WA); STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); gpr.Unlock(WA); @@ -799,12 +816,16 @@ void JitArm64::mtfsb1x(UGeckoInstruction inst) JITDISABLE(bJITSystemRegistersOff); FALLBACK_IF(inst.Rc); - u32 mask = 0x80000000 >> inst.CRBD; + const u32 mask = 0x80000000 >> inst.CRBD; + + if (mask == FPSCR_FEX || mask == FPSCR_VX) + return; ARM64Reg WA = gpr.GetReg(); LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); - if (mask & FPSCR_ANY_X) + + if ((mask & FPSCR_ANY_X) != 0) { ARM64Reg WB = gpr.GetReg(); TST(WA, LogicalImm(mask, 32)); @@ -813,6 +834,9 @@ void JitArm64::mtfsb1x(UGeckoInstruction inst) gpr.Unlock(WB); } ORR(WA, WA, LogicalImm(mask, 32)); + + if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + UpdateFPExceptionSummary(WA); STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); gpr.Unlock(WA); @@ -829,13 +853,15 @@ void JitArm64::mtfsfix(UGeckoInstruction inst) u8 imm = (inst.hex >> (31 - 19)) & 0xF; u8 shift = 28 - 4 * inst.CRFD; + u32 mask = 0xF << shift; ARM64Reg WA = gpr.GetReg(); + LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); if (imm == 0xF) { - ORR(WA, WA, LogicalImm(0xF << shift, 32)); + ORR(WA, WA, LogicalImm(mask, 32)); } else if (imm == 0x0) { @@ -849,7 +875,10 @@ void JitArm64::mtfsfix(UGeckoInstruction inst) gpr.Unlock(WB); } + if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + 
UpdateFPExceptionSummary(WA); STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); + gpr.Unlock(WA); // Field 7 contains NI and RN. @@ -873,24 +902,47 @@ void JitArm64::mtfsfx(UGeckoInstruction inst) if (mask == 0xFFFFFFFF) { ARM64Reg VB = fpr.R(inst.FB, RegType::LowerPair); + ARM64Reg WA = gpr.GetReg(); - m_float_emit.STR(32, IndexType::Unsigned, VB, PPC_REG, PPCSTATE_OFF(fpscr)); + m_float_emit.FMOV(WA, EncodeRegToSingle(VB)); + + UpdateFPExceptionSummary(WA); + STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); + + gpr.Unlock(WA); } else if (mask != 0) { ARM64Reg VB = fpr.R(inst.FB, RegType::LowerPair); - - ARM64Reg V0 = fpr.GetReg(); - ARM64Reg V1 = fpr.GetReg(); ARM64Reg WA = gpr.GetReg(); + ARM64Reg WB = gpr.GetReg(); - m_float_emit.LDR(32, IndexType::Unsigned, V0, PPC_REG, PPCSTATE_OFF(fpscr)); - MOVI2R(WA, mask); - m_float_emit.FMOV(EncodeRegToSingle(V1), WA); - m_float_emit.BIT(EncodeRegToDouble(V0), EncodeRegToDouble(VB), EncodeRegToDouble(V1)); - m_float_emit.STR(32, IndexType::Unsigned, V0, PPC_REG, PPCSTATE_OFF(fpscr)); + LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); + m_float_emit.FMOV(WB, EncodeRegToSingle(VB)); + + if (LogicalImm imm = LogicalImm(mask, 32)) + { + AND(WA, WA, LogicalImm(~mask, 32)); + AND(WB, WB, imm); + } + else + { + ARM64Reg WC = gpr.GetReg(); + + MOVI2R(WC, mask); + BIC(WA, WA, WC); + AND(WB, WB, WC); + + gpr.Unlock(WC); + } + ORR(WA, WA, WB); + + gpr.Unlock(WB); + + if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0) + UpdateFPExceptionSummary(WA); + STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr)); - fpr.Unlock(V0, V1); gpr.Unlock(WA); }