diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 6f0d296a7f..5f4421ee93 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -73,6 +73,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, m_float_emit.REV32(8, D0, D0); m_float_emit.STR(64, Q0, X28, addr); } + else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I) + { + m_float_emit.REV32(8, D0, RS); + m_float_emit.STR(64, Q0, X28, addr); + } else { m_float_emit.REV64(8, Q0, RS); @@ -86,7 +91,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, { m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr); m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); - m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); } else { @@ -198,6 +202,13 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, MOVI2R(X30, (u64)PowerPC::Write_U64); BLR(X30); } + else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I) + { + m_float_emit.UMOV(64, X0, RS, 0); + ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32)); + MOVI2R(X30, (u64)PowerPC::Write_U64); + BLR(X30); + } else { MOVI2R(X30, (u64)&PowerPC::Write_U64); @@ -214,7 +225,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, MOVI2R(X30, (u64)&PowerPC::Read_U32); BLR(X30); m_float_emit.INS(32, RS, 0, X0); - m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); } else { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index f1f956db69..d38a959a8f 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -33,34 +33,44 @@ void JitArm64::fp_arith(UGeckoInstruction inst) bool use_c = op5 >= 25; // fmul and 
all kind of fmaddXX bool use_b = op5 != 25; // fmul uses no B + bool inputs_are_singles = fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) && (!use_c || fpr.IsSingle(c, !packed)); + ARM64Reg VA, VB, VC, VD; if (packed) { - VA = fpr.R(a, REG_REG); + RegType type = inputs_are_singles ? REG_REG_SINGLE : REG_REG; + u8 size = inputs_are_singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad; + + VA = reg_encoder(fpr.R(a, type)); if (use_b) - VB = fpr.R(b, REG_REG); + VB = reg_encoder(fpr.R(b, type)); if (use_c) - VC = fpr.R(c, REG_REG); - VD = fpr.RW(d, REG_REG); + VC = reg_encoder(fpr.R(c, type)); + VD = reg_encoder(fpr.RW(d, type)); switch (op5) { - case 18: m_float_emit.FDIV(64, VD, VA, VB); break; - case 20: m_float_emit.FSUB(64, VD, VA, VB); break; - case 21: m_float_emit.FADD(64, VD, VA, VB); break; - case 25: m_float_emit.FMUL(64, VD, VA, VC); break; + case 18: m_float_emit.FDIV(size, VD, VA, VB); break; + case 20: m_float_emit.FSUB(size, VD, VA, VB); break; + case 21: m_float_emit.FADD(size, VD, VA, VB); break; + case 25: m_float_emit.FMUL(size, VD, VA, VC); break; default: _assert_msg_(DYNA_REC, 0, "fp_arith"); break; } } else { - VA = EncodeRegToDouble(fpr.R(a, REG_IS_LOADED)); + RegType type = (inputs_are_singles && single) ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR; + RegType type_out = single ? (inputs_are_singles ? REG_DUP_SINGLE : REG_DUP) : REG_LOWER_PAIR; + ARM64Reg (*reg_encoder)(ARM64Reg) = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble; + + VA = reg_encoder(fpr.R(a, type)); if (use_b) - VB = EncodeRegToDouble(fpr.R(b, REG_IS_LOADED)); + VB = reg_encoder(fpr.R(b, type)); if (use_c) - VC = EncodeRegToDouble(fpr.R(c, REG_IS_LOADED)); - VD = EncodeRegToDouble(fpr.RW(d, single ? 
REG_DUP : REG_LOWER_PAIR)); + VC = reg_encoder(fpr.R(c, type)); + VD = reg_encoder(fpr.RW(d, type_out)); switch (op5) { @@ -95,33 +105,42 @@ void JitArm64::fp_logic(UGeckoInstruction inst) if (op10 == 72 && b == d) return; + bool single = fpr.IsSingle(b, !packed); + u8 size = single ? 32 : 64; + if (packed) { - ARM64Reg VB = fpr.R(b, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); + RegType type = single ? REG_REG_SINGLE : REG_REG; + ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToDouble : EncodeRegToQuad; + + ARM64Reg VB = reg_encoder(fpr.R(b, type)); + ARM64Reg VD = reg_encoder(fpr.RW(d, type)); switch (op10) { - case 40: m_float_emit.FNEG(64, VD, VB); break; + case 40: m_float_emit.FNEG(size, VD, VB); break; case 72: m_float_emit.ORR(VD, VB, VB); break; - case 136: m_float_emit.FABS(64, VD, VB); - m_float_emit.FNEG(64, VD, VD); break; - case 264: m_float_emit.FABS(64, VD, VB); break; + case 136: m_float_emit.FABS(size, VD, VB); + m_float_emit.FNEG(size, VD, VD); break; + case 264: m_float_emit.FABS(size, VD, VB); break; default: _assert_msg_(DYNA_REC, 0, "fp_logic"); break; } } else { - ARM64Reg VB = fpr.R(b, REG_IS_LOADED); - ARM64Reg VD = fpr.RW(d); + RegType type = single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR; + ARM64Reg (*reg_encoder)(ARM64Reg) = single ? 
EncodeRegToSingle : EncodeRegToDouble; + + ARM64Reg VB = fpr.R(b, type); + ARM64Reg VD = fpr.RW(d, type); switch (op10) { - case 40: m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); break; - case 72: m_float_emit.INS(64, VD, 0, VB, 0); break; - case 136: m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); - m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); break; - case 264: m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); break; + case 40: m_float_emit.FNEG(reg_encoder(VD), reg_encoder(VB)); break; + case 72: m_float_emit.INS(size, VD, 0, VB, 0); break; + case 136: m_float_emit.FABS(reg_encoder(VD), reg_encoder(VB)); + m_float_emit.FNEG(reg_encoder(VD), reg_encoder(VD)); break; + case 264: m_float_emit.FABS(reg_encoder(VD), reg_encoder(VB)); break; default: _assert_msg_(DYNA_REC, 0, "fp_logic"); break; } } @@ -135,13 +154,26 @@ void JitArm64::fselx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - ARM64Reg VA = fpr.R(a, REG_IS_LOADED); - ARM64Reg VB = fpr.R(b, REG_IS_LOADED); - ARM64Reg VC = fpr.R(c, REG_IS_LOADED); - ARM64Reg VD = fpr.RW(d); + if (fpr.IsSingle(a, true)) + { + ARM64Reg VA = fpr.R(a, REG_LOWER_PAIR_SINGLE); + m_float_emit.FCMPE(EncodeRegToSingle(VA)); + } + else + { + ARM64Reg VA = fpr.R(a, REG_LOWER_PAIR); + m_float_emit.FCMPE(EncodeRegToDouble(VA)); + } - m_float_emit.FCMPE(EncodeRegToDouble(VA)); - m_float_emit.FCSEL(EncodeRegToDouble(VD), EncodeRegToDouble(VC), EncodeRegToDouble(VB), CC_GE); + bool single = fpr.IsSingle(b, true) && fpr.IsSingle(c, true); + RegType type = single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR; + ARM64Reg (*reg_encoder)(ARM64Reg) = single ? 
EncodeRegToSingle : EncodeRegToDouble; + + ARM64Reg VB = fpr.R(b, type); + ARM64Reg VC = fpr.R(c, type); + ARM64Reg VD = fpr.RW(d, type); + + m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE); } void JitArm64::frspx(UGeckoInstruction inst) @@ -153,11 +185,22 @@ void JitArm64::frspx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; - ARM64Reg VB = fpr.R(b, REG_IS_LOADED); - ARM64Reg VD = fpr.RW(d, REG_DUP); + if (fpr.IsSingle(b, true)) + { + // Source is already in single precision, so no need to do anything but to copy to PSR1. + ARM64Reg VB = fpr.R(b, REG_LOWER_PAIR_SINGLE); + ARM64Reg VD = fpr.RW(d, REG_DUP_SINGLE); - m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); - m_float_emit.FCVT(64, 32, EncodeRegToDouble(VD), EncodeRegToDouble(VD)); + if (b != d) + m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB)); + } + else + { + ARM64Reg VB = fpr.R(b, REG_LOWER_PAIR); + ARM64Reg VD = fpr.RW(d, REG_DUP_SINGLE); + + m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); + } } void JitArm64::fcmpX(UGeckoInstruction inst) @@ -169,8 +212,12 @@ void JitArm64::fcmpX(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB; int crf = inst.CRFD; - ARM64Reg VA = fpr.R(a, REG_IS_LOADED); - ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true); + RegType type = singles ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? 
EncodeRegToSingle : EncodeRegToDouble; + + ARM64Reg VA = reg_encoder(fpr.R(a, type)); + ARM64Reg VB = reg_encoder(fpr.R(b, type)); ARM64Reg WA = gpr.GetReg(); ARM64Reg XA = EncodeRegTo64(WA); @@ -179,7 +226,7 @@ void JitArm64::fcmpX(UGeckoInstruction inst) FixupBranch continue1, continue2, continue3; ORR(XA, ZR, 32, 0, true); - m_float_emit.FCMP(EncodeRegToDouble(VA), EncodeRegToDouble(VB)); + m_float_emit.FCMP(VA, VB); if (a != b) { @@ -231,7 +278,9 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; - ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + bool single = fpr.IsSingle(b, true); + + ARM64Reg VB = fpr.R(b, single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR); ARM64Reg VD = fpr.RW(d); ARM64Reg V0 = fpr.GetReg(); @@ -240,8 +289,15 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL); m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7); - m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); - m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z); + if (single) + { + m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VB), ROUND_Z); + } + else + { + m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); + m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z); + } m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0)); fpr.Unlock(V0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 979296474f..e1e7865ff0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -76,7 +76,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) u32 imm_addr = 0; bool is_immediate = false; - RegType type = !!(flags & BackPatchInfo::FLAG_SIZE_F64) ? 
REG_LOWER_PAIR : REG_DUP; + RegType type = !!(flags & BackPatchInfo::FLAG_SIZE_F64) ? REG_LOWER_PAIR : REG_DUP_SINGLE; gpr.Lock(W0, W30); fpr.Lock(Q0); @@ -270,7 +270,16 @@ void JitArm64::stfXX(UGeckoInstruction inst) gpr.Lock(W0, W1, W30); fpr.Lock(Q0); - ARM64Reg V0 = fpr.R(inst.FS, REG_IS_LOADED); + bool single = (flags & BackPatchInfo::FLAG_SIZE_F32) && fpr.IsSingle(inst.FS, true); + + ARM64Reg V0 = fpr.R(inst.FS, single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR); + + if (single) + { + flags &= ~BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_F32I; + } + ARM64Reg addr_reg = W1; if (update) @@ -407,24 +416,29 @@ void JitArm64::stfXX(UGeckoInstruction inst) ADD(X1, X30, pipe_off); LDR(INDEX_UNSIGNED, W0, X30, count_off); - if (accessSize == 64) + if (flags & BackPatchInfo::FLAG_SIZE_F64) { m_float_emit.REV64(8, Q0, V0); - if (pipe_off) - m_float_emit.STR(64, Q0, X1, ArithOption(X0)); - else - m_float_emit.STR(64, Q0, X30, ArithOption(X0)); } - else if (accessSize == 32) + else if (flags & BackPatchInfo::FLAG_SIZE_F32) { m_float_emit.FCVT(32, 64, D0, EncodeRegToDouble(V0)); m_float_emit.REV32(8, D0, D0); - if (pipe_off) - m_float_emit.STR(32, D0, X1, ArithOption(X0)); - else - m_float_emit.STR(32, D0, X30, ArithOption(X0)); - } + else if (flags & BackPatchInfo::FLAG_SIZE_F32I) + { + m_float_emit.REV32(8, D0, V0); + } + + if (pipe_off) + { + m_float_emit.STR(accessSize, accessSize == 64 ? Q0 : D0, X1, ArithOption(X0)); + } + else + { + m_float_emit.STR(accessSize, accessSize == 64 ? 
Q0 : D0, X30, ArithOption(X0)); + } + ADD(W0, W0, accessSize >> 3); STR(INDEX_UNSIGNED, W0, X30, count_off); js.fifoBytesThisBlock += accessSize >> 3; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 0f2e51f616..dde52cfad5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -62,20 +62,17 @@ void JitArm64::psq_l(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { - VS = fpr.RW(inst.RS, REG_REG); + VS = fpr.RW(inst.RS, REG_REG_SINGLE); if (!inst.W) { ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28); m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg)); - m_float_emit.REV32(8, VS, VS); - m_float_emit.FCVTL(64, VS, VS); } else { m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28); - m_float_emit.REV32(8, VS, VS); - m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), EncodeRegToDouble(VS)); } + m_float_emit.REV32(8, EncodeRegToDouble(VS), EncodeRegToDouble(VS)); } else { @@ -87,17 +84,14 @@ void JitArm64::psq_l(UGeckoInstruction inst) LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true)); BLR(X30); - VS = fpr.RW(inst.RS, REG_REG); - if (!inst.W) - m_float_emit.FCVTL(64, VS, D0); - else - m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), D0); + VS = fpr.RW(inst.RS, REG_REG_SINGLE); + m_float_emit.ORR(EncodeRegToDouble(VS), D0, D0); } if (inst.W) { - m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double - m_float_emit.INS(64, VS, 1, Q0, 0); + m_float_emit.FMOV(S0, 0x70); // 1.0 as a Single + m_float_emit.INS(32, VS, 1, Q0, 0); } gpr.Unlock(W0, W1, W2, W30); @@ -121,8 +115,10 @@ void JitArm64::psq_st(UGeckoInstruction inst) gpr.Lock(W0, W1, W2, W30); fpr.Lock(Q0, Q1); + bool single = fpr.IsSingle(inst.RS); + ARM64Reg arm_addr = gpr.R(inst.RA); - ARM64Reg VS = fpr.R(inst.RS, REG_REG); + ARM64Reg VS = fpr.R(inst.RS, single ? 
REG_REG_SINGLE : REG_REG); ARM64Reg scale_reg = W0; ARM64Reg addr_reg = W1; @@ -156,7 +152,12 @@ void JitArm64::psq_st(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { u32 flags = BackPatchInfo::FLAG_STORE; - flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2); + + if (single) + flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32I : BackPatchInfo::FLAG_SIZE_F32X2I); + else + flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2); + EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, @@ -166,10 +167,17 @@ void JitArm64::psq_st(UGeckoInstruction inst) } else { - if (inst.W) - m_float_emit.FCVT(32, 64, D0, VS); + if (single) + { + m_float_emit.ORR(D0, VS, VS); + } else - m_float_emit.FCVTN(32, D0, VS); + { + if (inst.W) + m_float_emit.FCVT(32, 64, D0, VS); + else + m_float_emit.FCVTN(32, D0, VS); + } LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I])); UBFM(type_reg, scale_reg, 0, 2); // Type diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index c17c22b549..afc4bc4a91 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -25,36 +25,41 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, d = inst.FD; - ARM64Reg VA = fpr.R(a, REG_REG); - ARM64Reg VB = fpr.R(b, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); + bool singles = fpr.IsSingle(a) && fpr.IsSingle(b); + RegType type = singles ? REG_REG_SINGLE : REG_REG; + u8 size = singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? 
EncodeRegToDouble : EncodeRegToQuad; + + ARM64Reg VA = fpr.R(a, type); + ARM64Reg VB = fpr.R(b, type); + ARM64Reg VD = fpr.RW(d, type); switch (inst.SUBOP10) { case 528: //00 - m_float_emit.TRN1(64, VD, VA, VB); + m_float_emit.TRN1(size, VD, VA, VB); break; case 560: //01 - m_float_emit.INS(64, VD, 0, VA, 0); - m_float_emit.INS(64, VD, 1, VB, 1); + m_float_emit.INS(size, VD, 0, VA, 0); + m_float_emit.INS(size, VD, 1, VB, 1); break; case 592: //10 if (d != a && d != b) { - m_float_emit.INS(64, VD, 0, VA, 1); - m_float_emit.INS(64, VD, 1, VB, 0); + m_float_emit.INS(size, VD, 0, VA, 1); + m_float_emit.INS(size, VD, 1, VB, 0); } else { ARM64Reg V0 = fpr.GetReg(); - m_float_emit.INS(64, V0, 0, VA, 1); - m_float_emit.INS(64, V0, 1, VB, 0); - m_float_emit.ORR(VD, V0, V0); + m_float_emit.INS(size, V0, 0, VA, 1); + m_float_emit.INS(size, V0, 1, VB, 0); + m_float_emit.ORR(reg_encoder(VD), reg_encoder(V0), reg_encoder(V0)); fpr.Unlock(V0); } break; case 624: //11 - m_float_emit.TRN2(64, VD, VA, VB); + m_float_emit.TRN2(size, VD, VA, VB); break; default: _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); @@ -73,13 +78,19 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst) bool upper = inst.SUBOP5 == 13; - ARM64Reg VA = fpr.R(a, REG_REG); - ARM64Reg VC = fpr.R(c, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); + bool singles = fpr.IsSingle(a) && fpr.IsSingle(c); + RegType type = singles ? REG_REG_SINGLE : REG_REG; + u8 size = singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad; + + ARM64Reg VA = fpr.R(a, type); + ARM64Reg VC = fpr.R(c, type); + ARM64Reg VD = fpr.RW(d, type); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.DUP(64, V0, VC, upper ? 1 : 0); - m_float_emit.FMUL(64, VD, VA, V0); + m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VC), upper ? 
1 : 0); + m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(V0)); + fpr.FixSinglePrecision(d); fpr.Unlock(V0); } @@ -94,41 +105,49 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; u32 op5 = inst.SUBOP5; - ARM64Reg VA = fpr.R(a, REG_REG); - ARM64Reg VB = fpr.R(b, REG_REG); - ARM64Reg VC = fpr.R(c, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); - ARM64Reg V0 = fpr.GetReg(); + bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c); + RegType type = singles ? REG_REG_SINGLE : REG_REG; + u8 size = singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad; + + ARM64Reg VA = reg_encoder(fpr.R(a, type)); + ARM64Reg VB = reg_encoder(fpr.R(b, type)); + ARM64Reg VC = reg_encoder(fpr.R(c, type)); + ARM64Reg VD = reg_encoder(fpr.RW(d, type)); + ARM64Reg V0Q = fpr.GetReg(); + ARM64Reg V0 = reg_encoder(V0Q); + + // TODO: Do FMUL and FADD/FSUB in *one* host call to save accuracy. 
switch (op5) { case 14: // ps_madds0 - m_float_emit.DUP(64, V0, VC, 0); - m_float_emit.FMUL(64, V0, V0, VA); - m_float_emit.FADD(64, VD, V0, VB); + m_float_emit.DUP(size, V0, VC, 0); + m_float_emit.FMUL(size, V0, V0, VA); + m_float_emit.FADD(size, VD, V0, VB); break; case 15: // ps_madds1 - m_float_emit.DUP(64, V0, VC, 1); - m_float_emit.FMUL(64, V0, V0, VA); - m_float_emit.FADD(64, VD, V0, VB); + m_float_emit.DUP(size, V0, VC, 1); + m_float_emit.FMUL(size, V0, V0, VA); + m_float_emit.FADD(size, VD, V0, VB); break; case 28: // ps_msub - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FSUB(64, VD, V0, VB); + m_float_emit.FMUL(size, V0, VA, VC); + m_float_emit.FSUB(size, VD, V0, VB); break; case 29: // ps_madd - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FADD(64, VD, V0, VB); + m_float_emit.FMUL(size, V0, VA, VC); + m_float_emit.FADD(size, VD, V0, VB); break; case 30: // ps_nmsub - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FSUB(64, VD, V0, VB); - m_float_emit.FNEG(64, VD, VD); + m_float_emit.FMUL(size, V0, VA, VC); + m_float_emit.FSUB(size, VD, V0, VB); + m_float_emit.FNEG(size, VD, VD); break; case 31: // ps_nmadd - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FADD(64, VD, V0, VB); - m_float_emit.FNEG(64, VD, VD); + m_float_emit.FMUL(size, V0, VA, VC); + m_float_emit.FADD(size, VD, V0, VB); + m_float_emit.FNEG(size, VD, VD); break; default: _assert_msg_(DYNA_REC, 0, "ps_madd - invalid op"); @@ -136,7 +155,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) } fpr.FixSinglePrecision(d); - fpr.Unlock(V0); + fpr.Unlock(V0Q); } void JitArm64::ps_res(UGeckoInstruction inst) @@ -148,10 +167,16 @@ void JitArm64::ps_res(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; - ARM64Reg VB = fpr.R(b, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); + bool singles = fpr.IsSingle(b); + RegType type = singles ? REG_REG_SINGLE : REG_REG; + u8 size = singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? 
EncodeRegToDouble : EncodeRegToQuad; + + ARM64Reg VB = fpr.R(b, type); + ARM64Reg VD = fpr.RW(d, type); + + m_float_emit.FRSQRTE(size, reg_encoder(VD), reg_encoder(VB)); - m_float_emit.FRSQRTE(64, VD, VB); fpr.FixSinglePrecision(d); } @@ -163,23 +188,29 @@ void JitArm64::ps_sel(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - ARM64Reg VA = fpr.R(a, REG_REG); - ARM64Reg VB = fpr.R(b, REG_REG); - ARM64Reg VC = fpr.R(c, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); + bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c); + RegType type = singles ? REG_REG_SINGLE : REG_REG; + u8 size = singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad; - if (d != a && d != b && d != c) + ARM64Reg VA = reg_encoder(fpr.R(a, type)); + ARM64Reg VB = reg_encoder(fpr.R(b, type)); + ARM64Reg VC = reg_encoder(fpr.R(c, type)); + ARM64Reg VD = reg_encoder(fpr.RW(d, type)); + + if (d != b && d != c) { - m_float_emit.FCMGE(64, VD, VA); + m_float_emit.FCMGE(size, VD, VA); m_float_emit.BSL(VD, VC, VB); } else { - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FCMGE(64, V0, VA); + ARM64Reg V0Q = fpr.GetReg(); + ARM64Reg V0 = reg_encoder(V0Q); + m_float_emit.FCMGE(size, V0, VA); m_float_emit.BSL(V0, VC, VB); m_float_emit.ORR(VD, V0, V0); - fpr.Unlock(V0); + fpr.Unlock(V0Q); } } @@ -194,23 +225,29 @@ void JitArm64::ps_sumX(UGeckoInstruction inst) bool upper = inst.SUBOP5 == 11; - ARM64Reg VA = fpr.R(a, REG_REG); - ARM64Reg VB = fpr.R(b, REG_REG); - ARM64Reg VC = fpr.R(c, REG_REG); - ARM64Reg VD = fpr.RW(d, REG_REG); + bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c); + RegType type = singles ? REG_REG_SINGLE : REG_REG; + u8 size = singles ? 32 : 64; + ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? 
EncodeRegToDouble : EncodeRegToQuad; + + ARM64Reg VA = fpr.R(a, type); + ARM64Reg VB = fpr.R(b, type); + ARM64Reg VC = fpr.R(c, type); + ARM64Reg VD = fpr.RW(d, type); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.DUP(64, V0, upper ? VA : VB, upper ? 0 : 1); + m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1); if (d != c) { - m_float_emit.FADD(64, VD, V0, upper ? VB : VA); - m_float_emit.INS(64, VD, upper ? 0 : 1, VC, upper ? 0 : 1); + m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA)); + m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1); } else { - m_float_emit.FADD(64, V0, V0, upper ? VB : VA); - m_float_emit.INS(64, VD, upper ? 1 : 0, V0, upper ? 1 : 0); + m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA)); + m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0); } + fpr.FixSinglePrecision(d); fpr.Unlock(V0); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index f08f7605e3..77eed0a70b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -198,7 +198,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg) { ARM64Reg host_reg = GetReg(); m_emit->MOVI2R(host_reg, reg.GetImm()); - reg.LoadToReg(host_reg); + reg.Load(host_reg); reg.SetDirty(true); return host_reg; } @@ -208,7 +208,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg) // This is a bit annoying. 
We try to keep these preloaded as much as possible // This can also happen on cases where PPCAnalyst isn't feeing us proper register usage statistics ARM64Reg host_reg = GetReg(); - reg.LoadToReg(host_reg); + reg.Load(host_reg); reg.SetDirty(false); m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg])); return host_reg; @@ -240,7 +240,7 @@ void Arm64GPRCache::BindToRegister(u32 preg, bool do_load) if (reg.GetType() == REG_NOTLOADED) { ARM64Reg host_reg = GetReg(); - reg.LoadToReg(host_reg); + reg.Load(host_reg); if (do_load) m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg])); } @@ -307,12 +307,38 @@ ARM64Reg Arm64FPRCache::R(u32 preg, RegType type) OpArg& reg = m_guest_registers[preg]; IncrementAllUsed(); reg.ResetLastUsed(); + ARM64Reg host_reg = reg.GetReg(); switch (reg.GetType()) { + case REG_REG_SINGLE: + { + // We're asked for singles, so just return the register. + if (type == REG_REG_SINGLE || type == REG_LOWER_PAIR_SINGLE) + return host_reg; + + // Else convert this register back to doubles. + m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + reg.Load(host_reg, REG_REG); + + // fall through + } case REG_REG: // already in a reg - return reg.GetReg(); - break; + { + return host_reg; + } + case REG_LOWER_PAIR_SINGLE: + { + // We're asked for the lower single, so just return the register. + if (type == REG_LOWER_PAIR_SINGLE) + return host_reg; + + // Else convert this register back to a double. 
+ m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + reg.Load(host_reg, REG_LOWER_PAIR); + + // fall through + } case REG_LOWER_PAIR: { if (type == REG_REG) @@ -320,48 +346,62 @@ ARM64Reg Arm64FPRCache::R(u32 preg, RegType type) // Load the high 64bits from the file and insert them in to the high 64bits of the host register ARM64Reg tmp_reg = GetReg(); m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, X29, PPCSTATE_OFF(ps[preg][1])); - m_float_emit->INS(64, reg.GetReg(), 1, tmp_reg, 0); + m_float_emit->INS(64, host_reg, 1, tmp_reg, 0); UnlockRegister(tmp_reg); // Change it over to a full 128bit register - reg.LoadToReg(reg.GetReg()); + reg.Load(host_reg, REG_REG); } - return reg.GetReg(); + return host_reg; + } + case REG_DUP_SINGLE: + { + if (type == REG_LOWER_PAIR_SINGLE) + return host_reg; + + if (type == REG_REG_SINGLE) + { + // Duplicate to the top and change over + m_float_emit->INS(32, host_reg, 1, host_reg, 0); + reg.Load(host_reg, REG_REG_SINGLE); + return host_reg; + } + + m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + reg.Load(host_reg, REG_DUP); + + // fall through } - break; case REG_DUP: { - ARM64Reg host_reg = reg.GetReg(); if (type == REG_REG) { // We are requesting a full 128bit register // but we are only available in the lower 64bits // Duplicate to the top and change over m_float_emit->INS(64, host_reg, 1, host_reg, 0); - reg.LoadToReg(host_reg); + reg.Load(host_reg, REG_REG); } return host_reg; } - break; case REG_NOTLOADED: // Register isn't loaded at /all/ { - ARM64Reg host_reg = GetReg(); + host_reg = GetReg(); u32 load_size; if (type == REG_REG) { load_size = 128; - reg.LoadToReg(host_reg); + reg.Load(host_reg, REG_REG); } else { load_size = 64; - reg.LoadLowerReg(host_reg); + reg.Load(host_reg, REG_LOWER_PAIR); } reg.SetDirty(false); m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); return host_reg; } - break; default: 
_dbg_assert_msg_(DYNA_REC, false, "Invalid OpArg Type!"); break; @@ -380,90 +420,52 @@ ARM64Reg Arm64FPRCache::RW(u32 preg, RegType type) reg.ResetLastUsed(); reg.SetDirty(true); - switch (reg.GetType()) + + // If not loaded at all, just alloc a new one. + if (reg.GetType() == REG_NOTLOADED) { - case REG_NOTLOADED: - { - ARM64Reg host_reg = GetReg(); - if (type == REG_LOWER_PAIR) - { - reg.LoadLowerReg(host_reg); - } - else if (type == REG_DUP) - { - reg.LoadDup(host_reg); - } - else - { - reg.LoadToReg(host_reg); - } + reg.Load(GetReg(), type); + return reg.GetReg(); } - break; - case REG_LOWER_PAIR: + + // Only the lower value will be overwritten, so we must be extra careful to store PSR1 if dirty. + if ((type == REG_LOWER_PAIR || type == REG_LOWER_PAIR_SINGLE) && was_dirty) { + // We must *not* change host_reg as this register might still be in use. So it's fine to + // store this register, but it's *not* fine to convert it to double. So for double conversion, + // a temporary register needs to be used. ARM64Reg host_reg = reg.GetReg(); - if (type == REG_REG) + ARM64Reg flush_reg = host_reg; + + switch (reg.GetType()) { - // Change it over to a full 128bit register - reg.LoadToReg(host_reg); - } - else if (type == REG_DUP) - { - // Register is already the lower pair - // Just convert it over to a dup - reg.LoadDup(host_reg); - } - } - break; - case REG_REG: - { - ARM64Reg host_reg = reg.GetReg(); - if (type == REG_LOWER_PAIR) - { - // If we only want the lower bits, let's store away the high bits and drop to a lower only register + case REG_REG_SINGLE: + flush_reg = GetReg(); + m_float_emit->FCVTL(64, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg)); + // fall through + case REG_REG: // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit store. // It would take longer to do an insert to a temporary and a 64bit store than to just do this.
- if (was_dirty) - m_float_emit->STR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); - reg.LoadLowerReg(host_reg); + m_float_emit->STR(128, INDEX_UNSIGNED, flush_reg, X29, PPCSTATE_OFF(ps[preg][0])); + break; + case REG_DUP_SINGLE: + flush_reg = GetReg(); + m_float_emit->FCVT(64, 32, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg)); + // fall through + case REG_DUP: + // Store PSR1 (which is equal to PSR0) in memory. + m_float_emit->STR(64, INDEX_UNSIGNED, flush_reg, X29, PPCSTATE_OFF(ps[preg][1])); + break; + default: + // All other types don't store anything in PSR1. + break; } - else if (type == REG_DUP) - { - // If we are going from a full 128bit register to a duplicate - // then we can just change over - reg.LoadDup(host_reg); - } - } - break; - case REG_DUP: - { - ARM64Reg host_reg = reg.GetReg(); - if (type == REG_REG) - { - // We are a duplicated register going to a full 128bit register - // Do an insert of our lower 64bits to the higher 64bits - m_float_emit->INS(64, host_reg, 1, host_reg, 0); - // Change over to the full 128bit register - reg.LoadToReg(host_reg); - } - else if (type == REG_LOWER_PAIR) - { - // We are duplicated changing over to a lower register - // We've got to be careful in this instance and do a store of our lower 64bits - // to the upper 64bits in the PowerPC state - // That way incase if we hit the path of DUP->LOWER->REG we get the correct bits back - if (was_dirty) - m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][1])); - reg.LoadLowerReg(host_reg); - } - } - break; - default: - // Do nothing - break; + if (host_reg != flush_reg) + Unlock(flush_reg); } + reg.Load(reg.GetReg(), type); return reg.GetReg(); } @@ -510,17 +512,37 @@ bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg) void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state) { OpArg& reg = m_guest_registers[preg]; - if (reg.GetType() == REG_REG || - reg.GetType() == REG_LOWER_PAIR) + ARM64Reg host_reg =
reg.GetReg(); + RegType type = reg.GetType(); + bool dirty = reg.IsDirty(); + + // If we're in single mode, just convert it back to a double. + if (type == REG_REG_SINGLE) + { + if (dirty) + m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + type = REG_REG; + } + if (type == REG_DUP_SINGLE || type == REG_LOWER_PAIR_SINGLE) + { + if (dirty) + m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + + if (type == REG_DUP_SINGLE) + type = REG_DUP; + else + type = REG_LOWER_PAIR; + } + + if (type == REG_REG || type == REG_LOWER_PAIR) { - ARM64Reg host_reg = reg.GetReg(); u32 store_size; - if (reg.GetType() == REG_REG) + if (type == REG_REG) store_size = 128; else store_size = 64; - if (reg.IsDirty()) + if (dirty) m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); if (!maintain_state) @@ -529,10 +551,9 @@ void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state) reg.Flush(); } } - else if (reg.GetType() == REG_DUP) + else if (type == REG_DUP) { - ARM64Reg host_reg = reg.GetReg(); - if (reg.IsDirty()) + if (dirty) { // If the paired registers were at the start of ppcState we could do an STP here. 
// Too bad moving them would break savestate compatibility between x86_64 and AArch64 @@ -564,18 +585,25 @@ BitSet32 Arm64FPRCache::GetCallerSavedUsed() return registers; } +bool Arm64FPRCache::IsSingle(u32 preg, bool lower_only) +{ + RegType type = m_guest_registers[preg].GetType(); + return type == REG_REG_SINGLE || type == REG_DUP_SINGLE || (lower_only && type == REG_LOWER_PAIR_SINGLE); +} + void Arm64FPRCache::FixSinglePrecision(u32 preg) { - ARM64Reg host_reg = m_guest_registers[preg].GetReg(); - switch (m_guest_registers[preg].GetType()) + OpArg& reg = m_guest_registers[preg]; + ARM64Reg host_reg = reg.GetReg(); + switch (reg.GetType()) { case REG_DUP: // only PS0 needs to be converted m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); - m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + reg.Load(host_reg, REG_DUP_SINGLE); break; case REG_REG: // PS0 and PS1 needs to be converted m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); - m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + reg.Load(host_reg, REG_REG_SINGLE); break; default: break; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index 0d2d1808c4..6616e4b6fb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -22,7 +22,9 @@ enum RegType REG_IMM, // Reg is really a IMM REG_LOWER_PAIR, // Only the lower pair of a paired register REG_DUP, // The lower reg is the same as the upper one (physical upper doesn't actually have the duplicated value) - REG_IS_LOADED, // We don't care what type it is, as long as the lower 64bits are loaded + REG_REG_SINGLE, // Both registers are loaded as single + REG_LOWER_PAIR_SINGLE, // Only the lower pair of a paired register, as single + REG_DUP_SINGLE, // The lower one contains both 
registers, as single }; enum FlushMode @@ -56,19 +58,9 @@ public: { return m_value; } - void LoadToReg(ARM64Reg reg) + void Load(ARM64Reg reg, RegType type = REG_REG) { - m_type = REG_REG; - m_reg = reg; - } - void LoadLowerReg(ARM64Reg reg) - { - m_type = REG_LOWER_PAIR; - m_reg = reg; - } - void LoadDup(ARM64Reg reg) - { - m_type = REG_DUP; + m_type = type; m_reg = reg; } void LoadToImm(u32 imm) @@ -278,6 +270,8 @@ public: BitSet32 GetCallerSavedUsed() override; + bool IsSingle(u32 preg, bool lower_only = false); + void FixSinglePrecision(u32 preg); protected: diff --git a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h index c712918883..4e9f7ca199 100644 --- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h +++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h @@ -9,22 +9,24 @@ struct BackPatchInfo { enum { - FLAG_STORE = (1 << 0), - FLAG_LOAD = (1 << 1), - FLAG_SIZE_8 = (1 << 2), - FLAG_SIZE_16 = (1 << 3), - FLAG_SIZE_32 = (1 << 4), - FLAG_SIZE_F32 = (1 << 5), - FLAG_SIZE_F32X2 = (1 << 6), - FLAG_SIZE_F64 = (1 << 7), - FLAG_REVERSE = (1 << 8), - FLAG_EXTEND = (1 << 9), - FLAG_SIZE_F32I = (1 << 10), - FLAG_ZERO_256 = (1 << 11), - FLAG_MASK_FLOAT = FLAG_SIZE_F32 | - FLAG_SIZE_F32X2 | - FLAG_SIZE_F64 | - FLAG_SIZE_F32I, + FLAG_STORE = (1 << 0), + FLAG_LOAD = (1 << 1), + FLAG_SIZE_8 = (1 << 2), + FLAG_SIZE_16 = (1 << 3), + FLAG_SIZE_32 = (1 << 4), + FLAG_SIZE_F32 = (1 << 5), + FLAG_SIZE_F32X2 = (1 << 6), + FLAG_SIZE_F32X2I = (1 << 7), + FLAG_SIZE_F64 = (1 << 8), + FLAG_REVERSE = (1 << 9), + FLAG_EXTEND = (1 << 10), + FLAG_SIZE_F32I = (1 << 11), + FLAG_ZERO_256 = (1 << 12), + FLAG_MASK_FLOAT = FLAG_SIZE_F32 | + FLAG_SIZE_F32X2 | + FLAG_SIZE_F32X2I | + FLAG_SIZE_F64 | + FLAG_SIZE_F32I, }; static u32 GetFlagSize(u32 flags)