diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 9925fd4c0d..370ab5c671 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -2065,69 +2065,83 @@ void Jit64::rlwinmx(UGeckoInstruction inst)
     bool needs_sext = true;
     int mask_size = inst.ME - inst.MB + 1;
 
-    RCOpArg Rs = gpr.Use(s, RCMode::Read);
-    RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
-    RegCache::Realize(Rs, Ra);
+    if (simple_mask && !(inst.SH & (mask_size - 1)) && !gpr.IsBound(s))
+    {
+      // optimized case: byte/word extract from m_ppc_state
 
-    if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
-    {
-      LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
-    }
-    // common optimized case: byte/word extract
-    else if (simple_mask && !(inst.SH & (mask_size - 1)))
-    {
-      MOVZX(32, mask_size, Ra, Rs.ExtractWithByteOffset(inst.SH ? (32 - inst.SH) >> 3 : 0));
-      needs_sext = false;
-    }
-    // another optimized special case: byte/word extract plus rotate
-    else if (simple_prerotate_mask && !left_shift)
-    {
-      MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
+      // Note: If a == s, calling Realize(Ra) will allocate a host register for Rs,
+      // so we have to get mem_source from Rs before calling Realize(Ra)
+
+      RCOpArg Rs = gpr.Use(s, RCMode::Read);
+      RegCache::Realize(Rs);
+      OpArg mem_source = Rs.Location();
       if (inst.SH)
-        ROL(32, Ra, Imm8(inst.SH));
-      needs_sext = (mask & 0x80000000) != 0;
-    }
-    // Use BEXTR where possible: Only AMD implements this in one uop
-    else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
-    {
-      MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
-      BEXTR(32, Ra, Rs, RSCRATCH);
-      needs_sext = false;
-    }
-    else if (left_shift)
-    {
-      if (a != s)
-        MOV(32, Ra, Rs);
+        mem_source.AddMemOffset((32 - inst.SH) >> 3);
+      Rs.Unlock();
 
-      SHL(32, Ra, Imm8(inst.SH));
-    }
-    else if (right_shift)
-    {
-      if (a != s)
-        MOV(32, Ra, Rs);
+      RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
+      RegCache::Realize(Ra);
+      MOVZX(32, mask_size, Ra, mem_source);
 
-      SHR(32, Ra, Imm8(inst.MB));
       needs_sext = false;
     }
     else
     {
-      RotateLeft(32, Ra, Rs, inst.SH);
+      RCOpArg Rs = gpr.Use(s, RCMode::Read);
+      RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
+      RegCache::Realize(Rs, Ra);
 
-      if (!(inst.MB == 0 && inst.ME == 31))
+      if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
       {
-        // we need flags if we're merging the branch
-        if (inst.Rc && CheckMergedBranch(0))
-          AND(32, Ra, Imm32(mask));
-        else
-          AndWithMask(Ra, mask);
-        needs_sext = inst.MB == 0;
-        needs_test = false;
+        LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
+      }
+      // optimized case: byte/word extract plus rotate
+      else if (simple_prerotate_mask && !left_shift)
+      {
+        MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
+        if (inst.SH)
+          ROL(32, Ra, Imm8(inst.SH));
+        needs_sext = (mask & 0x80000000) != 0;
+      }
+      // Use BEXTR where possible: Only AMD implements this in one uop
+      else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
+      {
+        MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
+        BEXTR(32, Ra, Rs, RSCRATCH);
+        needs_sext = false;
+      }
+      else if (left_shift)
+      {
+        if (a != s)
+          MOV(32, Ra, Rs);
+
+        SHL(32, Ra, Imm8(inst.SH));
+      }
+      else if (right_shift)
+      {
+        if (a != s)
+          MOV(32, Ra, Rs);
+
+        SHR(32, Ra, Imm8(inst.MB));
+        needs_sext = false;
+      }
+      else
+      {
+        RotateLeft(32, Ra, Rs, inst.SH);
+
+        if (!(inst.MB == 0 && inst.ME == 31))
+        {
+          // we need flags if we're merging the branch
+          if (inst.Rc && CheckMergedBranch(0))
+            AND(32, Ra, Imm32(mask));
+          else
+            AndWithMask(Ra, mask);
+          needs_sext = inst.MB == 0;
+          needs_test = false;
+        }
       }
     }
 
-    Rs.Unlock();
-    Ra.Unlock();
-
     if (inst.Rc)
       ComputeRC(a, needs_test, needs_sext);
 }
diff --git a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp
index 86f1d9e15a..c647ed32d6 100644
--- a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp
@@ -109,19 +109,6 @@ OpArg RCOpArg::Location() const
   return {};
 }
 
-OpArg RCOpArg::ExtractWithByteOffset(int offset)
-{
-  if (offset == 0)
-    return Location();
-
-  ASSERT(rc);
-  const preg_t preg = std::get<preg_t>(contents);
-  rc->StoreFromRegister(preg, RegCache::FlushMode::MaintainState);
-  OpArg result = rc->GetDefaultLocation(preg);
-  result.AddMemOffset(offset);
-  return result;
-}
-
 void RCOpArg::Unlock()
 {
   if (const preg_t* preg = std::get_if<preg_t>(&contents))
diff --git a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h
index 158c41c02d..f025190312 100644
--- a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h
+++ b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h
@@ -47,9 +47,6 @@ public:
   bool IsSimpleReg(Gen::X64Reg reg) const { return Location().IsSimpleReg(reg); }
   Gen::X64Reg GetSimpleReg() const { return Location().GetSimpleReg(); }
 
-  // Use to extract bytes from a register using the regcache. offset is in bytes.
-  Gen::OpArg ExtractWithByteOffset(int offset);
-
   void Unlock();
 
   bool IsImm() const;
@@ -159,6 +156,8 @@ public:
   u32 Imm32(preg_t preg) const { return R(preg).Imm32(); }
   s32 SImm32(preg_t preg) const { return R(preg).SImm32(); }
 
+  bool IsBound(preg_t preg) const { return m_regs[preg].IsBound(); }
+
   RCOpArg Use(preg_t preg, RCMode mode);
   RCOpArg UseNoImm(preg_t preg, RCMode mode);
   RCOpArg BindOrImm(preg_t preg, RCMode mode);
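
Note (reviewer sketch, not part of the patch): the sketch below models the condition the new fast path keys on. When the rotation mask selects a single aligned byte or halfword and the source GPR is not bound to a host register, the whole rlwinm collapses to one MOVZX load from the guest register's slot in m_ppc_state, at the same (32 - inst.SH) >> 3 byte offset the patch applies to mem_source (little-endian host assumed, as in the x86-64 JIT). MakeRotationMask mirrors Dolphin's helper of the same name; IsSimpleExtract is a hypothetical name used only for this illustration.

// Standalone sketch, not part of the patch.
#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;

// PowerPC MASK(MB, ME): ones from bit MB through bit ME, where bit 0 is the MSB.
constexpr u32 MakeRotationMask(u32 mb, u32 me)
{
  const u32 begin = 0xFFFFFFFF >> mb;
  const u32 end = 0x7FFFFFFF >> me;
  const u32 mask = begin ^ end;
  return mb > me ? ~mask : mask;  // MB > ME wraps the mask around
}

// True when ROTL32(rS, SH) & MASK(MB, ME) is just "zero-extend one aligned
// byte/halfword of rS" -- the case the patch now serves with a single MOVZX
// load from the guest register's memory slot.
constexpr bool IsSimpleExtract(u32 sh, u32 mb, u32 me, int& byte_offset, int& load_bits)
{
  const u32 mask = MakeRotationMask(mb, me);
  const int mask_size = static_cast<int>(me - mb + 1);
  const bool simple_mask = mask == 0xff || mask == 0xffff;
  if (!simple_mask || (sh & (mask_size - 1)) != 0)
    return false;
  // The low mask_size bits of ROTL32(rS, SH) are bits (32 - SH) upward of rS
  // (mod 32); on a little-endian host they sit at this offset in the stored u32.
  byte_offset = sh ? static_cast<int>(32 - sh) >> 3 : 0;
  load_bits = mask_size;
  return true;
}

int main()
{
  int offset = 0, bits = 0;
  // rlwinm rA, rS, 16, 24, 31: rotate left by 16, keep the low byte. That byte
  // is byte 2 of rS in memory, so the JIT can emit MOVZX(32, 8, Ra, [slot + 2]).
  if (IsSimpleExtract(16, 24, 31, offset, bits))
    std::printf("MOVZX %d bits from [rS slot + %d]\n", bits, offset);
  return 0;
}

The !gpr.IsBound(s) guard is what makes the memory read safe: an unbound guest register is authoritative in memory, so the patch also sidesteps the old ExtractWithByteOffset behaviour of flushing a bound register (StoreFromRegister) just to re-read it from m_ppc_state.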