Jit64: Clean up ExtractWithByteOffset

RCOpArg::ExtractWithByteOffset is only used in one place: a special case
of rlwinmx. ExtractWithByteOffset first stores the value of the
specified register into m_ppc_state (unless it's already there), and
then returns an offset into m_ppc_state. Our use of this function has
two undesirable properties (except in the trivial case `offset == 0`):

1. ExtractWithByteOffset calls StoreFromRegister without going through
   any of the usual functions. This violated an assumption I made when
   working on my constant propagation PR and led to a hard-to-find bug.
2. If the specified register is in a host register and is dirty,
   ExtractWithByteOffset will store its value to m_ppc_state even when
   it's unnecessary. In particular, this always happens when rlwinmx
   uses the same register as input and output, since rlwinmx always
   allocates a host register for the output and marks it as dirty.

Since ExtractWithByteOffset is only used in one place, I figure we might
as well inline it there. This commit does that, and also alters
rlwinmx's logic so the special case code is only triggered when the
input is already in m_ppc_state.

Input in `m_ppc_state`, before (11 bytes):

mov         esi, dword ptr [rbp-104]
mov         dword ptr [rbp-104], esi
movzx       esi, byte ptr [rbp-101]

Input in `m_ppc_state`, after (5 bytes):

movzx       esi, byte ptr [rbp-101]

Input in host register, before (8 bytes):

mov         dword ptr [rbp-104], esi
movzx       esi, byte ptr [rbp-101]

Input in host register, after (3 bytes):

shr         edi, 0x18
This commit is contained in:
JosJuice 2024-03-30 20:29:41 +01:00
parent f30d3c9092
commit fb75ae410f
3 changed files with 66 additions and 66 deletions

View File

@ -2065,69 +2065,83 @@ void Jit64::rlwinmx(UGeckoInstruction inst)
bool needs_sext = true;
int mask_size = inst.ME - inst.MB + 1;
RCOpArg Rs = gpr.Use(s, RCMode::Read);
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Rs, Ra);
if (simple_mask && !(inst.SH & (mask_size - 1)) && !gpr.IsBound(s))
{
// optimized case: byte/word extract from m_ppc_state
if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
{
LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
}
// common optimized case: byte/word extract
else if (simple_mask && !(inst.SH & (mask_size - 1)))
{
MOVZX(32, mask_size, Ra, Rs.ExtractWithByteOffset(inst.SH ? (32 - inst.SH) >> 3 : 0));
needs_sext = false;
}
// another optimized special case: byte/word extract plus rotate
else if (simple_prerotate_mask && !left_shift)
{
MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
// Note: If a == s, calling Realize(Ra) will allocate a host register for Rs,
// so we have to get mem_source from Rs before calling Realize(Ra)
RCOpArg Rs = gpr.Use(s, RCMode::Read);
RegCache::Realize(Rs);
OpArg mem_source = Rs.Location();
if (inst.SH)
ROL(32, Ra, Imm8(inst.SH));
needs_sext = (mask & 0x80000000) != 0;
}
// Use BEXTR where possible: Only AMD implements this in one uop
else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
{
MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
BEXTR(32, Ra, Rs, RSCRATCH);
needs_sext = false;
}
else if (left_shift)
{
if (a != s)
MOV(32, Ra, Rs);
mem_source.AddMemOffset((32 - inst.SH) >> 3);
Rs.Unlock();
SHL(32, Ra, Imm8(inst.SH));
}
else if (right_shift)
{
if (a != s)
MOV(32, Ra, Rs);
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Ra);
MOVZX(32, mask_size, Ra, mem_source);
SHR(32, Ra, Imm8(inst.MB));
needs_sext = false;
}
else
{
RotateLeft(32, Ra, Rs, inst.SH);
RCOpArg Rs = gpr.Use(s, RCMode::Read);
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Rs, Ra);
if (!(inst.MB == 0 && inst.ME == 31))
if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
{
// we need flags if we're merging the branch
if (inst.Rc && CheckMergedBranch(0))
AND(32, Ra, Imm32(mask));
else
AndWithMask(Ra, mask);
needs_sext = inst.MB == 0;
needs_test = false;
LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
}
// optimized case: byte/word extract plus rotate
else if (simple_prerotate_mask && !left_shift)
{
MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
if (inst.SH)
ROL(32, Ra, Imm8(inst.SH));
needs_sext = (mask & 0x80000000) != 0;
}
// Use BEXTR where possible: Only AMD implements this in one uop
else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
{
MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
BEXTR(32, Ra, Rs, RSCRATCH);
needs_sext = false;
}
else if (left_shift)
{
if (a != s)
MOV(32, Ra, Rs);
SHL(32, Ra, Imm8(inst.SH));
}
else if (right_shift)
{
if (a != s)
MOV(32, Ra, Rs);
SHR(32, Ra, Imm8(inst.MB));
needs_sext = false;
}
else
{
RotateLeft(32, Ra, Rs, inst.SH);
if (!(inst.MB == 0 && inst.ME == 31))
{
// we need flags if we're merging the branch
if (inst.Rc && CheckMergedBranch(0))
AND(32, Ra, Imm32(mask));
else
AndWithMask(Ra, mask);
needs_sext = inst.MB == 0;
needs_test = false;
}
}
}
Rs.Unlock();
Ra.Unlock();
if (inst.Rc)
ComputeRC(a, needs_test, needs_sext);
}

View File

@ -109,19 +109,6 @@ OpArg RCOpArg::Location() const
return {};
}
OpArg RCOpArg::ExtractWithByteOffset(int offset)
{
if (offset == 0)
return Location();
ASSERT(rc);
const preg_t preg = std::get<preg_t>(contents);
rc->StoreFromRegister(preg, RegCache::FlushMode::MaintainState);
OpArg result = rc->GetDefaultLocation(preg);
result.AddMemOffset(offset);
return result;
}
void RCOpArg::Unlock()
{
if (const preg_t* preg = std::get_if<preg_t>(&contents))

View File

@ -47,9 +47,6 @@ public:
bool IsSimpleReg(Gen::X64Reg reg) const { return Location().IsSimpleReg(reg); }
Gen::X64Reg GetSimpleReg() const { return Location().GetSimpleReg(); }
// Use to extract bytes from a register using the regcache. offset is in bytes.
Gen::OpArg ExtractWithByteOffset(int offset);
void Unlock();
bool IsImm() const;
@ -159,6 +156,8 @@ public:
u32 Imm32(preg_t preg) const { return R(preg).Imm32(); }
s32 SImm32(preg_t preg) const { return R(preg).SImm32(); }
bool IsBound(preg_t preg) const { return m_regs[preg].IsBound(); }
RCOpArg Use(preg_t preg, RCMode mode);
RCOpArg UseNoImm(preg_t preg, RCMode mode);
RCOpArg BindOrImm(preg_t preg, RCMode mode);