Add BitSet and, as a test, convert some JitRegCache stuff to it.
This is a higher-level, more concise wrapper for bitsets that supports efficient counting and iteration over set bits. It's similar to std::bitset, but the latter does not support efficient iteration (and, at least in libc++, its count algorithm is subpar, not that it really matters). The converted uses include both plain bitsets and, notably, the considerably less efficient regular arrays previously used for the in/out registers in PPCAnalyst. Unfortunately, this may slightly pessimize unoptimized builds.
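For illustration only (not part of the commit), here is roughly how the new BitSet32 type is meant to be used; the register numbers are made up:

#include <cstdio>

#include "Common/BitSet.h"

// Hypothetical usage sketch: initializer-list construction, bool-like indexing,
// bitwise operators, iteration over the indices of set bits, and Count().
static void Example()
{
    BitSet32 live{3, 5, 30};                // registers 3, 5 and 30 are live
    live[7] = true;                         // acts like an array of bools
    for (int reg : live)                    // visits the *indices* of set bits, LSB first
        printf("r%d is live\n", reg);       // r3, r5, r7, r30
    printf("%u live, %u free\n", live.Count(), (~live).Count());  // 4, 28
}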
parent e51676fdf1
commit b6a7438053
@@ -0,0 +1,156 @@
// This file is under the public domain.

#pragma once

#include <initializer_list>
#include <type_traits>
#include "CommonTypes.h"

// Helper functions:

#ifdef _WIN32
template <typename T>
static inline int CountSetBits(T v)
{
    // from https://graphics.stanford.edu/~seander/bithacks.html
    // GCC has this built in, but MSVC's intrinsic will only emit the actual
    // POPCNT instruction, which we're not depending on
    v = v - ((v >> 1) & (T)~(T)0/3);
    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
    v = (v + (v >> 4)) & (T)~(T)0/255*15;
    return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
}
static inline int LeastSignificantSetBit(u32 val)
{
    unsigned long index;
    _BitScanForward(&index, val);
    return (int)index;
}
static inline int LeastSignificantSetBit(u64 val)
{
    unsigned long index;
    _BitScanForward64(&index, val);
    return (int)index;
}
#else
static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
#endif

// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
// using the set bits of an integer to represent a set of integers. Like that
// class, it acts like an array of bools:
//     BitSet32 bs; // use BitSet{32,64} instead of the template directly
//     bs[1] = true;
// but also like the underlying integer ([0] = least significant bit):
//     BitSet32 bs2 = ...;
//     bs = (bs ^ bs2) & BitSet32(0xffff);
// The following additional functionality is provided:
// - Construction using an initializer list.
//     BitSet bs { 1, 2, 4, 8 };
// - Efficiently iterating through the set bits:
//     for (int i : bs)
//         [i is the *index* of a set bit]
//   (This uses the appropriate CPU instruction to find the next set bit in one
//   operation.)
// - Counting set bits using .Count() - see comment on that method.

// TODO: use constexpr when MSVC gets out of the Dark Ages

template <typename IntTy>
class BitSet
{
    static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
public:
    // A reference to a particular bit, returned from operator[].
    class Ref
    {
    public:
        Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
        Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
        operator bool() const { return (m_bs->m_val & m_mask) != 0; }
        bool operator=(bool set)
        {
            m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
            return set;
        }
    private:
        BitSet* m_bs;
        IntTy m_mask;
    };

    // A STL-like iterator is required to be able to use range-based for loops.
    class Iterator
    {
    public:
        Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
        Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
        Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
        int operator*() { return m_bit; }
        Iterator& operator++()
        {
            if (m_val == 0)
            {
                m_bit = -1;
            }
            else
            {
                int bit = LeastSignificantSetBit(m_val);
                m_val &= ~((IntTy)1 << bit); // cast before shifting so BitSet64 clears bits above 31 correctly
                m_bit = bit;
            }
            return *this;
        }
        Iterator operator++(int _)
        {
            Iterator other(*this);
            ++*this;
            return other;
        }
        bool operator==(Iterator other) const { return m_bit == other.m_bit; }
        bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
    private:
        IntTy m_val;
        int m_bit;
    };

    BitSet() : m_val(0) {}
    explicit BitSet(IntTy val) : m_val(val) {}
    BitSet(std::initializer_list<int> init)
    {
        m_val = 0;
        for (int bit : init)
            m_val |= (IntTy)1 << bit;
    }

    Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
    const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
    bool operator==(BitSet other) const { return m_val == other.m_val; }
    bool operator!=(BitSet other) const { return m_val != other.m_val; }
    BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
    BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
    BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
    BitSet operator~() const { return BitSet(~m_val); }
    BitSet& operator|=(BitSet other) { return *this = *this | other; }
    BitSet& operator&=(BitSet other) { return *this = *this & other; }
    BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
    operator u32() = delete;
    operator bool() { return m_val != 0; }

    // Warning: Even though on modern CPUs this is a single fast instruction,
    // Dolphin's official builds do not currently assume POPCNT support on x86,
    // so slower explicit bit twiddling is generated. Still should generally
    // be faster than a loop.
    unsigned int Count() const { return CountSetBits(m_val); }

    Iterator begin() const { Iterator it(m_val, 0); return ++it; }
    Iterator end() const { return Iterator(m_val, -1); }

    IntTy m_val;
};

typedef BitSet<u32> BitSet32;
typedef BitSet<u64> BitSet64;
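As a rough sketch (not part of the commit), the range-based for loop over a BitSet behaves like this hand-rolled loop built from the same helpers, doing one LeastSignificantSetBit() call per set bit rather than testing all 32 positions:

#include "Common/BitSet.h"

// Illustration only: what "for (int i : bs)" does under the hood for a BitSet32.
static void ForEachSetBit(BitSet32 bs, void (*body)(int))
{
    u32 remaining = bs.m_val;
    while (remaining != 0)
    {
        int i = LeastSignificantSetBit(remaining);
        remaining &= remaining - 1; // clear the lowest set bit
        body(i);                    // same order as the BitSet iterator: LSB first
    }
}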
@@ -39,6 +39,7 @@
     <ClInclude Include="Atomic_GCC.h" />
     <ClInclude Include="Atomic_Win32.h" />
     <ClInclude Include="BitField.h" />
+    <ClInclude Include="BitSet.h" />
     <ClInclude Include="BreakPoints.h" />
     <ClInclude Include="CDUtils.h" />
     <ClInclude Include="ChunkFile.h" />
@@ -137,4 +138,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
 </Project>
@@ -13,6 +13,7 @@
     <ClInclude Include="Atomic_GCC.h" />
     <ClInclude Include="Atomic_Win32.h" />
     <ClInclude Include="BitField.h" />
+    <ClInclude Include="BitSet.h" />
     <ClInclude Include="BreakPoints.h" />
     <ClInclude Include="CDUtils.h" />
     <ClInclude Include="ChunkFile.h" />
@@ -118,4 +119,4 @@
   <ItemGroup>
     <Text Include="CMakeLists.txt" />
   </ItemGroup>
 </Project>
@@ -736,29 +736,28 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 			// output, which needs to be bound in the actual instruction compilation.
 			// TODO: make this smarter in the case that we're actually register-starved, i.e.
 			// prioritize the more important registers.
-			for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
+			for (int reg : ops[i].regsIn)
 			{
-				int reg = ops[i].regsIn[k];
-				if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm())
+				if (gpr.NumFreeRegisters() < 2)
+					break;
+				if (ops[i].gprInReg[reg] && !gpr.R(reg).IsImm())
 					gpr.BindToRegister(reg, true, false);
 			}
-			for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
+			for (int reg : ops[i].fregsIn)
 			{
-				int reg = ops[i].fregsIn[k];
-				if (reg >= 0 && (ops[i].fprInXmm & (1 << reg)))
+				if (fpr.NumFreeRegisters() < 2)
+					break;
+				if (ops[i].fprInXmm[reg])
 					fpr.BindToRegister(reg, true, false);
 			}

 			Jit64Tables::CompileInstruction(ops[i]);

 			// If we have a register that will never be used again, flush it.
-			for (int j = 0; j < 32; j++)
-			{
-				if (!(ops[i].gprInUse & (1 << j)))
-					gpr.StoreFromRegister(j);
-				if (!(ops[i].fprInUse & (1 << j)))
-					fpr.StoreFromRegister(j);
-			}
+			for (int j : ~ops[i].gprInUse)
+				gpr.StoreFromRegister(j);
+			for (int j : ~ops[i].fprInUse)
+				fpr.StoreFromRegister(j);

 			if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
 			{
@@ -95,42 +95,38 @@ void RegCache::UnlockAllX()
 		xreg.locked = false;
 }

-u32 GPRRegCache::GetRegUtilization()
+BitSet32 GPRRegCache::GetRegUtilization()
 {
 	return jit->js.op->gprInReg;
 }

-u32 FPURegCache::GetRegUtilization()
+BitSet32 FPURegCache::GetRegUtilization()
 {
 	return jit->js.op->gprInReg;
 }

-u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead)
+BitSet32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead)
 {
-	u32 regsUsed = 0;
+	BitSet32 regsUsed;
 	for (u32 i = 1; i < lookahead; i++)
 	{
-		for (int j = 0; j < 3; j++)
-			if (jit->js.op[i].regsIn[j] >= 0)
-				regsUsed |= 1 << jit->js.op[i].regsIn[j];
-		for (int j = 0; j < 3; j++)
-			if ((size_t)jit->js.op[i].regsIn[j] == preg)
-				return regsUsed;
+		BitSet32 regsIn = jit->js.op[i].regsIn;
+		regsUsed |= regsIn;
+		if (regsIn[preg])
+			return regsUsed;
 	}
 	return regsUsed;
 }

-u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead)
+BitSet32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead)
 {
-	u32 regsUsed = 0;
+	BitSet32 regsUsed;
 	for (u32 i = 1; i < lookahead; i++)
 	{
-		for (int j = 0; j < 4; j++)
-			if (jit->js.op[i].fregsIn[j] >= 0)
-				regsUsed |= 1 << jit->js.op[i].fregsIn[j];
-		for (int j = 0; j < 4; j++)
-			if ((size_t)jit->js.op[i].fregsIn[j] == preg)
-				return regsUsed;
+		BitSet32 regsIn = jit->js.op[i].fregsIn;
+		regsUsed |= regsIn;
+		if (regsIn[preg])
+			return regsUsed;
 	}
 	return regsUsed;
 }
@@ -151,17 +147,14 @@ float RegCache::ScoreRegister(X64Reg xr)

 	// If the register isn't actually needed in a physical register for a later instruction,
 	// writing it back to the register file isn't quite as bad.
-	if (GetRegUtilization() & (1 << preg))
+	if (GetRegUtilization()[preg])
 	{
 		// Don't look too far ahead; we don't want to have quadratic compilation times for
 		// enormous block sizes!
 		// This actually improves register allocation a tiny bit; I'm not sure why.
 		u32 lookahead = std::min(jit->js.instructionsLeft, 64);
 		// Count how many other registers are going to be used before we need this one again.
-		u32 regs_in = CountRegsIn(preg, lookahead);
-		u32 regs_in_count = 0;
-		for (int i = 0; i < 32; i++)
-			regs_in_count += !!(regs_in & (1 << i));
+		u32 regs_in_count = CountRegsIn(preg, lookahead).Count();
 		// Totally ad-hoc heuristic to bias based on how many other registers we'll need
 		// before this one gets used again.
 		score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count));
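As a quick sanity check of that heuristic (worked out here, not taken from the source): with regs_in_count = 0 the term is 1 + 2 * (5 - log2(1)) = 11, while with 31 other registers needed first it is 1 + 2 * (5 - log2(32)) = 1, so the score adjustment shrinks smoothly as more registers get touched before this one is needed again.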
@@ -44,8 +44,8 @@ protected:

 	virtual const int *GetAllocationOrder(size_t& count) = 0;

-	virtual u32 GetRegUtilization() = 0;
-	virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0;
+	virtual BitSet32 GetRegUtilization() = 0;
+	virtual BitSet32 CountRegsIn(size_t preg, u32 lookahead) = 0;

 	Gen::XEmitter *emit;

@@ -137,8 +137,8 @@ public:
 	Gen::OpArg GetDefaultLocation(size_t reg) const override;
 	const int* GetAllocationOrder(size_t& count) override;
 	void SetImmediate32(size_t preg, u32 immValue);
-	u32 GetRegUtilization();
-	u32 CountRegsIn(size_t preg, u32 lookahead);
+	BitSet32 GetRegUtilization() override;
+	BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
 };

@@ -149,6 +149,6 @@ public:
 	void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
 	const int* GetAllocationOrder(size_t& count) override;
 	Gen::OpArg GetDefaultLocation(size_t reg) const override;
-	u32 GetRegUtilization();
-	u32 CountRegsIn(size_t preg, u32 lookahead);
+	BitSet32 GetRegUtilization() override;
+	BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
 };
@@ -249,21 +249,15 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 	// That is, check that none of b's outputs matches any of a's inputs,
 	// and that none of a's outputs matches any of b's inputs.
 	// The latter does not apply if a is a cmp, of course, but doesn't hurt to check.
-	for (int j = 0; j < 3; j++)
-	{
-		int regInA = a.regsIn[j];
-		int regInB = b.regsIn[j];
-		// register collision: b outputs to one of a's inputs
-		if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
-			return false;
-		// register collision: a outputs to one of b's inputs
-		if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
-			return false;
-		// register collision: b outputs to one of a's outputs (overwriting it)
-		for (int k = 0; k < 2; k++)
-			if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
-				return false;
-	}
+	// register collision: b outputs to one of a's inputs
+	if (b.regsOut & a.regsIn)
+		return false;
+	// register collision: a outputs to one of b's inputs
+	if (a.regsOut & b.regsIn)
+		return false;
+	// register collision: b outputs to one of a's outputs (overwriting it)
+	if (b.regsOut & a.regsOut)
+		return false;

 	return true;
 }
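The new checks lean on operator& returning another BitSet32 and on its operator bool(), so a condition like "b.regsOut & a.regsIn" reads as "the two register sets intersect". A standalone sketch with made-up register numbers (not from the source):

#include "Common/BitSet.h"

// Hypothetical values, for illustration only.
static bool CanSwap()
{
    BitSet32 aIn{1, 2};  // instruction a reads r1 and r2
    BitSet32 bOut{2, 5}; // instruction b writes r2 and r5
    if (bOut & aIn)      // non-empty intersection -> operator bool() is true
        return false;    // collision on r2: the two ops can't be swapped
    return true;
}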
@@ -520,42 +514,41 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
 		code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;

-	int numOut = 0;
-	int numIn = 0;
-	int numFloatIn = 0;
+	code->regsIn = BitSet32(0);
+	code->regsOut = BitSet32(0);
 	if (opinfo->flags & FL_OUT_A)
 	{
-		code->regsOut[numOut++] = code->inst.RA;
+		code->regsOut[code->inst.RA] = true;
 		block->m_gpa->SetOutputRegister(code->inst.RA, index);
 	}
 	if (opinfo->flags & FL_OUT_D)
 	{
-		code->regsOut[numOut++] = code->inst.RD;
+		code->regsOut[code->inst.RD] = true;
 		block->m_gpa->SetOutputRegister(code->inst.RD, index);
 	}
 	if (opinfo->flags & FL_OUT_S)
 	{
-		code->regsOut[numOut++] = code->inst.RS;
+		code->regsOut[code->inst.RS] = true;
 		block->m_gpa->SetOutputRegister(code->inst.RS, index);
 	}
 	if ((opinfo->flags & FL_IN_A) || ((opinfo->flags & FL_IN_A0) && code->inst.RA != 0))
 	{
-		code->regsIn[numIn++] = code->inst.RA;
+		code->regsIn[code->inst.RA] = true;
 		block->m_gpa->SetInputRegister(code->inst.RA, index);
 	}
 	if (opinfo->flags & FL_IN_B)
 	{
-		code->regsIn[numIn++] = code->inst.RB;
+		code->regsIn[code->inst.RB] = true;
 		block->m_gpa->SetInputRegister(code->inst.RB, index);
 	}
 	if (opinfo->flags & FL_IN_C)
 	{
-		code->regsIn[numIn++] = code->inst.RC;
+		code->regsIn[code->inst.RC] = true;
 		block->m_gpa->SetInputRegister(code->inst.RC, index);
 	}
 	if (opinfo->flags & FL_IN_S)
 	{
-		code->regsIn[numIn++] = code->inst.RS;
+		code->regsIn[code->inst.RS] = true;
 		block->m_gpa->SetInputRegister(code->inst.RS, index);
 	}
@@ -564,24 +557,17 @@
 		code->fregOut = code->inst.FD;
 	else if (opinfo->flags & FL_OUT_FLOAT_S)
 		code->fregOut = code->inst.FS;
+	code->fregsIn = BitSet32(0);
 	if (opinfo->flags & FL_IN_FLOAT_A)
-		code->fregsIn[numFloatIn++] = code->inst.FA;
+		code->fregsIn[code->inst.FA] = true;
 	if (opinfo->flags & FL_IN_FLOAT_B)
-		code->fregsIn[numFloatIn++] = code->inst.FB;
+		code->fregsIn[code->inst.FB] = true;
 	if (opinfo->flags & FL_IN_FLOAT_C)
-		code->fregsIn[numFloatIn++] = code->inst.FC;
+		code->fregsIn[code->inst.FC] = true;
 	if (opinfo->flags & FL_IN_FLOAT_D)
-		code->fregsIn[numFloatIn++] = code->inst.FD;
+		code->fregsIn[code->inst.FD] = true;
 	if (opinfo->flags & FL_IN_FLOAT_S)
-		code->fregsIn[numFloatIn++] = code->inst.FS;
-
-	// Set remaining register slots as unused (-1)
-	for (int j = numIn; j < 3; j++)
-		code->regsIn[j] = -1;
-	for (int j = numOut; j < 2; j++)
-		code->regsOut[j] = -1;
-	for (int j = numFloatIn; j < 4; j++)
-		code->fregsIn[j] = -1;
+		code->fregsIn[code->inst.FS] = true;

 	switch (opinfo->type)
 	{
@@ -797,7 +783,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 	// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
 	// wants flags, to be safe.
 	bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true;
-	u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0;
+	BitSet32 fprInUse, gprInUse, gprInReg, fprInXmm;
 	for (int i = block->m_num_instructions - 1; i >= 0; i--)
 	{
 		bool opWantsCR0 = code[i].wantsCR0;
@@ -822,30 +808,20 @@
 		code[i].fprInXmm = fprInXmm;
 		// TODO: if there's no possible endblocks or exceptions in between, tell the regcache
 		// we can throw away a register if it's going to be overwritten later.
-		for (int j = 0; j < 3; j++)
-			if (code[i].regsIn[j] >= 0)
-			{
-				gprInUse |= 1 << code[i].regsIn[j];
-				gprInReg |= 1 << code[i].regsIn[j];
-			}
-		for (int j = 0; j < 4; j++)
-			if (code[i].fregsIn[j] >= 0)
-			{
-				fprInUse |= 1 << code[i].fregsIn[j];
-				if (strncmp(code[i].opinfo->opname, "stfd", 4))
-					fprInXmm |= 1 << code[i].fregsIn[j];
-			}
+		gprInUse |= code[i].regsIn;
+		gprInReg |= code[i].regsIn;
+		fprInUse |= code[i].fregsIn;
+		if (strncmp(code[i].opinfo->opname, "stfd", 4))
+			fprInXmm |= code[i].fregsIn;
 		// For now, we need to count output registers as "used" though; otherwise the flush
 		// will result in a redundant store (e.g. store to regcache, then store again to
 		// the same location later).
-		for (int j = 0; j < 2; j++)
-			if (code[i].regsOut[j] >= 0)
-				gprInUse |= 1 << code[i].regsOut[j];
+		gprInUse |= code[i].regsOut;
 		if (code[i].fregOut >= 0)
 		{
-			fprInUse |= 1 << code[i].fregOut;
+			fprInUse[code[i].fregOut] = true;
 			if (strncmp(code[i].opinfo->opname, "stfd", 4))
-				fprInXmm |= 1 << code[i].fregOut;
+				fprInXmm[code[i].fregOut] = true;
 		}
 	}
 	return address;
@@ -10,6 +10,7 @@
 #include <string>
 #include <vector>

+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Core/PowerPC/PPCTables.h"
@@ -26,10 +27,10 @@ struct CodeOp //16B
 	u32 address;
 	u32 branchTo; //if 0, not a branch
 	int branchToIndex; //index of target block
-	s8 regsOut[2];
-	s8 regsIn[3];
+	BitSet32 regsOut;
+	BitSet32 regsIn;
+	BitSet32 fregsIn;
 	s8 fregOut;
-	s8 fregsIn[4];
 	bool isBranchTarget;
 	bool wantsCR0;
 	bool wantsCR1;
@@ -43,13 +44,13 @@ struct CodeOp //16B
 	bool canEndBlock;
 	bool skip; // followed BL-s for example
 	// which registers are still needed after this instruction in this block
-	u32 fprInUse;
-	u32 gprInUse;
+	BitSet32 fprInUse;
+	BitSet32 gprInUse;
 	// just because a register is in use doesn't mean we actually need or want it in an x86 register.
-	u32 gprInReg;
+	BitSet32 gprInReg;
 	// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
 	// an XMM only to move it again to a GPR afterwards.
-	u32 fprInXmm;
+	BitSet32 fprInXmm;
 };

 struct BlockStats