Merge pull request #1307 from comex/bitset

Higher level bitset wrapper
2014-10-28 23:39:35 -04:00 · 2014-10-28 23:39:35 -04:00 · 089e32ba7d
parent 7747c9efbb c81e3da22f
commit 089e32ba7d
31 changed files with 492 additions and 307 deletions
--- a/Source/Core/Common/BitSet.h
+++ b/Source/Core/Common/BitSet.h
@ -0,0 +1,166 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <initializer_list>
+#include <type_traits>
+#include "CommonTypes.h"
+
+// Helper functions:
+
+#ifdef _WIN32
+// Returns the number of set bits in v (population count).
+// BitSet::Count() calls this with the bitset's underlying unsigned integer.
+template <typename T>
+static inline int CountSetBits(T v)
+{
+	// from https://graphics.stanford.edu/~seander/bithacks.html
+	// GCC has this built in, but MSVC's intrinsic will only emit the actual
+	// POPCNT instruction, which we're not depending on
+	// (T)~(T)0/3 is the 0x55..55 mask: each 2-bit field now holds its own bit count.
+	v = v - ((v >> 1) & (T)~(T)0/3);
+	// (T)~(T)0/15*3 is the 0x33..33 mask: add neighbouring 2-bit counts into 4-bit fields.
+	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
+	// (T)~(T)0/255*15 is the 0x0f..0f mask: add neighbouring nibbles into per-byte counts.
+	v = (v + (v >> 4)) & (T)~(T)0/255*15;
+	// Multiplying by 0x01..01 accumulates every byte count into the top byte,
+	// which the shift then moves down to the low bits.
+	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
+}
+// Returns the index of the lowest set bit of val (0 = least significant bit).
+// val must be non-zero: the return value of _BitScanForward is ignored here,
+// so for val == 0 the uninitialized index would be returned.
+static inline int LeastSignificantSetBit(u32 val)
+{
+	unsigned long index;
+	_BitScanForward(&index, val);
+	return (int)index;
+}
+// 64-bit overload of the above; the same non-zero precondition applies.
+static inline int LeastSignificantSetBit(u64 val)
+{
+	unsigned long index;
+	_BitScanForward64(&index, val);
+	return (int)index;
+}
+#else
+// Non-Windows builds forward directly to the compiler builtins.
+// NOTE(review): the ctz builtins are documented as undefined for a zero
+// argument, so LeastSignificantSetBit must only be called with val != 0.
+static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
+static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
+static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
+#endif
+
+// namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
+namespace BS
+{
+
+// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
+// using the set bits of an integer to represent a set of integers.  Like that
+// class, it acts like an array of bools:
+//     BitSet32 bs;
+//     bs[1] = true;
+// but also like the underlying integer ([0] = least significant bit):
+//     BitSet32 bs2 = ...;
+//     bs = (bs ^ bs2) & BitSet32(0xffff);
+// The following additional functionality is provided:
+// - Construction using an initializer list.
+//     BitSet32 bs { 1, 2, 4, 8 };
+// - Efficiently iterating through the set bits:
+//     for (int i : bs)
+//         [i is the *index* of a set bit]
+//   (This uses the appropriate CPU instruction to find the next set bit in one
+//   operation.)
+// - Counting set bits using .Count() - see comment on that method.
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+// A set of small non-negative integers stored as the bits of a single
+// unsigned integer of type IntTy (bit 0 is the least significant bit).
+template <typename IntTy>
+class BitSet
+{
+	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+public:
+	// A reference to a particular bit, returned from operator[].
+	// Converting it to bool reads the bit; assigning a bool writes the bit
+	// back into the BitSet it was obtained from.
+	class Ref
+	{
+	public:
+		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+		bool operator=(bool set)
+		{
+			// Clear the referenced bit, then OR it back in if requested.
+			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+			return set;
+		}
+	private:
+		BitSet* m_bs;   // bitset being referenced
+		IntTy m_mask;   // single-bit mask selecting the referenced bit
+	};
+
+	// A STL-like iterator is required to be able to use range-based for loops.
+	// m_val holds the bits that have not been visited yet; m_bit is the index
+	// most recently produced, or -1 once the bits are exhausted.  Iterators
+	// compare on m_bit only, so any exhausted iterator equals end().
+	class Iterator
+	{
+	public:
+		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+		Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
+		// Yields the index of the current set bit.
+		int operator*() { return m_bit; }
+		// Advances to the next set bit, or to the end state (-1) if none remain.
+		Iterator& operator++()
+		{
+			if (m_val == 0)
+			{
+				m_bit = -1;
+			}
+			else
+			{
+				int bit = LeastSignificantSetBit(m_val);
+				// Build the mask in IntTy.  With a plain int 1, clearing
+				// bit 31 of a 64-bit set would also wipe the upper 32 bits,
+				// and bit indices of 32 and above would be an undefined shift.
+				m_val &= ~((IntTy)1 << bit);
+				m_bit = bit;
+			}
+			return *this;
+		}
+		Iterator operator++(int _)
+		{
+			Iterator other(*this);
+			++*this;
+			return other;
+		}
+		bool operator==(Iterator other) const { return m_bit == other.m_bit; }
+		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+	private:
+		IntTy m_val;
+		int m_bit;
+	};
+
+	// Empty set.
+	BitSet() : m_val(0) {}
+	// Set whose members are the set bits of val.
+	explicit BitSet(IntTy val) : m_val(val) {}
+	// Set containing exactly the listed bit indices.
+	BitSet(std::initializer_list<int> init)
+	{
+		m_val = 0;
+		for (int bit : init)
+			m_val |= (IntTy)1 << bit;
+	}
+
+	// Returns a set with the lowest `count` bits set.  The full-width case is
+	// handled separately because shifting by the width of IntTy is undefined.
+	static BitSet AllTrue(size_t count)
+	{
+		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
+	}
+
+	// Bit access; the const overload reuses the non-const one via const_cast.
+	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+	// Comparison and bitwise operators act on the underlying integer.
+	bool operator==(BitSet other) const { return m_val == other.m_val; }
+	bool operator!=(BitSet other) const { return m_val != other.m_val; }
+	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+	BitSet operator~() const { return BitSet(~m_val); }
+	BitSet& operator|=(BitSet other) { return *this = *this | other; }
+	BitSet& operator&=(BitSet other) { return *this = *this & other; }
+	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+	// Conversion to u32 is explicitly deleted; use m_val for the raw bits.
+	operator u32() = delete;
+	// True if any bit is set.
+	operator bool() { return m_val != 0; }
+
+	// Warning: Even though on modern CPUs this is a single fast instruction,
+	// Dolphin's official builds do not currently assume POPCNT support on x86,
+	// so slower explicit bit twiddling is generated.  Still should generally
+	// be faster than a loop.
+	unsigned int Count() const { return CountSetBits(m_val); }
+
+	// begin() starts from a dummy position and advances once so that it lands
+	// on the first set bit (or on the end state if the set is empty).
+	Iterator begin() const { Iterator it(m_val, 0); return ++it; }
+	// end() is identified solely by m_bit == -1.
+	Iterator end() const { return Iterator(m_val, -1); }
+
+	// The underlying integer; public so callers can read the raw bits.
+	IntTy m_val;
+};
+
+}
+
+typedef BS::BitSet<u32> BitSet32;
+typedef BS::BitSet<u64> BitSet64;
--- a/Source/Core/Common/Common.vcxproj
+++ b/Source/Core/Common/Common.vcxproj
@ -39,6 +39,7 @@
    <ClInclude Include="Atomic_GCC.h" />
    <ClInclude Include="Atomic_Win32.h" />
    <ClInclude Include="BitField.h" />
+    <ClInclude Include="BitSet.h" />
    <ClInclude Include="BreakPoints.h" />
    <ClInclude Include="CDUtils.h" />
    <ClInclude Include="ChunkFile.h" />
@ -137,4 +138,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/Source/Core/Common/Common.vcxproj.filters
+++ b/Source/Core/Common/Common.vcxproj.filters
@ -13,6 +13,7 @@
    <ClInclude Include="Atomic_GCC.h" />
    <ClInclude Include="Atomic_Win32.h" />
    <ClInclude Include="BitField.h" />
+    <ClInclude Include="BitSet.h" />
    <ClInclude Include="BreakPoints.h" />
    <ClInclude Include="CDUtils.h" />
    <ClInclude Include="ChunkFile.h" />
@ -118,4 +119,4 @@
  <ItemGroup>
    <Text Include="CMakeLists.txt" />
  </ItemGroup>
-</Project>
+</Project>
--- a/Source/Core/Common/x64ABI.cpp
+++ b/Source/Core/Common/x64ABI.cpp
@ -10,31 +10,23 @@ using namespace Gen;

 // Shared code between Win64 and Unix64

-void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
+void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
 {
 	size_t shadow = 0;
 #if defined(_WIN32)
 	shadow = 0x20;
 #endif

-	int count = 0;
-	for (int r = 0; r < 16; r++)
-	{
-		if (mask & (1 << r))
-			count++;
-	}
+	int count = (mask & ABI_ALL_GPRS).Count();
 	rsp_alignment -= count * 8;
 	size_t subtraction = 0;
-	if (mask & 0xffff0000)
+	int fpr_count = (mask & ABI_ALL_FPRS).Count();
+	if (fpr_count)
 	{
 		// If we have any XMMs to save, we must align the stack here.
 		subtraction = rsp_alignment & 0xf;
 	}
-	for (int x = 0; x < 16; x++)
-	{
-		if (mask & (1 << (16 + x)))
-			subtraction += 16;
-	}
+	subtraction += 16 * fpr_count;
 	size_t xmm_base_subtraction = subtraction;
 	subtraction += needed_frame_size;
 	subtraction += shadow;
@ -47,44 +39,35 @@ void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t nee
 	*xmm_offsetp = subtraction - xmm_base_subtraction;
 }

-size_t XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size)
+size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
 {
 	size_t shadow, subtraction, xmm_offset;
 	ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);

-	for (int r = 0; r < 16; r++)
-	{
-		if (mask & (1 << r))
-			PUSH((X64Reg) r);
-	}
+	for (int r : mask & ABI_ALL_GPRS)
+		PUSH((X64Reg) r);

 	if (subtraction)
 		SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));

-	for (int x = 0; x < 16; x++)
+	for (int x : mask & ABI_ALL_FPRS)
 	{
-		if (mask & (1 << (16 + x)))
-		{
-			MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) x);
-			xmm_offset += 16;
-		}
+		MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) (x - 16));
+		xmm_offset += 16;
 	}

 	return shadow;
 }

-void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size)
+void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
 {
 	size_t shadow, subtraction, xmm_offset;
 	ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);

-	for (int x = 0; x < 16; x++)
+	for (int x : mask & ABI_ALL_FPRS)
 	{
-		if (mask & (1 << (16 + x)))
-		{
-			MOVAPD((X64Reg) x, MDisp(RSP, (int)xmm_offset));
-			xmm_offset += 16;
-		}
+		MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset));
+		xmm_offset += 16;
 	}

 	if (subtraction)
@ -92,10 +75,8 @@ void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, si

 	for (int r = 15; r >= 0; r--)
 	{
-		if (mask & (1 << r))
-		{
+		if (mask[r])
 			POP((X64Reg) r);
-		}
 	}
 }

--- a/Source/Core/Common/x64ABI.h
+++ b/Source/Core/Common/x64ABI.h
@ -4,6 +4,7 @@

 #pragma once

+#include "Common/BitSet.h"
 #include "Common/x64Emitter.h"

 // x64 ABI:s, and helpers to help follow them when JIT-ing code.
@ -23,6 +24,9 @@
 // Callee-save:  RBX RBP R12 R13 R14 R15
 // Parameters:   RDI RSI RDX RCX R8 R9

+#define ABI_ALL_FPRS BitSet32(0xffff0000)
+#define ABI_ALL_GPRS BitSet32(0x0000ffff)
+
 #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention

 #define ABI_PARAM1 RCX
@ -31,11 +35,9 @@
 #define ABI_PARAM4 R9

 // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
-#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \
-                              (1 << R9) | (1 << R10) | (1 << R11) | \
-                              (1 << (XMM0+16)) | (1 << (XMM1+16)) | (1 << (XMM2+16)) | (1 << (XMM3+16)) | \
-                              (1 << (XMM4+16)) | (1 << (XMM5+16)))
-
+#define ABI_ALL_CALLER_SAVED \
+	(BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \
+	            XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 })
 #else  //64-bit Unix / OS X

 #define ABI_PARAM1 RDI
@ -47,13 +49,12 @@

 // FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
 // don't actually clobber them.
-#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \
-                              (1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \
-                              0xffff0000 /* xmm0..15 */)
-
+#define ABI_ALL_CALLER_SAVED \
+	(BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \
+	 ABI_ALL_FPRS)
 #endif // WIN32

-#define ABI_ALL_CALLEE_SAVED ((u32) ~ABI_ALL_CALLER_SAVED)
+#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)

 #define ABI_RETURN RAX

--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@ -10,6 +10,7 @@
 #include <cstring>
 #include <functional>

+#include "Common/BitSet.h"
 #include "Common/CodeBlock.h"
 #include "Common/CommonTypes.h"

@ -302,7 +303,7 @@ private:
 	void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);

-	void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+	void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);

 protected:
 	inline void Write8(u8 value)   {*code++ = value;}
@ -883,8 +884,8 @@ public:
 	// Saves/restores the registers and adjusts the stack to be aligned as
 	// required by the ABI, where the previous alignment was as specified.
 	// Push returns the size of the shadow space, i.e. the offset of the frame.
-	size_t ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
-	void ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+	size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+	void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);

 	inline int ABI_GetNumXMMRegs() { return 16; }

--- a/Source/Core/Core/DSP/DSPEmitter.cpp
+++ b/Source/Core/Core/DSP/DSPEmitter.cpp
@ -385,7 +385,7 @@ void DSPEmitter::CompileDispatcher()
 {
 	enterDispatcher = AlignCode16();
 	// We don't use floating point (high 16 bits).
-	u32 registers_used = ABI_ALL_CALLEE_SAVED & 0xffff;
+	BitSet32 registers_used = ABI_ALL_CALLEE_SAVED & BitSet32(0xffff);
 	ABI_PushRegistersAndAdjustStack(registers_used, 8);

 	const u8 *dispatcherLoop = GetCodePtr();
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@ -241,9 +241,9 @@ void Jit64::WriteCallInterpreter(UGeckoInstruction inst)
 		MOV(32, PPCSTATE(npc), Imm32(js.compilerPC + 4));
 	}
 	Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst);
-	ABI_PushRegistersAndAdjustStack(0, 0);
+	ABI_PushRegistersAndAdjustStack({}, 0);
 	ABI_CallFunctionC((void*)instr, inst.hex);
-	ABI_PopRegistersAndAdjustStack(0, 0);
+	ABI_PopRegistersAndAdjustStack({}, 0);
 }

 void Jit64::unknown_instruction(UGeckoInstruction inst)
@ -260,9 +260,9 @@ void Jit64::HLEFunction(UGeckoInstruction _inst)
 {
 	gpr.Flush();
 	fpr.Flush();
-	ABI_PushRegistersAndAdjustStack(0, 0);
+	ABI_PushRegistersAndAdjustStack({}, 0);
 	ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex);
-	ABI_PopRegistersAndAdjustStack(0, 0);
+	ABI_PopRegistersAndAdjustStack({}, 0);
 }

 void Jit64::DoNothing(UGeckoInstruction _inst)
@ -300,18 +300,18 @@ bool Jit64::Cleanup()

 	if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
 	{
-		ABI_PushRegistersAndAdjustStack(0, 0);
+		ABI_PushRegistersAndAdjustStack({}, 0);
 		ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
-		ABI_PopRegistersAndAdjustStack(0, 0);
+		ABI_PopRegistersAndAdjustStack({}, 0);
 		did_something = true;
 	}

 	// SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time.
 	if (MMCR0.Hex || MMCR1.Hex)
 	{
-		ABI_PushRegistersAndAdjustStack(0, 0);
+		ABI_PushRegistersAndAdjustStack({}, 0);
 		ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst);
-		ABI_PopRegistersAndAdjustStack(0, 0);
+		ABI_PopRegistersAndAdjustStack({}, 0);
 		did_something = true;
 	}

@ -426,9 +426,9 @@ void Jit64::WriteRfiExitDestInRSCRATCH()
 	MOV(32, PPCSTATE(pc), R(RSCRATCH));
 	MOV(32, PPCSTATE(npc), R(RSCRATCH));
 	Cleanup();
-	ABI_PushRegistersAndAdjustStack(0, 0);
+	ABI_PushRegistersAndAdjustStack({}, 0);
 	ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
-	ABI_PopRegistersAndAdjustStack(0, 0);
+	ABI_PopRegistersAndAdjustStack({}, 0);
 	SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
 	JMP(asm_routines.dispatcher, true);
 }
@ -438,9 +438,9 @@ void Jit64::WriteExceptionExit()
 	Cleanup();
 	MOV(32, R(RSCRATCH), PPCSTATE(pc));
 	MOV(32, PPCSTATE(npc), R(RSCRATCH));
-	ABI_PushRegistersAndAdjustStack(0, 0);
+	ABI_PushRegistersAndAdjustStack({}, 0);
 	ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
-	ABI_PopRegistersAndAdjustStack(0, 0);
+	ABI_PopRegistersAndAdjustStack({}, 0);
 	SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
 	JMP(asm_routines.dispatcher, true);
 }
@ -450,9 +450,9 @@ void Jit64::WriteExternalExceptionExit()
 	Cleanup();
 	MOV(32, R(RSCRATCH), PPCSTATE(pc));
 	MOV(32, PPCSTATE(npc), R(RSCRATCH));
-	ABI_PushRegistersAndAdjustStack(0, 0);
+	ABI_PushRegistersAndAdjustStack({}, 0);
 	ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
-	ABI_PopRegistersAndAdjustStack(0, 0);
+	ABI_PopRegistersAndAdjustStack({}, 0);
 	SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
 	JMP(asm_routines.dispatcher, true);
 }
@ -565,9 +565,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc

 	if (ImHereDebug)
 	{
-		ABI_PushRegistersAndAdjustStack(0, 0);
+		ABI_PushRegistersAndAdjustStack({}, 0);
 		ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
-		ABI_PopRegistersAndAdjustStack(0, 0);
+		ABI_PopRegistersAndAdjustStack({}, 0);
 	}

 	// Conditionally add profiling code.
@ -642,7 +642,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 		{
 			js.fifoBytesThisBlock -= 32;
 			MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
-			u32 registersInUse = CallerSavedRegistersInUse();
+			BitSet32 registersInUse = CallerSavedRegistersInUse();
 			ABI_PushRegistersAndAdjustStack(registersInUse, 0);
 			ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
 			ABI_PopRegistersAndAdjustStack(registersInUse, 0);
@ -727,9 +727,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 				fpr.Flush();

 				MOV(32, PPCSTATE(pc), Imm32(ops[i].address));
-				ABI_PushRegistersAndAdjustStack(0, 0);
+				ABI_PushRegistersAndAdjustStack({}, 0);
 				ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
-				ABI_PopRegistersAndAdjustStack(0, 0);
+				ABI_PopRegistersAndAdjustStack({}, 0);
 				TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
 				FixupBranch noBreakpoint = J_CC(CC_Z);

@ -744,29 +744,28 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 			// output, which needs to be bound in the actual instruction compilation.
 			// TODO: make this smarter in the case that we're actually register-starved, i.e.
 			// prioritize the more important registers.
-			for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
+			for (int reg : ops[i].regsIn)
 			{
-				int reg = ops[i].regsIn[k];
-				if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm())
+				if (gpr.NumFreeRegisters() < 2)
+					break;
+				if (ops[i].gprInReg[reg] && !gpr.R(reg).IsImm())
 					gpr.BindToRegister(reg, true, false);
 			}
-			for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
+			// Same pre-binding for the floating-point inputs: walk fregsIn and
+			// bind through the FPR cache, stopping once fewer than two FPR-cache
+			// registers are free.
+			for (int reg : ops[i].fregsIn)
 			{
-				int reg = ops[i].fregsIn[k];
-				if (reg >= 0 && (ops[i].fprInXmm & (1 << reg)))
-					fpr.BindToRegister(reg, true, false);
+				if (fpr.NumFreeRegisters() < 2)
+					break;
+				if (ops[i].fprInXmm[reg])
+					fpr.BindToRegister(reg, true, false);
 			}

 			Jit64Tables::CompileInstruction(ops[i]);

 			// If we have a register that will never be used again, flush it.
-			for (int j = 0; j < 32; j++)
-			{
-				if (!(ops[i].gprInUse & (1 << j)))
-					gpr.StoreFromRegister(j);
-				if (!(ops[i].fprInUse & (1 << j)))
-					fpr.StoreFromRegister(j);
-			}
+			for (int j : ~ops[i].gprInUse)
+				gpr.StoreFromRegister(j);
+			for (int j : ~ops[i].fprInUse)
+				fpr.StoreFromRegister(j);

 			if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
 			{
@ -852,15 +851,15 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 	return normalEntry;
 }

-u32 Jit64::CallerSavedRegistersInUse()
+BitSet32 Jit64::CallerSavedRegistersInUse()
 {
-	u32 result = 0;
+	BitSet32 result;
 	for (int i = 0; i < NUMXREGS; i++)
 	{
 		if (!gpr.IsFreeX(i))
-			result |= (1 << i);
+			result[i] = true;
 		if (!fpr.IsFreeX(i))
-			result |= (1 << (16 + i));
+			result[16 + i] = true;
 	}
 	return result & ABI_ALL_CALLER_SAVED;
 }
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -78,7 +78,7 @@ public:
 	void Jit(u32 em_address) override;
 	const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b);

-	u32 CallerSavedRegistersInUse();
+	BitSet32 CallerSavedRegistersInUse();

 	JitBlockCache *GetBlockCache() override { return &blocks; }

--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@ -43,9 +43,9 @@ void Jit64AsmRoutineManager::Generate()
 	MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));

 	const u8* outerLoop = GetCodePtr();
-		ABI_PushRegistersAndAdjustStack(0, 0);
+		ABI_PushRegistersAndAdjustStack({}, 0);
 		ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
-		ABI_PopRegistersAndAdjustStack(0, 0);
+		ABI_PopRegistersAndAdjustStack({}, 0);
 		FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
 		dispatcherMispredictedBLR = GetCodePtr();

@ -71,9 +71,9 @@ void Jit64AsmRoutineManager::Generate()
 			{
 				TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING));
 				FixupBranch notStepping = J_CC(CC_Z);
-				ABI_PushRegistersAndAdjustStack(0, 0);
+				ABI_PushRegistersAndAdjustStack({}, 0);
 				ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
-				ABI_PopRegistersAndAdjustStack(0, 0);
+				ABI_PopRegistersAndAdjustStack({}, 0);
 				TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
 				dbg_exit = J_CC(CC_NZ, true);
 				SetJumpTarget(notStepping);
@ -129,9 +129,9 @@ void Jit64AsmRoutineManager::Generate()
 			SetJumpTarget(notfound);

 			//Ok, no block, let's jit
-			ABI_PushRegistersAndAdjustStack(0, 0);
+			ABI_PushRegistersAndAdjustStack({}, 0);
 			ABI_CallFunctionA((void *)&Jit, PPCSTATE(pc));
-			ABI_PopRegistersAndAdjustStack(0, 0);
+			ABI_PopRegistersAndAdjustStack({}, 0);

 			// Jit might have cleared the code cache
 			ResetStack();
@ -146,9 +146,9 @@ void Jit64AsmRoutineManager::Generate()
 		FixupBranch noExtException = J_CC(CC_Z);
 		MOV(32, R(RSCRATCH), PPCSTATE(pc));
 		MOV(32, PPCSTATE(npc), R(RSCRATCH));
-		ABI_PushRegistersAndAdjustStack(0, 0);
+		ABI_PushRegistersAndAdjustStack({}, 0);
 		ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
-		ABI_PopRegistersAndAdjustStack(0, 0);
+		ABI_PopRegistersAndAdjustStack({}, 0);
 		SetJumpTarget(noExtException);

 		TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
--- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
@ -95,42 +95,38 @@ void RegCache::UnlockAllX()
 		xreg.locked = false;
 }

-u32 GPRRegCache::GetRegUtilization()
+BitSet32 GPRRegCache::GetRegUtilization()
 {
 	return jit->js.op->gprInReg;
 }

-u32 FPURegCache::GetRegUtilization()
+BitSet32 FPURegCache::GetRegUtilization()
 {
 	return jit->js.op->gprInReg;
 }

-u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead)
+BitSet32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead)
 {
-	u32 regsUsed = 0;
+	BitSet32 regsUsed;
 	for (u32 i = 1; i < lookahead; i++)
 	{
-		for (int j = 0; j < 3; j++)
-			if (jit->js.op[i].regsIn[j] >= 0)
-				regsUsed |= 1 << jit->js.op[i].regsIn[j];
-		for (int j = 0; j < 3; j++)
-			if ((size_t)jit->js.op[i].regsIn[j] == preg)
-				return regsUsed;
+		BitSet32 regsIn = jit->js.op[i].regsIn;
+		regsUsed |= regsIn;
+		if (regsIn[preg])
+			return regsUsed;
 	}
 	return regsUsed;
 }

-u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead)
+BitSet32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead)
 {
-	u32 regsUsed = 0;
+	BitSet32 regsUsed;
 	for (u32 i = 1; i < lookahead; i++)
 	{
-		for (int j = 0; j < 4; j++)
-			if (jit->js.op[i].fregsIn[j] >= 0)
-				regsUsed |= 1 << jit->js.op[i].fregsIn[j];
-		for (int j = 0; j < 4; j++)
-			if ((size_t)jit->js.op[i].fregsIn[j] == preg)
-				return regsUsed;
+		BitSet32 regsIn = jit->js.op[i].fregsIn;
+		regsUsed |= regsIn;
+		if (regsIn[preg])
+			return regsUsed;
 	}
 	return regsUsed;
 }
@ -151,17 +147,14 @@ float RegCache::ScoreRegister(X64Reg xr)

 	// If the register isn't actually needed in a physical register for a later instruction,
 	// writing it back to the register file isn't quite as bad.
-	if (GetRegUtilization() & (1 << preg))
+	if (GetRegUtilization()[preg])
 	{
 		// Don't look too far ahead; we don't want to have quadratic compilation times for
 		// enormous block sizes!
 		// This actually improves register allocation a tiny bit; I'm not sure why.
 		u32 lookahead = std::min(jit->js.instructionsLeft, 64);
 		// Count how many other registers are going to be used before we need this one again.
-		u32 regs_in = CountRegsIn(preg, lookahead);
-		u32 regs_in_count = 0;
-		for (int i = 0; i < 32; i++)
-			regs_in_count += !!(regs_in & (1 << i));
+		u32 regs_in_count = CountRegsIn(preg, lookahead).Count();
 		// Totally ad-hoc heuristic to bias based on how many other registers we'll need
 		// before this one gets used again.
 		score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count));
--- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h
+++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h
@ -44,8 +44,8 @@ protected:

 	virtual const int *GetAllocationOrder(size_t& count) = 0;

-	virtual u32 GetRegUtilization() = 0;
-	virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0;
+	virtual BitSet32 GetRegUtilization() = 0;
+	virtual BitSet32 CountRegsIn(size_t preg, u32 lookahead) = 0;

 	Gen::XEmitter *emit;

@ -137,8 +137,8 @@ public:
 	Gen::OpArg GetDefaultLocation(size_t reg) const override;
 	const int* GetAllocationOrder(size_t& count) override;
 	void SetImmediate32(size_t preg, u32 immValue);
-	u32 GetRegUtilization();
-	u32 CountRegsIn(size_t preg, u32 lookahead);
+	BitSet32 GetRegUtilization() override;
+	BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
 };


@ -149,6 +149,6 @@ public:
 	void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
 	const int* GetAllocationOrder(size_t& count) override;
 	Gen::OpArg GetDefaultLocation(size_t reg) const override;
-	u32 GetRegUtilization();
-	u32 CountRegsIn(size_t preg, u32 lookahead);
+	BitSet32 GetRegUtilization() override;
+	BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
 };
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
@ -134,7 +134,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
 		TEST(32, gpr.R(d), gpr.R(d));
 		FixupBranch noIdle = J_CC(CC_NZ);

-		u32 registersInUse = CallerSavedRegistersInUse();
+		BitSet32 registersInUse = CallerSavedRegistersInUse();
 		ABI_PushRegistersAndAdjustStack(registersInUse, 0);

 		ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
@ -246,11 +246,11 @@ void Jit64::lXXx(UGeckoInstruction inst)

 	gpr.Lock(a, b, d);
 	gpr.BindToRegister(d, js.memcheck, true);
-	u32 registersInUse = CallerSavedRegistersInUse();
+	BitSet32 registersInUse = CallerSavedRegistersInUse();
 	if (update && storeAddress)
 	{
 		// We need to save the (usually scratch) address register for the update.
-		registersInUse |= (1 << RSCRATCH2);
+		registersInUse[RSCRATCH2] = true;
 	}
 	SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend);

@ -314,7 +314,7 @@ void Jit64::dcbz(UGeckoInstruction inst)
 	SwitchToFarCode();
 	SetJumpTarget(slow);
 	MOV(32, M(&PC), Imm32(jit->js.compilerPC));
-	u32 registersInUse = CallerSavedRegistersInUse();
+	BitSet32 registersInUse = CallerSavedRegistersInUse();
 	ABI_PushRegistersAndAdjustStack(registersInUse, 0);
 	ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH);
 	ABI_PopRegistersAndAdjustStack(registersInUse, 0);
@ -403,7 +403,7 @@ void Jit64::stX(UGeckoInstruction inst)
 				// Helps external systems know which instruction triggered the write
 				MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));

-				u32 registersInUse = CallerSavedRegistersInUse();
+				BitSet32 registersInUse = CallerSavedRegistersInUse();
 				ABI_PushRegistersAndAdjustStack(registersInUse, 0);
 				switch (accessSize)
 				{
@ -555,7 +555,7 @@ void Jit64::lmw(UGeckoInstruction inst)
 		ADD(32, R(RSCRATCH2), gpr.R(inst.RA));
 	for (int i = inst.RD; i < 32; i++)
 	{
-		SafeLoadToReg(RSCRATCH, R(RSCRATCH2), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << RSCRATCH_EXTRA), false);
+		SafeLoadToReg(RSCRATCH, R(RSCRATCH2), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | BitSet32 { RSCRATCH_EXTRA }, false);
 		gpr.BindToRegister(i, false, true);
 		MOV(32, gpr.R(i), R(RSCRATCH));
 	}
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@ -65,9 +65,9 @@ void Jit64::lfXXX(UGeckoInstruction inst)
 			offset = (s16)inst.SIMM_16;
 	}

-	u32 registersInUse = CallerSavedRegistersInUse();
+	BitSet32 registersInUse = CallerSavedRegistersInUse();
 	if (update && js.memcheck)
-		registersInUse |= (1 << RSCRATCH2);
+		registersInUse[RSCRATCH2] = true;
 	SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, registersInUse, false);
 	fpr.Lock(d);
 	fpr.BindToRegister(d, js.memcheck || !single);
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@ -26,6 +26,7 @@ The register allocation is linear scan allocation.

 #include <algorithm>

+#include "Common/BitSet.h"
 #include "Common/CPUDetect.h"
 #include "Common/MathUtil.h"
 #include "Core/HW/ProcessorInterface.h"
@ -60,15 +61,15 @@ struct RegInfo
 		RegInfo(RegInfo&); // DO NOT IMPLEMENT
 };

-static u32 regsInUse(RegInfo& R)
+static BitSet32 regsInUse(RegInfo& R)
 {
-	u32 result = 0;
+	BitSet32 result;
 	for (unsigned i = 0; i < MAX_NUMBER_OF_REGS; i++)
 	{
 		if (R.regs[i] != nullptr)
-			result |= (1 << i);
+			result[i] = true;
 		if (R.fregs[i] != nullptr)
-			result |= (1 << (16 + i));
+			result[16 + i] = true;
 	}
 	return result;
 }
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@ -10,14 +10,11 @@
 #include "Core/PowerPC/JitCommon/JitBase.h"

 #define QUANTIZED_REGS_TO_SAVE \
-	(ABI_ALL_CALLER_SAVED & ~(\
-		(1 << RSCRATCH) | \
-		(1 << RSCRATCH2) | \
-		(1 << RSCRATCH_EXTRA)| \
-		(1 << (XMM0+16)) | \
-		(1 << (XMM1+16))))
+	(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
+		RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
+	})

-#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2))
+#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })

 using namespace Gen;

--- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp
@ -72,7 +72,7 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 		return false;
 	}

-	u32 registersInUse = it->second;
+	BitSet32 registersInUse = it->second;

 	if (!info.isMemoryWrite)
 	{
@ -98,14 +98,14 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 	else
 	{
 		// TODO: special case FIFO writes. Also, support 32-bit mode.
-		it = pcAtLoc.find(codePtr);
-		if (it == pcAtLoc.end())
+		auto it2 = pcAtLoc.find(codePtr);
+		if (it2 == pcAtLoc.end())
 		{
 			PanicAlert("BackPatch: no pc entry for address %p", codePtr);
 			return nullptr;
 		}

-		u32 pc = it->second;
+		u32 pc = it2->second;

 		u8 *start;
 		if (info.byteSwap || info.hasImmediate)
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@ -137,7 +137,7 @@ template <typename T>
 class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T>
 {
 public:
-	MMIOReadCodeGenerator(Gen::X64CodeBlock* code, u32 registers_in_use,
+	MMIOReadCodeGenerator(Gen::X64CodeBlock* code, BitSet32 registers_in_use,
 	                      Gen::X64Reg dst_reg, u32 address, bool sign_extend)
 		: m_code(code), m_registers_in_use(registers_in_use), m_dst_reg(dst_reg),
 		  m_address(address), m_sign_extend(sign_extend)
@ -214,14 +214,14 @@ private:
 	}

 	Gen::X64CodeBlock* m_code;
-	u32 m_registers_in_use;
+	BitSet32 m_registers_in_use;
 	Gen::X64Reg m_dst_reg;
 	u32 m_address;
 	bool m_sign_extend;
 };

 void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
-                                 u32 registers_in_use, u32 address,
+                                 BitSet32 registers_in_use, u32 address,
                                 int access_size, bool sign_extend)
 {
 	switch (access_size)
@ -250,17 +250,17 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
 	}
 }

-FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, u32 registers_in_use, u32 mem_mask)
+FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask)
 {
-	registers_in_use |= (1 << reg_addr);
+	registers_in_use[reg_addr] = true;
 	if (reg_value.IsSimpleReg())
-		registers_in_use |= (1 << reg_value.GetSimpleReg());
+		registers_in_use[reg_value.GetSimpleReg()] = true;

 	// Get ourselves a free register; try to pick one that doesn't involve pushing, if we can.
 	X64Reg scratch = RSCRATCH;
-	if (!(registers_in_use & (1 << RSCRATCH)))
+	if (!registers_in_use[RSCRATCH])
 		scratch = RSCRATCH;
-	else if (!(registers_in_use & (1 << RSCRATCH_EXTRA)))
+	else if (!registers_in_use[RSCRATCH_EXTRA])
 		scratch = RSCRATCH_EXTRA;
 	else
 		scratch = reg_addr;
@ -290,11 +290,11 @@ FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, u
 	}
 }

-void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
+void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags)
 {
 	if (!jit->js.memcheck)
 	{
-		registersInUse &= ~(1 << reg_value);
+		registersInUse[reg_value] = false;
 	}
 	if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU &&
 	    SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem &&
@ -461,7 +461,7 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acce
 	return result;
 }

-void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags)
+void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags)
 {
 	// set the correct immediate format
 	if (reg_value.IsImm())
@ -566,7 +566,7 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces
 }

 // Destroys the same as SafeWrite plus RSCRATCH.  TODO: see if we can avoid temporaries here
-void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, u32 registersInUse, int flags)
+void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags)
 {
 	// TODO: PSHUFB might be faster if fastmem supported MOVSS.
 	MOVD_xmm(R(RSCRATCH), xmm_value);
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@ -6,6 +6,7 @@

 #include <unordered_map>

+#include "Common/BitSet.h"
 #include "Common/CPUDetect.h"
 #include "Common/x64Emitter.h"

@ -76,7 +77,7 @@ public:
 	void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
 	void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);

-	Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, u32 registers_in_use, u32 mem_mask);
+	Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask);
 	void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
 	void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false);
 	// these return the address of the MOV, for backpatching
@ -89,7 +90,7 @@ public:

 	// Generate a load/write from the MMIO handler for a given address. Only
 	// call for known addresses in MMIO range (MMIO::IsMMIOAddress).
-	void MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, u32 registers_in_use, u32 address, int access_size, bool sign_extend);
+	void MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, BitSet32 registers_in_use, u32 address, int access_size, bool sign_extend);

 	enum SafeLoadStoreFlags
 	{
@ -99,12 +100,12 @@ public:
 		SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR = 8
 	};

-	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
+	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags = 0);
 	// Clobbers RSCRATCH or reg_addr depending on the relevant flag.  Preserves
 	// reg_value if the load fails and js.memcheck is enabled.
 	// Works with immediate inputs and simple registers only.
-	void SafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
-	void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0)
+	void SafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags = 0);
+	void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags = 0)
 	{
 		SafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, registersInUse, flags);
 	}
@ -115,7 +116,7 @@ public:
 		return swap && !cpu_info.bMOVBE && accessSize > 8;
 	}

-	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
+	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags = 0);

 	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
 	void JitGetAndClearCAOV(bool oe);
@ -137,6 +138,6 @@ public:
 	void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
 	void SetFPRF(Gen::X64Reg xmm);
 protected:
-	std::unordered_map<u8 *, u32> registersInUseAtLoc;
+	std::unordered_map<u8 *, BitSet32> registersInUseAtLoc;
 	std::unordered_map<u8 *, u32> pcAtLoc;
 };
--- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp
@ -36,7 +36,7 @@ void TrampolineCache::Shutdown()
 	cachedTrampolines.clear();
 }

-const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 registersInUse)
+const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse)
 {
 	TrampolineCacheKey key = { registersInUse, 0, info };

@ -49,7 +49,7 @@ const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
 	return trampoline;
 }

-const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, u32 registersInUse)
+const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse)
 {
 	if (GetSpaceLeft() < 1024)
 		PanicAlert("Trampoline cache full");
@ -97,7 +97,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, u
 	return trampoline;
 }

-const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc)
+const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc)
 {
 	TrampolineCacheKey key = { registersInUse, pc, info };

@ -110,7 +110,7 @@ const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
 	return trampoline;
 }

-const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc)
+const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc)
 {
 	if (GetSpaceLeft() < 1024)
 		PanicAlert("Trampoline cache full");
@ -184,7 +184,7 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info,

 size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const
 {
-	size_t res = std::hash<int>()(k.registersInUse);
+	size_t res = std::hash<int>()(k.registersInUse.m_val);
 	res ^= std::hash<int>()(k.info.operandSize)    >> 1;
 	res ^= std::hash<int>()(k.info.regOperandReg)  >> 2;
 	res ^= std::hash<int>()(k.info.scaledReg)      >> 3;
--- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h
+++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h
@ -6,6 +6,7 @@

 #include <unordered_map>

+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Common/x64Analyzer.h"
 #include "Common/x64Emitter.h"
@ -15,7 +16,7 @@ const int BACKPATCH_SIZE = 5;

 struct TrampolineCacheKey
 {
-	u32 registersInUse;
+	BitSet32 registersInUse;
 	u32 pc;
 	InstructionInfo info;

@ -33,13 +34,13 @@ public:
 	void Init();
 	void Shutdown();

-	const u8* GetReadTrampoline(const InstructionInfo &info, u32 registersInUse);
-	const u8* GetWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc);
+	const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse);
+	const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc);
 	void ClearCodeSpace();

 private:
-	const u8* GenerateReadTrampoline(const InstructionInfo &info, u32 registersInUse);
-	const u8* GenerateWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc);
+	const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse);
+	const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc);

 	std::unordered_map<TrampolineCacheKey, const u8*, TrampolineCacheKeyHasher> cachedTrampolines;
 };
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -249,21 +249,15 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 	// That is, check that none of b's outputs matches any of a's inputs,
 	// and that none of a's outputs matches any of b's inputs.
 	// The latter does not apply if a is a cmp, of course, but doesn't hurt to check.
-	for (int j = 0; j < 3; j++)
-	{
-		int regInA = a.regsIn[j];
-		int regInB = b.regsIn[j];
-		// register collision: b outputs to one of a's inputs
-		if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
-			return false;
-		// register collision: a outputs to one of b's inputs
-		if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
-			return false;
-		// register collision: b outputs to one of a's outputs (overwriting it)
-		for (int k = 0; k < 2; k++)
-			if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
-				return false;
-	}
+	// register collision: b outputs to one of a's inputs
+	if (b.regsOut & a.regsIn)
+		return false;
+	// register collision: a outputs to one of b's inputs
+	if (a.regsOut & b.regsIn)
+		return false;
+	// register collision: b outputs to one of a's outputs (overwriting it)
+	if (b.regsOut & a.regsOut)
+		return false;

 	return true;
 }
@ -520,42 +514,41 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
 		code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;

-	int numOut = 0;
-	int numIn = 0;
-	int numFloatIn = 0;
+	code->regsIn = BitSet32(0);
+	code->regsOut = BitSet32(0);
 	if (opinfo->flags & FL_OUT_A)
 	{
-		code->regsOut[numOut++] = code->inst.RA;
+		code->regsOut[code->inst.RA] = true;
 		block->m_gpa->SetOutputRegister(code->inst.RA, index);
 	}
 	if (opinfo->flags & FL_OUT_D)
 	{
-		code->regsOut[numOut++] = code->inst.RD;
+		code->regsOut[code->inst.RD] = true;
 		block->m_gpa->SetOutputRegister(code->inst.RD, index);
 	}
 	if (opinfo->flags & FL_OUT_S)
 	{
-		code->regsOut[numOut++] = code->inst.RS;
+		code->regsOut[code->inst.RS] = true;
 		block->m_gpa->SetOutputRegister(code->inst.RS, index);
 	}
 	if ((opinfo->flags & FL_IN_A) || ((opinfo->flags & FL_IN_A0) && code->inst.RA != 0))
 	{
-		code->regsIn[numIn++] = code->inst.RA;
+		code->regsIn[code->inst.RA] = true;
 		block->m_gpa->SetInputRegister(code->inst.RA, index);
 	}
 	if (opinfo->flags & FL_IN_B)
 	{
-		code->regsIn[numIn++] = code->inst.RB;
+		code->regsIn[code->inst.RB] = true;
 		block->m_gpa->SetInputRegister(code->inst.RB, index);
 	}
 	if (opinfo->flags & FL_IN_C)
 	{
-		code->regsIn[numIn++] = code->inst.RC;
+		code->regsIn[code->inst.RC] = true;
 		block->m_gpa->SetInputRegister(code->inst.RC, index);
 	}
 	if (opinfo->flags & FL_IN_S)
 	{
-		code->regsIn[numIn++] = code->inst.RS;
+		code->regsIn[code->inst.RS] = true;
 		block->m_gpa->SetInputRegister(code->inst.RS, index);
 	}

@ -564,24 +557,17 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 		code->fregOut = code->inst.FD;
 	else if (opinfo->flags & FL_OUT_FLOAT_S)
 		code->fregOut = code->inst.FS;
+	code->fregsIn = BitSet32(0);
 	if (opinfo->flags & FL_IN_FLOAT_A)
-		code->fregsIn[numFloatIn++] = code->inst.FA;
+		code->fregsIn[code->inst.FA] = true;
 	if (opinfo->flags & FL_IN_FLOAT_B)
-		code->fregsIn[numFloatIn++] = code->inst.FB;
+		code->fregsIn[code->inst.FB] = true;
 	if (opinfo->flags & FL_IN_FLOAT_C)
-		code->fregsIn[numFloatIn++] = code->inst.FC;
+		code->fregsIn[code->inst.FC] = true;
 	if (opinfo->flags & FL_IN_FLOAT_D)
-		code->fregsIn[numFloatIn++] = code->inst.FD;
+		code->fregsIn[code->inst.FD] = true;
 	if (opinfo->flags & FL_IN_FLOAT_S)
-		code->fregsIn[numFloatIn++] = code->inst.FS;
-
-	// Set remaining register slots as unused (-1)
-	for (int j = numIn; j < 3; j++)
-		code->regsIn[j] = -1;
-	for (int j = numOut; j < 2; j++)
-		code->regsOut[j] = -1;
-	for (int j = numFloatIn; j < 4; j++)
-		code->fregsIn[j] = -1;
+		code->fregsIn[code->inst.FS] = true;

 	switch (opinfo->type)
 	{
@ -797,7 +783,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 	// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
 	// wants flags, to be safe.
 	bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true;
-	u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0;
+	BitSet32 fprInUse, gprInUse, gprInReg, fprInXmm;
 	for (int i = block->m_num_instructions - 1; i >= 0; i--)
 	{
 		bool opWantsCR0 = code[i].wantsCR0;
@ -822,30 +808,20 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 		code[i].fprInXmm = fprInXmm;
 		// TODO: if there's no possible endblocks or exceptions in between, tell the regcache
 		// we can throw away a register if it's going to be overwritten later.
-		for (int j = 0; j < 3; j++)
-			if (code[i].regsIn[j] >= 0)
-			{
-				gprInUse |= 1 << code[i].regsIn[j];
-				gprInReg |= 1 << code[i].regsIn[j];
-			}
-		for (int j = 0; j < 4; j++)
-			if (code[i].fregsIn[j] >= 0)
-			{
-				fprInUse |= 1 << code[i].fregsIn[j];
-				if (strncmp(code[i].opinfo->opname, "stfd", 4))
-					fprInXmm |= 1 << code[i].fregsIn[j];
-			}
+		gprInUse |= code[i].regsIn;
+		gprInReg |= code[i].regsIn;
+		fprInUse |= code[i].fregsIn;
+		if (strncmp(code[i].opinfo->opname, "stfd", 4))
+			fprInXmm |= code[i].fregsIn;
 		// For now, we need to count output registers as "used" though; otherwise the flush
 		// will result in a redundant store (e.g. store to regcache, then store again to
 		// the same location later).
-		for (int j = 0; j < 2; j++)
-			if (code[i].regsOut[j] >= 0)
-				gprInUse |= 1 << code[i].regsOut[j];
+		gprInUse |= code[i].regsOut;
 		if (code[i].fregOut >= 0)
 		{
-			fprInUse |= 1 << code[i].fregOut;
+			fprInUse[code[i].fregOut] = true;
 			if (strncmp(code[i].opinfo->opname, "stfd", 4))
-				fprInXmm |= 1 << code[i].fregOut;
+				fprInXmm[code[i].fregOut] = true;
 		}
 	}
 	return address;
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -10,6 +10,7 @@
 #include <string>
 #include <vector>

+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Core/PowerPC/PPCTables.h"

@ -26,10 +27,10 @@ struct CodeOp //16B
 	u32 address;
 	u32 branchTo; //if 0, not a branch
 	int branchToIndex; //index of target block
-	s8 regsOut[2];
-	s8 regsIn[3];
+	BitSet32 regsOut;
+	BitSet32 regsIn;
+	BitSet32 fregsIn;
 	s8 fregOut;
-	s8 fregsIn[4];
 	bool isBranchTarget;
 	bool wantsCR0;
 	bool wantsCR1;
@ -43,13 +44,13 @@ struct CodeOp //16B
 	bool canEndBlock;
 	bool skip;  // followed BL-s for example
 	// which registers are still needed after this instruction in this block
-	u32 fprInUse;
-	u32 gprInUse;
+	BitSet32 fprInUse;
+	BitSet32 gprInUse;
 	// just because a register is in use doesn't mean we actually need or want it in an x86 register.
-	u32 gprInReg;
+	BitSet32 gprInReg;
 	// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
 	// an XMM only to move it again to a GPR afterwards.
-	u32 fprInXmm;
+	BitSet32 fprInXmm;
 };

 struct BlockStats
--- a/Source/Core/Core/PowerPC/Profiler.h
+++ b/Source/Core/Core/PowerPC/Profiler.h
@ -23,7 +23,7 @@
 	MOV(64, M(pdt), R(RSCRATCH));

 #define PROFILER_VPUSH \
-	u32 registersInUse = CallerSavedRegistersInUse(); \
+	BitSet32 registersInUse = CallerSavedRegistersInUse(); \
 	ABI_PushRegistersAndAdjustStack(registersInUse, 0);

 #define PROFILER_VPOP \
--- a/Source/Core/VideoCommon/CPMemory.h
+++ b/Source/Core/VideoCommon/CPMemory.h
@ -4,6 +4,7 @@

 #pragma once

+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"

 // Vertex array numbers
@ -252,7 +253,7 @@ struct CPState final
    VAT vtx_attr[8];

 	// Attributes that actually belong to VertexLoaderManager:
-	int attr_dirty; // bitfield
+	BitSet32 attr_dirty;
 	VertexLoader* vertex_loaders[8];
 };

--- a/Source/Core/VideoCommon/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/VertexLoader.cpp
@ -141,7 +141,7 @@ void VertexLoader::CompileVertexTranslator()

 	m_compiledCode = GetCodePtr();
 	// We only use RAX (caller saved) and RBX (callee saved).
-	ABI_PushRegistersAndAdjustStack(1 << RBX, 8);
+	ABI_PushRegistersAndAdjustStack({RBX}, 8);

 	// save count
 	MOV(64, R(RBX), R(ABI_PARAM1));
@ -402,7 +402,7 @@ void VertexLoader::CompileVertexTranslator()
 	SUB(64, R(RBX), Imm8(1));

 	J_CC(CC_NZ, loop_start);
-	ABI_PopRegistersAndAdjustStack(1 << RBX, 8);
+	ABI_PopRegistersAndAdjustStack({RBX}, 8);
 	RET();
 #endif
 }
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@ -100,14 +100,14 @@ void AppendListToString(std::string *dest)

 void MarkAllDirty()
 {
-	g_main_cp_state.attr_dirty = 0xff;
-	g_preprocess_cp_state.attr_dirty = 0xff;
+	g_main_cp_state.attr_dirty = BitSet32::AllTrue(8); // bits 0-7 set, the same mask as the old 0xff
+	g_preprocess_cp_state.attr_dirty = BitSet32::AllTrue(8); // likewise for the preprocess state
 }

 static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state)
 {
 	VertexLoader* loader;
-	if ((state->attr_dirty >> vtx_attr_group) & 1)
+	if (state->attr_dirty[vtx_attr_group])
 	{
 		VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]);
 		std::lock_guard<std::mutex> lk(s_vertex_loader_map_lock);
@ -123,7 +123,7 @@ static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state)
 			INCSTAT(stats.numVertexLoaders);
 		}
 		state->vertex_loaders[vtx_attr_group] = loader;
-		state->attr_dirty &= ~(1 << vtx_attr_group);
+		state->attr_dirty[vtx_attr_group] = false;
 	} else {
 		loader = state->vertex_loaders[vtx_attr_group];
 	}
@ -200,31 +200,31 @@ void LoadCPReg(u32 sub_cmd, u32 value, bool is_preprocess)
 	case 0x50:
 		state->vtx_desc.Hex &= ~0x1FFFF;  // keep the Upper bits
 		state->vtx_desc.Hex |= value;
-		state->attr_dirty = 0xFF;
+		state->attr_dirty = BitSet32::AllTrue(8);
 		break;

 	case 0x60:
 		state->vtx_desc.Hex &= 0x1FFFF;  // keep the lower 17Bits
 		state->vtx_desc.Hex |= (u64)value << 17;
-		state->attr_dirty = 0xFF;
+		state->attr_dirty = BitSet32::AllTrue(8);
 		break;

 	case 0x70:
 		_assert_((sub_cmd & 0x0F) < 8);
 		state->vtx_attr[sub_cmd & 7].g0.Hex = value;
-		state->attr_dirty |= 1 << (sub_cmd & 7);
+		state->attr_dirty[sub_cmd & 7] = true;
 		break;

 	case 0x80:
 		_assert_((sub_cmd & 0x0F) < 8);
 		state->vtx_attr[sub_cmd & 7].g1.Hex = value;
-		state->attr_dirty |= 1 << (sub_cmd & 7);
+		state->attr_dirty[sub_cmd & 7] = true;
 		break;

 	case 0x90:
 		_assert_((sub_cmd & 0x0F) < 8);
 		state->vtx_attr[sub_cmd & 7].g2.Hex = value;
-		state->attr_dirty |= 1 << (sub_cmd & 7);
+		state->attr_dirty[sub_cmd & 7] = true;
 		break;

 	// Pointers to vertex arrays in GC RAM
--- a/Source/Core/VideoCommon/VertexManagerBase.cpp
+++ b/Source/Core/VideoCommon/VertexManagerBase.cpp
@ -180,39 +180,36 @@ void VertexManager::Flush()
 		(int)bpmem.genMode.numtexgens, (u32)bpmem.dstalpha.enable, (bpmem.alpha_test.hex>>16)&0xff);
 #endif

-	u32 usedtextures = 0;
+	BitSet32 usedtextures;
 	for (u32 i = 0; i < bpmem.genMode.numtevstages + 1u; ++i)
 		if (bpmem.tevorders[i / 2].getEnable(i & 1))
-			usedtextures |= 1 << bpmem.tevorders[i/2].getTexMap(i & 1);
+			usedtextures[bpmem.tevorders[i/2].getTexMap(i & 1)] = true;

 	if (bpmem.genMode.numindstages > 0)
 		for (unsigned int i = 0; i < bpmem.genMode.numtevstages + 1u; ++i)
 			if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages)
-				usedtextures |= 1 << bpmem.tevindref.getTexMap(bpmem.tevind[i].bt);
+				usedtextures[bpmem.tevindref.getTexMap(bpmem.tevind[i].bt)] = true;

-	for (unsigned int i = 0; i < 8; i++)
+	for (unsigned int i : usedtextures)
 	{
-		if (usedtextures & (1 << i))
-		{
-			g_renderer->SetSamplerState(i & 3, i >> 2);
-			const FourTexUnits &tex = bpmem.tex[i >> 2];
-			const TextureCache::TCacheEntryBase* tentry = TextureCache::Load(i,
-				(tex.texImage3[i&3].image_base/* & 0x1FFFFF*/) << 5,
-				tex.texImage0[i&3].width + 1, tex.texImage0[i&3].height + 1,
-				tex.texImage0[i&3].format, tex.texTlut[i&3].tmem_offset<<9,
-				tex.texTlut[i&3].tlut_format,
-				((tex.texMode0[i&3].min_filter & 3) != 0),
-				(tex.texMode1[i&3].max_lod + 0xf) / 0x10,
-				(tex.texImage1[i&3].image_type != 0));
+		g_renderer->SetSamplerState(i & 3, i >> 2);
+		const FourTexUnits &tex = bpmem.tex[i >> 2];
+		const TextureCache::TCacheEntryBase* tentry = TextureCache::Load(i,
+			(tex.texImage3[i&3].image_base/* & 0x1FFFFF*/) << 5,
+			tex.texImage0[i&3].width + 1, tex.texImage0[i&3].height + 1,
+			tex.texImage0[i&3].format, tex.texTlut[i&3].tmem_offset<<9,
+			tex.texTlut[i&3].tlut_format,
+			((tex.texMode0[i&3].min_filter & 3) != 0),
+			(tex.texMode1[i&3].max_lod + 0xf) / 0x10,
+			(tex.texImage1[i&3].image_type != 0));

-			if (tentry)
-			{
-				// 0s are probably for no manual wrapping needed.
-				PixelShaderManager::SetTexDims(i, tentry->native_width, tentry->native_height, 0, 0);
-			}
-			else
-				ERROR_LOG(VIDEO, "error loading texture");
+		if (tentry)
+		{
+			// 0s are probably for no manual wrapping needed.
+			PixelShaderManager::SetTexDims(i, tentry->native_width, tentry->native_height, 0, 0);
 		}
+		else
+			ERROR_LOG(VIDEO, "error loading texture");
 	}

 	// set global constants
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@ -5,6 +5,7 @@
 #include <cmath>
 #include <sstream>

+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Common/MathUtil.h"
 #include "VideoCommon/BPMemory.h"
@ -22,7 +23,7 @@ static float GC_ALIGNED16(g_fProjectionMatrix[16]);

 // track changes
 static bool bTexMatricesChanged[2], bPosNormalMatrixChanged, bProjectionChanged, bViewportChanged;
-static int nMaterialsChanged;
+static BitSet32 nMaterialsChanged;
 static int nTransformMatricesChanged[2]; // min,max
 static int nNormalMatricesChanged[2]; // min,max
 static int nPostTransformMatricesChanged[2]; // min,max
@ -202,7 +203,7 @@ void VertexShaderManager::Dirty()

 	bProjectionChanged = true;

-	nMaterialsChanged = 15;
+	nMaterialsChanged = BitSet32::AllTrue(4);

 	dirty = true;
 }
@ -295,35 +296,16 @@ void VertexShaderManager::SetConstants()
 		nLightsChanged[0] = nLightsChanged[1] = -1;
 	}

-	if (nMaterialsChanged)
+	for (int i : nMaterialsChanged)
 	{
-		for (int i = 0; i < 2; ++i)
-		{
-			if (nMaterialsChanged & (1 << i))
-			{
-				u32 data = xfmem.ambColor[i];
-				constants.materials[i][0] = (data >> 24) & 0xFF;
-				constants.materials[i][1] = (data >> 16) & 0xFF;
-				constants.materials[i][2] = (data >>  8) & 0xFF;
-				constants.materials[i][3] =  data        & 0xFF;
-			}
-		}
-
-		for (int i = 0; i < 2; ++i)
-		{
-			if (nMaterialsChanged & (1 << (i + 2)))
-			{
-				u32 data = xfmem.matColor[i];
-				constants.materials[i+2][0] = (data >> 24) & 0xFF;
-				constants.materials[i+2][1] = (data >> 16) & 0xFF;
-				constants.materials[i+2][2] = (data >>  8) & 0xFF;
-				constants.materials[i+2][3] =  data        & 0xFF;
-			}
-		}
+		u32 data = i >= 2 ? xfmem.matColor[i - 2] : xfmem.ambColor[i];
+		constants.materials[i][0] = (data >> 24) & 0xFF;
+		constants.materials[i][1] = (data >> 16) & 0xFF;
+		constants.materials[i][2] = (data >>  8) & 0xFF;
+		constants.materials[i][3] =  data        & 0xFF;
 		dirty = true;
-
-		nMaterialsChanged = 0;
 	}
+	nMaterialsChanged = BitSet32(0);

 	if (bPosNormalMatrixChanged)
 	{
@ -660,7 +642,7 @@ void VertexShaderManager::SetProjectionChanged()

 void VertexShaderManager::SetMaterialColorChanged(int index, u32 color)
 {
-	nMaterialsChanged  |= (1 << index);
+	nMaterialsChanged[index] = true; // only flags slot `index` as changed; `color` is not used in this body
 }

 void VertexShaderManager::TranslateView(float x, float y, float z)
--- a/Source/UnitTests/Common/BitSetTest.cpp
+++ b/Source/UnitTests/Common/BitSetTest.cpp
@ -0,0 +1,84 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <gtest/gtest.h>
+
+#include "Common/BitSet.h"
+
+TEST(BitSet, Basics) // construction, truthiness, equality and AllTrue()
+{
+	BitSet32 bs; // default-constructed; expected to be empty (checked below)
+	BitSet64 bs2(1);
+	BitSet64 bs3(2);
+	EXPECT_EQ(true, !!bs2); // a set with at least one bit is truthy
+	EXPECT_EQ(false, !!bs); // an empty set is falsy
+	EXPECT_EQ(bs2, bs2);
+	EXPECT_NE(bs2, bs3);
+	EXPECT_EQ(BitSet32(0xfff), BitSet32::AllTrue(12)); // AllTrue(n): the lowest n bits set
+	EXPECT_EQ(BitSet64(0xffffffffffffffff), BitSet64::AllTrue(64)); // full-width case
+}
+
+TEST(BitSet, BitGetSet) // reading and writing single bits through operator[]
+{
+	BitSet32 bs;
+	bs[3] = bs[8] = bs[11] = true; // chained assignment through operator[] is part of what is tested
+	EXPECT_EQ(true, bs[3]);
+	EXPECT_EQ(false, bs[4]); // a bit that was never written stays clear
+	EXPECT_EQ((u32)((1 << 3) | (1 << 8) | (1 << 11)), bs.m_val); // bit i corresponds to 1 << i in m_val
+}
+
+TEST(BitSet, Count) // Count() must equal the number of set bits, for both 32- and 64-bit sets
+{
+	u32 random_numbers[] = {
+		0x2cb0b5f3,	0x81ab32a6,	0xd9030dc5,	0x325ffe26,	0xb2fcaee3,
+		0x4ccf188a,	0xf8be36dc,	0xb2fcecd5,	0xb750c2e5,	0x31d19074,
+		0xf267644a,	0xac00a719,	0x6d45f19b,	0xf7e91c5b,	0xf687e694,
+		0x9057c24e,	0x5eb65c39,	0x85d3038b,	0x101f4e66,	0xc202d136
+	};
+	u32 counts[] = {
+		17, 14, 14, 19, 20, 14, 20, 20, 16, 13, 16, 12, 18, 20, 18, 14, 18, 14, 14, 12
+	};
+	// One expected popcount per input; the loop bound comes from the arrays rather than a literal.
+	static_assert(sizeof(counts) / sizeof(counts[0]) == sizeof(random_numbers) / sizeof(random_numbers[0]), "counts must have one entry per random_numbers entry");
+	for (size_t i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
+		EXPECT_EQ(counts[i], BitSet32(random_numbers[i]).Count());
+
+	u64 random_numbers_64[] = {
+		0xf86cd6f6ef09d7d4ULL, 0x6f2d8533255ead3cULL, 0x9da7941e0e52b345ULL,
+		0x06e4189be67d2b17ULL, 0x3eb0681f65cb6d25ULL, 0xccab8a7c74a51203ULL,
+		0x09d470516694c64bULL, 0x38cd077e075c778fULL, 0xd69ebfa6355ebfdeULL
+	};
+	u32 counts_64[] = {
+		39, 34, 31, 32, 33, 29, 27, 35, 43
+	};
+	// Same check for the 64-bit variant.
+	static_assert(sizeof(counts_64) / sizeof(counts_64[0]) == sizeof(random_numbers_64) / sizeof(random_numbers_64[0]), "counts_64 must have one entry per random_numbers_64 entry");
+	for (size_t i = 0; i < sizeof(counts_64) / sizeof(counts_64[0]); i++)
+		EXPECT_EQ(counts_64[i], BitSet64(random_numbers_64[i]).Count());
+}
+
+TEST(BitSet, BitOps) // |, ^, &, ~ and the matching compound-assignment operators
+{
+	BitSet32 a(3), b(5), c; // a = 0b011, b = 0b101
+	EXPECT_EQ(BitSet32(7), a | b);
+	EXPECT_EQ(BitSet32(6), a ^ b);
+	EXPECT_EQ(BitSet32(1), a & b);
+	EXPECT_EQ(BitSet32(0xfffffffc), ~a); // complement flips all 32 bits
+	c = a; c |= b; EXPECT_EQ(BitSet32(7), c); // compound forms must agree with the binary ones above
+	c = a; c ^= b; EXPECT_EQ(BitSet32(6), c);
+	c = a; c &= b; EXPECT_EQ(BitSet32(1), c);
+}
+
+TEST(BitSet, InitializerListsAndIteration) // a set built from an initializer list iterates over exactly those indices, in this order
+{
+	std::vector<int> bits { 1, 10, 15, 17, 20, 30 };
+	BitSet32          bs  { 1, 10, 15, 17, 20, 30 };
+	auto vit = bits.begin();
+	for (auto i : bs)
+	{
+		ASSERT_NE(vit, bits.end()); // fatal on purpose: the next line must never dereference bits.end()
+		EXPECT_EQ(i, *vit++);
+	}
+	EXPECT_EQ(vit, bits.end()); // every expected index was produced
+}
--- a/Source/UnitTests/Common/CMakeLists.txt
+++ b/Source/UnitTests/Common/CMakeLists.txt
@ -1,4 +1,5 @@
 add_dolphin_test(BitFieldTest BitFieldTest.cpp)
+add_dolphin_test(BitSetTest BitSetTest.cpp)
 add_dolphin_test(CommonFuncsTest CommonFuncsTest.cpp)
 add_dolphin_test(EventTest EventTest.cpp)
 add_dolphin_test(FifoQueueTest FifoQueueTest.cpp)