From b6a74380537305ef33ce12e952bde0155e4a57b3 Mon Sep 17 00:00:00 2001
From: comex
Date: Thu, 16 Oct 2014 21:49:48 -0400
Subject: [PATCH 1/5] Add BitSet and, as a test, convert some JitRegCache
 stuff to it.

This is a higher level, more concise wrapper for bitsets which supports
efficiently counting and iterating over set bits. It's similar to
std::bitset, but the latter does not support efficient iteration (and at
least in libc++, the count algorithm is subpar, not that it really
matters). The converted uses include both bitsets and, notably,
considerably less efficient regular arrays (for in/out registers in
PPCAnalyst). Unfortunately, this may slightly pessimize unoptimized
builds.
---
 Source/Core/Common/BitSet.h                   | 156 ++++++++++++++++++
 Source/Core/Common/Common.vcxproj             |   3 +-
 Source/Core/Common/Common.vcxproj.filters     |   3 +-
 Source/Core/Core/PowerPC/Jit64/Jit.cpp        |  27 ++-
 .../Core/Core/PowerPC/Jit64/JitRegCache.cpp   |  39 ++---
 Source/Core/Core/PowerPC/Jit64/JitRegCache.h  |  12 +-
 Source/Core/Core/PowerPC/PPCAnalyst.cpp       |  90 ++++------
 Source/Core/Core/PowerPC/PPCAnalyst.h         |  15 +-
 8 files changed, 236 insertions(+), 109 deletions(-)
 create mode 100644 Source/Core/Common/BitSet.h

diff --git a/Source/Core/Common/BitSet.h b/Source/Core/Common/BitSet.h
new file mode 100644
index 0000000000..b2cf577b50
--- /dev/null
+++ b/Source/Core/Common/BitSet.h
@@ -0,0 +1,156 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <initializer_list>
+#include <type_traits>
+#include "CommonTypes.h"
+
+// Helper functions:
+
+#ifdef _WIN32
+template <typename T>
+static inline int CountSetBits(T v)
+{
+	// from https://graphics.stanford.edu/~seander/bithacks.html
+	// GCC has this built in, but MSVC's intrinsic will only emit the actual
+	// POPCNT instruction, which we're not depending on
+	v = v - ((v >> 1) & (T)~(T)0/3);
+	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
+	v = (v + (v >> 4)) & (T)~(T)0/255*15;
+	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
+}
+static inline int LeastSignificantSetBit(u32 val)
+{
+	unsigned long index;
+	_BitScanForward(&index, val);
+	return (int)index;
+}
+static inline int LeastSignificantSetBit(u64 val)
+{
+	unsigned long index;
+	_BitScanForward64(&index, val);
+	return (int)index;
+}
+#else
+static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
+static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
+static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
+#endif
+
+
+// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
+// using the set bits of an integer to represent a set of integers. Like that
+// class, it acts like an array of bools:
+//   BitSet32 bs; // use BitSet{32,64} instead of the template directly
+//   bs[1] = true;
+// but also like the underlying integer ([0] = least significant bit):
+//   BitSet32 bs2 = ...;
+//   bs = (bs ^ bs2) & BitSet32(0xffff);
+// The following additional functionality is provided:
+// - Construction using an initializer list.
+//   BitSet bs { 1, 2, 4, 8 };
+// - Efficiently iterating through the set bits:
+//   for (int i : bs)
+//     [i is the *index* of a set bit]
+//   (This uses the appropriate CPU instruction to find the next set bit in
+//   one operation.)
+// - Counting set bits using .Count() - see comment on that method.
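+// A combined example (illustrative only; the values are arbitrary):
+//   BitSet32 bs3 { 0, 3, 5 };  // construct with bits 0, 3 and 5 set
+//   bs3[4] = true;             // now 0, 3, 4, 5
+//   int n = bs3.Count();       // n == 4
+//   for (int i : bs3)          // visits 0, 3, 4, 5 in ascending order
+//     ...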
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+template <typename IntTy>
+class BitSet
+{
+	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+public:
+	// A reference to a particular bit, returned from operator[].
+	class Ref
+	{
+	public:
+		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+		bool operator=(bool set)
+		{
+			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+			return set;
+		}
+	private:
+		BitSet* m_bs;
+		IntTy m_mask;
+	};
+
+	// A STL-like iterator is required to be able to use range-based for loops.
+	class Iterator
+	{
+	public:
+		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+		Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
+		int operator*() { return m_bit; }
+		Iterator& operator++()
+		{
+			if (m_val == 0)
+			{
+				m_bit = -1;
+			}
+			else
+			{
+				int bit = LeastSignificantSetBit(m_val);
+				m_val &= ~((IntTy)1 << bit);
+				m_bit = bit;
+			}
+			return *this;
+		}
+		Iterator operator++(int _)
+		{
+			Iterator other(*this);
+			++*this;
+			return other;
+		}
+		bool operator==(Iterator other) const { return m_bit == other.m_bit; }
+		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+	private:
+		IntTy m_val;
+		int m_bit;
+	};
+
+	BitSet() : m_val(0) {}
+	explicit BitSet(IntTy val) : m_val(val) {}
+	BitSet(std::initializer_list<int> init)
+	{
+		m_val = 0;
+		for (int bit : init)
+			m_val |= (IntTy)1 << bit;
+	}
+
+	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+	bool operator==(BitSet other) const { return m_val == other.m_val; }
+	bool operator!=(BitSet other) const { return m_val != other.m_val; }
+	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+	BitSet operator~() const { return BitSet(~m_val); }
+	BitSet& operator|=(BitSet other) { return *this = *this | other; }
+	BitSet& operator&=(BitSet other) { return *this = *this & other; }
+	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+	operator u32() = delete;
+	operator bool() { return m_val != 0; }
+
+	// Warning: Even though on modern CPUs this is a single fast instruction,
+	// Dolphin's official builds do not currently assume POPCNT support on x86,
+	// so slower explicit bit twiddling is generated. Still should generally
+	// be faster than a loop.
+	unsigned int Count() const { return CountSetBits(m_val); }
+
+	Iterator begin() const { Iterator it(m_val, 0); return ++it; }
+	Iterator end() const { return Iterator(m_val, -1); }
+
+	IntTy m_val;
+};
+
+typedef BitSet<u32> BitSet32;
+typedef BitSet<u64> BitSet64;
diff --git a/Source/Core/Common/Common.vcxproj b/Source/Core/Common/Common.vcxproj
index 814ac4f04f..5d7a31904d 100644
--- a/Source/Core/Common/Common.vcxproj
+++ b/Source/Core/Common/Common.vcxproj
@@ -39,6 +39,7 @@
+    <ClInclude Include="BitSet.h" />
@@ -137,4 +138,4 @@
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Source/Core/Common/Common.vcxproj.filters b/Source/Core/Common/Common.vcxproj.filters
index 84997f7441..ac5b5c454f 100644
--- a/Source/Core/Common/Common.vcxproj.filters
+++ b/Source/Core/Common/Common.vcxproj.filters
@@ -13,6 +13,7 @@
+    <ClInclude Include="BitSet.h" />
@@ -118,4 +119,4 @@
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index d091db4ff3..99633dc57d 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -736,29 +736,28 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 			// output, which needs to be bound in the actual instruction compilation.
 			// TODO: make this smarter in the case that we're actually register-starved, i.e.
 			// prioritize the more important registers.
-			for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
+			for (int reg : ops[i].regsIn)
 			{
-				int reg = ops[i].regsIn[k];
-				if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm())
+				if (gpr.NumFreeRegisters() < 2)
+					break;
+				if (ops[i].gprInReg[reg] && !gpr.R(reg).IsImm())
 					gpr.BindToRegister(reg, true, false);
 			}
-			for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
+			for (int reg : ops[i].fregsIn)
 			{
-				int reg = ops[i].fregsIn[k];
-				if (reg >= 0 && (ops[i].fprInXmm & (1 << reg)))
-					fpr.BindToRegister(reg, true, false);
+				if (fpr.NumFreeRegisters() < 2)
+					break;
+				if (ops[i].fprInXmm[reg])
+					fpr.BindToRegister(reg, true, false);
 			}
 
 			Jit64Tables::CompileInstruction(ops[i]);
 
 			// If we have a register that will never be used again, flush it.
- for (int j = 0; j < 32; j++) - { - if (!(ops[i].gprInUse & (1 << j))) - gpr.StoreFromRegister(j); - if (!(ops[i].fprInUse & (1 << j))) - fpr.StoreFromRegister(j); - } + for (int j : ~ops[i].gprInUse) + gpr.StoreFromRegister(j); + for (int j : ~ops[i].fprInUse) + fpr.StoreFromRegister(j); if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) { diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index bb1b77371d..c7b0dd1db4 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -95,42 +95,38 @@ void RegCache::UnlockAllX() xreg.locked = false; } -u32 GPRRegCache::GetRegUtilization() +BitSet32 GPRRegCache::GetRegUtilization() { return jit->js.op->gprInReg; } -u32 FPURegCache::GetRegUtilization() +BitSet32 FPURegCache::GetRegUtilization() { return jit->js.op->gprInReg; } -u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) +BitSet32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) { - u32 regsUsed = 0; + BitSet32 regsUsed; for (u32 i = 1; i < lookahead; i++) { - for (int j = 0; j < 3; j++) - if (jit->js.op[i].regsIn[j] >= 0) - regsUsed |= 1 << jit->js.op[i].regsIn[j]; - for (int j = 0; j < 3; j++) - if ((size_t)jit->js.op[i].regsIn[j] == preg) - return regsUsed; + BitSet32 regsIn = jit->js.op[i].regsIn; + regsUsed |= regsIn; + if (regsIn[preg]) + return regsUsed; } return regsUsed; } -u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) +BitSet32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) { - u32 regsUsed = 0; + BitSet32 regsUsed; for (u32 i = 1; i < lookahead; i++) { - for (int j = 0; j < 4; j++) - if (jit->js.op[i].fregsIn[j] >= 0) - regsUsed |= 1 << jit->js.op[i].fregsIn[j]; - for (int j = 0; j < 4; j++) - if ((size_t)jit->js.op[i].fregsIn[j] == preg) - return regsUsed; + BitSet32 regsIn = jit->js.op[i].fregsIn; + regsUsed |= regsIn; + if (regsIn[preg]) + return regsUsed; } return regsUsed; } @@ -151,17 +147,14 @@ float RegCache::ScoreRegister(X64Reg xr) // If the register isn't actually needed in a physical register for a later instruction, // writing it back to the register file isn't quite as bad. - if (GetRegUtilization() & (1 << preg)) + if (GetRegUtilization()[preg]) { // Don't look too far ahead; we don't want to have quadratic compilation times for // enormous block sizes! // This actually improves register allocation a tiny bit; I'm not sure why. u32 lookahead = std::min(jit->js.instructionsLeft, 64); // Count how many other registers are going to be used before we need this one again. - u32 regs_in = CountRegsIn(preg, lookahead); - u32 regs_in_count = 0; - for (int i = 0; i < 32; i++) - regs_in_count += !!(regs_in & (1 << i)); + u32 regs_in_count = CountRegsIn(preg, lookahead).Count(); // Totally ad-hoc heuristic to bias based on how many other registers we'll need // before this one gets used again. 
score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count)); diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 244ac1be59..3943e83852 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -44,8 +44,8 @@ protected: virtual const int *GetAllocationOrder(size_t& count) = 0; - virtual u32 GetRegUtilization() = 0; - virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0; + virtual BitSet32 GetRegUtilization() = 0; + virtual BitSet32 CountRegsIn(size_t preg, u32 lookahead) = 0; Gen::XEmitter *emit; @@ -137,8 +137,8 @@ public: Gen::OpArg GetDefaultLocation(size_t reg) const override; const int* GetAllocationOrder(size_t& count) override; void SetImmediate32(size_t preg, u32 immValue); - u32 GetRegUtilization(); - u32 CountRegsIn(size_t preg, u32 lookahead); + BitSet32 GetRegUtilization() override; + BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; }; @@ -149,6 +149,6 @@ public: void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; const int* GetAllocationOrder(size_t& count) override; Gen::OpArg GetDefaultLocation(size_t reg) const override; - u32 GetRegUtilization(); - u32 CountRegsIn(size_t preg, u32 lookahead); + BitSet32 GetRegUtilization() override; + BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; }; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 4bd5b60a14..acc5d372ed 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -249,21 +249,15 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) // That is, check that none of b's outputs matches any of a's inputs, // and that none of a's outputs matches any of b's inputs. // The latter does not apply if a is a cmp, of course, but doesn't hurt to check. 
- for (int j = 0; j < 3; j++) - { - int regInA = a.regsIn[j]; - int regInB = b.regsIn[j]; - // register collision: b outputs to one of a's inputs - if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA)) - return false; - // register collision: a outputs to one of b's inputs - if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB)) - return false; - // register collision: b outputs to one of a's outputs (overwriting it) - for (int k = 0; k < 2; k++) - if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1])) - return false; - } + // register collision: b outputs to one of a's inputs + if (b.regsOut & a.regsIn) + return false; + // register collision: a outputs to one of b's inputs + if (a.regsOut & b.regsIn) + return false; + // register collision: b outputs to one of a's outputs (overwriting it) + if (b.regsOut & a.regsOut) + return false; return true; } @@ -520,42 +514,41 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; - int numOut = 0; - int numIn = 0; - int numFloatIn = 0; + code->regsIn = BitSet32(0); + code->regsOut = BitSet32(0); if (opinfo->flags & FL_OUT_A) { - code->regsOut[numOut++] = code->inst.RA; + code->regsOut[code->inst.RA] = true; block->m_gpa->SetOutputRegister(code->inst.RA, index); } if (opinfo->flags & FL_OUT_D) { - code->regsOut[numOut++] = code->inst.RD; + code->regsOut[code->inst.RD] = true; block->m_gpa->SetOutputRegister(code->inst.RD, index); } if (opinfo->flags & FL_OUT_S) { - code->regsOut[numOut++] = code->inst.RS; + code->regsOut[code->inst.RS] = true; block->m_gpa->SetOutputRegister(code->inst.RS, index); } if ((opinfo->flags & FL_IN_A) || ((opinfo->flags & FL_IN_A0) && code->inst.RA != 0)) { - code->regsIn[numIn++] = code->inst.RA; + code->regsIn[code->inst.RA] = true; block->m_gpa->SetInputRegister(code->inst.RA, index); } if (opinfo->flags & FL_IN_B) { - code->regsIn[numIn++] = code->inst.RB; + code->regsIn[code->inst.RB] = true; block->m_gpa->SetInputRegister(code->inst.RB, index); } if (opinfo->flags & FL_IN_C) { - code->regsIn[numIn++] = code->inst.RC; + code->regsIn[code->inst.RC] = true; block->m_gpa->SetInputRegister(code->inst.RC, index); } if (opinfo->flags & FL_IN_S) { - code->regsIn[numIn++] = code->inst.RS; + code->regsIn[code->inst.RS] = true; block->m_gpa->SetInputRegister(code->inst.RS, index); } @@ -564,24 +557,17 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf code->fregOut = code->inst.FD; else if (opinfo->flags & FL_OUT_FLOAT_S) code->fregOut = code->inst.FS; + code->fregsIn = BitSet32(0); if (opinfo->flags & FL_IN_FLOAT_A) - code->fregsIn[numFloatIn++] = code->inst.FA; + code->fregsIn[code->inst.FA] = true; if (opinfo->flags & FL_IN_FLOAT_B) - code->fregsIn[numFloatIn++] = code->inst.FB; + code->fregsIn[code->inst.FB] = true; if (opinfo->flags & FL_IN_FLOAT_C) - code->fregsIn[numFloatIn++] = code->inst.FC; + code->fregsIn[code->inst.FC] = true; if (opinfo->flags & FL_IN_FLOAT_D) - code->fregsIn[numFloatIn++] = code->inst.FD; + code->fregsIn[code->inst.FD] = true; if (opinfo->flags & FL_IN_FLOAT_S) - code->fregsIn[numFloatIn++] = code->inst.FS; - - // Set remaining register slots as unused (-1) - for (int j = numIn; j < 3; j++) - code->regsIn[j] = -1; - for (int j = numOut; j < 2; j++) - code->regsOut[j] = -1; - for (int j = numFloatIn; j < 4; j++) - 
code->fregsIn[j] = -1; + code->fregsIn[code->inst.FS] = true; switch (opinfo->type) { @@ -797,7 +783,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 // Scan for flag dependencies; assume the next block (or any branch that can leave the block) // wants flags, to be safe. bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true; - u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0; + BitSet32 fprInUse, gprInUse, gprInReg, fprInXmm; for (int i = block->m_num_instructions - 1; i >= 0; i--) { bool opWantsCR0 = code[i].wantsCR0; @@ -822,30 +808,20 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 code[i].fprInXmm = fprInXmm; // TODO: if there's no possible endblocks or exceptions in between, tell the regcache // we can throw away a register if it's going to be overwritten later. - for (int j = 0; j < 3; j++) - if (code[i].regsIn[j] >= 0) - { - gprInUse |= 1 << code[i].regsIn[j]; - gprInReg |= 1 << code[i].regsIn[j]; - } - for (int j = 0; j < 4; j++) - if (code[i].fregsIn[j] >= 0) - { - fprInUse |= 1 << code[i].fregsIn[j]; - if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fprInXmm |= 1 << code[i].fregsIn[j]; - } + gprInUse |= code[i].regsIn; + gprInReg |= code[i].regsIn; + fprInUse |= code[i].fregsIn; + if (strncmp(code[i].opinfo->opname, "stfd", 4)) + fprInXmm |= code[i].fregsIn; // For now, we need to count output registers as "used" though; otherwise the flush // will result in a redundant store (e.g. store to regcache, then store again to // the same location later). - for (int j = 0; j < 2; j++) - if (code[i].regsOut[j] >= 0) - gprInUse |= 1 << code[i].regsOut[j]; + gprInUse |= code[i].regsOut; if (code[i].fregOut >= 0) { - fprInUse |= 1 << code[i].fregOut; + fprInUse[code[i].fregOut] = true; if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fprInXmm |= 1 << code[i].fregOut; + fprInXmm[code[i].fregOut] = true; } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 4ae6e6ded3..8b3f4bc85a 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -10,6 +10,7 @@ #include #include +#include "Common/BitSet.h" #include "Common/CommonTypes.h" #include "Core/PowerPC/PPCTables.h" @@ -26,10 +27,10 @@ struct CodeOp //16B u32 address; u32 branchTo; //if 0, not a branch int branchToIndex; //index of target block - s8 regsOut[2]; - s8 regsIn[3]; + BitSet32 regsOut; + BitSet32 regsIn; + BitSet32 fregsIn; s8 fregOut; - s8 fregsIn[4]; bool isBranchTarget; bool wantsCR0; bool wantsCR1; @@ -43,13 +44,13 @@ struct CodeOp //16B bool canEndBlock; bool skip; // followed BL-s for example // which registers are still needed after this instruction in this block - u32 fprInUse; - u32 gprInUse; + BitSet32 fprInUse; + BitSet32 gprInUse; // just because a register is in use doesn't mean we actually need or want it in an x86 register. - u32 gprInReg; + BitSet32 gprInReg; // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into // an XMM only to move it again to a GPR afterwards. - u32 fprInXmm; + BitSet32 fprInXmm; }; struct BlockStats From eb7f4dac507c22da73dd57a8ad008b5d1d4382b1 Mon Sep 17 00:00:00 2001 From: comex Date: Thu, 16 Oct 2014 22:21:55 -0400 Subject: [PATCH 2/5] Convert registersInUse to BitSet. 
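
For illustration, the typical shape of the conversion at a call site (a
sketch composed from the hunks below, not new code):

    // before: raw u32 bitmask
    u32 registersInUse = CallerSavedRegistersInUse();
    registersInUse |= (1 << RSCRATCH2);
    ABI_PushRegistersAndAdjustStack(registersInUse, 0);

    // after: BitSet32 with the same semantics
    BitSet32 registersInUse = CallerSavedRegistersInUse();
    registersInUse[RSCRATCH2] = true;
    ABI_PushRegistersAndAdjustStack(registersInUse, 0);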
--- Source/Core/Common/x64ABI.cpp | 51 ++++++------------- Source/Core/Common/x64ABI.h | 21 ++++---- Source/Core/Common/x64Emitter.h | 7 +-- Source/Core/Core/DSP/DSPEmitter.cpp | 2 +- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 46 ++++++++--------- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- Source/Core/Core/PowerPC/Jit64/JitAsm.cpp | 16 +++--- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 12 ++--- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 4 +- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 9 ++-- .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 11 ++-- .../Core/PowerPC/JitCommon/JitBackpatch.cpp | 8 +-- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 24 ++++----- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 15 +++--- .../PowerPC/JitCommon/TrampolineCache.cpp | 10 ++-- .../Core/PowerPC/JitCommon/TrampolineCache.h | 11 ++-- Source/Core/Core/PowerPC/Profiler.h | 2 +- Source/Core/VideoCommon/VertexLoader.cpp | 4 +- 18 files changed, 119 insertions(+), 136 deletions(-) diff --git a/Source/Core/Common/x64ABI.cpp b/Source/Core/Common/x64ABI.cpp index 0f958a4a6d..f51b760ced 100644 --- a/Source/Core/Common/x64ABI.cpp +++ b/Source/Core/Common/x64ABI.cpp @@ -10,31 +10,23 @@ using namespace Gen; // Shared code between Win64 and Unix64 -void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) +void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) { size_t shadow = 0; #if defined(_WIN32) shadow = 0x20; #endif - int count = 0; - for (int r = 0; r < 16; r++) - { - if (mask & (1 << r)) - count++; - } + int count = (mask & ABI_ALL_GPRS).Count(); rsp_alignment -= count * 8; size_t subtraction = 0; - if (mask & 0xffff0000) + int fpr_count = (mask & ABI_ALL_FPRS).Count(); + if (fpr_count) { // If we have any XMMs to save, we must align the stack here. subtraction = rsp_alignment & 0xf; } - for (int x = 0; x < 16; x++) - { - if (mask & (1 << (16 + x))) - subtraction += 16; - } + subtraction += 16 * fpr_count; size_t xmm_base_subtraction = subtraction; subtraction += needed_frame_size; subtraction += shadow; @@ -47,44 +39,35 @@ void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t nee *xmm_offsetp = subtraction - xmm_base_subtraction; } -size_t XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size) +size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) { size_t shadow, subtraction, xmm_offset; ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); - for (int r = 0; r < 16; r++) - { - if (mask & (1 << r)) - PUSH((X64Reg) r); - } + for (int r : mask & ABI_ALL_GPRS) + PUSH((X64Reg) r); if (subtraction) SUB(64, R(RSP), subtraction >= 0x80 ? 
Imm32((u32)subtraction) : Imm8((u8)subtraction)); - for (int x = 0; x < 16; x++) + for (int x : mask & ABI_ALL_FPRS) { - if (mask & (1 << (16 + x))) - { - MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) x); - xmm_offset += 16; - } + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) (x - 16)); + xmm_offset += 16; } return shadow; } -void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size) +void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) { size_t shadow, subtraction, xmm_offset; ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); - for (int x = 0; x < 16; x++) + for (int x : mask & ABI_ALL_FPRS) { - if (mask & (1 << (16 + x))) - { - MOVAPD((X64Reg) x, MDisp(RSP, (int)xmm_offset)); - xmm_offset += 16; - } + MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; } if (subtraction) @@ -92,10 +75,8 @@ void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, si for (int r = 15; r >= 0; r--) { - if (mask & (1 << r)) - { + if (mask[r]) POP((X64Reg) r); - } } } diff --git a/Source/Core/Common/x64ABI.h b/Source/Core/Common/x64ABI.h index bf058bc04a..c76759b31c 100644 --- a/Source/Core/Common/x64ABI.h +++ b/Source/Core/Common/x64ABI.h @@ -4,6 +4,7 @@ #pragma once +#include "Common/BitSet.h" #include "Common/x64Emitter.h" // x64 ABI:s, and helpers to help follow them when JIT-ing code. @@ -23,6 +24,9 @@ // Callee-save: RBX RBP R12 R13 R14 R15 // Parameters: RDI RSI RDX RCX R8 R9 +#define ABI_ALL_FPRS BitSet32(0xffff0000) +#define ABI_ALL_GPRS BitSet32(0x0000ffff) + #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention #define ABI_PARAM1 RCX @@ -31,11 +35,9 @@ #define ABI_PARAM4 R9 // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. -#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \ - (1 << R9) | (1 << R10) | (1 << R11) | \ - (1 << (XMM0+16)) | (1 << (XMM1+16)) | (1 << (XMM2+16)) | (1 << (XMM3+16)) | \ - (1 << (XMM4+16)) | (1 << (XMM5+16))) - +#define ABI_ALL_CALLER_SAVED \ + (BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \ + XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 }) #else //64-bit Unix / OS X #define ABI_PARAM1 RDI @@ -47,13 +49,12 @@ // FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably // don't actually clobber them. 
-#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \ - (1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \ - 0xffff0000 /* xmm0..15 */) - +#define ABI_ALL_CALLER_SAVED \ + (BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \ + ABI_ALL_FPRS) #endif // WIN32 -#define ABI_ALL_CALLEE_SAVED ((u32) ~ABI_ALL_CALLER_SAVED) +#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED) #define ABI_RETURN RAX diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index ed0250e8d0..7f98497456 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -10,6 +10,7 @@ #include #include +#include "Common/BitSet.h" #include "Common/CodeBlock.h" #include "Common/CommonTypes.h" @@ -302,7 +303,7 @@ private: void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); - void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); protected: inline void Write8(u8 value) {*code++ = value;} @@ -883,8 +884,8 @@ public: // Saves/restores the registers and adjusts the stack to be aligned as // required by the ABI, where the previous alignment was as specified. // Push returns the size of the shadow space, i.e. the offset of the frame. - size_t ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); - void ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); + size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); + void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); inline int ABI_GetNumXMMRegs() { return 16; } diff --git a/Source/Core/Core/DSP/DSPEmitter.cpp b/Source/Core/Core/DSP/DSPEmitter.cpp index 188dfcaf2c..fec47aea93 100644 --- a/Source/Core/Core/DSP/DSPEmitter.cpp +++ b/Source/Core/Core/DSP/DSPEmitter.cpp @@ -385,7 +385,7 @@ void DSPEmitter::CompileDispatcher() { enterDispatcher = AlignCode16(); // We don't use floating point (high 16 bits). 
- u32 registers_used = ABI_ALL_CALLEE_SAVED & 0xffff; + BitSet32 registers_used = ABI_ALL_CALLEE_SAVED & BitSet32(0xffff); ABI_PushRegistersAndAdjustStack(registers_used, 8); const u8 *dispatcherLoop = GetCodePtr(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 99633dc57d..06a27a5c2d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -248,9 +248,9 @@ void Jit64::WriteCallInterpreter(UGeckoInstruction inst) MOV(32, PPCSTATE(npc), Imm32(js.compilerPC + 4)); } Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunctionC((void*)instr, inst.hex); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); } void Jit64::unknown_instruction(UGeckoInstruction inst) @@ -267,9 +267,9 @@ void Jit64::HLEFunction(UGeckoInstruction _inst) { gpr.Flush(); fpr.Flush(); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); } void Jit64::DoNothing(UGeckoInstruction _inst) @@ -307,18 +307,18 @@ bool Jit64::Cleanup() if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) { - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); did_something = true; } // SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time. if (MMCR0.Hex || MMCR1.Hex) { - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); did_something = true; } @@ -433,9 +433,9 @@ void Jit64::WriteRfiExitDestInRSCRATCH() MOV(32, PPCSTATE(pc), R(RSCRATCH)); MOV(32, PPCSTATE(npc), R(RSCRATCH)); Cleanup(); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -445,9 +445,9 @@ void Jit64::WriteExceptionExit() Cleanup(); MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, PPCSTATE(npc), R(RSCRATCH)); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -457,9 +457,9 @@ void Jit64::WriteExternalExceptionExit() Cleanup(); MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, PPCSTATE(npc), R(RSCRATCH)); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExternalExceptions)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); JMP(asm_routines.dispatcher, true); } @@ -560,9 +560,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, 
JitBloc if (ImHereDebug) { - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); } // Conditionally add profiling code. @@ -637,7 +637,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc { js.fifoBytesThisBlock -= 32; MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write - u32 registersInUse = CallerSavedRegistersInUse(); + BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); ABI_PopRegistersAndAdjustStack(registersInUse, 0); @@ -719,9 +719,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc fpr.Flush(); MOV(32, PPCSTATE(pc), Imm32(ops[i].address)); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckBreakPoints)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); FixupBranch noBreakpoint = J_CC(CC_Z); @@ -843,15 +843,15 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc return normalEntry; } -u32 Jit64::CallerSavedRegistersInUse() +BitSet32 Jit64::CallerSavedRegistersInUse() { - u32 result = 0; + BitSet32 result; for (int i = 0; i < NUMXREGS; i++) { if (!gpr.IsFreeX(i)) - result |= (1 << i); + result[i] = true; if (!fpr.IsFreeX(i)) - result |= (1 << (16 + i)); + result[16 + i] = true; } return result & ABI_ALL_CALLER_SAVED; } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 4dfbe56eb8..bb49e9288d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -73,7 +73,7 @@ public: void Jit(u32 em_address) override; const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b); - u32 CallerSavedRegistersInUse(); + BitSet32 CallerSavedRegistersInUse(); JitBlockCache *GetBlockCache() override { return &blocks; } diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index d895ae76db..7adb93d3b1 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -43,9 +43,9 @@ void Jit64AsmRoutineManager::Generate() MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80)); const u8* outerLoop = GetCodePtr(); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&CoreTiming::Advance)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time dispatcherMispredictedBLR = GetCodePtr(); @@ -71,9 +71,9 @@ void Jit64AsmRoutineManager::Generate() { TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING)); FixupBranch notStepping = J_CC(CC_Z); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckBreakPoints)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); TEST(32, M((void*)PowerPC::GetStatePtr()), 
Imm32(0xFFFFFFFF)); dbg_exit = J_CC(CC_NZ, true); SetJumpTarget(notStepping); @@ -129,9 +129,9 @@ void Jit64AsmRoutineManager::Generate() SetJumpTarget(notfound); //Ok, no block, let's jit - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunctionA((void *)&Jit, PPCSTATE(pc)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); // Jit might have cleared the code cache ResetStack(); @@ -146,9 +146,9 @@ void Jit64AsmRoutineManager::Generate() FixupBranch noExtException = J_CC(CC_Z); MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, PPCSTATE(npc), R(RSCRATCH)); - ABI_PushRegistersAndAdjustStack(0, 0); + ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExternalExceptions)); - ABI_PopRegistersAndAdjustStack(0, 0); + ABI_PopRegistersAndAdjustStack({}, 0); SetJumpTarget(noExtException); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index a2abade26b..cb4c6521a3 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -130,7 +130,7 @@ void Jit64::lXXx(UGeckoInstruction inst) TEST(32, gpr.R(d), gpr.R(d)); FixupBranch noIdle = J_CC(CC_NZ); - u32 registersInUse = CallerSavedRegistersInUse(); + BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16); @@ -242,11 +242,11 @@ void Jit64::lXXx(UGeckoInstruction inst) gpr.Lock(a, b, d); gpr.BindToRegister(d, js.memcheck, true); - u32 registersInUse = CallerSavedRegistersInUse(); + BitSet32 registersInUse = CallerSavedRegistersInUse(); if (update && storeAddress) { // We need to save the (usually scratch) address register for the update. 
- registersInUse |= (1 << RSCRATCH2); + registersInUse[RSCRATCH2] = true; } SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend); @@ -310,7 +310,7 @@ void Jit64::dcbz(UGeckoInstruction inst) SwitchToFarCode(); SetJumpTarget(slow); MOV(32, M(&PC), Imm32(jit->js.compilerPC)); - u32 registersInUse = CallerSavedRegistersInUse(); + BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH); ABI_PopRegistersAndAdjustStack(registersInUse, 0); @@ -399,7 +399,7 @@ void Jit64::stX(UGeckoInstruction inst) // Helps external systems know which instruction triggered the write MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); - u32 registersInUse = CallerSavedRegistersInUse(); + BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); switch (accessSize) { @@ -551,7 +551,7 @@ void Jit64::lmw(UGeckoInstruction inst) ADD(32, R(RSCRATCH2), gpr.R(inst.RA)); for (int i = inst.RD; i < 32; i++) { - SafeLoadToReg(RSCRATCH, R(RSCRATCH2), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << RSCRATCH_EXTRA), false); + SafeLoadToReg(RSCRATCH, R(RSCRATCH2), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | BitSet32 { RSCRATCH_EXTRA }, false); gpr.BindToRegister(i, false, true); MOV(32, gpr.R(i), R(RSCRATCH)); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index de5097ceb2..2b158b5948 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -65,9 +65,9 @@ void Jit64::lfXXX(UGeckoInstruction inst) offset = (s16)inst.SIMM_16; } - u32 registersInUse = CallerSavedRegistersInUse(); + BitSet32 registersInUse = CallerSavedRegistersInUse(); if (update && js.memcheck) - registersInUse |= (1 << RSCRATCH2); + registersInUse[RSCRATCH2] = true; SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, registersInUse, false); fpr.Lock(d); fpr.BindToRegister(d, js.memcheck || !single); diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index 88c96c327d..854dd0a50a 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -26,6 +26,7 @@ The register allocation is linear scan allocation. 
#include +#include "Common/BitSet.h" #include "Common/CPUDetect.h" #include "Common/MathUtil.h" #include "Core/HW/ProcessorInterface.h" @@ -60,15 +61,15 @@ struct RegInfo RegInfo(RegInfo&); // DO NOT IMPLEMENT }; -static u32 regsInUse(RegInfo& R) +static BitSet32 regsInUse(RegInfo& R) { - u32 result = 0; + BitSet32 result; for (unsigned i = 0; i < MAX_NUMBER_OF_REGS; i++) { if (R.regs[i] != nullptr) - result |= (1 << i); + result[i] = true; if (R.fregs[i] != nullptr) - result |= (1 << (16 + i)); + result[16 + i] = true; } return result; } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index a4036ab172..c47198865e 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -10,14 +10,11 @@ #include "Core/PowerPC/JitCommon/JitBase.h" #define QUANTIZED_REGS_TO_SAVE \ - (ABI_ALL_CALLER_SAVED & ~(\ - (1 << RSCRATCH) | \ - (1 << RSCRATCH2) | \ - (1 << RSCRATCH_EXTRA)| \ - (1 << (XMM0+16)) | \ - (1 << (XMM1+16)))) + (ABI_ALL_CALLER_SAVED & ~BitSet32 { \ + RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \ + }) -#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2)) +#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 }) using namespace Gen; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp index ea921817b8..64fd24ba73 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp @@ -72,7 +72,7 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) return false; } - u32 registersInUse = it->second; + BitSet32 registersInUse = it->second; if (!info.isMemoryWrite) { @@ -98,14 +98,14 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) else { // TODO: special case FIFO writes. Also, support 32-bit mode. 
-		it = pcAtLoc.find(codePtr);
-		if (it == pcAtLoc.end())
+		auto it2 = pcAtLoc.find(codePtr);
+		if (it2 == pcAtLoc.end())
 		{
 			PanicAlert("BackPatch: no pc entry for address %p", codePtr);
 			return nullptr;
 		}
 
-		u32 pc = it->second;
+		u32 pc = it2->second;
 
 		u8 *start;
 		if (info.byteSwap || info.hasImmediate)
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 0e9853948a..c2dc1475a8 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -137,7 +137,7 @@
 template <typename T>
 class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T>
 {
 public:
-	MMIOReadCodeGenerator(Gen::X64CodeBlock* code, u32 registers_in_use,
+	MMIOReadCodeGenerator(Gen::X64CodeBlock* code, BitSet32 registers_in_use,
 	                      Gen::X64Reg dst_reg, u32 address, bool sign_extend)
 		: m_code(code), m_registers_in_use(registers_in_use), m_dst_reg(dst_reg),
 		  m_address(address), m_sign_extend(sign_extend)
 	{
@@ -214,14 +214,14 @@ private:
 	}
 
 	Gen::X64CodeBlock* m_code;
-	u32 m_registers_in_use;
+	BitSet32 m_registers_in_use;
 	Gen::X64Reg m_dst_reg;
 	u32 m_address;
 	bool m_sign_extend;
 };
 
 void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
-                                 u32 registers_in_use, u32 address,
+                                 BitSet32 registers_in_use, u32 address,
                                  int access_size, bool sign_extend)
 {
 	switch (access_size)
 	{
@@ -250,17 +250,17 @@
 	}
 }
 
-FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, u32 registers_in_use, u32 mem_mask)
+FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask)
 {
-	registers_in_use |= (1 << reg_addr);
+	registers_in_use[reg_addr] = true;
 	if (reg_value.IsSimpleReg())
-		registers_in_use |= (1 << reg_value.GetSimpleReg());
+		registers_in_use[reg_value.GetSimpleReg()] = true;
 
 	// Get ourselves a free register; try to pick one that doesn't involve pushing, if we can.
X64Reg scratch = RSCRATCH; - if (!(registers_in_use & (1 << RSCRATCH))) + if (!registers_in_use[RSCRATCH]) scratch = RSCRATCH; - else if (!(registers_in_use & (1 << RSCRATCH_EXTRA))) + else if (!registers_in_use[RSCRATCH_EXTRA]) scratch = RSCRATCH_EXTRA; else scratch = reg_addr; @@ -290,11 +290,11 @@ FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, u } } -void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags) +void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags) { if (!jit->js.memcheck) { - registersInUse &= ~(1 << reg_value); + registersInUse[reg_value] = false; } if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU && SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && @@ -468,7 +468,7 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acce return result; } -void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags) +void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags) { // set the correct immediate format if (reg_value.IsImm()) @@ -580,7 +580,7 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces } // Destroys the same as SafeWrite plus RSCRATCH. TODO: see if we can avoid temporaries here -void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, u32 registersInUse, int flags) +void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags) { // TODO: PSHUFB might be faster if fastmem supported MOVSS. MOVD_xmm(R(RSCRATCH), xmm_value); diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 43b54debd9..68f3ced898 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -6,6 +6,7 @@ #include +#include "Common/BitSet.h" #include "Common/CPUDetect.h" #include "Common/x64Emitter.h" @@ -76,7 +77,7 @@ public: void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src); void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src); - Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, u32 registers_in_use, u32 mem_mask); + Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask); void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false); void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false); // these return the address of the MOV, for backpatching @@ -89,7 +90,7 @@ public: // Generate a load/write from the MMIO handler for a given address. Only // call for known addresses in MMIO range (MMIO::IsMMIOAddress). 
-	void MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, u32 registers_in_use, u32 address, int access_size, bool sign_extend);
+	void MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, BitSet32 registers_in_use, u32 address, int access_size, bool sign_extend);
 
 	enum SafeLoadStoreFlags
 	{
@@ -99,12 +100,12 @@ public:
 		SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR = 8
 	};
 
-	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
+	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags = 0);
 	// Clobbers RSCRATCH or reg_addr depending on the relevant flag.  Preserves
 	// reg_value if the load fails and js.memcheck is enabled.
 	// Works with immediate inputs and simple registers only.
-	void SafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
-	void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0)
+	void SafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags = 0);
+	void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags = 0)
 	{
 		SafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, registersInUse, flags);
 	}
@@ -115,7 +116,7 @@ public:
 		return swap && !cpu_info.bMOVBE && accessSize > 8;
 	}
 
-	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
+	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags = 0);
 
 	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
 	void JitGetAndClearCAOV(bool oe);
@@ -137,6 +138,6 @@ public:
 	void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
 	void SetFPRF(Gen::X64Reg xmm);
 protected:
-	std::unordered_map<u8 *, u32> registersInUseAtLoc;
+	std::unordered_map<u8 *, BitSet32> registersInUseAtLoc;
 	std::unordered_map<u8 *, u32> pcAtLoc;
 };
diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp
index 7b23ac1427..63fbd20fdc 100644
--- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp
@@ -36,7 +36,7 @@ void TrampolineCache::Shutdown()
 	cachedTrampolines.clear();
 }
 
-const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 registersInUse)
+const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse)
 {
 	TrampolineCacheKey key = { registersInUse, 0, info };
 
@@ -49,7 +49,7 @@ const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
 	return trampoline;
 }
 
-const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, u32 registersInUse)
+const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse)
 {
 	if (GetSpaceLeft() < 1024)
 		PanicAlert("Trampoline cache full");
@@ -97,7 +97,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, u
 	return trampoline;
 }
 
-const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc)
+const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc)
 {
 	TrampolineCacheKey key = { registersInUse, pc, info };
 
@@ -110,7 +110,7 @@ const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
 	return trampoline;
 }
 
-const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc)
+const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc)
 {
 	if (GetSpaceLeft() < 1024)
 		PanicAlert("Trampoline cache full");
@@ -184,7 +184,7 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info,
 
 size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const
 {
-	size_t res = std::hash<u32>()(k.registersInUse);
+	size_t res = std::hash<u32>()(k.registersInUse.m_val);
 	res ^= std::hash<int>()(k.info.operandSize) >> 1;
 	res ^= std::hash<int>()(k.info.regOperandReg) >> 2;
 	res ^= std::hash<int>()(k.info.scaledReg) >> 3;
diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h
index cb9fee2978..16e293bce0 100644
--- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h
+++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h
@@ -6,6 +6,7 @@
 
 #include <unordered_map>
 
+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Common/x64Analyzer.h"
 #include "Common/x64Emitter.h"
@@ -15,7 +16,7 @@ const int BACKPATCH_SIZE = 5;
 
 struct TrampolineCacheKey
 {
-	u32 registersInUse;
+	BitSet32 registersInUse;
 	u32 pc;
 	InstructionInfo info;
 
@@ -33,13 +34,13 @@ public:
 	void Init();
 	void Shutdown();
 
-	const u8* GetReadTrampoline(const InstructionInfo &info, u32 registersInUse);
-	const u8* GetWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc);
+	const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse);
+	const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc);
 	void ClearCodeSpace();
 
 private:
-	const u8* GenerateReadTrampoline(const InstructionInfo &info, u32 registersInUse);
-	const u8* GenerateWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc);
+	const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse);
+	const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc);
 
 	std::unordered_map<TrampolineCacheKey, const u8*, TrampolineCacheKeyHasher> cachedTrampolines;
 };
diff --git a/Source/Core/Core/PowerPC/Profiler.h b/Source/Core/Core/PowerPC/Profiler.h
index 83c323ad9d..7b51589225 100644
--- a/Source/Core/Core/PowerPC/Profiler.h
+++ b/Source/Core/Core/PowerPC/Profiler.h
@@ -23,7 +23,7 @@
 	MOV(64, M(pdt), R(RSCRATCH));
 
 #define PROFILER_VPUSH \
-	u32 registersInUse = CallerSavedRegistersInUse(); \
+	BitSet32 registersInUse = CallerSavedRegistersInUse(); \
 	ABI_PushRegistersAndAdjustStack(registersInUse, 0);
 
 #define PROFILER_VPOP \
diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp
index 291bb94c62..e4974a8b2b 100644
--- a/Source/Core/VideoCommon/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/VertexLoader.cpp
@@ -141,7 +141,7 @@ void VertexLoader::CompileVertexTranslator()
 	m_compiledCode = GetCodePtr();
 	// We only use RAX (caller saved) and RBX (callee saved).
-	ABI_PushRegistersAndAdjustStack(1 << RBX, 8);
+	ABI_PushRegistersAndAdjustStack({RBX}, 8);
 
 	// save count
 	MOV(64, R(RBX), R(ABI_PARAM1));
@@ -402,7 +402,7 @@ void VertexLoader::CompileVertexTranslator()
 	SUB(64, R(RBX), Imm8(1));
 	J_CC(CC_NZ, loop_start);
 
-	ABI_PopRegistersAndAdjustStack(1 << RBX, 8);
+	ABI_PopRegistersAndAdjustStack({RBX}, 8);
 	RET();
 #endif
 }

From f51c233a08969e4126520561ecb15e56674ffe9d Mon Sep 17 00:00:00 2001
From: comex
Date: Fri, 17 Oct 2014 01:24:24 -0400
Subject: [PATCH 3/5] Add workaround for OS X symbol clash and AllTrue helper
 method.

---
 Source/Core/Common/BitSet.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/Source/Core/Common/BitSet.h b/Source/Core/Common/BitSet.h
index b2cf577b50..91a66a8b76 100644
--- a/Source/Core/Common/BitSet.h
+++ b/Source/Core/Common/BitSet.h
@@ -39,11 +39,14 @@
 static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
 static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
 #endif
 
+// namespace avoids conflict with OS X Carbon; don't use BitSet directly
+namespace BS
+{
 
 // Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
 // using the set bits of an integer to represent a set of integers. Like that
 // class, it acts like an array of bools:
-//   BitSet32 bs; // use BitSet{32,64} instead of the template directly
+//   BitSet32 bs;
 //   bs[1] = true;
 // but also like the underlying integer ([0] = least significant bit):
 //   BitSet32 bs2 = ...;
 //   bs = (bs ^ bs2) & BitSet32(0xffff);
 // The following additional functionality is provided:
 // - Construction using an initializer list.
@@ -126,6 +129,11 @@ public:
 		m_val |= (IntTy)1 << bit;
 	}
 
+	static BitSet AllTrue(size_t count)
+	{
+		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
+	}
+
 	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
 	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
 	bool operator==(BitSet other) const { return m_val == other.m_val; }
 	bool operator!=(BitSet other) const { return m_val != other.m_val; }
@@ -152,5 +160,7 @@ public:
 	IntTy m_val;
 };
 
-typedef BitSet<u32> BitSet32;
-typedef BitSet<u64> BitSet64;
+}
+
+typedef BS::BitSet<u32> BitSet32;
+typedef BS::BitSet<u64> BitSet64;

From b29e5146ec63e1b35cee45e01f54c4b69f15938e Mon Sep 17 00:00:00 2001
From: comex
Date: Tue, 21 Oct 2014 20:42:55 -0400
Subject: [PATCH 4/5] Convert some VideoCommon stuff to BitSet.

Now with a minor performance improvement removed for no reason.
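
As a sketch of the pattern (drawn from the hunks below), the dirty-attribute
tracking goes from manual shifts and masks:

    state->attr_dirty |= 1 << (sub_cmd & 7);
    if ((state->attr_dirty >> vtx_attr_group) & 1) { /* rebuild loader */ }

to BitSet32, using AllTrue() from the previous commit to mark all eight
attribute groups dirty at once:

    state->attr_dirty[sub_cmd & 7] = true;
    if (state->attr_dirty[vtx_attr_group]) { /* rebuild loader */ }
    state->attr_dirty = BitSet32::AllTrue(8);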
--- Source/Core/VideoCommon/CPMemory.h | 3 +- .../Core/VideoCommon/VertexLoaderManager.cpp | 18 ++++---- Source/Core/VideoCommon/VertexManagerBase.cpp | 43 +++++++++---------- .../Core/VideoCommon/VertexShaderManager.cpp | 40 +++++------------ 4 files changed, 42 insertions(+), 62 deletions(-) diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h index ae8ff08303..a256fc6849 100644 --- a/Source/Core/VideoCommon/CPMemory.h +++ b/Source/Core/VideoCommon/CPMemory.h @@ -4,6 +4,7 @@ #pragma once +#include "Common/BitSet.h" #include "Common/CommonTypes.h" // Vertex array numbers @@ -252,7 +253,7 @@ struct CPState final VAT vtx_attr[8]; // Attributes that actually belong to VertexLoaderManager: - int attr_dirty; // bitfield + BitSet32 attr_dirty; VertexLoader* vertex_loaders[8]; }; diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index ca925d0302..b8132fa806 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -100,14 +100,14 @@ void AppendListToString(std::string *dest) void MarkAllDirty() { - g_main_cp_state.attr_dirty = 0xff; - g_preprocess_cp_state.attr_dirty = 0xff; + g_main_cp_state.attr_dirty = BitSet32::AllTrue(8); + g_preprocess_cp_state.attr_dirty = BitSet32::AllTrue(8); } static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state) { VertexLoader* loader; - if ((state->attr_dirty >> vtx_attr_group) & 1) + if (state->attr_dirty[vtx_attr_group]) { VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]); std::lock_guard lk(s_vertex_loader_map_lock); @@ -123,7 +123,7 @@ static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state) INCSTAT(stats.numVertexLoaders); } state->vertex_loaders[vtx_attr_group] = loader; - state->attr_dirty &= ~(1 << vtx_attr_group); + state->attr_dirty[vtx_attr_group] = false; } else { loader = state->vertex_loaders[vtx_attr_group]; } @@ -200,31 +200,31 @@ void LoadCPReg(u32 sub_cmd, u32 value, bool is_preprocess) case 0x50: state->vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits state->vtx_desc.Hex |= value; - state->attr_dirty = 0xFF; + state->attr_dirty = BitSet32::AllTrue(8); break; case 0x60: state->vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits state->vtx_desc.Hex |= (u64)value << 17; - state->attr_dirty = 0xFF; + state->attr_dirty = BitSet32::AllTrue(8); break; case 0x70: _assert_((sub_cmd & 0x0F) < 8); state->vtx_attr[sub_cmd & 7].g0.Hex = value; - state->attr_dirty |= 1 << (sub_cmd & 7); + state->attr_dirty[sub_cmd & 7] = true; break; case 0x80: _assert_((sub_cmd & 0x0F) < 8); state->vtx_attr[sub_cmd & 7].g1.Hex = value; - state->attr_dirty |= 1 << (sub_cmd & 7); + state->attr_dirty[sub_cmd & 7] = true; break; case 0x90: _assert_((sub_cmd & 0x0F) < 8); state->vtx_attr[sub_cmd & 7].g2.Hex = value; - state->attr_dirty |= 1 << (sub_cmd & 7); + state->attr_dirty[sub_cmd & 7] = true; break; // Pointers to vertex arrays in GC RAM diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index d637ad9017..7a18ba435b 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -180,39 +180,36 @@ void VertexManager::Flush() (int)bpmem.genMode.numtexgens, (u32)bpmem.dstalpha.enable, (bpmem.alpha_test.hex>>16)&0xff); #endif - u32 usedtextures = 0; + BitSet32 usedtextures; for (u32 i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) if (bpmem.tevorders[i / 2].getEnable(i & 1)) - 
+			usedtextures[bpmem.tevorders[i/2].getTexMap(i & 1)] = true;
 
 	if (bpmem.genMode.numindstages > 0)
 		for (unsigned int i = 0; i < bpmem.genMode.numtevstages + 1u; ++i)
 			if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages)
-				usedtextures |= 1 << bpmem.tevindref.getTexMap(bpmem.tevind[i].bt);
+				usedtextures[bpmem.tevindref.getTexMap(bpmem.tevind[i].bt)] = true;
 
-	for (unsigned int i = 0; i < 8; i++)
+	for (unsigned int i : usedtextures)
 	{
-		if (usedtextures & (1 << i))
-		{
-			g_renderer->SetSamplerState(i & 3, i >> 2);
-			const FourTexUnits &tex = bpmem.tex[i >> 2];
-			const TextureCache::TCacheEntryBase* tentry = TextureCache::Load(i,
-				(tex.texImage3[i&3].image_base/* & 0x1FFFFF*/) << 5,
-				tex.texImage0[i&3].width + 1, tex.texImage0[i&3].height + 1,
-				tex.texImage0[i&3].format, tex.texTlut[i&3].tmem_offset<<9,
-				tex.texTlut[i&3].tlut_format,
-				((tex.texMode0[i&3].min_filter & 3) != 0),
-				(tex.texMode1[i&3].max_lod + 0xf) / 0x10,
-				(tex.texImage1[i&3].image_type != 0));
+		g_renderer->SetSamplerState(i & 3, i >> 2);
+		const FourTexUnits &tex = bpmem.tex[i >> 2];
+		const TextureCache::TCacheEntryBase* tentry = TextureCache::Load(i,
+			(tex.texImage3[i&3].image_base/* & 0x1FFFFF*/) << 5,
+			tex.texImage0[i&3].width + 1, tex.texImage0[i&3].height + 1,
+			tex.texImage0[i&3].format, tex.texTlut[i&3].tmem_offset<<9,
+			tex.texTlut[i&3].tlut_format,
+			((tex.texMode0[i&3].min_filter & 3) != 0),
+			(tex.texMode1[i&3].max_lod + 0xf) / 0x10,
+			(tex.texImage1[i&3].image_type != 0));
 
-		if (tentry)
-		{
-			// 0s are probably for no manual wrapping needed.
-			PixelShaderManager::SetTexDims(i, tentry->native_width, tentry->native_height, 0, 0);
-		}
-		else
-			ERROR_LOG(VIDEO, "error loading texture");
+		if (tentry)
+		{
+			// 0s are probably for no manual wrapping needed.
+			PixelShaderManager::SetTexDims(i, tentry->native_width, tentry->native_height, 0, 0);
 		}
+		else
+			ERROR_LOG(VIDEO, "error loading texture");
 	}
 
 	// set global constants
diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp
index 022bf7683d..93f969b0b5 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <sstream>
 
+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Common/MathUtil.h"
 #include "VideoCommon/BPMemory.h"
@@ -22,7 +23,7 @@ static float GC_ALIGNED16(g_fProjectionMatrix[16]);
 
 // track changes
 static bool bTexMatricesChanged[2], bPosNormalMatrixChanged, bProjectionChanged, bViewportChanged;
-static int nMaterialsChanged;
+static BitSet32 nMaterialsChanged;
 static int nTransformMatricesChanged[2]; // min,max
 static int nNormalMatricesChanged[2]; // min,max
 static int nPostTransformMatricesChanged[2]; // min,max
@@ -202,7 +203,7 @@ void VertexShaderManager::Dirty()
 
 	bProjectionChanged = true;
 
-	nMaterialsChanged = 15;
+	nMaterialsChanged = BitSet32::AllTrue(4);
 
 	dirty = true;
 }
@@ -295,35 +296,16 @@ void VertexShaderManager::SetConstants()
 		nLightsChanged[0] = nLightsChanged[1] = -1;
 	}
 
-	if (nMaterialsChanged)
+	for (int i : nMaterialsChanged)
 	{
-		for (int i = 0; i < 2; ++i)
-		{
-			if (nMaterialsChanged & (1 << i))
-			{
-				u32 data = xfmem.ambColor[i];
-				constants.materials[i][0] = (data >> 24) & 0xFF;
-				constants.materials[i][1] = (data >> 16) & 0xFF;
-				constants.materials[i][2] = (data >> 8) & 0xFF;
-				constants.materials[i][3] = data & 0xFF;
-			}
-		}
-
-		for (int i = 0; i < 2; ++i)
-		{
-			if (nMaterialsChanged & (1 << (i + 2)))
-			{
-				u32 data = xfmem.matColor[i];
-				constants.materials[i+2][0] = (data >> 24) & 0xFF;
-				constants.materials[i+2][1] = (data >> 16) & 0xFF;
-				constants.materials[i+2][2] = (data >> 8) & 0xFF;
-				constants.materials[i+2][3] = data & 0xFF;
-			}
-		}
+		u32 data = i >= 2 ? xfmem.matColor[i - 2] : xfmem.ambColor[i];
+		constants.materials[i][0] = (data >> 24) & 0xFF;
+		constants.materials[i][1] = (data >> 16) & 0xFF;
+		constants.materials[i][2] = (data >> 8) & 0xFF;
+		constants.materials[i][3] = data & 0xFF;
 		dirty = true;
-
-		nMaterialsChanged = 0;
 	}
+	nMaterialsChanged = BitSet32(0);
 
 	if (bPosNormalMatrixChanged)
 	{
@@ -660,7 +642,7 @@ void VertexShaderManager::SetProjectionChanged()
 
 void VertexShaderManager::SetMaterialColorChanged(int index, u32 color)
 {
-	nMaterialsChanged |= (1 << index);
+	nMaterialsChanged[index] = true;
 }
 
 void VertexShaderManager::TranslateView(float x, float y, float z)

From c81e3da22f8f0c4d4afdb2c05c9d2282a54d35ac Mon Sep 17 00:00:00 2001
From: comex
Date: Tue, 21 Oct 2014 21:08:48 -0400
Subject: [PATCH 5/5] Add unit test.

---
 Source/UnitTests/Common/BitSetTest.cpp | 84 ++++++++++++++++++++++++++
 Source/UnitTests/Common/CMakeLists.txt |  1 +
 2 files changed, 85 insertions(+)
 create mode 100644 Source/UnitTests/Common/BitSetTest.cpp

diff --git a/Source/UnitTests/Common/BitSetTest.cpp b/Source/UnitTests/Common/BitSetTest.cpp
new file mode 100644
index 0000000000..e0c3d8cd23
--- /dev/null
+++ b/Source/UnitTests/Common/BitSetTest.cpp
@@ -0,0 +1,84 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <gtest/gtest.h>
+
+#include "Common/BitSet.h"
+
+TEST(BitSet, Basics)
+{
+	BitSet32 bs;
+	BitSet64 bs2(1);
+	BitSet64 bs3(2);
+	EXPECT_EQ(true, !!bs2);
+	EXPECT_EQ(false, !!bs);
+	EXPECT_EQ(bs2, bs2);
+	EXPECT_NE(bs2, bs3);
+	EXPECT_EQ(BitSet32(0xfff), BitSet32::AllTrue(12));
+	EXPECT_EQ(BitSet64(0xffffffffffffffff), BitSet64::AllTrue(64));
+}
+
+TEST(BitSet, BitGetSet)
+{
+	BitSet32 bs;
+	bs[3] = bs[8] = bs[11] = true;
+	EXPECT_EQ(true, bs[3]);
+	EXPECT_EQ(false, bs[4]);
+	EXPECT_EQ((u32)((1 << 3) | (1 << 8) | (1 << 11)), bs.m_val);
+}
+
+TEST(BitSet, Count)
+{
+	u32 random_numbers[] = {
+		0x2cb0b5f3, 0x81ab32a6, 0xd9030dc5, 0x325ffe26, 0xb2fcaee3,
+		0x4ccf188a, 0xf8be36dc, 0xb2fcecd5, 0xb750c2e5, 0x31d19074,
+		0xf267644a, 0xac00a719, 0x6d45f19b, 0xf7e91c5b, 0xf687e694,
+		0x9057c24e, 0x5eb65c39, 0x85d3038b, 0x101f4e66, 0xc202d136
+	};
+	u32 counts[] = {
+		17, 14, 14, 19, 20, 14, 20, 20, 16, 13, 16, 12, 18, 20, 18, 14, 18, 14, 14, 12
+	};
+	for (size_t i = 0; i < 20; i++)
+	{
+		EXPECT_EQ(counts[i], BitSet32(random_numbers[i]).Count());
+	}
+
+	u64 random_numbers_64[] = {
+		0xf86cd6f6ef09d7d4ULL, 0x6f2d8533255ead3cULL, 0x9da7941e0e52b345ULL,
+		0x06e4189be67d2b17ULL, 0x3eb0681f65cb6d25ULL, 0xccab8a7c74a51203ULL,
+		0x09d470516694c64bULL, 0x38cd077e075c778fULL, 0xd69ebfa6355ebfdeULL
+	};
+	u32 counts_64[] = {
+		39, 34, 31, 32, 33, 29, 27, 35, 43
+	};
+	for (size_t i = 0; i < 9; i++)
+	{
+		EXPECT_EQ(counts_64[i], BitSet64(random_numbers_64[i]).Count());
+	}
+}
+
+TEST(BitSet, BitOps)
+{
+	BitSet32 a(3), b(5), c;
+	EXPECT_EQ(BitSet32(7), a | b);
+	EXPECT_EQ(BitSet32(6), a ^ b);
+	EXPECT_EQ(BitSet32(1), a & b);
+	EXPECT_EQ(BitSet32(0xfffffffc), ~a);
+	c = a; c |= b; EXPECT_EQ(BitSet32(7), c);
+	c = a; c ^= b; EXPECT_EQ(BitSet32(6), c);
+	c = a; c &= b; EXPECT_EQ(BitSet32(1), c);
+}
+
+TEST(BitSet, InitializerListsAndIteration)
+{
+	std::vector<int> bits { 1, 10, 15, 17, 20, 30 };
+	BitSet32 bs { 1, 10, 15, 17, 20, 30 };
+	auto vit = bits.begin();
+	for (auto i : bs)
+	{
+		EXPECT_NE(vit, bits.end());
+		EXPECT_EQ(i, *vit++);
+	}
+	EXPECT_EQ(vit, bits.end());
+}
diff --git a/Source/UnitTests/Common/CMakeLists.txt b/Source/UnitTests/Common/CMakeLists.txt
index c7e9f2046c..a35bd455fe 100644
--- a/Source/UnitTests/Common/CMakeLists.txt
+++ b/Source/UnitTests/Common/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_dolphin_test(BitFieldTest BitFieldTest.cpp)
+add_dolphin_test(BitSetTest BitSetTest.cpp)
 add_dolphin_test(CommonFuncsTest CommonFuncsTest.cpp)
 add_dolphin_test(EventTest EventTest.cpp)
 add_dolphin_test(FifoQueueTest FifoQueueTest.cpp)
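
Reviewer note (not part of the patches): a minimal sketch of the conversion idiom this series applies, for anyone unfamiliar with the new API. ProcessAttr, ProcessDirtyOld, and ProcessDirtyNew are hypothetical names invented for illustration; only BitSet32, its indexing, and its iteration come from the series itself.

	#include "Common/BitSet.h"

	// Hypothetical stand-in for whatever per-bit work a caller does.
	static void ProcessAttr(int i) { (void)i; }

	// Old idiom: raw integer bitfield, every one of the 8 positions
	// tested in turn.
	static void ProcessDirtyOld(u32 dirty)
	{
		for (int i = 0; i < 8; i++)
			if (dirty & (1 << i))
				ProcessAttr(i);
	}

	// New idiom: range-for yields the index of each set bit directly,
	// so clear bits cost nothing and the masking boilerplate disappears.
	static void ProcessDirtyNew(BitSet32 dirty)
	{
		for (int i : dirty)
			ProcessAttr(i);
	}

Setting a bit likewise goes from "dirty |= 1 << n;" to "dirty[n] = true;", as in the LoadCPReg changes above.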