Merge pull request #852 from FioraAeterna/optimizeca

JIT64: optimize CA calculations
This commit is contained in:
comex 2014-09-05 11:52:02 -04:00
commit 97420c6ec6
16 changed files with 530 additions and 534 deletions

View File

@ -175,16 +175,15 @@ struct Rectangle
} // namespace MathUtil } // namespace MathUtil
inline float pow2f(float x) {return x * x;}
inline double pow2(double x) {return x * x;}
float MathFloatVectorSum(const std::vector<float>&); float MathFloatVectorSum(const std::vector<float>&);
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1))
inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;}
// Rounds down. 0 -> undefined // Rounds down. 0 -> undefined
inline int Log2(u64 val) inline int IntLog2(u64 val)
{ {
#if defined(__GNUC__) #if defined(__GNUC__)
return 63 - __builtin_clzll(val); return 63 - __builtin_clzll(val);

View File

@ -331,9 +331,12 @@ union UFPR
float f[2]; float f[2];
}; };
#define XER_CA_MASK 0x20000000 #define XER_CA_SHIFT 29
#define XER_OV_MASK 0x40000000 #define XER_OV_SHIFT 30
#define XER_SO_MASK 0x80000000 #define XER_SO_SHIFT 31
#define XER_CA_MASK (1U << XER_CA_SHIFT)
#define XER_OV_MASK (1U << XER_OV_SHIFT)
#define XER_SO_MASK (1U << XER_SO_SHIFT)
// XER // XER
union UReg_XER union UReg_XER
{ {

View File

@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
{10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
{11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
{12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}}, {12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}}, {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
{14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, {14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
{15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, {15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
{922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}}, {54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
{339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}}, {339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
{467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}}, {467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}},
{371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}}, {371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}}, {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
{595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, {595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
{659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, {659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},

View File

@ -100,13 +100,15 @@ public:
void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val); void GenerateConstantOverflow(s64 val);
void GenerateOverflow(); void GenerateOverflow();
void FinalizeCarryOverflow(bool oe, bool inv = false); void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
void GetCarryEAXAndClear();
void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false);
void GenerateCarry();
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg); void ComputeRC(const Gen::OpArg & arg);
// use to extract bytes from a register using the regcache. offset is in bytes.
Gen::OpArg ExtractFromReg(int reg, int offset);
void AndWithMask(Gen::X64Reg reg, u32 mask);
bool CheckMergedBranch(int crf);
void DoMergedBranch();
// Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1,
// don't forget to xlock it before. // don't forget to xlock it before.
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
@ -118,6 +120,8 @@ public:
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b); typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);

View File

@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
{922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
{24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}}, {54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}},

View File

@ -314,7 +314,10 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode)
void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc) void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc)
{ {
emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); if (regs[preg].location.IsImm() && !regs[preg].location.offset)
emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc));
else
emit->MOV(32, ::Gen::R(newLoc), regs[preg].location);
} }
void GPRRegCache::StoreRegister(size_t preg, OpArg newLoc) void GPRRegCache::StoreRegister(size_t preg, OpArg newLoc)

File diff suppressed because it is too large Load Diff

View File

@ -1104,7 +1104,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->JitSetCA(); Jit->JitSetCA();
FixupBranch cont = Jit->J(); FixupBranch cont = Jit->J();
Jit->SetJumpTarget(nocarry); Jit->SetJumpTarget(nocarry);
Jit->JitClearCA(); Jit->JitClearCAOV(true, false);
Jit->SetJumpTarget(cont); Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I); regNormalRegClear(RI, I);
break; break;

View File

@ -802,10 +802,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
OR(32, M(&FPSCR), R(EAX)); OR(32, M(&FPSCR), R(EAX));
} }
void EmuCodeBlock::JitGetAndClearCAOV(bool oe)
void EmuCodeBlock::JitClearCA()
{ {
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 if (oe)
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0
BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0
} }
void EmuCodeBlock::JitSetCA() void EmuCodeBlock::JitSetCA()
@ -813,10 +814,20 @@ void EmuCodeBlock::JitSetCA()
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
} }
void EmuCodeBlock::JitClearCAOV(bool oe) // Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so
// branchless calculation of CA is probably faster in general.
void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
{ {
if (oe) SETcc(conditionCode, R(EAX));
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0 MOVZX(32, 8, EAX, R(AL));
else SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1
}
void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
{
u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
if (mask == 0xFFFFFFFF)
return;
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
} }

View File

@ -50,9 +50,10 @@ public:
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
void JitClearCA(); void JitGetAndClearCAOV(bool oe);
void JitSetCA(); void JitSetCA();
void JitClearCAOV(bool oe); void JitSetCAIf(Gen::CCFlags conditionCode);
void JitClearCAOV(bool ca, bool oe);
void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm); void ForceSinglePrecisionP(Gen::X64Reg xmm);

View File

@ -430,7 +430,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
{ {
code->wantsCR0 = false; code->wantsCR0 = false;
code->wantsCR1 = false; code->wantsCR1 = false;
code->wantsPS1 = false;
if (opinfo->flags & FL_USE_FPU) if (opinfo->flags & FL_USE_FPU)
block->m_fpa->any = true; block->m_fpa->any = true;
@ -458,6 +457,15 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false; code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false; code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
// mfspr/mtspr can affect/use XER, so be super careful here
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
int numOut = 0; int numOut = 0;
int numIn = 0; int numIn = 0;
if (opinfo->flags & FL_OUT_A) if (opinfo->flags & FL_OUT_A)
@ -715,26 +723,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
block->m_broken = true; block->m_broken = true;
} }
// Scan for CR0 dependency // Scan for flag dependencies; assume the next block (or any branch that can leave the block)
// assume next block wants flags to be safe // wants flags, to be safe.
bool wantsCR0 = true; bool wantsCR0 = true;
bool wantsCR1 = true; bool wantsCR1 = true;
bool wantsPS1 = true;
bool wantsFPRF = true; bool wantsFPRF = true;
bool wantsCA = true;
for (int i = block->m_num_instructions - 1; i >= 0; i--) for (int i = block->m_num_instructions - 1; i >= 0; i--)
{ {
wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock; bool opWantsCR0 = code[i].wantsCR0;
wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock; bool opWantsCR1 = code[i].wantsCR1;
wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock; bool opWantsFPRF = code[i].wantsFPRF;
wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock; bool opWantsCA = code[i].wantsCA;
code[i].wantsCR0 = wantsCR0; wantsCR0 |= opWantsCR0 || code[i].canEndBlock;
code[i].wantsCR1 = wantsCR1; wantsCR1 |= opWantsCR1 || code[i].canEndBlock;
code[i].wantsPS1 = wantsPS1; wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
wantsCA |= opWantsCA || code[i].canEndBlock;
code[i].wantsCR0 = wantsCR0;
code[i].wantsCR1 = wantsCR1;
code[i].wantsFPRF = wantsFPRF; code[i].wantsFPRF = wantsFPRF;
wantsCR0 &= !code[i].outputCR0; code[i].wantsCA = wantsCA;
wantsCR1 &= !code[i].outputCR1; wantsCR0 &= !code[i].outputCR0 || opWantsCR0;
wantsPS1 &= !code[i].outputPS1; wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
wantsFPRF &= !code[i].outputFPRF; wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
wantsCA &= !code[i].outputCA || opWantsCA;
} }
return address; return address;
} }

View File

@ -33,12 +33,12 @@ struct CodeOp //16B
bool isBranchTarget; bool isBranchTarget;
bool wantsCR0; bool wantsCR0;
bool wantsCR1; bool wantsCR1;
bool wantsPS1;
bool wantsFPRF; bool wantsFPRF;
bool wantsCA;
bool outputCR0; bool outputCR0;
bool outputCR1; bool outputCR1;
bool outputPS1;
bool outputFPRF; bool outputFPRF;
bool outputCA;
bool canEndBlock; bool canEndBlock;
bool skip; // followed BL-s for example bool skip; // followed BL-s for example
}; };

View File

@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size)
// Find largest power of 2 less than _size. // Find largest power of 2 less than _size.
// div 10 to get largest named unit less than _size // div 10 to get largest named unit less than _size
// 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc)
const u64 unit = Log2(std::max<u64>(_size, 1)) / 10; const u64 unit = IntLog2(std::max<u64>(_size, 1)) / 10;
const u64 unit_size = (1 << (unit * 10)); const u64 unit_size = (1 << (unit * 10));
// mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places

View File

@ -23,7 +23,7 @@ static u32 genBuffer()
} }
StreamBuffer::StreamBuffer(u32 type, u32 size) StreamBuffer::StreamBuffer(u32 type, u32 size)
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) : m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
{ {
m_iterator = 0; m_iterator = 0;
m_used_iterator = 0; m_used_iterator = 0;

View File

@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1));
WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1);
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples));
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1));
if (samples == 1) if (samples == 1)
{ {
// 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments
@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); WRITE(p, " x_virtual_position = x_virtual_position << 1;\n");
} }
WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1);
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1);
WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n");
WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n");

View File

@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN)
EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN())); EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN()));
} }
TEST(MathUtil, Log2) TEST(MathUtil, IntLog2)
{ {
EXPECT_EQ(0, Log2(1)); EXPECT_EQ(0, IntLog2(1));
EXPECT_EQ(1, Log2(2)); EXPECT_EQ(1, IntLog2(2));
EXPECT_EQ(2, Log2(4)); EXPECT_EQ(2, IntLog2(4));
EXPECT_EQ(3, Log2(8)); EXPECT_EQ(3, IntLog2(8));
EXPECT_EQ(63, Log2(0x8000000000000000ull)); EXPECT_EQ(63, IntLog2(0x8000000000000000ull));
// Rounding behavior. // Rounding behavior.
EXPECT_EQ(3, Log2(15)); EXPECT_EQ(3, IntLog2(15));
EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull));
} }
TEST(MathUtil, FlushToZero) TEST(MathUtil, FlushToZero)