Merge pull request #981 from FioraAeterna/revert-852-optimizeca

Revert "JIT64: optimize CA calculations"
2014-09-05 19:27:41 +02:00 · 2014-09-05 19:27:41 +02:00 · 58c669aa02
parent 97420c6ec6 07e0c917c6
commit 58c669aa02
16 changed files with 534 additions and 530 deletions
--- a/Source/Core/Common/MathUtil.h
+++ b/Source/Core/Common/MathUtil.h
@@ -175,15 +175,16 @@ struct Rectangle

 }  // namespace MathUtil

+inline float pow2f(float x) {return x * x;}
+inline double pow2(double x) {return x * x;}
+
 float MathFloatVectorSum(const std::vector<float>&);

 #define ROUND_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
 #define ROUND_DOWN(x, a) ((x) & ~((a) - 1))

-inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;}
-
 // Rounds down. 0 -> undefined
-inline int IntLog2(u64 val)
+inline int Log2(u64 val)
 {
 #if defined(__GNUC__)
 	return 63 - __builtin_clzll(val);
--- a/Source/Core/Core/PowerPC/Gekko.h
+++ b/Source/Core/Core/PowerPC/Gekko.h
@@ -331,12 +331,9 @@ union UFPR
 	float f[2];
 };

-#define XER_CA_SHIFT 29
-#define XER_OV_SHIFT 30
-#define XER_SO_SHIFT 31
-#define XER_CA_MASK (1U << XER_CA_SHIFT)
-#define XER_OV_MASK (1U << XER_OV_SHIFT)
-#define XER_SO_MASK (1U << XER_SO_SHIFT)
+#define XER_CA_MASK 0x20000000
+#define XER_OV_MASK 0x40000000
+#define XER_SO_MASK 0x80000000
 // XER
 union UReg_XER
 {
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
 	{10, Interpreter::cmpli,        {"cmpli",    OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
 	{11, Interpreter::cmpi,         {"cmpi",     OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
 	{12, Interpreter::addic,        {"addic",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
-	{13, Interpreter::addic_rc,     {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
+	{13, Interpreter::addic_rc,     {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}},
 	{14, Interpreter::addi,         {"addi",     OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
 	{15, Interpreter::addis,        {"addis",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},

@@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
 	{922, Interpreter::extshx,      {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{954, Interpreter::extsbx,      {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{536, Interpreter::srwx,        {"srwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
-	{792, Interpreter::srawx,       {"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{824, Interpreter::srawix,      {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{792, Interpreter::srawx,       {"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
+	{824, Interpreter::srawix,      {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{24,  Interpreter::slwx,        {"slwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},

 	{54,   Interpreter::dcbst,      {"dcbst",  OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
@@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
 	{339, Interpreter::mfspr,       {"mfspr",  OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
 	{467, Interpreter::mtspr,       {"mtspr",  OPTYPE_SPR, 0, 2, 0, 0, 0}},
 	{371, Interpreter::mftb,        {"mftb",   OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
-	{512, Interpreter::mcrxr,       {"mcrxr",  OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
+	{512, Interpreter::mcrxr,       {"mcrxr",  OPTYPE_SYSTEM, 0, 1, 0, 0, 0}},
 	{595, Interpreter::mfsr,        {"mfsr",   OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
 	{659, Interpreter::mfsrin,      {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},

--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -100,15 +100,13 @@ public:
 	void GenerateConstantOverflow(bool overflow);
 	void GenerateConstantOverflow(s64 val);
 	void GenerateOverflow();
-	void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
+	void FinalizeCarryOverflow(bool oe, bool inv = false);
+	void GetCarryEAXAndClear();
+	void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false);
+	void GenerateCarry();
+	void GenerateRC();
 	void ComputeRC(const Gen::OpArg & arg);

-	// use to extract bytes from a register using the regcache. offset is in bytes.
-	Gen::OpArg ExtractFromReg(int reg, int offset);
-	void AndWithMask(Gen::X64Reg reg, u32 mask);
-	bool CheckMergedBranch(int crf);
-	void DoMergedBranch();
-
 	// Reads a given bit of a given CR register part. Clobbers ABI_PARAM1,
 	// don't forget to xlock it before.
 	void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
@@ -120,8 +118,6 @@ public:
 	Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
 	void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);

-	void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
-
 	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
 	typedef u32 (*Operation)(u32 a, u32 b);
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
 	{922, &Jit64::extshx},                 //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{954, &Jit64::extsbx},                 //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{536, &Jit64::srwx},                   //"srwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
-	{792, &Jit64::srawx},                  //"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
-	{824, &Jit64::srawix},                 //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
+	{792, &Jit64::srawx},                  //"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
+	{824, &Jit64::srawix},                 //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
 	{24,  &Jit64::slwx},                   //"slwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},

 	{54,   &Jit64::dcbst},                 //"dcbst",  OPTYPE_DCACHE, 0, 4}},
--- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
@@ -314,10 +314,7 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode)

 void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc)
 {
-	if (regs[preg].location.IsImm() && !regs[preg].location.offset)
-		emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc));
-	else
-		emit->MOV(32, ::Gen::R(newLoc), regs[preg].location);
+	emit->MOV(32, ::Gen::R(newLoc), regs[preg].location);
 }

 void GPRRegCache::StoreRegister(size_t preg, OpArg newLoc)
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1104,7 +1104,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
 			Jit->JitSetCA();
 			FixupBranch cont = Jit->J();
 			Jit->SetJumpTarget(nocarry);
-			Jit->JitClearCAOV(true, false);
+			Jit->JitClearCA();
 			Jit->SetJumpTarget(cont);
 			regNormalRegClear(RI, I);
 			break;
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -802,11 +802,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
 	OR(32, M(&FPSCR), R(EAX));
 }

-void EmuCodeBlock::JitGetAndClearCAOV(bool oe)
+
+void EmuCodeBlock::JitClearCA()
 {
-	if (oe)
-		AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0
-	BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0
+	AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
 }

 void EmuCodeBlock::JitSetCA()
@@ -814,20 +813,10 @@ void EmuCodeBlock::JitSetCA()
 	OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
 }

-// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so
-// branchless calculation of CA is probably faster in general.
-void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
+void EmuCodeBlock::JitClearCAOV(bool oe)
 {
-	SETcc(conditionCode, R(EAX));
-	MOVZX(32, 8, EAX, R(AL));
-	SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
-	OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1
-}
-
-void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
-{
-	u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
-	if (mask == 0xFFFFFFFF)
-		return;
-	AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
+	if (oe)
+		AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0
+	else
+		AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
 }
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -50,10 +50,9 @@ public:
 	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);

 	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
-	void JitGetAndClearCAOV(bool oe);
+	void JitClearCA();
 	void JitSetCA();
-	void JitSetCAIf(Gen::CCFlags conditionCode);
-	void JitClearCAOV(bool ca, bool oe);
+	void JitClearCAOV(bool oe);

 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
 	void ForceSinglePrecisionP(Gen::X64Reg xmm);
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -430,6 +430,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 {
 	code->wantsCR0 = false;
 	code->wantsCR1 = false;
+	code->wantsPS1 = false;

 	if (opinfo->flags & FL_USE_FPU)
 		block->m_fpa->any = true;
@@ -457,15 +458,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
 	code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;

-	code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
-	code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
-
-	// mfspr/mtspr can affect/use XER, so be super careful here
-	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
-		code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
-	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
-		code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
-
 	int numOut = 0;
 	int numIn = 0;
 	if (opinfo->flags & FL_OUT_A)
@@ -723,30 +715,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 		block->m_broken = true;
 	}

-	// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
-	// wants flags, to be safe.
+	// Scan for CR0 dependency
+	// assume next block wants flags to be safe
 	bool wantsCR0 = true;
 	bool wantsCR1 = true;
+	bool wantsPS1 = true;
 	bool wantsFPRF = true;
-	bool wantsCA = true;
 	for (int i = block->m_num_instructions - 1; i >= 0; i--)
 	{
-		bool opWantsCR0  = code[i].wantsCR0;
-		bool opWantsCR1  = code[i].wantsCR1;
-		bool opWantsFPRF = code[i].wantsFPRF;
-		bool opWantsCA   = code[i].wantsCA;
-		wantsCR0  |= opWantsCR0  || code[i].canEndBlock;
-		wantsCR1  |= opWantsCR1  || code[i].canEndBlock;
-		wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
-		wantsCA   |= opWantsCA   || code[i].canEndBlock;
-		code[i].wantsCR0  = wantsCR0;
-		code[i].wantsCR1  = wantsCR1;
+		wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
+		wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
+		wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
+		wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
+		code[i].wantsCR0 = wantsCR0;
+		code[i].wantsCR1 = wantsCR1;
+		code[i].wantsPS1 = wantsPS1;
 		code[i].wantsFPRF = wantsFPRF;
-		code[i].wantsCA   = wantsCA;
-		wantsCR0  &= !code[i].outputCR0  || opWantsCR0;
-		wantsCR1  &= !code[i].outputCR1  || opWantsCR1;
-		wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
-		wantsCA   &= !code[i].outputCA   || opWantsCA;
+		wantsCR0 &= !code[i].outputCR0;
+		wantsCR1 &= !code[i].outputCR1;
+		wantsPS1 &= !code[i].outputPS1;
+		wantsFPRF &= !code[i].outputFPRF;
 	}
 	return address;
 }
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -33,12 +33,12 @@ struct CodeOp //16B
 	bool isBranchTarget;
 	bool wantsCR0;
 	bool wantsCR1;
+	bool wantsPS1;
 	bool wantsFPRF;
-	bool wantsCA;
 	bool outputCR0;
 	bool outputCR1;
+	bool outputPS1;
 	bool outputFPRF;
-	bool outputCA;
 	bool canEndBlock;
 	bool skip;  // followed BL-s for example
 };
--- a/Source/Core/DolphinWX/GameListCtrl.cpp
+++ b/Source/Core/DolphinWX/GameListCtrl.cpp
@@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size)
 	// Find largest power of 2 less than _size.
 	// div 10 to get largest named unit less than _size
 	// 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc)
-	const u64 unit = IntLog2(std::max<u64>(_size, 1)) / 10;
+	const u64 unit = Log2(std::max<u64>(_size, 1)) / 10;
 	const u64 unit_size = (1 << (unit * 10));

 	// mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places
--- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp
+++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp
@@ -23,7 +23,7 @@ static u32 genBuffer()
 }

 StreamBuffer::StreamBuffer(u32 type, u32 size)
-: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
+: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS))
 {
 	m_iterator = 0;
 	m_used_iterator = 0;
--- a/Source/Core/VideoCommon/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/TextureConversionShader.cpp
@@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)

 	WRITE(p, "  int y_block_position = uv1.y & %d;\n", ~(blkH - 1));
 	WRITE(p, "  int y_offset_in_block = uv1.y & %d;\n", blkH - 1);
-	WRITE(p, "  int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples));
-	WRITE(p, "  int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1));
+	WRITE(p, "  int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples));
+	WRITE(p, "  int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1));
 	if (samples == 1)
 	{
 		// 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments
@@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
 		WRITE(p, "  x_virtual_position = x_virtual_position << 1;\n");
 	}
 	WRITE(p, "  int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1);
-	WRITE(p, "  int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1);
+	WRITE(p, "  int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1);

 	WRITE(p, "  sampleUv.x = x_offset_in_block + x_block_position;\n");
 	WRITE(p, "  sampleUv.y = y_block_position + y_offset;\n");
--- a/Source/UnitTests/Common/MathUtilTest.cpp
+++ b/Source/UnitTests/Common/MathUtilTest.cpp
@@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN)
 	EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN()));
 }

-TEST(MathUtil, IntLog2)
+TEST(MathUtil, Log2)
 {
-	EXPECT_EQ(0, IntLog2(1));
-	EXPECT_EQ(1, IntLog2(2));
-	EXPECT_EQ(2, IntLog2(4));
-	EXPECT_EQ(3, IntLog2(8));
-	EXPECT_EQ(63, IntLog2(0x8000000000000000ull));
+	EXPECT_EQ(0, Log2(1));
+	EXPECT_EQ(1, Log2(2));
+	EXPECT_EQ(2, Log2(4));
+	EXPECT_EQ(3, Log2(8));
+	EXPECT_EQ(63, Log2(0x8000000000000000ull));

 	// Rounding behavior.
-	EXPECT_EQ(3, IntLog2(15));
-	EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull));
+	EXPECT_EQ(3, Log2(15));
+	EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull));
 }

 TEST(MathUtil, FlushToZero)