From 8b8310d28cd589fb371f425383c756699766c3a1 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sat, 28 Feb 2015 04:39:15 -0600
Subject: [PATCH] [AArch64] Optimize FPR pushing and popping.

Previously, when pushing and popping FPRs we emitted a single STR/LDR per
quad FPR we wanted to push/pop. In most of our cases the FPRs being pushed
and popped are consecutive, so they can be saved more efficiently with the
NEON loadstores that handle up to four quad registers at once.

This can potentially cut the instruction count down to roughly a quarter
of what it was when the registers are all consecutive.

On the Cortex-A57 this is basically just an icache improvement, but on the
Nvidia Denver it may be optimized into something more efficient. Either way
it's a win.
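As a rough sketch, assume the set of registers to save happens to be
exactly Q0-Q3 (the register choice here is purely illustrative). Written
as calls inside ABI_PushRegisters, the bundled push then boils down to:

  m_emit->SUB(SP, SP, 4 * 16);     // reserve 64 bytes of stack
  ST1(64, 4, INDEX_POST, Q0, SP);  // ST1 {v0.2d-v3.2d}, [SP], #64
  m_emit->SUB(SP, SP, 4 * 16);     // the post-index moved SP back up, so drop it again

instead of four separate "STR Qn, [SP, #-16]!" stores, and the matching
pop is a single post-indexed LD1 that consumes the whole 64 bytes.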
---
 Source/Core/Common/Arm64Emitter.cpp | 145 ++++++++++++++++++++++++++--
 Source/Core/Common/Arm64Emitter.h   |   5 +-
 2 files changed, 139 insertions(+), 11 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index b8a7ee78a8..a730560177 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -1891,6 +1891,27 @@ void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opc
     (encoded_size << 10) | (Rn << 5) | Rt);
 }

+void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+  bool quad = IsQuad(Rt);
+  u32 encoded_size = 0;
+
+  if (size == 16)
+    encoded_size = 1;
+  else if (size == 32)
+    encoded_size = 2;
+  else if (size == 64)
+    encoded_size = 3;
+
+  Rt = DecodeReg(Rt);
+  Rn = DecodeReg(Rn);
+  Rm = DecodeReg(Rm);
+
+  Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | \
+    (encoded_size << 10) | (Rn << 5) | Rt);
+
+}
+
 void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
   _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__);
@@ -2234,6 +2255,22 @@ void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
     opcode = 0b0010;
   EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn);
 }
+void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+  _assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
+  _assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
+
+  u32 opcode = 0;
+  if (count == 1)
+    opcode = 0b111;
+  else if (count == 2)
+    opcode = 0b1010;
+  else if (count == 3)
+    opcode = 0b0110;
+  else if (count == 4)
+    opcode = 0b0010;
+  EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm);
+}
 void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 {
   _assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
@@ -2248,6 +2285,22 @@ void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
     opcode = 0b0010;
   EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn);
 }
+void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+  _assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
+  _assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
+
+  u32 opcode = 0;
+  if (count == 1)
+    opcode = 0b111;
+  else if (count == 2)
+    opcode = 0b1010;
+  else if (count == 3)
+    opcode = 0b0110;
+  else if (count == 4)
+    opcode = 0b0010;
+  EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
+}

 // Scalar - 1 Source
 void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
@@ -2761,21 +2814,93 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8

 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 {
-  for (auto it : registers)
-    STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
-
-}
-void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
-{
-  for (int i = 31; i >= 0; --i)
+  bool bundled_loadstore = false;
+  for (int i = 0; i < 32; ++i)
   {
     if (!registers[i])
       continue;
-    if (ignore_mask[i])
-      m_emit->ADD(SP, SP, 16);
-    else
+    int count = 0;
+    while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+    if (count > 1)
+    {
+      bundled_loadstore = true;
+      break;
+    }
+  }
+
+  if (!bundled_loadstore)
+  {
+    for (auto it : registers)
+      STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
+  }
+  else
+  {
+    int num_regs = registers.Count();
+    // Violating the AAPCS64 never felt so right.
+    m_emit->SUB(SP, SP, num_regs * 16);
+    for (int i = 0; i < 32; ++i)
+    {
+      if (!registers[i])
+        continue;
+
+      int count = 0;
+
+      // 0 = true
+      // 1 < 4 && registers[i + 1] true!
+      // 2 < 4 && registers[i + 2] true!
+      // 3 < 4 && registers[i + 3] true!
+      // 4 < 4 && registers[i + 4] false!
+      while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+
+      ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+      i += count - 1;
+    }
+    m_emit->SUB(SP, SP, num_regs * 16);
+  }
+}
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers)
+{
+  bool bundled_loadstore = false;
+  for (int i = 0; i < 32; ++i)
+  {
+    if (!registers[i])
+      continue;
+
+    int count = 0;
+    while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+    if (count > 1)
+    {
+      bundled_loadstore = true;
+      break;
+    }
+  }
+
+  if (!bundled_loadstore)
+  {
+    for (int i = 31; i >= 0; --i)
+    {
+      if (!registers[i])
+        continue;
+      LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+    }
+  }
+  else
+  {
+    for (int i = 0; i < 32; ++i)
+    {
+      if (!registers[i])
+        continue;
+
+      int count = 0;
+      while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+
+      LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+      i += count - 1;
+    }
   }
 }

diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index bb3bf770ce..77d72183c5 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -658,7 +658,9 @@ public:

   // Loadstore multiple structure
   void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+  void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
   void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+  void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);

   // Scalar - 1 Source
   void FABS(ARM64Reg Rd, ARM64Reg Rn);
@@ -748,7 +750,7 @@ public:

   // ABI related
   void ABI_PushRegisters(BitSet32 registers);
-  void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+  void ABI_PopRegisters(BitSet32 registers);

 private:
   ARM64XEmitter* m_emit;
@@ -770,6 +772,7 @@ private:
   void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm);
   void EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
   void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
+  void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
   void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
   void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);