arm64: store queue write dynarec optimization

2021-03-28 20:58:59 +02:00 · 2021-03-28 20:58:59 +02:00 · e2f309b5cc
parent ed25d45adb
commit e2f309b5cc
1 changed file with 64 additions and 18 deletions
--- a/core/rec-ARM64/rec_arm64.cpp
+++ b/core/rec-ARM64/rec_arm64.cpp
@ -68,6 +68,8 @@ static DynaCode *blockCheckFail;
 static DynaCode *linkBlockGenericStub;
 static DynaCode *linkBlockBranchStub;
 static DynaCode *linkBlockNextStub;
+static DynaCode *writeStoreQueue32;
+static DynaCode *writeStoreQueue64;

 static bool restarting;

@ -349,7 +351,7 @@ public:
 					Mov(w10, op.rs2._imm);
 					Str(w10, sh4_context_mem_operand(&next_pc));
 				}
-				Mov(*call_regs[0], op.rs3._imm);
+				Mov(w0, op.rs3._imm);

 				if (!mmu_enabled())
 				{
@ -357,8 +359,8 @@ public:
 				}
 				else
 				{
-					Mov(*call_regs64[1], reinterpret_cast<uintptr_t>(*OpDesc[op.rs3._imm]->oph));	// op handler
-					Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0));	// pc
+					Mov(x1, reinterpret_cast<uintptr_t>(*OpDesc[op.rs3._imm]->oph));	// op handler
+					Mov(w2, block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0));	// pc

 					GenCallRuntime(interpreter_fallback);
 				}
@ -806,7 +808,7 @@ public:
 				{
 					Label not_sqw;
 					if (op.rs1.is_imm())
-						Mov(*call_regs[0], op.rs1._imm);
+						Mov(w0, op.rs1._imm);
 					else
 					{
 						if (regalloc.IsAllocg(op.rs1))
@ -824,7 +826,7 @@ public:

 					if (mmu_enabled())
 					{
-						Mov(*call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0));	// pc
+						Mov(w1, block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0));	// pc

 						GenCallRuntime(do_sqw_mmu_no_ex);
 					}
@ -1516,11 +1518,40 @@ public:
 		}
 		Br(x0);

+		// Store Queue write handlers
+		Label writeStoreQueue32Label;	// 32-bit handler; its address is published as writeStoreQueue32 after FinalizeCode()
+		Bind(&writeStoreQueue32Label);	// entry: x0 = address, w1 = value to store (same registers the write-memory path loads)
+		Lsr(x7, x0, 26);	// x7 = address >> 26
+		Cmp(x7, 0x38);	// 0x38 << 26 == 0xE0000000, so eq means the address lies in 0xE0000000-0xE3FFFFFF
+		if (!mmu_enabled())	// the fallback target is chosen when this stub is generated, not at run time
+			GenBranchRuntime(WriteMem32, Condition::ne);	// not in that range: branch (not call) to the generic 32-bit write
+		else
+			GenBranchRuntime(WriteMemNoEx<u32>, Condition::ne);	// same fallback, MMU-enabled variant
+		And(x0, x0, 0x3f);	// keep the low 6 bits: byte offset within a 64-byte window
+		Sub(x7, x0, sizeof(Sh4RCB::sq_buffer), LeaveFlags);	// x7 = offset - sizeof(sq_buffer), i.e. a negative displacement from x28
+		Str(w1, MemOperand(x28, x7));	// 32-bit store at x28 + x7; presumably x28 is the context pointer with sq_buffer directly below it - TODO confirm
+		Ret();	// return to the code that called this stub
+
+		Label writeStoreQueue64Label;	// 64-bit handler; its address is published as writeStoreQueue64 after FinalizeCode()
+		Bind(&writeStoreQueue64Label);	// entry: x0 = address, x1 = 64-bit value to store
+		Lsr(x7, x0, 26);	// x7 = address >> 26
+		Cmp(x7, 0x38);	// 0x38 << 26 == 0xE0000000, so eq means the address lies in 0xE0000000-0xE3FFFFFF
+		if (!mmu_enabled())	// the fallback target is chosen when this stub is generated, not at run time
+			GenBranchRuntime(WriteMem64, Condition::ne);	// not in that range: branch (not call) to the generic 64-bit write
+		else
+			GenBranchRuntime(WriteMemNoEx<u64>, Condition::ne);	// same fallback, MMU-enabled variant
+		And(x0, x0, 0x3f);	// keep the low 6 bits: byte offset within a 64-byte window
+		Sub(x7, x0, sizeof(Sh4RCB::sq_buffer), LeaveFlags);	// x7 = offset - sizeof(sq_buffer), i.e. a negative displacement from x28
+		Str(x1, MemOperand(x28, x7));	// 64-bit store at x28 + x7; presumably x28 is the context pointer with sq_buffer directly below it - TODO confirm
+		Ret();	// return to the code that called this stub
+
 		FinalizeCode();
 		emit_Skip(GetBuffer()->GetSizeInBytes());

 		arm64_no_update = GetLabelAddress<DynaCode *>(&no_update);
 		handleException = (void (*)())CC_RW2RX(GetLabelAddress<uintptr_t>(&handleExceptionLabel));
+		writeStoreQueue32 = GetLabelAddress<DynaCode *>(&writeStoreQueue32Label);
+		writeStoreQueue64 = GetLabelAddress<DynaCode *>(&writeStoreQueue64Label);

 		// Flush and invalidate caches
 		vmem_platform_flush_cache(
@ -1528,6 +1559,16 @@ public:
 			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
 	}

+	void GenWriteStoreQueue(u32 size)	// Emits a call to the store queue write stub matching `size` (bytes); used by ngen_Rewrite when patching a faulting write
+	{
+		Instruction *start_instruction = GetCursorAddress<Instruction *>();	// remember where this sequence starts so its length can be enforced below
+
+		if (size == 4)	// only 4 selects the 32-bit stub; every other size takes the 64-bit one (ngen_Rewrite only calls this with size >= 4)
+			GenCall(writeStoreQueue32);
+		else
+			GenCall(writeStoreQueue64);
+		EnsureCodeSize(start_instruction, write_memory_rewrite_size);	// presumably pads the sequence to exactly write_memory_rewrite_size so it fits the rewritten slot - TODO confirm
+	}

 private:
 	// Runtime branches/calls need to be adjusted if rx space is different to rw space.
@ -1555,14 +1596,17 @@ private:
 	}

   template <typename R, typename... P>	// Emits a branch (B / B.cond, no link) to a runtime function given by pointer
-	void GenBranchRuntime(R (*target)(P...))
+	void GenBranchRuntime(R (*target)(P...), Condition cond = al)	// cond defaults to al (always), so pre-existing callers keep emitting an unconditional branch
 	{
 		ptrdiff_t offset = reinterpret_cast<uintptr_t>(target) - reinterpret_cast<uintptr_t>(CC_RW2RX(GetBuffer()->GetStartAddress<void*>()));	// distance from the start of the code buffer (translated by CC_RW2RX) to the target
 		verify(offset >= -128 * 1024 * 1024 && offset <= 128 * 1024 * 1024);	// target must be within +/-128 MiB of the buffer start
 		verify((offset & 3) == 0);	// offset must be a multiple of 4 bytes
 		Label target_label;
 		BindToOffset(&target_label, offset);	// bind a label at the computed offset so the assembler can branch to it
-		B(&target_label);
+		if (cond == al)
+			B(&target_label);	// unconditional form, same as before this change
+		else
+			B(&target_label, cond);	// conditional form; NOTE(review): relies on the assembler coping with targets beyond B.cond's native range - confirm
 	}

 	void GenBranch(DynaCode *code, Condition cond = al)
@ -1585,7 +1629,7 @@ private:

 		GenMemAddr(op, call_regs[0]);
 		if (mmu_enabled())
-			Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0));	// pc
+			Mov(w2, block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0));	// pc

 		u32 size = op.flags & 0x7f;
 		if (!optimise || !GenReadMemoryFast(op, opid))
@ -1756,12 +1800,12 @@ private:
 		// Update ngen_Rewrite (and perhaps read_memory_rewrite_size) if adding or removing code
 		if (!_nvmem_4gb_space())
 		{
-			Ubfx(x1, *call_regs64[0], 0, 29);
+			Ubfx(x1, x0, 0, 29);
 			Add(x1, x1, sizeof(Sh4Context), LeaveFlags);
 		}
 		else
 		{
-			Add(x1, *call_regs64[0], sizeof(Sh4Context), LeaveFlags);
+			Add(x1, x0, sizeof(Sh4Context), LeaveFlags);
 		}

 		u32 size = op.flags & 0x7f;
@ -1795,13 +1839,13 @@ private:

 		GenMemAddr(op, call_regs[0]);
 		if (mmu_enabled())
-			Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0));	// pc
+			Mov(w2, block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0));	// pc

 		u32 size = op.flags & 0x7f;
 		if (size != 8)
-			shil_param_to_host_reg(op.rs2, *call_regs[1]);
+			shil_param_to_host_reg(op.rs2, w1);
 		else
-			shil_param_to_host_reg(op.rs2, *call_regs64[1]);
+			shil_param_to_host_reg(op.rs2, x1);
 		if (optimise && GenWriteMemoryFast(op, opid))
 			return;

@ -1947,12 +1991,12 @@ private:
 		// Update ngen_Rewrite (and perhaps write_memory_rewrite_size) if adding or removing code
 		if (!_nvmem_4gb_space())
 		{
-			Ubfx(x7, *call_regs64[0], 0, 29);
+			Ubfx(x7, x0, 0, 29);
 			Add(x7, x7, sizeof(Sh4Context), LeaveFlags);
 		}
 		else
 		{
-			Add(x7, *call_regs64[0], sizeof(Sh4Context), LeaveFlags);
+			Add(x7, x0, sizeof(Sh4Context), LeaveFlags);
 		}

 		u32 size = op.flags & 0x7f;
@ -2052,9 +2096,9 @@ private:
 			Ldr(w10, sh4_context_mem_operand(&sr));
 			Tbz(w10, 15, &fpu_enabled);			// test SR.FD bit

-			Mov(*call_regs[0], block->vaddr);	// pc
-			Mov(*call_regs[1], 0x800);			// event
-			Mov(*call_regs[2], 0x100);			// vector
+			Mov(w0, block->vaddr);	// pc
+			Mov(w1, 0x800);			// event
+			Mov(w2, 0x100);			// vector
 			CallRuntime(Do_Exception);
 			Ldr(w29, sh4_context_mem_operand(&next_pc));
 			GenBranch(arm64_no_update);
@ -2227,6 +2271,8 @@ bool ngen_Rewrite(host_context_t &context, void *faultAddress)
 	Arm64Assembler *assembler = new Arm64Assembler(code_rewrite);
 	if (is_read)
 		assembler->GenReadMemorySlow(size);
+	else if (!is_read && size >= 4 && (((u8 *)faultAddress - virt_ram_base) >> 26) == 0x38)
+		assembler->GenWriteStoreQueue(size);
 	else
 		assembler->GenWriteMemorySlow(size);
 	assembler->Finalize(true);