x64 dynarec: check if extension is supported by cpu. seh on win32

Check whether FMA/AVX/SSE3 is supported by the CPU before using it. Use a fully naked main loop on win32 with proper SEH directives. win32: more xmm regs are available to allocate, and there is no need to save them when calling out.
2019-01-18 17:02:50 +01:00 · 2019-01-18 17:02:50 +01:00 · cd4e4cbdc9
parent 573f285f3b
commit cd4e4cbdc9
4 changed files with 213 additions and 114 deletions
--- a/core/hw/sh4/dyna/blockmanager.h
+++ b/core/hw/sh4/dyna/blockmanager.h
@ -86,7 +86,7 @@ void bm_WriteBlockMap(const string& file);
 DynarecCodeEntryPtr DYNACALL bm_GetCode(u32 addr);

 extern "C" {
-DynarecCodeEntryPtr DYNACALL bm_GetCode2(u32 addr);
+__attribute__((used)) DynarecCodeEntryPtr DYNACALL bm_GetCode2(u32 addr);
 }

 RuntimeBlockInfo* bm_GetBlock(void* dynarec_code);
--- a/core/hw/sh4/sh4_interpreter.h
+++ b/core/hw/sh4/sh4_interpreter.h
@ -56,6 +56,6 @@ void ExecuteDelayslot_RTE();
 extern "C" {

 int UpdateSystem();
-int UpdateSystem_INTC();
+__attribute__((used)) int UpdateSystem_INTC();

 }
--- a/core/rec-x64/rec_x64.cpp
+++ b/core/rec-x64/rec_x64.cpp
@ -1,4 +1,5 @@
 #include "deps/xbyak/xbyak.h"
+#include "deps/xbyak/xbyak_util.h"

 #include "types.h"

@ -56,104 +57,132 @@ static __attribute((used)) void end_slice()
 #define _U
 #endif

+#ifdef _WIN32
+#define WIN32_ONLY(x) x
+#else
+#define WIN32_ONLY(x)
+#endif
+
+#define STRINGIFY(x) #x
+#define _S(x) STRINGIFY(x)
+#if RAM_SIZE == 16*1024*1024
+#define CPU_RUNNING 68157284
+#define PC 68157256
+#elif RAM_SIZE == 32*1024*1024
+#define CPU_RUNNING 135266148
+#define PC 135266120
+#else
+#error RAM_SIZE unknown
+#endif
+
+#ifdef _WIN32
+        // Fully naked function in win32 for proper SEH prologue
+	__asm__ (
+			".text                          \n\t"
+			".p2align 4,,15                 \n\t"
+			".globl ngen_mainloop           \n\t"
+			".def   ngen_mainloop;  .scl    2;      .type   32;     .endef  \n\t"
+			".seh_proc      ngen_mainloop   \n\t"
+		"ngen_mainloop:                     \n\t"
+#else
 void ngen_mainloop(void* v_cntx)
 {
-	__asm__ volatile (
-			"pushq %%rbx					\n\t"
+	__asm__ (
+#endif
+			"pushq %rbx						\n\t"
+WIN32_ONLY( ".seh_pushreg %rbx				\n\t")
 #ifndef __MACH__	// rbp is pushed in the standard function prologue
-			"pushq %%rbp					\n\t"
+			"pushq %rbp                     \n\t"
 #endif
 #ifdef _WIN32
-			"pushq %%rdi					\n\t"
-			"pushq %%rsi					\n\t"
+			".seh_pushreg %rbp              \n\t"
+			"pushq %rdi                     \n\t"
+			".seh_pushreg %rdi              \n\t"
+			"pushq %rsi                     \n\t"
+			".seh_pushreg %rsi              \n\t"
 #endif
-			"pushq %%r12					\n\t"
-			"pushq %%r13					\n\t"
-			"pushq %%r14					\n\t"
-			"pushq %%r15					\n\t"
-			"subq $8, %%rsp					\n\t"	// 8 for stack 16-byte alignment
-#if defined(__MACH__) || defined(_ANDROID)
-			"movl %[_SH4_TIMESLICE], " _U "cycle_counter(%%rip)	\n"
-
-		"1:							  			\n\t"
-			"movq " _U "p_sh4rcb(%%rip), %%rax	\n\t"
-			"movl %c[CpuRunning](%%rax), %%edx	\n\t"
-			"testl %%edx, %%edx					\n\t"
-			"je 3f								\n"
-
-		"2:								 		\n\t"
-			"movq " _U "p_sh4rcb(%%rip), %%rax	\n\t"
-			"movl %c[pc](%%rax), %%edi			\n\t"
-			"call " _U "bm_GetCode2				\n\t"
-			"call *%%rax						\n\t"
-			"movl " _U "cycle_counter(%%rip), %%ecx \n\t"
-			"testl %%ecx, %%ecx					\n\t"
-			"jg 2b								\n\t"
-
-			"addl %[_SH4_TIMESLICE], %%ecx		\n\t"
-			"movl %%ecx, " _U "cycle_counter(%%rip)	\n\t"
-			"call " _U "UpdateSystem_INTC		\n\t"
-			"jmp 1b								\n"
-
-		"3:										\n\t"
+			"pushq %r12                     \n\t"
+WIN32_ONLY( ".seh_pushreg %r12              \n\t")
+			"pushq %r13                     \n\t"
+WIN32_ONLY( ".seh_pushreg %r13              \n\t")
+			"pushq %r14                     \n\t"
+WIN32_ONLY( ".seh_pushreg %r14              \n\t")
+			"pushq %r15                     \n\t"
+#ifdef _WIN32
+			".seh_pushreg %r15              \n\t"
+			"subq $40, %rsp                 \n\t"   // 32-byte shadow space + 8 for stack 16-byte alignment
+			".seh_stackalloc 40             \n\t"
+			".seh_endprologue               \n\t"
 #else
-			"movl %[_SH4_TIMESLICE], cycle_counter(%%rip)	\n"
+			"subq $8, %rsp                  \n\t"   // 8 for stack 16-byte alignment
+#endif
+			"movl $" _S(SH4_TIMESLICE) "," _U "cycle_counter(%rip)  \n"

-		"run_loop:							\n\t"
-			"movq p_sh4rcb(%%rip), %%rax	\n\t"
-			"movl %p[CpuRunning](%%rax), %%edx	\n\t"
-			"testl %%edx, %%edx				\n\t"
-			"je end_run_loop				\n"
+		"1:                                 \n\t"   // run_loop
+			"movq " _U "p_sh4rcb(%rip), %rax		\n\t"
+			"movl " _S(CPU_RUNNING) "(%rax), %edx	\n\t"
+			"testl %edx, %edx               \n\t"
+			"je 3f                          \n"     // end_run_loop
+#ifdef PROFILING
+			"call start_slice				\n\t"   // must sit on the reachable path before slice_loop; pairs with the end_slice call below
+#endif
+
+		"2:                                 \n\t"   // slice_loop
+			"movq " _U "p_sh4rcb(%rip), %rax	\n\t"
+#ifdef _WIN32
+			"movl " _S(PC)"(%rax), %ecx     \n\t"
+#else
+			"movl " _S(PC)"(%rax), %edi     \n\t"
+#endif
+			"call " _U "bm_GetCode2         \n\t"
+			"call *%rax                     \n\t"
+			"movl " _U "cycle_counter(%rip), %ecx \n\t"
+			"testl %ecx, %ecx               \n\t"
+			"jg 2b                          \n\t"   // slice_loop
+
+			"addl $" _S(SH4_TIMESLICE) ", %ecx		\n\t"
+			"movl %ecx, " _U "cycle_counter(%rip)	\n\t"
+#ifdef PROFILING
+			"call end_slice					\n\t"
+#endif
+			"call " _U "UpdateSystem_INTC   \n\t"
+			"jmp 1b                         \n"     // run_loop
-#ifdef PROFILING
-			"call start_slice				\n\t"
-#endif

-		"slice_loop:						\n\t"
-			"movq p_sh4rcb(%%rip), %%rax	\n\t"
+		"3:                                 \n\t"   // end_run_loop
+
 #ifdef _WIN32
-			"movl %p[pc](%%rax), %%ecx		\n\t"
+			"addq $40, %rsp                 \n\t"
 #else
-			"movl %p[pc](%%rax), %%edi		\n\t"
+			"addq $8, %rsp                  \n\t"
 #endif
-			"call bm_GetCode2				\n\t"
-			"call *%%rax					\n\t"
-			"movl cycle_counter(%%rip), %%ecx \n\t"
-			"testl %%ecx, %%ecx				\n\t"
-			"jg slice_loop					\n\t"
-
-			"addl %[_SH4_TIMESLICE], %%ecx	\n\t"
-			"movl %%ecx, cycle_counter(%%rip)	\n\t"
-#ifdef PROFILING
-			"call end_slice					\n\t"
-#endif
-			"call UpdateSystem_INTC			\n\t"
-			"jmp run_loop					\n"
-
-		"end_run_loop:						\n\t"
-#endif	// !__MACH__
-			"addq $8, %%rsp					\n\t"
-			"popq %%r15						\n\t"
-			"popq %%r14						\n\t"
-			"popq %%r13						\n\t"
-			"popq %%r12						\n\t"
+			"popq %r15                      \n\t"
+			"popq %r14                      \n\t"
+			"popq %r13                      \n\t"
+			"popq %r12                      \n\t"
 #ifdef _WIN32
-			"popq %%rsi						\n\t"
-			"popq %%rdi						\n\t"
+			"popq %rsi                      \n\t"
+			"popq %rdi                      \n\t"
 #endif
 #ifndef __MACH__
-			"popq %%rbp						\n\t"
+			"popq %rbp                      \n\t"
 #endif
-			"popq %%rbx						\n\t"
-			:
-			: [CpuRunning] "i"(offsetof(Sh4RCB, cntx.CpuRunning)),
-			  [pc] "i"(offsetof(Sh4RCB, cntx.pc)),
-			  [_SH4_TIMESLICE] "i"(SH4_TIMESLICE)
-			: "memory"
+			"popq %rbx                      \n\t"
+#ifdef _WIN32
+			"ret                            \n\t"
+			".seh_endproc                   \n"
+	);
+#else
 	);
 }
+#endif
+
+#undef _U
+#undef _S

 void ngen_init()
 {
+	verify(CPU_RUNNING == offsetof(Sh4RCB, cntx.CpuRunning));	// ngen_mainloop pastes CPU_RUNNING into its asm as a literal displacement; check it against the real field offset
+	verify(PC == offsetof(Sh4RCB, cntx.pc));	// same check for the PC literal ngen_mainloop uses to load the next pc
 }

 void ngen_ResetBlocks()
@ -217,8 +246,7 @@ public:
 		}
 		regalloc.DoAlloc(block);

-		mov(rax, (size_t)&cycle_counter);
-		sub(dword[rax], block->guest_cycles);
+		sub(dword[rip + &cycle_counter], block->guest_cycles);
 #ifdef PROFILING
 		mov(rax, (uintptr_t)&guest_cpu_cycles);
 		mov(ecx, block->guest_cycles);
@ -321,6 +349,7 @@ public:

 						default:
 							die("Invalid immediate size");
+  							break;
 						}
 					}
 					else
@ -342,6 +371,7 @@ public:

 						default:
 							die("Invalid immediate size");
+  							break;
 						}
 						host_reg_to_shil_param(op.rd, ecx);
 					}
@ -355,10 +385,7 @@ public:
 						if (op.rs3.is_imm())
 							add(call_regs[0], op.rs3._imm);
 						else
-						{
-							shil_param_to_host_reg(op.rs3, edx);
-							add(call_regs[0], edx);
-						}
+							add(call_regs[0], regalloc.MapRegister(op.rs3));
 					}

 					if (size == 1) {
@ -407,10 +434,7 @@ public:
 					if (op.rs3.is_imm())
 						add(call_regs[0], op.rs3._imm);
 					else
-					{
-						shil_param_to_host_reg(op.rs3, edx);	// edx is call_regs[1] on win32 so it's safe here
-						add(call_regs[0], edx);
-					}
+						add(call_regs[0], regalloc.MapRegister(op.rs3));
 				}

 				if (size != 8)
@ -710,7 +734,14 @@ public:
 			case shop_fmac:
 				if (regalloc.mapf(op.rd) != regalloc.mapf(op.rs1))
 					movss(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs1));
-				vfmadd231ss(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs2), regalloc.MapXRegister(op.rs3));
+				if (cpu.has(Xbyak::util::Cpu::tFMA))
+					vfmadd231ss(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs2), regalloc.MapXRegister(op.rs3));
+				else
+				{
+					movss(xmm0, regalloc.MapXRegister(op.rs2));
+					mulss(xmm0, regalloc.MapXRegister(op.rs3));
+					addss(regalloc.MapXRegister(op.rd), xmm0);
+				}
 				break;

 			case shop_fsrra:
@ -749,43 +780,104 @@ public:
 				break;

 			case shop_fipr:
-				mov(rax, (size_t)op.rs1.reg_ptr());
-				movaps(regalloc.MapXRegister(op.rd), dword[rax]);
-				mov(rax, (size_t)op.rs2.reg_ptr());
-				mulps(regalloc.MapXRegister(op.rd), dword[rax]);
-				haddps(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rd));
-				haddps(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rd));
+				{
+					mov(rax, (size_t)op.rs1.reg_ptr());
+					movaps(regalloc.MapXRegister(op.rd), dword[rax]);
+					mov(rax, (size_t)op.rs2.reg_ptr());
+					mulps(regalloc.MapXRegister(op.rd), dword[rax]);
+					const Xbyak::Xmm &rd = regalloc.MapXRegister(op.rd);
+					// Only first-generation 64-bit CPUs lack SSE3 support
+					if (cpu.has(Xbyak::util::Cpu::tSSE3))
+					{
+						haddps(rd, rd);
+						haddps(rd, rd);
+					}
+					else
+					{
+						movhlps(xmm1, rd);
+						addps(rd, xmm1);
+						movaps(xmm1, rd);
+						shufps(xmm1, xmm1,1);
+						addss(rd, xmm1);
+					}
+				}
 				break;

 			case shop_ftrv:
 				mov(rax, (uintptr_t)op.rs1.reg_ptr());
-				vmovaps(xmm0, xword[rax]);					// fn[0-4]
-				mov(rax, (uintptr_t)op.rs2.reg_ptr());		// fm[0-15]
+				if (cpu.has(Xbyak::util::Cpu::tFMA))
+				{
+					movaps(xmm0, xword[rax]);					// fn[0-4]
+					mov(rax, (uintptr_t)op.rs2.reg_ptr());		// fm[0-15]

-				pshufd(xmm1, xmm0, 0x00);					// fn[0]
-				vmulps(xmm2, xmm1, xword[rax]);				// fm[0-3]
-				pshufd(xmm1, xmm0, 0x55);					// fn[1]
-				vfmadd231ps(xmm2, xmm1, xword[rax + 16]);	// fm[4-7]
-				pshufd(xmm1, xmm0, 0xaa);					// fn[2]
-				vfmadd231ps(xmm2, xmm1, xword[rax + 32]);	// fm[8-11]
-				pshufd(xmm1, xmm0, 0xff);					// fn[3]
-				vfmadd231ps(xmm2, xmm1, xword[rax + 48]);	// fm[12-15]
-				mov(rax, (uintptr_t)op.rd.reg_ptr());
-				vmovaps(xword[rax], xmm2);
+					pshufd(xmm1, xmm0, 0x00);					// fn[0]
+					vmulps(xmm2, xmm1, xword[rax]);				// fm[0-3]
+					pshufd(xmm1, xmm0, 0x55);					// fn[1]
+					vfmadd231ps(xmm2, xmm1, xword[rax + 16]);	// fm[4-7]
+					pshufd(xmm1, xmm0, 0xaa);					// fn[2]
+					vfmadd231ps(xmm2, xmm1, xword[rax + 32]);	// fm[8-11]
+					pshufd(xmm1, xmm0, 0xff);					// fn[3]
+					vfmadd231ps(xmm2, xmm1, xword[rax + 48]);	// fm[12-15]
+					mov(rax, (uintptr_t)op.rd.reg_ptr());
+					movaps(xword[rax], xmm2);
+				}
+				else
+				{	// fallback without FMA: uses only movaps/pshufd/mulps/addps
+					movaps(xmm3, xword[rax]);                   //xmm3=vector (loaded from op.rs1)
+					pshufd(xmm0, xmm3, 0);                      //xmm0={v0}
+					pshufd(xmm1, xmm3, 0x55);                   //xmm1={v1}
+					pshufd(xmm2, xmm3, 0xaa);                   //xmm2={v2}
+					pshufd(xmm3, xmm3, 0xff);                   //xmm3={v3} (overwrites the original vector, which is no longer needed)
+
+					//do the matrix mult !
+					mov(rax, (uintptr_t)op.rs2.reg_ptr());
+					mulps(xmm0, xword[rax + 0]);   //v0*=vm0
+					mulps(xmm1, xword[rax + 16]);  //v1*=vm1
+					mulps(xmm2, xword[rax + 32]);  //v2*=vm2
+					mulps(xmm3, xword[rax + 48]);  //v3*=vm3
+
+					addps(xmm0, xmm1);	 //sum it all up
+					addps(xmm2, xmm3);
+					addps(xmm0, xmm2);
+
+					mov(rax, (uintptr_t)op.rd.reg_ptr());
+					movaps(xword[rax], xmm0);	// store the 4-float result to op.rd
+
+				}
 				break;

 			case shop_frswap:
 				mov(rax, (uintptr_t)op.rs1.reg_ptr());
 				mov(rcx, (uintptr_t)op.rd.reg_ptr());
-				vmovaps(ymm0, yword[rax]);
-				vmovaps(ymm1, yword[rcx]);
-				vmovaps(yword[rax], ymm1);
-				vmovaps(yword[rcx], ymm0);
+				if (cpu.has(Xbyak::util::Cpu::tAVX512F))
+				{
+					// Swap the 64 bytes at rax with the 64 bytes at rcx in a single pass.
+					// NOTE(review): aligned zword moves need 64-byte-aligned addresses — confirm the storage behind op.rs1/op.rd is aligned that strictly
+					vmovaps(zmm0, zword[rax]);
+					vmovaps(zmm1, zword[rcx]);
+					vmovaps(zword[rax], zmm1);
+					vmovaps(zword[rcx], zmm0);
+				}
+				else if (cpu.has(Xbyak::util::Cpu::tAVX))
+				{
+					vmovaps(ymm0, yword[rax]);
+					vmovaps(ymm1, yword[rcx]);
+					vmovaps(yword[rax], ymm1);
+					vmovaps(yword[rcx], ymm0);

-				vmovaps(ymm0, yword[rax + 32]);
-				vmovaps(ymm1, yword[rcx + 32]);
-				vmovaps(yword[rax + 32], ymm1);
-				vmovaps(yword[rcx + 32], ymm0);
+					vmovaps(ymm0, yword[rax + 32]);
+					vmovaps(ymm1, yword[rcx + 32]);
+					vmovaps(yword[rax + 32], ymm1);
+					vmovaps(yword[rcx + 32], ymm0);
+				}
+				else
+				{
+					for (int i = 0; i < 4; i++)
+					{
+						movaps(xmm0, xword[rax + (i * 16)]);
+						movaps(xmm1, xword[rcx + (i * 16)]);
+						movaps(xword[rax + (i * 16)], xmm1);
+						movaps(xword[rcx + (i * 16)], xmm0);
+					}
+				}
 				break;

 			case shop_cvt_f2i_t:
@ -1047,19 +1139,24 @@ private:
 	template<class Ret, class... Params>
 	void GenCall(Ret(*function)(Params...))
 	{
+#ifndef _WIN32
+		// xmm8-11 are not preserved across calls on linux/mach, so spill them around the call (movd keeps only the low 32 bits of each)
 		sub(rsp, 16);
 		movd(ptr[rsp + 0], xmm8);
 		movd(ptr[rsp + 4], xmm9);
 		movd(ptr[rsp + 8], xmm10);
 		movd(ptr[rsp + 12], xmm11);
+#endif	// !_WIN32
 
 		call(function);
 
+#ifndef _WIN32
 		movd(xmm8, ptr[rsp + 0]);
 		movd(xmm9, ptr[rsp + 4]);
 		movd(xmm10, ptr[rsp + 8]);
 		movd(xmm11, ptr[rsp + 12]);
 		add(rsp, 16);
+#endif	// !_WIN32
 	}

 	// uses eax/rax
@ -1129,6 +1226,7 @@ private:
 	vector<CC_PS> CC_pars;

 	X64RegAlloc regalloc;
+	Xbyak::util::Cpu cpu;
 	static const u32 float_sign_mask;
 	static const u32 float_abs_mask;
 	static const f32 cvtf2i_pos_saturation;
--- a/core/rec-x64/x64_regalloc.h
+++ b/core/rec-x64/x64_regalloc.h
@ -26,11 +26,12 @@
 #ifdef _WIN32
 static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI,
 		Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 };
+static s8 alloc_fregs[] = { 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 };          // XMM6 to XMM15 are callee-saved in Windows, so they need no spill around calls out of generated code. NOTE(review): the win32 ngen_mainloop prologue pushes only integer registers — confirm XMM6-15 are preserved for its own caller
 #else
 static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13,
 		Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 };
-#endif
 static s8 alloc_fregs[] = { 8, 9, 10, 11, -1 };		// XMM8-11
+#endif

 class BlockCompiler;