From 8b4da698613a8eae8552567dc838216fe4fe2fd0 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Thu, 17 Nov 2016 22:03:11 +0100 Subject: [PATCH 01/20] cmake: always define avx on 64 bits build --- cmake/BuildParameters.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/BuildParameters.cmake b/cmake/BuildParameters.cmake index e5699ef608..8f03540bb6 100644 --- a/cmake/BuildParameters.cmake +++ b/cmake/BuildParameters.cmake @@ -234,7 +234,7 @@ elseif(${PCSX2_TARGET_ARCHITECTURES} MATCHES "x86_64") if (USE_ICC) set(ARCH_FLAG "-msse2") else() - set(ARCH_FLAG "-msse -msse2 -mfxsr") + set(ARCH_FLAG "-msse -msse2 -mfxsr -mssse3 -msse4.1 -mavx") endif() else() #set(ARCH_FLAG "-march=native -fabi-version=6") From 82d12691e1788faa06f8982d96eb1beae2ae1acc Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 17:59:58 +0100 Subject: [PATCH 02/20] gsdx: properly check SSE support 1/ Check all "levels" 2/ requires AVX for 64 bits --- plugins/GSdx/GSUtil.cpp | 56 ++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index d710057489..2a23156d4b 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -203,38 +203,42 @@ bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm) bool GSUtil::CheckSSE() { + bool status = true; Xbyak::util::Cpu cpu; - Xbyak::util::Cpu::Type type; - const char* instruction_set = ""; - #if _M_SSE >= 0x501 - type = Xbyak::util::Cpu::tAVX2; - instruction_set = "AVX2"; - #elif _M_SSE >= 0x500 - type = Xbyak::util::Cpu::tAVX; - instruction_set = "AVX"; - #elif _M_SSE >= 0x402 - type = Xbyak::util::Cpu::tSSE42; - instruction_set = "SSE4.2"; - #elif _M_SSE >= 0x401 - type = Xbyak::util::Cpu::tSSE41; - instruction_set = "SSE4.1"; - #elif _M_SSE >= 0x301 - type = Xbyak::util::Cpu::tSSSE3; - instruction_set = "SSSE3"; - #elif _M_SSE >= 0x200 - type = Xbyak::util::Cpu::tSSE2; - instruction_set = "SSE2"; - 
#endif + struct ISA { + Xbyak::util::Cpu::Type type; + const char* name; + }; - if(!cpu.has(type)) - { - fprintf(stderr, "This CPU does not support %s\n", instruction_set); + ISA checks[] = { + {Xbyak::util::Cpu::tSSE2, "SSE2"}, +#if _M_SSE >= 0x301 || defined(_M_AMD64) + {Xbyak::util::Cpu::tSSSE3, "SSSE3"}, +#endif +#if _M_SSE >= 0x401 || defined(_M_AMD64) + {Xbyak::util::Cpu::tSSE41, "SSE41"}, +#endif +#if _M_SSE >= 0x402 || defined(_M_AMD64) + {Xbyak::util::Cpu::tSSE42, "SSE42"}, +#endif +#if _M_SSE >= 0x500 || defined(_M_AMD64) + {Xbyak::util::Cpu::tAVX, "AVX1"}, +#endif +#if _M_SSE >= 0x501 + {Xbyak::util::Cpu::tAVX2, "AVX2"}, +#endif + }; - return false; + for (size_t i = 0; i < countof(checks); i++) { + if(!cpu.has(checks[i].type)) { + fprintf(stderr, "This CPU does not support %s\n", checks[i].name); + + status = false; + } } - return true; + return status; } #define OCL_PROGRAM_VERSION 3 From 43b4cfc21537c950b259d35e989c64f727dc138a Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Wed, 16 Nov 2016 20:24:42 +0100 Subject: [PATCH 03/20] gsdx: separate dump directory for 32/64 bits --- plugins/GSdx/GSState.cpp | 4 ++-- plugins/GSdx/stdafx.h | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 166aef74bc..227fd16128 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -65,8 +65,8 @@ GSState::GSState() m_dump_root = ""; #if defined(__unix__) if (s_dump) { - GSmkdir("/tmp/GS_HW_dump"); - GSmkdir("/tmp/GS_SW_dump"); + GSmkdir(root_hw.c_str()); + GSmkdir(root_sw.c_str()); } #endif diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 776d271c56..08a11afc4f 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -472,6 +472,11 @@ struct GLAutoPop { const std::string root_sw("c:\\temp1\\_"); const std::string root_hw("c:\\temp2\\_"); #else -const std::string root_sw("/tmp/GS_SW_dump/"); -const std::string root_hw("/tmp/GS_HW_dump/"); +#ifdef 
_M_AMD64 +const std::string root_sw("/tmp/GS_SW_dump64/"); +const std::string root_hw("/tmp/GS_HW_dump64/"); +#else +const std::string root_sw("/tmp/GS_SW_dump32/"); +const std::string root_hw("/tmp/GS_HW_dump32/"); +#endif #endif From 633f7a1db933b7dade82efb50c2571ae40c5418b Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 15 Nov 2016 19:07:48 +0100 Subject: [PATCH 04/20] xbyak: add int3 instruction Very useful to stop the JIT --- plugins/GSdx/xbyak/xbyak_mnemonic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/GSdx/xbyak/xbyak_mnemonic.h b/plugins/GSdx/xbyak/xbyak_mnemonic.h index d551c61323..3eda2556c1 100644 --- a/plugins/GSdx/xbyak/xbyak_mnemonic.h +++ b/plugins/GSdx/xbyak/xbyak_mnemonic.h @@ -370,6 +370,7 @@ void cwde() { db(0x98); } void lahf() { db(0x9F); } void lock() { db(0xF0); } void nop() { db(0x90); } +void int3() { db(0xCC); } void sahf() { db(0x9E); } void stc() { db(0xF9); } void std() { db(0xFD); } From e31ce87bb31bd4a55c838a32db77f2c6819d20e2 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 15 Nov 2016 19:12:31 +0100 Subject: [PATCH 05/20] gsdx: SW JIT debug helper Allow to compare 32/64 bits (and all ISAs too) Allow to breakpoint (int3) Print selector info Print size of buffer and start (disabled by default) --- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 3 +++ plugins/GSdx/GSFunctionMap.h | 20 +++++++++++++++++++- plugins/GSdx/GSScanlineEnvironment.h | 18 ++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index 90c941638e..ce3e1b6801 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -100,6 +100,9 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key { m_sel.key = key; + if(m_sel.breakpoint) + int3(); + Generate(); } diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index 
ab2f065798..5b6cc281d9 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -26,6 +26,8 @@ #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" +#include "GSScanlineEnvironment.h" + template class GSFunctionMap { protected: @@ -161,6 +163,7 @@ class GSCodeGeneratorFunctionMap : public GSFunctionMap void* m_param; hash_map m_cgmap; GSCodeBuffer m_cb; + size_t m_total_code_size; enum {MAX_SIZE = 8192}; @@ -168,9 +171,15 @@ public: GSCodeGeneratorFunctionMap(const char* name, void* param) : m_name(name) , m_param(param) + , m_total_code_size(0) { } + ~GSCodeGeneratorFunctionMap() + { + fprintf(stderr, "%s generated %zu bytes of instruction\n", m_name.c_str(), m_total_code_size); + } + VALUE GetDefaultFunction(KEY key) { VALUE ret = NULL; @@ -183,10 +192,19 @@ public: } else { - CG* cg = new CG(m_param, key, m_cb.GetBuffer(MAX_SIZE), MAX_SIZE); + void* code_ptr = m_cb.GetBuffer(MAX_SIZE); + CG* cg = new CG(m_param, key, code_ptr, MAX_SIZE); ASSERT(cg->getSize() < MAX_SIZE); +#if 0 + fprintf(stderr, "%s Location:%p Size:%zu Key:%llx\n", m_name.c_str(), code_ptr, cg->getSize(), (uint64)key); + GSScanlineSelector sel(key); + sel.Print(); +#endif + + m_total_code_size += cg->getSize(); + m_cb.ReleaseBuffer(cg->getSize()); ret = (VALUE)cg->getCode(); diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index e9f0551e40..115c10a3db 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -69,6 +69,8 @@ union GSScanlineSelector uint32 mmin:2; // 53 uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) // TODO: 1D texture flag? 
could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction + + uint32 breakpoint:1; // Insert a trap (int3) at the start of the generated code so that a debugger stops there }; struct @@ -76,6 +78,7 @@ union GSScanlineSelector uint32 _pad1:22; uint32 ababcd:8; uint32 _pad2:2; + uint32 fb:2; uint32 _pad3:1; uint32 zb:2; @@ -89,6 +92,9 @@ union GSScanlineSelector uint64 key; + GSScanlineSelector() = default; + GSScanlineSelector(uint64 k) : key(k) {} + operator uint32() const {return lo;} operator uint64() const {return key;} @@ -103,6 +109,18 @@ union GSScanlineSelector && date == 0 && fge == 0; } + + void Print() const + { + fprintf(stderr, "fpsm:%d zpsm:%d ztst:%d ztest:%d atst:%d afail:%d iip:%d rfb:%d fb:%d zb:%d zw:%d " + "tfx:%d tcc:%d fst:%d ltf:%d tlu:%d wms:%d wmt:%d mmin:%d lcm:%d tw:%d " + "fba:%d cclamp:%d date:%d datm:%d " + "prim:%d abe:%d %d%d%d%d fge:%d dthe:%d notest:%d\n", + fpsm, zpsm, ztst, ztest, atst, afail, iip, rfb, fb, zb, zwrite, + tfx, tcc, fst, ltf, tlu, wms, wmt, mmin, lcm, tw, + fba, colclamp, date, datm, + prim, abe, aba, abb, abc, abd , fge, dthe, notest); + } }; struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer From 4a47224ac166c459f191c3312dbbb84faf4de3d3 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 15 Nov 2016 19:27:44 +0100 Subject: [PATCH 06/20] gsdx: define the linux x64 ABI --- plugins/GSdx/stdafx.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 08a11afc4f..b6f466afc7 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -269,10 +269,28 @@ using namespace stdext; #define _M_AMD64 + // Use MIPS-like register names: a0-a3 map to the first four integer argument registers of the target ABI, t0-t1 to two extra working registers + #ifdef _WIN64 + #define a0 rcx + #define a1 rdx + #define a2 r8 + #define a3 r9 + #define t0 rdi + #define t1 rsi + #else + #define a0 rdi + #define a1 rsi + #define a2 rdx + #define a3 rcx + #define t0 
r8 + #define t1 r9 + #endif + #endif // sse #if defined(__GNUC__) && !defined(__x86_64__) + // Convert gcc see define into GSdx (windows) define #if defined(__AVX2__) #define _M_SSE 0x501 @@ -290,6 +308,21 @@ using namespace stdext; #define _M_SSE 0x100 #endif +#elif defined(__GNUC__) + +#if defined(__AVX2__) +// FIXME until code is done + #define _M_SSE 0x500 +#elif defined(__AVX__) + #define _M_SSE 0x500 +#elif defined(__SSE4_1__) + #define _M_SSE 0x401 +#else +// FIXME won't likely be supported but allow to compile the code +// Note: from steam survey SSE4.1 is supported by 87% (end of 2015) + #define _M_SSE 0x200 +#endif + #endif #if !defined(_M_SSE) && (!defined(_WIN32) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2) From 8e29e09943122dfe578a9b713a5deac197ff9e77 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 15 Nov 2016 19:31:41 +0100 Subject: [PATCH 07/20] gsdx sw x64: update setup prim generator x64 SSE&AVX --- .../GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp | 120 ++++++++--------- plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp | 125 +++++++++--------- 2 files changed, 123 insertions(+), 122 deletions(-) diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp index 5fe710dad3..f42d4feb1b 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp @@ -29,18 +29,20 @@ using namespace Xbyak; void GSSetupPrimCodeGenerator::Generate() { +#ifdef _WIN64 sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); +#endif - mov(r8, (size_t)&m_local); + mov(t0, (size_t)&m_local); if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { mov(rax, (size_t)&m_shift[0]); - for(int i = 0; i < 5; i++) + for(int i = 0; i < (m_sel.notest ? 
2 : 5); i++) { vmovaps(Xmm(3 + i), ptr[rax + i * 16]); } @@ -52,10 +54,12 @@ void GSSetupPrimCodeGenerator::Generate() Color(); +#ifdef _WIN64 vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); add(rsp, 8 + 2 * 16); +#endif ret(); } @@ -71,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -85,9 +89,9 @@ void GSSetupPrimCodeGenerator::Depth() vcvttps2dq(xmm2, xmm2); vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -97,7 +101,7 @@ void GSSetupPrimCodeGenerator::Depth() vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm2); + vmovdqa(ptr[t0 + variableOffset], xmm2); } } @@ -110,24 +114,28 @@ void GSSetupPrimCodeGenerator::Depth() // m_local.d4.z = dz * 4.0f; vmulps(xmm1, xmm0, xmm3); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; vmulps(xmm1, xmm0, Xmm(4 + i)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm1); + vmovdqa(ptr[t0 + variableOffset], xmm1); } } } else { - // GSVector4 p = vertices[0].p; + // GSVector4 p = vertex[index[1]].p; - vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); + mov(eax, ptr[a1 + sizeof(uint32) * 1]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + + vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -136,41 +144,16 @@ void GSSetupPrimCodeGenerator::Depth() vcvttps2dq(xmm1, xmm0); vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1); } if(m_en.z) { - // GSVector4 z = p.zzzz(); + // uint32 z is bypassed in t.w - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(r9, (size_t)&GSVector4::m_half); - - vbroadcastss(xmm1, ptr[r9]); - vmulps(xmm1, xmm0); - vcvttps2dq(xmm1, xmm1); - vpslld(xmm1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm0, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } - - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); + vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); + vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0); } } } @@ -184,7 +167,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]); vmulps(xmm1, xmm0, xmm3); @@ -194,13 +177,13 @@ void GSSetupPrimCodeGenerator::Texture() 
vcvttps2dq(xmm1, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } else { // m_local.d4.stq = t * 4.0f; - vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + vmovaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) @@ -211,7 +194,7 @@ void GSSetupPrimCodeGenerator::Texture() vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -228,8 +211,8 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: vmovdqa(ptr[r8 + variableOffsetS], xmm2); break; - case 1: vmovdqa(ptr[r8 + variableOffsetT], xmm2); break; + case 0: vmovdqa(ptr[t0 + variableOffsetS], xmm2); break; + case 1: vmovdqa(ptr[t0 + variableOffsetT], xmm2); break; } } else @@ -242,9 +225,9 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: vmovaps(ptr[r8 + variableOffsetS], xmm2); break; - case 1: vmovaps(ptr[r8 + variableOffsetT], xmm2); break; - case 2: vmovaps(ptr[r8 + variableOffsetQ], xmm2); break; + case 0: vmovaps(ptr[t0 + variableOffsetS], xmm2); break; + case 1: vmovaps(ptr[t0 + variableOffsetT], xmm2); break; + case 2: vmovaps(ptr[t0 + variableOffsetQ], xmm2); break; } } } @@ -262,7 +245,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -270,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color() vcvttps2dq(xmm1, xmm1); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); vpackssdw(xmm1, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm1); // xmm3 is not needed anymore @@ -299,12 +282,12 @@ void GSSetupPrimCodeGenerator::Color() 
vpunpcklwd(xmm0, xmm1); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm0); + vmovdqa(ptr[t0 + variableOffset], xmm0); } // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it // GSVector4 dg = c.yyyy(); // GSVector4 da = c.wwww(); @@ -312,7 +295,7 @@ void GSSetupPrimCodeGenerator::Color() vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); @@ -331,14 +314,31 @@ void GSSetupPrimCodeGenerator::Color() vpunpcklwd(xmm0, xmm1); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm0); + vmovdqa(ptr[t0 + variableOffset], xmm0); } } else { - // GSVector4i c = GSVector4i(vertices[0].c); + // GSVector4i c = GSVector4i(vertex[index[last]].c); - vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); + int last = 0; + + switch(m_sel.prim) + { + case GS_POINT_CLASS: last = 0; break; + case GS_LINE_CLASS: last = 1; break; + case GS_TRIANGLE_CLASS: last = 2; break; + case GS_SPRITE_CLASS: last = 1; break; + } + + if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() + { + mov(eax, ptr[a1 + sizeof(uint32) * last]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + } + + vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); @@ -358,8 +358,8 @@ void GSSetupPrimCodeGenerator::Color() vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); - 
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2); } } diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp index 6456ead387..238dd86bdc 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp @@ -29,16 +29,20 @@ using namespace Xbyak; void GSSetupPrimCodeGenerator::Generate() { +#ifdef _WIN64 sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); +#endif - mov(r8, (size_t)&m_local); + mov(t0, (size_t)&m_local); if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { - for(int i = 0; i < 5; i++) + mov(rax, (size_t)&m_shift[0]); + + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) { movaps(Xmm(3 + i), ptr[rax + i * 16]); } @@ -50,10 +54,12 @@ void GSSetupPrimCodeGenerator::Generate() Color(); +#ifdef _WIN64 vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); add(rsp, 8 + 2 * 16); +#endif ret(); } @@ -69,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -85,9 +91,9 @@ void GSSetupPrimCodeGenerator::Depth() cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -98,7 +104,7 @@ void GSSetupPrimCodeGenerator::Depth() pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm2); + movdqa(ptr[t0 + variableOffset], xmm2); } } @@ -112,9 +118,9 @@ void GSSetupPrimCodeGenerator::Depth() movaps(xmm1, xmm0); mulps(xmm1, xmm3); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; @@ -122,15 +128,19 @@ void GSSetupPrimCodeGenerator::Depth() mulps(xmm1, Xmm(4 + i)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm1); + movdqa(ptr[t0 + variableOffset], xmm1); } } } else { - // GSVector4 p = vertices[0].p; + // GSVector4 p = vertex[index[1]].p; - movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); + mov(eax, ptr[a1 + sizeof(uint32) * 1]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + + movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -139,42 +149,16 @@ void GSSetupPrimCodeGenerator::Depth() cvttps2dq(xmm1, xmm0); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1); } if(m_en.z) { - // GSVector4 z = p.zzzz(); + // uint32 z is bypassed in t.w - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(r9, (size_t)&GSVector4::m_half); - - movss(xmm1, ptr[r9]); - shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); - mulps(xmm1, xmm0); - cvttps2dq(xmm1, 
xmm1); - pslld(xmm1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(xmm2, xmm2); - psrld(xmm2, 31); - pand(xmm0, xmm2); - - por(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); + vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); + vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0); } } } @@ -188,7 +172,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]); movaps(xmm1, xmm0); mulps(xmm1, xmm3); @@ -199,13 +183,13 @@ void GSSetupPrimCodeGenerator::Texture() cvttps2dq(xmm1, xmm1); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } else { // m_local.d4.stq = t * 4.0f; - movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) @@ -217,7 +201,7 @@ void GSSetupPrimCodeGenerator::Texture() movaps(xmm1, xmm0); shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -235,8 +219,8 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: movdqa(ptr[r8 + variableOffsetS], xmm2); break; - case 1: movdqa(ptr[r8 + variableOffsetT], xmm2); break; + case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break; + case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break; } } else @@ -249,9 +233,9 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: movaps(ptr[r8 + variableOffsetS], xmm2); break; - case 1: movaps(ptr[r8 + variableOffsetT], xmm2); break; - case 2: movaps(ptr[r8 + variableOffsetQ], xmm2); break; + case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break; + case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break; + case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break; } } } @@ -269,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); movaps(xmm1, xmm0); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -279,7 +263,7 @@ void GSSetupPrimCodeGenerator::Color() cvttps2dq(xmm2, xmm2); pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); packssdw(xmm2, xmm2); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2); // xmm3 is not needed anymore @@ -289,7 +273,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); @@ -310,12 +294,12 @@ void GSSetupPrimCodeGenerator::Color() punpcklwd(xmm2, xmm3); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm2); + movdqa(ptr[t0 + variableOffset], xmm2); } // GSVector4 c = dscan.c; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it movaps(xmm1, xmm0); // GSVector4 dg = c.yyyy(); @@ -324,7 +308,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); @@ -345,14 +329,31 @@ void GSSetupPrimCodeGenerator::Color() punpcklwd(xmm2, xmm3); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm2); + movdqa(ptr[t0 + variableOffset], xmm2); } } else { - // GSVector4i c = GSVector4i(vertices[0].c); + // GSVector4i c = GSVector4i(vertex[index[last]].c); - cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); + int last = 0; + + switch(m_sel.prim) + { + case GS_POINT_CLASS: last = 0; break; + case GS_LINE_CLASS: last = 1; break; + case GS_TRIANGLE_CLASS: last = 2; break; + case GS_SPRITE_CLASS: last = 1; break; + } + + if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() + { + mov(eax, ptr[a1 + sizeof(uint32) * last]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + } + + cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); @@ -372,8 +373,8 @@ void GSSetupPrimCodeGenerator::Color() pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm2, xmm0, 
_MM_SHUFFLE(2, 2, 2, 2)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2); } } From a281bda9a661116bd7ae529c93c1e6cbd08363ff Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 18:10:30 +0100 Subject: [PATCH 08/20] gsdx sw x64: port the scanline generator on AVX Based on Gabest's work. * Mipmapping is still missing Note on dithering: it is a bit tricky because a2 on Linux is the rdx register, which overlaps with fzm (dh/dl). It might require dedicated Windows code --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 798 +++++++++++------- .../GSdx/GSDrawScanlineCodeGenerator.x64.cpp | 3 + .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 4 +- 3 files changed, 491 insertions(+), 314 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index ddff1eeef0..f232ee961a 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -23,14 +23,36 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +// Ease the reading of the code +#define _m_local r11 +#define _m_local__gd r12 +#define _m_local__gd__vm r13 +#define _m_local__gd__clut r14 +#define _m_local__gd__tex r15 +// Prettier names +#define _z xmm8 +#define _f xmm9 +#define _s xmm10 +#define _t xmm11 +#define _q xmm12 +#define _f_rb xmm13 +#define _f_ga xmm14 +#define _test xmm15 +// Extra bonus +#define _rb xmm2 +#define _ga xmm3 +#define _fm xmm4 +#define _zm xmm5 +#define _fd xmm6 -#error TODO +#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) void GSDrawScanlineCodeGenerator::Generate() { - // TODO: on linux/mac rsi, rdi, xmm6-xmm15 are all caller saved + bool need_tex = m_sel.fb && m_sel.tfx != 
TFX_NONE; + bool need_clut = need_tex && m_sel.tlu; +#ifdef _WIN64 push(rbx); push(rsi); push(rdi); @@ -39,26 +61,39 @@ void GSDrawScanlineCodeGenerator::Generate() push(r13); sub(rsp, 8 + 10 * 16); - + for(int i = 6; i < 16; i++) { vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); } +#else + // No reservation on the stack as a red zone is available + push(rbp); + mov(ptr[rsp - 1 * 8], rbx); + mov(ptr[rsp - 2 * 8], r12); + mov(ptr[rsp - 3 * 8], r13); + mov(ptr[rsp - 4 * 8], r14); + mov(ptr[rsp - 5 * 8], r15); +#endif mov(r10, (size_t)&m_test[0]); - mov(r11, (size_t)&m_local); - mov(r12, (size_t)m_local.gd); - mov(r13, (size_t)m_local.gd->vm); + mov(_m_local, (size_t)&m_local); + mov(_m_local__gd, ptr[_m_local + offsetof(GSScanlineLocalData, gd)]); + + mov(_m_local__gd__vm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, vm)]); + // FIXME: those 2 load could be optimized when no texture + mov(_m_local__gd__clut, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, clut)]); + mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]); Init(); - // rcx = steps - // rsi = fza_base - // rdi = fza_offset + // a0 = steps + // t1 = fza_base + // t0 = fza_offset // r10 = &m_test[0] - // r11 = &m_local - // r12 = m_local->gd - // r13 = m_local->gd.vm + // _m_local = &m_local + // _m_local__gd = m_local->gd + // _m_local__gd__vm = m_local->gd.vm // xmm7 = vf (sprite && ltf) // xmm8 = z // xmm9 = f @@ -66,7 +101,7 @@ void GSDrawScanlineCodeGenerator::Generate() // xmm11 = t // xmm12 = q // xmm13 = rb - // xmm14 = ga + // xmm14 = ga // xmm15 = test if(!m_sel.edge) @@ -188,7 +223,7 @@ L("step"); if(!m_sel.edge) { - test(rcx, rcx); + test(a0, a0); jle("exit", T_NEAR); @@ -199,6 +234,7 @@ L("step"); L("exit"); +#ifdef _WIN64 for(int i = 6; i < 16; i++) { vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); @@ -212,77 +248,99 @@ L("exit"); pop(rdi); pop(rsi); pop(rbx); +#else + mov(rbx, ptr[rsp - 1 * 8]); + mov(r12, ptr[rsp - 2 * 8]); + mov(r13, ptr[rsp - 3 * 8]); + mov(r14, 
ptr[rsp - 4 * 8]); + mov(r15, ptr[rsp - 5 * 8]); + pop(rbp); +#endif ret(); } void GSDrawScanlineCodeGenerator::Init() { - // int skip = left & 3; + if(!m_sel.notest) + { + // int skip = left & 3; - mov(rbx, rdx); - and(rdx, 3); + mov(rbx, a1); + and(a1, 3); - // left -= skip; + // left -= skip; - sub(rbx, rdx); + sub(rbx, a1); - // int steps = pixels + skip - 4; + // int steps = pixels + skip - 4; - lea(rcx, ptr[rcx + rdx - 4]); + lea(a0, ptr[a0 + a1 - 4]); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(rdx, 4); + shl(a1, 4); // * sizeof(m_test[0]) - vmovdqa(xmm15, ptr[rdx + r10]); + vmovdqa(_test, ptr[a1 + r10]); - mov(rax, rcx); - sar(rax, 63); - and(rax, rcx); - shl(rax, 4); + mov(rax, a0); + sar(rax, 63); // GH: 63 to extract the sign of the register + and(rax, a0); + shl(rax, 4); // * sizeof(m_test[0]) + + vpor(_test, ptr[rax + r10 + 7 * 16]); + } + else + { + mov(rbx, a1); // left + xor(a1, a1); // skip + lea(a0, ptr[a0 - 4]); // steps + } + + // a0 = steps + // a1 = skip + // rbx = left - vpor(xmm15, ptr[rax + r10 + 7 * 16]); // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - mov(rax, (size_t)m_local.gd->fzbr); - lea(rsi, ptr[rax + r8 * 8]); + mov(rax, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fzbr)]); + lea(t1, ptr[rax + a2 * 8]); // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - mov(rax, (size_t)m_local.gd->fzbc); - lea(rdi, ptr[rax + rbx * 2]); + mov(rax, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fzbc)]); + lea(t0, ptr[rax + rbx * 2]); if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { - // edx = &m_local.d[skip] + // a1 = &m_local.d[skip] // note a1 was (skip << 4) - lea(rdx, ptr[rdx * 8 + r11 + offsetof(GSScanlineLocalData, d)]); + lea(a1, ptr[a1 * 8 + _m_local + offsetof(GSScanlineLocalData, d)]); } if(m_sel.prim != 
GS_SPRITE_CLASS) { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { - vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p + vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p if(m_sel.fwrite && m_sel.fge) { // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - vcvttps2dq(xmm9, xmm0); - vpshufhw(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm9, ptr[rdx + 16 * 6]); + vcvttps2dq(_f, xmm0); + vpshufhw(_f, _f, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(_f, _f, _MM_SHUFFLE(2, 2, 2, 2)); + vpaddw(_f, ptr[a1 + 16 * 6]); } if(m_sel.zb) { // z = vp.zzzz() + m_local.d[skip].z; - vshufps(xmm8, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm8, ptr[rdx]); + vshufps(_z, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vaddps(_z, ptr[a1]); } } } @@ -290,28 +348,35 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.ztest) { - vmovdqa(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, p.z)]); + vmovdqa(_z, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]); } + + if(m_sel.fwrite && m_sel.fge) + vmovdqa(_f, ptr[_m_local + offsetof(GSScanlineLocalData, p.f)]); } if(m_sel.fb) { if(m_sel.edge || m_sel.tfx != TFX_NONE) { - vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t + vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t } if(m_sel.edge) { + // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); + vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); vpsrlw(xmm1, 9); - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.cov)], xmm1); + vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)], xmm1); } if(m_sel.tfx != TFX_NONE) { + // a1 = &m_local.d[skip] + if(m_sel.fst) { // GSVector4i vti(vt); @@ -321,23 +386,20 @@ void GSDrawScanlineCodeGenerator::Init() // s = vti.xxxx() + m_local.d[skip].s; // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + 
vpshufd(_s, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(_t, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]); + vpaddd(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) { - vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]); + vpaddd(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); } - else + else if(m_sel.ltf) { - if(m_sel.ltf) - { - vpshuflw(xmm6, xmm11, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); - } + vpshuflw(xmm7, _t, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm7, 12); } } else @@ -346,13 +408,13 @@ void GSDrawScanlineCodeGenerator::Init() // t = vt.yyyy() + m_local.d[skip].t; // q = vt.zzzz() + m_local.d[skip].q; - vshufps(xmm10, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vshufps(_s, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(_t, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(_q, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]); - vaddps(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]); - vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]); + vaddps(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); + vaddps(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); + vaddps(_q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]); } } @@ -362,7 +424,7 @@ void GSDrawScanlineCodeGenerator::Init() { // GSVector4i vc = GSVector4i(v.c); - vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c + vcvttps2dq(xmm0, ptr[a3 + offsetof(GSVertexSW, c)]); // v.c // vc = vc.upl16(vc.zwxy()); @@ -372,30 +434,40 @@ void GSDrawScanlineCodeGenerator::Init() // rb = vc.xxxx().add16(m_local.d[skip].rb); // ga = vc.zzzz().add16(m_local.d[skip].ga); - vpshufd(xmm13, xmm0, 
_MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(_f_rb, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(_f_ga, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm13, ptr[rdx + offsetof(GSScanlineLocalData::skip, rb)]); - vpaddw(xmm14, ptr[rdx + offsetof(GSScanlineLocalData::skip, ga)]); + vpaddw(_f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]); + vpaddw(_f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]); } else { - vmovdqa(xmm13, ptr[r11 + offsetof(GSScanlineLocalData, c.rb)]); - vmovdqa(xmm14, ptr[r11 + offsetof(GSScanlineLocalData, c.ga)]); + vmovdqa(_f_rb, ptr[_m_local + offsetof(GSScanlineLocalData, c.rb)]); + vmovdqa(_f_ga, ptr[_m_local + offsetof(GSScanlineLocalData, c.ga)]); } + + vmovdqa(_rb, _f_rb); + vmovdqa(_ga, _f_ga); } } + + + if(m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) + { + // On linux, a2 is edx which will be used for fzm + mov(a1, a2); + } } void GSDrawScanlineCodeGenerator::Step() { // steps -= 4; - sub(rcx, 4); + sub(a0, 4); // fza_offset++; - add(rdi, 8); + add(t0, 8); if(m_sel.prim != GS_SPRITE_CLASS) { @@ -403,14 +475,14 @@ void GSDrawScanlineCodeGenerator::Step() if(m_sel.zb) { - vaddps(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, d4.z)]); + vaddps(_z, ptr[_m_local + offsetof(GSScanlineLocalData, d4.z)]); } // f = f.add16(m_local.d4.f); if(m_sel.fwrite && m_sel.fge) { - vpaddw(xmm9, ptr[r11 + offsetof(GSScanlineLocalData, d4.f)]); + vpaddw(_f, ptr[_m_local + offsetof(GSScanlineLocalData, d4.f)]); } } else @@ -431,15 +503,15 @@ void GSDrawScanlineCodeGenerator::Step() // si += st.xxxx(); // if(!sprite) ti += st.yyyy(); - vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]); + vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.stq)]); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(xmm10, xmm1); + vpaddd(_s, xmm1); if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) { vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm11, xmm1); + vpaddd(_t, xmm1); } } else @@ 
-450,15 +522,15 @@ void GSDrawScanlineCodeGenerator::Step() // t += stq.yyyy(); // q += stq.zzzz(); - vmovaps(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]); + vmovaps(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.stq)]); vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm10, xmm1); - vaddps(xmm11, xmm2); - vaddps(xmm12, xmm3); + vaddps(_s, xmm1); + vaddps(_t, xmm2); + vaddps(_q, xmm3); } } @@ -471,19 +543,19 @@ void GSDrawScanlineCodeGenerator::Step() // rb = rb.add16(c.xxxx()); // ga = ga.add16(c.yyyy()); - vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.c)]); + vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.c)]); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddw(xmm13, xmm1); - vpaddw(xmm14, xmm2); + vpaddw(_f_rb, xmm1); + vpaddw(_f_ga, xmm2); // FIXME: color may underflow and roll over at the end of the line, if decreasing vpxor(xmm0, xmm0); - vpmaxsw(xmm13, xmm0); - vpmaxsw(xmm14, xmm0); + vpmaxsw(_f_rb, xmm0); + vpmaxsw(_f_ga, xmm0); } else { @@ -491,17 +563,23 @@ void GSDrawScanlineCodeGenerator::Step() { } } + + vmovdqa(_rb, _f_rb); + vmovdqa(_ga, _f_ga); } } - // test = m_test[7 + (steps & (steps >> 31))]; + if(!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; - mov(rdx, rcx); - sar(rdx, 63); - and(rdx, rcx); - shl(rdx, 4); + mov(rax, a0); + sar(rax, 63); // GH: 63 to extract the sign of the register + and(rax, a0); + shl(rax, 4); - vmovdqa(xmm15, ptr[rdx + r10 + 7 * 16]); + vmovdqa(_test, ptr[rax + r10 + 7 * 16]); + } } void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) @@ -513,9 +591,10 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) // int za = fza_base.y + fza_offset->y; - movsxd(rbp, dword[rsi + 4]); - movsxd(rax, dword[rdi + 4]); + movsxd(rbp, dword[t1 + 4]); + movsxd(rax, 
dword[t0 + 4]); add(rbp, rax); + and(rbp, HALF_VM_SIZE - 1); // GSVector4i zs = zi; @@ -524,15 +603,15 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.zoverflow) { // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - + mov(rax, (size_t)&GSVector4::m_half); vbroadcastss(xmm0, ptr[rax]); - vmulps(xmm0, xmm8); + vmulps(xmm0, _z); vcvttps2dq(xmm0, xmm0); vpslld(xmm0, 1); - vcvttps2dq(xmm1, xmm8); + vcvttps2dq(xmm1, _z); vpcmpeqd(xmm2, xmm2); vpsrld(xmm2, 31); vpand(xmm1, xmm2); @@ -543,14 +622,18 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) { // zs = GSVector4i(z); - vcvttps2dq(xmm0, xmm8); + vcvttps2dq(xmm0, _z); } if(m_sel.zwrite) { - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)], xmm0); + vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)], xmm0); } } + else + { + movdqa(xmm0, _z); + } if(m_sel.ztest) { @@ -558,7 +641,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.zwrite && m_sel.zpsm < 2) { - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm1); + vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], xmm1); } // zd &= 0xffffffff >> m_sel.zpsm * 8; @@ -588,7 +671,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) case ZTST_GEQUAL: // test |= zso < zdo; // ~(zso >= zdo) vpcmpgtd(xmm1, xmm0); - vpor(xmm15, xmm1); + vpor(_test, xmm1); break; case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL @@ -596,7 +679,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) vpcmpgtd(xmm0, xmm1); vpcmpeqd(xmm2, xmm2); vpxor(xmm0, xmm2); - vpor(xmm15, xmm0); + vpor(_test, xmm0); break; } @@ -611,16 +694,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() return; } - mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); - - // ebx = tex - if(!m_sel.fst) { - vrcpps(xmm0, 
xmm12); + vrcpps(xmm0, _q); - vmulps(xmm4, xmm10, xmm0); - vmulps(xmm5, xmm11, xmm0); + vmulps(xmm4, _s, xmm0); + vmulps(xmm5, _t, xmm0); vcvttps2dq(xmm4, xmm4); vcvttps2dq(xmm5, xmm5); @@ -640,25 +719,25 @@ void GSDrawScanlineCodeGenerator::SampleTexture() } else { - vmovdqa(xmm4, xmm10); - vmovdqa(xmm5, xmm11); + vmovdqa(xmm4, _s); + vmovdqa(xmm5, _t); } if(m_sel.ltf) { - // GSVector4i uf = u.xxzzlh().srl16(1); + // GSVector4i uf = u.xxzzlh().srl16(12); vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); + vpsrlw(xmm6, 12); if(m_sel.prim != GS_SPRITE_CLASS) { - // GSVector4i vf = v.xxzzlh().srl16(1); + // GSVector4i vf = v.xxzzlh().srl16(12); vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm7, 1); + vpsrlw(xmm7, 12); } } @@ -746,7 +825,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - + ReadTexel(4, 0); // xmm0 = c00 @@ -779,11 +858,11 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm6 = uf // xmm7 = vf - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm0, xmm4, xmm6, 0); - lerp16(xmm1, xmm5, xmm6, 0); + lerp16_4(xmm0, xmm4, xmm6); + lerp16_4(xmm1, xmm5, xmm6); // xmm0 = rb00 // xmm1 = ga00 @@ -796,15 +875,15 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i ga10 = (c10 >> 8) & mask; vpsrlw(xmm5, xmm2, 8); - vpsllw(xmm2, 8); - vpsrlw(xmm4, xmm2, 8); + vpsllw(xmm4, xmm2, 8); + vpsrlw(xmm4, 8); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm2, xmm3, 8); - vpsllw(xmm3, 8); + vpsllw(xmm2, xmm3, 8); vpsrlw(xmm3, 8); + vpsrlw(xmm2, 8); // xmm0 = rb00 // xmm1 = ga00 @@ -815,11 +894,11 @@ 
void GSDrawScanlineCodeGenerator::SampleTexture() // xmm6 = uf // xmm7 = vf - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm2, xmm4, xmm6, 0); - lerp16(xmm3, xmm5, xmm6, 0); + lerp16_4(xmm2, xmm4, xmm6); + lerp16_4(xmm3, xmm5, xmm6); // xmm0 = rb00 // xmm1 = ga00 @@ -827,17 +906,17 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm3 = ga10 // xmm7 = vf - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); - lerp16(xmm2, xmm0, xmm7, 0); - lerp16(xmm3, xmm1, xmm7, 0); + lerp16_4(xmm2, xmm0, xmm7); + lerp16_4(xmm3, xmm1, xmm7); } else { // GSVector4i addr00 = y0 + x0; - vpaddd(xmm3, xmm2); + vpaddd(xmm0, xmm3, xmm2); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); @@ -848,9 +927,9 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsrlw(xmm3, xmm2, 8); - vpsllw(xmm2, 8); - vpsrlw(xmm2, 8); + vpsllw(_rb, xmm0, 8); + vpsrlw(_rb, 8); + vpsrlw(_ga, xmm0, 8); } // xmm2 = rb @@ -872,7 +951,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) { if(region) { - vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vpmaxsw(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); } else { @@ -880,23 +959,23 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) vpmaxsw(uv, xmm0); } - vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vpminsw(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); } else { - vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vpand(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); if(region) { - vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vpor(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); } } } else { - vmovdqa(xmm2, ptr[r12 + 
offsetof(GSScanlineGlobalData, t.min)]); - vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); + vmovdqa(xmm2, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.mask)]); // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; @@ -933,7 +1012,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) { if(region) { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); vpmaxsw(uv0, xmm0); vpmaxsw(uv1, xmm0); } @@ -944,19 +1023,19 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) vpmaxsw(uv1, xmm0); } - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); vpminsw(uv0, xmm0); vpminsw(uv1, xmm0); } else { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); vpand(uv0, xmm0); vpand(uv1, xmm0); if(region) { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); vpor(uv0, xmm0); vpor(uv1, xmm0); } @@ -964,9 +1043,9 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } else { - vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); + vmovdqa(xmm2, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.mask)]); // uv0 @@ -1035,17 
+1114,17 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() // gat = gat.modulate16<1>(ga).clamp8(); - modulate16(xmm3, xmm14, 1); + modulate16(_ga, _f_ga, 1); - clamp16(xmm3, xmm0); + clamp16(_ga, xmm0); // if(!tcc) gat = gat.mix16(ga.srl16(7)); if(!m_sel.tcc) { - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); } break; @@ -1056,9 +1135,9 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); } break; @@ -1067,14 +1146,14 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - if(m_sel.tcc) + if(m_sel.tcc) { - vpaddusb(xmm1, xmm3); + vpaddusb(xmm1, _ga); } - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); break; @@ -1084,9 +1163,9 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); } break; @@ -1097,25 +1176,71 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(m_sel.iip) { - vpsrlw(xmm3, xmm14, 7); + vpsrlw(_ga, _f_ga, 7); } break; } - // TODO: aa1 + if(m_sel.aa1) + { + // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha + + // FIXME: bios config screen cubes + + if(!m_sel.abe) + { + // a = cov + + if(m_sel.edge) + { + vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]); + } + else + { + vpcmpeqd(xmm0, xmm0); + vpsllw(xmm0, 15); + vpsrlw(xmm0, 8); + } + + mix16(_ga, xmm0, xmm1); + } + else + { + // a = a == 0x80 ? 
cov : a + + vpcmpeqd(xmm0, xmm0); + vpsllw(xmm0, 15); + vpsrlw(xmm0, 8); + + if(m_sel.edge) + { + vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]); + } + else + { + vmovdqa(xmm1, xmm0); + } + + vpcmpeqw(xmm0, _ga); + vpsrld(xmm0, 16); + vpslld(xmm0, 16); + + vpblendvb(_ga, xmm1, xmm0); + } + } } void GSDrawScanlineCodeGenerator::ReadMask() { if(m_sel.fwrite) { - vmovdqa(xmm4, ptr[r12 + offsetof(GSScanlineGlobalData, fm)]); + vmovdqa(_fm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fm)]); } if(m_sel.zwrite) { - vmovdqa(xmm5, ptr[r12 + offsetof(GSScanlineGlobalData, zm)]); + vmovdqa(_zm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, zm)]); } } @@ -1134,14 +1259,14 @@ void GSDrawScanlineCodeGenerator::TestAlpha() case ATST_LESS: case ATST_LEQUAL: // t = (ga >> 16) > m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpgtd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm1, _ga, 16); + vpcmpgtd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); break; case ATST_EQUAL: // t = (ga >> 16) != m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm1, _ga, 16); + vpcmpeqd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); vpcmpeqd(xmm0, xmm0); vpxor(xmm1, xmm0); break; @@ -1149,15 +1274,15 @@ void GSDrawScanlineCodeGenerator::TestAlpha() case ATST_GEQUAL: case ATST_GREATER: // t = (ga >> 16) < m_local.gd->aref; - vpsrld(xmm0, xmm3, 16); - vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm0, _ga, 16); + vmovdqa(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); vpcmpgtd(xmm1, xmm0); break; case ATST_NOTEQUAL: // t = (ga >> 16) == m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm1, _ga, 16); + vpcmpeqd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); break; } @@ -1165,27 +1290,27 @@ void 
GSDrawScanlineCodeGenerator::TestAlpha() { case AFAIL_KEEP: // test |= t; - vpor(xmm15, xmm1); + vpor(_test, xmm1); alltrue(); break; case AFAIL_FB_ONLY: // zm |= t; - vpor(xmm5, xmm1); + vpor(_zm, xmm1); break; case AFAIL_ZB_ONLY: // fm |= t; - vpor(xmm4, xmm1); + vpor(_fm, xmm1); break; case AFAIL_RGB_ONLY: // zm |= t; - vpor(xmm5, xmm1); + vpor(_zm, xmm1); // fm |= t & GSVector4i::xff000000(); vpsrld(xmm1, 24); vpslld(xmm1, 24); - vpor(xmm4, xmm1); + vpor(_fm, xmm1); break; } } @@ -1203,9 +1328,9 @@ void GSDrawScanlineCodeGenerator::ColorTFX() // rbt = rbt.modulate16<1>(rb).clamp8(); - modulate16(xmm2, xmm13, 1); + modulate16(_rb, _f_rb, 1); - clamp16(xmm2, xmm0); + clamp16(_rb, xmm0); break; @@ -1218,27 +1343,27 @@ void GSDrawScanlineCodeGenerator::ColorTFX() // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - vmovdqa(xmm1, xmm3); + vmovdqa(xmm1, _ga); - modulate16(xmm3, xmm14, 1); + modulate16(_ga, _f_ga, 1); - vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1)); + vpshuflw(xmm6, _f_ga, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); vpsrlw(xmm6, 7); - vpaddw(xmm3, xmm6); + vpaddw(_ga, xmm6); - clamp16(xmm3, xmm0); - - mix16(xmm3, xmm1, xmm0); + clamp16(_ga, xmm0); + + mix16(_ga, xmm1, xmm0); // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - modulate16(xmm2, xmm13, 1); + modulate16(_rb, _f_rb, 1); - vpaddw(xmm2, xmm6); - - clamp16(xmm2, xmm0); + vpaddw(_rb, xmm6); + + clamp16(_rb, xmm0); break; @@ -1248,7 +1373,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() if(m_sel.iip) { - vpsrlw(xmm2, xmm13, 7); + vpsrlw(_rb, _f_rb, 7); } break; @@ -1265,15 +1390,15 @@ void GSDrawScanlineCodeGenerator::Fog() // rb = m_local.gd->frb.lerp16<0>(rb, f); // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, frb)]); - vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, fga)]); + vmovdqa(xmm6, _ga); - vmovdqa(xmm6, xmm3); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, 
frb)]); + vmovdqa(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fga)]); - lerp16(xmm2, xmm0, xmm9, 0); - lerp16(xmm3, xmm1, xmm9, 0); + lerp16(_rb, xmm0, _f, 0); + lerp16(_ga, xmm1, _f, 0); - mix16(xmm3, xmm6, xmm9); + mix16(_ga, xmm6, _f); } void GSDrawScanlineCodeGenerator::ReadFrame() @@ -1285,16 +1410,17 @@ void GSDrawScanlineCodeGenerator::ReadFrame() // int fa = fza_base.x + fza_offset->x; - mov(ebx, dword[rsi]); - add(ebx, dword[rdi]); - movsxd(rbx, ebx); + mov(ebx, dword[t1]); + add(ebx, dword[t0]); + and(ebx, HALF_VM_SIZE - 1); + movsxd(rbx, ebx); // FIXME useful ? if(!m_sel.rfb) { return; } - ReadPixel(xmm6, rbx); + ReadPixel(_fd, rbx); } void GSDrawScanlineCodeGenerator::TestDestAlpha() @@ -1311,15 +1437,15 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() if(m_sel.fpsm == 2) { vpxor(xmm0, xmm0); - //vpsrld(xmm1, xmm6, 15); - vpslld(xmm1, xmm6, 16); + //vpsrld(xmm1, _fd, 15); + vpslld(xmm1, _fd, 16); vpsrad(xmm1, 31); vpcmpeqd(xmm1, xmm0); } else { vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm6, xmm0); + vpxor(xmm1, _fd, xmm0); vpsrad(xmm1, 31); } } @@ -1327,33 +1453,38 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() { if(m_sel.fpsm == 2) { - vpslld(xmm1, xmm6, 16); + vpslld(xmm1, _fd, 16); vpsrad(xmm1, 31); } else { - vpsrad(xmm1, xmm6, 31); + vpsrad(xmm1, _fd, 31); } } - vpor(xmm15, xmm1); + vpor(_test, xmm1); alltrue(); } void GSDrawScanlineCodeGenerator::WriteMask() { + if(m_sel.notest) + { + return; + } + // fm |= test; // zm |= test; if(m_sel.fwrite) { - vpor(xmm4, xmm15); + vpor(_fm, _test); } if(m_sel.zwrite) { - vpor(xmm5, xmm15); + vpor(_zm, _test); } // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); @@ -1362,18 +1493,18 @@ void GSDrawScanlineCodeGenerator::WriteMask() if(m_sel.fwrite && m_sel.zwrite) { - vpcmpeqd(xmm0, xmm1, xmm5); - vpcmpeqd(xmm1, xmm4); + vpcmpeqd(xmm0, xmm1, _zm); + vpcmpeqd(xmm1, _fm); vpackssdw(xmm1, xmm0); } else if(m_sel.fwrite) { - vpcmpeqd(xmm1, xmm4); + 
vpcmpeqd(xmm1, _fm); vpackssdw(xmm1, xmm1); } else if(m_sel.zwrite) { - vpcmpeqd(xmm1, xmm5); + vpcmpeqd(xmm1, _zm); vpackssdw(xmm1, xmm1); } @@ -1389,17 +1520,20 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() return; } - bool fast = m_sel.ztest && m_sel.zpsm < 2; + if (m_sel.prim != GS_SPRITE_CLASS) + vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)]); + else + vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]); - vmovdqa(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)]); - - if(fast) + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); - vpblendvb(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm4); + vpblendvb(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], _zm); } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1); } @@ -1415,6 +1549,9 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() return; } + const Xmm& _dst_rb = xmm0; + const Xmm& _dst_ga = xmm1; + if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) { switch(m_sel.fpsm) @@ -1425,9 +1562,9 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - vpsllw(xmm0, xmm6, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, xmm6, 8); + vpsllw(_dst_rb, _fd, 8); + vpsrlw(_dst_rb, 8); + vpsrlw(_dst_ga, _fd, 8); break; @@ -1439,24 +1576,24 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() vpcmpeqd(xmm15, xmm15); vpsrld(xmm15, 27); // 0x0000001f - vpand(xmm0, xmm6, xmm15); - vpslld(xmm0, 3); + vpand(_dst_rb, _fd, xmm15); + vpslld(_dst_rb, 3); vpslld(xmm15, 10); // 0x00007c00 - vpand(xmm5, xmm6, xmm15); + vpand(xmm5, _fd, xmm15); vpslld(xmm5, 9); - vpor(xmm0, xmm1); + vpor(_dst_rb, xmm5); vpsrld(xmm15, 5); // 0x000003e0 - vpand(xmm1, xmm6, xmm15); - vpsrld(xmm1, 2); + vpand(_dst_ga, _fd, xmm15); + vpsrld(_dst_ga, 2); vpsllw(xmm15, 10); // 0x00008000 - vpand(xmm5, xmm6, xmm15); + 
vpand(xmm5, _fd, xmm15); vpslld(xmm5, 8); - vpor(xmm1, xmm5); + vpor(_dst_ga, xmm5); break; } @@ -1468,7 +1605,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) { - vmovdqa(xmm5, xmm2); + vmovdqa(xmm5, _rb); } if(m_sel.aba != m_sel.abb) @@ -1478,16 +1615,16 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.aba) { case 0: break; - case 1: vmovdqa(xmm2, xmm0); break; - case 2: vpxor(xmm2, xmm2); break; + case 1: vmovdqa(_rb, _dst_rb); break; + case 2: vpxor(_rb, _rb); break; } // rb = rb.sub16(c[abb * 2 + 0]); switch(m_sel.abb) { - case 0: vpsubw(xmm2, xmm5); break; - case 1: vpsubw(xmm2, xmm0); break; + case 0: vpsubw(_rb, xmm5); break; + case 1: vpsubw(_rb, _dst_rb); break; case 2: break; } @@ -1499,26 +1636,26 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() { case 0: case 1: - vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1)); + vpshuflw(xmm15, m_sel.abc ? _dst_ga : _ga, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); vpsllw(xmm15, 7); break; case 2: - vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]); + vmovdqa(xmm15, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, afix)]); break; } // rb = rb.modulate16<1>(a); - modulate16(xmm2, xmm15, 1); + modulate16(_rb, xmm15, 1); } // rb = rb.add16(c[abd * 2 + 0]); switch(m_sel.abd) { - case 0: vpaddw(xmm2, xmm5); break; - case 1: vpaddw(xmm2, xmm0); break; + case 0: vpaddw(_rb, xmm5); break; + case 1: vpaddw(_rb, _dst_rb); break; case 2: break; } } @@ -1529,8 +1666,8 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.abd) { case 0: break; - case 1: vmovdqa(xmm2, xmm0); break; - case 2: vpxor(xmm2, xmm2); break; + case 1: vmovdqa(_rb, _dst_rb); break; + case 2: vpxor(_rb, _rb); break; } } @@ -1538,12 +1675,12 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() { // mask = (c[1] << 8).sra32(31); - vpslld(xmm0, xmm3, 8); + vpslld(xmm0, _ga, 8); vpsrad(xmm0, 31); 
// rb = c[0].blend8(rb, mask); - vpblendvb(xmm2, xmm5, xmm2, xmm0); + vpblendvb(_rb, xmm5, _rb, xmm0); } // xmm0 = pabe mask @@ -1553,7 +1690,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // xmm15 = a // xmm5 = free - vmovdqa(xmm5, xmm3); + vmovdqa(xmm5, _ga); if(m_sel.aba != m_sel.abb) { @@ -1562,16 +1699,16 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.aba) { case 0: break; - case 1: vmovdqa(xmm3, xmm1); break; - case 2: vpxor(xmm3, xmm3); break; + case 1: vmovdqa(_ga, _dst_ga); break; + case 2: vpxor(_ga, _ga); break; } // ga = ga.sub16(c[abeb * 2 + 1]); switch(m_sel.abb) { - case 0: vpsubw(xmm3, xmm5); break; - case 1: vpsubw(xmm3, xmm1); break; + case 0: vpsubw(_ga, xmm5); break; + case 1: vpsubw(_ga, _dst_ga); break; case 2: break; } @@ -1579,15 +1716,15 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() { // ga = ga.modulate16<1>(a); - modulate16(xmm3, xmm15, 1); + modulate16(_ga, xmm15, 1); } // ga = ga.add16(c[abd * 2 + 1]); switch(m_sel.abd) { - case 0: vpaddw(xmm3, xmm5); break; - case 1: vpaddw(xmm3, xmm1); break; + case 0: vpaddw(_ga, xmm5); break; + case 1: vpaddw(_ga, _dst_ga); break; case 2: break; } } @@ -1598,8 +1735,8 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.abd) { case 0: break; - case 1: vmovdqa(xmm3, xmm1); break; - case 2: vpxor(xmm3, xmm3); break; + case 1: vmovdqa(_ga, _dst_ga); break; + case 2: vpxor(_ga, _ga); break; } } @@ -1615,13 +1752,13 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // ga = c[1].blend8(ga, mask).mix16(c[1]); - vpblendvb(xmm3, xmm5, xmm3, xmm0); + vpblendvb(_ga, xmm5, _ga, xmm0); } else { if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx { - mix16(xmm3, xmm5, xmm15); + mix16(_ga, xmm5, xmm15); } } } @@ -1633,6 +1770,23 @@ void GSDrawScanlineCodeGenerator::WriteFrame() return; } + if(m_sel.fpsm == 2 && m_sel.dthe) + { + mov(a3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, dimx)]); + + // y = (top & 3) << 5 + + mov(rax, a1); + and(rax, 3); + shl(rax, 5); + + // rb = 
rb.add16(m_global.dimx[0 + y]); + // ga = ga.add16(m_global.dimx[1 + y]); + + vpaddw(xmm2, ptr[a3 + rax + sizeof(GSVector4i) * 0]); + vpaddw(xmm3, ptr[a3 + rax + sizeof(GSVector4i) * 1]); + } + if(m_sel.colclamp == 0) { // c[0] &= 0x00ff00ff; @@ -1644,15 +1798,6 @@ void GSDrawScanlineCodeGenerator::WriteFrame() vpand(xmm3, xmm15); } - if(m_sel.fpsm == 2 && m_sel.dthe) - { - mov(rax, r8); - and(rax, 3); - shl(rax, 5); - vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx) + sizeof(GSVector4i) * 0]); - vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx) + sizeof(GSVector4i) * 1]); - } - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); vpunpckhwd(xmm15, xmm2, xmm3); @@ -1704,65 +1849,83 @@ void GSDrawScanlineCodeGenerator::WriteFrame() { // fs = fs.blend(fd, fm); - blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm + blend(xmm2, _fd, _fm); // TODO: could be skipped in certain cases, depending on fpsm and fm } - bool fast = m_sel.rfb && m_sel.fpsm < 2; + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); } void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) { - vmovq(dst, qword[r13 + addr * 2]); - vmovhps(dst, qword[r13 + addr * 2 + 8 * 2]); + vmovq(dst, qword[_m_local__gd__vm + addr * 2]); + vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]); } void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) { - if(fast) + if(m_sel.notest) { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[r13 + addr * 2], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[r13 + addr * 2 + 8 * 2], src); - L("@@"); - - // vmaskmovps? 
+ if(fast) + { + vmovq(qword[_m_local__gd__vm + addr * 2], src); + vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src); + } + else + { + WritePixel(src, addr, 0, psm); + WritePixel(src, addr, 1, psm); + WritePixel(src, addr, 2, psm); + WritePixel(src, addr, 3, psm); + } } else { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); + test(mask, 0x0f); + je("@f"); + vmovq(qword[_m_local__gd__vm + addr * 2], src); + L("@@"); - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); + test(mask, 0xf0); + je("@f"); + vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src); + L("@@"); - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); + // vmaskmovps? 
+ } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } } } @@ -1770,7 +1933,7 @@ static const int s_offsets[4] = {0, 2, 8, 10}; void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) { - Address dst = ptr[r13 + addr * 2 + s_offsets[i] * 2]; + Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2]; switch(psm) { @@ -1795,19 +1958,30 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) { // TODO + const int r[] = {0, 0, 1, 1, 2, 2, 3, 3}; + + for(int i = 0; i < pixels; i++) + { + for(int j = 0; j < 4; j++) + { + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + } + } } void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) { - const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4]; + const Address& src = m_sel.tlu ? 
ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4]; - if(i == 0) vmovd(eax, addr); - else vpextrd(eax, addr, i); + //if(i == 0) vmovd(eax, addr); + //else vpextrd(eax, addr, i); + vpextrd(eax, addr, i); - if(m_sel.tlu) movzx(rax, byte[rbx + rax]); + if(m_sel.tlu) movzx(rax, byte[_m_local__gd__tex + rax]); - if(i == 0) vmovd(dst, src); - else vpinsrd(dst, src, i); + //if(i == 0) vmovd(dst, src); + //else vpinsrd(dst, src, i); + vpinsrd(dst, src, i); } #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp index 40631c1c9e..f78b47da94 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp @@ -24,8 +24,11 @@ #if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +// It is useless to port the code to SSEx, better use the faster 32 bits version instead void GSDrawScanlineCodeGenerator::Generate() { + // Avoid a crash if someone want to use it + ret(); } void GSDrawScanlineCodeGenerator::Init() diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 250432ff2b..02d63f4a0f 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -740,7 +740,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() if(m_sel.ltf) { - // GSVector4i uf = u.xxzzlh().srl16(1); + // GSVector4i uf = u.xxzzlh().srl16(12); vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); @@ -749,7 +749,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() if(m_sel.prim != GS_SPRITE_CLASS) { - // GSVector4i vf = v.xxzzlh().srl16(1); + // GSVector4i vf = v.xxzzlh().srl16(12); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); From 141c9e9c86e3ff0186dc8cf473107e9f75b436ad Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 
16:48:25 +0100 Subject: [PATCH 09/20] gsdx sw x64: prefer faster 32 bits operation when possible --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index f232ee961a..fd85fc5cdd 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -72,8 +72,10 @@ void GSDrawScanlineCodeGenerator::Generate() mov(ptr[rsp - 1 * 8], rbx); mov(ptr[rsp - 2 * 8], r12); mov(ptr[rsp - 3 * 8], r13); - mov(ptr[rsp - 4 * 8], r14); - mov(ptr[rsp - 5 * 8], r15); + if(need_clut) + mov(ptr[rsp - 4 * 8], r14); + if(need_tex) + mov(ptr[rsp - 5 * 8], r15); #endif mov(r10, (size_t)&m_test[0]); @@ -81,9 +83,10 @@ void GSDrawScanlineCodeGenerator::Generate() mov(_m_local__gd, ptr[_m_local + offsetof(GSScanlineLocalData, gd)]); mov(_m_local__gd__vm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, vm)]); - // FIXME: those 2 load could be optimized when no texture - mov(_m_local__gd__clut, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, clut)]); - mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]); + if(need_clut) + mov(_m_local__gd__clut, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, clut)]); + if(need_tex) + mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]); Init(); @@ -252,8 +255,10 @@ L("exit"); mov(rbx, ptr[rsp - 1 * 8]); mov(r12, ptr[rsp - 2 * 8]); mov(r13, ptr[rsp - 3 * 8]); - mov(r14, ptr[rsp - 4 * 8]); - mov(r15, ptr[rsp - 5 * 8]); + if(need_clut) + mov(r14, ptr[rsp - 4 * 8]); + if(need_tex) + mov(r15, ptr[rsp - 5 * 8]); pop(rbp); #endif @@ -266,12 +271,12 @@ void GSDrawScanlineCodeGenerator::Init() { // int skip = left & 3; - mov(rbx, a1); - and(a1, 3); + mov(ebx, a1.cvt32()); + and(a1.cvt32(), 3); // left -= skip; - sub(rbx, a1); + sub(ebx, a1.cvt32()); // int steps = 
pixels + skip - 4; @@ -279,7 +284,7 @@ void GSDrawScanlineCodeGenerator::Init() // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(a1, 4); // * sizeof(m_test[0]) + shl(a1.cvt32(), 4); // * sizeof(m_test[0]) vmovdqa(_test, ptr[a1 + r10]); @@ -292,8 +297,8 @@ void GSDrawScanlineCodeGenerator::Init() } else { - mov(rbx, a1); // left - xor(a1, a1); // skip + mov(ebx, a1.cvt32()); // left + xor(a1.cvt32(), a1.cvt32()); // skip lea(a0, ptr[a0 - 4]); // steps } @@ -591,10 +596,9 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) // int za = fza_base.y + fza_offset->y; - movsxd(rbp, dword[t1 + 4]); - movsxd(rax, dword[t0 + 4]); - add(rbp, rax); - and(rbp, HALF_VM_SIZE - 1); + mov(ebp, dword[t1 + 4]); + add(ebp, dword[t0 + 4]); + and(ebp, HALF_VM_SIZE - 1); // GSVector4i zs = zi; @@ -1413,7 +1417,6 @@ void GSDrawScanlineCodeGenerator::ReadFrame() mov(ebx, dword[t1]); add(ebx, dword[t0]); and(ebx, HALF_VM_SIZE - 1); - movsxd(rbx, ebx); // FIXME useful ? 
if(!m_sel.rfb) { @@ -1776,9 +1779,9 @@ void GSDrawScanlineCodeGenerator::WriteFrame() // y = (top & 3) << 5 - mov(rax, a1); - and(rax, 3); - shl(rax, 5); + mov(eax, a1.cvt32()); + and(eax, 3); + shl(eax, 5); // rb = rb.add16(m_global.dimx[0 + y]); // ga = ga.add16(m_global.dimx[1 + y]); @@ -1977,7 +1980,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin //else vpextrd(eax, addr, i); vpextrd(eax, addr, i); - if(m_sel.tlu) movzx(rax, byte[_m_local__gd__tex + rax]); + if(m_sel.tlu) movzx(eax, byte[_m_local__gd__tex + rax]); //if(i == 0) vmovd(dst, src); //else vpinsrd(dst, src, i); From 051c5c4bf709eace9a337855a6618a7bf44ffa33 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 17:05:14 +0100 Subject: [PATCH 10/20] gsdx sw x64: small stack optimization on linux mov with the stack pointer require less bytecode --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 60 +++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index fd85fc5cdd..38027cb719 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -47,6 +47,18 @@ #if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#ifdef _WIN64 +#else +static const int _rz_rbx = -8 * 1; +static const int _rz_r12 = -8 * 2; +static const int _rz_r13 = -8 * 3; +static const int _rz_r14 = -8 * 4; +static const int _rz_r15 = -8 * 5; +static const int _rz_zs = -8 * 8; +static const int _rz_zd = -8 * 10; +static const int _rz_cov = -8 * 12; +#endif + void GSDrawScanlineCodeGenerator::Generate() { bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; @@ -69,13 +81,13 @@ void GSDrawScanlineCodeGenerator::Generate() #else // No reservation on the stack as a red zone is available push(rbp); - mov(ptr[rsp - 1 * 8], rbx); - mov(ptr[rsp - 2 * 8], r12); - mov(ptr[rsp - 3 * 8], r13); + 
mov(ptr[rsp + _rz_rbx], rbx); + mov(ptr[rsp + _rz_r12], r12); + mov(ptr[rsp + _rz_r13], r13); if(need_clut) - mov(ptr[rsp - 4 * 8], r14); + mov(ptr[rsp + _rz_r14], r14); if(need_tex) - mov(ptr[rsp - 5 * 8], r15); + mov(ptr[rsp + _rz_r15], r15); #endif mov(r10, (size_t)&m_test[0]); @@ -252,13 +264,13 @@ L("exit"); pop(rsi); pop(rbx); #else - mov(rbx, ptr[rsp - 1 * 8]); - mov(r12, ptr[rsp - 2 * 8]); - mov(r13, ptr[rsp - 3 * 8]); + mov(rbx, ptr[rsp + _rz_rbx]); + mov(r12, ptr[rsp + _rz_r12]); + mov(r13, ptr[rsp + _rz_r13]); if(need_clut) - mov(r14, ptr[rsp - 4 * 8]); + mov(r14, ptr[rsp + _rz_r14]); if(need_tex) - mov(r15, ptr[rsp - 5 * 8]); + mov(r15, ptr[rsp + _rz_r15]); pop(rbp); #endif @@ -375,7 +387,11 @@ void GSDrawScanlineCodeGenerator::Init() vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); vpsrlw(xmm1, 9); +#ifdef _WIN64 vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)], xmm1); +#else + vmovdqa(ptr[rsp + _rz_cov], xmm1); +#endif } if(m_sel.tfx != TFX_NONE) @@ -631,7 +647,11 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.zwrite) { +#ifdef _WIN64 vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)], xmm0); +#else + vmovdqa(ptr[rsp + _rz_zs], xmm0); +#endif } } else @@ -645,7 +665,11 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.zwrite && m_sel.zpsm < 2) { +#ifdef _WIN64 vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], xmm1); +#else + vmovdqa(ptr[rsp + _rz_zd], xmm1); +#endif } // zd &= 0xffffffff >> m_sel.zpsm * 8; @@ -1198,7 +1222,11 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(m_sel.edge) { +#ifdef _WIN64 vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]); +#else + vmovdqa(xmm0, ptr[rsp + _rz_cov]); +#endif } else { @@ -1219,7 +1247,11 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(m_sel.edge) { +#ifdef _WIN64 vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]); +#else + vmovdqa(xmm1, ptr[rsp + 
_rz_cov]); +#endif } else { @@ -1524,7 +1556,11 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() } if (m_sel.prim != GS_SPRITE_CLASS) +#ifdef _WIN64 vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)]); +#else + vmovdqa(xmm1, ptr[rsp + _rz_zs]); +#endif else vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]); @@ -1532,7 +1568,11 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() { // zs = zs.blend8(zd, zm); +#ifdef _WIN64 vpblendvb(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], _zm); +#else + vpblendvb(xmm1, ptr[rsp + _rz_zd], _zm); +#endif } bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; From 8abf242e2c203402cd88e4b4c57d4f137f032c69 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 18:16:28 +0100 Subject: [PATCH 11/20] gsdx: small x64 printf warning fixes --- plugins/GSdx/GSLzma.cpp | 2 +- plugins/GSdx/stdafx.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/GSdx/GSLzma.cpp b/plugins/GSdx/GSLzma.cpp index 5e29a0bc79..d99a94ff65 100644 --- a/plugins/GSdx/GSLzma.cpp +++ b/plugins/GSdx/GSLzma.cpp @@ -173,7 +173,7 @@ void GSDumpRaw::Read(void* ptr, size_t size) { } else { size_t ret = fread(ptr, 1, size, m_fp); if (ret != size) { - fprintf(stderr, "GSDumpRaw:: Read error (%d/%d)\n", ret, size); + fprintf(stderr, "GSDumpRaw:: Read error (%zu/%zu)\n", ret, size); throw "BAD"; // Just exit the program } } diff --git a/plugins/GSdx/stdafx.cpp b/plugins/GSdx/stdafx.cpp index abb479d3d6..52d63aadab 100644 --- a/plugins/GSdx/stdafx.cpp +++ b/plugins/GSdx/stdafx.cpp @@ -138,7 +138,7 @@ void* fifo_alloc(size_t size, size_t repeat) if (next != base) fprintf(stderr, "Fail to mmap contiguous segment\n"); else - fprintf(stderr, "MMAP next %x\n", (uintptr_t)base); + fprintf(stderr, "MMAP next %p\n", base); } return fifo; From d58e43edbfe1f0a1f21dc029edcfa7d9a44b779e Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 22:40:52 +0100 Subject: 
[PATCH 12/20] gsdx linux: plug vtune as Windows --- build.sh | 1 + cmake/BuildParameters.cmake | 2 ++ plugins/GSdx/CMakeLists.txt | 15 +++++++++------ plugins/GSdx/stdafx.h | 6 +++--- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/build.sh b/build.sh index 43280d586f..23a758223b 100755 --- a/build.sh +++ b/build.sh @@ -80,6 +80,7 @@ for ARG in "$@"; do --cross-multilib ) flags="$flags -DCMAKE_TOOLCHAIN_FILE=$toolfile"; useCross=1; ;; --no-cross-multilib ) useCross=0; ;; --coverity ) CoverityBuild=1; cleanBuild=1; ;; + --vtune ) flags="$flags -DUSE_VTUNE=TRUE" ;; -D* ) flags="$flags $ARG" ;; *) diff --git a/cmake/BuildParameters.cmake b/cmake/BuildParameters.cmake index 8f03540bb6..05895c5129 100644 --- a/cmake/BuildParameters.cmake +++ b/cmake/BuildParameters.cmake @@ -28,6 +28,8 @@ if(DISABLE_BUILD_DATE OR openSUSE) add_definitions(-DDISABLE_BUILD_DATE) endif() +option(USE_VTUNE "Plug VTUNE to profile GSdx JIT.") + #------------------------------------------------------------------------------- # Graphical option #------------------------------------------------------------------------------- diff --git a/plugins/GSdx/CMakeLists.txt b/plugins/GSdx/CMakeLists.txt index 15c93e0794..09c39f5344 100644 --- a/plugins/GSdx/CMakeLists.txt +++ b/plugins/GSdx/CMakeLists.txt @@ -195,15 +195,18 @@ set(GSdxFinalLibs ) if(EGL_API AND EGL_FOUND) - set(GSdxFinalLibs ${GSdxFinalLibs} - ${EGL_LIBRARIES} - ) + set(GSdxFinalLibs ${GSdxFinalLibs} ${EGL_LIBRARIES}) endif() if(LIBLZMA_FOUND) - set(GSdxFinalLibs ${GSdxFinalLibs} - ${LIBLZMA_LIBRARIES} - ) + set(GSdxFinalLibs ${GSdxFinalLibs} ${LIBLZMA_LIBRARIES}) +endif() + +if(USE_VTUNE) + set(GSdxFinalFlags ${GSdxFinalFlags} -DENABLE_VTUNE) + include_directories("$ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/include") + set(GSdxFinalLibs ${GSdxFinalLibs} $ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/lib64/libjitprofiling.a) + set(GSdxFinalLibs ${GSdxFinalLibs} $ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/lib32/libjitprofiling.a) endif() # Generate Glsl 
header file. Protect with REBUILD_SHADER to avoid build-dependency on PERL diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index b6f466afc7..9aebd6f332 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -444,11 +444,11 @@ extern void vmfree(void* ptr, size_t size); extern void* fifo_alloc(size_t size, size_t repeat); extern void fifo_free(void* ptr, size_t size, size_t repeat); -#ifdef _WIN32 +#ifdef ENABLE_VTUNE - #ifdef ENABLE_VTUNE + #include "jitprofiling.h" - #include + #ifdef _WIN32 #pragma comment(lib, "jitprofiling.lib") From e728a14c19eb55ea867cbf27d5985f97ed130a9e Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 18 Nov 2016 22:48:06 +0100 Subject: [PATCH 13/20] gsdx sw: factorize color split in split16_2x8 --- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 32 ++++++++ plugins/GSdx/GSDrawScanlineCodeGenerator.h | 1 + .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 24 ++---- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 64 ++++----------- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 82 ++++--------------- 5 files changed, 71 insertions(+), 132 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index ce3e1b6801..0ebec4ea95 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -375,4 +375,36 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) #endif } +void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src) +{ + // l = src & 0xFF; (1 left shift + 1 right shift) + // h = (src >> 8) & 0xFF; (1 right shift) + +#if _M_SSE >= 0x500 + if (src == h) { + vpsllw(l, src, 8); + vpsrlw(h, 8); + } else if (src == l) { + vpsrlw(h, src, 8); + vpsllw(l, 8); + } else { + vpsllw(l, src, 8); + vpsrlw(h, src, 8); + } + vpsrlw(l, 8); +#else + if (src == h) { + movdqa(l, src); + } else if (src == l) { + movdqa(h, src); + } else { + movdqa(l, src); + movdqa(h, src); + } 
+ psllw(l, 8); + psrlw(l, 8); + psrlw(h, 8); +#endif +} + #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index c737e2f0d9..31f3da82a6 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -128,6 +128,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void blendr(const Xmm& b, const Xmm& a, const Xmm& mask); void blend8(const Xmm& a, const Xmm& b); void blend8r(const Xmm& b, const Xmm& a); + void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src); #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index 38027cb719..c8632dfd6b 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -866,16 +866,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm4, xmm0, 8); - vpsrlw(xmm4, 8); - vpsrlw(xmm5, xmm0, 8); + split16_2x8(xmm4, xmm5, xmm0); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm0, xmm1, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm0, xmm1, xmm1); // xmm0 = rb01 // xmm1 = ga01 @@ -902,16 +898,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm5, xmm2, 8); - vpsllw(xmm4, xmm2, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm4, xmm5, xmm2); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsllw(xmm2, xmm3, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm2, 8); + split16_2x8(xmm2, xmm3, xmm3); // xmm0 = rb00 // xmm1 = ga00 @@ -955,9 +947,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(_rb, xmm0, 8); - vpsrlw(_rb, 8); - vpsrlw(_ga, xmm0, 8); + split16_2x8(_rb, _ga, xmm0); } // xmm2 = rb @@ 
-1605,9 +1595,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - vpsllw(_dst_rb, _fd, 8); - vpsrlw(_dst_rb, 8); - vpsrlw(_dst_ga, _fd, 8); + split16_2x8(_dst_rb, _dst_ga, _fd); break; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 02d63f4a0f..3b4604ab49 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -857,16 +857,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -894,16 +890,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -950,9 +942,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(xmm5, xmm6, 8); - vpsrlw(xmm5, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } } @@ -1452,16 +1442,12 @@ return; // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 
= rb00 @@ -1489,16 +1475,12 @@ return; // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1545,9 +1527,7 @@ return; // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(xmm5, xmm6, 8); - vpsrlw(xmm5, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } if(m_sel.mmin != 1) // !round-off mode @@ -1693,16 +1673,12 @@ return; // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1730,16 +1706,12 @@ return; // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1786,9 +1758,7 @@ return; // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(xmm5, xmm6, 8); - vpsrlw(xmm5, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } vmovdqa(xmm0, ptr[m_sel.lcm ? 
&m_local.gd->lod.f : &m_local.temp.lod.f]); @@ -2436,9 +2406,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - vpsllw(xmm0, xmm2, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, xmm2, 8); + split16_2x8(xmm0, xmm1, xmm2); break; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 42c4b74ae9..b7918fd7b2 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -867,18 +867,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - movdqa(xmm2, xmm6); - psllw(xmm2, 8); - psrlw(xmm2, 8); - psrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - movdqa(xmm3, xmm4); - psllw(xmm3, 8); - psrlw(xmm3, 8); - psrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -906,18 +900,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -965,10 +953,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - movdqa(xmm5, xmm6); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } } @@ -1467,18 +1452,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - movdqa(xmm2, xmm6); - psrlw(xmm6, 8); - psllw(xmm2, 8); - psrlw(xmm2, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i 
ga01 = (c01 >> 8) & mask; - movdqa(xmm3, xmm4); - psrlw(xmm4, 8); - psllw(xmm3, 8); - psrlw(xmm3, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1506,18 +1485,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1565,10 +1538,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - movdqa(xmm5, xmm6); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } if(m_sel.mmin != 1) // !round-off mode @@ -1720,18 +1690,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - movdqa(xmm2, xmm6); - psllw(xmm2, 8); - psrlw(xmm2, 8); - psrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - movdqa(xmm3, xmm4); - psllw(xmm3, 8); - psrlw(xmm3, 8); - psrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1759,18 +1723,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1818,10 +1776,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - movdqa(xmm5, xmm6); - 
psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } movdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]); @@ -2506,12 +2461,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - movdqa(xmm0, xmm2); - movdqa(xmm1, xmm2); - - psllw(xmm0, 8); - psrlw(xmm0, 8); - psrlw(xmm1, 8); + split16_2x8(xmm0, xmm1, xmm2); break; From 2e2069358302e6a66a97db643f3ea27f1f5c250a Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 19 Nov 2016 00:09:48 +0100 Subject: [PATCH 14/20] gsdx sw x64: restore read texel optimization --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 92 ++++++++++--------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index c8632dfd6b..7a2ffaa3c2 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -856,81 +856,85 @@ void GSDrawScanlineCodeGenerator::SampleTexture() ReadTexel(4, 0); - // xmm0 = c00 - // xmm1 = c01 - // xmm2 = c10 - // xmm3 = c11 + // xmm0 = c10 + // xmm1 = c11 + // xmm4 = c00 + // xmm5 = c01 // xmm6 = uf // xmm7 = vf // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - split16_2x8(xmm4, xmm5, xmm0); + split16_2x8(xmm2, xmm3, xmm4); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - split16_2x8(xmm0, xmm1, xmm1); + split16_2x8(xmm4, xmm5, xmm5); - // xmm0 = rb01 - // xmm1 = ga01 - // xmm2 = c10 - // xmm3 = c11 - // xmm4 = rb00 - // xmm5 = ga00 + // xmm0 = c10 + // xmm1 = c11 + // xmm2 = rb00 + // xmm3 = ga00 + // xmm4 = rb01 + // xmm5 = ga01 // xmm6 = uf // xmm7 = vf // rb00 = rb00.lerp16_4(rb01, uf); // ga00 = ga00.lerp16_4(ga01, uf); - lerp16_4(xmm0, xmm4, xmm6); - lerp16_4(xmm1, xmm5, xmm6); + lerp16_4(xmm4, xmm2, xmm6); + lerp16_4(xmm5, xmm3, xmm6); - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 =
c10 - // xmm3 = c11 + // xmm0 = c10 + // xmm1 = c11 + // xmm4 = rb00 + // xmm5 = ga00 // xmm6 = uf // xmm7 = vf // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - split16_2x8(xmm4, xmm5, xmm2); + split16_2x8(xmm2, xmm3, xmm0); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - split16_2x8(xmm2, xmm3, xmm3); + split16_2x8(xmm0, xmm1, xmm1); - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = rb11 - // xmm3 = ga11 - // xmm4 = rb10 - // xmm5 = ga10 + // xmm0 = rb11 + // xmm1 = ga11 + // xmm2 = rb10 + // xmm3 = ga10 + // xmm4 = rb00 + // xmm5 = ga00 // xmm6 = uf // xmm7 = vf // rb10 = rb10.lerp16_4(rb11, uf); // ga10 = ga10.lerp16_4(ga11, uf); - lerp16_4(xmm2, xmm4, xmm6); - lerp16_4(xmm3, xmm5, xmm6); + lerp16_4(xmm0, xmm2, xmm6); + lerp16_4(xmm1, xmm3, xmm6); - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = rb10 - // xmm3 = ga10 + // xmm0 = rb10 + // xmm1 = ga10 + // xmm4 = rb00 + // xmm5 = ga00 // xmm7 = vf // rb00 = rb00.lerp16_4(rb10, vf); // ga00 = ga00.lerp16_4(ga10, vf); - lerp16_4(xmm2, xmm0, xmm7); - lerp16_4(xmm3, xmm1, xmm7); + lerp16_4(xmm0, xmm4, xmm7); + lerp16_4(xmm1, xmm5, xmm7); + + // FIXME not ideal (but allow different source in ReadTexel and less register dependency) + vmovdqa(xmm2, xmm0); + vmovdqa(xmm3, xmm1); } else { @@ -947,7 +951,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - split16_2x8(_rb, _ga, xmm0); + split16_2x8(_rb, _ga, xmm4); } // xmm2 = rb @@ -1988,14 +1992,14 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) { - // TODO - const int r[] = {0, 0, 1, 1, 2, 2, 3, 3}; + const int in[] = {0, 1, 2, 3}; + const int out[] = {4, 5, 0, 1}; for(int i = 0; i < pixels; i++) { for(int j = 0; j < 4; j++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel(Xmm(out[i]), Xmm(in[i]), j); } } } @@ -2004,15 +2008,15 @@ void 
GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin { const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4]; - //if(i == 0) vmovd(eax, addr); - //else vpextrd(eax, addr, i); - vpextrd(eax, addr, i); + // Extract address offset + if(i == 0) vmovd(eax, addr); + else vpextrd(eax, addr, i); + // If clut, load the value as a byte index if(m_sel.tlu) movzx(eax, byte[_m_local__gd__tex + rax]); - //if(i == 0) vmovd(dst, src); - //else vpinsrd(dst, src, i); - vpinsrd(dst, src, i); + if(i == 0) vmovd(dst, src); + else vpinsrd(dst, src, i); } #endif From 322473c295db9f4ae86cc550f971ef47d1f7b738 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 19 Nov 2016 10:40:39 +0100 Subject: [PATCH 15/20] gsdx sw: add a code example for gather instruction It will require a generic (register naming) linear interpolation to use it properly. Gather instruction requires an extra mask register, therefore all register names will be shuffled. Perf wise, initial haswell implementation seems to be microcode emulated. --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index 7a2ffaa3c2..ab3bc33ad1 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -2019,4 +2019,42 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin else vpinsrd(dst, src, i); } +// Gather example (AVX2). Not faster on Haswell but potentially better on recent CPU +// Worst case reduce Icache. +// +// Current limitation requires 1 extra free register for the mask. +// And palette need zero masking.
+// It is not possible to use same source/destination so linear interpolation must be updated +#if 0 +void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +{ + const int in[] = {0, 1, 2, 3}; + const int out[] = {4, 5, 0, 1}; + const int mask[] = {5, 0, 1, 2}; + + if (m_sel.tlu) { + for(int i = 0; i < pixels; i++) { + // FIXME can't use same dst and add register + Gather4Texel(Xmm(in[i]), _m_local__gd__tex, Xmm(in[i]), Xmm(mask[i])); + // FIXME need a memory and could be faster + vpslld(Xmm(in[i]), 24); + vpsrld(Xmm(in[i]), 24); + Gather4Texel(Xmm(out[i]), _m_local__gd__clut, Xmm(in[i]), Xmm(mask[i])); + } + } else { + for(int i = 0; i < pixels; i++) { + Gather4Texel(Xmm(out[i]), _m_local__gd__tex, Xmm(in[i]), Xmm(mask[i])); + } + } +} + +static void Gather4Texel(const Xmm& dst, const Reg64& base, const Xmm& addr, const Xmm& Mask) +{ + //void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) + vpcmpeqd(Mask, Mask); + vpgatherdd(dst, ptr[base + addr * 4], Mask); +} + +#endif + #endif From 6b78b8f9ce6cd816690981a66b2176d6d5afb94f Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 19 Nov 2016 14:17:38 +0100 Subject: [PATCH 16/20] gsdx sw JIT: dynamically select SSE41 at runtime even on SSE2 build (scanline) It won't give the full SSE41 speed boost but it is better than nothing --- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 74 +-- plugins/GSdx/GSDrawScanlineCodeGenerator.h | 1 + .../GSdx/GSDrawScanlineCodeGenerator.x64.cpp | 48 +- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 453 +++++++++--------- plugins/GSdx/GSUtil.cpp | 7 +- plugins/GSdx/GSUtil.h | 3 + 6 files changed, 289 insertions(+), 297 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index 0ebec4ea95..ab14ca4660 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -251,39 +251,45 @@ void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const 
Xmm& b, const Xmm& t #if _M_SSE >= 0x500 vpblendw(a, b, 0xaa); - - #elif _M_SSE >= 0x401 - - pblendw(a, b, 0xaa); #else - pcmpeqd(temp, temp); - psrld(temp, 16); - pand(a, temp); - pandn(temp, b); - por(a, temp); - + if(g_cpu.has(util::Cpu::tSSE41)) + { + pblendw(a, b, 0xaa); + } + else + { + pcmpeqd(temp, temp); + psrld(temp, 16); + pand(a, temp); + pandn(temp, b); + por(a, temp); + } + #endif } void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) { #if _M_SSE >= 0x500 - + vpackuswb(a, a); vpmovzxbw(a, a); - #elif _M_SSE >= 0x401 - - packuswb(a, a); - pmovzxbw(a, a); - #else - packuswb(a, a); - pxor(temp, temp); - punpcklbw(a, temp); + if(g_cpu.has(util::Cpu::tSSE41)) + { + packuswb(a, a); + pmovzxbw(a, a); + } + else + { + packuswb(a, a); + pxor(temp, temp); + punpcklbw(a, temp); + } #endif } @@ -291,7 +297,7 @@ void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) void GSDrawScanlineCodeGenerator::alltrue() { #if _M_SSE >= 0x500 - + vpmovmskb(eax, xmm7); cmp(eax, 0xffff); je("step", T_NEAR); @@ -343,16 +349,15 @@ void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) { #if _M_SSE >= 0x500 - - vpblendvb(a, a, b, xmm0); - #elif _M_SSE >= 0x401 - - pblendvb(a, b); + vpblendvb(a, a, b, xmm0); #else - blend(a, b, xmm0); + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(a, b); + else + blend(a, b, xmm0); #endif } @@ -360,17 +365,20 @@ void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) { #if _M_SSE >= 0x500 - + vpblendvb(b, a, b, xmm0); - #elif _M_SSE >= 0x401 - - pblendvb(a, b); - movdqa(b, a); - #else - blendr(b, a, xmm0); + if(g_cpu.has(util::Cpu::tSSE41)) + { + pblendvb(a, b); + movdqa(b, a); + } + else + { + blendr(b, a, xmm0); + } #endif } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 
31f3da82a6..5183b5d1d4 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -23,6 +23,7 @@ #include "GSScanlineEnvironment.h" #include "GSFunctionMap.h" +#include "GSUtil.h" using namespace Xbyak; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp index f78b47da94..6b1102f091 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp @@ -22,102 +22,102 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) // It is useless to port the code to SSEx, better use the faster 32 bits version instead -void GSDrawScanlineCodeGenerator::Generate() +void GSDrawScanlineCodeGenerator::Generate_SSE() { // Avoid a crash if someone want to use it ret(); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_SSE() { } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_SSE() { } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) { } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_SSE() { } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv) { } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) { } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_SSE() { } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_SSE() { } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::TestAlpha_SSE() { } -void 
GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_SSE() { } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::Fog_SSE() { } -void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_SSE() { } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE() { } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_SSE() { } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() { } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_SSE() { } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_SSE() { } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg64& addr) { } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) { } static const int s_offsets[4] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, uint8 i, int psm) { } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) { } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i) { } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index b7918fd7b2..428e562e3d 100644 --- 
a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -1069,16 +1069,15 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) movdqa(xmm4, ptr[&m_local.gd->t.min]); movdqa(xmm5, ptr[&m_local.gd->t.max]); - #if _M_SSE >= 0x401 - - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - #else - - movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm6, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + { + movdqa(xmm0, ptr[&m_local.gd->t.mask]); + } + else + { + movdqa(xmm0, ptr[&m_local.gd->t.invmask]); + movdqa(xmm6, xmm0); + } // uv0 @@ -1100,15 +1099,10 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv0, xmm1); - - #else - - blendr(uv0, xmm1, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv0, xmm1); + else + blendr(uv0, xmm1, xmm0); // uv1 @@ -1130,15 +1124,10 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv1, xmm1); - - #else - - blendr(uv1, xmm1, xmm6); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv1, xmm1); + else + blendr(uv1, xmm1, xmm6); } } @@ -1899,16 +1888,15 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) } else { - #if _M_SSE >= 0x401 - - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - #else - - movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm4, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + { + movdqa(xmm0, ptr[&m_local.gd->t.mask]); + } + else + { + movdqa(xmm0, ptr[&m_local.gd->t.invmask]); + movdqa(xmm4, xmm0); + } // uv0 @@ -1930,15 +1918,10 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv0, xmm1); - - #else - - blendr(uv0, xmm1, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + 
pblendvb(uv0, xmm1); + else + blendr(uv0, xmm1, xmm0); // uv1 @@ -1960,15 +1943,10 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv1, xmm1); - - #else - - blendr(uv1, xmm1, xmm4); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv1, xmm1); + else + blendr(uv1, xmm1, xmm4); } } @@ -2652,15 +2630,14 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() if(m_sel.pabe) { - #if _M_SSE < 0x401 + if(!g_cpu.has(util::Cpu::tSSE41)) + { + // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) + movdqa(xmm0, xmm4); + pslld(xmm0, 8); + psrad(xmm0, 31); - // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) - - movdqa(xmm0, xmm4); - pslld(xmm0, 8); - psrad(xmm0, 31); - - #endif + } psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) @@ -2845,19 +2822,26 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, { case 0: if(i == 0) movd(dst, src); - #if _M_SSE >= 0x401 - else pextrd(dst, src, i); - #else - else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);} - #endif + else { + if(g_cpu.has(util::Cpu::tSSE41)) { + pextrd(dst, src, i); + } else { + pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); + movd(dst, xmm0); + } + + } break; case 1: if(i == 0) movd(eax, src); - #if _M_SSE >= 0x401 - else pextrd(eax, src, i); - #else - else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);} - #endif + else { + if(g_cpu.has(util::Cpu::tSSE41)) { + pextrd(eax, src, i); + } else { + pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); + movd(eax, xmm0); + } + } xor(eax, dst); and(eax, 0xffffff); xor(dst, eax); @@ -2895,152 +2879,154 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) if(m_sel.mmin && !m_sel.lcm) { - #if _M_SSE >= 0x401 - - const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; - - if(pixels == 4) + if(g_cpu.has(util::Cpu::tSSE41)) { - 
movdqa(ptr[&m_local.temp.test], xmm7); - } - for(int j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; - for(int i = 0; i < pixels; i++) + if(pixels == 4) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + movdqa(ptr[&m_local.temp.test], xmm7); } - } - if(pixels == 4) - { - movdqa(xmm5, xmm7); - movdqa(xmm7, ptr[&m_local.temp.test]); - } + for(int j = 0; j < 4; j++) + { + mov(ebx, ptr[&lod_i->u32[j]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - #else + for(int i = 0; i < pixels; i++) + { + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + } + } - if(pixels == 4) - { - movdqa(ptr[&m_local.temp.test], xmm7); - - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm6, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel(xmm4, xmm2, 0); - psrldq(xmm2, 4); - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel(xmm7, xmm2, 0); - psrldq(xmm2, 4); - - punpckldq(xmm6, xmm1); - punpckldq(xmm4, xmm7); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel(xmm7, xmm2, 0); - psrldq(xmm2, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm5, xmm5, 0); - ReadTexel(xmm2, xmm2, 0); - - punpckldq(xmm1, xmm5); - punpckldq(xmm7, xmm2); - - punpcklqdq(xmm6, xmm1); - punpcklqdq(xmm4, xmm7); - - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm1, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel(xmm5, xmm3, 0); - psrldq(xmm3, 4); - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm2, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel(xmm7, xmm3, 0); - psrldq(xmm3, 4); - - 
punpckldq(xmm1, xmm2); - punpckldq(xmm5, xmm7); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm2, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel(xmm7, xmm3, 0); - psrldq(xmm3, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm0, xmm0, 0); - ReadTexel(xmm3, xmm3, 0); - - punpckldq(xmm2, xmm0); - punpckldq(xmm7, xmm3); - - punpcklqdq(xmm1, xmm2); - punpcklqdq(xmm5, xmm7); - - movdqa(xmm7, ptr[&m_local.temp.test]); + if(pixels == 4) + { + movdqa(xmm5, xmm7); + movdqa(xmm7, ptr[&m_local.temp.test]); + } } else { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm6, xmm5, 0); - psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation) + if(pixels == 4) + { + movdqa(ptr[&m_local.temp.test], xmm7); - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); + ReadTexel(xmm6, xmm5, 0); + psrldq(xmm5, 4); + ReadTexel(xmm4, xmm2, 0); + psrldq(xmm2, 4); - punpckldq(xmm6, xmm1); + mov(ebx, ptr[&lod_i->u32[1]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + ReadTexel(xmm1, xmm5, 0); + psrldq(xmm5, 4); + ReadTexel(xmm7, xmm2, 0); + psrldq(xmm2, 4); - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); + punpckldq(xmm6, xmm1); + punpckldq(xmm4, xmm7); - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + mov(ebx, ptr[&lod_i->u32[2]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm4, xmm5, 0); - // psrldq(xmm5, 4); + ReadTexel(xmm1, xmm5, 0); + psrldq(xmm5, 4); + ReadTexel(xmm7, xmm2, 0); + psrldq(xmm2, 4); - punpckldq(xmm1, xmm4); + mov(ebx, ptr[&lod_i->u32[3]]); + mov(ebx, ptr[ebp + ebx * 
sizeof(void*) + mip_offset]); + + ReadTexel(xmm5, xmm5, 0); + ReadTexel(xmm2, xmm2, 0); + + punpckldq(xmm1, xmm5); + punpckldq(xmm7, xmm2); + + punpcklqdq(xmm6, xmm1); + punpcklqdq(xmm4, xmm7); + + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm1, xmm0, 0); + psrldq(xmm0, 4); + ReadTexel(xmm5, xmm3, 0); + psrldq(xmm3, 4); + + mov(ebx, ptr[&lod_i->u32[1]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm2, xmm0, 0); + psrldq(xmm0, 4); + ReadTexel(xmm7, xmm3, 0); + psrldq(xmm3, 4); + + punpckldq(xmm1, xmm2); + punpckldq(xmm5, xmm7); + + mov(ebx, ptr[&lod_i->u32[2]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm2, xmm0, 0); + psrldq(xmm0, 4); + ReadTexel(xmm7, xmm3, 0); + psrldq(xmm3, 4); + + mov(ebx, ptr[&lod_i->u32[3]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm0, xmm0, 0); + ReadTexel(xmm3, xmm3, 0); + + punpckldq(xmm2, xmm0); + punpckldq(xmm7, xmm3); + + punpcklqdq(xmm1, xmm2); + punpcklqdq(xmm5, xmm7); + + movdqa(xmm7, ptr[&m_local.temp.test]); + } + else + { + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm6, xmm5, 0); + psrldq(xmm5, 4); // shuffle instead? 
(1 2 3 0 ~ rotation) + + mov(ebx, ptr[&lod_i->u32[1]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm1, xmm5, 0); + psrldq(xmm5, 4); + + punpckldq(xmm6, xmm1); + + mov(ebx, ptr[&lod_i->u32[2]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm1, xmm5, 0); + psrldq(xmm5, 4); + + mov(ebx, ptr[&lod_i->u32[3]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(xmm4, xmm5, 0); + // psrldq(xmm5, 4); + + punpckldq(xmm1, xmm4); + + punpcklqdq(xmm6, xmm1); + } - punpcklqdq(xmm6, xmm1); } - - #endif } else { @@ -3052,43 +3038,42 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - #if _M_SSE >= 0x401 - - for(int i = 0; i < pixels; i++) + if(g_cpu.has(util::Cpu::tSSE41)) { - for(int j = 0; j < 4; j++) + for(int i = 0; i < pixels; i++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + for(int j = 0; j < 4; j++) + { + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + } } + + } else { + const int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; + + for(int i = 0; i < pixels; i++) + { + const Xmm& addr = Xmm(r[i * 2 + 0]); + const Xmm& dst = Xmm(r[i * 2 + 1]); + const Xmm& temp1 = Xmm(t[i * 2 + 0]); + const Xmm& temp2 = Xmm(t[i * 2 + 1]); + + ReadTexel(dst, addr, 0); + psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) + ReadTexel(temp1, addr, 0); + psrldq(addr, 4); + punpckldq(dst, temp1); + + ReadTexel(temp1, addr, 0); + psrldq(addr, 4); + ReadTexel(temp2, addr, 0); + // psrldq(addr, 4); + punpckldq(temp1, temp2); + + punpcklqdq(dst, temp1); + } + } - - #else - - const int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; - - for(int i = 0; i < pixels; i++) - { - const Xmm& addr = Xmm(r[i * 2 + 0]); - const Xmm& dst = Xmm(r[i * 2 + 1]); - const Xmm& temp1 = Xmm(t[i * 2 + 0]); - const Xmm& temp2 = Xmm(t[i * 2 + 1]); - - ReadTexel(dst, addr, 0); - psrldq(addr, 4); // shuffle instead? 
(1 2 3 0 ~ rotation) - ReadTexel(temp1, addr, 0); - psrldq(addr, 4); - punpckldq(dst, temp1); - - ReadTexel(temp1, addr, 0); - psrldq(addr, 4); - ReadTexel(temp2, addr, 0); - // psrldq(addr, 4); - punpckldq(temp1, temp2); - - punpcklqdq(dst, temp1); - } - - #endif } } @@ -3096,11 +3081,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin { const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; - #if _M_SSE < 0x401 - - ASSERT(i == 0); - - #endif + ASSERT(i == 0 || g_cpu.has(util::Cpu::tSSE41)); if(i == 0) movd(eax, addr); else pextrd(eax, addr, i); diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 2a23156d4b..797783ae22 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -20,9 +20,7 @@ */ #include "stdafx.h" -#include "GS.h" #include "GSUtil.h" -#include "xbyak/xbyak_util.h" #ifdef _WIN32 #include "GSDeviceDX.h" @@ -33,6 +31,8 @@ #define SVN_MODS 0 #endif +Xbyak::util::Cpu g_cpu; + const char* GSUtil::GetLibName() { // The following ifdef mess is courtesy of "static string str;" @@ -204,7 +204,6 @@ bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm) bool GSUtil::CheckSSE() { bool status = true; - Xbyak::util::Cpu cpu; struct ISA { Xbyak::util::Cpu::Type type; @@ -231,7 +230,7 @@ bool GSUtil::CheckSSE() }; for (size_t i = 0; i < countof(checks); i++) { - if(!cpu.has(checks[i].type)) { + if(!g_cpu.has(checks[i].type)) { fprintf(stderr, "This CPU does not support %s\n", checks[i].name); status = false; diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index 94552fbaf4..2676a5ca61 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -22,6 +22,7 @@ #pragma once #include "GS.h" +#include "xbyak/xbyak_util.h" struct OCLDeviceDesc { @@ -71,3 +72,5 @@ void GSmkdir(const char* dir); #endif const char* psm_str(int psm); + +extern Xbyak::util::Cpu g_cpu; From 574a2c774ed269f9cdcb39c8811c60d94245e9f4 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 19 Nov 
2016 14:47:40 +0100 Subject: [PATCH 17/20] gsdx sw JIT: dynamically select between AVX1 and SSE code path (scanline) --- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 315 +++++++++--------- plugins/GSdx/GSDrawScanlineCodeGenerator.h | 104 +++--- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 122 +++---- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 140 ++++---- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 188 +++++------ 5 files changed, 441 insertions(+), 428 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index ab14ca4660..8307e9e99c 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -22,6 +22,17 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" +#if _M_SSE >= 0x501 +#else +void GSDrawScanlineCodeGenerator::Generate() +{ + if(g_cpu.has(util::Cpu::tAVX)) + Generate_AVX(); + else + Generate_SSE(); +} +#endif + #if _M_SSE >= 0x501 alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] = @@ -183,194 +194,179 @@ void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a) void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift) { - #if _M_SSE >= 0x500 - - if(shift == 0) + if(g_cpu.has(util::Cpu::tAVX)) { - vpmulhrsw(a, f); + if(shift == 0) + { + vpmulhrsw(a, f); + } + else + { + vpsllw(a, shift + 1); + vpmulhw(a, f); + } + } else { - vpsllw(a, shift + 1); - vpmulhw(a, f); + if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3)) + { + pmulhrsw(a, f); + } + else + { + psllw(a, shift + 1); + pmulhw(a, f); + } } - - #else - - if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3)) - { - pmulhrsw(a, f); - } - else - { - psllw(a, shift + 1); - pmulhw(a, f); - } - - #endif } void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift) { - #if _M_SSE >= 0x500 - - vpsubw(a, b); - modulate16(a, f, shift); - vpaddw(a, b); - - #else - - psubw(a, b); - modulate16(a, f, 
shift); - paddw(a, b); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpsubw(a, b); + modulate16(a, f, shift); + vpaddw(a, b); + } + else + { + psubw(a, b); + modulate16(a, f, shift); + paddw(a, b); + } } void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f) { - #if _M_SSE >= 0x500 - - vpsubw(a, b); - vpmullw(a, f); - vpsraw(a, 4); - vpaddw(a, b); - - #else - - psubw(a, b); - pmullw(a, f); - psraw(a, 4); - paddw(a, b); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpsubw(a, b); + vpmullw(a, f); + vpsraw(a, 4); + vpaddw(a, b); + } + else + { + psubw(a, b); + pmullw(a, f); + psraw(a, 4); + paddw(a, b); + } } void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp) { - #if _M_SSE >= 0x500 - - vpblendw(a, b, 0xaa); - - #else - - if(g_cpu.has(util::Cpu::tSSE41)) + if(g_cpu.has(util::Cpu::tAVX)) { - pblendw(a, b, 0xaa); + vpblendw(a, b, 0xaa); } else { - pcmpeqd(temp, temp); - psrld(temp, 16); - pand(a, temp); - pandn(temp, b); - por(a, temp); + if(g_cpu.has(util::Cpu::tSSE41)) + { + pblendw(a, b, 0xaa); + } + else + { + pcmpeqd(temp, temp); + psrld(temp, 16); + pand(a, temp); + pandn(temp, b); + por(a, temp); + } } - - #endif } void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) { - #if _M_SSE >= 0x500 - - vpackuswb(a, a); - vpmovzxbw(a, a); - - #else - - if(g_cpu.has(util::Cpu::tSSE41)) + if(g_cpu.has(util::Cpu::tAVX)) { - packuswb(a, a); - pmovzxbw(a, a); + vpackuswb(a, a); + vpmovzxbw(a, a); } else { - packuswb(a, a); - pxor(temp, temp); - punpcklbw(a, temp); + if(g_cpu.has(util::Cpu::tSSE41)) + { + packuswb(a, a); + pmovzxbw(a, a); + } + else + { + packuswb(a, a); + pxor(temp, temp); + punpcklbw(a, temp); + } } - - #endif } void GSDrawScanlineCodeGenerator::alltrue() { - #if _M_SSE >= 0x500 - - vpmovmskb(eax, xmm7); - cmp(eax, 0xffff); - je("step", T_NEAR); - - #else - - pmovmskb(eax, xmm7); - cmp(eax, 0xffff); - je("step", T_NEAR); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { 
+ vpmovmskb(eax, xmm7); + cmp(eax, 0xffff); + je("step", T_NEAR); + } + else + { + pmovmskb(eax, xmm7); + cmp(eax, 0xffff); + je("step", T_NEAR); + } } void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask) { - #if _M_SSE >= 0x500 - - vpand(b, mask); - vpandn(mask, a); - vpor(a, b, mask); - - #else - - pand(b, mask); - pandn(mask, a); - por(b, mask); - movdqa(a, b); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpand(b, mask); + vpandn(mask, a); + vpor(a, b, mask); + } + else + { + pand(b, mask); + pandn(mask, a); + por(b, mask); + movdqa(a, b); + } } void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask) { - #if _M_SSE >= 0x500 - - vpand(b, mask); - vpandn(mask, a); - vpor(b, mask); - - #else - - pand(b, mask); - pandn(mask, a); - por(b, mask); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpand(b, mask); + vpandn(mask, a); + vpor(b, mask); + } + else + { + pand(b, mask); + pandn(mask, a); + por(b, mask); + } } void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) { - #if _M_SSE >= 0x500 - - vpblendvb(a, a, b, xmm0); - - #else - - if(g_cpu.has(util::Cpu::tSSE41)) + if(g_cpu.has(util::Cpu::tAVX)) + vpblendvb(a, a, b, xmm0); + else if(g_cpu.has(util::Cpu::tSSE41)) pblendvb(a, b); else blend(a, b, xmm0); - - #endif } void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) { - #if _M_SSE >= 0x500 - - vpblendvb(b, a, b, xmm0); - - #else - - if(g_cpu.has(util::Cpu::tSSE41)) + if(g_cpu.has(util::Cpu::tAVX)) + { + vpblendvb(b, a, b, xmm0); + } + else if(g_cpu.has(util::Cpu::tSSE41)) { pblendvb(a, b); movdqa(b, a); @@ -379,8 +375,6 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) { blendr(b, a, xmm0); } - - #endif } void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src) @@ -388,31 +382,34 @@ void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const // l = src & 0xFF; (1 left shift + 1 right shift) 
// h = (src >> 8) & 0xFF; (1 right shift) -#if _M_SSE >= 0x500 - if (src == h) { - vpsllw(l, src, 8); - vpsrlw(h, 8); - } else if (src == l) { - vpsrlw(h, src, 8); - vpsllw(l, 8); - } else { - vpsllw(l, src, 8); - vpsrlw(h, src, 8); + if(g_cpu.has(util::Cpu::tAVX)) + { + if (src == h) { + vpsllw(l, src, 8); + vpsrlw(h, 8); + } else if (src == l) { + vpsrlw(h, src, 8); + vpsllw(l, 8); + } else { + vpsllw(l, src, 8); + vpsrlw(h, src, 8); + } + vpsrlw(l, 8); } - vpsrlw(l, 8); -#else - if (src == h) { - movdqa(l, src); - } else if (src == l) { - movdqa(h, src); - } else { - movdqa(l, src); - movdqa(h, src); + else + { + if (src == h) { + movdqa(l, src); + } else if (src == l) { + movdqa(h, src); + } else { + movdqa(l, src); + movdqa(h, src); + } + psllw(l, 8); + psrlw(l, 8); + psrlw(h, 8); } - psllw(l, 8); - psrlw(l, 8); - psrlw(h, 8); -#endif } #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 5183b5d1d4..a26b970309 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -27,6 +27,12 @@ using namespace Xbyak; +#if defined(_M_AMD64) || defined(_WIN64) +#define RegLong Reg64 +#else +#define RegLong Reg32 +#endif + class GSDrawScanlineCodeGenerator : public GSCodeGenerator { void operator = (const GSDrawScanlineCodeGenerator&); @@ -58,17 +64,9 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void WriteZBuf(); void AlphaBlend(); void WriteFrame(); - - #if defined(_M_AMD64) || defined(_WIN64) - void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr); - void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm); - #else - void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr); - void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int 
fz); - void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm); - #endif - + void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr); + void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz); + void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm); void ReadTexel(int pixels, int mip_offset = 0); void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i); @@ -85,39 +83,59 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator #else - void Init(); - void Step(); - void TestZ(const Xmm& temp1, const Xmm& temp2); - void SampleTexture(); - void Wrap(const Xmm& uv0); - void Wrap(const Xmm& uv0, const Xmm& uv1); - void SampleTextureLOD(); - void WrapLOD(const Xmm& uv0); - void WrapLOD(const Xmm& uv0, const Xmm& uv1); - void AlphaTFX(); - void ReadMask(); - void TestAlpha(); - void ColorTFX(); - void Fog(); - void ReadFrame(); - void TestDestAlpha(); - void WriteMask(); - void WriteZBuf(); - void AlphaBlend(); - void WriteFrame(); + void Generate_SSE(); + void Init_SSE(); + void Step_SSE(); + void TestZ_SSE(const Xmm& temp1, const Xmm& temp2); + void SampleTexture_SSE(); + void Wrap_SSE(const Xmm& uv0); + void Wrap_SSE(const Xmm& uv0, const Xmm& uv1); + void SampleTextureLOD_SSE(); + void WrapLOD_SSE(const Xmm& uv0); + void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1); + void AlphaTFX_SSE(); + void ReadMask_SSE(); + void TestAlpha_SSE(); + void ColorTFX_SSE(); + void Fog_SSE(); + void ReadFrame_SSE(); + void TestDestAlpha_SSE(); + void WriteMask_SSE(); + void WriteZBuf_SSE(); + void AlphaBlend_SSE(); + void WriteFrame_SSE(); + void ReadPixel_SSE(const Xmm& dst, const RegLong& addr); + void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz); + void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm); + void ReadTexel_SSE(int pixels, int mip_offset = 0); + void ReadTexel_SSE(const Xmm& 
dst, const Xmm& addr, uint8 i); - #if defined(_M_AMD64) || defined(_WIN64) - void ReadPixel(const Xmm& dst, const Reg64& addr); - void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm); - #else - void ReadPixel(const Xmm& dst, const Reg32& addr); - void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm); - #endif - - void ReadTexel(int pixels, int mip_offset = 0); - void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i); + void Generate_AVX(); + void Init_AVX(); + void Step_AVX(); + void TestZ_AVX(const Xmm& temp1, const Xmm& temp2); + void SampleTexture_AVX(); + void Wrap_AVX(const Xmm& uv0); + void Wrap_AVX(const Xmm& uv0, const Xmm& uv1); + void SampleTextureLOD_AVX(); + void WrapLOD_AVX(const Xmm& uv0); + void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1); + void AlphaTFX_AVX(); + void ReadMask_AVX(); + void TestAlpha_AVX(); + void ColorTFX_AVX(); + void Fog_AVX(); + void ReadFrame_AVX(); + void TestDestAlpha_AVX(); + void WriteMask_AVX(); + void WriteZBuf_AVX(); + void AlphaBlend_AVX(); + void WriteFrame_AVX(); + void ReadPixel_AVX(const Xmm& dst, const RegLong& addr); + void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz); + void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm); + void ReadTexel_AVX(int pixels, int mip_offset = 0); + void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i); void modulate16(const Xmm& a, const Operand& f, int shift); void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index ab3bc33ad1..ea227673e2 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ 
b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -45,7 +45,7 @@ #define _zm xmm5 #define _fd xmm6 -#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) #ifdef _WIN64 #else @@ -59,7 +59,7 @@ static const int _rz_zd = -8 * 10; static const int _rz_cov = -8 * 12; #endif -void GSDrawScanlineCodeGenerator::Generate() +void GSDrawScanlineCodeGenerator::Generate_AVX() { bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; bool need_clut = need_tex && m_sel.tlu; @@ -100,7 +100,7 @@ void GSDrawScanlineCodeGenerator::Generate() if(need_tex) mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]); - Init(); + Init_AVX(); // a0 = steps // t1 = fza_base @@ -126,30 +126,30 @@ void GSDrawScanlineCodeGenerator::Generate() L("loop"); - TestZ(xmm5, xmm6); + TestZ_AVX(xmm5, xmm6); // ebp = za if(m_sel.mmin) { - SampleTextureLOD(); + SampleTextureLOD_AVX(); } else { - SampleTexture(); + SampleTexture_AVX(); } // ebp = za // xmm2 = rb // xmm3 = ga - AlphaTFX(); + AlphaTFX_AVX(); // ebp = za // xmm2 = rb // xmm3 = ga - ReadMask(); + ReadMask_AVX(); // ebp = za // xmm2 = rb @@ -157,7 +157,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - TestAlpha(); + TestAlpha_AVX(); // ebp = za // xmm2 = rb @@ -165,7 +165,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - ColorTFX(); + ColorTFX_AVX(); // ebp = za // xmm2 = rb @@ -173,7 +173,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - Fog(); + Fog_AVX(); // ebp = za // xmm2 = rb @@ -181,7 +181,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - ReadFrame(); + ReadFrame_AVX(); // ebx = fa // ebp = za @@ -191,7 +191,7 @@ L("loop"); // xmm5 = zm // xmm6 = fd - TestDestAlpha(); + TestDestAlpha_AVX(); // ebx = fa // ebp = za @@ -201,7 +201,7 @@ L("loop"); // xmm5 = zm // xmm6 = fd - WriteMask(); + WriteMask_AVX(); // ebx = fa // edx = fzm @@ -212,7 +212,7 @@ L("loop"); // xmm5 = zm // xmm6 = fd - WriteZBuf(); + WriteZBuf_AVX(); // ebx = fa // edx = fzm @@ -221,7 +221,7 @@ 
L("loop"); // xmm4 = fm // xmm6 = fd - AlphaBlend(); + AlphaBlend_AVX(); // ebx = fa // edx = fzm @@ -230,7 +230,7 @@ L("loop"); // xmm4 = fm // xmm6 = fd - WriteFrame(); + WriteFrame_AVX(); L("step"); @@ -242,7 +242,7 @@ L("step"); jle("exit", T_NEAR); - Step(); + Step_AVX(); jmp("loop", T_NEAR); } @@ -277,7 +277,7 @@ L("exit"); ret(); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_AVX() { if(!m_sel.notest) { @@ -480,7 +480,7 @@ void GSDrawScanlineCodeGenerator::Init() } } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_AVX() { // steps -= 4; @@ -603,7 +603,7 @@ void GSDrawScanlineCodeGenerator::Step() } } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { @@ -661,7 +661,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.ztest) { - ReadPixel(xmm1, rbp); + ReadPixel_AVX(xmm1, rbp); if(m_sel.zwrite && m_sel.zpsm < 2) { @@ -715,7 +715,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) } } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_AVX() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -786,13 +786,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm4, xmm5); + Wrap_AVX(xmm4, xmm5); } else { // uv0 = Wrap(uv0); - Wrap(xmm4); + Wrap_AVX(xmm4); } // xmm4 = uv0 @@ -854,7 +854,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_AVX(4, 0); // xmm0 = c10 // xmm1 = c11 @@ -944,7 +944,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_AVX(1, 0); // 
GSVector4i mask = GSVector4i::x00ff(); @@ -958,7 +958,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm3 = ga } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv) { // xmm0, xmm1, xmm2, xmm3 = free @@ -1019,7 +1019,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm2, xmm3 = free @@ -1111,19 +1111,19 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::SampleTextureLOD() +void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX() { } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv) { } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1) { } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_AVX() { if(!m_sel.fb) { @@ -1261,7 +1261,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() } } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_AVX() { if(m_sel.fwrite) { @@ -1274,7 +1274,7 @@ void GSDrawScanlineCodeGenerator::ReadMask() } } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::TestAlpha_AVX() { switch(m_sel.atst) { @@ -1345,7 +1345,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha() } } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_AVX() { if(!m_sel.fwrite) { @@ -1410,7 +1410,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() } } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::Fog_AVX() { if(!m_sel.fwrite || !m_sel.fge) { @@ -1431,7 +1431,7 @@ void GSDrawScanlineCodeGenerator::Fog() mix16(_ga, xmm6, _f); } 
-void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_AVX() { if(!m_sel.fb) { @@ -1449,10 +1449,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame() return; } - ReadPixel(_fd, rbx); + ReadPixel_AVX(_fd, rbx); } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX() { if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) { @@ -1496,7 +1496,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() alltrue(); } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_AVX() { if(m_sel.notest) { @@ -1542,7 +1542,7 @@ void GSDrawScanlineCodeGenerator::WriteMask() not(edx); } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() { if(!m_sel.zwrite) { @@ -1571,10 +1571,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1); + WritePixel_AVX(xmm1, rbp, dh, fast, m_sel.zpsm, 1); } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_AVX() { if(!m_sel.fwrite) { @@ -1798,7 +1798,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() } } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_AVX() { if(!m_sel.fwrite) { @@ -1889,16 +1889,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame() bool fast = m_sel.rfb ? 
m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); + WritePixel_AVX(xmm2, rbx, dl, fast, m_sel.fpsm, 0); } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg64& addr) { vmovq(dst, qword[_m_local__gd__vm + addr * 2]); vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]); } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) { if(m_sel.notest) { @@ -1909,10 +1909,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, } else { - WritePixel(src, addr, 0, psm); - WritePixel(src, addr, 1, psm); - WritePixel(src, addr, 2, psm); - WritePixel(src, addr, 3, psm); + WritePixel_AVX(src, addr, 0, psm); + WritePixel_AVX(src, addr, 1, psm); + WritePixel_AVX(src, addr, 2, psm); + WritePixel_AVX(src, addr, 3, psm); } } else @@ -1943,22 +1943,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, test(mask, 0x03); je("@f"); - WritePixel(src, addr, 0, psm); + WritePixel_AVX(src, addr, 0, psm); L("@@"); test(mask, 0x0c); je("@f"); - WritePixel(src, addr, 1, psm); + WritePixel_AVX(src, addr, 1, psm); L("@@"); test(mask, 0x30); je("@f"); - WritePixel(src, addr, 2, psm); + WritePixel_AVX(src, addr, 2, psm); L("@@"); test(mask, 0xc0); je("@f"); - WritePixel(src, addr, 3, psm); + WritePixel_AVX(src, addr, 3, psm); L("@@"); } } @@ -1966,7 +1966,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, static const int s_offsets[4] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, uint8 i, int psm) { 
Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2]; @@ -1990,7 +1990,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) { const int in[] = {0, 1, 2, 3}; const int out[] = {4, 5, 0, 1}; @@ -1999,12 +1999,12 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) { for(int j = 0; j < 4; j++) { - ReadTexel(Xmm(out[i]), Xmm(in[i]), j); + ReadTexel_AVX(Xmm(out[i]), Xmm(in[i]), j); } } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i) { const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4]; @@ -2026,7 +2026,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin // And palette need zero masking. 
// It is not possible to use same source/destination so linear interpolation must be updated #if 0 -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) { const int in[] = {0, 1, 2, 3}; const int out[] = {4, 5, 0, 1}; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 3b4604ab49..a2278cbad0 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -23,21 +23,20 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) static const int _args = 16; static const int _top = _args + 4; static const int _v = _args + 8; -void GSDrawScanlineCodeGenerator::Generate() +void GSDrawScanlineCodeGenerator::Generate_AVX() { -//ret(8); push(ebx); push(esi); push(edi); push(ebp); - Init(); + Init_AVX(); if(!m_sel.edge) { @@ -59,7 +58,7 @@ L("loop"); bool tme = m_sel.tfx != TFX_NONE; - TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); + TestZ_AVX(tme ? xmm5 : xmm2, tme ? 
xmm6 : xmm3); // ecx = steps // esi = fzbr @@ -75,11 +74,11 @@ L("loop"); if(m_sel.mmin) { - SampleTextureLOD(); + SampleTextureLOD_AVX(); } else { - SampleTexture(); + SampleTexture_AVX(); } // ecx = steps @@ -93,7 +92,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - AlphaTFX(); + AlphaTFX_AVX(); // ecx = steps // esi = fzbr @@ -104,7 +103,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadMask(); + ReadMask_AVX(); // ecx = steps // esi = fzbr @@ -117,7 +116,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestAlpha(); + TestAlpha_AVX(); // ecx = steps // esi = fzbr @@ -130,7 +129,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ColorTFX(); + ColorTFX_AVX(); // ecx = steps // esi = fzbr @@ -142,7 +141,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - Fog(); + Fog_AVX(); // ecx = steps // esi = fzbr @@ -154,7 +153,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadFrame(); + ReadFrame_AVX(); // ecx = steps // esi = fzbr @@ -167,7 +166,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestDestAlpha(); + TestDestAlpha_AVX(); // ecx = steps // esi = fzbr @@ -180,7 +179,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - WriteMask(); + WriteMask_AVX(); // ebx = fa // ecx = steps @@ -194,7 +193,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteZBuf(); + WriteZBuf_AVX(); // ebx = fa // ecx = steps @@ -208,7 +207,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - AlphaBlend(); + AlphaBlend_AVX(); // ebx = fa // ecx = steps @@ -220,7 +219,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteFrame(); + WriteFrame_AVX(); L("step"); @@ -232,7 +231,7 @@ L("step"); jle("exit", T_NEAR); - Step(); + Step_AVX(); jmp("loop", T_NEAR); } @@ -249,7 +248,7 @@ L("exit"); ret(8); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_AVX() { if(!m_sel.notest) { @@ -455,7 +454,7 @@ void GSDrawScanlineCodeGenerator::Init() } } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_AVX() { // steps -= 4; @@ -596,7 +595,7 @@ void GSDrawScanlineCodeGenerator::Step() } } 
-void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { @@ -644,7 +643,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.ztest) { - ReadPixel(xmm1, ebp); + ReadPixel_AVX(xmm1, ebp); if(m_sel.zwrite && m_sel.zpsm < 2) { @@ -694,7 +693,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) } } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_AVX() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -775,13 +774,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm2, xmm3); + Wrap_AVX(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - Wrap(xmm2); + Wrap_AVX(xmm2); } // xmm2 = uv0 @@ -843,7 +842,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_AVX(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -935,7 +934,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_AVX(1, 0); // GSVector4i mask = GSVector4i::x00ff(); @@ -946,7 +945,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1007,7 +1006,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1099,7 +1098,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::SampleTextureLOD() +void 
GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -1360,13 +1359,13 @@ return; // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_AVX(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_AVX(xmm2); } // xmm2 = uv0 @@ -1428,7 +1427,7 @@ return; // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_AVX(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -1520,7 +1519,7 @@ return; // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_AVX(1, 0); // GSVector4i mask = GSVector4i::x00ff(); @@ -1591,13 +1590,13 @@ return; // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_AVX(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_AVX(xmm2); } // xmm2 = uv0 @@ -1659,7 +1658,7 @@ return; // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 1); + ReadTexel_AVX(4, 1); // xmm6 = c00 // xmm4 = c01 @@ -1751,7 +1750,7 @@ return; // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 1); + ReadTexel_AVX(1, 1); // GSVector4i mask = GSVector4i::x00ff(); @@ -1774,7 +1773,7 @@ return; pop(ebp); } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv) { // xmm5 = minuv // xmm6 = maxuv @@ -1835,7 +1834,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1) { // xmm5 = minuv // xmm6 = maxuv @@ -1923,7 +1922,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_AVX() { if(!m_sel.fb) { @@ 
-2071,7 +2070,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() } } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_AVX() { if(m_sel.fwrite) { @@ -2084,7 +2083,7 @@ void GSDrawScanlineCodeGenerator::ReadMask() } } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::TestAlpha_AVX() { switch(m_sel.atst) { @@ -2155,7 +2154,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha() } } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_AVX() { if(!m_sel.fwrite) { @@ -2231,7 +2230,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() } } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::Fog_AVX() { if(!m_sel.fwrite || !m_sel.fge) { @@ -2252,7 +2251,7 @@ void GSDrawScanlineCodeGenerator::Fog() mix16(xmm6, xmm1, xmm0); } -void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_AVX() { if(!m_sel.fb) { @@ -2270,10 +2269,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame() return; } - ReadPixel(xmm2, ebx); + ReadPixel_AVX(xmm2, ebx); } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX() { if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) { @@ -2317,7 +2316,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() alltrue(); } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_AVX() { if(m_sel.notest) { @@ -2363,7 +2362,7 @@ void GSDrawScanlineCodeGenerator::WriteMask() not(edx); } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() { if(!m_sel.zwrite) { @@ -2381,10 +2380,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() bool fast = m_sel.ztest ? 
m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); + WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_AVX() { if(!m_sel.fwrite) { @@ -2606,7 +2605,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() } } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_AVX() { if(!m_sel.fwrite) { @@ -2686,16 +2685,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame() bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); + WritePixel_AVX(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg32& addr) { vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { if(m_sel.notest) { @@ -2706,10 +2705,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } else { - WritePixel(src, addr, 0, psm); - WritePixel(src, addr, 1, psm); - WritePixel(src, addr, 2, psm); - WritePixel(src, addr, 3, psm); + WritePixel_AVX(src, addr, 0, psm); + WritePixel_AVX(src, addr, 1, psm); + WritePixel_AVX(src, addr, 2, psm); + WritePixel_AVX(src, addr, 3, psm); } } else @@ -2740,22 +2739,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, test(mask, 0x03); je("@f"); - WritePixel(src, addr, 0, psm); + WritePixel_AVX(src, addr, 0, psm); L("@@"); test(mask, 0x0c); je("@f"); - WritePixel(src, addr, 1, psm); + WritePixel_AVX(src, addr, 1, psm); 
L("@@"); test(mask, 0x30); je("@f"); - WritePixel(src, addr, 2, psm); + WritePixel_AVX(src, addr, 2, psm); L("@@"); test(mask, 0xc0); je("@f"); - WritePixel(src, addr, 3, psm); + WritePixel_AVX(src, addr, 3, psm); L("@@"); } } @@ -2763,7 +2762,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, static const int s_offsets[] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, uint8 i, int psm) { Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; @@ -2788,7 +2787,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) { // in // xmm5 = addr00 @@ -2827,7 +2826,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) for(int i = 0; i < pixels; i++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } @@ -2846,19 +2845,18 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) } const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - const int t[] = {4, 1, 5, 2}; for(int i = 0; i < pixels; i++) { for(uint8 j = 0; j < 4; j++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i) { ASSERT(i < 4); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 428e562e3d..60ac137655 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -23,20 
+23,20 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) static const int _args = 16; static const int _top = _args + 4; static const int _v = _args + 8; -void GSDrawScanlineCodeGenerator::Generate() +void GSDrawScanlineCodeGenerator::Generate_SSE() { push(ebx); push(esi); push(edi); push(ebp); - Init(); + Init_SSE(); if(!m_sel.edge) { @@ -58,7 +58,7 @@ L("loop"); bool tme = m_sel.tfx != TFX_NONE; - TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); + TestZ_SSE(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); // ecx = steps // esi = fzbr @@ -74,11 +74,11 @@ L("loop"); if(m_sel.mmin) { - SampleTextureLOD(); + SampleTextureLOD_SSE(); } else { - SampleTexture(); + SampleTexture_SSE(); } // ecx = steps @@ -92,7 +92,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - AlphaTFX(); + AlphaTFX_SSE(); // ecx = steps // esi = fzbr @@ -103,7 +103,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadMask(); + ReadMask_SSE(); // ecx = steps // esi = fzbr @@ -116,7 +116,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestAlpha(); + TestAlpha_SSE(); // ecx = steps // esi = fzbr @@ -129,7 +129,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ColorTFX(); + ColorTFX_SSE(); // ecx = steps // esi = fzbr @@ -141,7 +141,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - Fog(); + Fog_SSE(); // ecx = steps // esi = fzbr @@ -153,7 +153,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadFrame(); + ReadFrame_SSE(); // ecx = steps // esi = fzbr @@ -166,7 +166,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestDestAlpha(); + TestDestAlpha_SSE(); // ecx = steps // esi = fzbr @@ -179,7 +179,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - WriteMask(); + WriteMask_SSE(); // ebx = fa // ecx = steps @@ -193,7 +193,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteZBuf(); + WriteZBuf_SSE(); // ebx = fa // ecx = steps @@ -207,7 +207,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - AlphaBlend(); + 
AlphaBlend_SSE(); // ebx = fa // ecx = steps @@ -219,7 +219,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteFrame(); + WriteFrame_SSE(); L("step"); @@ -231,7 +231,7 @@ L("step"); jle("exit", T_NEAR); - Step(); + Step_SSE(); jmp("loop", T_NEAR); } @@ -248,7 +248,7 @@ L("exit"); ret(8); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_SSE() { if(!m_sel.notest) { @@ -457,7 +457,7 @@ void GSDrawScanlineCodeGenerator::Init() } } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_SSE() { // steps -= 4; @@ -600,7 +600,7 @@ void GSDrawScanlineCodeGenerator::Step() } } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { @@ -648,7 +648,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.ztest) { - ReadPixel(xmm1, ebp); + ReadPixel_SSE(xmm1, ebp); if(m_sel.zwrite && m_sel.zpsm < 2) { @@ -698,7 +698,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) } } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_SSE() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -780,13 +780,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm2, xmm3); + Wrap_SSE(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - Wrap(xmm2); + Wrap_SSE(xmm2); } // xmm2 = uv0 @@ -853,7 +853,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_SSE(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -946,7 +946,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_SSE(1, 0); // GSVector4i mask = GSVector4i::x00ff(); @@ -957,7 +957,7 @@ 
void GSDrawScanlineCodeGenerator::SampleTexture() } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1020,7 +1020,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1131,7 +1131,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::SampleTextureLOD() +void GSDrawScanlineCodeGenerator::SampleTextureLOD_SSE() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -1140,7 +1140,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() push(ebp); - mov(ebp, (size_t)m_local.gd->tex); + mov(ebp, (size_t)m_local.gd->tex); if(m_sel.tlu) { @@ -1354,13 +1354,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_SSE(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_SSE(xmm2); } // xmm2 = uv0 @@ -1427,7 +1427,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_SSE(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -1520,7 +1520,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_SSE(1, 0); // GSVector4i mask = GSVector4i::x00ff(); @@ -1592,13 +1592,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_SSE(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_SSE(xmm2); } // xmm2 = uv0 @@ -1665,7 +1665,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c10 = 
addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 1); + ReadTexel_SSE(4, 1); // xmm6 = c00 // xmm4 = c01 @@ -1758,7 +1758,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 1); + ReadTexel_SSE(1, 1); // GSVector4i mask = GSVector4i::x00ff(); @@ -1781,7 +1781,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pop(ebp); } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv) { // xmm5 = minuv // xmm6 = maxuv @@ -1844,7 +1844,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1) { // xmm5 = minuv // xmm6 = maxuv @@ -1950,7 +1950,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_SSE() { if(!m_sel.fb) { @@ -2098,7 +2098,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() } } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_SSE() { if(m_sel.fwrite) { @@ -2111,7 +2111,7 @@ void GSDrawScanlineCodeGenerator::ReadMask() } } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::TestAlpha_SSE() { switch(m_sel.atst) { @@ -2186,7 +2186,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha() } } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_SSE() { if(!m_sel.fwrite) { @@ -2262,7 +2262,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() } } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::Fog_SSE() { if(!m_sel.fwrite || !m_sel.fge) { @@ -2283,7 +2283,7 @@ void GSDrawScanlineCodeGenerator::Fog() mix16(xmm6, xmm1, xmm0); } -void 
GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_SSE() { if(!m_sel.fb) { @@ -2301,10 +2301,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame() return; } - ReadPixel(xmm2, ebx); + ReadPixel_SSE(xmm2, ebx); } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE() { if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) { @@ -2347,7 +2347,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() alltrue(); } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_SSE() { if(m_sel.notest) { @@ -2394,7 +2394,7 @@ void GSDrawScanlineCodeGenerator::WriteMask() not(edx); } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() { if(!m_sel.zwrite) { @@ -2414,10 +2414,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); + WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_SSE() { if(!m_sel.fwrite) { @@ -2654,7 +2654,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() } } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_SSE() { if(!m_sel.fwrite) { @@ -2739,16 +2739,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame() bool fast = m_sel.rfb ? 
m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); + WritePixel_SSE(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg32& addr) { movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { if(m_sel.notest) { @@ -2759,10 +2759,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } else { - WritePixel(src, addr, 0, psm); - WritePixel(src, addr, 1, psm); - WritePixel(src, addr, 2, psm); - WritePixel(src, addr, 3, psm); + WritePixel_SSE(src, addr, 0, psm); + WritePixel_SSE(src, addr, 1, psm); + WritePixel_SSE(src, addr, 2, psm); + WritePixel_SSE(src, addr, 3, psm); } } else @@ -2791,22 +2791,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, test(mask, 0x03); je("@f"); - WritePixel(src, addr, 0, psm); + WritePixel_SSE(src, addr, 0, psm); L("@@"); test(mask, 0x0c); je("@f"); - WritePixel(src, addr, 1, psm); + WritePixel_SSE(src, addr, 1, psm); L("@@"); test(mask, 0x30); je("@f"); - WritePixel(src, addr, 2, psm); + WritePixel_SSE(src, addr, 2, psm); L("@@"); test(mask, 0xc0); je("@f"); - WritePixel(src, addr, 3, psm); + WritePixel_SSE(src, addr, 3, psm); L("@@"); } } @@ -2814,7 +2814,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, static const int s_offsets[4] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, uint8 i, 
int psm) { Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; @@ -2854,7 +2854,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) { // in // xmm5 = addr00 @@ -2896,7 +2896,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) for(int i = 0; i < pixels; i++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } @@ -2916,17 +2916,17 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) mov(ebx, ptr[&lod_i->u32[0]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm6, xmm5, 0); + ReadTexel_SSE(xmm6, xmm5, 0); psrldq(xmm5, 4); - ReadTexel(xmm4, xmm2, 0); + ReadTexel_SSE(xmm4, xmm2, 0); psrldq(xmm2, 4); mov(ebx, ptr[&lod_i->u32[1]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, xmm5, 0); + ReadTexel_SSE(xmm1, xmm5, 0); psrldq(xmm5, 4); - ReadTexel(xmm7, xmm2, 0); + ReadTexel_SSE(xmm7, xmm2, 0); psrldq(xmm2, 4); punpckldq(xmm6, xmm1); @@ -2935,16 +2935,16 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) mov(ebx, ptr[&lod_i->u32[2]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, xmm5, 0); + ReadTexel_SSE(xmm1, xmm5, 0); psrldq(xmm5, 4); - ReadTexel(xmm7, xmm2, 0); + ReadTexel_SSE(xmm7, xmm2, 0); psrldq(xmm2, 4); mov(ebx, ptr[&lod_i->u32[3]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm5, xmm5, 0); - ReadTexel(xmm2, xmm2, 0); + ReadTexel_SSE(xmm5, xmm5, 0); + ReadTexel_SSE(xmm2, xmm2, 0); punpckldq(xmm1, xmm5); punpckldq(xmm7, xmm2); @@ -2955,17 +2955,17 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) mov(ebx, ptr[&lod_i->u32[0]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, 
xmm0, 0); + ReadTexel_SSE(xmm1, xmm0, 0); psrldq(xmm0, 4); - ReadTexel(xmm5, xmm3, 0); + ReadTexel_SSE(xmm5, xmm3, 0); psrldq(xmm3, 4); mov(ebx, ptr[&lod_i->u32[1]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm2, xmm0, 0); + ReadTexel_SSE(xmm2, xmm0, 0); psrldq(xmm0, 4); - ReadTexel(xmm7, xmm3, 0); + ReadTexel_SSE(xmm7, xmm3, 0); psrldq(xmm3, 4); punpckldq(xmm1, xmm2); @@ -2974,16 +2974,16 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) mov(ebx, ptr[&lod_i->u32[2]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm2, xmm0, 0); + ReadTexel_SSE(xmm2, xmm0, 0); psrldq(xmm0, 4); - ReadTexel(xmm7, xmm3, 0); + ReadTexel_SSE(xmm7, xmm3, 0); psrldq(xmm3, 4); mov(ebx, ptr[&lod_i->u32[3]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm0, xmm0, 0); - ReadTexel(xmm3, xmm3, 0); + ReadTexel_SSE(xmm0, xmm0, 0); + ReadTexel_SSE(xmm3, xmm3, 0); punpckldq(xmm2, xmm0); punpckldq(xmm7, xmm3); @@ -2998,13 +2998,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) mov(ebx, ptr[&lod_i->u32[0]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm6, xmm5, 0); + ReadTexel_SSE(xmm6, xmm5, 0); psrldq(xmm5, 4); // shuffle instead? 
(1 2 3 0 ~ rotation) mov(ebx, ptr[&lod_i->u32[1]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, xmm5, 0); + ReadTexel_SSE(xmm1, xmm5, 0); psrldq(xmm5, 4); punpckldq(xmm6, xmm1); @@ -3012,13 +3012,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) mov(ebx, ptr[&lod_i->u32[2]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, xmm5, 0); + ReadTexel_SSE(xmm1, xmm5, 0); psrldq(xmm5, 4); mov(ebx, ptr[&lod_i->u32[3]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm4, xmm5, 0); + ReadTexel_SSE(xmm4, xmm5, 0); // psrldq(xmm5, 4); punpckldq(xmm1, xmm4); @@ -3044,7 +3044,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) { for(int j = 0; j < 4; j++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } @@ -3058,15 +3058,15 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) const Xmm& temp1 = Xmm(t[i * 2 + 0]); const Xmm& temp2 = Xmm(t[i * 2 + 1]); - ReadTexel(dst, addr, 0); + ReadTexel_SSE(dst, addr, 0); psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) - ReadTexel(temp1, addr, 0); + ReadTexel_SSE(temp1, addr, 0); psrldq(addr, 4); punpckldq(dst, temp1); - ReadTexel(temp1, addr, 0); + ReadTexel_SSE(temp1, addr, 0); psrldq(addr, 4); - ReadTexel(temp2, addr, 0); + ReadTexel_SSE(temp2, addr, 0); // psrldq(addr, 4); punpckldq(temp1, temp2); @@ -3077,7 +3077,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i) { const Address& src = m_sel.tlu ? 
ptr[edx + eax * 4] : ptr[ebx + eax * 4]; From 8fd46e96aae48832b99b09b0fe60c5d18c4c6699 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 19 Nov 2016 15:11:16 +0100 Subject: [PATCH 18/20] gsdx sw JIT: dynamically select ISA for SetupPrim --- plugins/GSdx/GSSetupPrimCodeGenerator.cpp | 13 +++++++++++++ plugins/GSdx/GSSetupPrimCodeGenerator.h | 13 +++++++++++++ .../GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp | 16 ++++++++-------- plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp | 16 ++++++++-------- .../GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp | 18 +++++++++--------- plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp | 18 +++++++++--------- 6 files changed, 60 insertions(+), 34 deletions(-) diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp index 37427898f8..3735ec9a41 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp @@ -22,6 +22,8 @@ #include "stdafx.h" #include "GSSetupPrimCodeGenerator.h" +using namespace Xbyak; + #if _M_SSE >= 0x501 GSVector8 GSSetupPrimCodeGenerator::m_shift[9]; #else @@ -75,3 +77,14 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void Generate(); } + +#if _M_SSE >= 0x501 +#else +void GSSetupPrimCodeGenerator::Generate() +{ + if(g_cpu.has(util::Cpu::tAVX)) + Generate_AVX(); + else + Generate_SSE(); +} +#endif diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.h b/plugins/GSdx/GSSetupPrimCodeGenerator.h index d4b2c1106f..e07eb00334 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.h +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.h @@ -23,6 +23,7 @@ #include "GSScanlineEnvironment.h" #include "GSFunctionMap.h" +#include "GSUtil.h" class GSSetupPrimCodeGenerator : public GSCodeGenerator { @@ -35,9 +36,21 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator void Generate(); +#if _M_SSE < 0x501 + void Generate_SSE(); + void Depth_SSE(); + void Texture_SSE(); + void Color_SSE(); + + void Generate_AVX(); + void 
Depth_AVX(); + void Texture_AVX(); + void Color_AVX(); +#else void Depth(); void Texture(); void Color(); +#endif public: GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp index f42d4feb1b..f8f88022af 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp @@ -23,11 +23,11 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_AVX() { #ifdef _WIN64 sub(rsp, 8 + 2 * 16); @@ -48,11 +48,11 @@ void GSSetupPrimCodeGenerator::Generate() } } - Depth(); + Depth_AVX(); - Texture(); + Texture_AVX(); - Color(); + Color_AVX(); #ifdef _WIN64 vmovdqa(xmm6, ptr[rsp + 0]); @@ -64,7 +64,7 @@ void GSSetupPrimCodeGenerator::Generate() ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_AVX() { if(!m_en.z && !m_en.f) { @@ -158,7 +158,7 @@ void GSSetupPrimCodeGenerator::Depth() } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_AVX() { if(!m_en.t) { @@ -234,7 +234,7 @@ void GSSetupPrimCodeGenerator::Texture() } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_AVX() { if(!m_en.c) { diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp index 238dd86bdc..b4169e766f 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp @@ -23,11 +23,11 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) using namespace 
Xbyak; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_SSE() { #ifdef _WIN64 sub(rsp, 8 + 2 * 16); @@ -48,11 +48,11 @@ void GSSetupPrimCodeGenerator::Generate() } } - Depth(); + Depth_SSE(); - Texture(); + Texture_SSE(); - Color(); + Color_SSE(); #ifdef _WIN64 vmovdqa(xmm6, ptr[rsp + 0]); @@ -64,7 +64,7 @@ void GSSetupPrimCodeGenerator::Generate() ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_SSE() { if(!m_en.z && !m_en.f) { @@ -163,7 +163,7 @@ void GSSetupPrimCodeGenerator::Depth() } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_SSE() { if(!m_en.t) { @@ -242,7 +242,7 @@ void GSSetupPrimCodeGenerator::Texture() } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_SSE() { if(!m_en.c) { diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index 21a7d47c97..f75ea3b6d2 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -23,7 +23,7 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; @@ -32,7 +32,7 @@ static const int _vertex = _args + 4; static const int _index = _args + 8; static const int _dscan = _args + 12; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_AVX() { if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { @@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate() } } - Depth(); + Depth_AVX(); - Texture(); + Texture_AVX(); - Color(); + Color_AVX(); ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_AVX() { if(!m_en.z && !m_en.f) { @@ -144,7 +144,7 @@ void GSSetupPrimCodeGenerator::Depth() } } -void 
GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_AVX() { if(!m_en.t) { @@ -213,7 +213,7 @@ void GSSetupPrimCodeGenerator::Texture() } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_AVX() { if(!m_en.c) { @@ -339,4 +339,4 @@ void GSSetupPrimCodeGenerator::Color() } } -#endif \ No newline at end of file +#endif diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 008a12a8f5..951788fa09 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -23,7 +23,7 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; @@ -32,7 +32,7 @@ static const int _vertex = _args + 4; static const int _index = _args + 8; static const int _dscan = _args + 12; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_SSE() { if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { @@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate() } } - Depth(); + Depth_SSE(); - Texture(); + Texture_SSE(); - Color(); + Color_SSE(); ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_SSE() { if(!m_en.z && !m_en.f) { @@ -149,7 +149,7 @@ void GSSetupPrimCodeGenerator::Depth() } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_SSE() { if(!m_en.t) { @@ -221,7 +221,7 @@ void GSSetupPrimCodeGenerator::Texture() } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_SSE() { if(!m_en.c) { @@ -354,4 +354,4 @@ void GSSetupPrimCodeGenerator::Color() } } -#endif \ No newline at end of file +#endif From cc6d193e1d7c11a14cff8f510f45ed3ce4633c1c Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 19 Nov 2016 15:01:14 +0100 
Subject: [PATCH 19/20] gsdx: Relax SSE/AVX constraint on 64 bits The JIT will automatically select the best ISA (only AVX1 so far) --- plugins/GSdx/GSUtil.cpp | 8 ++++---- plugins/GSdx/stdafx.h | 31 ++++++++----------------------- 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 797783ae22..030c6d4660 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -212,16 +212,16 @@ bool GSUtil::CheckSSE() ISA checks[] = { {Xbyak::util::Cpu::tSSE2, "SSE2"}, -#if _M_SSE >= 0x301 || defined(_M_AMD64) +#if _M_SSE >= 0x301 {Xbyak::util::Cpu::tSSSE3, "SSSE3"}, #endif -#if _M_SSE >= 0x401 || defined(_M_AMD64) +#if _M_SSE >= 0x401 {Xbyak::util::Cpu::tSSE41, "SSE41"}, #endif -#if _M_SSE >= 0x402 || defined(_M_AMD64) +#if _M_SSE >= 0x402 {Xbyak::util::Cpu::tSSE42, "SSE42"}, #endif -#if _M_SSE >= 0x500 || defined(_M_AMD64) +#if _M_SSE >= 0x500 {Xbyak::util::Cpu::tAVX, "AVX1"}, #endif #if _M_SSE >= 0x501 diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 9aebd6f332..b4e618f64c 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -266,9 +266,10 @@ using namespace stdext; #define ASSERT assert #ifdef __x86_64__ - #define _M_AMD64 +#endif +#ifdef _M_AMD64 // Yeah let use mips naming ;) #ifdef _WIN64 #define a0 rcx @@ -285,42 +286,26 @@ using namespace stdext; #define t0 r8 #define t1 r9 #endif - #endif // sse -#if defined(__GNUC__) && !defined(__x86_64__) +#if defined(__GNUC__) // Convert gcc see define into GSdx (windows) define #if defined(__AVX2__) - #define _M_SSE 0x501 + #if defined(__x86_64__) + #define _M_SSE 0x500 // TODO + #else + #define _M_SSE 0x501 + #endif #elif defined(__AVX__) #define _M_SSE 0x500 -#elif defined(__SSE4_2__) - #define _M_SSE 0x402 #elif defined(__SSE4_1__) #define _M_SSE 0x401 #elif defined(__SSSE3__) #define _M_SSE 0x301 #elif defined(__SSE2__) #define _M_SSE 0x200 -#elif defined(__SSE__) - #define _M_SSE 0x100 -#endif - -#elif defined(__GNUC__) 
- -#if defined(__AVX2__) -// FIXME until code is done - #define _M_SSE 0x500 -#elif defined(__AVX__) - #define _M_SSE 0x500 -#elif defined(__SSE4_1__) - #define _M_SSE 0x401 -#else -// FIXME won't likely be supported but allow to compile the code -// Note: from steam survey SSE4.1 is supported by 87% (end of 2015) - #define _M_SSE 0x200 #endif #endif From ef255024919c40e4f4d7d9edac7340e268f43136 Mon Sep 17 00:00:00 2001 From: Jonathan Li Date: Sat, 19 Nov 2016 16:39:34 +0100 Subject: [PATCH 20/20] gsdx build: don't exclude AVX files. Thanks for the patch :) --- plugins/GSdx/GSdx.vcxproj | 50 ++++++++------------------------------- 1 file changed, 10 insertions(+), 40 deletions(-) diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj index 46db6228ec..b44b3374d1 100644 --- a/plugins/GSdx/GSdx.vcxproj +++ b/plugins/GSdx/GSdx.vcxproj @@ -146,26 +146,11 @@ - - true - true - - - true - true - - - true - true - - - true - true - - - true - true - + + + + + @@ -187,26 +172,11 @@ - - true - true - - - true - true - - - true - true - - - true - true - - - true - true - + + + + +