diff --git a/build.sh b/build.sh index 43280d586f..23a758223b 100755 --- a/build.sh +++ b/build.sh @@ -80,6 +80,7 @@ for ARG in "$@"; do --cross-multilib ) flags="$flags -DCMAKE_TOOLCHAIN_FILE=$toolfile"; useCross=1; ;; --no-cross-multilib ) useCross=0; ;; --coverity ) CoverityBuild=1; cleanBuild=1; ;; + --vtune ) flags="$flags -DUSE_VTUNE=TRUE" ;; -D* ) flags="$flags $ARG" ;; *) diff --git a/cmake/BuildParameters.cmake b/cmake/BuildParameters.cmake index e5699ef608..05895c5129 100644 --- a/cmake/BuildParameters.cmake +++ b/cmake/BuildParameters.cmake @@ -28,6 +28,8 @@ if(DISABLE_BUILD_DATE OR openSUSE) add_definitions(-DDISABLE_BUILD_DATE) endif() +option(USE_VTUNE "Plug VTUNE to profile GSdx JIT.") + #------------------------------------------------------------------------------- # Graphical option #------------------------------------------------------------------------------- @@ -234,7 +236,7 @@ elseif(${PCSX2_TARGET_ARCHITECTURES} MATCHES "x86_64") if (USE_ICC) set(ARCH_FLAG "-msse2") else() - set(ARCH_FLAG "-msse -msse2 -mfxsr") + set(ARCH_FLAG "-msse -msse2 -mfxsr -mssse3 -msse4.1 -mavx") endif() else() #set(ARCH_FLAG "-march=native -fabi-version=6") diff --git a/plugins/GSdx/CMakeLists.txt b/plugins/GSdx/CMakeLists.txt index 15c93e0794..09c39f5344 100644 --- a/plugins/GSdx/CMakeLists.txt +++ b/plugins/GSdx/CMakeLists.txt @@ -195,15 +195,18 @@ set(GSdxFinalLibs ) if(EGL_API AND EGL_FOUND) - set(GSdxFinalLibs ${GSdxFinalLibs} - ${EGL_LIBRARIES} - ) + set(GSdxFinalLibs ${GSdxFinalLibs} ${EGL_LIBRARIES}) endif() if(LIBLZMA_FOUND) - set(GSdxFinalLibs ${GSdxFinalLibs} - ${LIBLZMA_LIBRARIES} - ) + set(GSdxFinalLibs ${GSdxFinalLibs} ${LIBLZMA_LIBRARIES}) +endif() + +if(USE_VTUNE) + set(GSdxFinalFlags ${GSdxFinalFlags} -DENABLE_VTUNE) + include_directories("$ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/include") + set(GSdxFinalLibs ${GSdxFinalLibs} $ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/lib64/libjitprofiling.a) + set(GSdxFinalLibs ${GSdxFinalLibs} 
$ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/lib32/libjitprofiling.a) endif() # Generate Glsl header file. Protect with REBUILD_SHADER to avoid build-dependency on PERL diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index 90c941638e..8307e9e99c 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -22,6 +22,17 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" +#if _M_SSE >= 0x501 +#else +void GSDrawScanlineCodeGenerator::Generate() +{ + if(g_cpu.has(util::Cpu::tAVX)) + Generate_AVX(); + else + Generate_SSE(); +} +#endif + #if _M_SSE >= 0x501 alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] = @@ -100,6 +111,9 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key { m_sel.key = key; + if(m_sel.breakpoint) + int3(); + Generate(); } @@ -180,196 +194,222 @@ void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a) void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift) { - #if _M_SSE >= 0x500 - - if(shift == 0) + if(g_cpu.has(util::Cpu::tAVX)) { - vpmulhrsw(a, f); + if(shift == 0) + { + vpmulhrsw(a, f); + } + else + { + vpsllw(a, shift + 1); + vpmulhw(a, f); + } + } else { - vpsllw(a, shift + 1); - vpmulhw(a, f); + if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3)) + { + pmulhrsw(a, f); + } + else + { + psllw(a, shift + 1); + pmulhw(a, f); + } } - - #else - - if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3)) - { - pmulhrsw(a, f); - } - else - { - psllw(a, shift + 1); - pmulhw(a, f); - } - - #endif } void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift) { - #if _M_SSE >= 0x500 - - vpsubw(a, b); - modulate16(a, f, shift); - vpaddw(a, b); - - #else - - psubw(a, b); - modulate16(a, f, shift); - paddw(a, b); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpsubw(a, b); + modulate16(a, f, shift); + vpaddw(a, b); + } + else + { + 
psubw(a, b); + modulate16(a, f, shift); + paddw(a, b); + } } void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f) { - #if _M_SSE >= 0x500 - - vpsubw(a, b); - vpmullw(a, f); - vpsraw(a, 4); - vpaddw(a, b); - - #else - - psubw(a, b); - pmullw(a, f); - psraw(a, 4); - paddw(a, b); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpsubw(a, b); + vpmullw(a, f); + vpsraw(a, 4); + vpaddw(a, b); + } + else + { + psubw(a, b); + pmullw(a, f); + psraw(a, 4); + paddw(a, b); + } } void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp) { - #if _M_SSE >= 0x500 - - vpblendw(a, b, 0xaa); - - #elif _M_SSE >= 0x401 - - pblendw(a, b, 0xaa); - - #else - - pcmpeqd(temp, temp); - psrld(temp, 16); - pand(a, temp); - pandn(temp, b); - por(a, temp); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpblendw(a, b, 0xaa); + } + else + { + if(g_cpu.has(util::Cpu::tSSE41)) + { + pblendw(a, b, 0xaa); + } + else + { + pcmpeqd(temp, temp); + psrld(temp, 16); + pand(a, temp); + pandn(temp, b); + por(a, temp); + } + } } void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) { - #if _M_SSE >= 0x500 - - vpackuswb(a, a); - vpmovzxbw(a, a); - - #elif _M_SSE >= 0x401 - - packuswb(a, a); - pmovzxbw(a, a); - - #else - - packuswb(a, a); - pxor(temp, temp); - punpcklbw(a, temp); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpackuswb(a, a); + vpmovzxbw(a, a); + } + else + { + if(g_cpu.has(util::Cpu::tSSE41)) + { + packuswb(a, a); + pmovzxbw(a, a); + } + else + { + packuswb(a, a); + pxor(temp, temp); + punpcklbw(a, temp); + } + } } void GSDrawScanlineCodeGenerator::alltrue() { - #if _M_SSE >= 0x500 - - vpmovmskb(eax, xmm7); - cmp(eax, 0xffff); - je("step", T_NEAR); - - #else - - pmovmskb(eax, xmm7); - cmp(eax, 0xffff); - je("step", T_NEAR); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpmovmskb(eax, xmm7); + cmp(eax, 0xffff); + je("step", T_NEAR); + } + else + { + pmovmskb(eax, xmm7); + cmp(eax, 0xffff); + je("step", 
T_NEAR); + } } void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask) { - #if _M_SSE >= 0x500 - - vpand(b, mask); - vpandn(mask, a); - vpor(a, b, mask); - - #else - - pand(b, mask); - pandn(mask, a); - por(b, mask); - movdqa(a, b); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpand(b, mask); + vpandn(mask, a); + vpor(a, b, mask); + } + else + { + pand(b, mask); + pandn(mask, a); + por(b, mask); + movdqa(a, b); + } } void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask) { - #if _M_SSE >= 0x500 - - vpand(b, mask); - vpandn(mask, a); - vpor(b, mask); - - #else - - pand(b, mask); - pandn(mask, a); - por(b, mask); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + vpand(b, mask); + vpandn(mask, a); + vpor(b, mask); + } + else + { + pand(b, mask); + pandn(mask, a); + por(b, mask); + } } void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) { - #if _M_SSE >= 0x500 - - vpblendvb(a, a, b, xmm0); - - #elif _M_SSE >= 0x401 - - pblendvb(a, b); - - #else - - blend(a, b, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + vpblendvb(a, a, b, xmm0); + else if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(a, b); + else + blend(a, b, xmm0); } void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) { - #if _M_SSE >= 0x500 - - vpblendvb(b, a, b, xmm0); + if(g_cpu.has(util::Cpu::tAVX)) + { + vpblendvb(b, a, b, xmm0); + } + else if(g_cpu.has(util::Cpu::tSSE41)) + { + pblendvb(a, b); + movdqa(b, a); + } + else + { + blendr(b, a, xmm0); + } +} - #elif _M_SSE >= 0x401 +void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src) +{ + // l = src & 0xFF; (1 left shift + 1 right shift) + // h = (src >> 8) & 0xFF; (1 right shift) - pblendvb(a, b); - movdqa(b, a); - - #else - - blendr(b, a, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tAVX)) + { + if (src == h) { + vpsllw(l, src, 8); + vpsrlw(h, 8); + } else if (src == l) { + vpsrlw(h, src, 8); + vpsllw(l, 8); + } else { + 
vpsllw(l, src, 8); + vpsrlw(h, src, 8); + } + vpsrlw(l, 8); + } + else + { + if (src == h) { + movdqa(l, src); + } else if (src == l) { + movdqa(h, src); + } else { + movdqa(l, src); + movdqa(h, src); + } + psllw(l, 8); + psrlw(l, 8); + psrlw(h, 8); + } } #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index c737e2f0d9..a26b970309 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -23,9 +23,16 @@ #include "GSScanlineEnvironment.h" #include "GSFunctionMap.h" +#include "GSUtil.h" using namespace Xbyak; +#if defined(_M_AMD64) || defined(_WIN64) +#define RegLong Reg64 +#else +#define RegLong Reg32 +#endif + class GSDrawScanlineCodeGenerator : public GSCodeGenerator { void operator = (const GSDrawScanlineCodeGenerator&); @@ -57,17 +64,9 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void WriteZBuf(); void AlphaBlend(); void WriteFrame(); - - #if defined(_M_AMD64) || defined(_WIN64) - void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr); - void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm); - #else - void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr); - void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm); - #endif - + void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr); + void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz); + void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm); void ReadTexel(int pixels, int mip_offset = 0); void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i); @@ -84,39 +83,59 @@ class 
GSDrawScanlineCodeGenerator : public GSCodeGenerator #else - void Init(); - void Step(); - void TestZ(const Xmm& temp1, const Xmm& temp2); - void SampleTexture(); - void Wrap(const Xmm& uv0); - void Wrap(const Xmm& uv0, const Xmm& uv1); - void SampleTextureLOD(); - void WrapLOD(const Xmm& uv0); - void WrapLOD(const Xmm& uv0, const Xmm& uv1); - void AlphaTFX(); - void ReadMask(); - void TestAlpha(); - void ColorTFX(); - void Fog(); - void ReadFrame(); - void TestDestAlpha(); - void WriteMask(); - void WriteZBuf(); - void AlphaBlend(); - void WriteFrame(); + void Generate_SSE(); + void Init_SSE(); + void Step_SSE(); + void TestZ_SSE(const Xmm& temp1, const Xmm& temp2); + void SampleTexture_SSE(); + void Wrap_SSE(const Xmm& uv0); + void Wrap_SSE(const Xmm& uv0, const Xmm& uv1); + void SampleTextureLOD_SSE(); + void WrapLOD_SSE(const Xmm& uv0); + void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1); + void AlphaTFX_SSE(); + void ReadMask_SSE(); + void TestAlpha_SSE(); + void ColorTFX_SSE(); + void Fog_SSE(); + void ReadFrame_SSE(); + void TestDestAlpha_SSE(); + void WriteMask_SSE(); + void WriteZBuf_SSE(); + void AlphaBlend_SSE(); + void WriteFrame_SSE(); + void ReadPixel_SSE(const Xmm& dst, const RegLong& addr); + void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz); + void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm); + void ReadTexel_SSE(int pixels, int mip_offset = 0); + void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i); - #if defined(_M_AMD64) || defined(_WIN64) - void ReadPixel(const Xmm& dst, const Reg64& addr); - void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm); - #else - void ReadPixel(const Xmm& dst, const Reg32& addr); - void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const 
Reg32& addr, uint8 i, int psm); - #endif - - void ReadTexel(int pixels, int mip_offset = 0); - void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i); + void Generate_AVX(); + void Init_AVX(); + void Step_AVX(); + void TestZ_AVX(const Xmm& temp1, const Xmm& temp2); + void SampleTexture_AVX(); + void Wrap_AVX(const Xmm& uv0); + void Wrap_AVX(const Xmm& uv0, const Xmm& uv1); + void SampleTextureLOD_AVX(); + void WrapLOD_AVX(const Xmm& uv0); + void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1); + void AlphaTFX_AVX(); + void ReadMask_AVX(); + void TestAlpha_AVX(); + void ColorTFX_AVX(); + void Fog_AVX(); + void ReadFrame_AVX(); + void TestDestAlpha_AVX(); + void WriteMask_AVX(); + void WriteZBuf_AVX(); + void AlphaBlend_AVX(); + void WriteFrame_AVX(); + void ReadPixel_AVX(const Xmm& dst, const RegLong& addr); + void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz); + void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm); + void ReadTexel_AVX(int pixels, int mip_offset = 0); + void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i); void modulate16(const Xmm& a, const Operand& f, int shift); void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift); @@ -128,6 +147,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void blendr(const Xmm& b, const Xmm& a, const Xmm& mask); void blend8(const Xmm& a, const Xmm& b); void blend8r(const Xmm& b, const Xmm& a); + void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src); #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index ddff1eeef0..ea227673e2 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -23,14 +23,48 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +// Ease the reading of the code 
+#define _m_local r11 +#define _m_local__gd r12 +#define _m_local__gd__vm r13 +#define _m_local__gd__clut r14 +#define _m_local__gd__tex r15 +// More pretty name +#define _z xmm8 +#define _f xmm9 +#define _s xmm10 +#define _t xmm11 +#define _q xmm12 +#define _f_rb xmm13 +#define _f_ga xmm14 +#define _test xmm15 +// Extra bonus +#define _rb xmm2 +#define _ga xmm3 +#define _fm xmm4 +#define _zm xmm5 +#define _fd xmm6 -#error TODO +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) -void GSDrawScanlineCodeGenerator::Generate() +#ifdef _WIN64 +#else +static const int _rz_rbx = -8 * 1; +static const int _rz_r12 = -8 * 2; +static const int _rz_r13 = -8 * 3; +static const int _rz_r14 = -8 * 4; +static const int _rz_r15 = -8 * 5; +static const int _rz_zs = -8 * 8; +static const int _rz_zd = -8 * 10; +static const int _rz_cov = -8 * 12; +#endif + +void GSDrawScanlineCodeGenerator::Generate_AVX() { - // TODO: on linux/mac rsi, rdi, xmm6-xmm15 are all caller saved + bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; + bool need_clut = need_tex && m_sel.tlu; +#ifdef _WIN64 push(rbx); push(rsi); push(rdi); @@ -39,26 +73,42 @@ void GSDrawScanlineCodeGenerator::Generate() push(r13); sub(rsp, 8 + 10 * 16); - + for(int i = 6; i < 16; i++) { vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); } +#else + // No reservation on the stack as a red zone is available + push(rbp); + mov(ptr[rsp + _rz_rbx], rbx); + mov(ptr[rsp + _rz_r12], r12); + mov(ptr[rsp + _rz_r13], r13); + if(need_clut) + mov(ptr[rsp + _rz_r14], r14); + if(need_tex) + mov(ptr[rsp + _rz_r15], r15); +#endif mov(r10, (size_t)&m_test[0]); - mov(r11, (size_t)&m_local); - mov(r12, (size_t)m_local.gd); - mov(r13, (size_t)m_local.gd->vm); + mov(_m_local, (size_t)&m_local); + mov(_m_local__gd, ptr[_m_local + offsetof(GSScanlineLocalData, gd)]); - Init(); + mov(_m_local__gd__vm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, vm)]); + if(need_clut) + mov(_m_local__gd__clut, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, 
clut)]); + if(need_tex) + mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]); - // rcx = steps - // rsi = fza_base - // rdi = fza_offset + Init_AVX(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset // r10 = &m_test[0] - // r11 = &m_local - // r12 = m_local->gd - // r13 = m_local->gd.vm + // _m_local = &m_local + // _m_local__gd = m_local->gd + // _m_local__gd__vm = m_local->gd.vm // xmm7 = vf (sprite && ltf) // xmm8 = z // xmm9 = f @@ -66,7 +116,7 @@ void GSDrawScanlineCodeGenerator::Generate() // xmm11 = t // xmm12 = q // xmm13 = rb - // xmm14 = ga + // xmm14 = ga // xmm15 = test if(!m_sel.edge) @@ -76,30 +126,30 @@ void GSDrawScanlineCodeGenerator::Generate() L("loop"); - TestZ(xmm5, xmm6); + TestZ_AVX(xmm5, xmm6); // ebp = za if(m_sel.mmin) { - SampleTextureLOD(); + SampleTextureLOD_AVX(); } else { - SampleTexture(); + SampleTexture_AVX(); } // ebp = za // xmm2 = rb // xmm3 = ga - AlphaTFX(); + AlphaTFX_AVX(); // ebp = za // xmm2 = rb // xmm3 = ga - ReadMask(); + ReadMask_AVX(); // ebp = za // xmm2 = rb @@ -107,7 +157,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - TestAlpha(); + TestAlpha_AVX(); // ebp = za // xmm2 = rb @@ -115,7 +165,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - ColorTFX(); + ColorTFX_AVX(); // ebp = za // xmm2 = rb @@ -123,7 +173,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - Fog(); + Fog_AVX(); // ebp = za // xmm2 = rb @@ -131,7 +181,7 @@ L("loop"); // xmm4 = fm // xmm5 = zm - ReadFrame(); + ReadFrame_AVX(); // ebx = fa // ebp = za @@ -141,7 +191,7 @@ L("loop"); // xmm5 = zm // xmm6 = fd - TestDestAlpha(); + TestDestAlpha_AVX(); // ebx = fa // ebp = za @@ -151,7 +201,7 @@ L("loop"); // xmm5 = zm // xmm6 = fd - WriteMask(); + WriteMask_AVX(); // ebx = fa // edx = fzm @@ -162,7 +212,7 @@ L("loop"); // xmm5 = zm // xmm6 = fd - WriteZBuf(); + WriteZBuf_AVX(); // ebx = fa // edx = fzm @@ -171,7 +221,7 @@ L("loop"); // xmm4 = fm // xmm6 = fd - AlphaBlend(); + AlphaBlend_AVX(); // ebx = fa // edx = fzm @@ -180,7 
+230,7 @@ L("loop"); // xmm4 = fm // xmm6 = fd - WriteFrame(); + WriteFrame_AVX(); L("step"); @@ -188,17 +238,18 @@ L("step"); if(!m_sel.edge) { - test(rcx, rcx); + test(a0, a0); jle("exit", T_NEAR); - Step(); + Step_AVX(); jmp("loop", T_NEAR); } L("exit"); +#ifdef _WIN64 for(int i = 6; i < 16; i++) { vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); @@ -212,77 +263,101 @@ L("exit"); pop(rdi); pop(rsi); pop(rbx); +#else + mov(rbx, ptr[rsp + _rz_rbx]); + mov(r12, ptr[rsp + _rz_r12]); + mov(r13, ptr[rsp + _rz_r13]); + if(need_clut) + mov(r14, ptr[rsp + _rz_r14]); + if(need_tex) + mov(r15, ptr[rsp + _rz_r15]); + pop(rbp); +#endif ret(); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_AVX() { - // int skip = left & 3; + if(!m_sel.notest) + { + // int skip = left & 3; - mov(rbx, rdx); - and(rdx, 3); + mov(ebx, a1.cvt32()); + and(a1.cvt32(), 3); - // left -= skip; + // left -= skip; - sub(rbx, rdx); + sub(ebx, a1.cvt32()); - // int steps = pixels + skip - 4; + // int steps = pixels + skip - 4; - lea(rcx, ptr[rcx + rdx - 4]); + lea(a0, ptr[a0 + a1 - 4]); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(rdx, 4); + shl(a1.cvt32(), 4); // * sizeof(m_test[0]) - vmovdqa(xmm15, ptr[rdx + r10]); + vmovdqa(_test, ptr[a1 + r10]); - mov(rax, rcx); - sar(rax, 63); - and(rax, rcx); - shl(rax, 4); + mov(rax, a0); + sar(rax, 63); // GH: 63 to extract the sign of the register + and(rax, a0); + shl(rax, 4); // * sizeof(m_test[0]) + + vpor(_test, ptr[rax + r10 + 7 * 16]); + } + else + { + mov(ebx, a1.cvt32()); // left + xor(a1.cvt32(), a1.cvt32()); // skip + lea(a0, ptr[a0 - 4]); // steps + } + + // a0 = steps + // a1 = skip + // rbx = left - vpor(xmm15, ptr[rax + r10 + 7 * 16]); // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - mov(rax, (size_t)m_local.gd->fzbr); - lea(rsi, ptr[rax + r8 * 8]); + mov(rax, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, 
fzbr)]); + lea(t1, ptr[rax + a2 * 8]); // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - mov(rax, (size_t)m_local.gd->fzbc); - lea(rdi, ptr[rax + rbx * 2]); + mov(rax, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fzbc)]); + lea(t0, ptr[rax + rbx * 2]); if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { - // edx = &m_local.d[skip] + // a1 = &m_local.d[skip] // note a1 was (skip << 4) - lea(rdx, ptr[rdx * 8 + r11 + offsetof(GSScanlineLocalData, d)]); + lea(a1, ptr[a1 * 8 + _m_local + offsetof(GSScanlineLocalData, d)]); } if(m_sel.prim != GS_SPRITE_CLASS) { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { - vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p + vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p if(m_sel.fwrite && m_sel.fge) { // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - vcvttps2dq(xmm9, xmm0); - vpshufhw(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm9, ptr[rdx + 16 * 6]); + vcvttps2dq(_f, xmm0); + vpshufhw(_f, _f, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(_f, _f, _MM_SHUFFLE(2, 2, 2, 2)); + vpaddw(_f, ptr[a1 + 16 * 6]); } if(m_sel.zb) { // z = vp.zzzz() + m_local.d[skip].z; - vshufps(xmm8, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm8, ptr[rdx]); + vshufps(_z, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vaddps(_z, ptr[a1]); } } } @@ -290,28 +365,39 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.ztest) { - vmovdqa(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, p.z)]); + vmovdqa(_z, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]); } + + if(m_sel.fwrite && m_sel.fge) + vmovdqa(_f, ptr[_m_local + offsetof(GSScanlineLocalData, p.f)]); } if(m_sel.fb) { if(m_sel.edge || m_sel.tfx != TFX_NONE) { - vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t + vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t } if(m_sel.edge) { + // m_local.temp.cov = 
GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); + vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); vpsrlw(xmm1, 9); - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.cov)], xmm1); +#ifdef _WIN64 + vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)], xmm1); +#else + vmovdqa(ptr[rsp + _rz_cov], xmm1); +#endif } if(m_sel.tfx != TFX_NONE) { + // a1 = &m_local.d[skip] + if(m_sel.fst) { // GSVector4i vti(vt); @@ -321,23 +407,20 @@ void GSDrawScanlineCodeGenerator::Init() // s = vti.xxxx() + m_local.d[skip].s; // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vpshufd(_s, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(_t, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]); + vpaddd(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) { - vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]); + vpaddd(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); } - else + else if(m_sel.ltf) { - if(m_sel.ltf) - { - vpshuflw(xmm6, xmm11, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); - } + vpshuflw(xmm7, _t, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm7, 12); } } else @@ -346,13 +429,13 @@ void GSDrawScanlineCodeGenerator::Init() // t = vt.yyyy() + m_local.d[skip].t; // q = vt.zzzz() + m_local.d[skip].q; - vshufps(xmm10, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vshufps(_s, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(_t, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(_q, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]); - vaddps(xmm11, ptr[rdx + 
offsetof(GSScanlineLocalData::skip, t)]); - vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]); + vaddps(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); + vaddps(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); + vaddps(_q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]); } } @@ -362,7 +445,7 @@ void GSDrawScanlineCodeGenerator::Init() { // GSVector4i vc = GSVector4i(v.c); - vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c + vcvttps2dq(xmm0, ptr[a3 + offsetof(GSVertexSW, c)]); // v.c // vc = vc.upl16(vc.zwxy()); @@ -372,30 +455,40 @@ void GSDrawScanlineCodeGenerator::Init() // rb = vc.xxxx().add16(m_local.d[skip].rb); // ga = vc.zzzz().add16(m_local.d[skip].ga); - vpshufd(xmm13, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(_f_rb, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(_f_ga, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm13, ptr[rdx + offsetof(GSScanlineLocalData::skip, rb)]); - vpaddw(xmm14, ptr[rdx + offsetof(GSScanlineLocalData::skip, ga)]); + vpaddw(_f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]); + vpaddw(_f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]); } else { - vmovdqa(xmm13, ptr[r11 + offsetof(GSScanlineLocalData, c.rb)]); - vmovdqa(xmm14, ptr[r11 + offsetof(GSScanlineLocalData, c.ga)]); + vmovdqa(_f_rb, ptr[_m_local + offsetof(GSScanlineLocalData, c.rb)]); + vmovdqa(_f_ga, ptr[_m_local + offsetof(GSScanlineLocalData, c.ga)]); } + + vmovdqa(_rb, _f_rb); + vmovdqa(_ga, _f_ga); } } + + + if(m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) + { + // On linux, a2 is edx which will be used for fzm + mov(a1, a2); + } } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_AVX() { // steps -= 4; - sub(rcx, 4); + sub(a0, 4); // fza_offset++; - add(rdi, 8); + add(t0, 8); if(m_sel.prim != GS_SPRITE_CLASS) { @@ -403,14 +496,14 @@ void GSDrawScanlineCodeGenerator::Step() if(m_sel.zb) { - vaddps(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, 
d4.z)]); + vaddps(_z, ptr[_m_local + offsetof(GSScanlineLocalData, d4.z)]); } // f = f.add16(m_local.d4.f); if(m_sel.fwrite && m_sel.fge) { - vpaddw(xmm9, ptr[r11 + offsetof(GSScanlineLocalData, d4.f)]); + vpaddw(_f, ptr[_m_local + offsetof(GSScanlineLocalData, d4.f)]); } } else @@ -431,15 +524,15 @@ void GSDrawScanlineCodeGenerator::Step() // si += st.xxxx(); // if(!sprite) ti += st.yyyy(); - vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]); + vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.stq)]); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(xmm10, xmm1); + vpaddd(_s, xmm1); if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) { vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm11, xmm1); + vpaddd(_t, xmm1); } } else @@ -450,15 +543,15 @@ void GSDrawScanlineCodeGenerator::Step() // t += stq.yyyy(); // q += stq.zzzz(); - vmovaps(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]); + vmovaps(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.stq)]); vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm10, xmm1); - vaddps(xmm11, xmm2); - vaddps(xmm12, xmm3); + vaddps(_s, xmm1); + vaddps(_t, xmm2); + vaddps(_q, xmm3); } } @@ -471,19 +564,19 @@ void GSDrawScanlineCodeGenerator::Step() // rb = rb.add16(c.xxxx()); // ga = ga.add16(c.yyyy()); - vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.c)]); + vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.c)]); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddw(xmm13, xmm1); - vpaddw(xmm14, xmm2); + vpaddw(_f_rb, xmm1); + vpaddw(_f_ga, xmm2); // FIXME: color may underflow and roll over at the end of the line, if decreasing vpxor(xmm0, xmm0); - vpmaxsw(xmm13, xmm0); - vpmaxsw(xmm14, xmm0); + vpmaxsw(_f_rb, xmm0); + vpmaxsw(_f_ga, xmm0); } else { @@ -491,20 +584,26 @@ void 
GSDrawScanlineCodeGenerator::Step() { } } + + vmovdqa(_rb, _f_rb); + vmovdqa(_ga, _f_ga); } } - // test = m_test[7 + (steps & (steps >> 31))]; + if(!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; - mov(rdx, rcx); - sar(rdx, 63); - and(rdx, rcx); - shl(rdx, 4); + mov(rax, a0); + sar(rax, 63); // GH: 63 to extract the sign of the register + and(rax, a0); + shl(rax, 4); - vmovdqa(xmm15, ptr[rdx + r10 + 7 * 16]); + vmovdqa(_test, ptr[rax + r10 + 7 * 16]); + } } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { @@ -513,9 +612,9 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) // int za = fza_base.y + fza_offset->y; - movsxd(rbp, dword[rsi + 4]); - movsxd(rax, dword[rdi + 4]); - add(rbp, rax); + mov(ebp, dword[t1 + 4]); + add(ebp, dword[t0 + 4]); + and(ebp, HALF_VM_SIZE - 1); // GSVector4i zs = zi; @@ -524,15 +623,15 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.zoverflow) { // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - + mov(rax, (size_t)&GSVector4::m_half); vbroadcastss(xmm0, ptr[rax]); - vmulps(xmm0, xmm8); + vmulps(xmm0, _z); vcvttps2dq(xmm0, xmm0); vpslld(xmm0, 1); - vcvttps2dq(xmm1, xmm8); + vcvttps2dq(xmm1, _z); vpcmpeqd(xmm2, xmm2); vpsrld(xmm2, 31); vpand(xmm1, xmm2); @@ -543,22 +642,34 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) { // zs = GSVector4i(z); - vcvttps2dq(xmm0, xmm8); + vcvttps2dq(xmm0, _z); } if(m_sel.zwrite) { - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)], xmm0); +#ifdef _WIN64 + vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)], xmm0); +#else + vmovdqa(ptr[rsp + _rz_zs], xmm0); +#endif } } + else + { + vmovdqa(xmm0, _z); /* VEX-encoded copy of z: keeps this AVX generator free of legacy-SSE encodings, like every other move here */ + } if(m_sel.ztest) { - ReadPixel(xmm1, rbp); + ReadPixel_AVX(xmm1, rbp); if(m_sel.zwrite && m_sel.zpsm < 2) { - 
vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm1); +#ifdef _WIN64 + vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], xmm1); +#else + vmovdqa(ptr[rsp + _rz_zd], xmm1); +#endif } // zd &= 0xffffffff >> m_sel.zpsm * 8; @@ -588,7 +699,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) case ZTST_GEQUAL: // test |= zso < zdo; // ~(zso >= zdo) vpcmpgtd(xmm1, xmm0); - vpor(xmm15, xmm1); + vpor(_test, xmm1); break; case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL @@ -596,7 +707,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) vpcmpgtd(xmm0, xmm1); vpcmpeqd(xmm2, xmm2); vpxor(xmm0, xmm2); - vpor(xmm15, xmm0); + vpor(_test, xmm0); break; } @@ -604,23 +715,19 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) } } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_AVX() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { return; } - mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); - - // ebx = tex - if(!m_sel.fst) { - vrcpps(xmm0, xmm12); + vrcpps(xmm0, _q); - vmulps(xmm4, xmm10, xmm0); - vmulps(xmm5, xmm11, xmm0); + vmulps(xmm4, _s, xmm0); + vmulps(xmm5, _t, xmm0); vcvttps2dq(xmm4, xmm4); vcvttps2dq(xmm5, xmm5); @@ -640,25 +747,25 @@ void GSDrawScanlineCodeGenerator::SampleTexture() } else { - vmovdqa(xmm4, xmm10); - vmovdqa(xmm5, xmm11); + vmovdqa(xmm4, _s); + vmovdqa(xmm5, _t); } if(m_sel.ltf) { - // GSVector4i uf = u.xxzzlh().srl16(1); + // GSVector4i uf = u.xxzzlh().srl16(12); vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); + vpsrlw(xmm6, 12); if(m_sel.prim != GS_SPRITE_CLASS) { - // GSVector4i vf = v.xxzzlh().srl16(1); + // GSVector4i vf = v.xxzzlh().srl16(12); vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm7, 1); + 
vpsrlw(xmm7, 12); } } @@ -679,13 +786,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm4, xmm5); + Wrap_AVX(xmm4, xmm5); } else { // uv0 = Wrap(uv0); - Wrap(xmm4); + Wrap_AVX(xmm4); } // xmm4 = uv0 @@ -746,118 +853,112 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 0); - // xmm0 = c00 - // xmm1 = c01 - // xmm2 = c10 - // xmm3 = c11 + ReadTexel_AVX(4, 0); + + // xmm0 = c10 + // xmm1 = c11 + // xmm4 = c00 + // xmm5 = c01 // xmm6 = uf // xmm7 = vf // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm4, xmm0, 8); - vpsrlw(xmm4, 8); - vpsrlw(xmm5, xmm0, 8); + split16_2x8(xmm2, xmm3, xmm4); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm0, xmm1, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm4, xmm5, xmm5); - // xmm0 = rb01 - // xmm1 = ga01 - // xmm2 = c10 - // xmm3 = c11 - // xmm4 = rb00 - // xmm5 = ga00 + // xmm0 = c10 + // xmm1 = c11 + // xmm2 = rb00 + // xmm3 = ga00 + // xmm4 = rb01 + // xmm5 = ga01 // xmm6 = uf // xmm7 = vf - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm0, xmm4, xmm6, 0); - lerp16(xmm1, xmm5, xmm6, 0); + lerp16_4(xmm4, xmm2, xmm6); + lerp16_4(xmm5, xmm3, xmm6); - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = c10 - // xmm3 = c11 + // xmm0 = c10 + // xmm1 = c11 + // xmm4 = rb00 + // xmm5 = ga00 // xmm6 = uf // xmm7 = vf // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm5, xmm2, 8); - vpsllw(xmm2, 8); - vpsrlw(xmm4, xmm2, 8); + split16_2x8(xmm2, xmm3, xmm0); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm2, xmm3, 8); - 
vpsllw(xmm3, 8); - vpsrlw(xmm3, 8); + split16_2x8(xmm0, xmm1, xmm1); - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = rb11 - // xmm3 = ga11 - // xmm4 = rb10 - // xmm5 = ga10 + // xmm0 = rb11 + // xmm1 = ga11 + // xmm2 = rb10 + // xmm3 = ga10 + // xmm4 = rb00 + // xmm5 = ga00 // xmm6 = uf // xmm7 = vf - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm2, xmm4, xmm6, 0); - lerp16(xmm3, xmm5, xmm6, 0); + lerp16_4(xmm0, xmm2, xmm6); + lerp16_4(xmm1, xmm3, xmm6); - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = rb10 - // xmm3 = ga10 + // xmm0 = rb10 + // xmm1 = ga10 + // xmm4 = rb00 + // xmm5 = ga00 // xmm7 = vf - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); - lerp16(xmm2, xmm0, xmm7, 0); - lerp16(xmm3, xmm1, xmm7, 0); + lerp16_4(xmm0, xmm4, xmm7); + lerp16_4(xmm1, xmm5, xmm7); + + // FIXME not ideal (but allow different source in ReadTexel and less register dependency) + vmovdqa(xmm2, xmm0); + vmovdqa(xmm3, xmm1); } else { // GSVector4i addr00 = y0 + x0; - vpaddd(xmm3, xmm2); + vpaddd(xmm0, xmm3, xmm2); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_AVX(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsrlw(xmm3, xmm2, 8); - vpsllw(xmm2, 8); - vpsrlw(xmm2, 8); + split16_2x8(_rb, _ga, xmm4); } // xmm2 = rb // xmm3 = ga } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv) { // xmm0, xmm1, xmm2, xmm3 = free @@ -872,7 +973,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) { if(region) { - vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vpmaxsw(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); } else { @@ -880,23 +981,23 @@ void GSDrawScanlineCodeGenerator::Wrap(const 
Xmm& uv) vpmaxsw(uv, xmm0); } - vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vpminsw(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); } else { - vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vpand(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); if(region) { - vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vpor(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); } } } else { - vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); + vmovdqa(xmm2, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.mask)]); // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; @@ -918,7 +1019,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm2, xmm3 = free @@ -933,7 +1034,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) { if(region) { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); vpmaxsw(uv0, xmm0); vpmaxsw(uv1, xmm0); } @@ -944,19 +1045,19 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) vpmaxsw(uv1, xmm0); } - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); vpminsw(uv0, xmm0); vpminsw(uv1, xmm0); } else { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); vpand(uv0, xmm0); 
vpand(uv1, xmm0); if(region) { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); vpor(uv0, xmm0); vpor(uv1, xmm0); } @@ -964,9 +1065,9 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } else { - vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); + vmovdqa(xmm2, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.mask)]); // uv0 @@ -1010,19 +1111,19 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::SampleTextureLOD() +void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX() { } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv) { } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1) { } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_AVX() { if(!m_sel.fb) { @@ -1035,17 +1136,17 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() // gat = gat.modulate16<1>(ga).clamp8(); - modulate16(xmm3, xmm14, 1); + modulate16(_ga, _f_ga, 1); - clamp16(xmm3, xmm0); + clamp16(_ga, xmm0); // if(!tcc) gat = gat.mix16(ga.srl16(7)); if(!m_sel.tcc) { - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); } break; @@ -1056,9 +1157,9 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); } break; @@ -1067,14 +1168,14 @@ void 
GSDrawScanlineCodeGenerator::AlphaTFX() // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - if(m_sel.tcc) + if(m_sel.tcc) { - vpaddusb(xmm1, xmm3); + vpaddusb(xmm1, _ga); } - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); break; @@ -1084,9 +1185,9 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { - vpsrlw(xmm1, xmm14, 7); + vpsrlw(xmm1, _f_ga, 7); - mix16(xmm3, xmm1, xmm0); + mix16(_ga, xmm1, xmm0); } break; @@ -1097,29 +1198,83 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(m_sel.iip) { - vpsrlw(xmm3, xmm14, 7); + vpsrlw(_ga, _f_ga, 7); } break; } - // TODO: aa1 + if(m_sel.aa1) + { + // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha + + // FIXME: bios config screen cubes + + if(!m_sel.abe) + { + // a = cov + + if(m_sel.edge) + { +#ifdef _WIN64 + vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]); +#else + vmovdqa(xmm0, ptr[rsp + _rz_cov]); +#endif + } + else + { + vpcmpeqd(xmm0, xmm0); + vpsllw(xmm0, 15); + vpsrlw(xmm0, 8); + } + + mix16(_ga, xmm0, xmm1); + } + else + { + // a = a == 0x80 ? 
cov : a + + vpcmpeqd(xmm0, xmm0); + vpsllw(xmm0, 15); + vpsrlw(xmm0, 8); + + if(m_sel.edge) + { +#ifdef _WIN64 + vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]); +#else + vmovdqa(xmm1, ptr[rsp + _rz_cov]); +#endif + } + else + { + vmovdqa(xmm1, xmm0); + } + + vpcmpeqw(xmm0, _ga); + vpsrld(xmm0, 16); + vpslld(xmm0, 16); + + vpblendvb(_ga, xmm1, xmm0); + } + } } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_AVX() { if(m_sel.fwrite) { - vmovdqa(xmm4, ptr[r12 + offsetof(GSScanlineGlobalData, fm)]); + vmovdqa(_fm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fm)]); } if(m_sel.zwrite) { - vmovdqa(xmm5, ptr[r12 + offsetof(GSScanlineGlobalData, zm)]); + vmovdqa(_zm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, zm)]); } } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::TestAlpha_AVX() { switch(m_sel.atst) { @@ -1134,14 +1289,14 @@ void GSDrawScanlineCodeGenerator::TestAlpha() case ATST_LESS: case ATST_LEQUAL: // t = (ga >> 16) > m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpgtd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm1, _ga, 16); + vpcmpgtd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); break; case ATST_EQUAL: // t = (ga >> 16) != m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm1, _ga, 16); + vpcmpeqd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); vpcmpeqd(xmm0, xmm0); vpxor(xmm1, xmm0); break; @@ -1149,15 +1304,15 @@ void GSDrawScanlineCodeGenerator::TestAlpha() case ATST_GEQUAL: case ATST_GREATER: // t = (ga >> 16) < m_local.gd->aref; - vpsrld(xmm0, xmm3, 16); - vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm0, _ga, 16); + vmovdqa(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); vpcmpgtd(xmm1, xmm0); break; case ATST_NOTEQUAL: // t = (ga >> 16) == m_local.gd->aref; - 
vpsrld(xmm1, xmm3, 16); - vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpsrld(xmm1, _ga, 16); + vpcmpeqd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]); break; } @@ -1165,32 +1320,32 @@ void GSDrawScanlineCodeGenerator::TestAlpha() { case AFAIL_KEEP: // test |= t; - vpor(xmm15, xmm1); + vpor(_test, xmm1); alltrue(); break; case AFAIL_FB_ONLY: // zm |= t; - vpor(xmm5, xmm1); + vpor(_zm, xmm1); break; case AFAIL_ZB_ONLY: // fm |= t; - vpor(xmm4, xmm1); + vpor(_fm, xmm1); break; case AFAIL_RGB_ONLY: // zm |= t; - vpor(xmm5, xmm1); + vpor(_zm, xmm1); // fm |= t & GSVector4i::xff000000(); vpsrld(xmm1, 24); vpslld(xmm1, 24); - vpor(xmm4, xmm1); + vpor(_fm, xmm1); break; } } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_AVX() { if(!m_sel.fwrite) { @@ -1203,9 +1358,9 @@ void GSDrawScanlineCodeGenerator::ColorTFX() // rbt = rbt.modulate16<1>(rb).clamp8(); - modulate16(xmm2, xmm13, 1); + modulate16(_rb, _f_rb, 1); - clamp16(xmm2, xmm0); + clamp16(_rb, xmm0); break; @@ -1218,27 +1373,27 @@ void GSDrawScanlineCodeGenerator::ColorTFX() // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - vmovdqa(xmm1, xmm3); + vmovdqa(xmm1, _ga); - modulate16(xmm3, xmm14, 1); + modulate16(_ga, _f_ga, 1); - vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1)); + vpshuflw(xmm6, _f_ga, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); vpsrlw(xmm6, 7); - vpaddw(xmm3, xmm6); + vpaddw(_ga, xmm6); - clamp16(xmm3, xmm0); - - mix16(xmm3, xmm1, xmm0); + clamp16(_ga, xmm0); + + mix16(_ga, xmm1, xmm0); // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - modulate16(xmm2, xmm13, 1); + modulate16(_rb, _f_rb, 1); - vpaddw(xmm2, xmm6); - - clamp16(xmm2, xmm0); + vpaddw(_rb, xmm6); + + clamp16(_rb, xmm0); break; @@ -1248,14 +1403,14 @@ void GSDrawScanlineCodeGenerator::ColorTFX() if(m_sel.iip) { - vpsrlw(xmm2, xmm13, 7); + vpsrlw(_rb, _f_rb, 7); } break; } } -void GSDrawScanlineCodeGenerator::Fog() 
+void GSDrawScanlineCodeGenerator::Fog_AVX() { if(!m_sel.fwrite || !m_sel.fge) { @@ -1265,18 +1420,18 @@ void GSDrawScanlineCodeGenerator::Fog() // rb = m_local.gd->frb.lerp16<0>(rb, f); // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, frb)]); - vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, fga)]); + vmovdqa(xmm6, _ga); - vmovdqa(xmm6, xmm3); + vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, frb)]); + vmovdqa(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fga)]); - lerp16(xmm2, xmm0, xmm9, 0); - lerp16(xmm3, xmm1, xmm9, 0); + lerp16(_rb, xmm0, _f, 0); + lerp16(_ga, xmm1, _f, 0); - mix16(xmm3, xmm6, xmm9); + mix16(_ga, xmm6, _f); } -void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_AVX() { if(!m_sel.fb) { @@ -1285,19 +1440,19 @@ void GSDrawScanlineCodeGenerator::ReadFrame() // int fa = fza_base.x + fza_offset->x; - mov(ebx, dword[rsi]); - add(ebx, dword[rdi]); - movsxd(rbx, ebx); + mov(ebx, dword[t1]); + add(ebx, dword[t0]); + and(ebx, HALF_VM_SIZE - 1); if(!m_sel.rfb) { return; } - ReadPixel(xmm6, rbx); + ReadPixel_AVX(_fd, rbx); } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX() { if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) { @@ -1311,15 +1466,15 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() if(m_sel.fpsm == 2) { vpxor(xmm0, xmm0); - //vpsrld(xmm1, xmm6, 15); - vpslld(xmm1, xmm6, 16); + //vpsrld(xmm1, _fd, 15); + vpslld(xmm1, _fd, 16); vpsrad(xmm1, 31); vpcmpeqd(xmm1, xmm0); } else { vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm6, xmm0); + vpxor(xmm1, _fd, xmm0); vpsrad(xmm1, 31); } } @@ -1327,33 +1482,38 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() { if(m_sel.fpsm == 2) { - vpslld(xmm1, xmm6, 16); + vpslld(xmm1, _fd, 16); vpsrad(xmm1, 31); } else { - vpsrad(xmm1, xmm6, 31); + vpsrad(xmm1, _fd, 31); } } - vpor(xmm15, xmm1); + vpor(_test, xmm1); alltrue(); } 
-void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_AVX() { + if(m_sel.notest) + { + return; + } + // fm |= test; // zm |= test; if(m_sel.fwrite) { - vpor(xmm4, xmm15); + vpor(_fm, _test); } if(m_sel.zwrite) { - vpor(xmm5, xmm15); + vpor(_zm, _test); } // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); @@ -1362,18 +1522,18 @@ void GSDrawScanlineCodeGenerator::WriteMask() if(m_sel.fwrite && m_sel.zwrite) { - vpcmpeqd(xmm0, xmm1, xmm5); - vpcmpeqd(xmm1, xmm4); + vpcmpeqd(xmm0, xmm1, _zm); + vpcmpeqd(xmm1, _fm); vpackssdw(xmm1, xmm0); } else if(m_sel.fwrite) { - vpcmpeqd(xmm1, xmm4); + vpcmpeqd(xmm1, _fm); vpackssdw(xmm1, xmm1); } else if(m_sel.zwrite) { - vpcmpeqd(xmm1, xmm5); + vpcmpeqd(xmm1, _zm); vpackssdw(xmm1, xmm1); } @@ -1382,28 +1542,39 @@ void GSDrawScanlineCodeGenerator::WriteMask() not(edx); } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() { if(!m_sel.zwrite) { return; } - bool fast = m_sel.ztest && m_sel.zpsm < 2; + if (m_sel.prim != GS_SPRITE_CLASS) +#ifdef _WIN64 + vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)]); +#else + vmovdqa(xmm1, ptr[rsp + _rz_zs]); +#endif + else + vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]); - vmovdqa(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)]); - - if(fast) + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); - vpblendvb(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm4); +#ifdef _WIN64 + vpblendvb(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], _zm); +#else + vpblendvb(xmm1, ptr[rsp + _rz_zd], _zm); +#endif } - WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1); + bool fast = m_sel.ztest ? 
m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + + WritePixel_AVX(xmm1, rbp, dh, fast, m_sel.zpsm, 1); } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_AVX() { if(!m_sel.fwrite) { @@ -1415,6 +1586,9 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() return; } + const Xmm& _dst_rb = xmm0; + const Xmm& _dst_ga = xmm1; + if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) { switch(m_sel.fpsm) @@ -1425,9 +1599,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - vpsllw(xmm0, xmm6, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, xmm6, 8); + split16_2x8(_dst_rb, _dst_ga, _fd); break; @@ -1439,24 +1611,24 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() vpcmpeqd(xmm15, xmm15); vpsrld(xmm15, 27); // 0x0000001f - vpand(xmm0, xmm6, xmm15); - vpslld(xmm0, 3); + vpand(_dst_rb, _fd, xmm15); + vpslld(_dst_rb, 3); vpslld(xmm15, 10); // 0x00007c00 - vpand(xmm5, xmm6, xmm15); + vpand(xmm5, _fd, xmm15); vpslld(xmm5, 9); - vpor(xmm0, xmm1); + vpor(_dst_rb, xmm5); vpsrld(xmm15, 5); // 0x000003e0 - vpand(xmm1, xmm6, xmm15); - vpsrld(xmm1, 2); + vpand(_dst_ga, _fd, xmm15); + vpsrld(_dst_ga, 2); vpsllw(xmm15, 10); // 0x00008000 - vpand(xmm5, xmm6, xmm15); + vpand(xmm5, _fd, xmm15); vpslld(xmm5, 8); - vpor(xmm1, xmm5); + vpor(_dst_ga, xmm5); break; } @@ -1468,7 +1640,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) { - vmovdqa(xmm5, xmm2); + vmovdqa(xmm5, _rb); } if(m_sel.aba != m_sel.abb) @@ -1478,16 +1650,16 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.aba) { case 0: break; - case 1: vmovdqa(xmm2, xmm0); break; - case 2: vpxor(xmm2, xmm2); break; + case 1: vmovdqa(_rb, _dst_rb); break; + case 2: vpxor(_rb, _rb); break; } // rb = rb.sub16(c[abb * 2 + 0]); switch(m_sel.abb) { - case 0: vpsubw(xmm2, xmm5); break; - case 1: vpsubw(xmm2, xmm0); break; + 
case 0: vpsubw(_rb, xmm5); break; + case 1: vpsubw(_rb, _dst_rb); break; case 2: break; } @@ -1499,26 +1671,26 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() { case 0: case 1: - vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1)); + vpshuflw(xmm15, m_sel.abc ? _dst_ga : _ga, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); vpsllw(xmm15, 7); break; case 2: - vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]); + vmovdqa(xmm15, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, afix)]); break; } // rb = rb.modulate16<1>(a); - modulate16(xmm2, xmm15, 1); + modulate16(_rb, xmm15, 1); } // rb = rb.add16(c[abd * 2 + 0]); switch(m_sel.abd) { - case 0: vpaddw(xmm2, xmm5); break; - case 1: vpaddw(xmm2, xmm0); break; + case 0: vpaddw(_rb, xmm5); break; + case 1: vpaddw(_rb, _dst_rb); break; case 2: break; } } @@ -1529,8 +1701,8 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.abd) { case 0: break; - case 1: vmovdqa(xmm2, xmm0); break; - case 2: vpxor(xmm2, xmm2); break; + case 1: vmovdqa(_rb, _dst_rb); break; + case 2: vpxor(_rb, _rb); break; } } @@ -1538,12 +1710,12 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() { // mask = (c[1] << 8).sra32(31); - vpslld(xmm0, xmm3, 8); + vpslld(xmm0, _ga, 8); vpsrad(xmm0, 31); // rb = c[0].blend8(rb, mask); - vpblendvb(xmm2, xmm5, xmm2, xmm0); + vpblendvb(_rb, xmm5, _rb, xmm0); } // xmm0 = pabe mask @@ -1553,7 +1725,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // xmm15 = a // xmm5 = free - vmovdqa(xmm5, xmm3); + vmovdqa(xmm5, _ga); if(m_sel.aba != m_sel.abb) { @@ -1562,16 +1734,16 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.aba) { case 0: break; - case 1: vmovdqa(xmm3, xmm1); break; - case 2: vpxor(xmm3, xmm3); break; + case 1: vmovdqa(_ga, _dst_ga); break; + case 2: vpxor(_ga, _ga); break; } // ga = ga.sub16(c[abeb * 2 + 1]); switch(m_sel.abb) { - case 0: vpsubw(xmm3, xmm5); break; - case 1: vpsubw(xmm3, xmm1); break; + case 0: vpsubw(_ga, 
xmm5); break; + case 1: vpsubw(_ga, _dst_ga); break; case 2: break; } @@ -1579,15 +1751,15 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() { // ga = ga.modulate16<1>(a); - modulate16(xmm3, xmm15, 1); + modulate16(_ga, xmm15, 1); } // ga = ga.add16(c[abd * 2 + 1]); switch(m_sel.abd) { - case 0: vpaddw(xmm3, xmm5); break; - case 1: vpaddw(xmm3, xmm1); break; + case 0: vpaddw(_ga, xmm5); break; + case 1: vpaddw(_ga, _dst_ga); break; case 2: break; } } @@ -1598,8 +1770,8 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() switch(m_sel.abd) { case 0: break; - case 1: vmovdqa(xmm3, xmm1); break; - case 2: vpxor(xmm3, xmm3); break; + case 1: vmovdqa(_ga, _dst_ga); break; + case 2: vpxor(_ga, _ga); break; } } @@ -1615,24 +1787,41 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // ga = c[1].blend8(ga, mask).mix16(c[1]); - vpblendvb(xmm3, xmm5, xmm3, xmm0); + vpblendvb(_ga, xmm5, _ga, xmm0); } else { if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx { - mix16(xmm3, xmm5, xmm15); + mix16(_ga, xmm5, xmm15); } } } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_AVX() { if(!m_sel.fwrite) { return; } + if(m_sel.fpsm == 2 && m_sel.dthe) + { + mov(a3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, dimx)]); + + // y = (top & 3) << 5 + + mov(eax, a1.cvt32()); + and(eax, 3); + shl(eax, 5); + + // rb = rb.add16(m_global.dimx[0 + y]); + // ga = ga.add16(m_global.dimx[1 + y]); + + vpaddw(xmm2, ptr[a3 + rax + sizeof(GSVector4i) * 0]); + vpaddw(xmm3, ptr[a3 + rax + sizeof(GSVector4i) * 1]); + } + if(m_sel.colclamp == 0) { // c[0] &= 0x00ff00ff; @@ -1644,15 +1833,6 @@ void GSDrawScanlineCodeGenerator::WriteFrame() vpand(xmm3, xmm15); } - if(m_sel.fpsm == 2 && m_sel.dthe) - { - mov(rax, r8); - and(rax, 3); - shl(rax, 5); - vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx) + sizeof(GSVector4i) * 0]); - vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx) + sizeof(GSVector4i) * 1]); - } - // GSVector4i fs = 
c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); vpunpckhwd(xmm15, xmm2, xmm3); @@ -1704,73 +1884,91 @@ void GSDrawScanlineCodeGenerator::WriteFrame() { // fs = fs.blend(fd, fm); - blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm + blend(xmm2, _fd, _fm); // TODO: could be skipped in certain cases, depending on fpsm and fm } - bool fast = m_sel.rfb && m_sel.fpsm < 2; + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); + WritePixel_AVX(xmm2, rbx, dl, fast, m_sel.fpsm, 0); } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg64& addr) { - vmovq(dst, qword[r13 + addr * 2]); - vmovhps(dst, qword[r13 + addr * 2 + 8 * 2]); + vmovq(dst, qword[_m_local__gd__vm + addr * 2]); + vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]); } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) { - if(fast) + if(m_sel.notest) { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[r13 + addr * 2], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[r13 + addr * 2 + 8 * 2], src); - L("@@"); - - // vmaskmovps? 
+ if(fast) + { + vmovq(qword[_m_local__gd__vm + addr * 2], src); + vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src); + } + else + { + WritePixel_AVX(src, addr, 0, psm); + WritePixel_AVX(src, addr, 1, psm); + WritePixel_AVX(src, addr, 2, psm); + WritePixel_AVX(src, addr, 3, psm); + } } else { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); + test(mask, 0x0f); + je("@f"); + vmovq(qword[_m_local__gd__vm + addr * 2], src); + L("@@"); - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); + test(mask, 0xf0); + je("@f"); + vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src); + L("@@"); - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); + // vmaskmovps? 
+ } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); + test(mask, 0x03); + je("@f"); + WritePixel_AVX(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel_AVX(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel_AVX(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel_AVX(src, addr, 3, psm); + L("@@"); + } } } static const int s_offsets[4] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, uint8 i, int psm) { - Address dst = ptr[r13 + addr * 2 + s_offsets[i] * 2]; + Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2]; switch(psm) { @@ -1792,22 +1990,71 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) { - // TODO + const int in[] = {0, 1, 2, 3}; + const int out[] = {4, 5, 0, 1}; + + for(int i = 0; i < pixels; i++) + { + for(int j = 0; j < 4; j++) + { + ReadTexel_AVX(Xmm(out[i]), Xmm(in[i]), j); + } + } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i) { - const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4]; + const Address& src = m_sel.tlu ? 
ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4]; + // Extract address offset if(i == 0) vmovd(eax, addr); else vpextrd(eax, addr, i); - if(m_sel.tlu) movzx(rax, byte[rbx + rax]); + // If clut, load the value as a byte index + if(m_sel.tlu) movzx(eax, byte[_m_local__gd__tex + rax]); if(i == 0) vmovd(dst, src); else vpinsrd(dst, src, i); } +// Gather example (AVX2). Not faster on Haswell but potentially better on recent CPU +// Worst case reduce Icache. +// +// Current limitation requires 1 extra free register for the mask. +// And palette need zero masking. +// It is not possible to use same source/destination so linear interpolation must be updated +#if 0 +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) +{ + const int in[] = {0, 1, 2, 3}; + const int out[] = {4, 5, 0, 1}; + const int mask[] = {5, 0, 1, 2}; + + if (m_sel.tlu) { + for(int i = 0; i < pixels; i++) { + // FIXME can't use same dst and add register + Gather4Texel(Xmm(in[i]), _m_local__gd__tex, Xmm(in[i]), Xmm(mask[i])); + // FIXME need a memory and could be faster + vpslld(Xmm(in[i]), 24); + vpsrld(Xmm(in[i]), 24); + Gather4Texel(Xmm(out[i]), _m_local__gd__clut, Xmm(in[i]), Xmm(mask[i])); + } + } else { + for(int i = 0; i < pixels; i++) { + Gather4Texel(Xmm(out[i]), _m_local__gd__tex, Xmm(in[i]), Xmm(mask[i])); + } + } +} + +static void Gather4Texel(const Xmm& dst, const Reg64& base, const Xmm& addr, const Xmm& Mask) +{ + //void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) + vpcmpeqd(Mask, Mask); + vpgatherdd(dst, ptr[base + addr * 4], Mask); +} + +#endif + #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp index 40631c1c9e..6b1102f091 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp @@ -22,99 +22,102 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" -#if _M_SSE < 0x500 && (defined(_M_AMD64) 
|| defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) -void GSDrawScanlineCodeGenerator::Generate() +// It is useless to port the code to SSEx, better use the faster 32 bits version instead +void GSDrawScanlineCodeGenerator::Generate_SSE() +{ + // Avoid a crash if someone want to use it + ret(); +} + +void GSDrawScanlineCodeGenerator::Init_SSE() { } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Step_SSE() { } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) { } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::SampleTexture_SSE() { } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv) { } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) { } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::AlphaTFX_SSE() { } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::ReadMask_SSE() { } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::TestAlpha_SSE() { } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::ColorTFX_SSE() { } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::Fog_SSE() { } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::ReadFrame_SSE() { } -void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE() { } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::WriteMask_SSE() { } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() { } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void 
GSDrawScanlineCodeGenerator::AlphaBlend_SSE() { } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::WriteFrame_SSE() { } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg64& addr) { } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) -{ -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) { } static const int s_offsets[4] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, uint8 i, int psm) { } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) { } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i) { } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 250432ff2b..a2278cbad0 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -23,21 +23,20 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) static const int _args = 16; static const int _top = _args + 4; static const int _v = _args + 8; -void GSDrawScanlineCodeGenerator::Generate() +void GSDrawScanlineCodeGenerator::Generate_AVX() { -//ret(8); push(ebx); push(esi); push(edi); push(ebp); - Init(); + Init_AVX(); 
if(!m_sel.edge) { @@ -59,7 +58,7 @@ L("loop"); bool tme = m_sel.tfx != TFX_NONE; - TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); + TestZ_AVX(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); // ecx = steps // esi = fzbr @@ -75,11 +74,11 @@ L("loop"); if(m_sel.mmin) { - SampleTextureLOD(); + SampleTextureLOD_AVX(); } else { - SampleTexture(); + SampleTexture_AVX(); } // ecx = steps @@ -93,7 +92,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - AlphaTFX(); + AlphaTFX_AVX(); // ecx = steps // esi = fzbr @@ -104,7 +103,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadMask(); + ReadMask_AVX(); // ecx = steps // esi = fzbr @@ -117,7 +116,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestAlpha(); + TestAlpha_AVX(); // ecx = steps // esi = fzbr @@ -130,7 +129,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ColorTFX(); + ColorTFX_AVX(); // ecx = steps // esi = fzbr @@ -142,7 +141,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - Fog(); + Fog_AVX(); // ecx = steps // esi = fzbr @@ -154,7 +153,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadFrame(); + ReadFrame_AVX(); // ecx = steps // esi = fzbr @@ -167,7 +166,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestDestAlpha(); + TestDestAlpha_AVX(); // ecx = steps // esi = fzbr @@ -180,7 +179,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - WriteMask(); + WriteMask_AVX(); // ebx = fa // ecx = steps @@ -194,7 +193,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteZBuf(); + WriteZBuf_AVX(); // ebx = fa // ecx = steps @@ -208,7 +207,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - AlphaBlend(); + AlphaBlend_AVX(); // ebx = fa // ecx = steps @@ -220,7 +219,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteFrame(); + WriteFrame_AVX(); L("step"); @@ -232,7 +231,7 @@ L("step"); jle("exit", T_NEAR); - Step(); + Step_AVX(); jmp("loop", T_NEAR); } @@ -249,7 +248,7 @@ L("exit"); ret(8); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_AVX() { if(!m_sel.notest) { @@ -455,7 +454,7 @@ void GSDrawScanlineCodeGenerator::Init() } } -void 
GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_AVX() { // steps -= 4; @@ -596,7 +595,7 @@ void GSDrawScanlineCodeGenerator::Step() } } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { @@ -644,7 +643,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.ztest) { - ReadPixel(xmm1, ebp); + ReadPixel_AVX(xmm1, ebp); if(m_sel.zwrite && m_sel.zpsm < 2) { @@ -694,7 +693,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) } } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_AVX() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -740,7 +739,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() if(m_sel.ltf) { - // GSVector4i uf = u.xxzzlh().srl16(1); + // GSVector4i uf = u.xxzzlh().srl16(12); vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); @@ -749,7 +748,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() if(m_sel.prim != GS_SPRITE_CLASS) { - // GSVector4i vf = v.xxzzlh().srl16(1); + // GSVector4i vf = v.xxzzlh().srl16(12); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); @@ -775,13 +774,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm2, xmm3); + Wrap_AVX(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - Wrap(xmm2); + Wrap_AVX(xmm2); } // xmm2 = uv0 @@ -843,7 +842,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_AVX(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -857,16 +856,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - 
vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -894,16 +889,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -943,20 +934,18 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_AVX(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(xmm5, xmm6, 8); - vpsrlw(xmm5, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1017,7 +1006,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1109,7 +1098,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::SampleTextureLOD() +void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -1370,13 +1359,13 @@ return; // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_AVX(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_AVX(xmm2); } // xmm2 = uv0 @@ -1438,7 +1427,7 
@@ return; // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_AVX(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -1452,16 +1441,12 @@ return; // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1489,16 +1474,12 @@ return; // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1538,16 +1519,14 @@ return; // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_AVX(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(xmm5, xmm6, 8); - vpsrlw(xmm5, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } if(m_sel.mmin != 1) // !round-off mode @@ -1611,13 +1590,13 @@ return; // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_AVX(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_AVX(xmm2); } // xmm2 = uv0 @@ -1679,7 +1658,7 @@ return; // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 1); + ReadTexel_AVX(4, 1); // xmm6 = c00 // xmm4 = c01 @@ -1693,16 +1672,12 @@ return; // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, 
xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1730,16 +1705,12 @@ return; // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1779,16 +1750,14 @@ return; // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 1); + ReadTexel_AVX(1, 1); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - vpsllw(xmm5, xmm6, 8); - vpsrlw(xmm5, 8); - vpsrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]); @@ -1804,7 +1773,7 @@ return; pop(ebp); } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv) { // xmm5 = minuv // xmm6 = maxuv @@ -1865,7 +1834,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1) { // xmm5 = minuv // xmm6 = maxuv @@ -1953,7 +1922,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) } } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_AVX() { if(!m_sel.fb) { @@ -2101,7 +2070,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() } } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_AVX() { if(m_sel.fwrite) { @@ -2114,7 +2083,7 @@ void GSDrawScanlineCodeGenerator::ReadMask() } } -void GSDrawScanlineCodeGenerator::TestAlpha() +void 
GSDrawScanlineCodeGenerator::TestAlpha_AVX() { switch(m_sel.atst) { @@ -2185,7 +2154,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha() } } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_AVX() { if(!m_sel.fwrite) { @@ -2261,7 +2230,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() } } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::Fog_AVX() { if(!m_sel.fwrite || !m_sel.fge) { @@ -2282,7 +2251,7 @@ void GSDrawScanlineCodeGenerator::Fog() mix16(xmm6, xmm1, xmm0); } -void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_AVX() { if(!m_sel.fb) { @@ -2300,10 +2269,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame() return; } - ReadPixel(xmm2, ebx); + ReadPixel_AVX(xmm2, ebx); } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX() { if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) { @@ -2347,7 +2316,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() alltrue(); } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_AVX() { if(m_sel.notest) { @@ -2393,7 +2362,7 @@ void GSDrawScanlineCodeGenerator::WriteMask() not(edx); } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() { if(!m_sel.zwrite) { @@ -2411,10 +2380,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() bool fast = m_sel.ztest ? 
m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); + WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_AVX() { if(!m_sel.fwrite) { @@ -2436,9 +2405,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - vpsllw(xmm0, xmm2, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, xmm2, 8); + split16_2x8(xmm0, xmm1, xmm2); break; @@ -2638,7 +2605,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() } } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_AVX() { if(!m_sel.fwrite) { @@ -2718,16 +2685,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame() bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); + WritePixel_AVX(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg32& addr) { vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { if(m_sel.notest) { @@ -2738,10 +2705,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } else { - WritePixel(src, addr, 0, psm); - WritePixel(src, addr, 1, psm); - WritePixel(src, addr, 2, psm); - WritePixel(src, addr, 3, psm); + WritePixel_AVX(src, addr, 0, psm); + WritePixel_AVX(src, addr, 1, psm); + WritePixel_AVX(src, addr, 2, psm); + WritePixel_AVX(src, addr, 3, psm); } } else @@ -2772,22 +2739,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& 
src, const Reg32& addr, test(mask, 0x03); je("@f"); - WritePixel(src, addr, 0, psm); + WritePixel_AVX(src, addr, 0, psm); L("@@"); test(mask, 0x0c); je("@f"); - WritePixel(src, addr, 1, psm); + WritePixel_AVX(src, addr, 1, psm); L("@@"); test(mask, 0x30); je("@f"); - WritePixel(src, addr, 2, psm); + WritePixel_AVX(src, addr, 2, psm); L("@@"); test(mask, 0xc0); je("@f"); - WritePixel(src, addr, 3, psm); + WritePixel_AVX(src, addr, 3, psm); L("@@"); } } @@ -2795,7 +2762,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, static const int s_offsets[] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, uint8 i, int psm) { Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; @@ -2820,7 +2787,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) { // in // xmm5 = addr00 @@ -2859,7 +2826,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) for(int i = 0; i < pixels; i++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } @@ -2878,19 +2845,18 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) } const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - const int t[] = {4, 1, 5, 2}; for(int i = 0; i < pixels; i++) { for(uint8 j = 0; j < 4; j++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i) { ASSERT(i < 4); diff --git 
a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 42c4b74ae9..60ac137655 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -23,20 +23,20 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) static const int _args = 16; static const int _top = _args + 4; static const int _v = _args + 8; -void GSDrawScanlineCodeGenerator::Generate() +void GSDrawScanlineCodeGenerator::Generate_SSE() { push(ebx); push(esi); push(edi); push(ebp); - Init(); + Init_SSE(); if(!m_sel.edge) { @@ -58,7 +58,7 @@ L("loop"); bool tme = m_sel.tfx != TFX_NONE; - TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); + TestZ_SSE(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); // ecx = steps // esi = fzbr @@ -74,11 +74,11 @@ L("loop"); if(m_sel.mmin) { - SampleTextureLOD(); + SampleTextureLOD_SSE(); } else { - SampleTexture(); + SampleTexture_SSE(); } // ecx = steps @@ -92,7 +92,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - AlphaTFX(); + AlphaTFX_SSE(); // ecx = steps // esi = fzbr @@ -103,7 +103,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadMask(); + ReadMask_SSE(); // ecx = steps // esi = fzbr @@ -116,7 +116,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestAlpha(); + TestAlpha_SSE(); // ecx = steps // esi = fzbr @@ -129,7 +129,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ColorTFX(); + ColorTFX_SSE(); // ecx = steps // esi = fzbr @@ -141,7 +141,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - Fog(); + Fog_SSE(); // ecx = steps // esi = fzbr @@ -153,7 +153,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - ReadFrame(); + ReadFrame_SSE(); // ecx = steps // esi = fzbr @@ -166,7 +166,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - TestDestAlpha(); + TestDestAlpha_SSE(); // ecx = steps // esi = fzbr @@ -179,7 +179,7 @@ L("loop"); // xmm6 = ga // xmm7 = test - 
WriteMask(); + WriteMask_SSE(); // ebx = fa // ecx = steps @@ -193,7 +193,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteZBuf(); + WriteZBuf_SSE(); // ebx = fa // ecx = steps @@ -207,7 +207,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - AlphaBlend(); + AlphaBlend_SSE(); // ebx = fa // ecx = steps @@ -219,7 +219,7 @@ L("loop"); // xmm5 = rb // xmm6 = ga - WriteFrame(); + WriteFrame_SSE(); L("step"); @@ -231,7 +231,7 @@ L("step"); jle("exit", T_NEAR); - Step(); + Step_SSE(); jmp("loop", T_NEAR); } @@ -248,7 +248,7 @@ L("exit"); ret(8); } -void GSDrawScanlineCodeGenerator::Init() +void GSDrawScanlineCodeGenerator::Init_SSE() { if(!m_sel.notest) { @@ -457,7 +457,7 @@ void GSDrawScanlineCodeGenerator::Init() } } -void GSDrawScanlineCodeGenerator::Step() +void GSDrawScanlineCodeGenerator::Step_SSE() { // steps -= 4; @@ -600,7 +600,7 @@ void GSDrawScanlineCodeGenerator::Step() } } -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { @@ -648,7 +648,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) if(m_sel.ztest) { - ReadPixel(xmm1, ebp); + ReadPixel_SSE(xmm1, ebp); if(m_sel.zwrite && m_sel.zpsm < 2) { @@ -698,7 +698,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) } } -void GSDrawScanlineCodeGenerator::SampleTexture() +void GSDrawScanlineCodeGenerator::SampleTexture_SSE() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -780,13 +780,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm2, xmm3); + Wrap_SSE(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - Wrap(xmm2); + Wrap_SSE(xmm2); } // xmm2 = uv0 @@ -853,7 +853,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_SSE(4, 0); // xmm6 = c00 
// xmm4 = c01 @@ -867,18 +867,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - movdqa(xmm2, xmm6); - psllw(xmm2, 8); - psrlw(xmm2, 8); - psrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - movdqa(xmm3, xmm4); - psllw(xmm3, 8); - psrlw(xmm3, 8); - psrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -906,18 +900,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -958,21 +946,18 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_SSE(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - movdqa(xmm5, xmm6); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1035,7 +1020,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free @@ -1084,16 +1069,15 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) movdqa(xmm4, ptr[&m_local.gd->t.min]); movdqa(xmm5, ptr[&m_local.gd->t.max]); - #if _M_SSE >= 0x401 - - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - #else - 
- movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm6, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + { + movdqa(xmm0, ptr[&m_local.gd->t.mask]); + } + else + { + movdqa(xmm0, ptr[&m_local.gd->t.invmask]); + movdqa(xmm6, xmm0); + } // uv0 @@ -1115,15 +1099,10 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv0, xmm1); - - #else - - blendr(uv0, xmm1, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv0, xmm1); + else + blendr(uv0, xmm1, xmm0); // uv1 @@ -1145,19 +1124,14 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv1, xmm1); - - #else - - blendr(uv1, xmm1, xmm6); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv1, xmm1); + else + blendr(uv1, xmm1, xmm6); } } -void GSDrawScanlineCodeGenerator::SampleTextureLOD() +void GSDrawScanlineCodeGenerator::SampleTextureLOD_SSE() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { @@ -1166,7 +1140,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() push(ebp); - mov(ebp, (size_t)m_local.gd->tex); + mov(ebp, (size_t)m_local.gd->tex); if(m_sel.tlu) { @@ -1380,13 +1354,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_SSE(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_SSE(xmm2); } // xmm2 = uv0 @@ -1453,7 +1427,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 0); + ReadTexel_SSE(4, 0); // xmm6 = c00 // xmm4 = c01 @@ -1467,18 +1441,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - movdqa(xmm2, xmm6); - psrlw(xmm6, 8); - psllw(xmm2, 8); - psrlw(xmm2, 8); 
+ split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - movdqa(xmm3, xmm4); - psrlw(xmm4, 8); - psllw(xmm3, 8); - psrlw(xmm3, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1506,18 +1474,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1558,17 +1520,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 0); + ReadTexel_SSE(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - movdqa(xmm5, xmm6); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } if(m_sel.mmin != 1) // !round-off mode @@ -1633,13 +1592,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - WrapLOD(xmm2, xmm3); + WrapLOD_SSE(xmm2, xmm3); } else { // uv0 = Wrap(uv0); - WrapLOD(xmm2); + WrapLOD_SSE(xmm2); } // xmm2 = uv0 @@ -1706,7 +1665,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(4, 1); + ReadTexel_SSE(4, 1); // xmm6 = c00 // xmm4 = c01 @@ -1720,18 +1679,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; - movdqa(xmm2, xmm6); - psllw(xmm2, 8); - psrlw(xmm2, 8); - psrlw(xmm6, 8); + split16_2x8(xmm2, xmm6, xmm6); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; - 
movdqa(xmm3, xmm4); - psllw(xmm3, 8); - psrlw(xmm3, 8); - psrlw(xmm4, 8); + split16_2x8(xmm3, xmm4, xmm4); // xmm0 = uf // xmm2 = rb00 @@ -1759,18 +1712,12 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); + split16_2x8(xmm1, xmm2, xmm1); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm5); // xmm0 = uf // xmm3 = rb00 @@ -1811,17 +1758,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(1, 1); + ReadTexel_SSE(1, 1); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; - movdqa(xmm5, xmm6); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); + split16_2x8(xmm5, xmm6, xmm6); } movdqa(xmm0, ptr[m_sel.lcm ?
&m_local.gd->lod.f : &m_local.temp.lod.f]); @@ -1837,7 +1781,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pop(ebp); } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv) { // xmm5 = minuv // xmm6 = maxuv @@ -1900,7 +1844,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) } } -void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1) { // xmm5 = minuv // xmm6 = maxuv @@ -1944,16 +1888,15 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) } else { - #if _M_SSE >= 0x401 - - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - #else - - movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm4, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + { + movdqa(xmm0, ptr[&m_local.gd->t.mask]); + } + else + { + movdqa(xmm0, ptr[&m_local.gd->t.invmask]); + movdqa(xmm4, xmm0); + } // uv0 @@ -1975,15 +1918,10 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv0, xmm1); - - #else - - blendr(uv0, xmm1, xmm0); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv0, xmm1); + else + blendr(uv0, xmm1, xmm0); // uv1 @@ -2005,19 +1943,14 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - #if _M_SSE >= 0x401 - - pblendvb(uv1, xmm1); - - #else - - blendr(uv1, xmm1, xmm4); - - #endif + if(g_cpu.has(util::Cpu::tSSE41)) + pblendvb(uv1, xmm1); + else + blendr(uv1, xmm1, xmm4); } } -void GSDrawScanlineCodeGenerator::AlphaTFX() +void GSDrawScanlineCodeGenerator::AlphaTFX_SSE() { if(!m_sel.fb) { @@ -2165,7 +2098,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() } } -void GSDrawScanlineCodeGenerator::ReadMask() +void GSDrawScanlineCodeGenerator::ReadMask_SSE() { if(m_sel.fwrite) { @@ -2178,7 +2111,7 @@ void 
GSDrawScanlineCodeGenerator::ReadMask() } } -void GSDrawScanlineCodeGenerator::TestAlpha() +void GSDrawScanlineCodeGenerator::TestAlpha_SSE() { switch(m_sel.atst) { @@ -2253,7 +2186,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha() } } -void GSDrawScanlineCodeGenerator::ColorTFX() +void GSDrawScanlineCodeGenerator::ColorTFX_SSE() { if(!m_sel.fwrite) { @@ -2329,7 +2262,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() } } -void GSDrawScanlineCodeGenerator::Fog() +void GSDrawScanlineCodeGenerator::Fog_SSE() { if(!m_sel.fwrite || !m_sel.fge) { @@ -2350,7 +2283,7 @@ void GSDrawScanlineCodeGenerator::Fog() mix16(xmm6, xmm1, xmm0); } -void GSDrawScanlineCodeGenerator::ReadFrame() +void GSDrawScanlineCodeGenerator::ReadFrame_SSE() { if(!m_sel.fb) { @@ -2368,10 +2301,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame() return; } - ReadPixel(xmm2, ebx); + ReadPixel_SSE(xmm2, ebx); } -void GSDrawScanlineCodeGenerator::TestDestAlpha() +void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE() { if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) { @@ -2414,7 +2347,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() alltrue(); } -void GSDrawScanlineCodeGenerator::WriteMask() +void GSDrawScanlineCodeGenerator::WriteMask_SSE() { if(m_sel.notest) { @@ -2461,7 +2394,7 @@ void GSDrawScanlineCodeGenerator::WriteMask() not(edx); } -void GSDrawScanlineCodeGenerator::WriteZBuf() +void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() { if(!m_sel.zwrite) { @@ -2481,10 +2414,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() bool fast = m_sel.ztest ? 
m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); + WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } -void GSDrawScanlineCodeGenerator::AlphaBlend() +void GSDrawScanlineCodeGenerator::AlphaBlend_SSE() { if(!m_sel.fwrite) { @@ -2506,12 +2439,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; - movdqa(xmm0, xmm2); - movdqa(xmm1, xmm2); - - psllw(xmm0, 8); - psrlw(xmm0, 8); - psrlw(xmm1, 8); + split16_2x8(xmm0, xmm1, xmm2); break; @@ -2702,15 +2630,14 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() if(m_sel.pabe) { - #if _M_SSE < 0x401 + if(!g_cpu.has(util::Cpu::tSSE41)) + { + // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) + movdqa(xmm0, xmm4); + pslld(xmm0, 8); + psrad(xmm0, 31); - // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) - - movdqa(xmm0, xmm4); - pslld(xmm0, 8); - psrad(xmm0, 31); - - #endif + } psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) @@ -2727,7 +2654,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend() } } -void GSDrawScanlineCodeGenerator::WriteFrame() +void GSDrawScanlineCodeGenerator::WriteFrame_SSE() { if(!m_sel.fwrite) { @@ -2812,16 +2739,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame() bool fast = m_sel.rfb ? 
m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); + WritePixel_SSE(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg32& addr) { movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); } -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { if(m_sel.notest) { @@ -2832,10 +2759,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } else { - WritePixel(src, addr, 0, psm); - WritePixel(src, addr, 1, psm); - WritePixel(src, addr, 2, psm); - WritePixel(src, addr, 3, psm); + WritePixel_SSE(src, addr, 0, psm); + WritePixel_SSE(src, addr, 1, psm); + WritePixel_SSE(src, addr, 2, psm); + WritePixel_SSE(src, addr, 3, psm); } } else @@ -2864,22 +2791,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, test(mask, 0x03); je("@f"); - WritePixel(src, addr, 0, psm); + WritePixel_SSE(src, addr, 0, psm); L("@@"); test(mask, 0x0c); je("@f"); - WritePixel(src, addr, 1, psm); + WritePixel_SSE(src, addr, 1, psm); L("@@"); test(mask, 0x30); je("@f"); - WritePixel(src, addr, 2, psm); + WritePixel_SSE(src, addr, 2, psm); L("@@"); test(mask, 0xc0); je("@f"); - WritePixel(src, addr, 3, psm); + WritePixel_SSE(src, addr, 3, psm); L("@@"); } } @@ -2887,7 +2814,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, static const int s_offsets[4] = {0, 2, 8, 10}; -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, uint8 i, 
int psm) { Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; @@ -2895,19 +2822,26 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, { case 0: if(i == 0) movd(dst, src); - #if _M_SSE >= 0x401 - else pextrd(dst, src, i); - #else - else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);} - #endif + else { + if(g_cpu.has(util::Cpu::tSSE41)) { + pextrd(dst, src, i); + } else { + pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); + movd(dst, xmm0); + } + + } break; case 1: if(i == 0) movd(eax, src); - #if _M_SSE >= 0x401 - else pextrd(eax, src, i); - #else - else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);} - #endif + else { + if(g_cpu.has(util::Cpu::tSSE41)) { + pextrd(eax, src, i); + } else { + pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); + movd(eax, xmm0); + } + } xor(eax, dst); and(eax, 0xffffff); xor(dst, eax); @@ -2920,7 +2854,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) { // in // xmm5 = addr00 @@ -2945,152 +2879,154 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) if(m_sel.mmin && !m_sel.lcm) { - #if _M_SSE >= 0x401 - - const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; - - if(pixels == 4) + if(g_cpu.has(util::Cpu::tSSE41)) { - movdqa(ptr[&m_local.temp.test], xmm7); - } - for(int j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; - for(int i = 0; i < pixels; i++) + if(pixels == 4) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + movdqa(ptr[&m_local.temp.test], xmm7); } - } - if(pixels == 4) - { - movdqa(xmm5, xmm7); - movdqa(xmm7, ptr[&m_local.temp.test]); - } + for(int j = 0; j < 4; j++) + { + mov(ebx, ptr[&lod_i->u32[j]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + 
mip_offset]); - #else + for(int i = 0; i < pixels; i++) + { + ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + } + } - if(pixels == 4) - { - movdqa(ptr[&m_local.temp.test], xmm7); - - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm6, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel(xmm4, xmm2, 0); - psrldq(xmm2, 4); - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel(xmm7, xmm2, 0); - psrldq(xmm2, 4); - - punpckldq(xmm6, xmm1); - punpckldq(xmm4, xmm7); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel(xmm7, xmm2, 0); - psrldq(xmm2, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm5, xmm5, 0); - ReadTexel(xmm2, xmm2, 0); - - punpckldq(xmm1, xmm5); - punpckldq(xmm7, xmm2); - - punpcklqdq(xmm6, xmm1); - punpcklqdq(xmm4, xmm7); - - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm1, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel(xmm5, xmm3, 0); - psrldq(xmm3, 4); - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm2, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel(xmm7, xmm3, 0); - psrldq(xmm3, 4); - - punpckldq(xmm1, xmm2); - punpckldq(xmm5, xmm7); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm2, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel(xmm7, xmm3, 0); - psrldq(xmm3, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm0, xmm0, 0); - ReadTexel(xmm3, xmm3, 0); - - punpckldq(xmm2, xmm0); - punpckldq(xmm7, xmm3); - - punpcklqdq(xmm1, xmm2); - punpcklqdq(xmm5, xmm7); - - movdqa(xmm7, ptr[&m_local.temp.test]); + if(pixels == 4) + { + 
movdqa(xmm5, xmm7); + movdqa(xmm7, ptr[&m_local.temp.test]); + } } else { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm6, xmm5, 0); - psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation) + if(pixels == 4) + { + movdqa(ptr[&m_local.temp.test], xmm7); - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); + ReadTexel_SSE(xmm6, xmm5, 0); + psrldq(xmm5, 4); + ReadTexel_SSE(xmm4, xmm2, 0); + psrldq(xmm2, 4); - punpckldq(xmm6, xmm1); + mov(ebx, ptr[&lod_i->u32[1]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + ReadTexel_SSE(xmm1, xmm5, 0); + psrldq(xmm5, 4); + ReadTexel_SSE(xmm7, xmm2, 0); + psrldq(xmm2, 4); - ReadTexel(xmm1, xmm5, 0); - psrldq(xmm5, 4); + punpckldq(xmm6, xmm1); + punpckldq(xmm4, xmm7); - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + mov(ebx, ptr[&lod_i->u32[2]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - ReadTexel(xmm4, xmm5, 0); - // psrldq(xmm5, 4); + ReadTexel_SSE(xmm1, xmm5, 0); + psrldq(xmm5, 4); + ReadTexel_SSE(xmm7, xmm2, 0); + psrldq(xmm2, 4); - punpckldq(xmm1, xmm4); + mov(ebx, ptr[&lod_i->u32[3]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm5, xmm5, 0); + ReadTexel_SSE(xmm2, xmm2, 0); + + punpckldq(xmm1, xmm5); + punpckldq(xmm7, xmm2); + + punpcklqdq(xmm6, xmm1); + punpcklqdq(xmm4, xmm7); + + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm1, xmm0, 0); + psrldq(xmm0, 4); + ReadTexel_SSE(xmm5, xmm3, 0); + psrldq(xmm3, 4); + + mov(ebx, ptr[&lod_i->u32[1]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm2, xmm0, 0); + 
psrldq(xmm0, 4); + ReadTexel_SSE(xmm7, xmm3, 0); + psrldq(xmm3, 4); + + punpckldq(xmm1, xmm2); + punpckldq(xmm5, xmm7); + + mov(ebx, ptr[&lod_i->u32[2]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm2, xmm0, 0); + psrldq(xmm0, 4); + ReadTexel_SSE(xmm7, xmm3, 0); + psrldq(xmm3, 4); + + mov(ebx, ptr[&lod_i->u32[3]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm0, xmm0, 0); + ReadTexel_SSE(xmm3, xmm3, 0); + + punpckldq(xmm2, xmm0); + punpckldq(xmm7, xmm3); + + punpcklqdq(xmm1, xmm2); + punpcklqdq(xmm5, xmm7); + + movdqa(xmm7, ptr[&m_local.temp.test]); + } + else + { + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm6, xmm5, 0); + psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation) + + mov(ebx, ptr[&lod_i->u32[1]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm1, xmm5, 0); + psrldq(xmm5, 4); + + punpckldq(xmm6, xmm1); + + mov(ebx, ptr[&lod_i->u32[2]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm1, xmm5, 0); + psrldq(xmm5, 4); + + mov(ebx, ptr[&lod_i->u32[3]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel_SSE(xmm4, xmm5, 0); + // psrldq(xmm5, 4); + + punpckldq(xmm1, xmm4); + + punpcklqdq(xmm6, xmm1); + } - punpcklqdq(xmm6, xmm1); } - - #endif } else { @@ -3102,55 +3038,50 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - #if _M_SSE >= 0x401 - - for(int i = 0; i < pixels; i++) + if(g_cpu.has(util::Cpu::tSSE41)) { - for(int j = 0; j < 4; j++) + for(int i = 0; i < pixels; i++) { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + for(int j = 0; j < 4; j++) + { + ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); + } } + + } else { + const int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; + + for(int i = 0; i < pixels; i++) + { + const Xmm& addr = Xmm(r[i * 2 + 0]); + const Xmm& dst = 
Xmm(r[i * 2 + 1]); + const Xmm& temp1 = Xmm(t[i * 2 + 0]); + const Xmm& temp2 = Xmm(t[i * 2 + 1]); + + ReadTexel_SSE(dst, addr, 0); + psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) + ReadTexel_SSE(temp1, addr, 0); + psrldq(addr, 4); + punpckldq(dst, temp1); + + ReadTexel_SSE(temp1, addr, 0); + psrldq(addr, 4); + ReadTexel_SSE(temp2, addr, 0); + // psrldq(addr, 4); + punpckldq(temp1, temp2); + + punpcklqdq(dst, temp1); + } + } - - #else - - const int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; - - for(int i = 0; i < pixels; i++) - { - const Xmm& addr = Xmm(r[i * 2 + 0]); - const Xmm& dst = Xmm(r[i * 2 + 1]); - const Xmm& temp1 = Xmm(t[i * 2 + 0]); - const Xmm& temp2 = Xmm(t[i * 2 + 1]); - - ReadTexel(dst, addr, 0); - psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) - ReadTexel(temp1, addr, 0); - psrldq(addr, 4); - punpckldq(dst, temp1); - - ReadTexel(temp1, addr, 0); - psrldq(addr, 4); - ReadTexel(temp2, addr, 0); - // psrldq(addr, 4); - punpckldq(temp1, temp2); - - punpcklqdq(dst, temp1); - } - - #endif } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i) { const Address& src = m_sel.tlu ? 
ptr[edx + eax * 4] : ptr[ebx + eax * 4]; - #if _M_SSE < 0x401 - - ASSERT(i == 0); - - #endif + ASSERT(i == 0 || g_cpu.has(util::Cpu::tSSE41)); if(i == 0) movd(eax, addr); else pextrd(eax, addr, i); diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index ab2f065798..5b6cc281d9 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -26,6 +26,8 @@ #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" +#include "GSScanlineEnvironment.h" + template class GSFunctionMap { protected: @@ -161,6 +163,7 @@ class GSCodeGeneratorFunctionMap : public GSFunctionMap void* m_param; hash_map m_cgmap; GSCodeBuffer m_cb; + size_t m_total_code_size; enum {MAX_SIZE = 8192}; @@ -168,9 +171,15 @@ public: GSCodeGeneratorFunctionMap(const char* name, void* param) : m_name(name) , m_param(param) + , m_total_code_size(0) { } + ~GSCodeGeneratorFunctionMap() + { + fprintf(stderr, "%s generated %zu bytes of instruction\n", m_name.c_str(), m_total_code_size); + } + VALUE GetDefaultFunction(KEY key) { VALUE ret = NULL; @@ -183,10 +192,19 @@ public: } else { - CG* cg = new CG(m_param, key, m_cb.GetBuffer(MAX_SIZE), MAX_SIZE); + void* code_ptr = m_cb.GetBuffer(MAX_SIZE); + CG* cg = new CG(m_param, key, code_ptr, MAX_SIZE); ASSERT(cg->getSize() < MAX_SIZE); +#if 0 + fprintf(stderr, "%s Location:%p Size:%zu Key:%llx\n", m_name.c_str(), code_ptr, cg->getSize(), (uint64)key); + GSScanlineSelector sel(key); + sel.Print(); +#endif + + m_total_code_size += cg->getSize(); + m_cb.ReleaseBuffer(cg->getSize()); ret = (VALUE)cg->getCode(); diff --git a/plugins/GSdx/GSLzma.cpp b/plugins/GSdx/GSLzma.cpp index 5e29a0bc79..d99a94ff65 100644 --- a/plugins/GSdx/GSLzma.cpp +++ b/plugins/GSdx/GSLzma.cpp @@ -173,7 +173,7 @@ void GSDumpRaw::Read(void* ptr, size_t size) { } else { size_t ret = fread(ptr, 1, size, m_fp); if (ret != size) { - fprintf(stderr, "GSDumpRaw:: Read error (%d/%d)\n", ret, size); + fprintf(stderr, "GSDumpRaw:: Read error (%zu/%zu)\n", ret, size); 
throw "BAD"; // Just exit the program } } diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index e9f0551e40..115c10a3db 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -69,6 +69,8 @@ union GSScanlineSelector uint32 mmin:2; // 53 uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction + + uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program }; struct @@ -76,6 +78,7 @@ union GSScanlineSelector uint32 _pad1:22; uint32 ababcd:8; uint32 _pad2:2; + uint32 fb:2; uint32 _pad3:1; uint32 zb:2; @@ -89,6 +92,9 @@ union GSScanlineSelector uint64 key; + GSScanlineSelector() = default; + GSScanlineSelector(uint64 k) : key(k) {} + operator uint32() const {return lo;} operator uint64() const {return key;} @@ -103,6 +109,18 @@ union GSScanlineSelector && date == 0 && fge == 0; } + + void Print() const + { + fprintf(stderr, "fpsm:%d zpsm:%d ztst:%d ztest:%d atst:%d afail:%d iip:%d rfb:%d fb:%d zb:%d zw:%d " + "tfx:%d tcc:%d fst:%d ltf:%d tlu:%d wms:%d wmt:%d mmin:%d lcm:%d tw:%d " + "fba:%d cclamp:%d date:%d datm:%d " + "prim:%d abe:%d %d%d%d%d fge:%d dthe:%d notest:%d\n", + fpsm, zpsm, ztst, ztest, atst, afail, iip, rfb, fb, zb, zwrite, + tfx, tcc, fst, ltf, tlu, wms, wmt, mmin, lcm, tw, + fba, colclamp, date, datm, + prim, abe, aba, abb, abc, abd , fge, dthe, notest); + } }; struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp index 37427898f8..3735ec9a41 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp @@ -22,6 +22,8 @@ #include "stdafx.h" #include 
"GSSetupPrimCodeGenerator.h" +using namespace Xbyak; + #if _M_SSE >= 0x501 GSVector8 GSSetupPrimCodeGenerator::m_shift[9]; #else @@ -75,3 +77,14 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void Generate(); } + +#if _M_SSE >= 0x501 +#else +void GSSetupPrimCodeGenerator::Generate() +{ + if(g_cpu.has(util::Cpu::tAVX)) + Generate_AVX(); + else + Generate_SSE(); +} +#endif diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.h b/plugins/GSdx/GSSetupPrimCodeGenerator.h index d4b2c1106f..e07eb00334 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.h +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.h @@ -23,6 +23,7 @@ #include "GSScanlineEnvironment.h" #include "GSFunctionMap.h" +#include "GSUtil.h" class GSSetupPrimCodeGenerator : public GSCodeGenerator { @@ -35,9 +36,21 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator void Generate(); +#if _M_SSE < 0x501 + void Generate_SSE(); + void Depth_SSE(); + void Texture_SSE(); + void Color_SSE(); + + void Generate_AVX(); + void Depth_AVX(); + void Texture_AVX(); + void Color_AVX(); +#else void Depth(); void Texture(); void Color(); +#endif public: GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp index 5fe710dad3..f8f88022af 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp @@ -23,44 +23,48 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_AVX() { +#ifdef _WIN64 sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); +#endif - mov(r8, (size_t)&m_local); + mov(t0, (size_t)&m_local); if((m_en.z || m_en.f) && 
m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { mov(rax, (size_t)&m_shift[0]); - for(int i = 0; i < 5; i++) + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) { vmovaps(Xmm(3 + i), ptr[rax + i * 16]); } } - Depth(); + Depth_AVX(); - Texture(); + Texture_AVX(); - Color(); + Color_AVX(); +#ifdef _WIN64 vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); add(rsp, 8 + 2 * 16); +#endif ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_AVX() { if(!m_en.z && !m_en.f) { @@ -71,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -85,9 +89,9 @@ void GSSetupPrimCodeGenerator::Depth() vcvttps2dq(xmm2, xmm2); vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -97,7 +101,7 @@ void GSSetupPrimCodeGenerator::Depth() vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm2); + vmovdqa(ptr[t0 + variableOffset], xmm2); } } @@ -110,24 +114,28 @@ void GSSetupPrimCodeGenerator::Depth() // m_local.d4.z = dz * 4.0f; vmulps(xmm1, xmm0, xmm3); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; vmulps(xmm1, xmm0, Xmm(4 + i)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm1); + vmovdqa(ptr[t0 + variableOffset], xmm1); } } } else { - // GSVector4 p = vertices[0].p; + // GSVector4 p = vertex[index[1]].p; - vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); + mov(eax, ptr[a1 + sizeof(uint32) * 1]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + + vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -136,46 +144,21 @@ void GSSetupPrimCodeGenerator::Depth() vcvttps2dq(xmm1, xmm0); vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1); } if(m_en.z) { - // GSVector4 z = p.zzzz(); + // uint32 z is bypassed in t.w - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(r9, (size_t)&GSVector4::m_half); - - vbroadcastss(xmm1, ptr[r9]); - vmulps(xmm1, xmm0); - vcvttps2dq(xmm1, xmm1); - vpslld(xmm1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm0, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } - - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); + vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); + vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0); } } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_AVX() { if(!m_en.t) { @@ -184,7 +167,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, 
t)]); vmulps(xmm1, xmm0, xmm3); @@ -194,13 +177,13 @@ void GSSetupPrimCodeGenerator::Texture() vcvttps2dq(xmm1, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } else { // m_local.d4.stq = t * 4.0f; - vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + vmovaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) @@ -211,7 +194,7 @@ void GSSetupPrimCodeGenerator::Texture() vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -228,8 +211,8 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: vmovdqa(ptr[r8 + variableOffsetS], xmm2); break; - case 1: vmovdqa(ptr[r8 + variableOffsetT], xmm2); break; + case 0: vmovdqa(ptr[t0 + variableOffsetS], xmm2); break; + case 1: vmovdqa(ptr[t0 + variableOffsetT], xmm2); break; } } else @@ -242,16 +225,16 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: vmovaps(ptr[r8 + variableOffsetS], xmm2); break; - case 1: vmovaps(ptr[r8 + variableOffsetT], xmm2); break; - case 2: vmovaps(ptr[r8 + variableOffsetQ], xmm2); break; + case 0: vmovaps(ptr[t0 + variableOffsetS], xmm2); break; + case 1: vmovaps(ptr[t0 + variableOffsetT], xmm2); break; + case 2: vmovaps(ptr[t0 + variableOffsetQ], xmm2); break; } } } } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_AVX() { if(!m_en.c) { @@ -262,7 +245,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -270,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color() vcvttps2dq(xmm1, xmm1); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); vpackssdw(xmm1, xmm1); - vmovdqa(ptr[r8 + 
offsetof(GSScanlineLocalData, d4.c)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm1); // xmm3 is not needed anymore @@ -299,12 +282,12 @@ void GSSetupPrimCodeGenerator::Color() vpunpcklwd(xmm0, xmm1); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm0); + vmovdqa(ptr[t0 + variableOffset], xmm0); } // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it + vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it // GSVector4 dg = c.yyyy(); // GSVector4 da = c.wwww(); @@ -312,7 +295,7 @@ void GSSetupPrimCodeGenerator::Color() vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); @@ -331,14 +314,31 @@ void GSSetupPrimCodeGenerator::Color() vpunpcklwd(xmm0, xmm1); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); - vmovdqa(ptr[r8 + variableOffset], xmm0); + vmovdqa(ptr[t0 + variableOffset], xmm0); } } else { - // GSVector4i c = GSVector4i(vertices[0].c); + // GSVector4i c = GSVector4i(vertex[index[last].c); - vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); + int last = 0; + + switch(m_sel.prim) + { + case GS_POINT_CLASS: last = 0; break; + case GS_LINE_CLASS: last = 1; break; + case GS_TRIANGLE_CLASS: last = 2; break; + case GS_SPRITE_CLASS: last = 1; break; + } + + if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() + { + mov(eax, ptr[a1 + sizeof(uint32) * last]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + } + + vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); @@ -358,8 +358,8 @@ void 
GSSetupPrimCodeGenerator::Color() vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2); } } diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp index 6456ead387..b4169e766f 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp @@ -23,42 +23,48 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_SSE() { +#ifdef _WIN64 sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); +#endif - mov(r8, (size_t)&m_local); + mov(t0, (size_t)&m_local); if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { - for(int i = 0; i < 5; i++) + mov(rax, (size_t)&m_shift[0]); + + for(int i = 0; i < (m_sel.notest ? 
2 : 5); i++) { movaps(Xmm(3 + i), ptr[rax + i * 16]); } } - Depth(); + Depth_SSE(); - Texture(); + Texture_SSE(); - Color(); + Color_SSE(); +#ifdef _WIN64 vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); add(rsp, 8 + 2 * 16); +#endif ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_SSE() { if(!m_en.z && !m_en.f) { @@ -69,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -85,9 +91,9 @@ void GSSetupPrimCodeGenerator::Depth() cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -98,7 +104,7 @@ void GSSetupPrimCodeGenerator::Depth() pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm2); + movdqa(ptr[t0 + variableOffset], xmm2); } } @@ -112,9 +118,9 @@ void GSSetupPrimCodeGenerator::Depth() movaps(xmm1, xmm0); mulps(xmm1, xmm3); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; @@ -122,15 +128,19 @@ void GSSetupPrimCodeGenerator::Depth() mulps(xmm1, Xmm(4 + i)); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm1); + movdqa(ptr[t0 + variableOffset], xmm1); } } } else { - // GSVector4 p = vertices[0].p; + // GSVector4 p = vertex[index[1]].p; - movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); + mov(eax, ptr[a1 + sizeof(uint32) * 1]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + + movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -139,47 +149,21 @@ void GSSetupPrimCodeGenerator::Depth() cvttps2dq(xmm1, xmm0); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1); } if(m_en.z) { - // GSVector4 z = p.zzzz(); + // uint32 z is bypassed in t.w - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(r9, (size_t)&GSVector4::m_half); - - movss(xmm1, ptr[r9]); - shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); - mulps(xmm1, xmm0); - cvttps2dq(xmm1, xmm1); - pslld(xmm1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(xmm2, xmm2); - psrld(xmm2, 31); - pand(xmm0, xmm2); - - por(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); + vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); + vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0); } } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_SSE() { if(!m_en.t) { @@ -188,7 +172,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - movaps(xmm0, ptr[rdx + 
offsetof(GSVertexSW, t)]); + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]); movaps(xmm1, xmm0); mulps(xmm1, xmm3); @@ -199,13 +183,13 @@ void GSSetupPrimCodeGenerator::Texture() cvttps2dq(xmm1, xmm1); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } else { // m_local.d4.stq = t * 4.0f; - movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) @@ -217,7 +201,7 @@ void GSSetupPrimCodeGenerator::Texture() movaps(xmm1, xmm0); shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -235,8 +219,8 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: movdqa(ptr[r8 + variableOffsetS], xmm2); break; - case 1: movdqa(ptr[r8 + variableOffsetT], xmm2); break; + case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break; + case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break; } } else @@ -249,16 +233,16 @@ void GSSetupPrimCodeGenerator::Texture() switch(j) { - case 0: movaps(ptr[r8 + variableOffsetS], xmm2); break; - case 1: movaps(ptr[r8 + variableOffsetT], xmm2); break; - case 2: movaps(ptr[r8 + variableOffsetQ], xmm2); break; + case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break; + case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break; + case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break; } } } } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_SSE() { if(!m_en.c) { @@ -269,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); movaps(xmm1, xmm0); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -279,7 +263,7 @@ void GSSetupPrimCodeGenerator::Color() cvttps2dq(xmm2, xmm2); 
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); packssdw(xmm2, xmm2); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2); // xmm3 is not needed anymore @@ -289,7 +273,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); @@ -310,12 +294,12 @@ void GSSetupPrimCodeGenerator::Color() punpcklwd(xmm2, xmm3); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm2); + movdqa(ptr[t0 + variableOffset], xmm2); } // GSVector4 c = dscan.c; - movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it + movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it movaps(xmm1, xmm0); // GSVector4 dg = c.yyyy(); @@ -324,7 +308,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 
1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); @@ -345,14 +329,31 @@ void GSSetupPrimCodeGenerator::Color() punpcklwd(xmm2, xmm3); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); - movdqa(ptr[r8 + variableOffset], xmm2); + movdqa(ptr[t0 + variableOffset], xmm2); } } else { - // GSVector4i c = GSVector4i(vertices[0].c); + // GSVector4i c = GSVector4i(vertex[index[last].c); - cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); + int last = 0; + + switch(m_sel.prim) + { + case GS_POINT_CLASS: last = 0; break; + case GS_LINE_CLASS: last = 1; break; + case GS_TRIANGLE_CLASS: last = 2; break; + case GS_SPRITE_CLASS: last = 1; break; + } + + if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() + { + mov(eax, ptr[a1 + sizeof(uint32) * last]); + shl(eax, 6); // * sizeof(GSVertexSW) + add(rax, a0); + } + + cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); @@ -372,8 +373,8 @@ void GSSetupPrimCodeGenerator::Color() pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1); + movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2); } } diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index 21a7d47c97..f75ea3b6d2 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -23,7 +23,7 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; @@ -32,7 +32,7 @@ static const int _vertex = _args 
+ 4; static const int _index = _args + 8; static const int _dscan = _args + 12; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_AVX() { if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { @@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate() } } - Depth(); + Depth_AVX(); - Texture(); + Texture_AVX(); - Color(); + Color_AVX(); ret(); } -void GSSetupPrimCodeGenerator::Depth() +void GSSetupPrimCodeGenerator::Depth_AVX() { if(!m_en.z && !m_en.f) { @@ -144,7 +144,7 @@ void GSSetupPrimCodeGenerator::Depth() } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_AVX() { if(!m_en.t) { @@ -213,7 +213,7 @@ void GSSetupPrimCodeGenerator::Texture() } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_AVX() { if(!m_en.c) { @@ -339,4 +339,4 @@ void GSSetupPrimCodeGenerator::Color() } } -#endif \ No newline at end of file +#endif diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 008a12a8f5..951788fa09 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -23,7 +23,7 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; @@ -32,7 +32,7 @@ static const int _vertex = _args + 4; static const int _index = _args + 8; static const int _dscan = _args + 12; -void GSSetupPrimCodeGenerator::Generate() +void GSSetupPrimCodeGenerator::Generate_SSE() { if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) { @@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate() } } - Depth(); + Depth_SSE(); - Texture(); + Texture_SSE(); - Color(); + Color_SSE(); ret(); } -void GSSetupPrimCodeGenerator::Depth() +void 
GSSetupPrimCodeGenerator::Depth_SSE() { if(!m_en.z && !m_en.f) { @@ -149,7 +149,7 @@ void GSSetupPrimCodeGenerator::Depth() } } -void GSSetupPrimCodeGenerator::Texture() +void GSSetupPrimCodeGenerator::Texture_SSE() { if(!m_en.t) { @@ -221,7 +221,7 @@ void GSSetupPrimCodeGenerator::Texture() } } -void GSSetupPrimCodeGenerator::Color() +void GSSetupPrimCodeGenerator::Color_SSE() { if(!m_en.c) { @@ -354,4 +354,4 @@ void GSSetupPrimCodeGenerator::Color() } } -#endif \ No newline at end of file +#endif diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 166aef74bc..227fd16128 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -65,8 +65,8 @@ GSState::GSState() m_dump_root = ""; #if defined(__unix__) if (s_dump) { - GSmkdir("/tmp/GS_HW_dump"); - GSmkdir("/tmp/GS_SW_dump"); + GSmkdir(root_hw.c_str()); + GSmkdir(root_sw.c_str()); } #endif diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index d710057489..030c6d4660 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -20,9 +20,7 @@ */ #include "stdafx.h" -#include "GS.h" #include "GSUtil.h" -#include "xbyak/xbyak_util.h" #ifdef _WIN32 #include "GSDeviceDX.h" @@ -33,6 +31,8 @@ #define SVN_MODS 0 #endif +Xbyak::util::Cpu g_cpu; + const char* GSUtil::GetLibName() { // The following ifdef mess is courtesy of "static string str;" @@ -203,38 +203,41 @@ bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm) bool GSUtil::CheckSSE() { - Xbyak::util::Cpu cpu; - Xbyak::util::Cpu::Type type; - const char* instruction_set = ""; + bool status = true; - #if _M_SSE >= 0x501 - type = Xbyak::util::Cpu::tAVX2; - instruction_set = "AVX2"; - #elif _M_SSE >= 0x500 - type = Xbyak::util::Cpu::tAVX; - instruction_set = "AVX"; - #elif _M_SSE >= 0x402 - type = Xbyak::util::Cpu::tSSE42; - instruction_set = "SSE4.2"; - #elif _M_SSE >= 0x401 - type = Xbyak::util::Cpu::tSSE41; - instruction_set = "SSE4.1"; - #elif _M_SSE >= 0x301 - type = Xbyak::util::Cpu::tSSSE3; - instruction_set 
= "SSSE3"; - #elif _M_SSE >= 0x200 - type = Xbyak::util::Cpu::tSSE2; - instruction_set = "SSE2"; - #endif + struct ISA { + Xbyak::util::Cpu::Type type; + const char* name; + }; - if(!cpu.has(type)) - { - fprintf(stderr, "This CPU does not support %s\n", instruction_set); + ISA checks[] = { + {Xbyak::util::Cpu::tSSE2, "SSE2"}, +#if _M_SSE >= 0x301 + {Xbyak::util::Cpu::tSSSE3, "SSSE3"}, +#endif +#if _M_SSE >= 0x401 + {Xbyak::util::Cpu::tSSE41, "SSE41"}, +#endif +#if _M_SSE >= 0x402 + {Xbyak::util::Cpu::tSSE42, "SSE42"}, +#endif +#if _M_SSE >= 0x500 + {Xbyak::util::Cpu::tAVX, "AVX1"}, +#endif +#if _M_SSE >= 0x501 + {Xbyak::util::Cpu::tAVX2, "AVX2"}, +#endif + }; - return false; + for (size_t i = 0; i < countof(checks); i++) { + if(!g_cpu.has(checks[i].type)) { + fprintf(stderr, "This CPU does not support %s\n", checks[i].name); + + status = false; + } } - return true; + return status; } #define OCL_PROGRAM_VERSION 3 diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index 94552fbaf4..2676a5ca61 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -22,6 +22,7 @@ #pragma once #include "GS.h" +#include "xbyak/xbyak_util.h" struct OCLDeviceDesc { @@ -71,3 +72,5 @@ void GSmkdir(const char* dir); #endif const char* psm_str(int psm); + +extern Xbyak::util::Cpu g_cpu; diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj index 46db6228ec..b44b3374d1 100644 --- a/plugins/GSdx/GSdx.vcxproj +++ b/plugins/GSdx/GSdx.vcxproj @@ -146,26 +146,11 @@ - - true - true - - - true - true - - - true - true - - - true - true - - - true - true - + + + + + @@ -187,26 +172,11 @@ - - true - true - - - true - true - - - true - true - - - true - true - - - true - true - + + + + + diff --git a/plugins/GSdx/stdafx.cpp b/plugins/GSdx/stdafx.cpp index abb479d3d6..52d63aadab 100644 --- a/plugins/GSdx/stdafx.cpp +++ b/plugins/GSdx/stdafx.cpp @@ -138,7 +138,7 @@ void* fifo_alloc(size_t size, size_t repeat) if (next != base) fprintf(stderr, "Fail to mmap contiguous 
segment\n"); else - fprintf(stderr, "MMAP next %x\n", (uintptr_t)base); + fprintf(stderr, "MMAP next %p\n", base); } return fifo; diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 776d271c56..b4e618f64c 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -266,28 +266,46 @@ using namespace stdext; #define ASSERT assert #ifdef __x86_64__ - #define _M_AMD64 +#endif +#ifdef _M_AMD64 + // Yeah let use mips naming ;) + #ifdef _WIN64 + #define a0 rcx + #define a1 rdx + #define a2 r8 + #define a3 r9 + #define t0 rdi + #define t1 rsi + #else + #define a0 rdi + #define a1 rsi + #define a2 rdx + #define a3 rcx + #define t0 r8 + #define t1 r9 + #endif #endif // sse -#if defined(__GNUC__) && !defined(__x86_64__) +#if defined(__GNUC__) + // Convert gcc see define into GSdx (windows) define #if defined(__AVX2__) - #define _M_SSE 0x501 + #if defined(__x86_64__) + #define _M_SSE 0x500 // TODO + #else + #define _M_SSE 0x501 + #endif #elif defined(__AVX__) #define _M_SSE 0x500 -#elif defined(__SSE4_2__) - #define _M_SSE 0x402 #elif defined(__SSE4_1__) #define _M_SSE 0x401 #elif defined(__SSSE3__) #define _M_SSE 0x301 #elif defined(__SSE2__) #define _M_SSE 0x200 -#elif defined(__SSE__) - #define _M_SSE 0x100 #endif #endif @@ -411,11 +429,11 @@ extern void vmfree(void* ptr, size_t size); extern void* fifo_alloc(size_t size, size_t repeat); extern void fifo_free(void* ptr, size_t size, size_t repeat); -#ifdef _WIN32 +#ifdef ENABLE_VTUNE - #ifdef ENABLE_VTUNE + #include "jitprofiling.h" - #include + #ifdef _WIN32 #pragma comment(lib, "jitprofiling.lib") @@ -472,6 +490,11 @@ struct GLAutoPop { const std::string root_sw("c:\\temp1\\_"); const std::string root_hw("c:\\temp2\\_"); #else -const std::string root_sw("/tmp/GS_SW_dump/"); -const std::string root_hw("/tmp/GS_HW_dump/"); +#ifdef _M_AMD64 +const std::string root_sw("/tmp/GS_SW_dump64/"); +const std::string root_hw("/tmp/GS_HW_dump64/"); +#else +const std::string root_sw("/tmp/GS_SW_dump32/"); +const 
std::string root_hw("/tmp/GS_HW_dump32/"); +#endif #endif diff --git a/plugins/GSdx/xbyak/xbyak_mnemonic.h b/plugins/GSdx/xbyak/xbyak_mnemonic.h index d551c61323..3eda2556c1 100644 --- a/plugins/GSdx/xbyak/xbyak_mnemonic.h +++ b/plugins/GSdx/xbyak/xbyak_mnemonic.h @@ -370,6 +370,7 @@ void cwde() { db(0x98); } void lahf() { db(0x9F); } void lock() { db(0xF0); } void nop() { db(0x90); } +void int3() { db(0xCC); } void sahf() { db(0x9E); } void stc() { db(0xF9); } void std() { db(0xFD); }