Merge pull request #1664 from PCSX2/greg/gsdx-64b

Greg/gsdx 64b
This commit is contained in:
Gregory Hainaut 2016-11-19 18:12:41 +01:00 committed by GitHub
commit 58c3794ce7
25 changed files with 1624 additions and 1348 deletions

View File

@ -80,6 +80,7 @@ for ARG in "$@"; do
--cross-multilib ) flags="$flags -DCMAKE_TOOLCHAIN_FILE=$toolfile"; useCross=1; ;; --cross-multilib ) flags="$flags -DCMAKE_TOOLCHAIN_FILE=$toolfile"; useCross=1; ;;
--no-cross-multilib ) useCross=0; ;; --no-cross-multilib ) useCross=0; ;;
--coverity ) CoverityBuild=1; cleanBuild=1; ;; --coverity ) CoverityBuild=1; cleanBuild=1; ;;
--vtune ) flags="$flags -DUSE_VTUNE=TRUE" ;;
-D* ) flags="$flags $ARG" ;; -D* ) flags="$flags $ARG" ;;
*) *)

View File

@ -28,6 +28,8 @@ if(DISABLE_BUILD_DATE OR openSUSE)
add_definitions(-DDISABLE_BUILD_DATE) add_definitions(-DDISABLE_BUILD_DATE)
endif() endif()
option(USE_VTUNE "Plug VTUNE to profile GSdx JIT.")
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# Graphical option # Graphical option
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
@ -234,7 +236,7 @@ elseif(${PCSX2_TARGET_ARCHITECTURES} MATCHES "x86_64")
if (USE_ICC) if (USE_ICC)
set(ARCH_FLAG "-msse2") set(ARCH_FLAG "-msse2")
else() else()
set(ARCH_FLAG "-msse -msse2 -mfxsr") set(ARCH_FLAG "-msse -msse2 -mfxsr -mssse3 -msse4.1 -mavx")
endif() endif()
else() else()
#set(ARCH_FLAG "-march=native -fabi-version=6") #set(ARCH_FLAG "-march=native -fabi-version=6")

View File

@ -195,15 +195,18 @@ set(GSdxFinalLibs
) )
if(EGL_API AND EGL_FOUND) if(EGL_API AND EGL_FOUND)
set(GSdxFinalLibs ${GSdxFinalLibs} set(GSdxFinalLibs ${GSdxFinalLibs} ${EGL_LIBRARIES})
${EGL_LIBRARIES}
)
endif() endif()
if(LIBLZMA_FOUND) if(LIBLZMA_FOUND)
set(GSdxFinalLibs ${GSdxFinalLibs} set(GSdxFinalLibs ${GSdxFinalLibs} ${LIBLZMA_LIBRARIES})
${LIBLZMA_LIBRARIES} endif()
)
if(USE_VTUNE)
set(GSdxFinalFlags ${GSdxFinalFlags} -DENABLE_VTUNE)
include_directories("$ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/include")
set(GSdxFinalLibs ${GSdxFinalLibs} $ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/lib64/libjitprofiling.a)
set(GSdxFinalLibs ${GSdxFinalLibs} $ENV{VTUNE_AMPLIFIER_XE_2016_DIR}/lib32/libjitprofiling.a)
endif() endif()
# Generate Glsl header file. Protect with REBUILD_SHADER to avoid build-dependency on PERL # Generate Glsl header file. Protect with REBUILD_SHADER to avoid build-dependency on PERL

View File

@ -22,6 +22,17 @@
#include "stdafx.h" #include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
#if _M_SSE >= 0x501
#else
void GSDrawScanlineCodeGenerator::Generate()
{
if(g_cpu.has(util::Cpu::tAVX))
Generate_AVX();
else
Generate_SSE();
}
#endif
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] = alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] =
@ -100,6 +111,9 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
{ {
m_sel.key = key; m_sel.key = key;
if(m_sel.breakpoint)
int3();
Generate(); Generate();
} }
@ -180,196 +194,222 @@ void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift) void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
if(shift == 0)
{ {
vpmulhrsw(a, f); if(shift == 0)
{
vpmulhrsw(a, f);
}
else
{
vpsllw(a, shift + 1);
vpmulhw(a, f);
}
} }
else else
{ {
vpsllw(a, shift + 1); if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
vpmulhw(a, f); {
pmulhrsw(a, f);
}
else
{
psllw(a, shift + 1);
pmulhw(a, f);
}
} }
#else
if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
{
pmulhrsw(a, f);
}
else
{
psllw(a, shift + 1);
pmulhw(a, f);
}
#endif
} }
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift) void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpsubw(a, b); vpsubw(a, b);
modulate16(a, f, shift); modulate16(a, f, shift);
vpaddw(a, b); vpaddw(a, b);
}
#else else
{
psubw(a, b); psubw(a, b);
modulate16(a, f, shift); modulate16(a, f, shift);
paddw(a, b); paddw(a, b);
}
#endif
} }
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f) void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpsubw(a, b); vpsubw(a, b);
vpmullw(a, f); vpmullw(a, f);
vpsraw(a, 4); vpsraw(a, 4);
vpaddw(a, b); vpaddw(a, b);
}
#else else
{
psubw(a, b); psubw(a, b);
pmullw(a, f); pmullw(a, f);
psraw(a, 4); psraw(a, 4);
paddw(a, b); paddw(a, b);
}
#endif
} }
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp) void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpblendw(a, b, 0xaa); vpblendw(a, b, 0xaa);
}
#elif _M_SSE >= 0x401 else
{
pblendw(a, b, 0xaa); if(g_cpu.has(util::Cpu::tSSE41))
{
#else pblendw(a, b, 0xaa);
}
pcmpeqd(temp, temp); else
psrld(temp, 16); {
pand(a, temp); pcmpeqd(temp, temp);
pandn(temp, b); psrld(temp, 16);
por(a, temp); pand(a, temp);
pandn(temp, b);
#endif por(a, temp);
}
}
} }
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpackuswb(a, a); vpackuswb(a, a);
vpmovzxbw(a, a); vpmovzxbw(a, a);
}
#elif _M_SSE >= 0x401 else
{
packuswb(a, a); if(g_cpu.has(util::Cpu::tSSE41))
pmovzxbw(a, a); {
packuswb(a, a);
#else pmovzxbw(a, a);
}
packuswb(a, a); else
pxor(temp, temp); {
punpcklbw(a, temp); packuswb(a, a);
pxor(temp, temp);
#endif punpcklbw(a, temp);
}
}
} }
void GSDrawScanlineCodeGenerator::alltrue() void GSDrawScanlineCodeGenerator::alltrue()
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpmovmskb(eax, xmm7); vpmovmskb(eax, xmm7);
cmp(eax, 0xffff); cmp(eax, 0xffff);
je("step", T_NEAR); je("step", T_NEAR);
}
#else else
{
pmovmskb(eax, xmm7); pmovmskb(eax, xmm7);
cmp(eax, 0xffff); cmp(eax, 0xffff);
je("step", T_NEAR); je("step", T_NEAR);
}
#endif
} }
void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask) void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpand(b, mask); vpand(b, mask);
vpandn(mask, a); vpandn(mask, a);
vpor(a, b, mask); vpor(a, b, mask);
}
#else else
{
pand(b, mask); pand(b, mask);
pandn(mask, a); pandn(mask, a);
por(b, mask); por(b, mask);
movdqa(a, b); movdqa(a, b);
}
#endif
} }
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask) void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpand(b, mask); vpand(b, mask);
vpandn(mask, a); vpandn(mask, a);
vpor(b, mask); vpor(b, mask);
}
#else else
{
pand(b, mask); pand(b, mask);
pandn(mask, a); pandn(mask, a);
por(b, mask); por(b, mask);
}
#endif
} }
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
vpblendvb(a, a, b, xmm0);
vpblendvb(a, a, b, xmm0); else if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(a, b);
#elif _M_SSE >= 0x401 else
blend(a, b, xmm0);
pblendvb(a, b);
#else
blend(a, b, xmm0);
#endif
} }
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpblendvb(b, a, b, xmm0); vpblendvb(b, a, b, xmm0);
}
else if(g_cpu.has(util::Cpu::tSSE41))
{
pblendvb(a, b);
movdqa(b, a);
}
else
{
blendr(b, a, xmm0);
}
}
#elif _M_SSE >= 0x401 void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
{
// l = src & 0xFF; (1 left shift + 1 right shift)
// h = (src >> 8) & 0xFF; (1 right shift)
pblendvb(a, b); if(g_cpu.has(util::Cpu::tAVX))
movdqa(b, a); {
if (src == h) {
#else vpsllw(l, src, 8);
vpsrlw(h, 8);
blendr(b, a, xmm0); } else if (src == l) {
vpsrlw(h, src, 8);
#endif vpsllw(l, 8);
} else {
vpsllw(l, src, 8);
vpsrlw(h, src, 8);
}
vpsrlw(l, 8);
}
else
{
if (src == h) {
movdqa(l, src);
} else if (src == l) {
movdqa(h, src);
} else {
movdqa(l, src);
movdqa(h, src);
}
psllw(l, 8);
psrlw(l, 8);
psrlw(h, 8);
}
} }
#endif #endif

View File

@ -23,9 +23,16 @@
#include "GSScanlineEnvironment.h" #include "GSScanlineEnvironment.h"
#include "GSFunctionMap.h" #include "GSFunctionMap.h"
#include "GSUtil.h"
using namespace Xbyak; using namespace Xbyak;
#if defined(_M_AMD64) || defined(_WIN64)
#define RegLong Reg64
#else
#define RegLong Reg32
#endif
class GSDrawScanlineCodeGenerator : public GSCodeGenerator class GSDrawScanlineCodeGenerator : public GSCodeGenerator
{ {
void operator = (const GSDrawScanlineCodeGenerator&); void operator = (const GSDrawScanlineCodeGenerator&);
@ -57,17 +64,9 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void WriteZBuf(); void WriteZBuf();
void AlphaBlend(); void AlphaBlend();
void WriteFrame(); void WriteFrame();
void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr);
#if defined(_M_AMD64) || defined(_WIN64) void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz);
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr); void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
#else
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
#endif
void ReadTexel(int pixels, int mip_offset = 0); void ReadTexel(int pixels, int mip_offset = 0);
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i); void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
@ -84,39 +83,59 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
#else #else
void Init(); void Generate_SSE();
void Step(); void Init_SSE();
void TestZ(const Xmm& temp1, const Xmm& temp2); void Step_SSE();
void SampleTexture(); void TestZ_SSE(const Xmm& temp1, const Xmm& temp2);
void Wrap(const Xmm& uv0); void SampleTexture_SSE();
void Wrap(const Xmm& uv0, const Xmm& uv1); void Wrap_SSE(const Xmm& uv0);
void SampleTextureLOD(); void Wrap_SSE(const Xmm& uv0, const Xmm& uv1);
void WrapLOD(const Xmm& uv0); void SampleTextureLOD_SSE();
void WrapLOD(const Xmm& uv0, const Xmm& uv1); void WrapLOD_SSE(const Xmm& uv0);
void AlphaTFX(); void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1);
void ReadMask(); void AlphaTFX_SSE();
void TestAlpha(); void ReadMask_SSE();
void ColorTFX(); void TestAlpha_SSE();
void Fog(); void ColorTFX_SSE();
void ReadFrame(); void Fog_SSE();
void TestDestAlpha(); void ReadFrame_SSE();
void WriteMask(); void TestDestAlpha_SSE();
void WriteZBuf(); void WriteMask_SSE();
void AlphaBlend(); void WriteZBuf_SSE();
void WriteFrame(); void AlphaBlend_SSE();
void WriteFrame_SSE();
void ReadPixel_SSE(const Xmm& dst, const RegLong& addr);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_SSE(int pixels, int mip_offset = 0);
void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i);
#if defined(_M_AMD64) || defined(_WIN64) void Generate_AVX();
void ReadPixel(const Xmm& dst, const Reg64& addr); void Init_AVX();
void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz); void Step_AVX();
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm); void TestZ_AVX(const Xmm& temp1, const Xmm& temp2);
#else void SampleTexture_AVX();
void ReadPixel(const Xmm& dst, const Reg32& addr); void Wrap_AVX(const Xmm& uv0);
void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz); void Wrap_AVX(const Xmm& uv0, const Xmm& uv1);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm); void SampleTextureLOD_AVX();
#endif void WrapLOD_AVX(const Xmm& uv0);
void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1);
void ReadTexel(int pixels, int mip_offset = 0); void AlphaTFX_AVX();
void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i); void ReadMask_AVX();
void TestAlpha_AVX();
void ColorTFX_AVX();
void Fog_AVX();
void ReadFrame_AVX();
void TestDestAlpha_AVX();
void WriteMask_AVX();
void WriteZBuf_AVX();
void AlphaBlend_AVX();
void WriteFrame_AVX();
void ReadPixel_AVX(const Xmm& dst, const RegLong& addr);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_AVX(int pixels, int mip_offset = 0);
void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i);
void modulate16(const Xmm& a, const Operand& f, int shift); void modulate16(const Xmm& a, const Operand& f, int shift);
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift); void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift);
@ -128,6 +147,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void blendr(const Xmm& b, const Xmm& a, const Xmm& mask); void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
void blend8(const Xmm& a, const Xmm& b); void blend8(const Xmm& a, const Xmm& b);
void blend8r(const Xmm& b, const Xmm& a); void blend8r(const Xmm& b, const Xmm& a);
void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src);
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@ -22,99 +22,102 @@
#include "stdafx.h" #include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
void GSDrawScanlineCodeGenerator::Generate() // It is useless to port the code to SSEx, better use the faster 32 bits version instead
void GSDrawScanlineCodeGenerator::Generate_SSE()
{
// Avoid a crash if someone want to use it
ret();
}
void GSDrawScanlineCodeGenerator::Init_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::Init() void GSDrawScanlineCodeGenerator::Step_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::Step() void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
{ {
} }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::SampleTexture() void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
{ {
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
{ {
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::AlphaTFX() void GSDrawScanlineCodeGenerator::ReadMask_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::ReadMask() void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::TestAlpha() void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::ColorTFX() void GSDrawScanlineCodeGenerator::Fog_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::Fog() void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::ReadFrame() void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::WriteMask_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::WriteMask() void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::WriteZBuf() void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::AlphaBlend() void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
{ {
} }
void GSDrawScanlineCodeGenerator::WriteFrame() void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg64& addr)
{ {
} }
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
{
}
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
{ {
} }
static const int s_offsets[4] = {0, 2, 8, 10}; static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, uint8 i, int psm)
{ {
} }
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
{ {
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
{ {
} }

View File

@ -23,21 +23,20 @@
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
static const int _args = 16; static const int _args = 16;
static const int _top = _args + 4; static const int _top = _args + 4;
static const int _v = _args + 8; static const int _v = _args + 8;
void GSDrawScanlineCodeGenerator::Generate() void GSDrawScanlineCodeGenerator::Generate_AVX()
{ {
//ret(8);
push(ebx); push(ebx);
push(esi); push(esi);
push(edi); push(edi);
push(ebp); push(ebp);
Init(); Init_AVX();
if(!m_sel.edge) if(!m_sel.edge)
{ {
@ -59,7 +58,7 @@ L("loop");
bool tme = m_sel.tfx != TFX_NONE; bool tme = m_sel.tfx != TFX_NONE;
TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); TestZ_AVX(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -75,11 +74,11 @@ L("loop");
if(m_sel.mmin) if(m_sel.mmin)
{ {
SampleTextureLOD(); SampleTextureLOD_AVX();
} }
else else
{ {
SampleTexture(); SampleTexture_AVX();
} }
// ecx = steps // ecx = steps
@ -93,7 +92,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
AlphaTFX(); AlphaTFX_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -104,7 +103,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ReadMask(); ReadMask_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -117,7 +116,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
TestAlpha(); TestAlpha_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -130,7 +129,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ColorTFX(); ColorTFX_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -142,7 +141,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
Fog(); Fog_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -154,7 +153,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ReadFrame(); ReadFrame_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -167,7 +166,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
TestDestAlpha(); TestDestAlpha_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -180,7 +179,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
WriteMask(); WriteMask_AVX();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -194,7 +193,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
WriteZBuf(); WriteZBuf_AVX();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -208,7 +207,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
AlphaBlend(); AlphaBlend_AVX();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -220,7 +219,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
WriteFrame(); WriteFrame_AVX();
L("step"); L("step");
@ -232,7 +231,7 @@ L("step");
jle("exit", T_NEAR); jle("exit", T_NEAR);
Step(); Step_AVX();
jmp("loop", T_NEAR); jmp("loop", T_NEAR);
} }
@ -249,7 +248,7 @@ L("exit");
ret(8); ret(8);
} }
void GSDrawScanlineCodeGenerator::Init() void GSDrawScanlineCodeGenerator::Init_AVX()
{ {
if(!m_sel.notest) if(!m_sel.notest)
{ {
@ -455,7 +454,7 @@ void GSDrawScanlineCodeGenerator::Init()
} }
} }
void GSDrawScanlineCodeGenerator::Step() void GSDrawScanlineCodeGenerator::Step_AVX()
{ {
// steps -= 4; // steps -= 4;
@ -596,7 +595,7 @@ void GSDrawScanlineCodeGenerator::Step()
} }
} }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
{ {
if(!m_sel.zb) if(!m_sel.zb)
{ {
@ -644,7 +643,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest) if(m_sel.ztest)
{ {
ReadPixel(xmm1, ebp); ReadPixel_AVX(xmm1, ebp);
if(m_sel.zwrite && m_sel.zpsm < 2) if(m_sel.zwrite && m_sel.zpsm < 2)
{ {
@ -694,7 +693,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTexture() void GSDrawScanlineCodeGenerator::SampleTexture_AVX()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -740,7 +739,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
if(m_sel.ltf) if(m_sel.ltf)
{ {
// GSVector4i uf = u.xxzzlh().srl16(1); // GSVector4i uf = u.xxzzlh().srl16(12);
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
@ -749,7 +748,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
if(m_sel.prim != GS_SPRITE_CLASS) if(m_sel.prim != GS_SPRITE_CLASS)
{ {
// GSVector4i vf = v.xxzzlh().srl16(1); // GSVector4i vf = v.xxzzlh().srl16(12);
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
@ -775,13 +774,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
Wrap(xmm2, xmm3); Wrap_AVX(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
Wrap(xmm2); Wrap_AVX(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -843,7 +842,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_AVX(4, 0);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -857,16 +856,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// GSVector4i rb00 = c00 & mask; // GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask; // GSVector4i ga00 = (c00 >> 8) & mask;
vpsllw(xmm2, xmm6, 8); split16_2x8(xmm2, xmm6, xmm6);
vpsrlw(xmm2, 8);
vpsrlw(xmm6, 8);
// GSVector4i rb01 = c01 & mask; // GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask; // GSVector4i ga01 = (c01 >> 8) & mask;
vpsllw(xmm3, xmm4, 8); split16_2x8(xmm3, xmm4, xmm4);
vpsrlw(xmm3, 8);
vpsrlw(xmm4, 8);
// xmm0 = uf // xmm0 = uf
// xmm2 = rb00 // xmm2 = rb00
@ -894,16 +889,12 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// GSVector4i rb10 = c10 & mask; // GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask; // GSVector4i ga10 = (c10 >> 8) & mask;
vpsrlw(xmm2, xmm1, 8); split16_2x8(xmm1, xmm2, xmm1);
vpsllw(xmm1, 8);
vpsrlw(xmm1, 8);
// GSVector4i rb11 = c11 & mask; // GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask; // GSVector4i ga11 = (c11 >> 8) & mask;
vpsrlw(xmm6, xmm5, 8); split16_2x8(xmm5, xmm6, xmm5);
vpsllw(xmm5, 8);
vpsrlw(xmm5, 8);
// xmm0 = uf // xmm0 = uf
// xmm3 = rb00 // xmm3 = rb00
@ -943,20 +934,18 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
// c[0] = c00 & mask; // c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask; // c[1] = (c00 >> 8) & mask;
vpsllw(xmm5, xmm6, 8); split16_2x8(xmm5, xmm6, xmm6);
vpsrlw(xmm5, 8);
vpsrlw(xmm6, 8);
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
{ {
// xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1017,7 +1006,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1109,7 +1098,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTextureLOD() void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -1370,13 +1359,13 @@ return;
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3); WrapLOD_AVX(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
WrapLOD(xmm2); WrapLOD_AVX(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -1438,7 +1427,7 @@ return;
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_AVX(4, 0);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -1452,16 +1441,12 @@ return;
// GSVector4i rb00 = c00 & mask; // GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask; // GSVector4i ga00 = (c00 >> 8) & mask;
vpsllw(xmm2, xmm6, 8); split16_2x8(xmm2, xmm6, xmm6);
vpsrlw(xmm2, 8);
vpsrlw(xmm6, 8);
// GSVector4i rb01 = c01 & mask; // GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask; // GSVector4i ga01 = (c01 >> 8) & mask;
vpsllw(xmm3, xmm4, 8); split16_2x8(xmm3, xmm4, xmm4);
vpsrlw(xmm3, 8);
vpsrlw(xmm4, 8);
// xmm0 = uf // xmm0 = uf
// xmm2 = rb00 // xmm2 = rb00
@ -1489,16 +1474,12 @@ return;
// GSVector4i rb10 = c10 & mask; // GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask; // GSVector4i ga10 = (c10 >> 8) & mask;
vpsrlw(xmm2, xmm1, 8); split16_2x8(xmm1, xmm2, xmm1);
vpsllw(xmm1, 8);
vpsrlw(xmm1, 8);
// GSVector4i rb11 = c11 & mask; // GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask; // GSVector4i ga11 = (c11 >> 8) & mask;
vpsrlw(xmm6, xmm5, 8); split16_2x8(xmm5, xmm6, xmm5);
vpsllw(xmm5, 8);
vpsrlw(xmm5, 8);
// xmm0 = uf // xmm0 = uf
// xmm3 = rb00 // xmm3 = rb00
@ -1538,16 +1519,14 @@ return;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
// c[0] = c00 & mask; // c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask; // c[1] = (c00 >> 8) & mask;
vpsllw(xmm5, xmm6, 8); split16_2x8(xmm5, xmm6, xmm6);
vpsrlw(xmm5, 8);
vpsrlw(xmm6, 8);
} }
if(m_sel.mmin != 1) // !round-off mode if(m_sel.mmin != 1) // !round-off mode
@ -1611,13 +1590,13 @@ return;
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3); WrapLOD_AVX(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
WrapLOD(xmm2); WrapLOD_AVX(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -1679,7 +1658,7 @@ return;
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 1); ReadTexel_AVX(4, 1);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -1693,16 +1672,12 @@ return;
// GSVector4i rb00 = c00 & mask; // GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask; // GSVector4i ga00 = (c00 >> 8) & mask;
vpsllw(xmm2, xmm6, 8); split16_2x8(xmm2, xmm6, xmm6);
vpsrlw(xmm2, 8);
vpsrlw(xmm6, 8);
// GSVector4i rb01 = c01 & mask; // GSVector4i rb01 = c01 & mask;
// GSVector4i ga01 = (c01 >> 8) & mask; // GSVector4i ga01 = (c01 >> 8) & mask;
vpsllw(xmm3, xmm4, 8); split16_2x8(xmm3, xmm4, xmm4);
vpsrlw(xmm3, 8);
vpsrlw(xmm4, 8);
// xmm0 = uf // xmm0 = uf
// xmm2 = rb00 // xmm2 = rb00
@ -1730,16 +1705,12 @@ return;
// GSVector4i rb10 = c10 & mask; // GSVector4i rb10 = c10 & mask;
// GSVector4i ga10 = (c10 >> 8) & mask; // GSVector4i ga10 = (c10 >> 8) & mask;
vpsrlw(xmm2, xmm1, 8); split16_2x8(xmm1, xmm2, xmm1);
vpsllw(xmm1, 8);
vpsrlw(xmm1, 8);
// GSVector4i rb11 = c11 & mask; // GSVector4i rb11 = c11 & mask;
// GSVector4i ga11 = (c11 >> 8) & mask; // GSVector4i ga11 = (c11 >> 8) & mask;
vpsrlw(xmm6, xmm5, 8); split16_2x8(xmm5, xmm6, xmm5);
vpsllw(xmm5, 8);
vpsrlw(xmm5, 8);
// xmm0 = uf // xmm0 = uf
// xmm3 = rb00 // xmm3 = rb00
@ -1779,16 +1750,14 @@ return;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 1); ReadTexel_AVX(1, 1);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
// c[0] = c00 & mask; // c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask; // c[1] = (c00 >> 8) & mask;
vpsllw(xmm5, xmm6, 8); split16_2x8(xmm5, xmm6, xmm6);
vpsrlw(xmm5, 8);
vpsrlw(xmm6, 8);
} }
vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]); vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
@ -1804,7 +1773,7 @@ return;
pop(ebp); pop(ebp);
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv)
{ {
// xmm5 = minuv // xmm5 = minuv
// xmm6 = maxuv // xmm6 = maxuv
@ -1865,7 +1834,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm5 = minuv // xmm5 = minuv
// xmm6 = maxuv // xmm6 = maxuv
@ -1953,7 +1922,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::AlphaTFX() void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -2101,7 +2070,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
} }
} }
void GSDrawScanlineCodeGenerator::ReadMask() void GSDrawScanlineCodeGenerator::ReadMask_AVX()
{ {
if(m_sel.fwrite) if(m_sel.fwrite)
{ {
@ -2114,7 +2083,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
} }
} }
void GSDrawScanlineCodeGenerator::TestAlpha() void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
{ {
switch(m_sel.atst) switch(m_sel.atst)
{ {
@ -2185,7 +2154,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
} }
} }
void GSDrawScanlineCodeGenerator::ColorTFX() void GSDrawScanlineCodeGenerator::ColorTFX_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2261,7 +2230,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
} }
} }
void GSDrawScanlineCodeGenerator::Fog() void GSDrawScanlineCodeGenerator::Fog_AVX()
{ {
if(!m_sel.fwrite || !m_sel.fge) if(!m_sel.fwrite || !m_sel.fge)
{ {
@ -2282,7 +2251,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(xmm6, xmm1, xmm0); mix16(xmm6, xmm1, xmm0);
} }
void GSDrawScanlineCodeGenerator::ReadFrame() void GSDrawScanlineCodeGenerator::ReadFrame_AVX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -2300,10 +2269,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return; return;
} }
ReadPixel(xmm2, ebx); ReadPixel_AVX(xmm2, ebx);
} }
void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX()
{ {
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{ {
@ -2347,7 +2316,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue(); alltrue();
} }
void GSDrawScanlineCodeGenerator::WriteMask() void GSDrawScanlineCodeGenerator::WriteMask_AVX()
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -2393,7 +2362,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx); not(edx);
} }
void GSDrawScanlineCodeGenerator::WriteZBuf() void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
{ {
if(!m_sel.zwrite) if(!m_sel.zwrite)
{ {
@ -2411,10 +2380,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
} }
void GSDrawScanlineCodeGenerator::AlphaBlend() void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2436,9 +2405,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
// c[2] = fd & mask; // c[2] = fd & mask;
// c[3] = (fd >> 8) & mask; // c[3] = (fd >> 8) & mask;
vpsllw(xmm0, xmm2, 8); split16_2x8(xmm0, xmm1, xmm2);
vpsrlw(xmm0, 8);
vpsrlw(xmm1, xmm2, 8);
break; break;
@ -2638,7 +2605,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
} }
} }
void GSDrawScanlineCodeGenerator::WriteFrame() void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2718,16 +2685,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); WritePixel_AVX(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
} }
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg32& addr)
{ {
vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]);
vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
} }
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -2738,10 +2705,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
} }
else else
{ {
WritePixel(src, addr, 0, psm); WritePixel_AVX(src, addr, 0, psm);
WritePixel(src, addr, 1, psm); WritePixel_AVX(src, addr, 1, psm);
WritePixel(src, addr, 2, psm); WritePixel_AVX(src, addr, 2, psm);
WritePixel(src, addr, 3, psm); WritePixel_AVX(src, addr, 3, psm);
} }
} }
else else
@ -2772,22 +2739,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
test(mask, 0x03); test(mask, 0x03);
je("@f"); je("@f");
WritePixel(src, addr, 0, psm); WritePixel_AVX(src, addr, 0, psm);
L("@@"); L("@@");
test(mask, 0x0c); test(mask, 0x0c);
je("@f"); je("@f");
WritePixel(src, addr, 1, psm); WritePixel_AVX(src, addr, 1, psm);
L("@@"); L("@@");
test(mask, 0x30); test(mask, 0x30);
je("@f"); je("@f");
WritePixel(src, addr, 2, psm); WritePixel_AVX(src, addr, 2, psm);
L("@@"); L("@@");
test(mask, 0xc0); test(mask, 0xc0);
je("@f"); je("@f");
WritePixel(src, addr, 3, psm); WritePixel_AVX(src, addr, 3, psm);
L("@@"); L("@@");
} }
} }
@ -2795,7 +2762,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
static const int s_offsets[] = {0, 2, 8, 10}; static const int s_offsets[] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{ {
Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
@ -2820,7 +2787,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{ {
// in // in
// xmm5 = addr00 // xmm5 = addr00
@ -2859,7 +2826,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int i = 0; i < pixels; i++) for(int i = 0; i < pixels; i++)
{ {
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
} }
} }
@ -2878,19 +2845,18 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
} }
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
const int t[] = {4, 1, 5, 2};
for(int i = 0; i < pixels; i++) for(int i = 0; i < pixels; i++)
{ {
for(uint8 j = 0; j < 4; j++) for(uint8 j = 0; j < 4; j++)
{ {
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
} }
} }
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i)
{ {
ASSERT(i < 4); ASSERT(i < 4);

File diff suppressed because it is too large Load Diff

View File

@ -26,6 +26,8 @@
#include "xbyak/xbyak.h" #include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h" #include "xbyak/xbyak_util.h"
#include "GSScanlineEnvironment.h"
template<class KEY, class VALUE> class GSFunctionMap template<class KEY, class VALUE> class GSFunctionMap
{ {
protected: protected:
@ -161,6 +163,7 @@ class GSCodeGeneratorFunctionMap : public GSFunctionMap<KEY, VALUE>
void* m_param; void* m_param;
hash_map<uint64, VALUE> m_cgmap; hash_map<uint64, VALUE> m_cgmap;
GSCodeBuffer m_cb; GSCodeBuffer m_cb;
size_t m_total_code_size;
enum {MAX_SIZE = 8192}; enum {MAX_SIZE = 8192};
@ -168,9 +171,15 @@ public:
GSCodeGeneratorFunctionMap(const char* name, void* param) GSCodeGeneratorFunctionMap(const char* name, void* param)
: m_name(name) : m_name(name)
, m_param(param) , m_param(param)
, m_total_code_size(0)
{ {
} }
~GSCodeGeneratorFunctionMap()
{
fprintf(stderr, "%s generated %zu bytes of instruction\n", m_name.c_str(), m_total_code_size);
}
VALUE GetDefaultFunction(KEY key) VALUE GetDefaultFunction(KEY key)
{ {
VALUE ret = NULL; VALUE ret = NULL;
@ -183,10 +192,19 @@ public:
} }
else else
{ {
CG* cg = new CG(m_param, key, m_cb.GetBuffer(MAX_SIZE), MAX_SIZE); void* code_ptr = m_cb.GetBuffer(MAX_SIZE);
CG* cg = new CG(m_param, key, code_ptr, MAX_SIZE);
ASSERT(cg->getSize() < MAX_SIZE); ASSERT(cg->getSize() < MAX_SIZE);
#if 0
fprintf(stderr, "%s Location:%p Size:%zu Key:%llx\n", m_name.c_str(), code_ptr, cg->getSize(), (uint64)key);
GSScanlineSelector sel(key);
sel.Print();
#endif
m_total_code_size += cg->getSize();
m_cb.ReleaseBuffer(cg->getSize()); m_cb.ReleaseBuffer(cg->getSize());
ret = (VALUE)cg->getCode(); ret = (VALUE)cg->getCode();

View File

@ -173,7 +173,7 @@ void GSDumpRaw::Read(void* ptr, size_t size) {
} else { } else {
size_t ret = fread(ptr, 1, size, m_fp); size_t ret = fread(ptr, 1, size, m_fp);
if (ret != size) { if (ret != size) {
fprintf(stderr, "GSDumpRaw:: Read error (%d/%d)\n", ret, size); fprintf(stderr, "GSDumpRaw:: Read error (%zu/%zu)\n", ret, size);
throw "BAD"; // Just exit the program throw "BAD"; // Just exit the program
} }
} }

View File

@ -69,6 +69,8 @@ union GSScanlineSelector
uint32 mmin:2; // 53 uint32 mmin:2; // 53
uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program
}; };
struct struct
@ -76,6 +78,7 @@ union GSScanlineSelector
uint32 _pad1:22; uint32 _pad1:22;
uint32 ababcd:8; uint32 ababcd:8;
uint32 _pad2:2; uint32 _pad2:2;
uint32 fb:2; uint32 fb:2;
uint32 _pad3:1; uint32 _pad3:1;
uint32 zb:2; uint32 zb:2;
@ -89,6 +92,9 @@ union GSScanlineSelector
uint64 key; uint64 key;
GSScanlineSelector() = default;
GSScanlineSelector(uint64 k) : key(k) {}
operator uint32() const {return lo;} operator uint32() const {return lo;}
operator uint64() const {return key;} operator uint64() const {return key;}
@ -103,6 +109,18 @@ union GSScanlineSelector
&& date == 0 && date == 0
&& fge == 0; && fge == 0;
} }
void Print() const
{
fprintf(stderr, "fpsm:%d zpsm:%d ztst:%d ztest:%d atst:%d afail:%d iip:%d rfb:%d fb:%d zb:%d zw:%d "
"tfx:%d tcc:%d fst:%d ltf:%d tlu:%d wms:%d wmt:%d mmin:%d lcm:%d tw:%d "
"fba:%d cclamp:%d date:%d datm:%d "
"prim:%d abe:%d %d%d%d%d fge:%d dthe:%d notest:%d\n",
fpsm, zpsm, ztst, ztest, atst, afail, iip, rfb, fb, zb, zwrite,
tfx, tcc, fst, ltf, tlu, wms, wmt, mmin, lcm, tw,
fba, colclamp, date, datm,
prim, abe, aba, abb, abc, abd , fge, dthe, notest);
}
}; };
struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer struct alignas(32) GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer

View File

@ -22,6 +22,8 @@
#include "stdafx.h" #include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.h"
using namespace Xbyak;
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
GSVector8 GSSetupPrimCodeGenerator::m_shift[9]; GSVector8 GSSetupPrimCodeGenerator::m_shift[9];
#else #else
@ -75,3 +77,14 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void
Generate(); Generate();
} }
#if _M_SSE >= 0x501
#else
void GSSetupPrimCodeGenerator::Generate()
{
if(g_cpu.has(util::Cpu::tAVX))
Generate_AVX();
else
Generate_SSE();
}
#endif

View File

@ -23,6 +23,7 @@
#include "GSScanlineEnvironment.h" #include "GSScanlineEnvironment.h"
#include "GSFunctionMap.h" #include "GSFunctionMap.h"
#include "GSUtil.h"
class GSSetupPrimCodeGenerator : public GSCodeGenerator class GSSetupPrimCodeGenerator : public GSCodeGenerator
{ {
@ -35,9 +36,21 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
void Generate(); void Generate();
#if _M_SSE < 0x501
void Generate_SSE();
void Depth_SSE();
void Texture_SSE();
void Color_SSE();
void Generate_AVX();
void Depth_AVX();
void Texture_AVX();
void Color_AVX();
#else
void Depth(); void Depth();
void Texture(); void Texture();
void Color(); void Color();
#endif
public: public:
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);

View File

@ -23,44 +23,48 @@
#include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak; using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate_AVX()
{ {
#ifdef _WIN64
sub(rsp, 8 + 2 * 16); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
#endif
mov(r8, (size_t)&m_local); mov(t0, (size_t)&m_local);
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{ {
mov(rax, (size_t)&m_shift[0]); mov(rax, (size_t)&m_shift[0]);
for(int i = 0; i < 5; i++) for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{ {
vmovaps(Xmm(3 + i), ptr[rax + i * 16]); vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
} }
} }
Depth(); Depth_AVX();
Texture(); Texture_AVX();
Color(); Color_AVX();
#ifdef _WIN64
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16); add(rsp, 8 + 2 * 16);
#endif
ret(); ret();
} }
void GSSetupPrimCodeGenerator::Depth() void GSSetupPrimCodeGenerator::Depth_AVX()
{ {
if(!m_en.z && !m_en.f) if(!m_en.z && !m_en.f)
{ {
@ -71,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth()
{ {
// GSVector4 p = dscan.p; // GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
if(m_en.f) if(m_en.f)
{ {
@ -85,9 +89,9 @@ void GSSetupPrimCodeGenerator::Depth()
vcvttps2dq(xmm2, xmm2); vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
@ -97,7 +101,7 @@ void GSSetupPrimCodeGenerator::Depth()
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[r8 + variableOffset], xmm2); vmovdqa(ptr[t0 + variableOffset], xmm2);
} }
} }
@ -110,24 +114,28 @@ void GSSetupPrimCodeGenerator::Depth()
// m_local.d4.z = dz * 4.0f; // m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3); vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// m_local.d[i].z = dz * m_shift[i]; // m_local.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i)); vmulps(xmm1, xmm0, Xmm(4 + i));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[r8 + variableOffset], xmm1); vmovdqa(ptr[t0 + variableOffset], xmm1);
} }
} }
} }
else else
{ {
// GSVector4 p = vertices[0].p; // GSVector4 p = vertex[index[1]].p;
vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); mov(eax, ptr[a1 + sizeof(uint32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
if(m_en.f) if(m_en.f)
{ {
@ -136,46 +144,21 @@ void GSSetupPrimCodeGenerator::Depth()
vcvttps2dq(xmm1, xmm0); vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
} }
if(m_en.z) if(m_en.z)
{ {
// GSVector4 z = p.zzzz(); // uint32 z is bypassed in t.w
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
if(m_sel.zoverflow) vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
mov(r9, (size_t)&GSVector4::m_half);
vbroadcastss(xmm1, ptr[r9]);
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
} }
} }
} }
void GSSetupPrimCodeGenerator::Texture() void GSSetupPrimCodeGenerator::Texture_AVX()
{ {
if(!m_en.t) if(!m_en.t)
{ {
@ -184,7 +167,7 @@ void GSSetupPrimCodeGenerator::Texture()
// GSVector4 t = dscan.t; // GSVector4 t = dscan.t;
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
vmulps(xmm1, xmm0, xmm3); vmulps(xmm1, xmm0, xmm3);
@ -194,13 +177,13 @@ void GSSetupPrimCodeGenerator::Texture()
vcvttps2dq(xmm1, xmm1); vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
else else
{ {
// m_local.d4.stq = t * 4.0f; // m_local.d4.stq = t * 4.0f;
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); vmovaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -211,7 +194,7 @@ void GSSetupPrimCodeGenerator::Texture()
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// GSVector4 v = ds/dt * m_shift[i]; // GSVector4 v = ds/dt * m_shift[i];
@ -228,8 +211,8 @@ void GSSetupPrimCodeGenerator::Texture()
switch(j) switch(j)
{ {
case 0: vmovdqa(ptr[r8 + variableOffsetS], xmm2); break; case 0: vmovdqa(ptr[t0 + variableOffsetS], xmm2); break;
case 1: vmovdqa(ptr[r8 + variableOffsetT], xmm2); break; case 1: vmovdqa(ptr[t0 + variableOffsetT], xmm2); break;
} }
} }
else else
@ -242,16 +225,16 @@ void GSSetupPrimCodeGenerator::Texture()
switch(j) switch(j)
{ {
case 0: vmovaps(ptr[r8 + variableOffsetS], xmm2); break; case 0: vmovaps(ptr[t0 + variableOffsetS], xmm2); break;
case 1: vmovaps(ptr[r8 + variableOffsetT], xmm2); break; case 1: vmovaps(ptr[t0 + variableOffsetT], xmm2); break;
case 2: vmovaps(ptr[r8 + variableOffsetQ], xmm2); break; case 2: vmovaps(ptr[t0 + variableOffsetQ], xmm2); break;
} }
} }
} }
} }
} }
void GSSetupPrimCodeGenerator::Color() void GSSetupPrimCodeGenerator::Color_AVX()
{ {
if(!m_en.c) if(!m_en.c)
{ {
@ -262,7 +245,7 @@ void GSSetupPrimCodeGenerator::Color()
{ {
// GSVector4 c = dscan.c; // GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -270,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color()
vcvttps2dq(xmm1, xmm1); vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1); vpackssdw(xmm1, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
// xmm3 is not needed anymore // xmm3 is not needed anymore
@ -299,12 +282,12 @@ void GSSetupPrimCodeGenerator::Color()
vpunpcklwd(xmm0, xmm1); vpunpcklwd(xmm0, xmm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[r8 + variableOffset], xmm0); vmovdqa(ptr[t0 + variableOffset], xmm0);
} }
// GSVector4 c = dscan.c; // GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy(); // GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww(); // GSVector4 da = c.wwww();
@ -312,7 +295,7 @@ void GSSetupPrimCodeGenerator::Color()
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
@ -331,14 +314,31 @@ void GSSetupPrimCodeGenerator::Color()
vpunpcklwd(xmm0, xmm1); vpunpcklwd(xmm0, xmm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[r8 + variableOffset], xmm0); vmovdqa(ptr[t0 + variableOffset], xmm0);
} }
} }
else else
{ {
// GSVector4i c = GSVector4i(vertices[0].c); // GSVector4i c = GSVector4i(vertex[index[last].c);
vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); int last = 0;
switch(m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(eax, ptr[a1 + sizeof(uint32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
}
vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy()); // c = c.upl16(c.zwxy());
@ -358,8 +358,8 @@ void GSSetupPrimCodeGenerator::Color()
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
} }
} }

View File

@ -23,42 +23,48 @@
#include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak; using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate_SSE()
{ {
#ifdef _WIN64
sub(rsp, 8 + 2 * 16); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
#endif
mov(r8, (size_t)&m_local); mov(t0, (size_t)&m_local);
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{ {
for(int i = 0; i < 5; i++) mov(rax, (size_t)&m_shift[0]);
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{ {
movaps(Xmm(3 + i), ptr[rax + i * 16]); movaps(Xmm(3 + i), ptr[rax + i * 16]);
} }
} }
Depth(); Depth_SSE();
Texture(); Texture_SSE();
Color(); Color_SSE();
#ifdef _WIN64
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16); add(rsp, 8 + 2 * 16);
#endif
ret(); ret();
} }
void GSSetupPrimCodeGenerator::Depth() void GSSetupPrimCodeGenerator::Depth_SSE()
{ {
if(!m_en.z && !m_en.f) if(!m_en.z && !m_en.f)
{ {
@ -69,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth()
{ {
// GSVector4 p = dscan.p; // GSVector4 p = dscan.p;
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
if(m_en.f) if(m_en.f)
{ {
@ -85,9 +91,9 @@ void GSSetupPrimCodeGenerator::Depth()
cvttps2dq(xmm2, xmm2); cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
@ -98,7 +104,7 @@ void GSSetupPrimCodeGenerator::Depth()
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[r8 + variableOffset], xmm2); movdqa(ptr[t0 + variableOffset], xmm2);
} }
} }
@ -112,9 +118,9 @@ void GSSetupPrimCodeGenerator::Depth()
movaps(xmm1, xmm0); movaps(xmm1, xmm0);
mulps(xmm1, xmm3); mulps(xmm1, xmm3);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// m_local.d[i].z = dz * m_shift[i]; // m_local.d[i].z = dz * m_shift[i];
@ -122,15 +128,19 @@ void GSSetupPrimCodeGenerator::Depth()
mulps(xmm1, Xmm(4 + i)); mulps(xmm1, Xmm(4 + i));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[r8 + variableOffset], xmm1); movdqa(ptr[t0 + variableOffset], xmm1);
} }
} }
} }
else else
{ {
// GSVector4 p = vertices[0].p; // GSVector4 p = vertex[index[1]].p;
movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); mov(eax, ptr[a1 + sizeof(uint32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
if(m_en.f) if(m_en.f)
{ {
@ -139,47 +149,21 @@ void GSSetupPrimCodeGenerator::Depth()
cvttps2dq(xmm1, xmm0); cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
} }
if(m_en.z) if(m_en.z)
{ {
// GSVector4 z = p.zzzz(); // uint32 z is bypassed in t.w
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
if(m_sel.zoverflow) vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
mov(r9, (size_t)&GSVector4::m_half);
movss(xmm1, ptr[r9]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
} }
} }
} }
void GSSetupPrimCodeGenerator::Texture() void GSSetupPrimCodeGenerator::Texture_SSE()
{ {
if(!m_en.t) if(!m_en.t)
{ {
@ -188,7 +172,7 @@ void GSSetupPrimCodeGenerator::Texture()
// GSVector4 t = dscan.t; // GSVector4 t = dscan.t;
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
movaps(xmm1, xmm0); movaps(xmm1, xmm0);
mulps(xmm1, xmm3); mulps(xmm1, xmm3);
@ -199,13 +183,13 @@ void GSSetupPrimCodeGenerator::Texture()
cvttps2dq(xmm1, xmm1); cvttps2dq(xmm1, xmm1);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
else else
{ {
// m_local.d4.stq = t * 4.0f; // m_local.d4.stq = t * 4.0f;
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -217,7 +201,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(xmm1, xmm0); movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// GSVector4 v = ds/dt * m_shift[i]; // GSVector4 v = ds/dt * m_shift[i];
@ -235,8 +219,8 @@ void GSSetupPrimCodeGenerator::Texture()
switch(j) switch(j)
{ {
case 0: movdqa(ptr[r8 + variableOffsetS], xmm2); break; case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break;
case 1: movdqa(ptr[r8 + variableOffsetT], xmm2); break; case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break;
} }
} }
else else
@ -249,16 +233,16 @@ void GSSetupPrimCodeGenerator::Texture()
switch(j) switch(j)
{ {
case 0: movaps(ptr[r8 + variableOffsetS], xmm2); break; case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break;
case 1: movaps(ptr[r8 + variableOffsetT], xmm2); break; case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break;
case 2: movaps(ptr[r8 + variableOffsetQ], xmm2); break; case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break;
} }
} }
} }
} }
} }
void GSSetupPrimCodeGenerator::Color() void GSSetupPrimCodeGenerator::Color_SSE()
{ {
if(!m_en.c) if(!m_en.c)
{ {
@ -269,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color()
{ {
// GSVector4 c = dscan.c; // GSVector4 c = dscan.c;
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
movaps(xmm1, xmm0); movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -279,7 +263,7 @@ void GSSetupPrimCodeGenerator::Color()
cvttps2dq(xmm2, xmm2); cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2); packssdw(xmm2, xmm2);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
// xmm3 is not needed anymore // xmm3 is not needed anymore
@ -289,7 +273,7 @@ void GSSetupPrimCodeGenerator::Color()
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
@ -310,12 +294,12 @@ void GSSetupPrimCodeGenerator::Color()
punpcklwd(xmm2, xmm3); punpcklwd(xmm2, xmm3);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[r8 + variableOffset], xmm2); movdqa(ptr[t0 + variableOffset], xmm2);
} }
// GSVector4 c = dscan.c; // GSVector4 c = dscan.c;
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
movaps(xmm1, xmm0); movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy(); // GSVector4 dg = c.yyyy();
@ -324,7 +308,7 @@ void GSSetupPrimCodeGenerator::Color()
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
@ -345,14 +329,31 @@ void GSSetupPrimCodeGenerator::Color()
punpcklwd(xmm2, xmm3); punpcklwd(xmm2, xmm3);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[r8 + variableOffset], xmm2); movdqa(ptr[t0 + variableOffset], xmm2);
} }
} }
else else
{ {
// GSVector4i c = GSVector4i(vertices[0].c); // GSVector4i c = GSVector4i(vertex[index[last]].c);
cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); int last = 0;
switch(m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(eax, ptr[a1 + sizeof(uint32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
}
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy()); // c = c.upl16(c.zwxy());
@ -372,8 +373,8 @@ void GSSetupPrimCodeGenerator::Color()
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
} }
} }

View File

@ -23,7 +23,7 @@
#include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak; using namespace Xbyak;
@ -32,7 +32,7 @@ static const int _vertex = _args + 4;
static const int _index = _args + 8; static const int _index = _args + 8;
static const int _dscan = _args + 12; static const int _dscan = _args + 12;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate_AVX()
{ {
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{ {
@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate()
} }
} }
Depth(); Depth_AVX();
Texture(); Texture_AVX();
Color(); Color_AVX();
ret(); ret();
} }
void GSSetupPrimCodeGenerator::Depth() void GSSetupPrimCodeGenerator::Depth_AVX()
{ {
if(!m_en.z && !m_en.f) if(!m_en.z && !m_en.f)
{ {
@ -144,7 +144,7 @@ void GSSetupPrimCodeGenerator::Depth()
} }
} }
void GSSetupPrimCodeGenerator::Texture() void GSSetupPrimCodeGenerator::Texture_AVX()
{ {
if(!m_en.t) if(!m_en.t)
{ {
@ -213,7 +213,7 @@ void GSSetupPrimCodeGenerator::Texture()
} }
} }
void GSSetupPrimCodeGenerator::Color() void GSSetupPrimCodeGenerator::Color_AVX()
{ {
if(!m_en.c) if(!m_en.c)
{ {
@ -339,4 +339,4 @@ void GSSetupPrimCodeGenerator::Color()
} }
} }
#endif #endif

View File

@ -23,7 +23,7 @@
#include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak; using namespace Xbyak;
@ -32,7 +32,7 @@ static const int _vertex = _args + 4;
static const int _index = _args + 8; static const int _index = _args + 8;
static const int _dscan = _args + 12; static const int _dscan = _args + 12;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate_SSE()
{ {
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{ {
@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate()
} }
} }
Depth(); Depth_SSE();
Texture(); Texture_SSE();
Color(); Color_SSE();
ret(); ret();
} }
void GSSetupPrimCodeGenerator::Depth() void GSSetupPrimCodeGenerator::Depth_SSE()
{ {
if(!m_en.z && !m_en.f) if(!m_en.z && !m_en.f)
{ {
@ -149,7 +149,7 @@ void GSSetupPrimCodeGenerator::Depth()
} }
} }
void GSSetupPrimCodeGenerator::Texture() void GSSetupPrimCodeGenerator::Texture_SSE()
{ {
if(!m_en.t) if(!m_en.t)
{ {
@ -221,7 +221,7 @@ void GSSetupPrimCodeGenerator::Texture()
} }
} }
void GSSetupPrimCodeGenerator::Color() void GSSetupPrimCodeGenerator::Color_SSE()
{ {
if(!m_en.c) if(!m_en.c)
{ {
@ -354,4 +354,4 @@ void GSSetupPrimCodeGenerator::Color()
} }
} }
#endif #endif

View File

@ -65,8 +65,8 @@ GSState::GSState()
m_dump_root = ""; m_dump_root = "";
#if defined(__unix__) #if defined(__unix__)
if (s_dump) { if (s_dump) {
GSmkdir("/tmp/GS_HW_dump"); GSmkdir(root_hw.c_str());
GSmkdir("/tmp/GS_SW_dump"); GSmkdir(root_sw.c_str());
} }
#endif #endif

View File

@ -20,9 +20,7 @@
*/ */
#include "stdafx.h" #include "stdafx.h"
#include "GS.h"
#include "GSUtil.h" #include "GSUtil.h"
#include "xbyak/xbyak_util.h"
#ifdef _WIN32 #ifdef _WIN32
#include "GSDeviceDX.h" #include "GSDeviceDX.h"
@ -33,6 +31,8 @@
#define SVN_MODS 0 #define SVN_MODS 0
#endif #endif
Xbyak::util::Cpu g_cpu;
const char* GSUtil::GetLibName() const char* GSUtil::GetLibName()
{ {
// The following ifdef mess is courtesy of "static string str;" // The following ifdef mess is courtesy of "static string str;"
@ -203,38 +203,41 @@ bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm)
bool GSUtil::CheckSSE() bool GSUtil::CheckSSE()
{ {
Xbyak::util::Cpu cpu; bool status = true;
Xbyak::util::Cpu::Type type;
const char* instruction_set = "";
#if _M_SSE >= 0x501 struct ISA {
type = Xbyak::util::Cpu::tAVX2; Xbyak::util::Cpu::Type type;
instruction_set = "AVX2"; const char* name;
#elif _M_SSE >= 0x500 };
type = Xbyak::util::Cpu::tAVX;
instruction_set = "AVX";
#elif _M_SSE >= 0x402
type = Xbyak::util::Cpu::tSSE42;
instruction_set = "SSE4.2";
#elif _M_SSE >= 0x401
type = Xbyak::util::Cpu::tSSE41;
instruction_set = "SSE4.1";
#elif _M_SSE >= 0x301
type = Xbyak::util::Cpu::tSSSE3;
instruction_set = "SSSE3";
#elif _M_SSE >= 0x200
type = Xbyak::util::Cpu::tSSE2;
instruction_set = "SSE2";
#endif
if(!cpu.has(type)) ISA checks[] = {
{ {Xbyak::util::Cpu::tSSE2, "SSE2"},
fprintf(stderr, "This CPU does not support %s\n", instruction_set); #if _M_SSE >= 0x301
{Xbyak::util::Cpu::tSSSE3, "SSSE3"},
#endif
#if _M_SSE >= 0x401
{Xbyak::util::Cpu::tSSE41, "SSE41"},
#endif
#if _M_SSE >= 0x402
{Xbyak::util::Cpu::tSSE42, "SSE42"},
#endif
#if _M_SSE >= 0x500
{Xbyak::util::Cpu::tAVX, "AVX1"},
#endif
#if _M_SSE >= 0x501
{Xbyak::util::Cpu::tAVX2, "AVX2"},
#endif
};
return false; for (size_t i = 0; i < countof(checks); i++) {
if(!g_cpu.has(checks[i].type)) {
fprintf(stderr, "This CPU does not support %s\n", checks[i].name);
status = false;
}
} }
return true; return status;
} }
#define OCL_PROGRAM_VERSION 3 #define OCL_PROGRAM_VERSION 3

View File

@ -22,6 +22,7 @@
#pragma once #pragma once
#include "GS.h" #include "GS.h"
#include "xbyak/xbyak_util.h"
struct OCLDeviceDesc struct OCLDeviceDesc
{ {
@ -71,3 +72,5 @@ void GSmkdir(const char* dir);
#endif #endif
const char* psm_str(int psm); const char* psm_str(int psm);
extern Xbyak::util::Cpu g_cpu;

View File

@ -146,26 +146,11 @@
<ClCompile Include="GSDrawingContext.cpp" /> <ClCompile Include="GSDrawingContext.cpp" />
<ClCompile Include="GSDrawScanline.cpp" /> <ClCompile Include="GSDrawScanline.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp"> <ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp" />
<ExcludedFromBuild Condition="'$(Platform)'=='Win32'">true</ExcludedFromBuild> <ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp" />
<ExcludedFromBuild Condition="'$(Configuration)'!='Release AVX' And '$(Configuration)'!='Debug AVX'">true</ExcludedFromBuild> <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp" />
</ClCompile> <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp"> <ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp" />
<ExcludedFromBuild Condition="'$(Platform)'=='Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="$(Configuration.Contains(AVX))">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)'!='Release AVX' And '$(Configuration)'!='Debug AVX'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)'!='Release AVX2' And '$(Configuration)'!='Debug AVX2'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="$(Configuration.Contains(AVX))">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDump.cpp" /> <ClCompile Include="GSDump.cpp" />
<ClCompile Include="GSdx.cpp" /> <ClCompile Include="GSdx.cpp" />
<ClCompile Include="GSFunctionMap.cpp" /> <ClCompile Include="GSFunctionMap.cpp" />
@ -187,26 +172,11 @@
<ClCompile Include="GSSetting.cpp" /> <ClCompile Include="GSSetting.cpp" />
<ClCompile Include="GSSettingsDlg.cpp" /> <ClCompile Include="GSSettingsDlg.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp"> <ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp" />
<ExcludedFromBuild Condition="'$(Platform)'=='Win32'">true</ExcludedFromBuild> <ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp" />
<ExcludedFromBuild Condition="'$(Configuration)'!='Release AVX' And '$(Configuration)'!='Debug AVX'">true</ExcludedFromBuild> <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp" />
</ClCompile> <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp"> <ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp" />
<ExcludedFromBuild Condition="'$(Platform)'=='Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="$(Configuration.Contains(AVX))">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)'!='Release AVX' And '$(Configuration)'!='Debug AVX'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)'!='Release AVX2' And '$(Configuration)'!='Debug AVX2'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="$(Configuration.Contains(AVX))">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSShaderOGL.cpp" /> <ClCompile Include="GSShaderOGL.cpp" />
<ClCompile Include="GSState.cpp" /> <ClCompile Include="GSState.cpp" />
<ClCompile Include="GSTables.cpp" /> <ClCompile Include="GSTables.cpp" />

View File

@ -138,7 +138,7 @@ void* fifo_alloc(size_t size, size_t repeat)
if (next != base) if (next != base)
fprintf(stderr, "Fail to mmap contiguous segment\n"); fprintf(stderr, "Fail to mmap contiguous segment\n");
else else
fprintf(stderr, "MMAP next %x\n", (uintptr_t)base); fprintf(stderr, "MMAP next %p\n", base);
} }
return fifo; return fifo;

View File

@ -266,28 +266,46 @@ using namespace stdext;
#define ASSERT assert #define ASSERT assert
#ifdef __x86_64__ #ifdef __x86_64__
#define _M_AMD64 #define _M_AMD64
#endif
#ifdef _M_AMD64
// Yeah, let's use MIPS naming ;)
#ifdef _WIN64
#define a0 rcx
#define a1 rdx
#define a2 r8
#define a3 r9
#define t0 rdi
#define t1 rsi
#else
#define a0 rdi
#define a1 rsi
#define a2 rdx
#define a3 rcx
#define t0 r8
#define t1 r9
#endif
#endif #endif
// sse // sse
#if defined(__GNUC__) && !defined(__x86_64__) #if defined(__GNUC__)
// Convert gcc SSE defines into GSdx (Windows) defines // Convert gcc SSE defines into GSdx (Windows) defines
#if defined(__AVX2__) #if defined(__AVX2__)
#define _M_SSE 0x501 #if defined(__x86_64__)
#define _M_SSE 0x500 // TODO
#else
#define _M_SSE 0x501
#endif
#elif defined(__AVX__) #elif defined(__AVX__)
#define _M_SSE 0x500 #define _M_SSE 0x500
#elif defined(__SSE4_2__)
#define _M_SSE 0x402
#elif defined(__SSE4_1__) #elif defined(__SSE4_1__)
#define _M_SSE 0x401 #define _M_SSE 0x401
#elif defined(__SSSE3__) #elif defined(__SSSE3__)
#define _M_SSE 0x301 #define _M_SSE 0x301
#elif defined(__SSE2__) #elif defined(__SSE2__)
#define _M_SSE 0x200 #define _M_SSE 0x200
#elif defined(__SSE__)
#define _M_SSE 0x100
#endif #endif
#endif #endif
@ -411,11 +429,11 @@ extern void vmfree(void* ptr, size_t size);
extern void* fifo_alloc(size_t size, size_t repeat); extern void* fifo_alloc(size_t size, size_t repeat);
extern void fifo_free(void* ptr, size_t size, size_t repeat); extern void fifo_free(void* ptr, size_t size, size_t repeat);
#ifdef _WIN32 #ifdef ENABLE_VTUNE
#ifdef ENABLE_VTUNE #include "jitprofiling.h"
#include <JITProfiling.h> #ifdef _WIN32
#pragma comment(lib, "jitprofiling.lib") #pragma comment(lib, "jitprofiling.lib")
@ -472,6 +490,11 @@ struct GLAutoPop {
const std::string root_sw("c:\\temp1\\_"); const std::string root_sw("c:\\temp1\\_");
const std::string root_hw("c:\\temp2\\_"); const std::string root_hw("c:\\temp2\\_");
#else #else
const std::string root_sw("/tmp/GS_SW_dump/"); #ifdef _M_AMD64
const std::string root_hw("/tmp/GS_HW_dump/"); const std::string root_sw("/tmp/GS_SW_dump64/");
const std::string root_hw("/tmp/GS_HW_dump64/");
#else
const std::string root_sw("/tmp/GS_SW_dump32/");
const std::string root_hw("/tmp/GS_HW_dump32/");
#endif
#endif #endif

View File

@ -370,6 +370,7 @@ void cwde() { db(0x98); }
void lahf() { db(0x9F); } void lahf() { db(0x9F); }
void lock() { db(0xF0); } void lock() { db(0xF0); }
void nop() { db(0x90); } void nop() { db(0x90); }
void int3() { db(0xCC); }
void sahf() { db(0x9E); } void sahf() { db(0x9E); }
void stc() { db(0xF9); } void stc() { db(0xF9); }
void std() { db(0xFD); } void std() { db(0xFD); }