gsdx sw x64: add AVX2 implementation for VS

The FS code was copied from the 32-bit version (it requires a massive update)
This commit is contained in:
Gregory Hainaut 2016-11-22 19:00:06 +01:00
parent 15220c386a
commit 608bb5ccb2
9 changed files with 3469 additions and 33 deletions

View File

@ -65,11 +65,12 @@ set(GSdxSources
GSDrawingContext.cpp GSDrawingContext.cpp
GSDrawScanline.cpp GSDrawScanline.cpp
GSDrawScanlineCodeGenerator.cpp GSDrawScanlineCodeGenerator.cpp
GSDrawScanlineCodeGenerator.x64.cpp
GSDrawScanlineCodeGenerator.x64.avx.cpp
GSDrawScanlineCodeGenerator.x64.avx2.cpp
GSDrawScanlineCodeGenerator.x86.cpp
GSDrawScanlineCodeGenerator.x86.avx.cpp GSDrawScanlineCodeGenerator.x86.avx.cpp
GSDrawScanlineCodeGenerator.x86.avx2.cpp GSDrawScanlineCodeGenerator.x86.avx2.cpp
GSDrawScanlineCodeGenerator.x64.cpp
GSDrawScanlineCodeGenerator.x86.cpp
GSDrawScanlineCodeGenerator.x64.avx.cpp
GSDump.cpp GSDump.cpp
GSFunctionMap.cpp GSFunctionMap.cpp
GSHwHack.cpp GSHwHack.cpp
@ -87,11 +88,12 @@ set(GSdxSources
GSRendererSW.cpp GSRendererSW.cpp
GSSetting.cpp GSSetting.cpp
GSSetupPrimCodeGenerator.cpp GSSetupPrimCodeGenerator.cpp
GSSetupPrimCodeGenerator.x64.cpp
GSSetupPrimCodeGenerator.x64.avx.cpp
GSSetupPrimCodeGenerator.x64.avx2.cpp
GSSetupPrimCodeGenerator.x86.cpp
GSSetupPrimCodeGenerator.x86.avx.cpp GSSetupPrimCodeGenerator.x86.avx.cpp
GSSetupPrimCodeGenerator.x86.avx2.cpp GSSetupPrimCodeGenerator.x86.avx2.cpp
GSSetupPrimCodeGenerator.x64.avx.cpp
GSSetupPrimCodeGenerator.x86.cpp
GSSetupPrimCodeGenerator.x64.cpp
GSShaderOGL.cpp GSShaderOGL.cpp
GSState.cpp GSState.cpp
GSTables.cpp GSTables.cpp

File diff suppressed because it is too large Load Diff

View File

@ -31,7 +31,7 @@ static const int _v = _args + 8;
void GSDrawScanlineCodeGenerator::Generate() void GSDrawScanlineCodeGenerator::Generate()
{ {
//ret(8); //ret(8);
push(ebx); push(ebx);
push(esi); push(esi);

View File

@ -77,19 +77,15 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0; m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
try { try {
Generate();
} catch (std::exception& e) {
fprintf(stderr, "ERR:GSSetupPrimCodeGenerator %s\n", e.what());
}
}
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
Generate_AVX2();
#else #else
void GSSetupPrimCodeGenerator::Generate()
{
if(g_cpu.has(util::Cpu::tAVX)) if(g_cpu.has(util::Cpu::tAVX))
Generate_AVX(); Generate_AVX();
else else
Generate_SSE(); Generate_SSE();
}
#endif #endif
} catch (std::exception& e) {
fprintf(stderr, "ERR:GSSetupPrimCodeGenerator %s\n", e.what());
}
}

View File

@ -35,8 +35,6 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
struct {uint32 z:1, f:1, t:1, c:1;} m_en; struct {uint32 z:1, f:1, t:1, c:1;} m_en;
void Generate();
#if _M_SSE < 0x501 #if _M_SSE < 0x501
void Generate_SSE(); void Generate_SSE();
void Depth_SSE(); void Depth_SSE();
@ -48,9 +46,10 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
void Texture_AVX(); void Texture_AVX();
void Color_AVX(); void Color_AVX();
#else #else
void Depth(); void Generate_AVX2();
void Texture(); void Depth_AVX2();
void Color(); void Texture_AVX2();
void Color_AVX2();
#endif #endif
public: public:

View File

@ -0,0 +1,376 @@
/*
* Copyright (C) 2016-2016 Gregory
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x501 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
#define _m_shift(i) (Ymm(7+i))
// FIXME windows ?
#define _vertex rcx
// Emits the complete AVX2 / x64 primitive-setup routine.
//
// Layout of the generated code:
//   1. (Win64 only) spill the vector registers this routine overwrites.
//   2. Load the address of m_local into t0 when rip-relative addressing is not usable.
//   3. Cache the m_shift table in ymm7..ymm15 when one of the sub-generators needs it.
//   4. Emit the depth/fog, texture and color setup code.
//   5. (Win64 only) reload the spilled registers, then return.
//
// NOTE(review): _vertex is hard-wired to rcx (see the "FIXME windows ?" next to its
// definition). Whether that collides with an argument register on Win64 depends on how
// a0..a3 are mapped, which is not visible in this file - confirm before enabling Win64.
void GSSetupPrimCodeGenerator::Generate_AVX2()
{
	// Technically we just need the delta < 2GB
	m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;

#ifdef _WIN64
	// The generated code writes ymm6 (scratch in Depth_AVX2/Color_AVX2) and ymm7..ymm15
	// (the cached m_shift table), i.e. registers 6 to 15. The Microsoft x64 calling
	// convention makes the low 128 bits of xmm6..xmm15 callee-saved, so all ten are
	// spilled as 16-byte xmm values.
	//
	// Storing the full 32-byte ymm registers into 16-byte slots (as was done before)
	// writes past the end of the reserved area and needs a 32-byte aligned address that
	// this frame does not provide.
	const int win64_first_saved = 6;
	const int win64_saved_count = 10;
	// The extra 8 bytes are kept from the original frame size; presumably they restore
	// the 16-byte stack alignment lost to the pushed return address - TODO confirm.
	const int win64_frame_size = 8 + win64_saved_count * 16;

	sub(rsp, win64_frame_size);

	for(int i = 0; i < win64_saved_count; i++)
	{
		vmovdqa(ptr[rsp + i * 16], Xmm(win64_first_saved + i));
	}
#endif

	if(!m_rip)
		mov(t0, (size_t)&m_local);

	// Same condition as before, with the implicit operator precedence spelled out.
	if(((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS) || m_en.t || (m_en.c && m_sel.iip))
	{
		mov(rax, (size_t)&m_shift[0]);

		// notest only needs shift[0] and shift[1]; otherwise all nine entries are cached.
		for(int i = 0; i < (m_sel.notest ? 2 : 9); i++)
		{
			vmovaps(_m_shift(i), ptr[rax + i * 32]);
		}
	}

	// From here on ymm7..ymm15 hold m_shift[0]..m_shift[8] (only the first two with notest).

	Depth_AVX2();

	Texture_AVX2();

	Color_AVX2();

#ifdef _WIN64
	for(int i = 0; i < win64_saved_count; i++)
	{
		vmovdqa(Xmm(win64_first_saved + i), ptr[rsp + i * 16]);
	}

	add(rsp, win64_frame_size);
#endif

	ret();
}
// Emits the code that fills the depth (z) and fog (f) members of m_local.
//
// Nothing is emitted when neither m_en.z nor m_en.f is set.
//
// Non-sprite primitives: the per-pixel step dscan.p (read through a2) is scaled by the
// cached shift table and written to m_local.d8.p and m_local.d[i].z / m_local.d[i].f.
//
// Sprites: the vertex selected by index[1] (index pointer in a1, vertex base in a0) is
// addressed through _vertex and its f / z values are copied to m_local.p. _vertex is left
// pointing at that vertex afterwards; Color_AVX2 relies on it for flat-shaded sprites.
void GSSetupPrimCodeGenerator::Depth_AVX2()
{
	if(!m_en.z && !m_en.f)
	{
		return;
	}

	if(m_sel.prim != GS_SPRITE_CLASS)
	{
		const Ymm& dscan_p = ymm6;

		// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);

		vbroadcastf128(dscan_p, ptr[a2 + offsetof(GSVertexSW, p)]);

		vmulps(ymm1, dscan_p, _m_shift(0));

		if(m_en.z)
		{
			// m_local.d8.p.z = dp8.extract32<2>();

			vextractps(_rip_local(d8.p.z), xmm1, 2);

			// GSVector8 dz = GSVector8(dscan.p).zzzz();

			vshufps(ymm2, dscan_p, dscan_p, _MM_SHUFFLE(2, 2, 2, 2));

			for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
			{
				// m_local.d[i].z = dz * shift[1 + i];

				vmulps(ymm0, ymm2, _m_shift(1 + i));

				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
				vmovaps(_rip_local_v(d[i].z, variableOffset), ymm0);
			}
		}

		if(m_en.f)
		{
			// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();

			// FIXME no truncate ? why ? vcvttps2dq ?
			// The code this was ported from used the rounding conversion (vcvtps2dq);
			// it was assumed to be a typo and the truncating form is used instead.
			//vcvtps2dq(ymm2, ymm1); // let's guess a typo
			vcvttps2dq(ymm2, ymm1);
			vpextrd(_rip_local(d8.p.f), xmm2, 3);

			// GSVector8 df = GSVector8(dscan.p).wwww();

			vshufps(ymm3, dscan_p, dscan_p, _MM_SHUFFLE(3, 3, 3, 3));

			for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
			{
				// m_local.d[i].f = GSVector8i(df * shift[1 + i]).xxzzlh();

				vmulps(ymm0, ymm3, _m_shift(1 + i));
				vcvttps2dq(ymm0, ymm0);
				vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
				vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));

				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
				vmovdqa(_rip_local_v(d[i].f, variableOffset), ymm0);
			}
		}
	}
	else
	{
		// GSVector4 p = vertex[index[1]].p;

		mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * 1]);
		shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
		add(_vertex, a0);

		if(m_en.f)
		{
			// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();

			vmovaps(xmm0, ptr[_vertex + offsetof(GSVertexSW, p)]);
			vcvttps2dq(xmm0, xmm0);
			vpextrd(_rip_local(p.f), xmm0, 3);
		}

		if(m_en.z)
		{
			// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w

			// Address through the full 64-bit _vertex register. The 32-bit code used
			// ecx here, which in 64-bit mode would drop the upper half of the pointer.
			mov(eax, ptr[_vertex + offsetof(GSVertexSW, t.w)]);
			mov(_rip_local(p.z), eax);
		}
	}
}
// Emits the code that fills the texture-coordinate members of m_local.
//
// Nothing is emitted unless m_en.t is set. The step dscan.t (read through a2) is
// multiplied by shift[0] for m_local.d8.stq, then each component is broadcast and
// multiplied by shift[1 + n] for m_local.d[n].s / .t / .q.
// With m_sel.fst the products are converted to integers (truncating) and only s and t
// are produced; otherwise s, t and q are stored as floats.
void GSSetupPrimCodeGenerator::Texture_AVX2()
{
	if(!m_en.t)
		return;

	const bool integer_coords = m_sel.fst != 0;
	const int component_count = integer_coords ? 2 : 3;
	const int step_count = m_sel.notest ? 1 : 8;

	// GSVector8 dt(dscan.t);
	vbroadcastf128(ymm0, ptr[a2 + offsetof(GSVertexSW, t)]);

	// GSVector8 dt8 = dt * shift[0];
	vmulps(ymm1, ymm0, _m_shift(0));

	if(integer_coords)
	{
		// m_local.d8.stq = GSVector4i(dt8);
		vcvttps2dq(ymm1, ymm1);
		vmovdqa(_rip_local(d8.stq), xmm1);
	}
	else
	{
		// m_local.d8.stq = dt8;
		vmovaps(_rip_local(d8.stq), xmm1);
	}

	for(int comp = 0; comp < component_count; comp++)
	{
		// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
		vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(comp, comp, comp, comp));

		for(int step = 0; step < step_count; step++)
		{
			// Offsets of d[step].s/t/q, used when m_local is addressed through t0.
			const size_t step_base = step * sizeof(GSScanlineLocalData::d[0]);
			const size_t offset_s = offsetof(GSScanlineLocalData, d[0].s) + step_base;
			const size_t offset_t = offsetof(GSScanlineLocalData, d[0].t) + step_base;
			const size_t offset_q = offsetof(GSScanlineLocalData, d[0].q) + step_base;

			// GSVector8 v = dstq * shift[1 + step];
			vmulps(ymm2, ymm1, _m_shift(1 + step));

			if(integer_coords)
			{
				// m_local.d[step].s/t = GSVector8::cast(GSVector8i(v));
				vcvttps2dq(ymm2, ymm2);

				if(comp == 0)
					vmovdqa(_rip_local_v(d[step].s, offset_s), ymm2);
				else if(comp == 1)
					vmovdqa(_rip_local_v(d[step].t, offset_t), ymm2);
			}
			else
			{
				// m_local.d[step].s/t/q = v;
				if(comp == 0)
					vmovaps(_rip_local_v(d[step].s, offset_s), ymm2);
				else if(comp == 1)
					vmovaps(_rip_local_v(d[step].t, offset_t), ymm2);
				else if(comp == 2)
					vmovaps(_rip_local_v(d[step].q, offset_q), ymm2);
			}
		}
	}
}
void GSSetupPrimCodeGenerator::Color_AVX2()
{
if(!m_en.c)
{
return;
}
if(m_sel.iip)
{
const Ymm& dscan_c = ymm6;
// GSVector8 dc(dscan.c);
vbroadcastf128(dscan_c, ptr[a2 + offsetof(GSVertexSW, c)]);
// m_local.d8.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(ymm1, dscan_c, ymm3);
vcvttps2dq(ymm1, ymm1);
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(ymm1, ymm1);
vmovq(_rip_local(d8.c), xmm1);
// GSVector8 dr = dc.xxxx();
// GSVector8 db = dc.zzzz();
vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
vmulps(ymm0, ymm2, _m_shift(1 + i));
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
vmulps(ymm1, ymm3, _m_shift(1 + i));
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(ymm0, ymm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(_rip_local_v(d[i].rb, variableOffset), ymm0);
}
// GSVector8 dg = dc.yyyy();
// GSVector8 da = dc.wwww();
vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
vmulps(ymm0, ymm2, _m_shift(1 + i));
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
vmulps(ymm1, ymm3, _m_shift(1 + i));
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(ymm0, ymm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(_rip_local_v(d[i].ga, variableOffset), ymm0);
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last].c);
int last = 0;
switch(m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * last]);
shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
add(_vertex, a0);
}
vbroadcasti128(ymm0, ptr[_vertex + offsetof(GSVertexSW, c)]);
vcvttps2dq(ymm0, ymm0);
// c = c.upl16(c.zwxy());
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(ymm0, ymm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
vpsrlw(ymm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(_rip_local(c.rb), ymm1);
vmovdqa(_rip_local(c.ga), ymm2);
}
}
#endif

View File

@ -32,7 +32,7 @@ static const int _vertex = _args + 4;
static const int _index = _args + 8; static const int _index = _args + 8;
static const int _dscan = _args + 12; static const int _dscan = _args + 12;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate_AVX2()
{ {
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{ {
@ -44,16 +44,16 @@ void GSSetupPrimCodeGenerator::Generate()
} }
} }
Depth(); Depth_AVX2();
Texture(); Texture_AVX2();
Color(); Color_AVX2();
ret(); ret();
} }
void GSSetupPrimCodeGenerator::Depth() void GSSetupPrimCodeGenerator::Depth_AVX2()
{ {
if(!m_en.z && !m_en.f) if(!m_en.z && !m_en.f)
{ {
@ -149,7 +149,7 @@ void GSSetupPrimCodeGenerator::Depth()
} }
} }
void GSSetupPrimCodeGenerator::Texture() void GSSetupPrimCodeGenerator::Texture_AVX2()
{ {
if(!m_en.t) if(!m_en.t)
{ {
@ -219,7 +219,7 @@ void GSSetupPrimCodeGenerator::Texture()
} }
} }
void GSSetupPrimCodeGenerator::Color() void GSSetupPrimCodeGenerator::Color_AVX2()
{ {
if(!m_en.c) if(!m_en.c)
{ {

View File

@ -147,6 +147,7 @@
<ClCompile Include="GSDrawScanline.cpp" /> <ClCompile Include="GSDrawScanline.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx2.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp" />
@ -173,6 +174,7 @@
<ClCompile Include="GSSettingsDlg.cpp" /> <ClCompile Include="GSSettingsDlg.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx2.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" />

View File

@ -231,6 +231,9 @@
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp"> <ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp"> <ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
@ -249,6 +252,9 @@
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp"> <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSRendererCL.cpp"> <ClCompile Include="GSRendererCL.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>