gsdx sw JIT: dynamically select between the AVX1 and SSE code paths (scanline)

This commit is contained in:
Gregory Hainaut 2016-11-19 14:47:40 +01:00
parent 6b78b8f9ce
commit 574a2c774e
5 changed files with 441 additions and 428 deletions

View File

@ -22,6 +22,17 @@
#include "stdafx.h" #include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
// Dispatcher compiled only when _M_SSE is below 0x501: the `>= 0x501` branch of this
// conditional is empty and the definition lives in the #else branch.
#if _M_SSE >= 0x501
#else
// Generate(): calls Generate_AVX() when g_cpu reports AVX support, otherwise
// Generate_SSE(). The choice is made when this function runs, not at compile time.
void GSDrawScanlineCodeGenerator::Generate()
{
if(g_cpu.has(util::Cpu::tAVX))
Generate_AVX();
else
Generate_SSE();
}
#endif
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] = alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] =
@ -183,194 +194,179 @@ void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
// modulate16(a, f, shift): emits the 16-bit multiply of register `a` by operand `f`.
//  - AVX available (g_cpu): shift == 0 emits vpmulhrsw(a, f); any other shift emits
//    vpsllw(a, shift + 1) followed by vpmulhw(a, f).
//  - otherwise: shift == 0 emits pmulhrsw(a, f) only if SSSE3 is reported; every other
//    case emits psllw(a, shift + 1) followed by pmulhw(a, f).
// The `#if _M_SSE >= 0x500` / `#else` / `#endif` fragments belong to the pre-change
// version, which made the same split at compile time.
// NOTE(review): the SSSE3 test reads m_cpu while the AVX test reads g_cpu — confirm both
// objects describe the same CPU feature set.
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift) void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
if(shift == 0)
{ {
vpmulhrsw(a, f); if(shift == 0)
{
vpmulhrsw(a, f);
}
else
{
vpsllw(a, shift + 1);
vpmulhw(a, f);
}
} }
else else
{ {
vpsllw(a, shift + 1); if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
vpmulhw(a, f); {
pmulhrsw(a, f);
}
else
{
psllw(a, shift + 1);
pmulhw(a, f);
}
} }
#else
if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
{
pmulhrsw(a, f);
}
else
{
psllw(a, shift + 1);
pmulhw(a, f);
}
#endif
} }
// lerp16(a, b, f, shift): emits three steps that leave the result in `a`:
//   1. subtract b from a   ((v)psubw)
//   2. modulate16(a, f, shift)
//   3. add b back to a     ((v)paddw)
// The v-prefixed instructions are emitted when g_cpu reports AVX, the legacy SSE forms
// otherwise. `b` and `f` are only read by the instructions shown here.
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift) void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpsubw(a, b); vpsubw(a, b);
modulate16(a, f, shift); modulate16(a, f, shift);
vpaddw(a, b); vpaddw(a, b);
}
#else else
{
psubw(a, b); psubw(a, b);
modulate16(a, f, shift); modulate16(a, f, shift);
paddw(a, b); paddw(a, b);
}
#endif
} }
// lerp16_4(a, b, f): same shape as lerp16, but the scaling step is written out inline:
// subtract b from a, multiply by f with (v)pmullw, arithmetic-shift right by 4 with
// (v)psraw, then add b back. The result is left in `a`.
// AVX encodings are emitted when g_cpu reports AVX, the SSE encodings otherwise.
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f) void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpsubw(a, b); vpsubw(a, b);
vpmullw(a, f); vpmullw(a, f);
vpsraw(a, 4); vpsraw(a, 4);
vpaddw(a, b); vpaddw(a, b);
}
#else else
{
psubw(a, b); psubw(a, b);
pmullw(a, f); pmullw(a, f);
psraw(a, 4); psraw(a, 4);
paddw(a, b); paddw(a, b);
}
#endif
} }
// mix16(a, b, temp): merges 16-bit words of `b` into `a` using the blend selector 0xaa.
// Three code paths, chosen from g_cpu while generating:
//   - AVX:     vpblendw(a, b, 0xaa)
//   - SSE4.1:  pblendw(a, b, 0xaa)
//   - neither: builds a mask in `temp` (pcmpeqd, then psrld by 16) and combines the
//              registers with pand / pandn / por.
// `temp` is written only on the last path; `a` receives the result on all of them.
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp) void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
vpblendw(a, b, 0xaa);
#else
if(g_cpu.has(util::Cpu::tSSE41))
{ {
pblendw(a, b, 0xaa); vpblendw(a, b, 0xaa);
} }
else else
{ {
pcmpeqd(temp, temp); if(g_cpu.has(util::Cpu::tSSE41))
psrld(temp, 16); {
pand(a, temp); pblendw(a, b, 0xaa);
pandn(temp, b); }
por(a, temp); else
{
pcmpeqd(temp, temp);
psrld(temp, 16);
pand(a, temp);
pandn(temp, b);
por(a, temp);
}
} }
#endif
} }
// clamp16(a, temp): packs `a` with itself using (v)packuswb and then widens the bytes
// back to words, leaving the result in `a`.
//   - AVX:     vpackuswb + vpmovzxbw
//   - SSE4.1:  packuswb + pmovzxbw
//   - neither: packuswb, then zeroes `temp` with pxor and interleaves with punpcklbw.
// `temp` is clobbered only on the path without SSE4.1.
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
vpackuswb(a, a);
vpmovzxbw(a, a);
#else
if(g_cpu.has(util::Cpu::tSSE41))
{ {
packuswb(a, a); vpackuswb(a, a);
pmovzxbw(a, a); vpmovzxbw(a, a);
} }
else else
{ {
packuswb(a, a); if(g_cpu.has(util::Cpu::tSSE41))
pxor(temp, temp); {
punpcklbw(a, temp); packuswb(a, a);
pmovzxbw(a, a);
}
else
{
packuswb(a, a);
pxor(temp, temp);
punpcklbw(a, temp);
}
} }
#endif
} }
// alltrue(): emits a test of xmm7 and a near jump to the "step" label.
// The byte mask of xmm7 is moved into eax ((v)pmovmskb), compared with 0xffff, and
// je("step", T_NEAR) is taken on equality. eax is overwritten by the emitted code.
// Both branches emit the same sequence; only the move-mask encoding differs (AVX
// when g_cpu reports it, SSE otherwise).
void GSDrawScanlineCodeGenerator::alltrue() void GSDrawScanlineCodeGenerator::alltrue()
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpmovmskb(eax, xmm7); vpmovmskb(eax, xmm7);
cmp(eax, 0xffff); cmp(eax, 0xffff);
je("step", T_NEAR); je("step", T_NEAR);
}
#else else
{
pmovmskb(eax, xmm7); pmovmskb(eax, xmm7);
cmp(eax, 0xffff); cmp(eax, 0xffff);
je("step", T_NEAR); je("step", T_NEAR);
}
#endif
} }
// blend(a, b, mask): emits a mask-select of `a` and `b` built from and / andn / or,
// with the result stored in `a`.
// Both `b` and `mask` are overwritten by the emitted instructions on every path.
//   - AVX: the final vpor writes straight into `a` (three-operand form).
//   - SSE: por accumulates into `b`, and movdqa then copies `b` into `a`.
void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask) void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpand(b, mask); vpand(b, mask);
vpandn(mask, a); vpandn(mask, a);
vpor(a, b, mask); vpor(a, b, mask);
}
#else else
{
pand(b, mask); pand(b, mask);
pandn(mask, a); pandn(mask, a);
por(b, mask); por(b, mask);
movdqa(a, b); movdqa(a, b);
}
#endif
} }
// blendr(b, a, mask): same and / andn / or sequence as blend(), but the result stays
// in `b` (note the parameter order: `b` comes first) and `a` is only read.
// `mask` is overwritten by the andn step. AVX encodings are used when g_cpu reports
// AVX, SSE encodings otherwise.
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask) void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpand(b, mask); vpand(b, mask);
vpandn(mask, a); vpandn(mask, a);
vpor(b, mask); vpor(b, mask);
}
#else else
{
pand(b, mask); pand(b, mask);
pandn(mask, a); pandn(mask, a);
por(b, mask); por(b, mask);
}
#endif
} }
// blend8(a, b): emits a byte blend of `b` into `a`, result in `a`.
//   - AVX:     vpblendvb(a, a, b, xmm0), with xmm0 passed explicitly as the selector
//   - SSE4.1:  pblendvb(a, b)  (presumably uses xmm0 implicitly — confirm against the
//              wrapper / instruction reference)
//   - neither: falls back to blend(a, b, xmm0), which also overwrites b and xmm0.
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
vpblendvb(a, a, b, xmm0);
vpblendvb(a, a, b, xmm0); else if(g_cpu.has(util::Cpu::tSSE41))
#else
if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(a, b); pblendvb(a, b);
else else
blend(a, b, xmm0); blend(a, b, xmm0);
#endif
} }
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{ {
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
{
vpblendvb(b, a, b, xmm0); vpblendvb(b, a, b, xmm0);
}
#else else if(g_cpu.has(util::Cpu::tSSE41))
if(g_cpu.has(util::Cpu::tSSE41))
{ {
pblendvb(a, b); pblendvb(a, b);
movdqa(b, a); movdqa(b, a);
@ -379,8 +375,6 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{ {
blendr(b, a, xmm0); blendr(b, a, xmm0);
} }
#endif
} }
void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src) void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
@ -388,31 +382,34 @@ void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const
// l = src & 0xFF; (1 left shift + 1 right shift) // l = src & 0xFF; (1 left shift + 1 right shift)
// h = (src >> 8) & 0xFF; (1 right shift) // h = (src >> 8) & 0xFF; (1 right shift)
#if _M_SSE >= 0x500 if(g_cpu.has(util::Cpu::tAVX))
if (src == h) { {
vpsllw(l, src, 8); if (src == h) {
vpsrlw(h, 8); vpsllw(l, src, 8);
} else if (src == l) { vpsrlw(h, 8);
vpsrlw(h, src, 8); } else if (src == l) {
vpsllw(l, 8); vpsrlw(h, src, 8);
} else { vpsllw(l, 8);
vpsllw(l, src, 8); } else {
vpsrlw(h, src, 8); vpsllw(l, src, 8);
vpsrlw(h, src, 8);
}
vpsrlw(l, 8);
} }
vpsrlw(l, 8); else
#else {
if (src == h) { if (src == h) {
movdqa(l, src); movdqa(l, src);
} else if (src == l) { } else if (src == l) {
movdqa(h, src); movdqa(h, src);
} else { } else {
movdqa(l, src); movdqa(l, src);
movdqa(h, src); movdqa(h, src);
}
psllw(l, 8);
psrlw(l, 8);
psrlw(h, 8);
} }
psllw(l, 8);
psrlw(l, 8);
psrlw(h, 8);
#endif
} }
#endif #endif

View File

@ -27,6 +27,12 @@
using namespace Xbyak; using namespace Xbyak;
// RegLong: register type used for the `addr` parameter of the ReadPixel / WritePixel
// declarations below. Expands to Reg64 when _M_AMD64 or _WIN64 is defined and to Reg32
// otherwise, so one set of declarations covers both builds.
#if defined(_M_AMD64) || defined(_WIN64)
#define RegLong Reg64
#else
#define RegLong Reg32
#endif
class GSDrawScanlineCodeGenerator : public GSCodeGenerator class GSDrawScanlineCodeGenerator : public GSCodeGenerator
{ {
void operator = (const GSDrawScanlineCodeGenerator&); void operator = (const GSDrawScanlineCodeGenerator&);
@ -58,17 +64,9 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void WriteZBuf(); void WriteZBuf();
void AlphaBlend(); void AlphaBlend();
void WriteFrame(); void WriteFrame();
void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr);
#if defined(_M_AMD64) || defined(_WIN64) void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz);
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr); void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
#else
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
#endif
void ReadTexel(int pixels, int mip_offset = 0); void ReadTexel(int pixels, int mip_offset = 0);
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i); void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
@ -85,39 +83,59 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
#else #else
void Init(); void Generate_SSE();
void Step(); void Init_SSE();
void TestZ(const Xmm& temp1, const Xmm& temp2); void Step_SSE();
void SampleTexture(); void TestZ_SSE(const Xmm& temp1, const Xmm& temp2);
void Wrap(const Xmm& uv0); void SampleTexture_SSE();
void Wrap(const Xmm& uv0, const Xmm& uv1); void Wrap_SSE(const Xmm& uv0);
void SampleTextureLOD(); void Wrap_SSE(const Xmm& uv0, const Xmm& uv1);
void WrapLOD(const Xmm& uv0); void SampleTextureLOD_SSE();
void WrapLOD(const Xmm& uv0, const Xmm& uv1); void WrapLOD_SSE(const Xmm& uv0);
void AlphaTFX(); void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1);
void ReadMask(); void AlphaTFX_SSE();
void TestAlpha(); void ReadMask_SSE();
void ColorTFX(); void TestAlpha_SSE();
void Fog(); void ColorTFX_SSE();
void ReadFrame(); void Fog_SSE();
void TestDestAlpha(); void ReadFrame_SSE();
void WriteMask(); void TestDestAlpha_SSE();
void WriteZBuf(); void WriteMask_SSE();
void AlphaBlend(); void WriteZBuf_SSE();
void WriteFrame(); void AlphaBlend_SSE();
void WriteFrame_SSE();
void ReadPixel_SSE(const Xmm& dst, const RegLong& addr);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_SSE(int pixels, int mip_offset = 0);
void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i);
#if defined(_M_AMD64) || defined(_WIN64) void Generate_AVX();
void ReadPixel(const Xmm& dst, const Reg64& addr); void Init_AVX();
void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz); void Step_AVX();
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm); void TestZ_AVX(const Xmm& temp1, const Xmm& temp2);
#else void SampleTexture_AVX();
void ReadPixel(const Xmm& dst, const Reg32& addr); void Wrap_AVX(const Xmm& uv0);
void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz); void Wrap_AVX(const Xmm& uv0, const Xmm& uv1);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm); void SampleTextureLOD_AVX();
#endif void WrapLOD_AVX(const Xmm& uv0);
void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1);
void ReadTexel(int pixels, int mip_offset = 0); void AlphaTFX_AVX();
void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i); void ReadMask_AVX();
void TestAlpha_AVX();
void ColorTFX_AVX();
void Fog_AVX();
void ReadFrame_AVX();
void TestDestAlpha_AVX();
void WriteMask_AVX();
void WriteZBuf_AVX();
void AlphaBlend_AVX();
void WriteFrame_AVX();
void ReadPixel_AVX(const Xmm& dst, const RegLong& addr);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_AVX(int pixels, int mip_offset = 0);
void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i);
void modulate16(const Xmm& a, const Operand& f, int shift); void modulate16(const Xmm& a, const Operand& f, int shift);
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift); void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift);

View File

@ -45,7 +45,7 @@
#define _zm xmm5 #define _zm xmm5
#define _fd xmm6 #define _fd xmm6
#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
#ifdef _WIN64 #ifdef _WIN64
#else #else
@ -59,7 +59,7 @@ static const int _rz_zd = -8 * 10;
static const int _rz_cov = -8 * 12; static const int _rz_cov = -8 * 12;
#endif #endif
void GSDrawScanlineCodeGenerator::Generate() void GSDrawScanlineCodeGenerator::Generate_AVX()
{ {
bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
bool need_clut = need_tex && m_sel.tlu; bool need_clut = need_tex && m_sel.tlu;
@ -100,7 +100,7 @@ void GSDrawScanlineCodeGenerator::Generate()
if(need_tex) if(need_tex)
mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]); mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]);
Init(); Init_AVX();
// a0 = steps // a0 = steps
// t1 = fza_base // t1 = fza_base
@ -126,30 +126,30 @@ void GSDrawScanlineCodeGenerator::Generate()
L("loop"); L("loop");
TestZ(xmm5, xmm6); TestZ_AVX(xmm5, xmm6);
// ebp = za // ebp = za
if(m_sel.mmin) if(m_sel.mmin)
{ {
SampleTextureLOD(); SampleTextureLOD_AVX();
} }
else else
{ {
SampleTexture(); SampleTexture_AVX();
} }
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
// xmm3 = ga // xmm3 = ga
AlphaTFX(); AlphaTFX_AVX();
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
// xmm3 = ga // xmm3 = ga
ReadMask(); ReadMask_AVX();
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
@ -157,7 +157,7 @@ L("loop");
// xmm4 = fm // xmm4 = fm
// xmm5 = zm // xmm5 = zm
TestAlpha(); TestAlpha_AVX();
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
@ -165,7 +165,7 @@ L("loop");
// xmm4 = fm // xmm4 = fm
// xmm5 = zm // xmm5 = zm
ColorTFX(); ColorTFX_AVX();
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
@ -173,7 +173,7 @@ L("loop");
// xmm4 = fm // xmm4 = fm
// xmm5 = zm // xmm5 = zm
Fog(); Fog_AVX();
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
@ -181,7 +181,7 @@ L("loop");
// xmm4 = fm // xmm4 = fm
// xmm5 = zm // xmm5 = zm
ReadFrame(); ReadFrame_AVX();
// ebx = fa // ebx = fa
// ebp = za // ebp = za
@ -191,7 +191,7 @@ L("loop");
// xmm5 = zm // xmm5 = zm
// xmm6 = fd // xmm6 = fd
TestDestAlpha(); TestDestAlpha_AVX();
// ebx = fa // ebx = fa
// ebp = za // ebp = za
@ -201,7 +201,7 @@ L("loop");
// xmm5 = zm // xmm5 = zm
// xmm6 = fd // xmm6 = fd
WriteMask(); WriteMask_AVX();
// ebx = fa // ebx = fa
// edx = fzm // edx = fzm
@ -212,7 +212,7 @@ L("loop");
// xmm5 = zm // xmm5 = zm
// xmm6 = fd // xmm6 = fd
WriteZBuf(); WriteZBuf_AVX();
// ebx = fa // ebx = fa
// edx = fzm // edx = fzm
@ -221,7 +221,7 @@ L("loop");
// xmm4 = fm // xmm4 = fm
// xmm6 = fd // xmm6 = fd
AlphaBlend(); AlphaBlend_AVX();
// ebx = fa // ebx = fa
// edx = fzm // edx = fzm
@ -230,7 +230,7 @@ L("loop");
// xmm4 = fm // xmm4 = fm
// xmm6 = fd // xmm6 = fd
WriteFrame(); WriteFrame_AVX();
L("step"); L("step");
@ -242,7 +242,7 @@ L("step");
jle("exit", T_NEAR); jle("exit", T_NEAR);
Step(); Step_AVX();
jmp("loop", T_NEAR); jmp("loop", T_NEAR);
} }
@ -277,7 +277,7 @@ L("exit");
ret(); ret();
} }
void GSDrawScanlineCodeGenerator::Init() void GSDrawScanlineCodeGenerator::Init_AVX()
{ {
if(!m_sel.notest) if(!m_sel.notest)
{ {
@ -480,7 +480,7 @@ void GSDrawScanlineCodeGenerator::Init()
} }
} }
void GSDrawScanlineCodeGenerator::Step() void GSDrawScanlineCodeGenerator::Step_AVX()
{ {
// steps -= 4; // steps -= 4;
@ -603,7 +603,7 @@ void GSDrawScanlineCodeGenerator::Step()
} }
} }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
{ {
if(!m_sel.zb) if(!m_sel.zb)
{ {
@ -661,7 +661,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest) if(m_sel.ztest)
{ {
ReadPixel(xmm1, rbp); ReadPixel_AVX(xmm1, rbp);
if(m_sel.zwrite && m_sel.zpsm < 2) if(m_sel.zwrite && m_sel.zpsm < 2)
{ {
@ -715,7 +715,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTexture() void GSDrawScanlineCodeGenerator::SampleTexture_AVX()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -786,13 +786,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
Wrap(xmm4, xmm5); Wrap_AVX(xmm4, xmm5);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
Wrap(xmm4); Wrap_AVX(xmm4);
} }
// xmm4 = uv0 // xmm4 = uv0
@ -854,7 +854,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_AVX(4, 0);
// xmm0 = c10 // xmm0 = c10
// xmm1 = c11 // xmm1 = c11
@ -944,7 +944,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -958,7 +958,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// xmm3 = ga // xmm3 = ga
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
{ {
// xmm0, xmm1, xmm2, xmm3 = free // xmm0, xmm1, xmm2, xmm3 = free
@ -1019,7 +1019,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm0, xmm1, xmm2, xmm3 = free // xmm0, xmm1, xmm2, xmm3 = free
@ -1111,19 +1111,19 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTextureLOD() void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
{ {
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv)
{ {
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1)
{ {
} }
void GSDrawScanlineCodeGenerator::AlphaTFX() void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -1261,7 +1261,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
} }
} }
void GSDrawScanlineCodeGenerator::ReadMask() void GSDrawScanlineCodeGenerator::ReadMask_AVX()
{ {
if(m_sel.fwrite) if(m_sel.fwrite)
{ {
@ -1274,7 +1274,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
} }
} }
void GSDrawScanlineCodeGenerator::TestAlpha() void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
{ {
switch(m_sel.atst) switch(m_sel.atst)
{ {
@ -1345,7 +1345,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
} }
} }
void GSDrawScanlineCodeGenerator::ColorTFX() void GSDrawScanlineCodeGenerator::ColorTFX_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -1410,7 +1410,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
} }
} }
void GSDrawScanlineCodeGenerator::Fog() void GSDrawScanlineCodeGenerator::Fog_AVX()
{ {
if(!m_sel.fwrite || !m_sel.fge) if(!m_sel.fwrite || !m_sel.fge)
{ {
@ -1431,7 +1431,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(_ga, xmm6, _f); mix16(_ga, xmm6, _f);
} }
void GSDrawScanlineCodeGenerator::ReadFrame() void GSDrawScanlineCodeGenerator::ReadFrame_AVX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -1449,10 +1449,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return; return;
} }
ReadPixel(_fd, rbx); ReadPixel_AVX(_fd, rbx);
} }
void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX()
{ {
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{ {
@ -1496,7 +1496,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue(); alltrue();
} }
void GSDrawScanlineCodeGenerator::WriteMask() void GSDrawScanlineCodeGenerator::WriteMask_AVX()
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -1542,7 +1542,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx); not(edx);
} }
void GSDrawScanlineCodeGenerator::WriteZBuf() void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
{ {
if(!m_sel.zwrite) if(!m_sel.zwrite)
{ {
@ -1571,10 +1571,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1); WritePixel_AVX(xmm1, rbp, dh, fast, m_sel.zpsm, 1);
} }
void GSDrawScanlineCodeGenerator::AlphaBlend() void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -1798,7 +1798,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
} }
} }
void GSDrawScanlineCodeGenerator::WriteFrame() void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -1889,16 +1889,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); WritePixel_AVX(xmm2, rbx, dl, fast, m_sel.fpsm, 0);
} }
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg64& addr)
{ {
vmovq(dst, qword[_m_local__gd__vm + addr * 2]); vmovq(dst, qword[_m_local__gd__vm + addr * 2]);
vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]); vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]);
} }
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -1909,10 +1909,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
} }
else else
{ {
WritePixel(src, addr, 0, psm); WritePixel_AVX(src, addr, 0, psm);
WritePixel(src, addr, 1, psm); WritePixel_AVX(src, addr, 1, psm);
WritePixel(src, addr, 2, psm); WritePixel_AVX(src, addr, 2, psm);
WritePixel(src, addr, 3, psm); WritePixel_AVX(src, addr, 3, psm);
} }
} }
else else
@ -1943,22 +1943,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
test(mask, 0x03); test(mask, 0x03);
je("@f"); je("@f");
WritePixel(src, addr, 0, psm); WritePixel_AVX(src, addr, 0, psm);
L("@@"); L("@@");
test(mask, 0x0c); test(mask, 0x0c);
je("@f"); je("@f");
WritePixel(src, addr, 1, psm); WritePixel_AVX(src, addr, 1, psm);
L("@@"); L("@@");
test(mask, 0x30); test(mask, 0x30);
je("@f"); je("@f");
WritePixel(src, addr, 2, psm); WritePixel_AVX(src, addr, 2, psm);
L("@@"); L("@@");
test(mask, 0xc0); test(mask, 0xc0);
je("@f"); je("@f");
WritePixel(src, addr, 3, psm); WritePixel_AVX(src, addr, 3, psm);
L("@@"); L("@@");
} }
} }
@ -1966,7 +1966,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
static const int s_offsets[4] = {0, 2, 8, 10}; static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, uint8 i, int psm)
{ {
Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2]; Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2];
@ -1990,7 +1990,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{ {
const int in[] = {0, 1, 2, 3}; const int in[] = {0, 1, 2, 3};
const int out[] = {4, 5, 0, 1}; const int out[] = {4, 5, 0, 1};
@ -1999,12 +1999,12 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
{ {
for(int j = 0; j < 4; j++) for(int j = 0; j < 4; j++)
{ {
ReadTexel(Xmm(out[i]), Xmm(in[i]), j); ReadTexel_AVX(Xmm(out[i]), Xmm(in[i]), j);
} }
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i)
{ {
const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4]; const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4];
@ -2026,7 +2026,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin
// And palette need zero masking. // And palette need zero masking.
// It is not possible to use same source/destination so linear interpolation must be updated // It is not possible to use same source/destination so linear interpolation must be updated
#if 0 #if 0
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{ {
const int in[] = {0, 1, 2, 3}; const int in[] = {0, 1, 2, 3};
const int out[] = {4, 5, 0, 1}; const int out[] = {4, 5, 0, 1};

View File

@ -23,21 +23,20 @@
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
static const int _args = 16; static const int _args = 16;
static const int _top = _args + 4; static const int _top = _args + 4;
static const int _v = _args + 8; static const int _v = _args + 8;
void GSDrawScanlineCodeGenerator::Generate() void GSDrawScanlineCodeGenerator::Generate_AVX()
{ {
//ret(8);
push(ebx); push(ebx);
push(esi); push(esi);
push(edi); push(edi);
push(ebp); push(ebp);
Init(); Init_AVX();
if(!m_sel.edge) if(!m_sel.edge)
{ {
@ -59,7 +58,7 @@ L("loop");
bool tme = m_sel.tfx != TFX_NONE; bool tme = m_sel.tfx != TFX_NONE;
TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); TestZ_AVX(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -75,11 +74,11 @@ L("loop");
if(m_sel.mmin) if(m_sel.mmin)
{ {
SampleTextureLOD(); SampleTextureLOD_AVX();
} }
else else
{ {
SampleTexture(); SampleTexture_AVX();
} }
// ecx = steps // ecx = steps
@ -93,7 +92,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
AlphaTFX(); AlphaTFX_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -104,7 +103,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ReadMask(); ReadMask_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -117,7 +116,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
TestAlpha(); TestAlpha_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -130,7 +129,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ColorTFX(); ColorTFX_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -142,7 +141,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
Fog(); Fog_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -154,7 +153,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ReadFrame(); ReadFrame_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -167,7 +166,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
TestDestAlpha(); TestDestAlpha_AVX();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -180,7 +179,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
WriteMask(); WriteMask_AVX();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -194,7 +193,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
WriteZBuf(); WriteZBuf_AVX();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -208,7 +207,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
AlphaBlend(); AlphaBlend_AVX();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -220,7 +219,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
WriteFrame(); WriteFrame_AVX();
L("step"); L("step");
@ -232,7 +231,7 @@ L("step");
jle("exit", T_NEAR); jle("exit", T_NEAR);
Step(); Step_AVX();
jmp("loop", T_NEAR); jmp("loop", T_NEAR);
} }
@ -249,7 +248,7 @@ L("exit");
ret(8); ret(8);
} }
void GSDrawScanlineCodeGenerator::Init() void GSDrawScanlineCodeGenerator::Init_AVX()
{ {
if(!m_sel.notest) if(!m_sel.notest)
{ {
@ -455,7 +454,7 @@ void GSDrawScanlineCodeGenerator::Init()
} }
} }
void GSDrawScanlineCodeGenerator::Step() void GSDrawScanlineCodeGenerator::Step_AVX()
{ {
// steps -= 4; // steps -= 4;
@ -596,7 +595,7 @@ void GSDrawScanlineCodeGenerator::Step()
} }
} }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
{ {
if(!m_sel.zb) if(!m_sel.zb)
{ {
@ -644,7 +643,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest) if(m_sel.ztest)
{ {
ReadPixel(xmm1, ebp); ReadPixel_AVX(xmm1, ebp);
if(m_sel.zwrite && m_sel.zpsm < 2) if(m_sel.zwrite && m_sel.zpsm < 2)
{ {
@ -694,7 +693,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTexture() void GSDrawScanlineCodeGenerator::SampleTexture_AVX()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -775,13 +774,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
Wrap(xmm2, xmm3); Wrap_AVX(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
Wrap(xmm2); Wrap_AVX(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -843,7 +842,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_AVX(4, 0);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -935,7 +934,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -946,7 +945,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
{ {
// xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1007,7 +1006,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1099,7 +1098,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTextureLOD() void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -1360,13 +1359,13 @@ return;
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3); WrapLOD_AVX(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
WrapLOD(xmm2); WrapLOD_AVX(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -1428,7 +1427,7 @@ return;
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_AVX(4, 0);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -1520,7 +1519,7 @@ return;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -1591,13 +1590,13 @@ return;
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3); WrapLOD_AVX(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
WrapLOD(xmm2); WrapLOD_AVX(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -1659,7 +1658,7 @@ return;
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 1); ReadTexel_AVX(4, 1);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -1751,7 +1750,7 @@ return;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 1); ReadTexel_AVX(1, 1);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -1774,7 +1773,7 @@ return;
pop(ebp); pop(ebp);
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv)
{ {
// xmm5 = minuv // xmm5 = minuv
// xmm6 = maxuv // xmm6 = maxuv
@ -1835,7 +1834,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm5 = minuv // xmm5 = minuv
// xmm6 = maxuv // xmm6 = maxuv
@ -1923,7 +1922,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::AlphaTFX() void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -2071,7 +2070,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
} }
} }
void GSDrawScanlineCodeGenerator::ReadMask() void GSDrawScanlineCodeGenerator::ReadMask_AVX()
{ {
if(m_sel.fwrite) if(m_sel.fwrite)
{ {
@ -2084,7 +2083,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
} }
} }
void GSDrawScanlineCodeGenerator::TestAlpha() void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
{ {
switch(m_sel.atst) switch(m_sel.atst)
{ {
@ -2155,7 +2154,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
} }
} }
void GSDrawScanlineCodeGenerator::ColorTFX() void GSDrawScanlineCodeGenerator::ColorTFX_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2231,7 +2230,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
} }
} }
void GSDrawScanlineCodeGenerator::Fog() void GSDrawScanlineCodeGenerator::Fog_AVX()
{ {
if(!m_sel.fwrite || !m_sel.fge) if(!m_sel.fwrite || !m_sel.fge)
{ {
@ -2252,7 +2251,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(xmm6, xmm1, xmm0); mix16(xmm6, xmm1, xmm0);
} }
void GSDrawScanlineCodeGenerator::ReadFrame() void GSDrawScanlineCodeGenerator::ReadFrame_AVX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -2270,10 +2269,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return; return;
} }
ReadPixel(xmm2, ebx); ReadPixel_AVX(xmm2, ebx);
} }
void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX()
{ {
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{ {
@ -2317,7 +2316,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue(); alltrue();
} }
void GSDrawScanlineCodeGenerator::WriteMask() void GSDrawScanlineCodeGenerator::WriteMask_AVX()
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -2363,7 +2362,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx); not(edx);
} }
void GSDrawScanlineCodeGenerator::WriteZBuf() void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
{ {
if(!m_sel.zwrite) if(!m_sel.zwrite)
{ {
@ -2381,10 +2380,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
} }
void GSDrawScanlineCodeGenerator::AlphaBlend() void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2606,7 +2605,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
} }
} }
void GSDrawScanlineCodeGenerator::WriteFrame() void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2686,16 +2685,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); WritePixel_AVX(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
} }
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg32& addr)
{ {
vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]);
vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
} }
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -2706,10 +2705,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
} }
else else
{ {
WritePixel(src, addr, 0, psm); WritePixel_AVX(src, addr, 0, psm);
WritePixel(src, addr, 1, psm); WritePixel_AVX(src, addr, 1, psm);
WritePixel(src, addr, 2, psm); WritePixel_AVX(src, addr, 2, psm);
WritePixel(src, addr, 3, psm); WritePixel_AVX(src, addr, 3, psm);
} }
} }
else else
@ -2740,22 +2739,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
test(mask, 0x03); test(mask, 0x03);
je("@f"); je("@f");
WritePixel(src, addr, 0, psm); WritePixel_AVX(src, addr, 0, psm);
L("@@"); L("@@");
test(mask, 0x0c); test(mask, 0x0c);
je("@f"); je("@f");
WritePixel(src, addr, 1, psm); WritePixel_AVX(src, addr, 1, psm);
L("@@"); L("@@");
test(mask, 0x30); test(mask, 0x30);
je("@f"); je("@f");
WritePixel(src, addr, 2, psm); WritePixel_AVX(src, addr, 2, psm);
L("@@"); L("@@");
test(mask, 0xc0); test(mask, 0xc0);
je("@f"); je("@f");
WritePixel(src, addr, 3, psm); WritePixel_AVX(src, addr, 3, psm);
L("@@"); L("@@");
} }
} }
@ -2763,7 +2762,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
static const int s_offsets[] = {0, 2, 8, 10}; static const int s_offsets[] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{ {
Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
@ -2788,7 +2787,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{ {
// in // in
// xmm5 = addr00 // xmm5 = addr00
@ -2827,7 +2826,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int i = 0; i < pixels; i++) for(int i = 0; i < pixels; i++)
{ {
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
} }
} }
@ -2846,19 +2845,18 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
} }
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
const int t[] = {4, 1, 5, 2};
for(int i = 0; i < pixels; i++) for(int i = 0; i < pixels; i++)
{ {
for(uint8 j = 0; j < 4; j++) for(uint8 j = 0; j < 4; j++)
{ {
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
} }
} }
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i)
{ {
ASSERT(i < 4); ASSERT(i < 4);

View File

@ -23,20 +23,20 @@
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
static const int _args = 16; static const int _args = 16;
static const int _top = _args + 4; static const int _top = _args + 4;
static const int _v = _args + 8; static const int _v = _args + 8;
void GSDrawScanlineCodeGenerator::Generate() void GSDrawScanlineCodeGenerator::Generate_SSE()
{ {
push(ebx); push(ebx);
push(esi); push(esi);
push(edi); push(edi);
push(ebp); push(ebp);
Init(); Init_SSE();
if(!m_sel.edge) if(!m_sel.edge)
{ {
@ -58,7 +58,7 @@ L("loop");
bool tme = m_sel.tfx != TFX_NONE; bool tme = m_sel.tfx != TFX_NONE;
TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); TestZ_SSE(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -74,11 +74,11 @@ L("loop");
if(m_sel.mmin) if(m_sel.mmin)
{ {
SampleTextureLOD(); SampleTextureLOD_SSE();
} }
else else
{ {
SampleTexture(); SampleTexture_SSE();
} }
// ecx = steps // ecx = steps
@ -92,7 +92,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
AlphaTFX(); AlphaTFX_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -103,7 +103,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ReadMask(); ReadMask_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -116,7 +116,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
TestAlpha(); TestAlpha_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -129,7 +129,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ColorTFX(); ColorTFX_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -141,7 +141,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
Fog(); Fog_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -153,7 +153,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
ReadFrame(); ReadFrame_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -166,7 +166,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
TestDestAlpha(); TestDestAlpha_SSE();
// ecx = steps // ecx = steps
// esi = fzbr // esi = fzbr
@ -179,7 +179,7 @@ L("loop");
// xmm6 = ga // xmm6 = ga
// xmm7 = test // xmm7 = test
WriteMask(); WriteMask_SSE();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -193,7 +193,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
WriteZBuf(); WriteZBuf_SSE();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -207,7 +207,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
AlphaBlend(); AlphaBlend_SSE();
// ebx = fa // ebx = fa
// ecx = steps // ecx = steps
@ -219,7 +219,7 @@ L("loop");
// xmm5 = rb // xmm5 = rb
// xmm6 = ga // xmm6 = ga
WriteFrame(); WriteFrame_SSE();
L("step"); L("step");
@ -231,7 +231,7 @@ L("step");
jle("exit", T_NEAR); jle("exit", T_NEAR);
Step(); Step_SSE();
jmp("loop", T_NEAR); jmp("loop", T_NEAR);
} }
@ -248,7 +248,7 @@ L("exit");
ret(8); ret(8);
} }
void GSDrawScanlineCodeGenerator::Init() void GSDrawScanlineCodeGenerator::Init_SSE()
{ {
if(!m_sel.notest) if(!m_sel.notest)
{ {
@ -457,7 +457,7 @@ void GSDrawScanlineCodeGenerator::Init()
} }
} }
void GSDrawScanlineCodeGenerator::Step() void GSDrawScanlineCodeGenerator::Step_SSE()
{ {
// steps -= 4; // steps -= 4;
@ -600,7 +600,7 @@ void GSDrawScanlineCodeGenerator::Step()
} }
} }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
{ {
if(!m_sel.zb) if(!m_sel.zb)
{ {
@ -648,7 +648,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest) if(m_sel.ztest)
{ {
ReadPixel(xmm1, ebp); ReadPixel_SSE(xmm1, ebp);
if(m_sel.zwrite && m_sel.zpsm < 2) if(m_sel.zwrite && m_sel.zpsm < 2)
{ {
@ -698,7 +698,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTexture() void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -780,13 +780,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
Wrap(xmm2, xmm3); Wrap_SSE(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
Wrap(xmm2); Wrap_SSE(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -853,7 +853,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_SSE(4, 0);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -946,7 +946,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_SSE(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -957,7 +957,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
{ {
// xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1020,7 +1020,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1131,7 +1131,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::SampleTextureLOD() void GSDrawScanlineCodeGenerator::SampleTextureLOD_SSE()
{ {
if(!m_sel.fb || m_sel.tfx == TFX_NONE) if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{ {
@ -1140,7 +1140,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
push(ebp); push(ebp);
mov(ebp, (size_t)m_local.gd->tex); mov(ebp, (size_t)m_local.gd->tex);
if(m_sel.tlu) if(m_sel.tlu)
{ {
@ -1354,13 +1354,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3); WrapLOD_SSE(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
WrapLOD(xmm2); WrapLOD_SSE(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -1427,7 +1427,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0); ReadTexel_SSE(4, 0);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -1520,7 +1520,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0); ReadTexel_SSE(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -1592,13 +1592,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
// uv1 = Wrap(uv1); // uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3); WrapLOD_SSE(xmm2, xmm3);
} }
else else
{ {
// uv0 = Wrap(uv0); // uv0 = Wrap(uv0);
WrapLOD(xmm2); WrapLOD_SSE(xmm2);
} }
// xmm2 = uv0 // xmm2 = uv0
@ -1665,7 +1665,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 1); ReadTexel_SSE(4, 1);
// xmm6 = c00 // xmm6 = c00
// xmm4 = c01 // xmm4 = c01
@ -1758,7 +1758,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 1); ReadTexel_SSE(1, 1);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -1781,7 +1781,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
pop(ebp); pop(ebp);
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv)
{ {
// xmm5 = minuv // xmm5 = minuv
// xmm6 = maxuv // xmm6 = maxuv
@ -1844,7 +1844,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
} }
} }
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1)
{ {
// xmm5 = minuv // xmm5 = minuv
// xmm6 = maxuv // xmm6 = maxuv
@ -1950,7 +1950,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
} }
} }
void GSDrawScanlineCodeGenerator::AlphaTFX() void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -2098,7 +2098,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
} }
} }
void GSDrawScanlineCodeGenerator::ReadMask() void GSDrawScanlineCodeGenerator::ReadMask_SSE()
{ {
if(m_sel.fwrite) if(m_sel.fwrite)
{ {
@ -2111,7 +2111,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
} }
} }
void GSDrawScanlineCodeGenerator::TestAlpha() void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
{ {
switch(m_sel.atst) switch(m_sel.atst)
{ {
@ -2186,7 +2186,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
} }
} }
void GSDrawScanlineCodeGenerator::ColorTFX() void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2262,7 +2262,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
} }
} }
void GSDrawScanlineCodeGenerator::Fog() void GSDrawScanlineCodeGenerator::Fog_SSE()
{ {
if(!m_sel.fwrite || !m_sel.fge) if(!m_sel.fwrite || !m_sel.fge)
{ {
@ -2283,7 +2283,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(xmm6, xmm1, xmm0); mix16(xmm6, xmm1, xmm0);
} }
void GSDrawScanlineCodeGenerator::ReadFrame() void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
{ {
if(!m_sel.fb) if(!m_sel.fb)
{ {
@ -2301,10 +2301,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return; return;
} }
ReadPixel(xmm2, ebx); ReadPixel_SSE(xmm2, ebx);
} }
void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
{ {
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{ {
@ -2347,7 +2347,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue(); alltrue();
} }
void GSDrawScanlineCodeGenerator::WriteMask() void GSDrawScanlineCodeGenerator::WriteMask_SSE()
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -2394,7 +2394,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx); not(edx);
} }
void GSDrawScanlineCodeGenerator::WriteZBuf() void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
{ {
if(!m_sel.zwrite) if(!m_sel.zwrite)
{ {
@ -2414,10 +2414,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
} }
void GSDrawScanlineCodeGenerator::AlphaBlend() void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2654,7 +2654,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
} }
} }
void GSDrawScanlineCodeGenerator::WriteFrame() void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
{ {
if(!m_sel.fwrite) if(!m_sel.fwrite)
{ {
@ -2739,16 +2739,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); WritePixel_SSE(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
} }
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg32& addr)
{ {
movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]);
movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
} }
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{ {
if(m_sel.notest) if(m_sel.notest)
{ {
@ -2759,10 +2759,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
} }
else else
{ {
WritePixel(src, addr, 0, psm); WritePixel_SSE(src, addr, 0, psm);
WritePixel(src, addr, 1, psm); WritePixel_SSE(src, addr, 1, psm);
WritePixel(src, addr, 2, psm); WritePixel_SSE(src, addr, 2, psm);
WritePixel(src, addr, 3, psm); WritePixel_SSE(src, addr, 3, psm);
} }
} }
else else
@ -2791,22 +2791,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
test(mask, 0x03); test(mask, 0x03);
je("@f"); je("@f");
WritePixel(src, addr, 0, psm); WritePixel_SSE(src, addr, 0, psm);
L("@@"); L("@@");
test(mask, 0x0c); test(mask, 0x0c);
je("@f"); je("@f");
WritePixel(src, addr, 1, psm); WritePixel_SSE(src, addr, 1, psm);
L("@@"); L("@@");
test(mask, 0x30); test(mask, 0x30);
je("@f"); je("@f");
WritePixel(src, addr, 2, psm); WritePixel_SSE(src, addr, 2, psm);
L("@@"); L("@@");
test(mask, 0xc0); test(mask, 0xc0);
je("@f"); je("@f");
WritePixel(src, addr, 3, psm); WritePixel_SSE(src, addr, 3, psm);
L("@@"); L("@@");
} }
} }
@ -2814,7 +2814,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
static const int s_offsets[4] = {0, 2, 8, 10}; static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{ {
Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
@ -2854,7 +2854,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
{ {
// in // in
// xmm5 = addr00 // xmm5 = addr00
@ -2896,7 +2896,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int i = 0; i < pixels; i++) for(int i = 0; i < pixels; i++)
{ {
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
} }
} }
@ -2916,17 +2916,17 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[0]]); mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0); ReadTexel_SSE(xmm6, xmm5, 0);
psrldq(xmm5, 4); psrldq(xmm5, 4);
ReadTexel(xmm4, xmm2, 0); ReadTexel_SSE(xmm4, xmm2, 0);
psrldq(xmm2, 4); psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[1]]); mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0); ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4); psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0); ReadTexel_SSE(xmm7, xmm2, 0);
psrldq(xmm2, 4); psrldq(xmm2, 4);
punpckldq(xmm6, xmm1); punpckldq(xmm6, xmm1);
@ -2935,16 +2935,16 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[2]]); mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0); ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4); psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0); ReadTexel_SSE(xmm7, xmm2, 0);
psrldq(xmm2, 4); psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[3]]); mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm5, xmm5, 0); ReadTexel_SSE(xmm5, xmm5, 0);
ReadTexel(xmm2, xmm2, 0); ReadTexel_SSE(xmm2, xmm2, 0);
punpckldq(xmm1, xmm5); punpckldq(xmm1, xmm5);
punpckldq(xmm7, xmm2); punpckldq(xmm7, xmm2);
@ -2955,17 +2955,17 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[0]]); mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm0, 0); ReadTexel_SSE(xmm1, xmm0, 0);
psrldq(xmm0, 4); psrldq(xmm0, 4);
ReadTexel(xmm5, xmm3, 0); ReadTexel_SSE(xmm5, xmm3, 0);
psrldq(xmm3, 4); psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[1]]); mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0); ReadTexel_SSE(xmm2, xmm0, 0);
psrldq(xmm0, 4); psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0); ReadTexel_SSE(xmm7, xmm3, 0);
psrldq(xmm3, 4); psrldq(xmm3, 4);
punpckldq(xmm1, xmm2); punpckldq(xmm1, xmm2);
@ -2974,16 +2974,16 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[2]]); mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0); ReadTexel_SSE(xmm2, xmm0, 0);
psrldq(xmm0, 4); psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0); ReadTexel_SSE(xmm7, xmm3, 0);
psrldq(xmm3, 4); psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[3]]); mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm0, xmm0, 0); ReadTexel_SSE(xmm0, xmm0, 0);
ReadTexel(xmm3, xmm3, 0); ReadTexel_SSE(xmm3, xmm3, 0);
punpckldq(xmm2, xmm0); punpckldq(xmm2, xmm0);
punpckldq(xmm7, xmm3); punpckldq(xmm7, xmm3);
@ -2998,13 +2998,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[0]]); mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0); ReadTexel_SSE(xmm6, xmm5, 0);
psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation) psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
mov(ebx, ptr[&lod_i->u32[1]]); mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0); ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4); psrldq(xmm5, 4);
punpckldq(xmm6, xmm1); punpckldq(xmm6, xmm1);
@ -3012,13 +3012,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[2]]); mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0); ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4); psrldq(xmm5, 4);
mov(ebx, ptr[&lod_i->u32[3]]); mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm4, xmm5, 0); ReadTexel_SSE(xmm4, xmm5, 0);
// psrldq(xmm5, 4); // psrldq(xmm5, 4);
punpckldq(xmm1, xmm4); punpckldq(xmm1, xmm4);
@ -3044,7 +3044,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
{ {
for(int j = 0; j < 4; j++) for(int j = 0; j < 4; j++)
{ {
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
} }
} }
@ -3058,15 +3058,15 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
const Xmm& temp1 = Xmm(t[i * 2 + 0]); const Xmm& temp1 = Xmm(t[i * 2 + 0]);
const Xmm& temp2 = Xmm(t[i * 2 + 1]); const Xmm& temp2 = Xmm(t[i * 2 + 1]);
ReadTexel(dst, addr, 0); ReadTexel_SSE(dst, addr, 0);
psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation)
ReadTexel(temp1, addr, 0); ReadTexel_SSE(temp1, addr, 0);
psrldq(addr, 4); psrldq(addr, 4);
punpckldq(dst, temp1); punpckldq(dst, temp1);
ReadTexel(temp1, addr, 0); ReadTexel_SSE(temp1, addr, 0);
psrldq(addr, 4); psrldq(addr, 4);
ReadTexel(temp2, addr, 0); ReadTexel_SSE(temp2, addr, 0);
// psrldq(addr, 4); // psrldq(addr, 4);
punpckldq(temp1, temp2); punpckldq(temp1, temp2);
@ -3077,7 +3077,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
{ {
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];