gsdx sw JIT: dynamically select between AVX1 and SSE code path (scanline)

This commit is contained in:
Gregory Hainaut 2016-11-19 14:47:40 +01:00
parent 6b78b8f9ce
commit 574a2c774e
5 changed files with 441 additions and 428 deletions

View File

@ -22,6 +22,17 @@
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#if _M_SSE >= 0x501
#else
// Runtime dispatcher for the scanline JIT (this definition sits in the #else branch of
// `#if _M_SSE >= 0x501`, so it is only compiled when _M_SSE < 0x501).
// Runs Generate_AVX() when g_cpu reports tAVX, otherwise Generate_SSE().
void GSDrawScanlineCodeGenerator::Generate()
{
	const bool host_has_avx = g_cpu.has(util::Cpu::tAVX);

	if(!host_has_avx)
	{
		Generate_SSE();
		return;
	}

	Generate_AVX();
}
#endif
#if _M_SSE >= 0x501
alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] =
@ -183,194 +194,179 @@ void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
// Emits code that multiplies the packed 16-bit lanes of `a` by `f`, leaving the result in `a`.
//  - shift == 0: a single (v)pmulhrsw; the non-VEX form is only used when m_cpu reports SSSE3.
//  - otherwise:  `a` is shifted left by (shift + 1) bits, then (v)pmulhw keeps the high word of each product.
// NOTE(review): this span is a rendered commit diff whose +/- markers were stripped, so the old
// compile-time selection (`#if _M_SSE >= 0x500`) and the new runtime selection
// (g_cpu.has(tAVX)) appear interleaved below; confirm the final statement order against the repository.
// NOTE(review): the SSSE3 test reads m_cpu while the AVX test reads g_cpu; presumably both
// describe the host CPU, but verify they cannot disagree.
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
{
#if _M_SSE >= 0x500
if(shift == 0)
if(g_cpu.has(util::Cpu::tAVX))
{
vpmulhrsw(a, f);
if(shift == 0)
{
vpmulhrsw(a, f);
}
else
{
vpsllw(a, shift + 1);
vpmulhw(a, f);
}
}
else
{
vpsllw(a, shift + 1);
vpmulhw(a, f);
if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
{
pmulhrsw(a, f);
}
else
{
psllw(a, shift + 1);
pmulhw(a, f);
}
}
#else
if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
{
pmulhrsw(a, f);
}
else
{
psllw(a, shift + 1);
pmulhw(a, f);
}
#endif
}
// Emits a 16-bit-lane linear interpolation: a = b + modulate16(a - b, f, shift).
// `b` and `f` are only read; `a` is overwritten with the result.
// NOTE(review): rendered commit diff with +/- markers stripped; the `#if _M_SSE` group and the
// g_cpu.has(tAVX) group are the two versions of the same body, not code that runs back to back.
// Confirm against the repository.
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift)
{
#if _M_SSE >= 0x500
vpsubw(a, b);
modulate16(a, f, shift);
vpaddw(a, b);
#else
psubw(a, b);
modulate16(a, f, shift);
paddw(a, b);
#endif
if(g_cpu.has(util::Cpu::tAVX))
{
vpsubw(a, b);
modulate16(a, f, shift);
vpaddw(a, b);
}
else
{
psubw(a, b);
modulate16(a, f, shift);
paddw(a, b);
}
}
// Emits a 16-bit-lane interpolation with a fixed 4-bit shift:
//   a = b + (((a - b) * f) >> 4)
// using (v)pmullw (low word of the product) and (v)psraw (arithmetic right shift).
// NOTE(review): rendered commit diff with +/- markers stripped; the `#if _M_SSE` group and the
// g_cpu.has(tAVX) group are the two versions of the same body. Confirm against the repository.
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
{
#if _M_SSE >= 0x500
vpsubw(a, b);
vpmullw(a, f);
vpsraw(a, 4);
vpaddw(a, b);
#else
psubw(a, b);
pmullw(a, f);
psraw(a, 4);
paddw(a, b);
#endif
if(g_cpu.has(util::Cpu::tAVX))
{
vpsubw(a, b);
vpmullw(a, f);
vpsraw(a, 4);
vpaddw(a, b);
}
else
{
psubw(a, b);
pmullw(a, f);
psraw(a, 4);
paddw(a, b);
}
}
// Emits code that merges two registers word by word: with the 0xaa blend mask the odd
// 16-bit words of `a` are replaced by those of `b`, the even words of `a` are kept.
//  - AVX / SSE4.1: one (v)pblendw.
//  - fallback:     temp = 0x0000ffff per dword (pcmpeqd + psrld 16), a &= temp,
//                  temp = ~temp & b, a |= temp; `temp` is clobbered on this path only.
// NOTE(review): rendered commit diff with +/- markers stripped; old (`#if _M_SSE`) and new
// (g_cpu.has(tAVX)) lines are interleaved below. Confirm against the repository.
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{
#if _M_SSE >= 0x500
vpblendw(a, b, 0xaa);
#else
if(g_cpu.has(util::Cpu::tSSE41))
if(g_cpu.has(util::Cpu::tAVX))
{
pblendw(a, b, 0xaa);
vpblendw(a, b, 0xaa);
}
else
{
pcmpeqd(temp, temp);
psrld(temp, 16);
pand(a, temp);
pandn(temp, b);
por(a, temp);
if(g_cpu.has(util::Cpu::tSSE41))
{
pblendw(a, b, 0xaa);
}
else
{
pcmpeqd(temp, temp);
psrld(temp, 16);
pand(a, temp);
pandn(temp, b);
por(a, temp);
}
}
#endif
}
// Emits code that clamps the 16-bit lanes of `a` to the unsigned byte range:
// (v)packuswb saturates the words to bytes, which are then widened back to words with
// zero extension, via (v)pmovzxbw on AVX / SSE4.1 or via pxor(temp) + punpcklbw otherwise.
// `temp` is only written on the non-SSE4.1 fallback path.
// NOTE(review): rendered commit diff with +/- markers stripped; old (`#if _M_SSE`) and new
// (g_cpu.has(tAVX)) lines are interleaved below. Confirm against the repository.
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
{
#if _M_SSE >= 0x500
vpackuswb(a, a);
vpmovzxbw(a, a);
#else
if(g_cpu.has(util::Cpu::tSSE41))
if(g_cpu.has(util::Cpu::tAVX))
{
packuswb(a, a);
pmovzxbw(a, a);
vpackuswb(a, a);
vpmovzxbw(a, a);
}
else
{
packuswb(a, a);
pxor(temp, temp);
punpcklbw(a, temp);
if(g_cpu.has(util::Cpu::tSSE41))
{
packuswb(a, a);
pmovzxbw(a, a);
}
else
{
packuswb(a, a);
pxor(temp, temp);
punpcklbw(a, temp);
}
}
#endif
}
// Emits an early-out: (v)pmovmskb collects the top bit of each byte of xmm7 into eax, and
// when all 16 bits are set (eax == 0xffff) the generated code jumps to the "step" label.
// eax is clobbered. xmm7 is annotated as "test" in the x86 Generate comments of this commit.
// NOTE(review): rendered commit diff with +/- markers stripped; the `#if _M_SSE` group and the
// g_cpu.has(tAVX) group are the two versions of the same body. Confirm against the repository.
void GSDrawScanlineCodeGenerator::alltrue()
{
#if _M_SSE >= 0x500
vpmovmskb(eax, xmm7);
cmp(eax, 0xffff);
je("step", T_NEAR);
#else
pmovmskb(eax, xmm7);
cmp(eax, 0xffff);
je("step", T_NEAR);
#endif
if(g_cpu.has(util::Cpu::tAVX))
{
vpmovmskb(eax, xmm7);
cmp(eax, 0xffff);
je("step", T_NEAR);
}
else
{
pmovmskb(eax, xmm7);
cmp(eax, 0xffff);
je("step", T_NEAR);
}
}
// Emits a bitwise select into `a`: a = (b & mask) | (a & ~mask).
// Both `b` and `mask` are overwritten as scratch (b &= mask, mask = ~mask & a).
// The non-VEX path needs an extra movdqa because por has no separate destination operand.
// NOTE(review): rendered commit diff with +/- markers stripped; the `#if _M_SSE` group and the
// g_cpu.has(tAVX) group are the two versions of the same body. Confirm against the repository.
void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
{
#if _M_SSE >= 0x500
vpand(b, mask);
vpandn(mask, a);
vpor(a, b, mask);
#else
pand(b, mask);
pandn(mask, a);
por(b, mask);
movdqa(a, b);
#endif
if(g_cpu.has(util::Cpu::tAVX))
{
vpand(b, mask);
vpandn(mask, a);
vpor(a, b, mask);
}
else
{
pand(b, mask);
pandn(mask, a);
por(b, mask);
movdqa(a, b);
}
}
// Same bitwise select as blend(), but the result lands in `b`: b = (b & mask) | (a & ~mask).
// `mask` is overwritten as scratch (mask = ~mask & a); `a` is only read.
// NOTE(review): rendered commit diff with +/- markers stripped; the `#if _M_SSE` group and the
// g_cpu.has(tAVX) group are the two versions of the same body. Confirm against the repository.
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
#if _M_SSE >= 0x500
vpand(b, mask);
vpandn(mask, a);
vpor(b, mask);
#else
pand(b, mask);
pandn(mask, a);
por(b, mask);
#endif
if(g_cpu.has(util::Cpu::tAVX))
{
vpand(b, mask);
vpandn(mask, a);
vpor(b, mask);
}
else
{
pand(b, mask);
pandn(mask, a);
por(b, mask);
}
}
// Emits a blend of `b` into `a` with xmm0 as the selector.
//  - AVX:      vpblendvb(a, a, b, xmm0), selector passed explicitly.
//  - SSE4.1:   two-operand pblendvb(a, b); no mask operand is passed here
//              (presumably the instruction's implicit xmm0; confirm against the ISA reference).
//  - fallback: blend(a, b, xmm0), which is a bitwise select and overwrites `b` and xmm0.
// NOTE(review): rendered commit diff with +/- markers stripped; old (`#if _M_SSE`) and new
// (runtime g_cpu.has) lines are interleaved below. Confirm against the repository.
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{
#if _M_SSE >= 0x500
vpblendvb(a, a, b, xmm0);
#else
if(g_cpu.has(util::Cpu::tSSE41))
if(g_cpu.has(util::Cpu::tAVX))
vpblendvb(a, a, b, xmm0);
else if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(a, b);
else
blend(a, b, xmm0);
#endif
}
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{
#if _M_SSE >= 0x500
vpblendvb(b, a, b, xmm0);
#else
if(g_cpu.has(util::Cpu::tSSE41))
if(g_cpu.has(util::Cpu::tAVX))
{
vpblendvb(b, a, b, xmm0);
}
else if(g_cpu.has(util::Cpu::tSSE41))
{
pblendvb(a, b);
movdqa(b, a);
@ -379,8 +375,6 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{
blendr(b, a, xmm0);
}
#endif
}
void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
@ -388,31 +382,34 @@ void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const
// l = src & 0xFF; (1 left shift + 1 right shift)
// h = (src >> 8) & 0xFF; (1 right shift)
#if _M_SSE >= 0x500
if (src == h) {
vpsllw(l, src, 8);
vpsrlw(h, 8);
} else if (src == l) {
vpsrlw(h, src, 8);
vpsllw(l, 8);
} else {
vpsllw(l, src, 8);
vpsrlw(h, src, 8);
if(g_cpu.has(util::Cpu::tAVX))
{
if (src == h) {
vpsllw(l, src, 8);
vpsrlw(h, 8);
} else if (src == l) {
vpsrlw(h, src, 8);
vpsllw(l, 8);
} else {
vpsllw(l, src, 8);
vpsrlw(h, src, 8);
}
vpsrlw(l, 8);
}
vpsrlw(l, 8);
#else
if (src == h) {
movdqa(l, src);
} else if (src == l) {
movdqa(h, src);
} else {
movdqa(l, src);
movdqa(h, src);
else
{
if (src == h) {
movdqa(l, src);
} else if (src == l) {
movdqa(h, src);
} else {
movdqa(l, src);
movdqa(h, src);
}
psllw(l, 8);
psrlw(l, 8);
psrlw(h, 8);
}
psllw(l, 8);
psrlw(l, 8);
psrlw(h, 8);
#endif
}
#endif

View File

@ -27,6 +27,12 @@
using namespace Xbyak;
// RegLong names the Xbyak general-purpose register type used for the `addr` parameters of
// the ReadPixel/WritePixel declarations: Reg64 when _M_AMD64 or _WIN64 is defined,
// Reg32 otherwise. It lets one declaration serve both builds.
#if defined(_M_AMD64) || defined(_WIN64)
#define RegLong Reg64
#else
#define RegLong Reg32
#endif
class GSDrawScanlineCodeGenerator : public GSCodeGenerator
{
void operator = (const GSDrawScanlineCodeGenerator&);
@ -58,17 +64,9 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void WriteZBuf();
void AlphaBlend();
void WriteFrame();
#if defined(_M_AMD64) || defined(_WIN64)
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
#else
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
#endif
void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm);
void ReadTexel(int pixels, int mip_offset = 0);
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
@ -85,39 +83,59 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
#else
void Init();
void Step();
void TestZ(const Xmm& temp1, const Xmm& temp2);
void SampleTexture();
void Wrap(const Xmm& uv0);
void Wrap(const Xmm& uv0, const Xmm& uv1);
void SampleTextureLOD();
void WrapLOD(const Xmm& uv0);
void WrapLOD(const Xmm& uv0, const Xmm& uv1);
void AlphaTFX();
void ReadMask();
void TestAlpha();
void ColorTFX();
void Fog();
void ReadFrame();
void TestDestAlpha();
void WriteMask();
void WriteZBuf();
void AlphaBlend();
void WriteFrame();
void Generate_SSE();
void Init_SSE();
void Step_SSE();
void TestZ_SSE(const Xmm& temp1, const Xmm& temp2);
void SampleTexture_SSE();
void Wrap_SSE(const Xmm& uv0);
void Wrap_SSE(const Xmm& uv0, const Xmm& uv1);
void SampleTextureLOD_SSE();
void WrapLOD_SSE(const Xmm& uv0);
void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1);
void AlphaTFX_SSE();
void ReadMask_SSE();
void TestAlpha_SSE();
void ColorTFX_SSE();
void Fog_SSE();
void ReadFrame_SSE();
void TestDestAlpha_SSE();
void WriteMask_SSE();
void WriteZBuf_SSE();
void AlphaBlend_SSE();
void WriteFrame_SSE();
void ReadPixel_SSE(const Xmm& dst, const RegLong& addr);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_SSE(int pixels, int mip_offset = 0);
void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i);
#if defined(_M_AMD64) || defined(_WIN64)
void ReadPixel(const Xmm& dst, const Reg64& addr);
void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm);
#else
void ReadPixel(const Xmm& dst, const Reg32& addr);
void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm);
#endif
void ReadTexel(int pixels, int mip_offset = 0);
void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i);
void Generate_AVX();
void Init_AVX();
void Step_AVX();
void TestZ_AVX(const Xmm& temp1, const Xmm& temp2);
void SampleTexture_AVX();
void Wrap_AVX(const Xmm& uv0);
void Wrap_AVX(const Xmm& uv0, const Xmm& uv1);
void SampleTextureLOD_AVX();
void WrapLOD_AVX(const Xmm& uv0);
void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1);
void AlphaTFX_AVX();
void ReadMask_AVX();
void TestAlpha_AVX();
void ColorTFX_AVX();
void Fog_AVX();
void ReadFrame_AVX();
void TestDestAlpha_AVX();
void WriteMask_AVX();
void WriteZBuf_AVX();
void AlphaBlend_AVX();
void WriteFrame_AVX();
void ReadPixel_AVX(const Xmm& dst, const RegLong& addr);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_AVX(int pixels, int mip_offset = 0);
void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i);
void modulate16(const Xmm& a, const Operand& f, int shift);
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift);

View File

@ -45,7 +45,7 @@
#define _zm xmm5
#define _fd xmm6
#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64))
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
#ifdef _WIN64
#else
@ -59,7 +59,7 @@ static const int _rz_zd = -8 * 10;
static const int _rz_cov = -8 * 12;
#endif
void GSDrawScanlineCodeGenerator::Generate()
void GSDrawScanlineCodeGenerator::Generate_AVX()
{
bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
bool need_clut = need_tex && m_sel.tlu;
@ -100,7 +100,7 @@ void GSDrawScanlineCodeGenerator::Generate()
if(need_tex)
mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]);
Init();
Init_AVX();
// a0 = steps
// t1 = fza_base
@ -126,30 +126,30 @@ void GSDrawScanlineCodeGenerator::Generate()
L("loop");
TestZ(xmm5, xmm6);
TestZ_AVX(xmm5, xmm6);
// ebp = za
if(m_sel.mmin)
{
SampleTextureLOD();
SampleTextureLOD_AVX();
}
else
{
SampleTexture();
SampleTexture_AVX();
}
// ebp = za
// xmm2 = rb
// xmm3 = ga
AlphaTFX();
AlphaTFX_AVX();
// ebp = za
// xmm2 = rb
// xmm3 = ga
ReadMask();
ReadMask_AVX();
// ebp = za
// xmm2 = rb
@ -157,7 +157,7 @@ L("loop");
// xmm4 = fm
// xmm5 = zm
TestAlpha();
TestAlpha_AVX();
// ebp = za
// xmm2 = rb
@ -165,7 +165,7 @@ L("loop");
// xmm4 = fm
// xmm5 = zm
ColorTFX();
ColorTFX_AVX();
// ebp = za
// xmm2 = rb
@ -173,7 +173,7 @@ L("loop");
// xmm4 = fm
// xmm5 = zm
Fog();
Fog_AVX();
// ebp = za
// xmm2 = rb
@ -181,7 +181,7 @@ L("loop");
// xmm4 = fm
// xmm5 = zm
ReadFrame();
ReadFrame_AVX();
// ebx = fa
// ebp = za
@ -191,7 +191,7 @@ L("loop");
// xmm5 = zm
// xmm6 = fd
TestDestAlpha();
TestDestAlpha_AVX();
// ebx = fa
// ebp = za
@ -201,7 +201,7 @@ L("loop");
// xmm5 = zm
// xmm6 = fd
WriteMask();
WriteMask_AVX();
// ebx = fa
// edx = fzm
@ -212,7 +212,7 @@ L("loop");
// xmm5 = zm
// xmm6 = fd
WriteZBuf();
WriteZBuf_AVX();
// ebx = fa
// edx = fzm
@ -221,7 +221,7 @@ L("loop");
// xmm4 = fm
// xmm6 = fd
AlphaBlend();
AlphaBlend_AVX();
// ebx = fa
// edx = fzm
@ -230,7 +230,7 @@ L("loop");
// xmm4 = fm
// xmm6 = fd
WriteFrame();
WriteFrame_AVX();
L("step");
@ -242,7 +242,7 @@ L("step");
jle("exit", T_NEAR);
Step();
Step_AVX();
jmp("loop", T_NEAR);
}
@ -277,7 +277,7 @@ L("exit");
ret();
}
void GSDrawScanlineCodeGenerator::Init()
void GSDrawScanlineCodeGenerator::Init_AVX()
{
if(!m_sel.notest)
{
@ -480,7 +480,7 @@ void GSDrawScanlineCodeGenerator::Init()
}
}
void GSDrawScanlineCodeGenerator::Step()
void GSDrawScanlineCodeGenerator::Step_AVX()
{
// steps -= 4;
@ -603,7 +603,7 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
{
if(!m_sel.zb)
{
@ -661,7 +661,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest)
{
ReadPixel(xmm1, rbp);
ReadPixel_AVX(xmm1, rbp);
if(m_sel.zwrite && m_sel.zpsm < 2)
{
@ -715,7 +715,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
}
}
void GSDrawScanlineCodeGenerator::SampleTexture()
void GSDrawScanlineCodeGenerator::SampleTexture_AVX()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
@ -786,13 +786,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
Wrap(xmm4, xmm5);
Wrap_AVX(xmm4, xmm5);
}
else
{
// uv0 = Wrap(uv0);
Wrap(xmm4);
Wrap_AVX(xmm4);
}
// xmm4 = uv0
@ -854,7 +854,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
ReadTexel_AVX(4, 0);
// xmm0 = c10
// xmm1 = c11
@ -944,7 +944,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff();
@ -958,7 +958,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// xmm3 = ga
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
{
// xmm0, xmm1, xmm2, xmm3 = free
@ -1019,7 +1019,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
}
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
{
// xmm0, xmm1, xmm2, xmm3 = free
@ -1111,19 +1111,19 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
}
}
// Empty body: no code is emitted for LOD texture sampling in this 64-bit variant
// (the file is guarded by _M_AMD64 / _WIN64). Generate_AVX() in this file still calls it
// when m_sel.mmin is set.
// NOTE(review): the two signature lines are the pre-rename and post-rename names from a
// commit diff whose +/- markers were stripped; only the _AVX name should remain.
void GSDrawScanlineCodeGenerator::SampleTextureLOD()
void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
{
}
// Empty body: the single-coordinate LOD wrap emits no code in this 64-bit variant.
// NOTE(review): the two signature lines are the pre-rename and post-rename names from a
// commit diff whose +/- markers were stripped; only the _AVX name should remain.
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv)
{
}
// Empty body: the two-coordinate LOD wrap emits no code in this 64-bit variant.
// NOTE(review): the two signature lines are the pre-rename and post-rename names from a
// commit diff whose +/- markers were stripped; only the _AVX name should remain.
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1)
{
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
{
if(!m_sel.fb)
{
@ -1261,7 +1261,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
}
}
void GSDrawScanlineCodeGenerator::ReadMask()
void GSDrawScanlineCodeGenerator::ReadMask_AVX()
{
if(m_sel.fwrite)
{
@ -1274,7 +1274,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
}
}
void GSDrawScanlineCodeGenerator::TestAlpha()
void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
{
switch(m_sel.atst)
{
@ -1345,7 +1345,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
}
}
void GSDrawScanlineCodeGenerator::ColorTFX()
void GSDrawScanlineCodeGenerator::ColorTFX_AVX()
{
if(!m_sel.fwrite)
{
@ -1410,7 +1410,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
}
}
void GSDrawScanlineCodeGenerator::Fog()
void GSDrawScanlineCodeGenerator::Fog_AVX()
{
if(!m_sel.fwrite || !m_sel.fge)
{
@ -1431,7 +1431,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(_ga, xmm6, _f);
}
void GSDrawScanlineCodeGenerator::ReadFrame()
void GSDrawScanlineCodeGenerator::ReadFrame_AVX()
{
if(!m_sel.fb)
{
@ -1449,10 +1449,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return;
}
ReadPixel(_fd, rbx);
ReadPixel_AVX(_fd, rbx);
}
void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX()
{
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{
@ -1496,7 +1496,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue();
}
void GSDrawScanlineCodeGenerator::WriteMask()
void GSDrawScanlineCodeGenerator::WriteMask_AVX()
{
if(m_sel.notest)
{
@ -1542,7 +1542,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx);
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
{
if(!m_sel.zwrite)
{
@ -1571,10 +1571,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1);
WritePixel_AVX(xmm1, rbp, dh, fast, m_sel.zpsm, 1);
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
{
if(!m_sel.fwrite)
{
@ -1798,7 +1798,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
}
}
void GSDrawScanlineCodeGenerator::WriteFrame()
void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
{
if(!m_sel.fwrite)
{
@ -1889,16 +1889,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0);
WritePixel_AVX(xmm2, rbx, dl, fast, m_sel.fpsm, 0);
}
// Emits two 8-byte loads that fill `dst`: vmovq reads the qword at
// [_m_local__gd__vm + addr * 2] into the low half, vmovhps reads the qword 16 bytes
// further (+ 8 * 2) into the high half.
// NOTE(review): the two signature lines are the pre-rename and post-rename names from a
// commit diff whose +/- markers were stripped; only the _AVX name should remain.
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr)
void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg64& addr)
{
vmovq(dst, qword[_m_local__gd__vm + addr * 2]);
vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]);
}
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(m_sel.notest)
{
@ -1909,10 +1909,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
WritePixel_AVX(src, addr, 0, psm);
WritePixel_AVX(src, addr, 1, psm);
WritePixel_AVX(src, addr, 2, psm);
WritePixel_AVX(src, addr, 3, psm);
}
}
else
@ -1943,22 +1943,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
WritePixel_AVX(src, addr, 0, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
WritePixel_AVX(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
WritePixel_AVX(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
WritePixel_AVX(src, addr, 3, psm);
L("@@");
}
}
@ -1966,7 +1966,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm)
void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, uint8 i, int psm)
{
Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2];
@ -1990,7 +1990,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{
const int in[] = {0, 1, 2, 3};
const int out[] = {4, 5, 0, 1};
@ -1999,12 +1999,12 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
{
for(int j = 0; j < 4; j++)
{
ReadTexel(Xmm(out[i]), Xmm(in[i]), j);
ReadTexel_AVX(Xmm(out[i]), Xmm(in[i]), j);
}
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i)
{
const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4];
@ -2026,7 +2026,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin
// And palette need zero masking.
// It is not possible to use same source/destination so linear interpolation must be updated
#if 0
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{
const int in[] = {0, 1, 2, 3};
const int out[] = {4, 5, 0, 1};

View File

@ -23,21 +23,20 @@
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
static const int _args = 16;
static const int _top = _args + 4;
static const int _v = _args + 8;
void GSDrawScanlineCodeGenerator::Generate()
void GSDrawScanlineCodeGenerator::Generate_AVX()
{
//ret(8);
push(ebx);
push(esi);
push(edi);
push(ebp);
Init();
Init_AVX();
if(!m_sel.edge)
{
@ -59,7 +58,7 @@ L("loop");
bool tme = m_sel.tfx != TFX_NONE;
TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
TestZ_AVX(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
// ecx = steps
// esi = fzbr
@ -75,11 +74,11 @@ L("loop");
if(m_sel.mmin)
{
SampleTextureLOD();
SampleTextureLOD_AVX();
}
else
{
SampleTexture();
SampleTexture_AVX();
}
// ecx = steps
@ -93,7 +92,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
AlphaTFX();
AlphaTFX_AVX();
// ecx = steps
// esi = fzbr
@ -104,7 +103,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
ReadMask();
ReadMask_AVX();
// ecx = steps
// esi = fzbr
@ -117,7 +116,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
TestAlpha();
TestAlpha_AVX();
// ecx = steps
// esi = fzbr
@ -130,7 +129,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
ColorTFX();
ColorTFX_AVX();
// ecx = steps
// esi = fzbr
@ -142,7 +141,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
Fog();
Fog_AVX();
// ecx = steps
// esi = fzbr
@ -154,7 +153,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
ReadFrame();
ReadFrame_AVX();
// ecx = steps
// esi = fzbr
@ -167,7 +166,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
TestDestAlpha();
TestDestAlpha_AVX();
// ecx = steps
// esi = fzbr
@ -180,7 +179,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
WriteMask();
WriteMask_AVX();
// ebx = fa
// ecx = steps
@ -194,7 +193,7 @@ L("loop");
// xmm5 = rb
// xmm6 = ga
WriteZBuf();
WriteZBuf_AVX();
// ebx = fa
// ecx = steps
@ -208,7 +207,7 @@ L("loop");
// xmm5 = rb
// xmm6 = ga
AlphaBlend();
AlphaBlend_AVX();
// ebx = fa
// ecx = steps
@ -220,7 +219,7 @@ L("loop");
// xmm5 = rb
// xmm6 = ga
WriteFrame();
WriteFrame_AVX();
L("step");
@ -232,7 +231,7 @@ L("step");
jle("exit", T_NEAR);
Step();
Step_AVX();
jmp("loop", T_NEAR);
}
@ -249,7 +248,7 @@ L("exit");
ret(8);
}
void GSDrawScanlineCodeGenerator::Init()
void GSDrawScanlineCodeGenerator::Init_AVX()
{
if(!m_sel.notest)
{
@ -455,7 +454,7 @@ void GSDrawScanlineCodeGenerator::Init()
}
}
void GSDrawScanlineCodeGenerator::Step()
void GSDrawScanlineCodeGenerator::Step_AVX()
{
// steps -= 4;
@ -596,7 +595,7 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
{
if(!m_sel.zb)
{
@ -644,7 +643,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest)
{
ReadPixel(xmm1, ebp);
ReadPixel_AVX(xmm1, ebp);
if(m_sel.zwrite && m_sel.zpsm < 2)
{
@ -694,7 +693,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
}
}
void GSDrawScanlineCodeGenerator::SampleTexture()
void GSDrawScanlineCodeGenerator::SampleTexture_AVX()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
@ -775,13 +774,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
Wrap(xmm2, xmm3);
Wrap_AVX(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
Wrap(xmm2);
Wrap_AVX(xmm2);
}
// xmm2 = uv0
@ -843,7 +842,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
ReadTexel_AVX(4, 0);
// xmm6 = c00
// xmm4 = c01
@ -935,7 +934,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff();
@ -946,7 +945,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
}
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
{
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1007,7 +1006,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
}
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
{
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1099,7 +1098,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
}
}
void GSDrawScanlineCodeGenerator::SampleTextureLOD()
void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
@ -1360,13 +1359,13 @@ return;
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3);
WrapLOD_AVX(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2);
WrapLOD_AVX(xmm2);
}
// xmm2 = uv0
@ -1428,7 +1427,7 @@ return;
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
ReadTexel_AVX(4, 0);
// xmm6 = c00
// xmm4 = c01
@ -1520,7 +1519,7 @@ return;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
ReadTexel_AVX(1, 0);
// GSVector4i mask = GSVector4i::x00ff();
@ -1591,13 +1590,13 @@ return;
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3);
WrapLOD_AVX(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2);
WrapLOD_AVX(xmm2);
}
// xmm2 = uv0
@ -1659,7 +1658,7 @@ return;
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 1);
ReadTexel_AVX(4, 1);
// xmm6 = c00
// xmm4 = c01
@ -1751,7 +1750,7 @@ return;
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 1);
ReadTexel_AVX(1, 1);
// GSVector4i mask = GSVector4i::x00ff();
@ -1774,7 +1773,7 @@ return;
pop(ebp);
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv)
{
// xmm5 = minuv
// xmm6 = maxuv
@ -1835,7 +1834,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
}
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1)
{
// xmm5 = minuv
// xmm6 = maxuv
@ -1923,7 +1922,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
}
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
{
if(!m_sel.fb)
{
@ -2071,7 +2070,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
}
}
void GSDrawScanlineCodeGenerator::ReadMask()
void GSDrawScanlineCodeGenerator::ReadMask_AVX()
{
if(m_sel.fwrite)
{
@ -2084,7 +2083,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
}
}
void GSDrawScanlineCodeGenerator::TestAlpha()
void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
{
switch(m_sel.atst)
{
@ -2155,7 +2154,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
}
}
void GSDrawScanlineCodeGenerator::ColorTFX()
void GSDrawScanlineCodeGenerator::ColorTFX_AVX()
{
if(!m_sel.fwrite)
{
@ -2231,7 +2230,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
}
}
void GSDrawScanlineCodeGenerator::Fog()
void GSDrawScanlineCodeGenerator::Fog_AVX()
{
if(!m_sel.fwrite || !m_sel.fge)
{
@ -2252,7 +2251,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(xmm6, xmm1, xmm0);
}
void GSDrawScanlineCodeGenerator::ReadFrame()
void GSDrawScanlineCodeGenerator::ReadFrame_AVX()
{
if(!m_sel.fb)
{
@ -2270,10 +2269,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return;
}
ReadPixel(xmm2, ebx);
ReadPixel_AVX(xmm2, ebx);
}
void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX()
{
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{
@ -2317,7 +2316,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue();
}
void GSDrawScanlineCodeGenerator::WriteMask()
void GSDrawScanlineCodeGenerator::WriteMask_AVX()
{
if(m_sel.notest)
{
@ -2363,7 +2362,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx);
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
{
if(!m_sel.zwrite)
{
@ -2381,10 +2380,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
{
if(!m_sel.fwrite)
{
@ -2606,7 +2605,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
}
}
void GSDrawScanlineCodeGenerator::WriteFrame()
void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
{
if(!m_sel.fwrite)
{
@ -2686,16 +2685,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
WritePixel_AVX(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
}
// 32-bit variant: emits two 8-byte loads that fill `dst`. The base address m_local.gd->vm is
// baked into the instruction as an absolute displacement (cast to size_t at generation time);
// vmovq loads the low half from [addr * 2 + vm], vmovhps the high half from 16 bytes further.
// NOTE(review): the two signature lines are the pre-rename and post-rename names from a
// commit diff whose +/- markers were stripped; only the _AVX name should remain.
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg32& addr)
{
vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]);
vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
}
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(m_sel.notest)
{
@ -2706,10 +2705,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
WritePixel_AVX(src, addr, 0, psm);
WritePixel_AVX(src, addr, 1, psm);
WritePixel_AVX(src, addr, 2, psm);
WritePixel_AVX(src, addr, 3, psm);
}
}
else
@ -2740,22 +2739,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
WritePixel_AVX(src, addr, 0, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
WritePixel_AVX(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
WritePixel_AVX(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
WritePixel_AVX(src, addr, 3, psm);
L("@@");
}
}
@ -2763,7 +2762,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
static const int s_offsets[] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{
Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
@ -2788,7 +2787,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{
// in
// xmm5 = addr00
@ -2827,7 +2826,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int i = 0; i < pixels; i++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
@ -2846,19 +2845,18 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
}
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
const int t[] = {4, 1, 5, 2};
for(int i = 0; i < pixels; i++)
{
for(uint8 j = 0; j < 4; j++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i)
{
ASSERT(i < 4);

View File

@ -23,20 +23,20 @@
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
// Byte offsets used by the 32-bit SSE scanline generator in this file.
// NOTE(review): 16 matches the four 4-byte registers pushed at the start of
// Generate_SSE (ebx, esi, edi, ebp); _top and _v are presumably esp-relative
// locations of two stack-passed arguments (the epilogue ends with ret(8)).
// The code that reads these constants is not visible here - confirm before
// relying on this description.
static const int _args = 16;
static const int _top = _args + 4;
static const int _v = _args + 8;
void GSDrawScanlineCodeGenerator::Generate()
void GSDrawScanlineCodeGenerator::Generate_SSE()
{
push(ebx);
push(esi);
push(edi);
push(ebp);
Init();
Init_SSE();
if(!m_sel.edge)
{
@ -58,7 +58,7 @@ L("loop");
bool tme = m_sel.tfx != TFX_NONE;
TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
TestZ_SSE(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
// ecx = steps
// esi = fzbr
@ -74,11 +74,11 @@ L("loop");
if(m_sel.mmin)
{
SampleTextureLOD();
SampleTextureLOD_SSE();
}
else
{
SampleTexture();
SampleTexture_SSE();
}
// ecx = steps
@ -92,7 +92,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
AlphaTFX();
AlphaTFX_SSE();
// ecx = steps
// esi = fzbr
@ -103,7 +103,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
ReadMask();
ReadMask_SSE();
// ecx = steps
// esi = fzbr
@ -116,7 +116,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
TestAlpha();
TestAlpha_SSE();
// ecx = steps
// esi = fzbr
@ -129,7 +129,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
ColorTFX();
ColorTFX_SSE();
// ecx = steps
// esi = fzbr
@ -141,7 +141,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
Fog();
Fog_SSE();
// ecx = steps
// esi = fzbr
@ -153,7 +153,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
ReadFrame();
ReadFrame_SSE();
// ecx = steps
// esi = fzbr
@ -166,7 +166,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
TestDestAlpha();
TestDestAlpha_SSE();
// ecx = steps
// esi = fzbr
@ -179,7 +179,7 @@ L("loop");
// xmm6 = ga
// xmm7 = test
WriteMask();
WriteMask_SSE();
// ebx = fa
// ecx = steps
@ -193,7 +193,7 @@ L("loop");
// xmm5 = rb
// xmm6 = ga
WriteZBuf();
WriteZBuf_SSE();
// ebx = fa
// ecx = steps
@ -207,7 +207,7 @@ L("loop");
// xmm5 = rb
// xmm6 = ga
AlphaBlend();
AlphaBlend_SSE();
// ebx = fa
// ecx = steps
@ -219,7 +219,7 @@ L("loop");
// xmm5 = rb
// xmm6 = ga
WriteFrame();
WriteFrame_SSE();
L("step");
@ -231,7 +231,7 @@ L("step");
jle("exit", T_NEAR);
Step();
Step_SSE();
jmp("loop", T_NEAR);
}
@ -248,7 +248,7 @@ L("exit");
ret(8);
}
void GSDrawScanlineCodeGenerator::Init()
void GSDrawScanlineCodeGenerator::Init_SSE()
{
if(!m_sel.notest)
{
@ -457,7 +457,7 @@ void GSDrawScanlineCodeGenerator::Init()
}
}
void GSDrawScanlineCodeGenerator::Step()
void GSDrawScanlineCodeGenerator::Step_SSE()
{
// steps -= 4;
@ -600,7 +600,7 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
{
if(!m_sel.zb)
{
@ -648,7 +648,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
if(m_sel.ztest)
{
ReadPixel(xmm1, ebp);
ReadPixel_SSE(xmm1, ebp);
if(m_sel.zwrite && m_sel.zpsm < 2)
{
@ -698,7 +698,7 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
}
}
void GSDrawScanlineCodeGenerator::SampleTexture()
void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
@ -780,13 +780,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
Wrap(xmm2, xmm3);
Wrap_SSE(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
Wrap(xmm2);
Wrap_SSE(xmm2);
}
// xmm2 = uv0
@ -853,7 +853,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
ReadTexel_SSE(4, 0);
// xmm6 = c00
// xmm4 = c01
@ -946,7 +946,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
ReadTexel_SSE(1, 0);
// GSVector4i mask = GSVector4i::x00ff();
@ -957,7 +957,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
}
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
{
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1020,7 +1020,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
}
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
{
// xmm0, xmm1, xmm4, xmm5, xmm6 = free
@ -1131,7 +1131,7 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
}
}
void GSDrawScanlineCodeGenerator::SampleTextureLOD()
void GSDrawScanlineCodeGenerator::SampleTextureLOD_SSE()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
@ -1140,7 +1140,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
push(ebp);
mov(ebp, (size_t)m_local.gd->tex);
mov(ebp, (size_t)m_local.gd->tex);
if(m_sel.tlu)
{
@ -1354,13 +1354,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3);
WrapLOD_SSE(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2);
WrapLOD_SSE(xmm2);
}
// xmm2 = uv0
@ -1427,7 +1427,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
ReadTexel_SSE(4, 0);
// xmm6 = c00
// xmm4 = c01
@ -1520,7 +1520,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
ReadTexel_SSE(1, 0);
// GSVector4i mask = GSVector4i::x00ff();
@ -1592,13 +1592,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(xmm2, xmm3);
WrapLOD_SSE(xmm2, xmm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(xmm2);
WrapLOD_SSE(xmm2);
}
// xmm2 = uv0
@ -1665,7 +1665,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 1);
ReadTexel_SSE(4, 1);
// xmm6 = c00
// xmm4 = c01
@ -1758,7 +1758,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 1);
ReadTexel_SSE(1, 1);
// GSVector4i mask = GSVector4i::x00ff();
@ -1781,7 +1781,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
pop(ebp);
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv)
{
// xmm5 = minuv
// xmm6 = maxuv
@ -1844,7 +1844,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
}
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1)
{
// xmm5 = minuv
// xmm6 = maxuv
@ -1950,7 +1950,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
}
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
{
if(!m_sel.fb)
{
@ -2098,7 +2098,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
}
}
void GSDrawScanlineCodeGenerator::ReadMask()
void GSDrawScanlineCodeGenerator::ReadMask_SSE()
{
if(m_sel.fwrite)
{
@ -2111,7 +2111,7 @@ void GSDrawScanlineCodeGenerator::ReadMask()
}
}
void GSDrawScanlineCodeGenerator::TestAlpha()
void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
{
switch(m_sel.atst)
{
@ -2186,7 +2186,7 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
}
}
void GSDrawScanlineCodeGenerator::ColorTFX()
void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
{
if(!m_sel.fwrite)
{
@ -2262,7 +2262,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
}
}
void GSDrawScanlineCodeGenerator::Fog()
void GSDrawScanlineCodeGenerator::Fog_SSE()
{
if(!m_sel.fwrite || !m_sel.fge)
{
@ -2283,7 +2283,7 @@ void GSDrawScanlineCodeGenerator::Fog()
mix16(xmm6, xmm1, xmm0);
}
void GSDrawScanlineCodeGenerator::ReadFrame()
void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
{
if(!m_sel.fb)
{
@ -2301,10 +2301,10 @@ void GSDrawScanlineCodeGenerator::ReadFrame()
return;
}
ReadPixel(xmm2, ebx);
ReadPixel_SSE(xmm2, ebx);
}
void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
{
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
{
@ -2347,7 +2347,7 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
alltrue();
}
void GSDrawScanlineCodeGenerator::WriteMask()
void GSDrawScanlineCodeGenerator::WriteMask_SSE()
{
if(m_sel.notest)
{
@ -2394,7 +2394,7 @@ void GSDrawScanlineCodeGenerator::WriteMask()
not(edx);
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
{
if(!m_sel.zwrite)
{
@ -2414,10 +2414,10 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
{
if(!m_sel.fwrite)
{
@ -2654,7 +2654,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
}
}
void GSDrawScanlineCodeGenerator::WriteFrame()
void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
{
if(!m_sel.fwrite)
{
@ -2739,16 +2739,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
WritePixel_SSE(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
}
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
// Emits two loads that fill dst with 128 bits read from local memory (gd->vm).
//
// dst  - XMM register receiving the data
// addr - 32-bit register holding the offset; it is scaled by 2 in the address
//        expression (presumably an index in 16-bit units - TODO confirm)
//
// The two 64-bit halves are not adjacent in memory: the second one is read
// 8 * 2 = 16 bytes after the first.
void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg32& addr)
{
// low 64 bits of dst <- [vm + addr * 2]
movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]);
// high 64 bits of dst <- [vm + addr * 2 + 16]; the low half is left untouched
movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
}
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(m_sel.notest)
{
@ -2759,10 +2759,10 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
WritePixel_SSE(src, addr, 0, psm);
WritePixel_SSE(src, addr, 1, psm);
WritePixel_SSE(src, addr, 2, psm);
WritePixel_SSE(src, addr, 3, psm);
}
}
else
@ -2791,22 +2791,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
WritePixel_SSE(src, addr, 0, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
WritePixel_SSE(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
WritePixel_SSE(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
WritePixel_SSE(src, addr, 3, psm);
L("@@");
}
}
@ -2814,7 +2814,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{
Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
@ -2854,7 +2854,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
{
// in
// xmm5 = addr00
@ -2896,7 +2896,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int i = 0; i < pixels; i++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
@ -2916,17 +2916,17 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
ReadTexel_SSE(xmm6, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm4, xmm2, 0);
ReadTexel_SSE(xmm4, xmm2, 0);
psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0);
ReadTexel_SSE(xmm7, xmm2, 0);
psrldq(xmm2, 4);
punpckldq(xmm6, xmm1);
@ -2935,16 +2935,16 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0);
ReadTexel_SSE(xmm7, xmm2, 0);
psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm5, xmm5, 0);
ReadTexel(xmm2, xmm2, 0);
ReadTexel_SSE(xmm5, xmm5, 0);
ReadTexel_SSE(xmm2, xmm2, 0);
punpckldq(xmm1, xmm5);
punpckldq(xmm7, xmm2);
@ -2955,17 +2955,17 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm0, 0);
ReadTexel_SSE(xmm1, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm5, xmm3, 0);
ReadTexel_SSE(xmm5, xmm3, 0);
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
ReadTexel_SSE(xmm2, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0);
ReadTexel_SSE(xmm7, xmm3, 0);
psrldq(xmm3, 4);
punpckldq(xmm1, xmm2);
@ -2974,16 +2974,16 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
ReadTexel_SSE(xmm2, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0);
ReadTexel_SSE(xmm7, xmm3, 0);
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm0, xmm0, 0);
ReadTexel(xmm3, xmm3, 0);
ReadTexel_SSE(xmm0, xmm0, 0);
ReadTexel_SSE(xmm3, xmm3, 0);
punpckldq(xmm2, xmm0);
punpckldq(xmm7, xmm3);
@ -2998,13 +2998,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
ReadTexel_SSE(xmm6, xmm5, 0);
psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4);
punpckldq(xmm6, xmm1);
@ -3012,13 +3012,13 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
ReadTexel_SSE(xmm1, xmm5, 0);
psrldq(xmm5, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm4, xmm5, 0);
ReadTexel_SSE(xmm4, xmm5, 0);
// psrldq(xmm5, 4);
punpckldq(xmm1, xmm4);
@ -3044,7 +3044,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
{
for(int j = 0; j < 4; j++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
@ -3058,15 +3058,15 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
const Xmm& temp1 = Xmm(t[i * 2 + 0]);
const Xmm& temp2 = Xmm(t[i * 2 + 1]);
ReadTexel(dst, addr, 0);
ReadTexel_SSE(dst, addr, 0);
psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation)
ReadTexel(temp1, addr, 0);
ReadTexel_SSE(temp1, addr, 0);
psrldq(addr, 4);
punpckldq(dst, temp1);
ReadTexel(temp1, addr, 0);
ReadTexel_SSE(temp1, addr, 0);
psrldq(addr, 4);
ReadTexel(temp2, addr, 0);
ReadTexel_SSE(temp2, addr, 0);
// psrldq(addr, 4);
punpckldq(temp1, temp2);
@ -3077,7 +3077,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
{
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];