gsdx sw JIT: dynamically select SSE41 at runtime even on SSE2 build (scanline)

It won't give the full SSE41 speed boost, but it is better than nothing.
This commit is contained in:
Gregory Hainaut 2016-11-19 14:17:38 +01:00
parent 322473c295
commit 6b78b8f9ce
6 changed files with 289 additions and 297 deletions

View File

@ -252,17 +252,20 @@ void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& t
vpblendw(a, b, 0xaa);
#elif _M_SSE >= 0x401
pblendw(a, b, 0xaa);
#else
pcmpeqd(temp, temp);
psrld(temp, 16);
pand(a, temp);
pandn(temp, b);
por(a, temp);
if(g_cpu.has(util::Cpu::tSSE41))
{
pblendw(a, b, 0xaa);
}
else
{
pcmpeqd(temp, temp);
psrld(temp, 16);
pand(a, temp);
pandn(temp, b);
por(a, temp);
}
#endif
}
@ -274,16 +277,19 @@ void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
vpackuswb(a, a);
vpmovzxbw(a, a);
#elif _M_SSE >= 0x401
packuswb(a, a);
pmovzxbw(a, a);
#else
packuswb(a, a);
pxor(temp, temp);
punpcklbw(a, temp);
if(g_cpu.has(util::Cpu::tSSE41))
{
packuswb(a, a);
pmovzxbw(a, a);
}
else
{
packuswb(a, a);
pxor(temp, temp);
punpcklbw(a, temp);
}
#endif
}
@ -346,13 +352,12 @@ void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
vpblendvb(a, a, b, xmm0);
#elif _M_SSE >= 0x401
pblendvb(a, b);
#else
blend(a, b, xmm0);
if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(a, b);
else
blend(a, b, xmm0);
#endif
}
@ -363,14 +368,17 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
vpblendvb(b, a, b, xmm0);
#elif _M_SSE >= 0x401
pblendvb(a, b);
movdqa(b, a);
#else
blendr(b, a, xmm0);
if(g_cpu.has(util::Cpu::tSSE41))
{
pblendvb(a, b);
movdqa(b, a);
}
else
{
blendr(b, a, xmm0);
}
#endif
}

View File

@ -23,6 +23,7 @@
#include "GSScanlineEnvironment.h"
#include "GSFunctionMap.h"
#include "GSUtil.h"
using namespace Xbyak;

View File

@ -22,102 +22,102 @@
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
// It is useless to port the code to SSEx; it is better to use the faster 32-bit version instead
void GSDrawScanlineCodeGenerator::Generate()
void GSDrawScanlineCodeGenerator::Generate_SSE()
{
// Avoid a crash if someone want to use it
ret();
}
void GSDrawScanlineCodeGenerator::Init()
void GSDrawScanlineCodeGenerator::Init_SSE()
{
}
void GSDrawScanlineCodeGenerator::Step()
void GSDrawScanlineCodeGenerator::Step_SSE()
{
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
{
}
void GSDrawScanlineCodeGenerator::SampleTexture()
void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
{
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
{
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
{
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
{
}
void GSDrawScanlineCodeGenerator::ReadMask()
void GSDrawScanlineCodeGenerator::ReadMask_SSE()
{
}
void GSDrawScanlineCodeGenerator::TestAlpha()
void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
{
}
void GSDrawScanlineCodeGenerator::ColorTFX()
void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
{
}
void GSDrawScanlineCodeGenerator::Fog()
void GSDrawScanlineCodeGenerator::Fog_SSE()
{
}
void GSDrawScanlineCodeGenerator::ReadFrame()
void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
{
}
void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
{
}
void GSDrawScanlineCodeGenerator::WriteMask()
void GSDrawScanlineCodeGenerator::WriteMask_SSE()
{
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
{
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
{
}
void GSDrawScanlineCodeGenerator::WriteFrame()
void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
{
}
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr)
void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg64& addr)
{
}
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
{
}
static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm)
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, uint8 i, int psm)
{
}
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
{
}
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
{
}

View File

@ -1069,16 +1069,15 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
movdqa(xmm4, ptr[&m_local.gd->t.min]);
movdqa(xmm5, ptr[&m_local.gd->t.max]);
#if _M_SSE >= 0x401
movdqa(xmm0, ptr[&m_local.gd->t.mask]);
#else
movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
movdqa(xmm6, xmm0);
#endif
if(g_cpu.has(util::Cpu::tSSE41))
{
movdqa(xmm0, ptr[&m_local.gd->t.mask]);
}
else
{
movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
movdqa(xmm6, xmm0);
}
// uv0
@ -1100,15 +1099,10 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
// clamp.blend8(repeat, m_local.gd->t.mask);
#if _M_SSE >= 0x401
pblendvb(uv0, xmm1);
#else
blendr(uv0, xmm1, xmm0);
#endif
if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(uv0, xmm1);
else
blendr(uv0, xmm1, xmm0);
// uv1
@ -1130,15 +1124,10 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
// clamp.blend8(repeat, m_local.gd->t.mask);
#if _M_SSE >= 0x401
pblendvb(uv1, xmm1);
#else
blendr(uv1, xmm1, xmm6);
#endif
if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(uv1, xmm1);
else
blendr(uv1, xmm1, xmm6);
}
}
@ -1899,16 +1888,15 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
}
else
{
#if _M_SSE >= 0x401
movdqa(xmm0, ptr[&m_local.gd->t.mask]);
#else
movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
movdqa(xmm4, xmm0);
#endif
if(g_cpu.has(util::Cpu::tSSE41))
{
movdqa(xmm0, ptr[&m_local.gd->t.mask]);
}
else
{
movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
movdqa(xmm4, xmm0);
}
// uv0
@ -1930,15 +1918,10 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
// clamp.blend8(repeat, m_local.gd->t.mask);
#if _M_SSE >= 0x401
pblendvb(uv0, xmm1);
#else
blendr(uv0, xmm1, xmm0);
#endif
if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(uv0, xmm1);
else
blendr(uv0, xmm1, xmm0);
// uv1
@ -1960,15 +1943,10 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
// clamp.blend8(repeat, m_local.gd->t.mask);
#if _M_SSE >= 0x401
pblendvb(uv1, xmm1);
#else
blendr(uv1, xmm1, xmm4);
#endif
if(g_cpu.has(util::Cpu::tSSE41))
pblendvb(uv1, xmm1);
else
blendr(uv1, xmm1, xmm4);
}
}
@ -2652,15 +2630,14 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
if(m_sel.pabe)
{
#if _M_SSE < 0x401
if(!g_cpu.has(util::Cpu::tSSE41))
{
// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
movdqa(xmm0, xmm4);
pslld(xmm0, 8);
psrad(xmm0, 31);
// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
movdqa(xmm0, xmm4);
pslld(xmm0, 8);
psrad(xmm0, 31);
#endif
}
psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
@ -2845,19 +2822,26 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
{
case 0:
if(i == 0) movd(dst, src);
#if _M_SSE >= 0x401
else pextrd(dst, src, i);
#else
else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);}
#endif
else {
if(g_cpu.has(util::Cpu::tSSE41)) {
pextrd(dst, src, i);
} else {
pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i));
movd(dst, xmm0);
}
}
break;
case 1:
if(i == 0) movd(eax, src);
#if _M_SSE >= 0x401
else pextrd(eax, src, i);
#else
else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);}
#endif
else {
if(g_cpu.has(util::Cpu::tSSE41)) {
pextrd(eax, src, i);
} else {
pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i));
movd(eax, xmm0);
}
}
xor(eax, dst);
and(eax, 0xffffff);
xor(dst, eax);
@ -2895,152 +2879,154 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
if(m_sel.mmin && !m_sel.lcm)
{
#if _M_SSE >= 0x401
const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
if(pixels == 4)
if(g_cpu.has(util::Cpu::tSSE41))
{
movdqa(ptr[&m_local.temp.test], xmm7);
}
for(int j = 0; j < 4; j++)
{
mov(ebx, ptr[&lod_i->u32[j]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
for(int i = 0; i < pixels; i++)
if(pixels == 4)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
movdqa(ptr[&m_local.temp.test], xmm7);
}
}
if(pixels == 4)
{
movdqa(xmm5, xmm7);
movdqa(xmm7, ptr[&m_local.temp.test]);
}
for(int j = 0; j < 4; j++)
{
mov(ebx, ptr[&lod_i->u32[j]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
#else
for(int i = 0; i < pixels; i++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
if(pixels == 4)
{
movdqa(ptr[&m_local.temp.test], xmm7);
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm4, xmm2, 0);
psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0);
psrldq(xmm2, 4);
punpckldq(xmm6, xmm1);
punpckldq(xmm4, xmm7);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0);
psrldq(xmm2, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm5, xmm5, 0);
ReadTexel(xmm2, xmm2, 0);
punpckldq(xmm1, xmm5);
punpckldq(xmm7, xmm2);
punpcklqdq(xmm6, xmm1);
punpcklqdq(xmm4, xmm7);
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm5, xmm3, 0);
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0);
psrldq(xmm3, 4);
punpckldq(xmm1, xmm2);
punpckldq(xmm5, xmm7);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0);
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm0, xmm0, 0);
ReadTexel(xmm3, xmm3, 0);
punpckldq(xmm2, xmm0);
punpckldq(xmm7, xmm3);
punpcklqdq(xmm1, xmm2);
punpcklqdq(xmm5, xmm7);
movdqa(xmm7, ptr[&m_local.temp.test]);
if(pixels == 4)
{
movdqa(xmm5, xmm7);
movdqa(xmm7, ptr[&m_local.temp.test]);
}
}
else
{
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
if(pixels == 4)
{
movdqa(ptr[&m_local.temp.test], xmm7);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm6, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm4, xmm2, 0);
psrldq(xmm2, 4);
punpckldq(xmm6, xmm1);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0);
psrldq(xmm2, 4);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
punpckldq(xmm6, xmm1);
punpckldq(xmm4, xmm7);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm4, xmm5, 0);
// psrldq(xmm5, 4);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
ReadTexel(xmm7, xmm2, 0);
psrldq(xmm2, 4);
punpckldq(xmm1, xmm4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm5, xmm5, 0);
ReadTexel(xmm2, xmm2, 0);
punpckldq(xmm1, xmm5);
punpckldq(xmm7, xmm2);
punpcklqdq(xmm6, xmm1);
punpcklqdq(xmm4, xmm7);
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm5, xmm3, 0);
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0);
psrldq(xmm3, 4);
punpckldq(xmm1, xmm2);
punpckldq(xmm5, xmm7);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm2, xmm0, 0);
psrldq(xmm0, 4);
ReadTexel(xmm7, xmm3, 0);
psrldq(xmm3, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm0, xmm0, 0);
ReadTexel(xmm3, xmm3, 0);
punpckldq(xmm2, xmm0);
punpckldq(xmm7, xmm3);
punpcklqdq(xmm1, xmm2);
punpcklqdq(xmm5, xmm7);
movdqa(xmm7, ptr[&m_local.temp.test]);
}
else
{
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm6, xmm5, 0);
psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
mov(ebx, ptr[&lod_i->u32[1]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
punpckldq(xmm6, xmm1);
mov(ebx, ptr[&lod_i->u32[2]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm1, xmm5, 0);
psrldq(xmm5, 4);
mov(ebx, ptr[&lod_i->u32[3]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(xmm4, xmm5, 0);
// psrldq(xmm5, 4);
punpckldq(xmm1, xmm4);
punpcklqdq(xmm6, xmm1);
}
punpcklqdq(xmm6, xmm1);
}
#endif
}
else
{
@ -3052,43 +3038,42 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
#if _M_SSE >= 0x401
for(int i = 0; i < pixels; i++)
if(g_cpu.has(util::Cpu::tSSE41))
{
for(int j = 0; j < 4; j++)
for(int i = 0; i < pixels; i++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
for(int j = 0; j < 4; j++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
} else {
const int t[] = {1, 4, 1, 5, 2, 5, 2, 0};
for(int i = 0; i < pixels; i++)
{
const Xmm& addr = Xmm(r[i * 2 + 0]);
const Xmm& dst = Xmm(r[i * 2 + 1]);
const Xmm& temp1 = Xmm(t[i * 2 + 0]);
const Xmm& temp2 = Xmm(t[i * 2 + 1]);
ReadTexel(dst, addr, 0);
psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation)
ReadTexel(temp1, addr, 0);
psrldq(addr, 4);
punpckldq(dst, temp1);
ReadTexel(temp1, addr, 0);
psrldq(addr, 4);
ReadTexel(temp2, addr, 0);
// psrldq(addr, 4);
punpckldq(temp1, temp2);
punpcklqdq(dst, temp1);
}
}
#else
const int t[] = {1, 4, 1, 5, 2, 5, 2, 0};
for(int i = 0; i < pixels; i++)
{
const Xmm& addr = Xmm(r[i * 2 + 0]);
const Xmm& dst = Xmm(r[i * 2 + 1]);
const Xmm& temp1 = Xmm(t[i * 2 + 0]);
const Xmm& temp2 = Xmm(t[i * 2 + 1]);
ReadTexel(dst, addr, 0);
psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation)
ReadTexel(temp1, addr, 0);
psrldq(addr, 4);
punpckldq(dst, temp1);
ReadTexel(temp1, addr, 0);
psrldq(addr, 4);
ReadTexel(temp2, addr, 0);
// psrldq(addr, 4);
punpckldq(temp1, temp2);
punpcklqdq(dst, temp1);
}
#endif
}
}
@ -3096,11 +3081,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin
{
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
#if _M_SSE < 0x401
ASSERT(i == 0);
#endif
ASSERT(i == 0 || g_cpu.has(util::Cpu::tSSE41));
if(i == 0) movd(eax, addr);
else pextrd(eax, addr, i);

View File

@ -20,9 +20,7 @@
*/
#include "stdafx.h"
#include "GS.h"
#include "GSUtil.h"
#include "xbyak/xbyak_util.h"
#ifdef _WIN32
#include "GSDeviceDX.h"
@ -33,6 +31,8 @@
#define SVN_MODS 0
#endif
Xbyak::util::Cpu g_cpu;
const char* GSUtil::GetLibName()
{
// The following ifdef mess is courtesy of "static string str;"
@ -204,7 +204,6 @@ bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm)
bool GSUtil::CheckSSE()
{
bool status = true;
Xbyak::util::Cpu cpu;
struct ISA {
Xbyak::util::Cpu::Type type;
@ -231,7 +230,7 @@ bool GSUtil::CheckSSE()
};
for (size_t i = 0; i < countof(checks); i++) {
if(!cpu.has(checks[i].type)) {
if(!g_cpu.has(checks[i].type)) {
fprintf(stderr, "This CPU does not support %s\n", checks[i].name);
status = false;

View File

@ -22,6 +22,7 @@
#pragma once
#include "GS.h"
#include "xbyak/xbyak_util.h"
struct OCLDeviceDesc
{
@ -71,3 +72,5 @@ void GSmkdir(const char* dir);
#endif
const char* psm_str(int psm);
extern Xbyak::util::Cpu g_cpu;