mirror of https://github.com/PCSX2/pcsx2.git
gsdx sw JIT: dynamically select SSE41 at runtime even on SSE2 build (scanline)
It won't give the full speed boost of a native SSE41 build, but it is better than nothing.
This commit is contained in:
parent
322473c295
commit
6b78b8f9ce
|
@ -252,17 +252,20 @@ void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& t
|
|||
|
||||
vpblendw(a, b, 0xaa);
|
||||
|
||||
#elif _M_SSE >= 0x401
|
||||
|
||||
pblendw(a, b, 0xaa);
|
||||
|
||||
#else
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pblendw(a, b, 0xaa);
|
||||
}
|
||||
else
|
||||
{
|
||||
pcmpeqd(temp, temp);
|
||||
psrld(temp, 16);
|
||||
pand(a, temp);
|
||||
pandn(temp, b);
|
||||
por(a, temp);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
@ -274,16 +277,19 @@ void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
|
|||
vpackuswb(a, a);
|
||||
vpmovzxbw(a, a);
|
||||
|
||||
#elif _M_SSE >= 0x401
|
||||
|
||||
packuswb(a, a);
|
||||
pmovzxbw(a, a);
|
||||
|
||||
#else
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
packuswb(a, a);
|
||||
pmovzxbw(a, a);
|
||||
}
|
||||
else
|
||||
{
|
||||
packuswb(a, a);
|
||||
pxor(temp, temp);
|
||||
punpcklbw(a, temp);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
@ -346,12 +352,11 @@ void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
|
|||
|
||||
vpblendvb(a, a, b, xmm0);
|
||||
|
||||
#elif _M_SSE >= 0x401
|
||||
|
||||
pblendvb(a, b);
|
||||
|
||||
#else
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
pblendvb(a, b);
|
||||
else
|
||||
blend(a, b, xmm0);
|
||||
|
||||
#endif
|
||||
|
@ -363,14 +368,17 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
|
|||
|
||||
vpblendvb(b, a, b, xmm0);
|
||||
|
||||
#elif _M_SSE >= 0x401
|
||||
|
||||
pblendvb(a, b);
|
||||
movdqa(b, a);
|
||||
|
||||
#else
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pblendvb(a, b);
|
||||
movdqa(b, a);
|
||||
}
|
||||
else
|
||||
{
|
||||
blendr(b, a, xmm0);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include "GSScanlineEnvironment.h"
|
||||
#include "GSFunctionMap.h"
|
||||
#include "GSUtil.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
|
|
|
@ -22,102 +22,102 @@
|
|||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// It is not worth porting this code to SSEx; use the faster 32-bit version instead
|
||||
void GSDrawScanlineCodeGenerator::Generate()
|
||||
void GSDrawScanlineCodeGenerator::Generate_SSE()
|
||||
{
|
||||
// Avoid a crash if someone wants to use it
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Init()
|
||||
void GSDrawScanlineCodeGenerator::Init_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Step()
|
||||
void GSDrawScanlineCodeGenerator::Step_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
|
||||
void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture()
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
|
||||
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
||||
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX()
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadMask()
|
||||
void GSDrawScanlineCodeGenerator::ReadMask_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha()
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX()
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Fog()
|
||||
void GSDrawScanlineCodeGenerator::Fog_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame()
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha()
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteMask()
|
||||
void GSDrawScanlineCodeGenerator::WriteMask_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf()
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend()
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame()
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr)
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg64& addr)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
{
|
||||
}
|
||||
|
||||
static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm)
|
||||
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg64& addr, uint8 i, int psm)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
|
@ -1069,16 +1069,15 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
|||
movdqa(xmm4, ptr[&m_local.gd->t.min]);
|
||||
movdqa(xmm5, ptr[&m_local.gd->t.max]);
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
movdqa(xmm0, ptr[&m_local.gd->t.mask]);
|
||||
|
||||
#else
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
|
||||
movdqa(xmm6, xmm0);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
// uv0
|
||||
|
||||
|
@ -1100,16 +1099,11 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
|||
|
||||
// clamp.blend8(repeat, m_local.gd->t.mask);
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
pblendvb(uv0, xmm1);
|
||||
|
||||
#else
|
||||
|
||||
else
|
||||
blendr(uv0, xmm1, xmm0);
|
||||
|
||||
#endif
|
||||
|
||||
// uv1
|
||||
|
||||
// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
|
||||
|
@ -1130,15 +1124,10 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
|||
|
||||
// clamp.blend8(repeat, m_local.gd->t.mask);
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
pblendvb(uv1, xmm1);
|
||||
|
||||
#else
|
||||
|
||||
else
|
||||
blendr(uv1, xmm1, xmm6);
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1899,16 +1888,15 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
|
|||
}
|
||||
else
|
||||
{
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
movdqa(xmm0, ptr[&m_local.gd->t.mask]);
|
||||
|
||||
#else
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(xmm0, ptr[&m_local.gd->t.invmask]);
|
||||
movdqa(xmm4, xmm0);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
// uv0
|
||||
|
||||
|
@ -1930,16 +1918,11 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
|
|||
|
||||
// clamp.blend8(repeat, m_local.gd->t.mask);
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
pblendvb(uv0, xmm1);
|
||||
|
||||
#else
|
||||
|
||||
else
|
||||
blendr(uv0, xmm1, xmm0);
|
||||
|
||||
#endif
|
||||
|
||||
// uv1
|
||||
|
||||
// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
|
||||
|
@ -1960,15 +1943,10 @@ void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
|
|||
|
||||
// clamp.blend8(repeat, m_local.gd->t.mask);
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
pblendvb(uv1, xmm1);
|
||||
|
||||
#else
|
||||
|
||||
else
|
||||
blendr(uv1, xmm1, xmm4);
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2652,15 +2630,14 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
|
|||
|
||||
if(m_sel.pabe)
|
||||
{
|
||||
#if _M_SSE < 0x401
|
||||
|
||||
if(!g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
|
||||
|
||||
movdqa(xmm0, xmm4);
|
||||
pslld(xmm0, 8);
|
||||
psrad(xmm0, 31);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
|
||||
|
||||
|
@ -2845,19 +2822,26 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
|
|||
{
|
||||
case 0:
|
||||
if(i == 0) movd(dst, src);
|
||||
#if _M_SSE >= 0x401
|
||||
else pextrd(dst, src, i);
|
||||
#else
|
||||
else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);}
|
||||
#endif
|
||||
else {
|
||||
if(g_cpu.has(util::Cpu::tSSE41)) {
|
||||
pextrd(dst, src, i);
|
||||
} else {
|
||||
pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i));
|
||||
movd(dst, xmm0);
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
if(i == 0) movd(eax, src);
|
||||
#if _M_SSE >= 0x401
|
||||
else pextrd(eax, src, i);
|
||||
#else
|
||||
else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);}
|
||||
#endif
|
||||
else {
|
||||
if(g_cpu.has(util::Cpu::tSSE41)) {
|
||||
pextrd(eax, src, i);
|
||||
} else {
|
||||
pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i));
|
||||
movd(eax, xmm0);
|
||||
}
|
||||
}
|
||||
xor(eax, dst);
|
||||
and(eax, 0xffffff);
|
||||
xor(dst, eax);
|
||||
|
@ -2895,7 +2879,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
|
||||
if(m_sel.mmin && !m_sel.lcm)
|
||||
{
|
||||
#if _M_SSE >= 0x401
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
|
||||
const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
|
||||
|
||||
|
@ -2920,8 +2905,9 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
movdqa(xmm5, xmm7);
|
||||
movdqa(xmm7, ptr[&m_local.temp.test]);
|
||||
}
|
||||
|
||||
#else
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
if(pixels == 4)
|
||||
{
|
||||
|
@ -3040,7 +3026,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
punpcklqdq(xmm6, xmm1);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -3052,8 +3038,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
|
||||
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if(g_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
for(int i = 0; i < pixels; i++)
|
||||
{
|
||||
for(int j = 0; j < 4; j++)
|
||||
|
@ -3062,8 +3048,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
} else {
|
||||
const int t[] = {1, 4, 1, 5, 2, 5, 2, 0};
|
||||
|
||||
for(int i = 0; i < pixels; i++)
|
||||
|
@ -3088,7 +3073,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
|||
punpcklqdq(dst, temp1);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3096,11 +3081,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uin
|
|||
{
|
||||
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
|
||||
|
||||
#if _M_SSE < 0x401
|
||||
|
||||
ASSERT(i == 0);
|
||||
|
||||
#endif
|
||||
ASSERT(i == 0 || g_cpu.has(util::Cpu::tSSE41));
|
||||
|
||||
if(i == 0) movd(eax, addr);
|
||||
else pextrd(eax, addr, i);
|
||||
|
|
|
@ -20,9 +20,7 @@
|
|||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GS.h"
|
||||
#include "GSUtil.h"
|
||||
#include "xbyak/xbyak_util.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include "GSDeviceDX.h"
|
||||
|
@ -33,6 +31,8 @@
|
|||
#define SVN_MODS 0
|
||||
#endif
|
||||
|
||||
Xbyak::util::Cpu g_cpu;
|
||||
|
||||
const char* GSUtil::GetLibName()
|
||||
{
|
||||
// The following ifdef mess is courtesy of "static string str;"
|
||||
|
@ -204,7 +204,6 @@ bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm)
|
|||
bool GSUtil::CheckSSE()
|
||||
{
|
||||
bool status = true;
|
||||
Xbyak::util::Cpu cpu;
|
||||
|
||||
struct ISA {
|
||||
Xbyak::util::Cpu::Type type;
|
||||
|
@ -231,7 +230,7 @@ bool GSUtil::CheckSSE()
|
|||
};
|
||||
|
||||
for (size_t i = 0; i < countof(checks); i++) {
|
||||
if(!cpu.has(checks[i].type)) {
|
||||
if(!g_cpu.has(checks[i].type)) {
|
||||
fprintf(stderr, "This CPU does not support %s\n", checks[i].name);
|
||||
|
||||
status = false;
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "GS.h"
|
||||
#include "xbyak/xbyak_util.h"
|
||||
|
||||
struct OCLDeviceDesc
|
||||
{
|
||||
|
@ -71,3 +72,5 @@ void GSmkdir(const char* dir);
|
|||
#endif
|
||||
|
||||
const char* psm_str(int psm);
|
||||
|
||||
extern Xbyak::util::Cpu g_cpu;
|
||||
|
|
Loading…
Reference in New Issue