GSdx: The sw renderer now uses avx2, not much faster though, +10% maybe, if the game is not EE limited. I'm not sure if haswell has that much better sse execution (load/store units doubled for example), or the avx2 code is not fully optimized yet.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5677 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11@gmail.com 2013-06-20 05:07:52 +00:00
parent 3b753bec42
commit d20bd4f86a
20 changed files with 5021 additions and 88 deletions

View File

@ -1789,23 +1789,39 @@ public:
GSVector8i TA0(TEXA.TA0 << 24);
GSVector8i mask = GSVector8i::x00ffffff();
for(int i = 0; i < 4; i++, dst += dstpitch * 2)
{
GSVector8i v0 = s[i * 2 + 0];
GSVector8i v1 = s[i * 2 + 1];
GSVector8i v0, v1, v2, v3;
GSVector8i::sw128(v0, v1);
GSVector8i::sw64(v0, v1);
v0 = s[0] & mask;
v1 = s[1] & mask;
v2 = s[2] & mask;
v3 = s[3] & mask;
v0 &= mask;
v1 &= mask;
GSVector8i::sw128(v0, v1);
GSVector8i::sw64(v0, v1);
GSVector8i::sw128(v2, v3);
GSVector8i::sw64(v2, v3);
GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1];
*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);
d0[0] = Expand24to32<AEM>(v0, TA0);
d1[0] = Expand24to32<AEM>(v1, TA0);
}
v0 = s[4] & mask;
v1 = s[5] & mask;
v2 = s[6] & mask;
v3 = s[7] & mask;
GSVector8i::sw128(v0, v1);
GSVector8i::sw64(v0, v1);
GSVector8i::sw128(v2, v3);
GSVector8i::sw64(v2, v3);
dst += dstpitch * 4;
*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);
#else

File diff suppressed because it is too large Load Diff

View File

@ -81,8 +81,8 @@ public:
bool IsEdge() const {return m_global.sel.aa1;}
bool IsRect() const {return m_global.sel.IsSolidRect();}
bool TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga);
void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm);
template<class T> bool TestAlpha(T& test, T& fm, T& zm, const T& ga);
template<class T> void WritePixel(const T& src, int addr, int i, uint32 psm);
#endif

View File

@ -22,6 +22,38 @@
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#if _M_SSE >= 0x501
const GSVector8i GSDrawScanlineCodeGenerator::m_test[16] =
{
GSVector8i::zero(),
GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector8i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
GSVector8i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff),
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff),
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff),
GSVector8i::zero(),
};
const GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4] =
{
GSVector8(0.204446009836232697516f),
GSVector8(-1.04913055217340124191f),
GSVector8(2.28330284476918490682f),
GSVector8(1.0f),
};
#else
const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] =
{
GSVector4i::zero(),
@ -42,6 +74,8 @@ const GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4] =
GSVector4(1.0f),
};
#endif
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param)
@ -51,6 +85,81 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
Generate();
}
#if _M_SSE >= 0x501
void GSDrawScanlineCodeGenerator::modulate16(const Ymm& a, const Operand& f, int shift)
{
if(shift == 0)
{
vpmulhrsw(a, f);
}
else
{
vpsllw(a, (uint8)(shift + 1));
vpmulhw(a, f);
}
}
void GSDrawScanlineCodeGenerator::lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift)
{
vpsubw(a, b);
modulate16(a, f, shift);
vpaddw(a, b);
}
void GSDrawScanlineCodeGenerator::lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f)
{
vpsubw(a, b);
vpmullw(a, f);
vpsraw(a, 4);
vpaddw(a, b);
}
void GSDrawScanlineCodeGenerator::mix16(const Ymm& a, const Ymm& b, const Ymm& temp)
{
vpblendw(a, b, 0xaa);
}
void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp)
{
vpackuswb(a, a);
vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
vpmovzxbw(a, a);
}
void GSDrawScanlineCodeGenerator::alltrue()
{
vpmovmskb(eax, ymm7);
cmp(eax, 0xffffffff);
je("step", T_NEAR);
}
void GSDrawScanlineCodeGenerator::blend(const Ymm& a, const Ymm& b, const Ymm& mask)
{
vpand(b, mask);
vpandn(mask, a);
vpor(a, b, mask);
}
void GSDrawScanlineCodeGenerator::blendr(const Ymm& b, const Ymm& a, const Ymm& mask)
{
vpand(b, mask);
vpandn(mask, a);
vpor(b, mask);
}
void GSDrawScanlineCodeGenerator::blend8(const Ymm& a, const Ymm& b)
{
vpblendvb(a, a, b, xmm0);
}
void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
{
vpblendvb(b, a, b, xmm0);
}
#else
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
{
#if _M_SSE >= 0x500
@ -244,3 +353,5 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
#endif
}
#endif

View File

@ -35,6 +35,55 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void Generate();
#if _M_SSE >= 0x501
void Init();
void Step();
void TestZ(const Ymm& temp1, const Ymm& temp2);
void SampleTexture();
void Wrap(const Ymm& uv0);
void Wrap(const Ymm& uv0, const Ymm& uv1);
void SampleTextureLOD();
void WrapLOD(const Ymm& uv0);
void WrapLOD(const Ymm& uv0, const Ymm& uv1);
void AlphaTFX();
void ReadMask();
void TestAlpha();
void ColorTFX();
void Fog();
void ReadFrame();
void TestDestAlpha();
void WriteMask();
void WriteZBuf();
void AlphaBlend();
void WriteFrame();
#if defined(_M_AMD64) || defined(_WIN64)
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
#else
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
#endif
void ReadTexel(int pixels, int mip_offset = 0);
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
void modulate16(const Ymm& a, const Operand& f, int shift);
void lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift);
void lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f);
void mix16(const Ymm& a, const Ymm& b, const Ymm& temp);
void clamp16(const Ymm& a, const Ymm& temp);
void alltrue();
void blend(const Ymm& a, const Ymm& b, const Ymm& mask);
void blendr(const Ymm& b, const Ymm& a, const Ymm& mask);
void blend8(const Ymm& a, const Ymm& b);
void blend8r(const Ymm& b, const Ymm& a);
#else
void Init();
void Step();
void TestZ(const Xmm& temp1, const Xmm& temp2);
@ -80,9 +129,17 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void blend8(const Xmm& a, const Xmm& b);
void blend8r(const Xmm& b, const Xmm& a);
#endif
public:
GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
#if _M_SSE >= 0x501
static const GSVector8i m_test[16];
static const GSVector8 m_log2_coef[4];
#else
static const GSVector4i m_test[8];
static const GSVector4 m_log2_coef[4];
#endif
};

View File

@ -23,7 +23,7 @@
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
static const int _args = 16;
static const int _top = _args + 4;
@ -1236,24 +1236,21 @@ return;
// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]
vmovq(xmm4, ptr[&m_local.gd->t.minmax]); // x x x x maxv maxu minv minu
vpunpcklwd(xmm4, xmm4); // maxv maxv maxu maxu minv minv minu minu
vpxor(xmm1, xmm1);
vpunpckldq(xmm6, xmm4, xmm4); // minv minv minv minv minu minu minu minu
vpunpcklwd(xmm5, xmm6, xmm1); // 0 minu 0 minu 0 minu 0 minu
vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
vpunpcklwd(xmm5, xmm4, xmm1); // minu
vpunpckhwd(xmm6, xmm4, xmm1); // minv
vpsrlvd(xmm5, xmm5, xmm0);
vpunpckhwd(xmm6, xmm6, xmm1); // 0 minv 0 minv 0 minv 0 minv
vpsrlvd(xmm6, xmm6, xmm0);
vpackusdw(xmm5, xmm6); // xmm5 = minv minv minv minv minu minu minu minu
vpunpckhdq(xmm4, xmm4); // maxv maxv maxv maxv maxu maxu maxu maxu
vpunpcklwd(xmm6, xmm4, xmm1); // 0 maxu 0 maxu 0 maxu 0 maxu
vpackusdw(xmm5, xmm6);
vmovdqa(xmm4, ptr[&m_local.gd->t.max]);
vpunpcklwd(xmm6, xmm4, xmm1); // maxu
vpunpckhwd(xmm4, xmm4, xmm1); // maxv
vpsrlvd(xmm6, xmm6, xmm0);
vpunpckhwd(xmm4, xmm1); // 0 maxv 0 maxv 0 maxv 0 maxv
vpsrlvd(xmm4, xmm4, xmm0);
vpackusdw(xmm6, xmm4); // xmm6 = maxv maxv maxv maxv maxu maxu maxu maxu
vpackusdw(xmm6, xmm4);
vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5);
vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6);
@ -2807,7 +2804,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
}
}
static const int s_offsets[4] = {0, 2, 8, 10};
static const int s_offsets[] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{
@ -2865,7 +2862,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
vmovdqa(ptr[&m_local.temp.test], xmm7);
}
for(int j = 0; j < 4; j++)
for(uint8 j = 0; j < 4; j++)
{
mov(ebx, ptr[&lod_i->u32[j]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
@ -2895,18 +2892,9 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
for(int i = 0; i < pixels; i++)
{
if(m_cpu.has(util::Cpu::tAVX2) && !m_sel.tlu) // vpgatherdd seems to be dead slow for byte aligned offsets, not using it for palette lookups
for(uint8 j = 0; j < 4; j++)
{
Xmm mask = Xmm(t[i]);
vpcmpeqd(mask, mask);
vpgatherdd(Xmm(r[i * 2 + 1]), ptr[ebx + Xmm(r[i * 2 + 0]) * 4], mask);
}
else
{
for(int j = 0; j < 4; j++)
{
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
}
}
}
@ -2914,6 +2902,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
{
ASSERT(i < 4);
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
if(i == 0) vmovd(eax, addr);

File diff suppressed because it is too large Load Diff

View File

@ -209,7 +209,7 @@ public:
iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &ml);
/*
name = format("c:/temp/%s_%016llx.bin", m_name.c_str(), (uint64)key);
name = format("c:/temp1/%s_%016llx.bin", m_name.c_str(), (uint64)key);
if(FILE* fp = fopen(name.c_str(), "wb"))
{
@ -218,7 +218,7 @@ public:
fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
fwrite(cg->getCode(), cg->getSize(), 1, fp);
fputc(0xBB, fp); fputc(0xDE, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp);
fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
fputc(0x0F, fp); fputc(0x0B, fp);

View File

@ -208,6 +208,10 @@ void GSRasterizer::Draw(GSRasterizerData* data)
__assume(0);
}
#if _M_SSE >= 0x501
_mm256_zeroupper();
#endif
data->pixels = m_pixels;
uint64 ticks = __rdtsc() - data->start;
@ -917,7 +921,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
{
for(int i = 0; i < threads; i++, row++)
{
m_scanline[row] = i;
m_scanline[row] = (uint8)i;
}
}
}

View File

@ -400,7 +400,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
dst->t = t;
#if _M_SSE >= 0x501
#if 0 //_M_SSE >= 0x501
dst->_pad = GSVector4::zero();
@ -1342,8 +1342,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
{
gd.sel.fge = 1;
gd.frb = GSVector4i((int)env.FOGCOL.u32[0] & 0x00ff00ff);
gd.fga = GSVector4i((int)(env.FOGCOL.u32[0] >> 8) & 0x00ff00ff);
gd.frb = env.FOGCOL.u32[0] & 0x00ff00ff;
gd.fga = (env.FOGCOL.u32[0] >> 8) & 0x00ff00ff;
}
if(context->FRAME.PSM != PSM_PSMCT24)
@ -1403,6 +1403,34 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
}
#if _M_SSE >= 0x501
gd.fm = fm;
gd.zm = zm;
if(gd.sel.fpsm == 1)
{
gd.fm |= 0xff000000;
}
else if(gd.sel.fpsm == 2)
{
uint32 rb = gd.fm & 0x00f800f8;
uint32 ga = gd.fm & 0x8000f800;
gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
}
if(gd.sel.zpsm == 1)
{
gd.zm |= 0xff000000;
}
else if(gd.sel.zpsm == 2)
{
gd.zm |= 0xffff0000;
}
#else
gd.fm = GSVector4i(fm);
gd.zm = GSVector4i(zm);
@ -1427,6 +1455,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.zm |= GSVector4i::xffff0000();
}
#endif
if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) // TODO: check scissor horizontally only
{
gd.sel.notest = 1;
@ -1435,7 +1465,11 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
for(int i = 0, j = m_vertex.tail; i < j; i++)
{
#if _M_SSE >= 0x501
if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8
#else
if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4
#endif
{
gd.sel.notest = 0;

View File

@ -116,7 +116,7 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
void* vm;
const void* tex[7];
uint32* clut;
GSVector4i* dimx;
GSVector4i* dimx;
const int* fbr;
const int* zbr;
@ -125,19 +125,63 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
const GSVector2i* fzbr;
const GSVector2i* fzbc;
GSVector4i fm, zm;
struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
GSVector4i aref;
GSVector4i afix;
struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
#if _M_SSE >= 0x501
uint32 fm, zm;
uint32 frb, fga;
GSVector8 mxl;
GSVector8 k; // TEX1.K * 0x10000
GSVector8 l; // TEX1.L * -0x10000
struct {GSVector8i i, f;} lod; // lcm == 1
#else
GSVector4i fm, zm;
GSVector4i frb, fga;
GSVector4 mxl;
GSVector4 k; // TEX1.K * 0x10000
GSVector4 l; // TEX1.L * -0x10000
struct {GSVector4i i, f;} lod; // lcm == 1
#endif
};
__aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has its own
{
#if _M_SSE >= 0x501
struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8];
struct step {GSVector8 z, stq; GSVector8i c, f;} d8;
struct {GSVector8i rb, ga;} c;
struct {uint32 z, f;} p;
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
struct
{
GSVector8 z, zo;
GSVector8i f;
GSVector8 s, t, q;
GSVector8i rb, ga;
GSVector8i zs, zd;
GSVector8i uf, vf;
GSVector8i cov;
// mipmapping
struct {GSVector8i i, f;} lod;
GSVector8i uv[2];
GSVector8i uv_minmax[2];
GSVector8i trb, tga;
GSVector8i test;
} temp;
#else
struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4];
struct step {GSVector4 z, stq; GSVector4i c, f;} d4;
struct {GSVector4i rb, ga;} c;
@ -164,6 +208,8 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has
GSVector4i test;
} temp;
#endif
//
const GSScanlineGlobalData* gd;

View File

@ -22,6 +22,23 @@
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#if _M_SSE >= 0x501
const GSVector8 GSSetupPrimCodeGenerator::m_shift[9] =
{
GSVector8(8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f),
GSVector8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
GSVector8(-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f),
GSVector8(-2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f),
GSVector8(-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f),
GSVector8(-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f),
GSVector8(-5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f),
GSVector8(-6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f),
GSVector8(-7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f),
};
#else
const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
{
GSVector4(4.0f, 4.0f, 4.0f, 4.0f),
@ -31,6 +48,8 @@ const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
GSVector4(-3.0f, -2.0f, -1.0f, 0.0f),
};
#endif
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param)

View File

@ -42,5 +42,9 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
public:
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
#if _M_SSE >= 0x501
static const GSVector8 m_shift[9];
#else
static const GSVector4 m_shift[5];
#endif
};

View File

@ -23,7 +23,7 @@
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;

View File

@ -0,0 +1,353 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
static const int _args = 0;
static const int _vertex = _args + 4;
static const int _index = _args + 8;
static const int _dscan = _args + 12;
void GSSetupPrimCodeGenerator::Generate()
{
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
mov(edx, dword[esp + _dscan]);
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Ymm(3 + i), ptr[&m_shift[i]]);
}
}
Depth();
Texture();
Color();
ret();
}
void GSSetupPrimCodeGenerator::Depth()
{
if(!m_en.z && !m_en.f)
{
return;
}
if(m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
if(m_en.f)
{
// GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]);
}
if(m_en.z)
{
// GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]);
}
if(m_en.f)
{
// m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
vmulps(ymm0, ymm1, ymm3);
vcvttps2dq(ymm0, ymm0);
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d8.f], ymm0);
}
if(m_en.z)
{
// m_local.d8.z = dz * shift[0];
vmulps(ymm0, ymm2, ymm3);
vmovaps(ptr[&m_local.d8.z], ymm0);
}
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
if(m_en.f)
{
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
if(i < 4) vmulps(ymm0, ymm1, Ymm(4 + i));
else vmulps(ymm0, ymm1, ptr[&m_shift[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d[i].f], ymm0);
}
if(m_en.z)
{
// m_local.d[i].z = dz * shift[1 + i];
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
vmovaps(ptr[&m_local.d[i].z], ymm0);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
if(m_en.f)
{
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
vcvttps2dq(xmm0, xmm0);
vpextrd(ptr[&m_local.p.f], xmm0, 3);
}
if(m_en.z)
{
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
mov(ptr[&m_local.p.z], eax);
}
}
}
void GSSetupPrimCodeGenerator::Texture()
{
if(!m_en.t)
{
return;
}
// GSVector8 dt(dscan.t);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
// GSVector8 dt8 = dt * shift[0];
vmulps(ymm1, ymm0, ymm3);
if(m_sel.fst)
{
// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
vcvttps2dq(ymm1, ymm1);
vmovdqa(ptr[&m_local.d8.stq], ymm1);
}
else
{
// m_local.d8.stq = dt8;
vmovaps(ptr[&m_local.d8.stq], ymm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8 v = dstq * shift[1 + i];
if(i < 4) vmulps(ymm2, ymm1, Ymm(4 + i));
else vmulps(ymm2, ymm1, ptr[&m_shift[i + 1]]);
if(m_sel.fst)
{
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
vcvttps2dq(ymm2, ymm2);
switch(j)
{
case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
}
}
}
}
}
void GSSetupPrimCodeGenerator::Color()
{
if(!m_en.c)
{
return;
}
if(m_sel.iip)
{
// GSVector8 dc(dscan.c);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
vmulps(ymm1, ymm0, ymm3);
vcvttps2dq(ymm1, ymm1);
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(ymm1, ymm1);
vmovdqa(ptr[&m_local.d8.c], ymm1);
// ymm3 is not needed anymore
// GSVector8 dr = dc.xxxx();
// GSVector8 db = dc.zzzz();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(ymm0, ymm1);
vmovdqa(ptr[&m_local.d[i].rb], ymm0);
}
// GSVector8 dc(dscan.c);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector8 dg = dc.yyyy();
// GSVector8 da = dc.wwww();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(ymm0, ymm1);
vmovdqa(ptr[&m_local.d[i].ga], ymm0);
}
}
else
{
// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
int last = 0;
switch(m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
}
vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
vcvttps2dq(ymm0, ymm0);
// c = c.upl16(c.zwxy());
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(ymm0, ymm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
vpsrlw(ymm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_local.c.rb], ymm1);
vmovdqa(ptr[&m_local.c.ga], ymm2);
}
}
#endif

View File

@ -75,6 +75,7 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000
#if _M_SSE >= 0x500
const GSVector8 GSVector8::m_half(0.5f);
const GSVector8 GSVector8::m_one(1.0f);
const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));

View File

@ -4119,6 +4119,15 @@ public:
// TODO: extract/insert
template<int i> __forceinline int extract32() const
{
GSVector4i v = extract<i / 4>();
if((i & 3) == 0) return GSVector4i::store(v);
return v.extract32<i>();
}
template<int i> __forceinline GSVector4i extract() const
{
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
@ -4141,7 +4150,6 @@ public:
GSVector4i a0 = extract<0>();
GSVector4i a1 = extract<1>();
v0 = GSVector4i::load((int)ptr[a0.extract32<0>()]);
v0 = v0.insert32<1>((int)ptr[a0.extract32<1>()]);
v0 = v0.insert32<2>((int)ptr[a0.extract32<2>()]);
@ -4191,14 +4199,14 @@ public:
return cast(v0).insert<1>(v1);
}
template<> __forceinline GSVector8i gather32_32<uint32, uint8>(const uint32* ptr1, const uint8* ptr2) const
template<> __forceinline GSVector8i gather32_32<uint8, uint32>(const uint8* ptr1, const uint32* ptr2) const
{
return gather32_32<uint8>(ptr2).gather32_32<uint32>(ptr1);
return gather32_32<uint8>(ptr1).gather32_32<uint32>(ptr2);
}
template<> __forceinline GSVector8i gather32_32<uint32, uint32>(const uint32* ptr1, const uint32* ptr2) const
{
return gather32_32<uint32>(ptr2).gather32_32<uint32>(ptr1);
return gather32_32<uint32>(ptr1).gather32_32<uint32>(ptr2);
}
template<class T> __forceinline void gather32_32(const T* RESTRICT ptr, GSVector8i* RESTRICT dst) const
@ -4731,6 +4739,16 @@ public:
return GSVector8i(_mm256_broadcastq_epi64(v.m));
}
__forceinline static GSVector8i broadcast128(const GSVector4i& v)
{
// this one only has m128 source op, it will be saved to a temp on stack if the compiler is not smart enough and use the address of v directly (<= vs2012u3rc2)
return GSVector8i(_mm256_broadcastsi128_si256(v)); // fastest
//return GSVector8i(v); // almost as fast as broadcast
//return cast(v).insert<1>(v); // slow
//return cast(v).aa(); // slowest
}
__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}
__forceinline static GSVector8i xffffffff() {return zero() == zero();}
@ -4958,6 +4976,7 @@ public:
__m128 m0, m1;
};
static const GSVector8 m_half;
static const GSVector8 m_one;
static const GSVector8 m_x7fffffff;
static const GSVector8 m_x80000000;

View File

@ -646,6 +646,18 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
@ -737,6 +749,7 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
@ -2054,4 +2067,4 @@
<UserProperties RESOURCE_FILE="GSdx.rc" />
</VisualStudio>
</ProjectExtensions>
</Project>
</Project>

View File

@ -336,6 +336,12 @@
<ClCompile Include="GSRendererCS.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="GLLoader.h">
@ -728,4 +734,4 @@
<Filter>Resource Files</Filter>
</ResourceCompile>
</ItemGroup>
</Project>
</Project>

View File

@ -789,22 +789,22 @@ void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(x
void vpsignw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x09, true, -1); }
void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, true, -1); }
void vpsignd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x0A, true, -1); }
void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1); }
void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, false, -1); }
void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1); }
void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, false, -1); }
void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1); }
void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, false, -1); }
void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1); }
void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, false, -1); }
void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1); }
void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, false, -1); }
void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1); }
void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, false, -1); }
void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1); }
void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, false, -1); }
void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1); }
void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, false, -1); }
void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, true, -1); }
void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, true, -1); }
void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, true, -1); }
void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, true, -1); }
void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, true, -1); }
void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, true, -1); }
void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, true, -1); }
void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, true, -1); }
void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, true, -1); }
void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, true, -1); }
void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, true, -1); }
void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, true, -1); }
void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, true, -1); }
void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, true, -1); }
void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, true, -1); }
void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, true, -1); }
void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, true, -1); }
void vpsubb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF8, true, -1); }
void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, true, -1); }
@ -1345,8 +1345,8 @@ void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) {
void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); }
void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); }
void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); }
@ -1410,3 +1410,13 @@ void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1
void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 0, 2); }
void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 1, 0); }
void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 1, 1); }
// mods
void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x78, true, 0); }
void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x79, true, 0); }
void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x58, true, 0); }
void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x59, true, 0); }
// supportYMM = true
// vpblendvb, vpsllw-vpsrlq