GSdx: still working on the rasterizer, would be nice to add some avx code there, but it's just so unfitting for anything.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4407 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-03-09 11:52:53 +00:00
parent c1e844b7f7
commit 9586e38dd4
18 changed files with 343 additions and 319 deletions

View File

@ -23,6 +23,7 @@
#include "stdafx.h"
#include "GPUDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
static const int _args = 8;
static const int _top = _args + 4;
@ -152,7 +153,7 @@ void GPUDrawScanlineCodeGenerator::Init()
// GSVector4i vt = GSVector4i(v.t).xxzzl();
cvttps2dq(xmm4, ptr[edx + 32]);
cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]);
pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
// s = vt.xxxx().add16(m_local.d.s);
@ -186,7 +187,7 @@ void GPUDrawScanlineCodeGenerator::Init()
{
// GSVector4i vc = GSVector4i(v.c).xxzzlh();
cvttps2dq(xmm6, ptr[edx]);
cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]);
pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));

View File

@ -22,8 +22,8 @@
// TODO: x64
#include "stdafx.h"
#include "GSVertexSW.h"
#include "GPUSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
using namespace Xbyak;
@ -50,7 +50,7 @@ void GPUSetupPrimCodeGenerator::Generate()
{
// t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001();
cvttps2dq(xmm1, ptr[ecx + sizeof(GSVertexSW) * 1 + 32]);
cvttps2dq(xmm1, ptr[ecx + sizeof(GSVertexSW) * 1 + offsetof(GSVertexSW, t)]);
psrld(xmm1, 8);
psrld(xmm0, 31);
psubd(xmm1, xmm0);
@ -94,8 +94,8 @@ void GPUSetupPrimCodeGenerator::Generate()
// GSVector4 dt = dscan.t;
// GSVector4 dc = dscan.c;
movaps(xmm4, ptr[edx]);
movaps(xmm3, ptr[edx + 32]);
movaps(xmm4, ptr[edx + offsetof(GSVertexSW, c)]);
movaps(xmm3, ptr[edx + offsetof(GSVertexSW, t)]);
// GSVector4i dtc8 = GSVector4i(dt * 8.0f).ps32(GSVector4i(dc * 8.0f));

View File

@ -21,6 +21,7 @@
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
@ -264,7 +265,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
{
vmovaps(xmm0, ptr[r9 + 16]); // v.p
vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p
if(m_sel.fwrite && m_sel.fge)
{
@ -297,7 +298,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
if(m_sel.edge || m_sel.tfx != TFX_NONE)
{
vmovaps(xmm0, ptr[r9 + 32]); // v.t
vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t
}
if(m_sel.edge)
@ -361,7 +362,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
// GSVector4i vc = GSVector4i(v.c);
vcvttps2dq(xmm0, ptr[r9]); // v.c
vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c
// vc = vc.upl16(vc.zwxy());

View File

@ -24,6 +24,7 @@
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
@ -296,7 +297,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
{
vmovaps(xmm0, ptr[ebx + 16]); // v.p
vmovaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
if(m_sel.fwrite && m_sel.fge)
{
@ -333,7 +334,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
if(m_sel.edge || m_sel.tfx != TFX_NONE)
{
vmovaps(xmm4, ptr[ebx + 32]); // v.t
vmovaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
}
if(m_sel.edge)
@ -410,7 +411,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
// GSVector4i vc = GSVector4i(v.c);
vcvttps2dq(xmm6, ptr[ebx]); // v.c
vcvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
// vc = vc.upl16(vc.zwxy());

View File

@ -21,6 +21,7 @@
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
@ -293,7 +294,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
{
movaps(xmm0, ptr[ebx + 16]); // v.p
movaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
if(m_sel.fwrite && m_sel.fge)
{
@ -330,7 +331,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
if(m_sel.edge || m_sel.tfx != TFX_NONE)
{
movaps(xmm4, ptr[ebx + 32]); // v.t
movaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
}
if(m_sel.edge)
@ -410,7 +411,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
// GSVector4i vc = GSVector4i(v.c);
cvttps2dq(xmm6, ptr[ebx]); // v.c
cvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
// vc = vc.upl16(vc.zwxy());

View File

@ -44,13 +44,13 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds)
, m_id(0)
, m_threads(1)
{
m_edge.buff = (GSScanline*)vmalloc(sizeof(GSScanline) * 2048, false);
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
m_edge.count = 0;
}
GSRasterizer::~GSRasterizer()
{
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSScanline) * 2048);
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048);
delete m_ds;
}
@ -119,8 +119,6 @@ void GSRasterizer::GetStats(GSRasterizerStats& stats)
void GSRasterizer::DrawPoint(const GSVertexSW* v)
{
// TODO: round to closest for point, prestep for line
GSVector4i p(v->p);
if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
@ -142,22 +140,20 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
GSVector4 dp = dv.p.abs();
int i = (dp < dp.yxwz()).mask() & 1; // |dx| <= |dy|
if(m_ds->IsEdge())
{
int i = (dp < dp.yxwz()).mask() & 1; // |x| <= |y|
GSVertexSW dscan;
dscan.p = GSVector4::zero();
dscan.t = GSVector4::zero();
dscan.c = GSVector4::zero();
m_ds->SetupPrim(v, dscan);
DrawEdge(v[0], v[1], dv, i, 0);
DrawEdge(v[0], v[1], dv, i, 1);
FlushEdge();
Flush(v, dscan, true);
return;
}
@ -188,34 +184,60 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
{
GSVertexSW dscan = dv / dv.p.xxxx();
m_ds->SetupPrim(v, dscan);
l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y
DrawTriangleSection(p.y, p.y + 1, l, dl, dscan);
Flush();
Flush(v, dscan);
}
}
return;
}
int i = dpi.x > dpi.y ? 0 : 1;
GSVertexSW edge = v[0];
GSVertexSW dedge = dv / dp.v[i];
// TODO: prestep + clip with the scissor
// TODO: inline drawpoint + Flush()
int steps = dpi.v[i];
while(steps-- > 0)
if(steps > 0)
{
DrawPoint(&edge);
GSVertexSW edge = v[0];
GSVertexSW dedge = dv / GSVector4(dp.v[i]);
edge += dedge;
GSVertexSW* RESTRICT e = m_edge.buff;
while(1)
{
GSVector4i p(edge.p);
if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
{
if(IsOneOfMyScanlines(p.y))
{
*e = edge;
e->p.i16[0] = (int16)p.x;
e->p.i16[1] = (int16)p.y;
e->p.i16[2] = (int16)(p.x + 1);
e++;
}
}
if(--steps == 0) break;
edge += dedge;
}
m_edge.count = e - m_edge.buff;
m_stats.pixels += m_edge.count;
GSVertexSW dscan;
dscan.p = GSVector4::zero();
dscan.t = GSVector4::zero();
dscan.c = GSVector4::zero();
Flush(v, dscan);
}
}
@ -233,7 +255,7 @@ static const int s_abc[8][4] =
void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
{
// edge buffer is used here to avoid xmm save-restores (except when we do aa1 in the middle)
// TODO: GSVertexSW::c/t could be merged into a GSVector8
GSVertexSW v[4];
GSVertexSW dv[3];
@ -276,7 +298,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
case 1: // a == b < c
ddv[1] = dv[1] / dv[1].p.yyyy();
ddv[2] = dv[2] / dv[2].p.yyyy();
longest = dv[0];
longest = dv[0]; // should be negated to be equal to "ddv[1] * dv[0].p.yyyy() - dv[0]", but it's easier to change the index of v/ddv later
break;
case 4: // a < b == c
ddv[0] = dv[0] / dv[0].p.yyyy();
@ -319,9 +341,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
dscan.t = GSVector4::zero();
dscan.c = GSVector4::zero();
m_ds->SetupPrim(v, dscan);
FlushEdge();
Flush(v, dscan, true);
}
switch(i)
@ -402,50 +422,42 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
__assume(0);
}
m_ds->SetupPrim(v, dscan);
Flush();
Flush(v, dscan);
}
void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan)
{
ASSERT(top < bottom);
GSScanline* RESTRICT e = &m_edge.buff[m_edge.count];
GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
while(1)
{
do
if(IsOneOfMyScanlines(top))
{
if(IsOneOfMyScanlines(top))
GSVector4 lrf = l.p.ceil();
GSVector4 lrmax = lrf.max(m_fscissor.xxxx());
GSVector4 lrmin = lrf.min(m_fscissor.zzzz());
GSVector4i lr = GSVector4i(lrmax.xxyy(lrmin));
int left = lr.extract32<0>();
int right = lr.extract32<2>();
int pixels = right - left;
if(pixels > 0)
{
GSVector4 lr = l.p.ceil();
m_stats.pixels += pixels;
GSVector4 lrmax = lr.max(m_fscissor.xxxx());
GSVector4 lrmin = lr.min(m_fscissor.zzzz());
*e = l + dscan * (lrmax - l.p).xxxx();
GSVector4i lri = GSVector4i(lrmax.xxyy(lrmin));
e->p.i16[0] = (int16)left;
e->p.i16[1] = (int16)top;
e->p.i16[2] = (int16)right;
int left = lri.extract32<0>();
int right = lri.extract32<2>();
int pixels = right - left;
if(pixels > 0)
{
m_stats.pixels += pixels;
e->scan = l + dscan * (lrmax - l.p).xxxx();
e->p.left = left;
e->p.top = top;
e->p.right = right;
e++;
}
e++;
}
}
while(0);
if(++top >= bottom) break;
@ -508,7 +520,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
m_ds->SetupPrim(v, dscan);
for(; r.top < r.bottom; r.top++, scan.t += dedge.t)
while(1)
{
if(IsOneOfMyScanlines(r.top))
{
@ -516,6 +528,10 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
m_ds->DrawScanline(r.right, r.left, r.top, scan);
}
if(++r.top >= r.bottom) break;
scan.t += dedge.t;
}
}
@ -531,7 +547,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
// TODO: bit slow and too much duplicated code
// TODO: inner pre-step is still missing (hardly noticable)
GSScanline* RESTRICT dst = &m_edge.buff[m_edge.count];
GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
GSVector4 lrtb = v0.p.upl(v1.p).ceil();
@ -540,7 +556,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
GSVector4 tbmax = lrtb.max(m_fscissor.yyyy());
GSVector4 tbmin = lrtb.min(m_fscissor.wwww());
GSVector4i tbi = GSVector4i(tbmax.zwzw(tbmin));
GSVector4i tb = GSVector4i(tbmax.zwzw(tbmin));
int top, bottom;
@ -548,8 +564,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
if((dv.p >= GSVector4::zero()).mask() & 2)
{
top = tbi.extract32<0>();
bottom = tbi.extract32<3>();
top = tb.extract32<0>();
bottom = tb.extract32<3>();
if(top >= bottom) return;
@ -560,8 +576,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
}
else
{
top = tbi.extract32<1>();
bottom = tbi.extract32<2>();
top = tb.extract32<1>();
bottom = tb.extract32<2>();
if(top >= bottom) return;
@ -580,26 +596,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
while(1)
{
do
int xi = x >> 16;
int xf = x & 0xffff;
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
{
int xi = x >> 16;
int xf = x & 0xffff;
m_stats.pixels++;
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
{
m_stats.pixels++;
*e = edge;
dst->scan = edge;
dst->scan.t.u32[3] = (0x10000 - xf) & 0xffff;
e->t.u32[3] = (0x10000 - xf) & 0xffff;
dst->p.left = xi;
dst->p.top = top;
dst->p.right = xi + 1;
e->p.i16[0] = (int16)xi;
e->p.i16[1] = (int16)top;
e->p.i16[2] = (int16)(xi + 1);
dst++;
}
e++;
}
while(0);
if(++top >= bottom) break;
@ -611,26 +624,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
while(1)
{
do
int xi = (x >> 16) + 1;
int xf = x & 0xffff;
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
{
int xi = (x >> 16) + 1;
int xf = x & 0xffff;
m_stats.pixels++;
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
{
m_stats.pixels++;
*e = edge;
dst->scan = edge;
dst->scan.t.u32[3] = xf;
e->t.u32[3] = xf;
dst->p.left = xi;
dst->p.top = top;
dst->p.right = xi + 1;
e->p.i16[0] = (int16)xi;
e->p.i16[1] = (int16)top;
e->p.i16[2] = (int16)(xi + 1);
dst++;
}
e++;
}
while(0);
if(++top >= bottom) break;
@ -644,7 +654,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
GSVector4 lrmax = lrtb.max(m_fscissor.xxxx());
GSVector4 lrmin = lrtb.min(m_fscissor.zzzz());
GSVector4i lri = GSVector4i(lrmax.xyxy(lrmin));
GSVector4i lr = GSVector4i(lrmax.xyxy(lrmin));
int left, right;
@ -652,8 +662,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
if((dv.p >= GSVector4::zero()).mask() & 1)
{
left = lri.extract32<0>();
right = lri.extract32<3>();
left = lr.extract32<0>();
right = lr.extract32<3>();
if(left >= right) return;
@ -664,8 +674,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
}
else
{
left = lri.extract32<1>();
right = lri.extract32<2>();
left = lr.extract32<1>();
right = lr.extract32<2>();
if(left >= right) return;
@ -684,26 +694,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
while(1)
{
do
int yi = y >> 16;
int yf = y & 0xffff;
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
{
int yi = y >> 16;
int yf = y & 0xffff;
m_stats.pixels++;
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
{
m_stats.pixels++;
*e = edge;
e->t.u32[3] = (0x10000 - yf) & 0xffff;
dst->scan = edge;
dst->scan.t.u32[3] = (0x10000 - yf) & 0xffff;
e->p.i16[0] = (int16)left;
e->p.i16[1] = (int16)yi;
e->p.i16[2] = (int16)(left + 1);
dst->p.left = left;
dst->p.top = yi;
dst->p.right = left + 1;
dst++;
}
e++;
}
while(0);
if(++left >= right) break;
@ -715,26 +722,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
{
while(1)
{
do
int yi = (y >> 16) + 1;
int yf = y & 0xffff;
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
{
int yi = (y >> 16) + 1;
int yf = y & 0xffff;
m_stats.pixels++;
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
{
m_stats.pixels++;
*e = edge;
e->t.u32[3] = yf;
dst->scan = edge;
dst->scan.t.u32[3] = yf;
e->p.i16[0] = (int16)left;
e->p.i16[1] = (int16)yi;
e->p.i16[2] = (int16)(left + 1);
dst->p.left = left;
dst->p.top = yi;
dst->p.right = left + 1;
dst++;
}
e++;
}
while(0);
if(++left >= right) break;
@ -744,33 +748,36 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
}
}
m_edge.count += dst - &m_edge.buff[m_edge.count];
m_edge.count += e - &m_edge.buff[m_edge.count];
}
void GSRasterizer::Flush()
void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge)
{
// TODO: on win64 this could be the place where xmm6-15 are preserved (not by each DrawScanline)
const GSScanline* s = m_edge.buff;
int count = m_edge.count;
for(int count = m_edge.count; count > 0; count--, s++)
if(count > 0)
{
m_ds->DrawScanline(s->p.right, s->p.left, s->p.top, s->scan);
m_ds->SetupPrim(vertices, dscan);
const GSVertexSW* RESTRICT e = m_edge.buff;
int i = 0;
if(!edge)
{
do {m_ds->DrawScanline(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);}
while(++i < count);
}
else
{
do {m_ds->DrawEdge(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);}
while(++i < count);
}
m_edge.count = 0;
}
m_edge.count = 0;
}
void GSRasterizer::FlushEdge()
{
const GSScanline* s = m_edge.buff;
for(int count = m_edge.count; count > 0; count--, s++)
{
m_ds->DrawEdge(s->p.right, s->p.left, s->p.top, s->scan);
}
m_edge.count = 0;
}
//

View File

@ -81,8 +81,6 @@ public:
__aligned(class, 32) GSRasterizer : public GSAlignedClass<32>, public IRasterizer
{
struct GSScanline {GSVertexSW scan; GSVector4i p;};
protected:
IDrawScanline* m_ds;
int m_id;
@ -90,7 +88,7 @@ protected:
GSRasterizerStats m_stats;
GSVector4i m_scissor;
GSVector4 m_fscissor;
struct {GSScanline* buff; int count;} m_edge;
struct {GSVertexSW* buff; int count;} m_edge;
void DrawPoint(const GSVertexSW* v);
void DrawLine(const GSVertexSW* v);
@ -104,8 +102,7 @@ protected:
__forceinline bool IsOneOfMyScanlines(int scanline) const;
void Flush();
void FlushEdge();
__forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false);
public:
GSRasterizer(IDrawScanline* ds);

View File

@ -21,6 +21,7 @@
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
@ -70,7 +71,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[rdx + 16]);
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -122,7 +123,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = vertices[0].p;
vmovaps(xmm0, ptr[rcx + 16]);
vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -179,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture()
// GSVector4 t = dscan.t;
vmovaps(xmm0, ptr[rdx + 32]);
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
vmulps(xmm1, xmm0, xmm3);
@ -249,7 +250,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[rdx]);
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -289,7 +290,7 @@ void GSSetupPrimCodeGenerator::Color()
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
@ -321,7 +322,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4i c = GSVector4i(vertices[0].c);
vcvttps2dq(xmm0, ptr[rcx]);
vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());

View File

@ -21,6 +21,7 @@
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
@ -68,7 +69,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[rdx + 16]);
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -125,7 +126,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = vertices[0].p;
movaps(xmm0, ptr[rcx + 16]);
movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -183,7 +184,7 @@ void GSSetupPrimCodeGenerator::Texture()
// GSVector4 t = dscan.t;
movaps(xmm0, ptr[rdx + 32]);
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
@ -256,7 +257,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[rdx]);
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -300,7 +301,7 @@ void GSSetupPrimCodeGenerator::Color()
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
@ -335,7 +336,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4i c = GSVector4i(vertices[0].c);
cvttps2dq(xmm0, ptr[rcx]);
cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());

View File

@ -21,6 +21,7 @@
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
@ -56,7 +57,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[edx + 16]);
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -108,7 +109,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = vertices[0].p;
vmovaps(xmm0, ptr[ecx + 16]);
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -163,7 +164,7 @@ void GSSetupPrimCodeGenerator::Texture()
// GSVector4 t = dscan.t;
vmovaps(xmm0, ptr[edx + 32]);
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
vmulps(xmm1, xmm0, xmm3);
@ -233,7 +234,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx]);
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -273,7 +274,7 @@ void GSSetupPrimCodeGenerator::Color()
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
@ -305,7 +306,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4i c = GSVector4i(vertices[0].c);
vcvttps2dq(xmm0, ptr[ecx]);
vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());

View File

@ -21,6 +21,7 @@
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
@ -56,7 +57,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[edx + 16]);
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -113,7 +114,7 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = vertices[0].p;
movaps(xmm0, ptr[ecx + 16]);
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
if(m_en.f)
{
@ -168,7 +169,7 @@ void GSSetupPrimCodeGenerator::Texture()
// GSVector4 t = dscan.t;
movaps(xmm0, ptr[edx + 32]);
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
@ -241,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]);
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -285,7 +286,7 @@ void GSSetupPrimCodeGenerator::Color()
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
@ -320,7 +321,7 @@ void GSSetupPrimCodeGenerator::Color()
{
// GSVector4i c = GSVector4i(vertices[0].c);
movaps(xmm0, ptr[ecx]);
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
cvttps2dq(xmm0, xmm0);
// c = c.upl16(c.zwxy());

View File

@ -3030,6 +3030,8 @@ public:
uint32 u32[8];
uint64 u64[4];
__m256 m;
// TODO: _M_SSE < 0x500 => union {__m128 m0, m1;}; and replace each function with a pair of 128 bit intructions
};
__forceinline GSVector8()
@ -3050,7 +3052,7 @@ public:
__forceinline GSVector8(__m128 m0, __m128 m1)
{
m = zero().insert<0>(m0).insert<1>(m1);
m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);
}
__forceinline GSVector8(const GSVector8& v)
@ -3065,7 +3067,8 @@ public:
__forceinline explicit GSVector8(__m128 m)
{
this->m = zero().insert<0>(m).insert<1>(m);
this->m = _mm256_castps128_ps256(m);
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
}
__forceinline explicit GSVector8(__m256 m)
@ -3087,7 +3090,8 @@ public:
__forceinline void operator = (__m128 m)
{
this->m = zero().insert<0>(m).insert<1>(m);
this->m = _mm256_castps128_ps256(m);
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
}
__forceinline void operator = (__m256 m)
@ -3104,7 +3108,7 @@ public:
__forceinline GSVector8 abs() const
{
return *this & cast(GSVector8i(GSVector4i::x7fffffff()));
return *this & cast(GSVector8i(GSVector4i::x7fffffff())); // TODO: add GSVector8 consts
}
__forceinline GSVector8 neg() const
@ -3143,17 +3147,27 @@ public:
// TODO
__forceinline GSVector8 min(const GSVector8& a) const
{
return GSVector8(_mm256_min_ps(m, a));
}
__forceinline GSVector8 max(const GSVector8& a) const
{
return GSVector8(_mm256_max_ps(m, a));
}
__forceinline GSVector8 blend8(const GSVector8& a, const GSVector8& mask) const
{
return GSVector8(_mm256_blendv_ps(m, a, mask));
}
__forceinline GSVector8 upl32(const GSVector8& a) const
__forceinline GSVector8 upl(const GSVector8& a) const
{
return GSVector8(_mm256_unpacklo_ps(m, a));
}
__forceinline GSVector8 uph32(const GSVector8& a) const
__forceinline GSVector8 uph(const GSVector8& a) const
{
return GSVector8(_mm256_unpackhi_ps(m, a));
}
@ -3392,6 +3406,23 @@ public:
return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ));
}
#define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \
__forceinline GSVector8 xs##ys() const {return GSVector8(_mm256_permute2f128_ps(m, m, xn | (yn << 4)));} \
__forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, xn | (yn << 4)));} \
#define VECTOR8_PERMUTE_1(xs, xn) \
VECTOR8_PERMUTE_2(xs, xn, x, 0) \
VECTOR8_PERMUTE_2(xs, xn, y, 1) \
VECTOR8_PERMUTE_2(xs, xn, z, 2) \
VECTOR8_PERMUTE_2(xs, xn, w, 3) \
VECTOR8_PERMUTE_2(xs, xn, _, 8) \
VECTOR8_PERMUTE_1(x, 0)
VECTOR8_PERMUTE_1(y, 1)
VECTOR8_PERMUTE_1(z, 2)
VECTOR8_PERMUTE_1(w, 3)
VECTOR8_PERMUTE_1(_, 8)
#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \

View File

@ -25,10 +25,10 @@
__aligned(struct, 16) GSVertexSW
{
GSVector4 c, p, t;
GSVector4 p, t, c;
GSVertexSW() {}
GSVertexSW(const GSVertexSW& v) {*this = v;}
__forceinline GSVertexSW() {}
__forceinline GSVertexSW(const GSVertexSW& v) {*this = v;}
__forceinline void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;}
__forceinline void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;}
@ -37,8 +37,6 @@ __aligned(struct, 16) GSVertexSW
friend GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2);
friend GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv);
friend GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv);
friend GSVertexSW operator * (const GSVertexSW& v, float f);
friend GSVertexSW operator / (const GSVertexSW& v, float f);
static bool IsQuad(const GSVertexSW* v, int& tl, int& br)
{
@ -192,22 +190,3 @@ __forceinline GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv)
return v0;
}
__forceinline GSVertexSW operator * (const GSVertexSW& v, float f)
{
GSVertexSW v0;
GSVector4 vf(f);
v0.c = v.c * vf;
v0.p = v.p * vf;
v0.t = v.t * vf;
return v0;
}
__forceinline GSVertexSW operator / (const GSVertexSW& v, float f)
{
GSVertexSW v0;
GSVector4 vf(f);
v0.c = v.c / vf;
v0.p = v.p / vf;
v0.t = v.t / vf;
return v0;
}

View File

@ -31,9 +31,11 @@ class GSState;
__aligned(class, 32) GSVertexTrace
{
public:
struct Vertex {GSVector4i c; GSVector4 p, t;};
struct VertexAlpha {int min, max; bool valid;};
private:
typedef void (*VertexTracePtr)(int count, const void* v, Vertex& min, Vertex& max);
class CGSW : public GSCodeGenerator

View File

@ -90,7 +90,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
@ -101,7 +101,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
vminps(xmm2, xmm0);
vmaxps(xmm3, xmm0);
@ -110,7 +110,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
@ -120,7 +120,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
if(!fst)
{
@ -149,20 +149,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
{
vcvttps2dq(xmm2, xmm2);
vpsrld(xmm2, 7);
vmovaps(ptr[r8], xmm2);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vcvttps2dq(xmm3, xmm3);
vpsrld(xmm3, 7);
vmovaps(ptr[r9], xmm3);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
vmovaps(ptr[r8 + 16], xmm4);
vmovaps(ptr[r9 + 16], xmm5);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
vmovaps(ptr[r8 + 32], xmm6);
vmovaps(ptr[r9 + 32], xmm7);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
vmovdqa(xmm6, ptr[rsp + 0]);
@ -239,7 +239,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
@ -248,7 +248,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
@ -260,7 +260,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(color && (iip || j == n - 1) || tme)
{
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
}
if(color && (iip || j == n - 1))
@ -309,15 +309,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[r8], xmm2);
vmovaps(ptr[r9], xmm3);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
vmovaps(ptr[r8 + 16], xmm4);
vmovaps(ptr[r9 + 16], xmm5);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
@ -327,8 +327,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[r8 + 32], xmm6);
vmovaps(ptr[r9 + 32], xmm7);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
vmovdqa(xmm6, ptr[rsp + 0]);
@ -463,8 +463,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[r8], xmm2);
vmovaps(ptr[r9], xmm3);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
@ -473,16 +473,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[r8 + 16], xmm4);
vmovaps(ptr[r9 + 16], xmm5);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
vmovaps(ptr[r8 + 32], xmm6);
vmovaps(ptr[r9 + 32], xmm7);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
vmovdqa(xmm6, ptr[rsp + 0]);

View File

@ -93,7 +93,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
@ -104,7 +104,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
minps(xmm2, xmm0);
maxps(xmm3, xmm0);
@ -113,7 +113,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
@ -123,7 +123,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
if(!fst)
{
@ -153,20 +153,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
{
cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7);
movaps(ptr[r8], xmm2);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7);
movaps(ptr[r9], xmm3);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
movaps(ptr[r8 + 16], xmm4);
movaps(ptr[r9 + 16], xmm5);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
movaps(ptr[r8 + 32], xmm6);
movaps(ptr[r9 + 32], xmm7);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
movdqa(xmm6, ptr[rsp + 0]);
@ -246,7 +246,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
@ -255,7 +255,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
@ -268,7 +268,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
}
if(color && (iip || j == n - 1))
@ -330,15 +330,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
punpcklwd(xmm3, xmm0);
}
movaps(ptr[r8], xmm2);
movaps(ptr[r9], xmm3);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
movaps(ptr[r8 + 16], xmm4);
movaps(ptr[r9 + 16], xmm5);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
@ -348,8 +348,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[r8 + 32], xmm6);
movaps(ptr[r9 + 32], xmm7);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
movdqa(xmm6, ptr[rsp + 0]);
@ -510,8 +510,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
punpcklwd(xmm3, xmm0);
}
movaps(ptr[r8], xmm2);
movaps(ptr[r9], xmm3);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
@ -520,16 +520,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[r8 + 16], xmm4);
movaps(ptr[r9 + 16], xmm5);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
movaps(ptr[r8 + 32], xmm6);
movaps(ptr[r9 + 32], xmm7);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
movdqa(xmm6, ptr[rsp + 0]);

View File

@ -92,7 +92,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
@ -103,7 +103,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
vminps(xmm2, xmm0);
vmaxps(xmm3, xmm0);
@ -112,7 +112,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
@ -122,7 +122,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
if(!fst)
{
@ -154,20 +154,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
{
vcvttps2dq(xmm2, xmm2);
vpsrld(xmm2, 7);
vmovaps(ptr[eax], xmm2);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vcvttps2dq(xmm3, xmm3);
vpsrld(xmm3, 7);
vmovaps(ptr[edx], xmm3);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
vmovaps(ptr[eax + 16], xmm4);
vmovaps(ptr[edx + 16], xmm5);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
vmovaps(ptr[eax + 32], xmm6);
vmovaps(ptr[edx + 32], xmm7);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
ret();
@ -235,7 +235,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
@ -244,7 +244,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
@ -256,7 +256,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(color && (iip || j == n - 1) || tme)
{
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
}
if(color && (iip || j == n - 1))
@ -308,15 +308,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[eax], xmm2);
vmovaps(ptr[edx], xmm3);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
vmovaps(ptr[eax + 16], xmm4);
vmovaps(ptr[edx + 16], xmm5);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
@ -326,8 +326,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[eax + 32], xmm6);
vmovaps(ptr[edx + 32], xmm7);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
ret();
@ -456,8 +456,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[eax], xmm2);
vmovaps(ptr[edx], xmm3);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
@ -466,16 +466,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[eax + 16], xmm4);
vmovaps(ptr[edx + 16], xmm5);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
vmovaps(ptr[eax + 32], xmm6);
vmovaps(ptr[edx + 32], xmm7);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
ret();

View File

@ -95,7 +95,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
@ -106,7 +106,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
minps(xmm2, xmm0);
maxps(xmm3, xmm0);
@ -115,7 +115,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
@ -125,7 +125,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
if(!fst)
{
@ -158,20 +158,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
{
cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7);
movaps(ptr[eax], xmm2);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7);
movaps(ptr[edx], xmm3);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
ret();
@ -242,7 +242,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
@ -251,7 +251,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
@ -264,7 +264,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
}
if(color && (iip || j == n - 1))
@ -329,15 +329,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
punpcklwd(xmm3, xmm0);
}
movaps(ptr[eax], xmm2);
movaps(ptr[edx], xmm3);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
@ -347,8 +347,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
ret();
@ -503,8 +503,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
punpcklwd(xmm3, xmm0);
}
movaps(ptr[eax], xmm2);
movaps(ptr[edx], xmm3);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
@ -513,16 +513,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
ret();