mirror of https://github.com/PCSX2/pcsx2.git
GSdx: still working on the rasterizer, would be nice to add some avx code there, but it's just so unfitting for anything.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4407 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
c1e844b7f7
commit
9586e38dd4
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GPUDrawScanlineCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
static const int _args = 8;
|
||||
static const int _top = _args + 4;
|
||||
|
@ -152,7 +153,7 @@ void GPUDrawScanlineCodeGenerator::Init()
|
|||
|
||||
// GSVector4i vt = GSVector4i(v.t).xxzzl();
|
||||
|
||||
cvttps2dq(xmm4, ptr[edx + 32]);
|
||||
cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
// s = vt.xxxx().add16(m_local.d.s);
|
||||
|
@ -186,7 +187,7 @@ void GPUDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
// GSVector4i vc = GSVector4i(v.c).xxzzlh();
|
||||
|
||||
cvttps2dq(xmm6, ptr[edx]);
|
||||
cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
|
|
|
@ -22,8 +22,8 @@
|
|||
// TODO: x64
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GPUSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
|
@ -50,7 +50,7 @@ void GPUSetupPrimCodeGenerator::Generate()
|
|||
{
|
||||
// t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001();
|
||||
|
||||
cvttps2dq(xmm1, ptr[ecx + sizeof(GSVertexSW) * 1 + 32]);
|
||||
cvttps2dq(xmm1, ptr[ecx + sizeof(GSVertexSW) * 1 + offsetof(GSVertexSW, t)]);
|
||||
psrld(xmm1, 8);
|
||||
psrld(xmm0, 31);
|
||||
psubd(xmm1, xmm0);
|
||||
|
@ -94,8 +94,8 @@ void GPUSetupPrimCodeGenerator::Generate()
|
|||
// GSVector4 dt = dscan.t;
|
||||
// GSVector4 dc = dscan.c;
|
||||
|
||||
movaps(xmm4, ptr[edx]);
|
||||
movaps(xmm3, ptr[edx + 32]);
|
||||
movaps(xmm4, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm3, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
// GSVector4i dtc8 = GSVector4i(dt * 8.0f).ps32(GSVector4i(dc * 8.0f));
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -264,7 +265,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
|
||||
{
|
||||
vmovaps(xmm0, ptr[r9 + 16]); // v.p
|
||||
vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p
|
||||
|
||||
if(m_sel.fwrite && m_sel.fge)
|
||||
{
|
||||
|
@ -297,7 +298,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
if(m_sel.edge || m_sel.tfx != TFX_NONE)
|
||||
{
|
||||
vmovaps(xmm0, ptr[r9 + 32]); // v.t
|
||||
vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t
|
||||
}
|
||||
|
||||
if(m_sel.edge)
|
||||
|
@ -361,7 +362,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
// GSVector4i vc = GSVector4i(v.c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[r9]); // v.c
|
||||
vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c
|
||||
|
||||
// vc = vc.upl16(vc.zwxy());
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -296,7 +297,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
|
||||
{
|
||||
vmovaps(xmm0, ptr[ebx + 16]); // v.p
|
||||
vmovaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
|
||||
|
||||
if(m_sel.fwrite && m_sel.fge)
|
||||
{
|
||||
|
@ -333,7 +334,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
if(m_sel.edge || m_sel.tfx != TFX_NONE)
|
||||
{
|
||||
vmovaps(xmm4, ptr[ebx + 32]); // v.t
|
||||
vmovaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
|
||||
}
|
||||
|
||||
if(m_sel.edge)
|
||||
|
@ -410,7 +411,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
// GSVector4i vc = GSVector4i(v.c);
|
||||
|
||||
vcvttps2dq(xmm6, ptr[ebx]); // v.c
|
||||
vcvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
|
||||
|
||||
// vc = vc.upl16(vc.zwxy());
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -293,7 +294,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
|
||||
{
|
||||
movaps(xmm0, ptr[ebx + 16]); // v.p
|
||||
movaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
|
||||
|
||||
if(m_sel.fwrite && m_sel.fge)
|
||||
{
|
||||
|
@ -330,7 +331,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
if(m_sel.edge || m_sel.tfx != TFX_NONE)
|
||||
{
|
||||
movaps(xmm4, ptr[ebx + 32]); // v.t
|
||||
movaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
|
||||
}
|
||||
|
||||
if(m_sel.edge)
|
||||
|
@ -410,7 +411,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
|||
{
|
||||
// GSVector4i vc = GSVector4i(v.c);
|
||||
|
||||
cvttps2dq(xmm6, ptr[ebx]); // v.c
|
||||
cvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
|
||||
|
||||
// vc = vc.upl16(vc.zwxy());
|
||||
|
||||
|
|
|
@ -44,13 +44,13 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds)
|
|||
, m_id(0)
|
||||
, m_threads(1)
|
||||
{
|
||||
m_edge.buff = (GSScanline*)vmalloc(sizeof(GSScanline) * 2048, false);
|
||||
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
|
||||
m_edge.count = 0;
|
||||
}
|
||||
|
||||
GSRasterizer::~GSRasterizer()
|
||||
{
|
||||
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSScanline) * 2048);
|
||||
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048);
|
||||
|
||||
delete m_ds;
|
||||
}
|
||||
|
@ -119,8 +119,6 @@ void GSRasterizer::GetStats(GSRasterizerStats& stats)
|
|||
|
||||
void GSRasterizer::DrawPoint(const GSVertexSW* v)
|
||||
{
|
||||
// TODO: round to closest for point, prestep for line
|
||||
|
||||
GSVector4i p(v->p);
|
||||
|
||||
if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
|
||||
|
@ -142,22 +140,20 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
|
|||
|
||||
GSVector4 dp = dv.p.abs();
|
||||
|
||||
int i = (dp < dp.yxwz()).mask() & 1; // |dx| <= |dy|
|
||||
|
||||
if(m_ds->IsEdge())
|
||||
{
|
||||
int i = (dp < dp.yxwz()).mask() & 1; // |x| <= |y|
|
||||
|
||||
GSVertexSW dscan;
|
||||
|
||||
dscan.p = GSVector4::zero();
|
||||
dscan.t = GSVector4::zero();
|
||||
dscan.c = GSVector4::zero();
|
||||
|
||||
m_ds->SetupPrim(v, dscan);
|
||||
|
||||
DrawEdge(v[0], v[1], dv, i, 0);
|
||||
DrawEdge(v[0], v[1], dv, i, 1);
|
||||
|
||||
FlushEdge();
|
||||
Flush(v, dscan, true);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -188,34 +184,60 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
|
|||
{
|
||||
GSVertexSW dscan = dv / dv.p.xxxx();
|
||||
|
||||
m_ds->SetupPrim(v, dscan);
|
||||
|
||||
l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y
|
||||
|
||||
DrawTriangleSection(p.y, p.y + 1, l, dl, dscan);
|
||||
|
||||
Flush();
|
||||
Flush(v, dscan);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int i = dpi.x > dpi.y ? 0 : 1;
|
||||
|
||||
GSVertexSW edge = v[0];
|
||||
GSVertexSW dedge = dv / dp.v[i];
|
||||
|
||||
// TODO: prestep + clip with the scissor
|
||||
// TODO: inline drawpoint + Flush()
|
||||
|
||||
int steps = dpi.v[i];
|
||||
|
||||
while(steps-- > 0)
|
||||
if(steps > 0)
|
||||
{
|
||||
DrawPoint(&edge);
|
||||
GSVertexSW edge = v[0];
|
||||
GSVertexSW dedge = dv / GSVector4(dp.v[i]);
|
||||
|
||||
edge += dedge;
|
||||
GSVertexSW* RESTRICT e = m_edge.buff;
|
||||
|
||||
while(1)
|
||||
{
|
||||
GSVector4i p(edge.p);
|
||||
|
||||
if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
|
||||
{
|
||||
if(IsOneOfMyScanlines(p.y))
|
||||
{
|
||||
*e = edge;
|
||||
|
||||
e->p.i16[0] = (int16)p.x;
|
||||
e->p.i16[1] = (int16)p.y;
|
||||
e->p.i16[2] = (int16)(p.x + 1);
|
||||
|
||||
e++;
|
||||
}
|
||||
}
|
||||
|
||||
if(--steps == 0) break;
|
||||
|
||||
edge += dedge;
|
||||
}
|
||||
|
||||
m_edge.count = e - m_edge.buff;
|
||||
|
||||
m_stats.pixels += m_edge.count;
|
||||
|
||||
GSVertexSW dscan;
|
||||
|
||||
dscan.p = GSVector4::zero();
|
||||
dscan.t = GSVector4::zero();
|
||||
dscan.c = GSVector4::zero();
|
||||
|
||||
Flush(v, dscan);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -233,7 +255,7 @@ static const int s_abc[8][4] =
|
|||
|
||||
void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
|
||||
{
|
||||
// edge buffer is used here to avoid xmm save-restores (except when we do aa1 in the middle)
|
||||
// TODO: GSVertexSW::c/t could be merged into a GSVector8
|
||||
|
||||
GSVertexSW v[4];
|
||||
GSVertexSW dv[3];
|
||||
|
@ -276,7 +298,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
|
|||
case 1: // a == b < c
|
||||
ddv[1] = dv[1] / dv[1].p.yyyy();
|
||||
ddv[2] = dv[2] / dv[2].p.yyyy();
|
||||
longest = dv[0];
|
||||
longest = dv[0]; // should be negated to be equal to "ddv[1] * dv[0].p.yyyy() - dv[0]", but it's easier to change the index of v/ddv later
|
||||
break;
|
||||
case 4: // a < b == c
|
||||
ddv[0] = dv[0] / dv[0].p.yyyy();
|
||||
|
@ -319,9 +341,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
|
|||
dscan.t = GSVector4::zero();
|
||||
dscan.c = GSVector4::zero();
|
||||
|
||||
m_ds->SetupPrim(v, dscan);
|
||||
|
||||
FlushEdge();
|
||||
Flush(v, dscan, true);
|
||||
}
|
||||
|
||||
switch(i)
|
||||
|
@ -402,50 +422,42 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
|
|||
__assume(0);
|
||||
}
|
||||
|
||||
m_ds->SetupPrim(v, dscan);
|
||||
|
||||
Flush();
|
||||
Flush(v, dscan);
|
||||
}
|
||||
|
||||
void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan)
|
||||
{
|
||||
ASSERT(top < bottom);
|
||||
|
||||
GSScanline* RESTRICT e = &m_edge.buff[m_edge.count];
|
||||
GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
|
||||
|
||||
while(1)
|
||||
{
|
||||
do
|
||||
if(IsOneOfMyScanlines(top))
|
||||
{
|
||||
if(IsOneOfMyScanlines(top))
|
||||
GSVector4 lrf = l.p.ceil();
|
||||
GSVector4 lrmax = lrf.max(m_fscissor.xxxx());
|
||||
GSVector4 lrmin = lrf.min(m_fscissor.zzzz());
|
||||
GSVector4i lr = GSVector4i(lrmax.xxyy(lrmin));
|
||||
|
||||
int left = lr.extract32<0>();
|
||||
int right = lr.extract32<2>();
|
||||
|
||||
int pixels = right - left;
|
||||
|
||||
if(pixels > 0)
|
||||
{
|
||||
GSVector4 lr = l.p.ceil();
|
||||
m_stats.pixels += pixels;
|
||||
|
||||
GSVector4 lrmax = lr.max(m_fscissor.xxxx());
|
||||
GSVector4 lrmin = lr.min(m_fscissor.zzzz());
|
||||
*e = l + dscan * (lrmax - l.p).xxxx();
|
||||
|
||||
GSVector4i lri = GSVector4i(lrmax.xxyy(lrmin));
|
||||
e->p.i16[0] = (int16)left;
|
||||
e->p.i16[1] = (int16)top;
|
||||
e->p.i16[2] = (int16)right;
|
||||
|
||||
int left = lri.extract32<0>();
|
||||
int right = lri.extract32<2>();
|
||||
|
||||
int pixels = right - left;
|
||||
|
||||
if(pixels > 0)
|
||||
{
|
||||
m_stats.pixels += pixels;
|
||||
|
||||
e->scan = l + dscan * (lrmax - l.p).xxxx();
|
||||
|
||||
e->p.left = left;
|
||||
e->p.top = top;
|
||||
e->p.right = right;
|
||||
|
||||
e++;
|
||||
}
|
||||
e++;
|
||||
}
|
||||
}
|
||||
while(0);
|
||||
|
||||
if(++top >= bottom) break;
|
||||
|
||||
|
@ -508,7 +520,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
|
|||
|
||||
m_ds->SetupPrim(v, dscan);
|
||||
|
||||
for(; r.top < r.bottom; r.top++, scan.t += dedge.t)
|
||||
while(1)
|
||||
{
|
||||
if(IsOneOfMyScanlines(r.top))
|
||||
{
|
||||
|
@ -516,6 +528,10 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
|
|||
|
||||
m_ds->DrawScanline(r.right, r.left, r.top, scan);
|
||||
}
|
||||
|
||||
if(++r.top >= r.bottom) break;
|
||||
|
||||
scan.t += dedge.t;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -531,7 +547,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
// TODO: bit slow and too much duplicated code
|
||||
// TODO: inner pre-step is still missing (hardly noticable)
|
||||
|
||||
GSScanline* RESTRICT dst = &m_edge.buff[m_edge.count];
|
||||
GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
|
||||
|
||||
GSVector4 lrtb = v0.p.upl(v1.p).ceil();
|
||||
|
||||
|
@ -540,7 +556,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
GSVector4 tbmax = lrtb.max(m_fscissor.yyyy());
|
||||
GSVector4 tbmin = lrtb.min(m_fscissor.wwww());
|
||||
|
||||
GSVector4i tbi = GSVector4i(tbmax.zwzw(tbmin));
|
||||
GSVector4i tb = GSVector4i(tbmax.zwzw(tbmin));
|
||||
|
||||
int top, bottom;
|
||||
|
||||
|
@ -548,8 +564,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
|
||||
if((dv.p >= GSVector4::zero()).mask() & 2)
|
||||
{
|
||||
top = tbi.extract32<0>();
|
||||
bottom = tbi.extract32<3>();
|
||||
top = tb.extract32<0>();
|
||||
bottom = tb.extract32<3>();
|
||||
|
||||
if(top >= bottom) return;
|
||||
|
||||
|
@ -560,8 +576,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
}
|
||||
else
|
||||
{
|
||||
top = tbi.extract32<1>();
|
||||
bottom = tbi.extract32<2>();
|
||||
top = tb.extract32<1>();
|
||||
bottom = tb.extract32<2>();
|
||||
|
||||
if(top >= bottom) return;
|
||||
|
||||
|
@ -580,26 +596,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
while(1)
|
||||
{
|
||||
do
|
||||
int xi = x >> 16;
|
||||
int xf = x & 0xffff;
|
||||
|
||||
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
|
||||
{
|
||||
int xi = x >> 16;
|
||||
int xf = x & 0xffff;
|
||||
m_stats.pixels++;
|
||||
|
||||
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
|
||||
{
|
||||
m_stats.pixels++;
|
||||
*e = edge;
|
||||
|
||||
dst->scan = edge;
|
||||
dst->scan.t.u32[3] = (0x10000 - xf) & 0xffff;
|
||||
e->t.u32[3] = (0x10000 - xf) & 0xffff;
|
||||
|
||||
dst->p.left = xi;
|
||||
dst->p.top = top;
|
||||
dst->p.right = xi + 1;
|
||||
e->p.i16[0] = (int16)xi;
|
||||
e->p.i16[1] = (int16)top;
|
||||
e->p.i16[2] = (int16)(xi + 1);
|
||||
|
||||
dst++;
|
||||
}
|
||||
e++;
|
||||
}
|
||||
while(0);
|
||||
|
||||
if(++top >= bottom) break;
|
||||
|
||||
|
@ -611,26 +624,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
while(1)
|
||||
{
|
||||
do
|
||||
int xi = (x >> 16) + 1;
|
||||
int xf = x & 0xffff;
|
||||
|
||||
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
|
||||
{
|
||||
int xi = (x >> 16) + 1;
|
||||
int xf = x & 0xffff;
|
||||
m_stats.pixels++;
|
||||
|
||||
if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi))
|
||||
{
|
||||
m_stats.pixels++;
|
||||
*e = edge;
|
||||
|
||||
dst->scan = edge;
|
||||
dst->scan.t.u32[3] = xf;
|
||||
e->t.u32[3] = xf;
|
||||
|
||||
dst->p.left = xi;
|
||||
dst->p.top = top;
|
||||
dst->p.right = xi + 1;
|
||||
e->p.i16[0] = (int16)xi;
|
||||
e->p.i16[1] = (int16)top;
|
||||
e->p.i16[2] = (int16)(xi + 1);
|
||||
|
||||
dst++;
|
||||
}
|
||||
e++;
|
||||
}
|
||||
while(0);
|
||||
|
||||
if(++top >= bottom) break;
|
||||
|
||||
|
@ -644,7 +654,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
GSVector4 lrmax = lrtb.max(m_fscissor.xxxx());
|
||||
GSVector4 lrmin = lrtb.min(m_fscissor.zzzz());
|
||||
|
||||
GSVector4i lri = GSVector4i(lrmax.xyxy(lrmin));
|
||||
GSVector4i lr = GSVector4i(lrmax.xyxy(lrmin));
|
||||
|
||||
int left, right;
|
||||
|
||||
|
@ -652,8 +662,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
|
||||
if((dv.p >= GSVector4::zero()).mask() & 1)
|
||||
{
|
||||
left = lri.extract32<0>();
|
||||
right = lri.extract32<3>();
|
||||
left = lr.extract32<0>();
|
||||
right = lr.extract32<3>();
|
||||
|
||||
if(left >= right) return;
|
||||
|
||||
|
@ -664,8 +674,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
}
|
||||
else
|
||||
{
|
||||
left = lri.extract32<1>();
|
||||
right = lri.extract32<2>();
|
||||
left = lr.extract32<1>();
|
||||
right = lr.extract32<2>();
|
||||
|
||||
if(left >= right) return;
|
||||
|
||||
|
@ -684,26 +694,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
while(1)
|
||||
{
|
||||
do
|
||||
int yi = y >> 16;
|
||||
int yf = y & 0xffff;
|
||||
|
||||
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
|
||||
{
|
||||
int yi = y >> 16;
|
||||
int yf = y & 0xffff;
|
||||
m_stats.pixels++;
|
||||
|
||||
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
|
||||
{
|
||||
m_stats.pixels++;
|
||||
*e = edge;
|
||||
|
||||
dst->scan = edge;
|
||||
dst->scan.t.u32[3] = (0x10000 - yf) & 0xffff;
|
||||
e->t.u32[3] = (0x10000 - yf) & 0xffff;
|
||||
|
||||
dst->p.left = left;
|
||||
dst->p.top = yi;
|
||||
dst->p.right = left + 1;
|
||||
e->p.i16[0] = (int16)left;
|
||||
e->p.i16[1] = (int16)yi;
|
||||
e->p.i16[2] = (int16)(left + 1);
|
||||
|
||||
dst++;
|
||||
}
|
||||
e++;
|
||||
}
|
||||
while(0);
|
||||
|
||||
if(++left >= right) break;
|
||||
|
||||
|
@ -715,26 +722,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
while(1)
|
||||
{
|
||||
do
|
||||
int yi = (y >> 16) + 1;
|
||||
int yf = y & 0xffff;
|
||||
|
||||
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
|
||||
{
|
||||
int yi = (y >> 16) + 1;
|
||||
int yf = y & 0xffff;
|
||||
m_stats.pixels++;
|
||||
|
||||
if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
|
||||
{
|
||||
m_stats.pixels++;
|
||||
*e = edge;
|
||||
|
||||
dst->scan = edge;
|
||||
dst->scan.t.u32[3] = yf;
|
||||
e->t.u32[3] = yf;
|
||||
|
||||
dst->p.left = left;
|
||||
dst->p.top = yi;
|
||||
dst->p.right = left + 1;
|
||||
e->p.i16[0] = (int16)left;
|
||||
e->p.i16[1] = (int16)yi;
|
||||
e->p.i16[2] = (int16)(left + 1);
|
||||
|
||||
dst++;
|
||||
}
|
||||
e++;
|
||||
}
|
||||
while(0);
|
||||
|
||||
if(++left >= right) break;
|
||||
|
||||
|
@ -744,33 +748,36 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
}
|
||||
}
|
||||
|
||||
m_edge.count += dst - &m_edge.buff[m_edge.count];
|
||||
m_edge.count += e - &m_edge.buff[m_edge.count];
|
||||
}
|
||||
|
||||
void GSRasterizer::Flush()
|
||||
void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge)
|
||||
{
|
||||
// TODO: on win64 this could be the place where xmm6-15 are preserved (not by each DrawScanline)
|
||||
|
||||
const GSScanline* s = m_edge.buff;
|
||||
int count = m_edge.count;
|
||||
|
||||
for(int count = m_edge.count; count > 0; count--, s++)
|
||||
if(count > 0)
|
||||
{
|
||||
m_ds->DrawScanline(s->p.right, s->p.left, s->p.top, s->scan);
|
||||
m_ds->SetupPrim(vertices, dscan);
|
||||
|
||||
const GSVertexSW* RESTRICT e = m_edge.buff;
|
||||
|
||||
int i = 0;
|
||||
|
||||
if(!edge)
|
||||
{
|
||||
do {m_ds->DrawScanline(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);}
|
||||
while(++i < count);
|
||||
}
|
||||
else
|
||||
{
|
||||
do {m_ds->DrawEdge(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);}
|
||||
while(++i < count);
|
||||
}
|
||||
|
||||
m_edge.count = 0;
|
||||
}
|
||||
|
||||
m_edge.count = 0;
|
||||
}
|
||||
|
||||
void GSRasterizer::FlushEdge()
|
||||
{
|
||||
const GSScanline* s = m_edge.buff;
|
||||
|
||||
for(int count = m_edge.count; count > 0; count--, s++)
|
||||
{
|
||||
m_ds->DrawEdge(s->p.right, s->p.left, s->p.top, s->scan);
|
||||
}
|
||||
|
||||
m_edge.count = 0;
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
@ -81,8 +81,6 @@ public:
|
|||
|
||||
__aligned(class, 32) GSRasterizer : public GSAlignedClass<32>, public IRasterizer
|
||||
{
|
||||
struct GSScanline {GSVertexSW scan; GSVector4i p;};
|
||||
|
||||
protected:
|
||||
IDrawScanline* m_ds;
|
||||
int m_id;
|
||||
|
@ -90,7 +88,7 @@ protected:
|
|||
GSRasterizerStats m_stats;
|
||||
GSVector4i m_scissor;
|
||||
GSVector4 m_fscissor;
|
||||
struct {GSScanline* buff; int count;} m_edge;
|
||||
struct {GSVertexSW* buff; int count;} m_edge;
|
||||
|
||||
void DrawPoint(const GSVertexSW* v);
|
||||
void DrawLine(const GSVertexSW* v);
|
||||
|
@ -104,8 +102,7 @@ protected:
|
|||
|
||||
__forceinline bool IsOneOfMyScanlines(int scanline) const;
|
||||
|
||||
void Flush();
|
||||
void FlushEdge();
|
||||
__forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false);
|
||||
|
||||
public:
|
||||
GSRasterizer(IDrawScanline* ds);
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -70,7 +71,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 16]);
|
||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -122,7 +123,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[rcx + 16]);
|
||||
vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -179,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
|||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 32]);
|
||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
|
@ -249,7 +250,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]);
|
||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
|
@ -289,7 +290,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
@ -321,7 +322,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[rcx]);
|
||||
vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -68,7 +69,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 16]);
|
||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -125,7 +126,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[rcx + 16]);
|
||||
movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -183,7 +184,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
|||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 32]);
|
||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
@ -256,7 +257,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]);
|
||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
@ -300,7 +301,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
|
@ -335,7 +336,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
cvttps2dq(xmm0, ptr[rcx]);
|
||||
cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -56,7 +57,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 16]);
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -108,7 +109,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + 16]);
|
||||
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -163,7 +164,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
|||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 32]);
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
|
@ -233,7 +234,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]);
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
|
@ -273,7 +274,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
@ -305,7 +306,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx]);
|
||||
vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
|
@ -56,7 +57,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + 16]);
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -113,7 +114,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
|||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[ecx + 16]);
|
||||
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
|
@ -168,7 +169,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
|||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + 32]);
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
@ -241,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]);
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
@ -285,7 +286,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
|
@ -320,7 +321,7 @@ void GSSetupPrimCodeGenerator::Color()
|
|||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
movaps(xmm0, ptr[ecx]);
|
||||
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
|
|
@ -3030,6 +3030,8 @@ public:
|
|||
uint32 u32[8];
|
||||
uint64 u64[4];
|
||||
__m256 m;
|
||||
|
||||
// TODO: _M_SSE < 0x500 => union {__m128 m0, m1;}; and replace each function with a pair of 128 bit intructions
|
||||
};
|
||||
|
||||
__forceinline GSVector8()
|
||||
|
@ -3050,7 +3052,7 @@ public:
|
|||
|
||||
__forceinline GSVector8(__m128 m0, __m128 m1)
|
||||
{
|
||||
m = zero().insert<0>(m0).insert<1>(m1);
|
||||
m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);
|
||||
}
|
||||
|
||||
__forceinline GSVector8(const GSVector8& v)
|
||||
|
@ -3065,7 +3067,8 @@ public:
|
|||
|
||||
__forceinline explicit GSVector8(__m128 m)
|
||||
{
|
||||
this->m = zero().insert<0>(m).insert<1>(m);
|
||||
this->m = _mm256_castps128_ps256(m);
|
||||
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector8(__m256 m)
|
||||
|
@ -3087,7 +3090,8 @@ public:
|
|||
|
||||
__forceinline void operator = (__m128 m)
|
||||
{
|
||||
this->m = zero().insert<0>(m).insert<1>(m);
|
||||
this->m = _mm256_castps128_ps256(m);
|
||||
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
|
||||
}
|
||||
|
||||
__forceinline void operator = (__m256 m)
|
||||
|
@ -3104,7 +3108,7 @@ public:
|
|||
|
||||
__forceinline GSVector8 abs() const
|
||||
{
|
||||
return *this & cast(GSVector8i(GSVector4i::x7fffffff()));
|
||||
return *this & cast(GSVector8i(GSVector4i::x7fffffff())); // TODO: add GSVector8 consts
|
||||
}
|
||||
|
||||
__forceinline GSVector8 neg() const
|
||||
|
@ -3143,17 +3147,27 @@ public:
|
|||
|
||||
// TODO
|
||||
|
||||
__forceinline GSVector8 min(const GSVector8& a) const
|
||||
{
|
||||
return GSVector8(_mm256_min_ps(m, a));
|
||||
}
|
||||
|
||||
__forceinline GSVector8 max(const GSVector8& a) const
|
||||
{
|
||||
return GSVector8(_mm256_max_ps(m, a));
|
||||
}
|
||||
|
||||
__forceinline GSVector8 blend8(const GSVector8& a, const GSVector8& mask) const
|
||||
{
|
||||
return GSVector8(_mm256_blendv_ps(m, a, mask));
|
||||
}
|
||||
|
||||
__forceinline GSVector8 upl32(const GSVector8& a) const
|
||||
__forceinline GSVector8 upl(const GSVector8& a) const
|
||||
{
|
||||
return GSVector8(_mm256_unpacklo_ps(m, a));
|
||||
}
|
||||
|
||||
__forceinline GSVector8 uph32(const GSVector8& a) const
|
||||
__forceinline GSVector8 uph(const GSVector8& a) const
|
||||
{
|
||||
return GSVector8(_mm256_unpackhi_ps(m, a));
|
||||
}
|
||||
|
@ -3392,6 +3406,23 @@ public:
|
|||
return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ));
|
||||
}
|
||||
|
||||
#define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \
|
||||
__forceinline GSVector8 xs##ys() const {return GSVector8(_mm256_permute2f128_ps(m, m, xn | (yn << 4)));} \
|
||||
__forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, xn | (yn << 4)));} \
|
||||
|
||||
#define VECTOR8_PERMUTE_1(xs, xn) \
|
||||
VECTOR8_PERMUTE_2(xs, xn, x, 0) \
|
||||
VECTOR8_PERMUTE_2(xs, xn, y, 1) \
|
||||
VECTOR8_PERMUTE_2(xs, xn, z, 2) \
|
||||
VECTOR8_PERMUTE_2(xs, xn, w, 3) \
|
||||
VECTOR8_PERMUTE_2(xs, xn, _, 8) \
|
||||
|
||||
VECTOR8_PERMUTE_1(x, 0)
|
||||
VECTOR8_PERMUTE_1(y, 1)
|
||||
VECTOR8_PERMUTE_1(z, 2)
|
||||
VECTOR8_PERMUTE_1(w, 3)
|
||||
VECTOR8_PERMUTE_1(_, 8)
|
||||
|
||||
#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
|
||||
__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
|
||||
__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
|
||||
|
|
|
@ -25,10 +25,10 @@
|
|||
|
||||
__aligned(struct, 16) GSVertexSW
|
||||
{
|
||||
GSVector4 c, p, t;
|
||||
GSVector4 p, t, c;
|
||||
|
||||
GSVertexSW() {}
|
||||
GSVertexSW(const GSVertexSW& v) {*this = v;}
|
||||
__forceinline GSVertexSW() {}
|
||||
__forceinline GSVertexSW(const GSVertexSW& v) {*this = v;}
|
||||
|
||||
__forceinline void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;}
|
||||
__forceinline void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;}
|
||||
|
@ -37,8 +37,6 @@ __aligned(struct, 16) GSVertexSW
|
|||
friend GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2);
|
||||
friend GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv);
|
||||
friend GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv);
|
||||
friend GSVertexSW operator * (const GSVertexSW& v, float f);
|
||||
friend GSVertexSW operator / (const GSVertexSW& v, float f);
|
||||
|
||||
static bool IsQuad(const GSVertexSW* v, int& tl, int& br)
|
||||
{
|
||||
|
@ -192,22 +190,3 @@ __forceinline GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv)
|
|||
return v0;
|
||||
}
|
||||
|
||||
__forceinline GSVertexSW operator * (const GSVertexSW& v, float f)
|
||||
{
|
||||
GSVertexSW v0;
|
||||
GSVector4 vf(f);
|
||||
v0.c = v.c * vf;
|
||||
v0.p = v.p * vf;
|
||||
v0.t = v.t * vf;
|
||||
return v0;
|
||||
}
|
||||
|
||||
__forceinline GSVertexSW operator / (const GSVertexSW& v, float f)
|
||||
{
|
||||
GSVertexSW v0;
|
||||
GSVector4 vf(f);
|
||||
v0.c = v.c / vf;
|
||||
v0.p = v.p / vf;
|
||||
v0.t = v.t / vf;
|
||||
return v0;
|
||||
}
|
||||
|
|
|
@ -31,9 +31,11 @@ class GSState;
|
|||
|
||||
__aligned(class, 32) GSVertexTrace
|
||||
{
|
||||
public:
|
||||
struct Vertex {GSVector4i c; GSVector4 p, t;};
|
||||
struct VertexAlpha {int min, max; bool valid;};
|
||||
|
||||
private:
|
||||
typedef void (*VertexTracePtr)(int count, const void* v, Vertex& min, Vertex& max);
|
||||
|
||||
class CGSW : public GSCodeGenerator
|
||||
|
|
|
@ -90,7 +90,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
|
@ -101,7 +101,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
|
||||
|
||||
vminps(xmm2, xmm0);
|
||||
vmaxps(xmm3, xmm0);
|
||||
|
@ -110,7 +110,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
@ -120,7 +120,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
|
@ -149,20 +149,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
{
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpsrld(xmm2, 7);
|
||||
vmovaps(ptr[r8], xmm2);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
|
||||
vcvttps2dq(xmm3, xmm3);
|
||||
vpsrld(xmm3, 7);
|
||||
vmovaps(ptr[r9], xmm3);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
vmovaps(ptr[r8 + 16], xmm4);
|
||||
vmovaps(ptr[r9 + 16], xmm5);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vmovaps(ptr[r8 + 32], xmm6);
|
||||
vmovaps(ptr[r9 + 32], xmm7);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
|
@ -239,7 +239,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
|
@ -248,7 +248,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
@ -260,7 +260,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
|
@ -309,15 +309,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[r8], xmm2);
|
||||
vmovaps(ptr[r9], xmm3);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
vmovaps(ptr[r8 + 16], xmm4);
|
||||
vmovaps(ptr[r9 + 16], xmm5);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
|
@ -327,8 +327,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[r8 + 32], xmm6);
|
||||
vmovaps(ptr[r9 + 32], xmm7);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
|
@ -463,8 +463,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[r8], xmm2);
|
||||
vmovaps(ptr[r9], xmm3);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
|
@ -473,16 +473,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[r8 + 16], xmm4);
|
||||
vmovaps(ptr[r9 + 16], xmm5);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
vmovaps(ptr[r8 + 32], xmm6);
|
||||
vmovaps(ptr[r9 + 32], xmm7);
|
||||
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
|
|
|
@ -93,7 +93,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
|
@ -104,7 +104,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
|
||||
|
||||
minps(xmm2, xmm0);
|
||||
maxps(xmm3, xmm0);
|
||||
|
@ -113,7 +113,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
@ -123,7 +123,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
|
@ -153,20 +153,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
{
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
psrld(xmm2, 7);
|
||||
movaps(ptr[r8], xmm2);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
psrld(xmm3, 7);
|
||||
movaps(ptr[r9], xmm3);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
movaps(ptr[r8 + 16], xmm4);
|
||||
movaps(ptr[r9 + 16], xmm5);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
movaps(ptr[r8 + 32], xmm6);
|
||||
movaps(ptr[r9 + 32], xmm7);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
movdqa(xmm6, ptr[rsp + 0]);
|
||||
|
@ -246,7 +246,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
|
@ -255,7 +255,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
@ -268,7 +268,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
|
@ -330,15 +330,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[r8], xmm2);
|
||||
movaps(ptr[r9], xmm3);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
movaps(ptr[r8 + 16], xmm4);
|
||||
movaps(ptr[r9 + 16], xmm5);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
|
@ -348,8 +348,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[r8 + 32], xmm6);
|
||||
movaps(ptr[r9 + 32], xmm7);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
movdqa(xmm6, ptr[rsp + 0]);
|
||||
|
@ -510,8 +510,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[r8], xmm2);
|
||||
movaps(ptr[r9], xmm3);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
|
@ -520,16 +520,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[r8 + 16], xmm4);
|
||||
movaps(ptr[r9 + 16], xmm5);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
movaps(ptr[r8 + 32], xmm6);
|
||||
movaps(ptr[r9 + 32], xmm7);
|
||||
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
movdqa(xmm6, ptr[rsp + 0]);
|
||||
|
|
|
@ -92,7 +92,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
|
@ -103,7 +103,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
|
||||
|
||||
vminps(xmm2, xmm0);
|
||||
vmaxps(xmm3, xmm0);
|
||||
|
@ -112,7 +112,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
@ -122,7 +122,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
|
@ -154,20 +154,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
{
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpsrld(xmm2, 7);
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
|
||||
vcvttps2dq(xmm3, xmm3);
|
||||
vpsrld(xmm3, 7);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
|
@ -235,7 +235,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
|
@ -244,7 +244,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
@ -256,7 +256,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
|
@ -308,15 +308,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
|
@ -326,8 +326,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
|
@ -456,8 +456,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
|
@ -466,16 +466,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
|
|
|
@ -95,7 +95,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
|
@ -106,7 +106,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
|
||||
|
||||
minps(xmm2, xmm0);
|
||||
maxps(xmm3, xmm0);
|
||||
|
@ -115,7 +115,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
@ -125,7 +125,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
|
@ -158,20 +158,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
|
|||
{
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
psrld(xmm2, 7);
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
psrld(xmm3, 7);
|
||||
movaps(ptr[edx], xmm3);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
|
@ -242,7 +242,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
|
@ -251,7 +251,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
@ -264,7 +264,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
|
@ -329,15 +329,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[edx], xmm3);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
|
@ -347,8 +347,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
|
|||
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
|
@ -503,8 +503,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[edx], xmm3);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
|
@ -513,16 +513,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
|
|||
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
|
||||
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
|
|
Loading…
Reference in New Issue