diff --git a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp index 4dd9415702..ed31b4c136 100644 --- a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp @@ -23,6 +23,7 @@ #include "stdafx.h" #include "GPUDrawScanlineCodeGenerator.h" +#include "GSVertexSW.h" static const int _args = 8; static const int _top = _args + 4; @@ -152,7 +153,7 @@ void GPUDrawScanlineCodeGenerator::Init() // GSVector4i vt = GSVector4i(v.t).xxzzl(); - cvttps2dq(xmm4, ptr[edx + 32]); + cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]); pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); // s = vt.xxxx().add16(m_local.d.s); @@ -186,7 +187,7 @@ void GPUDrawScanlineCodeGenerator::Init() { // GSVector4i vc = GSVector4i(v.c).xxzzlh(); - cvttps2dq(xmm6, ptr[edx]); + cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]); pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); diff --git a/plugins/GSdx/GPUSetupPrimCodeGenerator.cpp b/plugins/GSdx/GPUSetupPrimCodeGenerator.cpp index 7d43967dd9..a189c7b8c8 100644 --- a/plugins/GSdx/GPUSetupPrimCodeGenerator.cpp +++ b/plugins/GSdx/GPUSetupPrimCodeGenerator.cpp @@ -22,8 +22,8 @@ // TODO: x64 #include "stdafx.h" -#include "GSVertexSW.h" #include "GPUSetupPrimCodeGenerator.h" +#include "GSVertexSW.h" using namespace Xbyak; @@ -50,7 +50,7 @@ void GPUSetupPrimCodeGenerator::Generate() { // t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001(); - cvttps2dq(xmm1, ptr[ecx + sizeof(GSVertexSW) * 1 + 32]); + cvttps2dq(xmm1, ptr[ecx + sizeof(GSVertexSW) * 1 + offsetof(GSVertexSW, t)]); psrld(xmm1, 8); psrld(xmm0, 31); psubd(xmm1, xmm0); @@ -94,8 +94,8 @@ void GPUSetupPrimCodeGenerator::Generate() // GSVector4 dt = dscan.t; // GSVector4 dc = dscan.c; - movaps(xmm4, ptr[edx]); - movaps(xmm3, ptr[edx + 32]); + movaps(xmm4, ptr[edx + offsetof(GSVertexSW, c)]); + movaps(xmm3, ptr[edx + offsetof(GSVertexSW, t)]); // GSVector4i dtc8 = GSVector4i(dt * 8.0f).ps32(GSVector4i(dc * 8.0f)); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index bc90117f10..cd432f9e8d 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) @@ -264,7 +265,7 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { - vmovaps(xmm0, ptr[r9 + 16]); // v.p + vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p if(m_sel.fwrite && m_sel.fge) { @@ -297,7 +298,7 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.edge || m_sel.tfx != TFX_NONE) { - vmovaps(xmm0, ptr[r9 + 32]); // v.t + vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t } if(m_sel.edge) @@ -361,7 +362,7 @@ void GSDrawScanlineCodeGenerator::Init() { // GSVector4i vc = GSVector4i(v.c); - vcvttps2dq(xmm0, ptr[r9]); // v.c + vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c // vc = vc.upl16(vc.zwxy()); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 8df198c6e7..1ff9c35bf1 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -24,6 +24,7 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) @@ -296,7 +297,7 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { - vmovaps(xmm0, ptr[ebx + 16]); // v.p + vmovaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p if(m_sel.fwrite && m_sel.fge) { @@ -333,7 +334,7 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.edge || m_sel.tfx != TFX_NONE) { - vmovaps(xmm4, ptr[ebx + 32]); // v.t + vmovaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t } if(m_sel.edge) @@ -410,7 +411,7 @@ void GSDrawScanlineCodeGenerator::Init() { // GSVector4i vc = GSVector4i(v.c); - vcvttps2dq(xmm6, ptr[ebx]); // v.c + vcvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c // vc = vc.upl16(vc.zwxy()); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 47f1b96753..7586038513 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) @@ -293,7 +294,7 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { - movaps(xmm0, ptr[ebx + 16]); // v.p + movaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p if(m_sel.fwrite && m_sel.fge) { @@ -330,7 +331,7 @@ void GSDrawScanlineCodeGenerator::Init() { if(m_sel.edge || m_sel.tfx != TFX_NONE) { - movaps(xmm4, ptr[ebx + 32]); // v.t + movaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t } if(m_sel.edge) @@ -410,7 +411,7 @@ void GSDrawScanlineCodeGenerator::Init() { // GSVector4i vc = GSVector4i(v.c); - cvttps2dq(xmm6, ptr[ebx]); // v.c + cvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c // vc = vc.upl16(vc.zwxy()); diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index fcbb6f6081..89c5333377 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -44,13 +44,13 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds) , m_id(0) , m_threads(1) { - m_edge.buff = (GSScanline*)vmalloc(sizeof(GSScanline) * 2048, false); + m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false); m_edge.count = 0; } GSRasterizer::~GSRasterizer() { - if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSScanline) * 2048); + if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048); delete m_ds; } @@ -119,8 +119,6 @@ void GSRasterizer::GetStats(GSRasterizerStats& stats) void GSRasterizer::DrawPoint(const GSVertexSW* v) { - // TODO: round to closest for point, prestep for line - GSVector4i p(v->p); if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom) @@ -142,22 +140,20 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) GSVector4 dp = dv.p.abs(); + int i = (dp < dp.yxwz()).mask() & 1; // |dx| <= |dy| + if(m_ds->IsEdge()) { - int i = (dp < dp.yxwz()).mask() & 1; // |x| <= |y| - GSVertexSW dscan; dscan.p = GSVector4::zero(); dscan.t = GSVector4::zero(); dscan.c = GSVector4::zero(); - m_ds->SetupPrim(v, dscan); - DrawEdge(v[0], v[1], dv, i, 0); DrawEdge(v[0], v[1], dv, i, 1); - FlushEdge(); + Flush(v, dscan, true); return; } @@ -188,34 +184,60 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) { GSVertexSW dscan = dv / dv.p.xxxx(); - m_ds->SetupPrim(v, dscan); - l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y DrawTriangleSection(p.y, p.y + 1, l, dl, dscan); - Flush(); + Flush(v, dscan); } } return; } - int i = dpi.x > dpi.y ? 0 : 1; - - GSVertexSW edge = v[0]; - GSVertexSW dedge = dv / dp.v[i]; - - // TODO: prestep + clip with the scissor - // TODO: inline drawpoint + Flush() - int steps = dpi.v[i]; - while(steps-- > 0) + if(steps > 0) { - DrawPoint(&edge); + GSVertexSW edge = v[0]; + GSVertexSW dedge = dv / GSVector4(dp.v[i]); - edge += dedge; + GSVertexSW* RESTRICT e = m_edge.buff; + + while(1) + { + GSVector4i p(edge.p); + + if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom) + { + if(IsOneOfMyScanlines(p.y)) + { + *e = edge; + + e->p.i16[0] = (int16)p.x; + e->p.i16[1] = (int16)p.y; + e->p.i16[2] = (int16)(p.x + 1); + + e++; + } + } + + if(--steps == 0) break; + + edge += dedge; + } + + m_edge.count = e - m_edge.buff; + + m_stats.pixels += m_edge.count; + + GSVertexSW dscan; + + dscan.p = GSVector4::zero(); + dscan.t = GSVector4::zero(); + dscan.c = GSVector4::zero(); + + Flush(v, dscan); } } @@ -233,7 +255,7 @@ static const int s_abc[8][4] = void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) { - // edge buffer is used here to avoid xmm save-restores (except when we do aa1 in the middle) + // TODO: GSVertexSW::c/t could be merged into a GSVector8 GSVertexSW v[4]; GSVertexSW dv[3]; @@ -276,7 +298,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) case 1: // a == b < c ddv[1] = dv[1] / dv[1].p.yyyy(); ddv[2] = dv[2] / dv[2].p.yyyy(); - longest = dv[0]; + longest = dv[0]; // should be negated to be equal to "ddv[1] * dv[0].p.yyyy() - dv[0]", but it's easier to change the index of v/ddv later break; case 4: // a < b == c ddv[0] = dv[0] / dv[0].p.yyyy(); @@ -319,9 +341,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) dscan.t = GSVector4::zero(); dscan.c = GSVector4::zero(); - m_ds->SetupPrim(v, dscan); - - FlushEdge(); + Flush(v, dscan, true); } switch(i) @@ -402,50 +422,42 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) __assume(0); } - m_ds->SetupPrim(v, dscan); - - Flush(); + Flush(v, dscan); } void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan) { ASSERT(top < bottom); - GSScanline* RESTRICT e = &m_edge.buff[m_edge.count]; + GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count]; while(1) { - do + if(IsOneOfMyScanlines(top)) { - if(IsOneOfMyScanlines(top)) + GSVector4 lrf = l.p.ceil(); + GSVector4 lrmax = lrf.max(m_fscissor.xxxx()); + GSVector4 lrmin = lrf.min(m_fscissor.zzzz()); + GSVector4i lr = GSVector4i(lrmax.xxyy(lrmin)); + + int left = lr.extract32<0>(); + int right = lr.extract32<2>(); + + int pixels = right - left; + + if(pixels > 0) { - GSVector4 lr = l.p.ceil(); + m_stats.pixels += pixels; - GSVector4 lrmax = lr.max(m_fscissor.xxxx()); - GSVector4 lrmin = lr.min(m_fscissor.zzzz()); + *e = l + dscan * (lrmax - l.p).xxxx(); - GSVector4i lri = GSVector4i(lrmax.xxyy(lrmin)); + e->p.i16[0] = (int16)left; + e->p.i16[1] = (int16)top; + e->p.i16[2] = (int16)right; - int left = lri.extract32<0>(); - int right = lri.extract32<2>(); - - int pixels = right - left; - - if(pixels > 0) - { - m_stats.pixels += pixels; - - e->scan = l + dscan * (lrmax - l.p).xxxx(); - - e->p.left = left; - e->p.top = top; - e->p.right = right; - - e++; - } + e++; } } - while(0); if(++top >= bottom) break; @@ -508,7 +520,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices) m_ds->SetupPrim(v, dscan); - for(; r.top < r.bottom; r.top++, scan.t += dedge.t) + while(1) { if(IsOneOfMyScanlines(r.top)) { @@ -516,6 +528,10 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices) m_ds->DrawScanline(r.right, r.left, r.top, scan); } + + if(++r.top >= r.bottom) break; + + scan.t += dedge.t; } } @@ -531,7 +547,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS // TODO: bit slow and too much duplicated code // TODO: inner pre-step is still missing (hardly noticable) - GSScanline* RESTRICT dst = &m_edge.buff[m_edge.count]; + GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count]; GSVector4 lrtb = v0.p.upl(v1.p).ceil(); @@ -540,7 +556,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS GSVector4 tbmax = lrtb.max(m_fscissor.yyyy()); GSVector4 tbmin = lrtb.min(m_fscissor.wwww()); - GSVector4i tbi = GSVector4i(tbmax.zwzw(tbmin)); + GSVector4i tb = GSVector4i(tbmax.zwzw(tbmin)); int top, bottom; @@ -548,8 +564,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS if((dv.p >= GSVector4::zero()).mask() & 2) { - top = tbi.extract32<0>(); - bottom = tbi.extract32<3>(); + top = tb.extract32<0>(); + bottom = tb.extract32<3>(); if(top >= bottom) return; @@ -560,8 +576,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS } else { - top = tbi.extract32<1>(); - bottom = tbi.extract32<2>(); + top = tb.extract32<1>(); + bottom = tb.extract32<2>(); if(top >= bottom) return; @@ -580,26 +596,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS { while(1) { - do + int xi = x >> 16; + int xf = x & 0xffff; + + if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) { - int xi = x >> 16; - int xf = x & 0xffff; + m_stats.pixels++; - if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) - { - m_stats.pixels++; + *e = edge; - dst->scan = edge; - dst->scan.t.u32[3] = (0x10000 - xf) & 0xffff; + e->t.u32[3] = (0x10000 - xf) & 0xffff; - dst->p.left = xi; - dst->p.top = top; - dst->p.right = xi + 1; + e->p.i16[0] = (int16)xi; + e->p.i16[1] = (int16)top; + e->p.i16[2] = (int16)(xi + 1); - dst++; - } + e++; } - while(0); if(++top >= bottom) break; @@ -611,26 +624,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS { while(1) { - do + int xi = (x >> 16) + 1; + int xf = x & 0xffff; + + if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) { - int xi = (x >> 16) + 1; - int xf = x & 0xffff; + m_stats.pixels++; - if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) - { - m_stats.pixels++; + *e = edge; - dst->scan = edge; - dst->scan.t.u32[3] = xf; + e->t.u32[3] = xf; - dst->p.left = xi; - dst->p.top = top; - dst->p.right = xi + 1; + e->p.i16[0] = (int16)xi; + e->p.i16[1] = (int16)top; + e->p.i16[2] = (int16)(xi + 1); - dst++; - } + e++; } - while(0); if(++top >= bottom) break; @@ -644,7 +654,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS GSVector4 lrmax = lrtb.max(m_fscissor.xxxx()); GSVector4 lrmin = lrtb.min(m_fscissor.zzzz()); - GSVector4i lri = GSVector4i(lrmax.xyxy(lrmin)); + GSVector4i lr = GSVector4i(lrmax.xyxy(lrmin)); int left, right; @@ -652,8 +662,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS if((dv.p >= GSVector4::zero()).mask() & 1) { - left = lri.extract32<0>(); - right = lri.extract32<3>(); + left = lr.extract32<0>(); + right = lr.extract32<3>(); if(left >= right) return; @@ -664,8 +674,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS } else { - left = lri.extract32<1>(); - right = lri.extract32<2>(); + left = lr.extract32<1>(); + right = lr.extract32<2>(); if(left >= right) return; @@ -684,26 +694,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS { while(1) { - do + int yi = y >> 16; + int yf = y & 0xffff; + + if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) { - int yi = y >> 16; - int yf = y & 0xffff; + m_stats.pixels++; - if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) - { - m_stats.pixels++; + *e = edge; + + e->t.u32[3] = (0x10000 - yf) & 0xffff; - dst->scan = edge; - dst->scan.t.u32[3] = (0x10000 - yf) & 0xffff; + e->p.i16[0] = (int16)left; + e->p.i16[1] = (int16)yi; + e->p.i16[2] = (int16)(left + 1); - dst->p.left = left; - dst->p.top = yi; - dst->p.right = left + 1; - - dst++; - } + e++; } - while(0); if(++left >= right) break; @@ -715,26 +722,23 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS { while(1) { - do + int yi = (y >> 16) + 1; + int yf = y & 0xffff; + + if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) { - int yi = (y >> 16) + 1; - int yf = y & 0xffff; + m_stats.pixels++; - if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) - { - m_stats.pixels++; + *e = edge; + + e->t.u32[3] = yf; - dst->scan = edge; - dst->scan.t.u32[3] = yf; + e->p.i16[0] = (int16)left; + e->p.i16[1] = (int16)yi; + e->p.i16[2] = (int16)(left + 1); - dst->p.left = left; - dst->p.top = yi; - dst->p.right = left + 1; - - dst++; - } + e++; } - while(0); if(++left >= right) break; @@ -744,33 +748,36 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS } } - m_edge.count += dst - &m_edge.buff[m_edge.count]; + m_edge.count += e - &m_edge.buff[m_edge.count]; } -void GSRasterizer::Flush() +void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge) { // TODO: on win64 this could be the place where xmm6-15 are preserved (not by each DrawScanline) - const GSScanline* s = m_edge.buff; + int count = m_edge.count; - for(int count = m_edge.count; count > 0; count--, s++) + if(count > 0) { - m_ds->DrawScanline(s->p.right, s->p.left, s->p.top, s->scan); + m_ds->SetupPrim(vertices, dscan); + + const GSVertexSW* RESTRICT e = m_edge.buff; + + int i = 0; + + if(!edge) + { + do {m_ds->DrawScanline(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);} + while(++i < count); + } + else + { + do {m_ds->DrawEdge(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);} + while(++i < count); + } + + m_edge.count = 0; } - - m_edge.count = 0; -} - -void GSRasterizer::FlushEdge() -{ - const GSScanline* s = m_edge.buff; - - for(int count = m_edge.count; count > 0; count--, s++) - { - m_ds->DrawEdge(s->p.right, s->p.left, s->p.top, s->scan); - } - - m_edge.count = 0; } // diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 9ca9c6f267..308bda8b21 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -81,8 +81,6 @@ public: __aligned(class, 32) GSRasterizer : public GSAlignedClass<32>, public IRasterizer { - struct GSScanline {GSVertexSW scan; GSVector4i p;}; - protected: IDrawScanline* m_ds; int m_id; @@ -90,7 +88,7 @@ protected: GSRasterizerStats m_stats; GSVector4i m_scissor; GSVector4 m_fscissor; - struct {GSScanline* buff; int count;} m_edge; + struct {GSVertexSW* buff; int count;} m_edge; void DrawPoint(const GSVertexSW* v); void DrawLine(const GSVertexSW* v); @@ -104,8 +102,7 @@ protected: __forceinline bool IsOneOfMyScanlines(int scanline) const; - void Flush(); - void FlushEdge(); + __forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false); public: GSRasterizer(IDrawScanline* ds); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp index afc28b246c..1af18632c7 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSSetupPrimCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) @@ -70,7 +71,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - vmovaps(xmm0, ptr[rdx + 16]); + vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -122,7 +123,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = vertices[0].p; - vmovaps(xmm0, ptr[rcx + 16]); + vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -179,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - vmovaps(xmm0, ptr[rdx + 32]); + vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); vmulps(xmm1, xmm0, xmm3); @@ -249,7 +250,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[rdx]); + vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -289,7 +290,7 @@ void GSSetupPrimCodeGenerator::Color() // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it + vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it // GSVector4 dg = c.yyyy(); // GSVector4 da = c.wwww(); @@ -321,7 +322,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4i c = GSVector4i(vertices[0].c); - vcvttps2dq(xmm0, ptr[rcx]); + vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp index c0adc5607f..d523904615 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSSetupPrimCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) @@ -68,7 +69,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - movaps(xmm0, ptr[rdx + 16]); + movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -125,7 +126,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = vertices[0].p; - movaps(xmm0, ptr[rcx + 16]); + movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -183,7 +184,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - movaps(xmm0, ptr[rdx + 32]); + movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]); movaps(xmm1, xmm0); mulps(xmm1, xmm3); @@ -256,7 +257,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - movaps(xmm0, ptr[rdx]); + movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); movaps(xmm1, xmm0); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -300,7 +301,7 @@ void GSSetupPrimCodeGenerator::Color() // GSVector4 c = dscan.c; - movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it + movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it movaps(xmm1, xmm0); // GSVector4 dg = c.yyyy(); @@ -335,7 +336,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4i c = GSVector4i(vertices[0].c); - cvttps2dq(xmm0, ptr[rcx]); + cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index 46ac4df8af..e5c5402576 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSSetupPrimCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) @@ -56,7 +57,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - vmovaps(xmm0, ptr[edx + 16]); + vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -108,7 +109,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = vertices[0].p; - vmovaps(xmm0, ptr[ecx + 16]); + vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -163,7 +164,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - vmovaps(xmm0, ptr[edx + 32]); + vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]); vmulps(xmm1, xmm0, xmm3); @@ -233,7 +234,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[edx]); + vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -273,7 +274,7 @@ void GSSetupPrimCodeGenerator::Color() // GSVector4 c = dscan.c; - vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it + vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it // GSVector4 dg = c.yyyy(); // GSVector4 da = c.wwww(); @@ -305,7 +306,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4i c = GSVector4i(vertices[0].c); - vcvttps2dq(xmm0, ptr[ecx]); + vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 28354c4086..cdb491ec07 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSSetupPrimCodeGenerator.h" +#include "GSVertexSW.h" #if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) @@ -56,7 +57,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = dscan.p; - movaps(xmm0, ptr[edx + 16]); + movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -113,7 +114,7 @@ void GSSetupPrimCodeGenerator::Depth() { // GSVector4 p = vertices[0].p; - movaps(xmm0, ptr[ecx + 16]); + movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]); if(m_en.f) { @@ -168,7 +169,7 @@ void GSSetupPrimCodeGenerator::Texture() // GSVector4 t = dscan.t; - movaps(xmm0, ptr[edx + 32]); + movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]); movaps(xmm1, xmm0); mulps(xmm1, xmm3); @@ -241,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4 c = dscan.c; - movaps(xmm0, ptr[edx]); + movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); movaps(xmm1, xmm0); // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); @@ -285,7 +286,7 @@ void GSSetupPrimCodeGenerator::Color() // GSVector4 c = dscan.c; - movaps(xmm0, ptr[edx]); // not enough regs, have to reload it + movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it movaps(xmm1, xmm0); // GSVector4 dg = c.yyyy(); @@ -320,7 +321,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4i c = GSVector4i(vertices[0].c); - movaps(xmm0, ptr[ecx]); + movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]); cvttps2dq(xmm0, xmm0); // c = c.upl16(c.zwxy()); diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 4f54331ecf..5d94535ccc 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -3030,6 +3030,8 @@ public: uint32 u32[8]; uint64 u64[4]; __m256 m; + + // TODO: _M_SSE < 0x500 => union {__m128 m0, m1;}; and replace each function with a pair of 128 bit intructions }; __forceinline GSVector8() @@ -3050,7 +3052,7 @@ public: __forceinline GSVector8(__m128 m0, __m128 m1) { - m = zero().insert<0>(m0).insert<1>(m1); + m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20); } __forceinline GSVector8(const GSVector8& v) @@ -3065,7 +3067,8 @@ public: __forceinline explicit GSVector8(__m128 m) { - this->m = zero().insert<0>(m).insert<1>(m); + this->m = _mm256_castps128_ps256(m); + this->m = _mm256_permute2f128_ps(this->m, this->m, 0); } __forceinline explicit GSVector8(__m256 m) @@ -3087,7 +3090,8 @@ public: __forceinline void operator = (__m128 m) { - this->m = zero().insert<0>(m).insert<1>(m); + this->m = _mm256_castps128_ps256(m); + this->m = _mm256_permute2f128_ps(this->m, this->m, 0); } __forceinline void operator = (__m256 m) @@ -3104,7 +3108,7 @@ public: __forceinline GSVector8 abs() const { - return *this & cast(GSVector8i(GSVector4i::x7fffffff())); + return *this & cast(GSVector8i(GSVector4i::x7fffffff())); // TODO: add GSVector8 consts } __forceinline GSVector8 neg() const @@ -3143,17 +3147,27 @@ public: // TODO + __forceinline GSVector8 min(const GSVector8& a) const + { + return GSVector8(_mm256_min_ps(m, a)); + } + + __forceinline GSVector8 max(const GSVector8& a) const + { + return GSVector8(_mm256_max_ps(m, a)); + } + __forceinline GSVector8 blend8(const GSVector8& a, const GSVector8& mask) const { return GSVector8(_mm256_blendv_ps(m, a, mask)); } - __forceinline GSVector8 upl32(const GSVector8& a) const + __forceinline GSVector8 upl(const GSVector8& a) const { return GSVector8(_mm256_unpacklo_ps(m, a)); } - __forceinline GSVector8 uph32(const GSVector8& a) const + __forceinline GSVector8 uph(const GSVector8& a) const { return GSVector8(_mm256_unpackhi_ps(m, a)); } @@ -3392,6 +3406,23 @@ public: return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ)); } + #define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \ + __forceinline GSVector8 xs##ys() const {return GSVector8(_mm256_permute2f128_ps(m, m, xn | (yn << 4)));} \ + __forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, xn | (yn << 4)));} \ + + #define VECTOR8_PERMUTE_1(xs, xn) \ + VECTOR8_PERMUTE_2(xs, xn, x, 0) \ + VECTOR8_PERMUTE_2(xs, xn, y, 1) \ + VECTOR8_PERMUTE_2(xs, xn, z, 2) \ + VECTOR8_PERMUTE_2(xs, xn, w, 3) \ + VECTOR8_PERMUTE_2(xs, xn, _, 8) \ + + VECTOR8_PERMUTE_1(x, 0) + VECTOR8_PERMUTE_1(y, 1) + VECTOR8_PERMUTE_1(z, 2) + VECTOR8_PERMUTE_1(w, 3) + VECTOR8_PERMUTE_1(_, 8) + #define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ __forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ __forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ diff --git a/plugins/GSdx/GSVertexSW.h b/plugins/GSdx/GSVertexSW.h index a900b026e3..e2d7151ec5 100644 --- a/plugins/GSdx/GSVertexSW.h +++ b/plugins/GSdx/GSVertexSW.h @@ -25,10 +25,10 @@ __aligned(struct, 16) GSVertexSW { - GSVector4 c, p, t; + GSVector4 p, t, c; - GSVertexSW() {} - GSVertexSW(const GSVertexSW& v) {*this = v;} + __forceinline GSVertexSW() {} + __forceinline GSVertexSW(const GSVertexSW& v) {*this = v;} __forceinline void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;} __forceinline void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;} @@ -37,8 +37,6 @@ __aligned(struct, 16) GSVertexSW friend GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2); friend GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv); friend GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv); - friend GSVertexSW operator * (const GSVertexSW& v, float f); - friend GSVertexSW operator / (const GSVertexSW& v, float f); static bool IsQuad(const GSVertexSW* v, int& tl, int& br) { @@ -192,22 +190,3 @@ __forceinline GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv) return v0; } -__forceinline GSVertexSW operator * (const GSVertexSW& v, float f) -{ - GSVertexSW v0; - GSVector4 vf(f); - v0.c = v.c * vf; - v0.p = v.p * vf; - v0.t = v.t * vf; - return v0; -} - -__forceinline GSVertexSW operator / (const GSVertexSW& v, float f) -{ - GSVertexSW v0; - GSVector4 vf(f); - v0.c = v.c / vf; - v0.p = v.p / vf; - v0.t = v.t / vf; - return v0; -} diff --git a/plugins/GSdx/GSVertexTrace.h b/plugins/GSdx/GSVertexTrace.h index 2893e3e533..f7eaf35361 100644 --- a/plugins/GSdx/GSVertexTrace.h +++ b/plugins/GSdx/GSVertexTrace.h @@ -31,9 +31,11 @@ class GSState; __aligned(class, 32) GSVertexTrace { +public: struct Vertex {GSVector4i c; GSVector4 p, t;}; struct VertexAlpha {int min, max; bool valid;}; +private: typedef void (*VertexTracePtr)(int count, const void* v, Vertex& min, Vertex& max); class CGSW : public GSCodeGenerator diff --git a/plugins/GSdx/GSVertexTrace.x64.avx.cpp b/plugins/GSdx/GSVertexTrace.x64.avx.cpp index b3ba691c37..3624ed5b6b 100644 --- a/plugins/GSdx/GSVertexTrace.x64.avx.cpp +++ b/plugins/GSdx/GSVertexTrace.x64.avx.cpp @@ -90,7 +90,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs if(tme && !fst && primclass == GS_SPRITE_CLASS) { - vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]); + vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); } @@ -101,7 +101,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.c = min.c.minv(v[i + j].c); // max.c = max.c.maxv(v[i + j].c); - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]); + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]); vminps(xmm2, xmm0); vmaxps(xmm3, xmm0); @@ -110,7 +110,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]); + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]); vminps(xmm4, xmm0); vmaxps(xmm5, xmm0); @@ -120,7 +120,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.t = min.t.minv(v[i + j].t); // max.t = max.t.maxv(v[i + j].t); - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]); + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); if(!fst) { @@ -149,20 +149,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs { vcvttps2dq(xmm2, xmm2); vpsrld(xmm2, 7); - vmovaps(ptr[r8], xmm2); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); vcvttps2dq(xmm3, xmm3); vpsrld(xmm3, 7); - vmovaps(ptr[r9], xmm3); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); } - vmovaps(ptr[r8 + 16], xmm4); - vmovaps(ptr[r9 + 16], xmm5); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { - vmovaps(ptr[r8 + 32], xmm6); - vmovaps(ptr[r9 + 32], xmm7); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); } vmovdqa(xmm6, ptr[rsp + 0]); @@ -239,7 +239,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(tme && !fst && primclass == GS_SPRITE_CLASS) { - vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]); + vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); } @@ -248,7 +248,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]); + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); vminps(xmm4, xmm0); vmaxps(xmm5, xmm0); @@ -260,7 +260,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(color && (iip || j == n - 1) || tme) { - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]); + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]); } if(color && (iip || j == n - 1)) @@ -309,15 +309,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); vpmovzxbd(xmm3, xmm3); - vmovaps(ptr[r8], xmm2); - vmovaps(ptr[r9], xmm3); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin; // m_max.p = pmax; - vmovaps(ptr[r8 + 16], xmm4); - vmovaps(ptr[r9 + 16], xmm5); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { @@ -327,8 +327,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - vmovaps(ptr[r8 + 32], xmm6); - vmovaps(ptr[r9 + 32], xmm7); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); } vmovdqa(xmm6, ptr[rsp + 0]); @@ -463,8 +463,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); vpmovzxbd(xmm3, xmm3); - vmovaps(ptr[r8], xmm2); - vmovaps(ptr[r9], xmm3); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin.xyww(); @@ -473,16 +473,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - vmovaps(ptr[r8 + 16], xmm4); - vmovaps(ptr[r9 + 16], xmm5); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { // m_min.t = tmin; // m_max.t = tmax; - vmovaps(ptr[r8 + 32], xmm6); - vmovaps(ptr[r9 + 32], xmm7); + vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); + vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); } vmovdqa(xmm6, ptr[rsp + 0]); diff --git a/plugins/GSdx/GSVertexTrace.x64.cpp b/plugins/GSdx/GSVertexTrace.x64.cpp index d584b4b3c0..e7e9a0a0f2 100644 --- a/plugins/GSdx/GSVertexTrace.x64.cpp +++ b/plugins/GSdx/GSVertexTrace.x64.cpp @@ -93,7 +93,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs if(tme && !fst && primclass == GS_SPRITE_CLASS) { - movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]); + movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); } @@ -104,7 +104,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.c = min.c.minv(v[i + j].c); // max.c = max.c.maxv(v[i + j].c); - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]); + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]); minps(xmm2, xmm0); maxps(xmm3, xmm0); @@ -113,7 +113,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]); + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]); minps(xmm4, xmm0); maxps(xmm5, xmm0); @@ -123,7 +123,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.t = min.t.minv(v[i + j].t); // max.t = max.t.maxv(v[i + j].t); - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]); + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); if(!fst) { @@ -153,20 +153,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs { cvttps2dq(xmm2, xmm2); psrld(xmm2, 7); - movaps(ptr[r8], xmm2); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); cvttps2dq(xmm3, xmm3); psrld(xmm3, 7); - movaps(ptr[r9], xmm3); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); } - movaps(ptr[r8 + 16], xmm4); - movaps(ptr[r9 + 16], xmm5); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { - movaps(ptr[r8 + 32], xmm6); - movaps(ptr[r9 + 32], xmm7); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); } movdqa(xmm6, ptr[rsp + 0]); @@ -246,7 +246,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(tme && !fst && primclass == GS_SPRITE_CLASS) { - movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]); + movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); } @@ -255,7 +255,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]); + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); minps(xmm4, xmm0); maxps(xmm5, xmm0); @@ -268,7 +268,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(color && (iip || j == n - 1) || tme) { - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]); + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]); } if(color && (iip || j == n - 1)) @@ -330,15 +330,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma punpcklwd(xmm3, xmm0); } - movaps(ptr[r8], xmm2); - movaps(ptr[r9], xmm3); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin; // m_max.p = pmax; - movaps(ptr[r8 + 16], xmm4); - movaps(ptr[r9 + 16], xmm5); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { @@ -348,8 +348,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - movaps(ptr[r8 + 32], xmm6); - movaps(ptr[r9 + 32], xmm7); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); } movdqa(xmm6, ptr[rsp + 0]); @@ -510,8 +510,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t punpcklwd(xmm3, xmm0); } - movaps(ptr[r8], xmm2); - movaps(ptr[r9], xmm3); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin.xyww(); @@ -520,16 +520,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - movaps(ptr[r8 + 16], xmm4); - movaps(ptr[r9 + 16], xmm5); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { // m_min.t = tmin; // m_max.t = tmax; - movaps(ptr[r8 + 32], xmm6); - movaps(ptr[r9 + 32], xmm7); + movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); + movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); } movdqa(xmm6, ptr[rsp + 0]); diff --git a/plugins/GSdx/GSVertexTrace.x86.avx.cpp b/plugins/GSdx/GSVertexTrace.x86.avx.cpp index c1323d276d..5fb291fb9a 100644 --- a/plugins/GSdx/GSVertexTrace.x86.avx.cpp +++ b/plugins/GSdx/GSVertexTrace.x86.avx.cpp @@ -92,7 +92,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs if(tme && !fst && primclass == GS_SPRITE_CLASS) { - vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]); + vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); } @@ -103,7 +103,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.c = min.c.minv(v[i + j].c); // max.c = max.c.maxv(v[i + j].c); - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]); + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]); vminps(xmm2, xmm0); vmaxps(xmm3, xmm0); @@ -112,7 +112,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]); + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]); vminps(xmm4, xmm0); vmaxps(xmm5, xmm0); @@ -122,7 +122,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.t = min.t.minv(v[i + j].t); // max.t = max.t.maxv(v[i + j].t); - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]); + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); if(!fst) { @@ -154,20 +154,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs { vcvttps2dq(xmm2, xmm2); vpsrld(xmm2, 7); - vmovaps(ptr[eax], xmm2); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); vcvttps2dq(xmm3, xmm3); vpsrld(xmm3, 7); - vmovaps(ptr[edx], xmm3); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); } - vmovaps(ptr[eax + 16], xmm4); - vmovaps(ptr[edx + 16], xmm5); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { - vmovaps(ptr[eax + 32], xmm6); - vmovaps(ptr[edx + 32], xmm7); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); } ret(); @@ -235,7 +235,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(tme && !fst && primclass == GS_SPRITE_CLASS) { - vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]); + vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); } @@ -244,7 +244,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]); + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); vminps(xmm4, xmm0); vmaxps(xmm5, xmm0); @@ -256,7 +256,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(color && (iip || j == n - 1) || tme) { - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]); + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]); } if(color && (iip || j == n - 1)) @@ -308,15 +308,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); vpmovzxbd(xmm3, xmm3); - vmovaps(ptr[eax], xmm2); - vmovaps(ptr[edx], xmm3); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin; // m_max.p = pmax; - vmovaps(ptr[eax + 16], xmm4); - vmovaps(ptr[edx + 16], xmm5); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { @@ -326,8 +326,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - vmovaps(ptr[eax + 32], xmm6); - vmovaps(ptr[edx + 32], xmm7); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); } ret(); @@ -456,8 +456,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); vpmovzxbd(xmm3, xmm3); - vmovaps(ptr[eax], xmm2); - vmovaps(ptr[edx], xmm3); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin.xyww(); @@ -466,16 +466,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - vmovaps(ptr[eax + 16], xmm4); - vmovaps(ptr[edx + 16], xmm5); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { // m_min.t = tmin; // m_max.t = tmax; - vmovaps(ptr[eax + 32], xmm6); - vmovaps(ptr[edx + 32], xmm7); + vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); + vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); } ret(); diff --git a/plugins/GSdx/GSVertexTrace.x86.cpp b/plugins/GSdx/GSVertexTrace.x86.cpp index 2d0a4ed3f5..6b792df4be 100644 --- a/plugins/GSdx/GSVertexTrace.x86.cpp +++ b/plugins/GSdx/GSVertexTrace.x86.cpp @@ -95,7 +95,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs if(tme && !fst && primclass == GS_SPRITE_CLASS) { - movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]); + movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); } @@ -106,7 +106,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.c = min.c.minv(v[i + j].c); // max.c = max.c.maxv(v[i + j].c); - movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]); + movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]); minps(xmm2, xmm0); maxps(xmm3, xmm0); @@ -115,7 +115,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]); + movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]); minps(xmm4, xmm0); maxps(xmm5, xmm0); @@ -125,7 +125,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs // min.t = min.t.minv(v[i + j].t); // max.t = max.t.maxv(v[i + j].t); - movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]); + movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); if(!fst) { @@ -158,20 +158,20 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs { cvttps2dq(xmm2, xmm2); psrld(xmm2, 7); - movaps(ptr[eax], xmm2); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); cvttps2dq(xmm3, xmm3); psrld(xmm3, 7); - movaps(ptr[edx], xmm3); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); } - movaps(ptr[eax + 16], xmm4); - movaps(ptr[edx + 16], xmm5); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { - movaps(ptr[eax + 32], xmm6); - movaps(ptr[edx + 32], xmm7); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); } ret(); @@ -242,7 +242,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(tme && !fst && primclass == GS_SPRITE_CLASS) { - movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]); + movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); } @@ -251,7 +251,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma // min.p = min.p.minv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p); - movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]); + movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); minps(xmm4, xmm0); maxps(xmm5, xmm0); @@ -264,7 +264,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma if(color && (iip || j == n - 1) || tme) { - movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]); + movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]); } if(color && (iip || j == n - 1)) @@ -329,15 +329,15 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma punpcklwd(xmm3, xmm0); } - movaps(ptr[eax], xmm2); - movaps(ptr[edx], xmm3); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin; // m_max.p = pmax; - movaps(ptr[eax + 16], xmm4); - movaps(ptr[edx + 16], xmm5); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { @@ -347,8 +347,8 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - movaps(ptr[eax + 32], xmm6); - movaps(ptr[edx + 32], xmm7); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); } ret(); @@ -503,8 +503,8 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t punpcklwd(xmm3, xmm0); } - movaps(ptr[eax], xmm2); - movaps(ptr[edx], xmm3); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); } // m_min.p = pmin.xyww(); @@ -513,16 +513,16 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - movaps(ptr[eax + 16], xmm4); - movaps(ptr[edx + 16], xmm5); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); if(tme) { // m_min.t = tmin; // m_max.t = tmax; - movaps(ptr[eax + 32], xmm6); - movaps(ptr[edx + 32], xmm7); + movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); + movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); } ret();