From 20d99ae9fcd5ce10b7be2e14b840b8eb5c3cb0d2 Mon Sep 17 00:00:00 2001 From: "gabest11@gmail.com" Date: Sun, 23 Jun 2013 10:46:24 +0000 Subject: [PATCH] GSdx: vs2010 fix and minor changes git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5678 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GPUDrawScanline.cpp | 4 +- plugins/GSdx/GPUDrawScanline.h | 2 +- plugins/GSdx/GSDrawScanline.cpp | 8 +- plugins/GSdx/GSDrawScanline.h | 2 +- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 34 ++++---- plugins/GSdx/GSDrawScanlineCodeGenerator.h | 2 +- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 3 +- .../GSDrawScanlineCodeGenerator.x86.avx2.cpp | 19 +++-- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 34 +++----- plugins/GSdx/GSFunctionMap.h | 19 +++-- plugins/GSdx/GSRasterizer.cpp | 79 ++++++++++++------- plugins/GSdx/GSRasterizer.h | 7 +- plugins/GSdx/GSState.cpp | 11 ++- plugins/GSdx/GSVector.h | 64 ++++++++++++++- plugins/GSdx/xbyak/xbyak_mnemonic.h | 2 +- 15 files changed, 187 insertions(+), 103 deletions(-) diff --git a/plugins/GSdx/GPUDrawScanline.cpp b/plugins/GSdx/GPUDrawScanline.cpp index 7706b70adf..4159fd9d93 100644 --- a/plugins/GSdx/GPUDrawScanline.cpp +++ b/plugins/GSdx/GPUDrawScanline.cpp @@ -76,9 +76,9 @@ void GPUDrawScanline::BeginDraw(const GSRasterizerData* data) m_sp = m_sp_map[sel]; } -void GPUDrawScanline::EndDraw(uint64 frame, uint64 ticks, int pixels) +void GPUDrawScanline::EndDraw(uint64 frame, uint64 ticks, int actual, int total) { - m_ds_map.UpdateStats(frame, ticks, pixels); + m_ds_map.UpdateStats(frame, ticks, actual, total); } #ifndef ENABLE_JIT_RASTERIZER diff --git a/plugins/GSdx/GPUDrawScanline.h b/plugins/GSdx/GPUDrawScanline.h index eb95451b4b..d7c7e26155 100644 --- a/plugins/GSdx/GPUDrawScanline.h +++ b/plugins/GSdx/GPUDrawScanline.h @@ -61,7 +61,7 @@ public: // IDrawScanline void BeginDraw(const GSRasterizerData* data); - void EndDraw(uint64 frame, uint64 ticks, int pixels); + void EndDraw(uint64 frame, uint64 ticks, int actual, int total); #ifndef ENABLE_JIT_RASTERIZER diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index c2bc515e45..222ff373d0 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -96,9 +96,9 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data) m_sp = m_sp_map[sel]; } -void GSDrawScanline::EndDraw(uint64 frame, uint64 ticks, int pixels) +void GSDrawScanline::EndDraw(uint64 frame, uint64 ticks, int actual, int total) { - m_ds_map.UpdateStats(frame, ticks, pixels); + m_ds_map.UpdateStats(frame, ticks, actual, total); } #ifndef ENABLE_JIT_RASTERIZER @@ -434,7 +434,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS skip = left & 7; steps = pixels + skip - 8; left -= skip; - test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]; + test = GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[skip]) | GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]); } else { @@ -1524,7 +1524,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(!sel.notest) { - test = GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]; + test = GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]); } } diff --git a/plugins/GSdx/GSDrawScanline.h b/plugins/GSdx/GSDrawScanline.h index 3e0e3b0464..f1acc6a0b9 100644 --- a/plugins/GSdx/GSDrawScanline.h +++ b/plugins/GSdx/GSDrawScanline.h @@ -68,7 +68,7 @@ public: // IDrawScanline void BeginDraw(const GSRasterizerData* data); - void EndDraw(uint64 frame, uint64 ticks, int pixels); + void EndDraw(uint64 frame, uint64 ticks, int actual, int total); void DrawRect(const GSVector4i& r, const GSVertexSW& v); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index 9de1c01107..3909a6e787 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -24,24 +24,24 @@ #if _M_SSE >= 0x501 -const GSVector8i GSDrawScanlineCodeGenerator::m_test[16] = +__aligned(const uint8, 8) GSDrawScanlineCodeGenerator::m_test[16][8] = { - GSVector8i::zero(), - GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), - GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), - GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), - GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), - GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000), - GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000), - GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000), - GSVector8i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), - GSVector8i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), - GSVector8i(0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), - GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), - GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff), - GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff), - GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff), - GSVector8i::zero(), + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, + {0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, }; const GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4] = diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 808f6d1f84..282285bcbd 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -135,7 +135,7 @@ public: GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); #if _M_SSE >= 0x501 - static const GSVector8i m_test[16]; + static __aligned(const uint8, 8) m_test[16][8]; static const GSVector8 m_log2_coef[4]; #else static const GSVector4i m_test[8]; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 3924291cfe..6148f33b9f 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -2824,7 +2824,8 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, xor(dst, eax); break; case 2: - vpextrw(eax, src, i * 2); + if(i == 0) vmovd(eax, src); + else vpextrw(eax, src, i * 2); mov(dst, ax); break; } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp index 02ef50003b..03a51a777d 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp @@ -268,17 +268,16 @@ void GSDrawScanlineCodeGenerator::Init() sub(ebx, edx); // GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))]; - - shl(edx, 5); - - vmovdqa(ymm7, ptr[edx + (size_t)&m_test[0]]); - + mov(eax, ecx); sar(eax, 31); and(eax, ecx); - shl(eax, 5); - vpor(ymm7, ptr[eax + (size_t)&m_test[15]]); + vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[0]]); + vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)&m_test[15]]); + vpor(ymm7, ymm0); + + shl(edx, 5); } else { @@ -592,9 +591,8 @@ void GSDrawScanlineCodeGenerator::Step() mov(edx, ecx); sar(edx, 31); and(edx, ecx); - shl(edx, 5); - vmovdqa(ymm7, ptr[edx + (size_t)&m_test[15]]); + vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[15]]); } } @@ -2843,7 +2841,8 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, xor(dst, eax); break; case 2: - vpextrw(eax, src, j * 2); + if(j == 0) vmovd(eax, src); + else vpextrw(eax, src, j * 2); mov(dst, ax); break; } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 9d15a89762..eb95e857ce 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -2902,49 +2902,33 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, { Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - #if _M_SSE >= 0x401 - switch(psm) { case 0: if(i == 0) movd(dst, src); + #if _M_SSE >= 0x401 else pextrd(dst, src, i); - break; - case 1: - if(i == 0) movd(eax, src); - else pextrd(eax, src, i); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - pextrw(eax, src, i * 2); - mov(dst, ax); - break; - } - - #else - - switch(psm) - { - case 0: - if(i == 0) movd(dst, src); + #else else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);} + #endif break; case 1: if(i == 0) movd(eax, src); + #if _M_SSE >= 0x401 + else pextrd(eax, src, i); + #else else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);} + #endif xor(eax, dst); and(eax, 0xffffff); xor(dst, eax); break; case 2: - pextrw(eax, src, i * 2); + if(i == 0) movd(eax, src); + else pextrw(eax, src, i * 2); mov(dst, ax); break; } - - #endif } void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index 68cd72fa29..60af124ff4 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -32,7 +32,7 @@ protected: struct ActivePtr { uint64 frame, frames; - uint64 ticks, pixels; + uint64 ticks, actual, total; VALUE f; }; @@ -84,7 +84,7 @@ public: return m_active->f; } - void UpdateStats(uint64 frame, uint64 ticks, int pixels) + void UpdateStats(uint64 frame, uint64 ticks, int actual, int total) { if(m_active) { @@ -95,7 +95,10 @@ public: } m_active->ticks += ticks; - m_active->pixels += pixels; + m_active->actual += actual; + m_active->total += total; + + ASSERT(m_active->total >= m_active->actual); } } @@ -124,15 +127,15 @@ public: if(p->frames > 0) { - uint64 tpp = p->pixels > 0 ? p->ticks / p->pixels : 0; + uint64 tpp = p->actual > 0 ? p->ticks / p->actual : 0; uint64 tpf = p->frames > 0 ? p->ticks / p->frames : 0; - uint64 ppf = p->frames > 0 ? p->pixels / p->frames : 0; + uint64 ppf = p->frames > 0 ? p->actual / p->frames : 0; - printf("[%014llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n", + printf("[%014llx]%c %6.2f%% %5.2f%% f %4lld t %12lld p %12lld w %12lld tpp %4lld tpf %9lld ppf %9lld\n", (uint64)key, m_map.find(key) == m_map.end() ? '*' : ' ', - (float)(tpf * 10000 / 50000000) / 100, + (float)(tpf * 10000 / 34000000) / 100, (float)(tpf * 10000 / ttpf) / 100, - p->frames, p->pixels, + p->frames, p->ticks, p->actual, p->total - p->actual, tpp, tpf, ppf); } } diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index ba11a90a37..f9d0f45d14 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -37,8 +37,9 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe , m_id(id) , m_threads(threads) , m_perfmon(perfmon) - , m_pixels(0) { + memset(&m_pixels, 0, sizeof(m_pixels)); + m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false); m_edge.count = 0; @@ -110,11 +111,11 @@ void GSRasterizer::Queue(shared_ptr data) int GSRasterizer::GetPixels(bool reset) { - int pixels = m_pixels; + int pixels = m_pixels.sum; if(reset) { - m_pixels = 0; + m_pixels.sum = 0; } return pixels; @@ -126,6 +127,9 @@ void GSRasterizer::Draw(GSRasterizerData* data) if(data->vertex != NULL && data->vertex_count == 0 || data->index != NULL && data->index_count == 0) return; + m_pixels.actual = 0; + m_pixels.total = 0; + data->start = __rdtsc(); m_ds->BeginDraw(data); @@ -212,11 +216,13 @@ void GSRasterizer::Draw(GSRasterizerData* data) _mm256_zeroupper(); #endif - data->pixels = m_pixels; + data->pixels = m_pixels.actual; uint64 ticks = __rdtsc() - data->start; - m_ds->EndDraw(data->frame, ticks, m_pixels); + m_pixels.sum += m_pixels.actual; + + m_ds->EndDraw(data->frame, ticks, m_pixels.actual, m_pixels.total); } template @@ -234,11 +240,9 @@ void GSRasterizer::DrawPoint(const GSVertexSW* vertex, int vertex_count, const u { if(IsOneOfMyScanlines(p.y)) { - m_pixels++; - m_ds->SetupPrim(vertex, index, GSVertexSW::zero()); - m_ds->DrawScanline(1, p.x, p.y, v); + DrawScanline(1, p.x, p.y, v); } } } @@ -257,11 +261,9 @@ void GSRasterizer::DrawPoint(const GSVertexSW* vertex, int vertex_count, const u { if(IsOneOfMyScanlines(p.y)) { - m_pixels++; - m_ds->SetupPrim(vertex, tmp_index, GSVertexSW::zero()); - m_ds->DrawScanline(1, p.x, p.y, v); + DrawScanline(1, p.x, p.y, v); } } } @@ -321,15 +323,13 @@ void GSRasterizer::DrawLine(const GSVertexSW* vertex, const uint32* index) if(pixels > 0) { - m_pixels += pixels; - GSVertexSW dscan = dv / dv.p.xxxx(); scan += dscan * (l - scan.p).xxxx(); m_ds->SetupPrim(vertex, index, dscan); - m_ds->DrawScanline(pixels, left, p.y, scan); + DrawScanline(pixels, left, p.y, scan); } } } @@ -560,8 +560,6 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co scan.c = scan.c + dscan.c * prestep; AddScanline(e++, pixels, left, top, scan); - - //m_pixels += pixels; m_ds->DrawScanline(pixels, left, top, scan); } top++; @@ -605,7 +603,10 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index) { m_ds->DrawRect(r, scan); - m_pixels += r.width() * r.height(); + int pixels = r.width() * r.height(); + + m_pixels.actual += pixels; + m_pixels.total += pixels; } else { @@ -619,7 +620,10 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index) m_ds->DrawRect(r, scan); - m_pixels += r.width() * r.height(); + int pixels = r.width() * r.height(); + + m_pixels.actual += pixels; + m_pixels.total += pixels; top = r.bottom + ((m_threads - 1) << THREAD_HEIGHT); } @@ -651,9 +655,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index) { if(IsOneOfMyScanlines(r.top)) { - m_pixels += r.width(); - - m_ds->DrawScanline(r.width(), r.left, r.top, scan); + DrawScanline(r.width(), r.left, r.top, scan); } if(++r.top >= r.bottom) break; @@ -883,9 +885,7 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS int left = e->_pad.i32[1]; int top = e->_pad.i32[2]; - m_pixels += pixels; - - m_ds->DrawScanline(pixels, left, top, *e++); + DrawScanline(pixels, left, top, *e++); } while(e < ee); } @@ -897,9 +897,7 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS int left = e->_pad.i32[1]; int top = e->_pad.i32[2]; - m_pixels += pixels; - - m_ds->DrawEdge(pixels, left, top, *e++); + DrawEdge(pixels, left, top, *e++); } while(e < ee); } @@ -908,6 +906,33 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS } } +#if _M_SSE >= 0x501 +#define PIXELS_PER_LOOP 8 +#else +#define PIXELS_PER_LOOP 4 +#endif + +void GSRasterizer::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) +{ + m_pixels.actual += pixels; + m_pixels.total += ((left + pixels + (PIXELS_PER_LOOP - 1)) & ~(PIXELS_PER_LOOP - 1)) - (left & (PIXELS_PER_LOOP - 1)); + //m_pixels.total += ((left + pixels + (PIXELS_PER_LOOP - 1)) & ~(PIXELS_PER_LOOP - 1)) - left; + + ASSERT(m_pixels.actual <= m_pixels.total); + + m_ds->DrawScanline(pixels, left, top, scan); +} + +void GSRasterizer::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) +{ + m_pixels.actual += 1; + m_pixels.total += PIXELS_PER_LOOP - 1; + + ASSERT(m_pixels.actual <= m_pixels.total); + + m_ds->DrawEdge(pixels, left, top, scan); +} + // GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon) diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 4281b201eb..63279c47d5 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -86,7 +86,7 @@ public: virtual ~IDrawScanline() {} virtual void BeginDraw(const GSRasterizerData* data) = 0; - virtual void EndDraw(uint64 frame, uint64 ticks, int pixels) = 0; + virtual void EndDraw(uint64 frame, uint64 ticks, int actual, int total) = 0; #ifdef ENABLE_JIT_RASTERIZER @@ -134,7 +134,7 @@ protected: GSVector4 m_fscissor_x; GSVector4 m_fscissor_y; struct {GSVertexSW* buff; int count;} m_edge; - int m_pixels; + struct {int sum, actual, total;} m_pixels; typedef void (GSRasterizer::*DrawPrimPtr)(const GSVertexSW* v, int count); @@ -151,6 +151,9 @@ protected: __forceinline void AddScanline(GSVertexSW* e, int pixels, int left, int top, const GSVertexSW& scan); __forceinline void Flush(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan, bool edge = false); + __forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan); + __forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan); + public: GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon); virtual ~GSRasterizer(); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 4cb4043d03..bf55aed415 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -2321,7 +2321,7 @@ void GSState::GrowVertexBuffer() template __forceinline void GSState::VertexKick(uint32 skip) { - ASSERT(m_vertex.tail < m_vertex.maxcount); + ASSERT(m_vertex.tail < m_vertex.maxcount + 3); size_t head = m_vertex.head; size_t tail = m_vertex.tail; @@ -2340,7 +2340,7 @@ __forceinline void GSState::VertexKick(uint32 skip) GSVector4i xy = v1.xxxx().sub16(m_ofxy); - #if _M_SSE >= 0x401 + #if _M_SSE >= 0x501 GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4))); #else GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy())); @@ -2421,14 +2421,21 @@ __forceinline void GSState::VertexKick(uint32 skip) case GS_TRIANGLELIST: case GS_TRIANGLESTRIP: // TODO: any way to do a 16-bit integer cross product? + // cross product is zero most of the time because either of the vertices are the same + /* cross = GSVector4(v2.xyxyl().i16to32().sub32(v0.upl32(v1).i16to32())); // x20, y20, x21, y21 cross = cross * cross.wzwz(); // x20 * y21, y20 * x21 test |= GSVector4i::cast(cross == cross.yxwz()); + */ + test = (test | v0 == v1) | (v1 == v2 | v0 == v2); break; case GS_TRIANGLEFAN: + /* cross = GSVector4(v2.xyxyl().i16to32().sub32(v3.upl32(v1).i16to32())); // x23, y23, x21, y21 cross = cross * cross.wzwz(); // x23 * y21, y23 * x21 test |= GSVector4i::cast(cross == cross.yxwz()); + */ + test = (test | v0 == v1) | (v1 == v2 | v0 == v2); break; } diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 890491e182..7cd5eafb6f 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -1261,7 +1261,7 @@ public: #endif - #if _M_SSE >= 0x401 + #if _M_SSE >= 0x501 template __forceinline GSVector4i blend32(const GSVector4i& v) const { @@ -3725,6 +3725,68 @@ public: // + static __forceinline GSVector8i i8to16c(const void* p) + { + return GSVector8i(_mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)p))); + } + + static __forceinline GSVector8i u8to16c(const void* p) + { + return GSVector8i(_mm256_cvtepu8_epi16(_mm_load_si128((__m128i*)p))); + } + + static __forceinline GSVector8i i8to32c(const void* p) + { + return GSVector8i(_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*)p))); + } + + static __forceinline GSVector8i u8to32c(const void* p) + { + return GSVector8i(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)p))); + } + + static __forceinline GSVector8i i8to64c(int i) + { + return GSVector8i(_mm256_cvtepi8_epi64(_mm_cvtsi32_si128(i))); + } + + static __forceinline GSVector8i u8to64c(int i) + { + return GSVector8i(_mm256_cvtepu8_epi64(_mm_cvtsi32_si128(i))); + } + + static __forceinline GSVector8i i16to32c(const void* p) + { + return GSVector8i(_mm256_cvtepi16_epi32(_mm_load_si128((__m128i*)p))); + } + + static __forceinline GSVector8i u16to32c(const void* p) + { + return GSVector8i(_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)p))); + } + + static __forceinline GSVector8i i16to64c(const void* p) + { + return GSVector8i(_mm256_cvtepi16_epi64(_mm_loadl_epi64((__m128i*)p))); + } + + static __forceinline GSVector8i u16to64c(const void* p) + { + return GSVector8i(_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)p))); + } + + static __forceinline GSVector8i i32to64c(const void* p) + { + return GSVector8i(_mm256_cvtepi32_epi64(_mm_load_si128((__m128i*)p))); + } + + static __forceinline GSVector8i u32to64c(const void* p) + { + return GSVector8i(_mm256_cvtepu32_epi64(_mm_load_si128((__m128i*)p))); + } + + // + template __forceinline GSVector8i srl() const { return GSVector8i(_mm256_srli_si256(m, i)); diff --git a/plugins/GSdx/xbyak/xbyak_mnemonic.h b/plugins/GSdx/xbyak/xbyak_mnemonic.h index c8a785a3f7..02a60cd14f 100644 --- a/plugins/GSdx/xbyak/xbyak_mnemonic.h +++ b/plugins/GSdx/xbyak/xbyak_mnemonic.h @@ -903,7 +903,7 @@ void vpmovzxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F3 void vpshufd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x70, true, -1, imm); } void vpshufhw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x70, true, -1, imm); } void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x70, true, -1, imm); } -void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x17, false, -1); } +void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x17, true, -1); } void vrcpps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x53, true, -1); } void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x52, true, -1); } void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x51, true, -1); }