From b31634df8fa49f35646777973d2a73d64d0c9ec0 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Sat, 12 Mar 2011 22:10:58 +0000 Subject: [PATCH] GSdx: using mipmap levels (only per batch, no tri-linear) and a couple of small changes, including the stdcall fix for linux. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4419 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp | 3 +- plugins/GSdx/GPURendererSW.cpp | 6 +- plugins/GSdx/GS.h | 2 +- plugins/GSdx/GSDirtyRect.cpp | 4 +- plugins/GSdx/GSDrawScanline.cpp | 2 +- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 3 - .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 11 +- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 11 +- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 9 +- plugins/GSdx/GSLocalMemory.cpp | 2 +- plugins/GSdx/GSLocalMemory.h | 18 +- plugins/GSdx/GSRasterizer.cpp | 83 +- plugins/GSdx/GSRasterizer.h | 2 +- plugins/GSdx/GSRenderer.cpp | 63 +- plugins/GSdx/GSRenderer.h | 4 +- plugins/GSdx/GSRendererDX.h | 2 +- plugins/GSdx/GSRendererHW.h | 2 +- plugins/GSdx/GSRendererSW.cpp | 164 +++- plugins/GSdx/GSState.cpp | 41 + plugins/GSdx/GSTextureCache.cpp | 4 +- plugins/GSdx/GSTextureCacheSW.cpp | 44 +- plugins/GSdx/GSTextureCacheSW.h | 3 +- plugins/GSdx/GSVector.h | 824 +++++++++++++++--- plugins/GSdx/GSVertexSW.h | 132 ++- plugins/GSdx/GSVertexTrace.cpp | 53 ++ plugins/GSdx/GSVertexTrace.h | 24 +- plugins/GSdx/stdafx.h | 2 +- 27 files changed, 1178 insertions(+), 340 deletions(-) diff --git a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp index ed31b4c136..1e2dbfd629 100644 --- a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp @@ -128,9 +128,8 @@ void GPUDrawScanlineCodeGenerator::Init() add(edi, edx); lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]); - // int steps = right - left - 8; + // int steps = pixels - 8; - sub(ecx, edx); sub(ecx, 8); if(m_sel.dtd) diff --git a/plugins/GSdx/GPURendererSW.cpp b/plugins/GSdx/GPURendererSW.cpp index f13d2e611d..97afc7de84 100644 --- a/plugins/GSdx/GPURendererSW.cpp +++ b/plugins/GSdx/GPURendererSW.cpp @@ -174,10 +174,10 @@ void GPURendererSW::VertexKick() int x = (int)(m_v.XY.X + m_env.DROFF.X) << m_scale.x; int y = (int)(m_v.XY.Y + m_env.DROFF.Y) << m_scale.y; - int s = m_v.UV.X; - int t = m_v.UV.Y; + int u = m_v.UV.X; + int v = m_v.UV.Y; - GSVector4 pt(x, y, s, t); + GSVector4 pt(x, y, u, v); dst.p = pt.xyxy(GSVector4::zero()); dst.t = (pt.zwzw(GSVector4::zero()) + GSVector4(0.125f)) * 256.0f; diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h index 3169bf0880..ac32878bef 100644 --- a/plugins/GSdx/GS.h +++ b/plugins/GSdx/GS.h @@ -815,7 +815,7 @@ union }; }; REG_END2 - __forceinline bool IsRepeating() {return (1 << TW) > (int)(TBW << 6);} + __forceinline bool IsRepeating() {return (1 << TW) > (int)(TBW << 6) || (PSM == PSM_PSMT8 || PSM == PSM_PSMT4) && TBW == 1;} REG_END2 REG64_(GIFReg, TEX1) diff --git a/plugins/GSdx/GSDirtyRect.cpp b/plugins/GSdx/GSDirtyRect.cpp index 0ce2017e17..bf4ede42f7 100644 --- a/plugins/GSdx/GSDirtyRect.cpp +++ b/plugins/GSdx/GSDirtyRect.cpp @@ -54,7 +54,7 @@ GSVector4i GSDirtyRect::GetDirtyRect(const GIFRegTEX0& TEX0) } else { - r = GSVector4i(left, top, right, bottom).ralign(src); + r = GSVector4i(left, top, right, bottom).ralign(src); } return r; @@ -77,7 +77,7 @@ GSVector4i GSDirtyRectList::GetDirtyRectAndClear(const GIFRegTEX0& TEX0, const G GSVector2i bs = GSLocalMemory::m_psm[TEX0.PSM].bs; - return r.ralign(bs).rintersect(GSVector4i(0, 0, size.x, size.y)); + return r.ralign(bs).rintersect(GSVector4i(0, 0, size.x, size.y)); } return GSVector4i::zero(); diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 04d8cb7ab9..ad995dedbf 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -189,7 +189,7 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col, color = color.andnot(mask); - GSVector4i br = r.ralign(GSVector2i(8 * 4 / sizeof(T), 8)); + GSVector4i br = r.ralign(GSVector2i(8 * 4 / sizeof(T), 8)); if(!br.rempty()) { diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index dd3975bce7..8a1dbd85f2 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -19,9 +19,6 @@ * */ -// TODO: x64 (use the extra regs to avoid spills of zs, zd, uf, vf, rb, ga and keep a few constants in the last two like aref or afix) -// TODO: for edges doing 4 pixels is wasteful (needed memory access * 4) - #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index cd432f9e8d..bbaa210067 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -224,10 +224,9 @@ void GSDrawScanlineCodeGenerator::Init() sub(rbx, rdx); - // int steps = right - left - 4; + // int steps = pixels + skip - 4; - sub(rcx, rbx); - sub(rcx, 4); + lea(rcx, ptr[rcx + rdx - 4]); // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; @@ -478,6 +477,12 @@ void GSDrawScanlineCodeGenerator::Step() vpaddw(xmm13, xmm1); vpaddw(xmm14, xmm2); + + // FIXME: color may underflow and roll over at the end of the line, if decreasing + + vpxor(xmm0, xmm0); + vpmaxsw(xmm13, xmm0); + vpmaxsw(xmm14, xmm0); } else { diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 1ff9c35bf1..ff75b94f9f 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -252,10 +252,9 @@ void GSDrawScanlineCodeGenerator::Init() sub(ebx, edx); - // int steps = right - left - 4; + // int steps = pixels + skip - 4; - sub(ecx, ebx); - sub(ecx, 4); + lea(ecx, ptr[ecx + edx - 4]); // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; @@ -553,6 +552,12 @@ void GSDrawScanlineCodeGenerator::Step() vpaddw(xmm5, ptr[&m_local.temp.rb]); vpaddw(xmm6, ptr[&m_local.temp.ga]); + // FIXME: color may underflow and roll over at the end of the line, if decreasing + + vpxor(xmm7, xmm7); + vpmaxsw(xmm5, xmm7); + vpmaxsw(xmm6, xmm7); + vmovdqa(ptr[&m_local.temp.rb], xmm5); vmovdqa(ptr[&m_local.temp.ga], xmm6); } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 7586038513..ef3faa26d8 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -251,8 +251,7 @@ void GSDrawScanlineCodeGenerator::Init() // int steps = right - left - 4; - sub(ecx, ebx); - sub(ecx, 4); + lea(ecx, ptr[ecx + edx - 4]); // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; @@ -555,6 +554,12 @@ void GSDrawScanlineCodeGenerator::Step() paddw(xmm5, ptr[&m_local.temp.rb]); paddw(xmm6, ptr[&m_local.temp.ga]); + // FIXME: color may underflow and roll over at the end of the line, if decreasing + + pxor(xmm7, xmm7); + pmaxsw(xmm5, xmm7); + pmaxsw(xmm6, xmm7); + movdqa(ptr[&m_local.temp.rb], xmm5); movdqa(ptr[&m_local.temp.ga], xmm6); } diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index a0fe87fe6d..8df9b14164 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -1708,7 +1708,7 @@ void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, TEX0.TBW = o->bw; TEX0.PSM = o->psm; - GSVector4i cr = r.ralign(psm.bs); + GSVector4i cr = r.ralign(psm.bs); bool aligned = ((size_t)(dst + (cr.left - r.left) * sizeof(uint32)) & 0xf) == 0; diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index c6cf33dd9d..1bd4943e71 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -173,14 +173,14 @@ public: static uint32 BlockNumber8(int x, int y, uint32 bp, uint32 bw) { - ASSERT((bw & 1) == 0); + // ASSERT((bw & 1) == 0); // allowed for mipmap levels return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; } static uint32 BlockNumber4(int x, int y, uint32 bp, uint32 bw) { - ASSERT((bw & 1) == 0); + // ASSERT((bw & 1) == 0); // allowed for mipmap levels return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; } @@ -291,6 +291,7 @@ public: { uint32 page = (bp >> 5) + (y >> 5) * bw + (x >> 6); uint32 word = (page << 11) + pageOffset32[bp & 0x1f][y & 0x1f][x & 0x3f]; + return word; } @@ -298,6 +299,7 @@ public: { uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6); uint32 word = (page << 12) + pageOffset16[bp & 0x1f][y & 0x3f][x & 0x3f]; + return word; } @@ -305,22 +307,27 @@ public: { uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6); uint32 word = (page << 12) + pageOffset16S[bp & 0x1f][y & 0x3f][x & 0x3f]; + return word; } static __forceinline uint32 PixelAddress8(int x, int y, uint32 bp, uint32 bw) { - ASSERT((bw & 1) == 0); + // ASSERT((bw & 1) == 0); // allowed for mipmap levels + uint32 page = (bp >> 5) + (y >> 6) * (bw >> 1) + (x >> 7); uint32 word = (page << 13) + pageOffset8[bp & 0x1f][y & 0x3f][x & 0x7f]; + return word; } static __forceinline uint32 PixelAddress4(int x, int y, uint32 bp, uint32 bw) { - ASSERT((bw & 1) == 0); + // ASSERT((bw & 1) == 0); // allowed for mipmap levels + uint32 page = (bp >> 5) + (y >> 7) * (bw >> 1) + (x >> 7); uint32 word = (page << 14) + pageOffset4[bp & 0x1f][y & 0x7f][x & 0x7f]; + return word; } @@ -328,6 +335,7 @@ public: { uint32 page = (bp >> 5) + (y >> 5) * bw + (x >> 6); uint32 word = (page << 11) + pageOffset32Z[bp & 0x1f][y & 0x1f][x & 0x3f]; + return word; } @@ -335,6 +343,7 @@ public: { uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6); uint32 word = (page << 12) + pageOffset16Z[bp & 0x1f][y & 0x3f][x & 0x3f]; + return word; } @@ -342,6 +351,7 @@ public: { uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6); uint32 word = (page << 12) + pageOffset16SZ[bp & 0x1f][y & 0x3f][x & 0x3f]; + return word; } diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 89c5333377..a457e14843 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -129,7 +129,7 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v) m_ds->SetupPrim(v, *v); - m_ds->DrawScanline(p.x + 1, p.x, p.y, *v); + m_ds->DrawScanline(1, p.x, p.y, *v); } } } @@ -144,16 +144,10 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) if(m_ds->IsEdge()) { - GSVertexSW dscan; - - dscan.p = GSVector4::zero(); - dscan.t = GSVector4::zero(); - dscan.c = GSVector4::zero(); - DrawEdge(v[0], v[1], dv, i, 0); DrawEdge(v[0], v[1], dv, i, 1); - Flush(v, dscan, true); + Flush(v, GSVertexSW::zero(), true); return; } @@ -170,13 +164,13 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) GSVertexSW l, dl; - l.p = v[0].p.blend8(v[1].p, mask); - l.t = v[0].t.blend8(v[1].t, mask); - l.c = v[0].c.blend8(v[1].c, mask); + l.p = v[0].p.blend32(v[1].p, mask); + l.t = v[0].t.blend32(v[1].t, mask); + l.c = v[0].c.blend32(v[1].c, mask); GSVector4 r; - r = v[1].p.blend8(v[0].p, mask); + r = v[1].p.blend32(v[0].p, mask); GSVector4i p(l.p); @@ -216,7 +210,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) e->p.i16[0] = (int16)p.x; e->p.i16[1] = (int16)p.y; - e->p.i16[2] = (int16)(p.x + 1); + e->p.i16[2] = 1; e++; } @@ -231,13 +225,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) m_stats.pixels += m_edge.count; - GSVertexSW dscan; - - dscan.p = GSVector4::zero(); - dscan.t = GSVector4::zero(); - dscan.c = GSVector4::zero(); - - Flush(v, dscan); + Flush(v, GSVertexSW::zero()); } } @@ -278,8 +266,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) int i = (aabb == bccb).mask() & 7; GSVector4 tbf = aabb.xzxz(bccb).ceil(); - GSVector4 tbmax = tbf.max(m_fscissor.yyyy()); - GSVector4 tbmin = tbf.min(m_fscissor.wwww()); + GSVector4 tbmax = tbf.max(m_fscissor.ywyw()); + GSVector4 tbmin = tbf.min(m_fscissor.ywyw()); GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); dv[0] = v[1] - v[0]; @@ -335,13 +323,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) DrawEdge(v[0], v[2], dv[1], i & 2, j & 2); DrawEdge(v[1], v[2], dv[2], i & 4, j & 4); - GSVertexSW dscan; - - dscan.p = GSVector4::zero(); - dscan.t = GSVector4::zero(); - dscan.c = GSVector4::zero(); - - Flush(v, dscan, true); + Flush(v, GSVertexSW::zero(), true); } switch(i) @@ -365,6 +347,10 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) if(tb.y < tb.w) { + // TODO: j == 1 (x2 < x3 < x0 < x1) + // v[3] isn't accurate enough, it may leave gaps horizontally if it happens to be on the left side of the triangle + // example: previous triangle's scanline ends on 48.9999, this one's starts from 49.0001, the pixel at 49 isn't drawn + GSVertexSW l = v[1 + (1 << j)]; GSVertexSW dl = ddv[2 - j]; @@ -436,8 +422,8 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const if(IsOneOfMyScanlines(top)) { GSVector4 lrf = l.p.ceil(); - GSVector4 lrmax = lrf.max(m_fscissor.xxxx()); - GSVector4 lrmin = lrf.min(m_fscissor.zzzz()); + GSVector4 lrmax = lrf.max(m_fscissor.xzxz()); + GSVector4 lrmin = lrf.min(m_fscissor.xzxz()); GSVector4i lr = GSVector4i(lrmax.xxyy(lrmin)); int left = lr.extract32<0>(); @@ -453,7 +439,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const e->p.i16[0] = (int16)left; e->p.i16[1] = (int16)top; - e->p.i16[2] = (int16)right; + e->p.i16[2] = (int16)pixels; e++; } @@ -473,12 +459,12 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices) GSVector4 mask = (vertices[0].p < vertices[1].p).xyzw(GSVector4::zero()); - v[0].p = vertices[1].p.blend8(vertices[0].p, mask); - v[0].t = vertices[1].t.blend8(vertices[0].t, mask); + v[0].p = vertices[1].p.blend32(vertices[0].p, mask); + v[0].t = vertices[1].t.blend32(vertices[0].t, mask); v[0].c = vertices[1].c; - v[1].p = vertices[0].p.blend8(vertices[1].p, mask); - v[1].t = vertices[0].t.blend8(vertices[1].t, mask); + v[1].p = vertices[0].p.blend32(vertices[1].p, mask); + v[1].t = vertices[0].t.blend32(vertices[1].t, mask); GSVector4i r(v[0].p.xyxy(v[1].p).ceil()); @@ -500,18 +486,13 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices) return; } - GSVector4 zero = GSVector4::zero(); - - GSVertexSW dedge, dscan; - - dedge.p = zero; - dscan.p = zero; - - dedge.c = zero; - dscan.c = zero; + GSVertexSW dedge = GSVertexSW::zero(); + GSVertexSW dscan = GSVertexSW::zero(); GSVertexSW dv = v[1] - v[0]; + GSVector4 zero = GSVector4::zero(); + dedge.t = (dv.t / dv.p.yyyy()).xyxy(zero).wyww(); dscan.t = (dv.t / dv.p.xxxx()).xyxy(zero).xwww(); @@ -526,7 +507,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices) { m_stats.pixels += r.width(); - m_ds->DrawScanline(r.right, r.left, r.top, scan); + m_ds->DrawScanline(r.width(), r.left, r.top, scan); } if(++r.top >= r.bottom) break; @@ -555,7 +536,6 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS { GSVector4 tbmax = lrtb.max(m_fscissor.yyyy()); GSVector4 tbmin = lrtb.min(m_fscissor.wwww()); - GSVector4i tb = GSVector4i(tbmax.zwzw(tbmin)); int top, bottom; @@ -609,7 +589,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS e->p.i16[0] = (int16)xi; e->p.i16[1] = (int16)top; - e->p.i16[2] = (int16)(xi + 1); + e->p.i16[2] = 1; e++; } @@ -637,7 +617,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS e->p.i16[0] = (int16)xi; e->p.i16[1] = (int16)top; - e->p.i16[2] = (int16)(xi + 1); + e->p.i16[2] = 1; e++; } @@ -653,7 +633,6 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS { GSVector4 lrmax = lrtb.max(m_fscissor.xxxx()); GSVector4 lrmin = lrtb.min(m_fscissor.zzzz()); - GSVector4i lr = GSVector4i(lrmax.xyxy(lrmin)); int left, right; @@ -707,7 +686,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS e->p.i16[0] = (int16)left; e->p.i16[1] = (int16)yi; - e->p.i16[2] = (int16)(left + 1); + e->p.i16[2] = 1; e++; } @@ -735,7 +714,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS e->p.i16[0] = (int16)left; e->p.i16[1] = (int16)yi; - e->p.i16[2] = (int16)(left + 1); + e->p.i16[2] = 1; e++; } diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 308bda8b21..dfb7725621 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -42,7 +42,7 @@ class IDrawScanline : public GSAlignedClass<32> { public: typedef void (__fastcall *SetupPrimPtr)(const GSVertexSW* vertices, const GSVertexSW& dscan); - typedef void (__fastcall *DrawScanlinePtr)(int right, int left, int top, const GSVertexSW& scan); + typedef void (__fastcall *DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan); typedef void (IDrawScanline::*DrawRectPtr)(const GSVector4i& r, const GSVertexSW& v); // TODO: jit protected: diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp index 2aad4d892e..99ec01710a 100644 --- a/plugins/GSdx/GSRenderer.cpp +++ b/plugins/GSdx/GSRenderer.cpp @@ -35,6 +35,7 @@ GSRenderer::GSRenderer() m_filter = theApp.GetConfig("filter", 1); m_vsync = !!theApp.GetConfig("vsync", 0); m_aa1 = !!theApp.GetConfig("aa1", 0); + m_mipmap = !!theApp.GetConfig("mipmap", 1); s_n = 0; s_dump = !!theApp.GetConfig("dump", 0); @@ -513,6 +514,9 @@ void GSRenderer::KeyEvent(GSKeyEventData* e) case VK_DELETE: m_aa1 = !m_aa1; return; + case VK_INSERT: + m_mipmap = !m_mipmap; + return; } #else @@ -523,25 +527,23 @@ void GSRenderer::KeyEvent(GSKeyEventData* e) } } -void GSRenderer::GetTextureMinMax(GSVector4i& r, bool linear) +void GSRenderer::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear) { - const GSDrawingContext* context = m_context; - - int tw = context->TEX0.TW; - int th = context->TEX0.TH; + int tw = TEX0.TW; + int th = TEX0.TH; int w = 1 << tw; int h = 1 << th; GSVector4i tr(0, 0, w, h); - int wms = context->CLAMP.WMS; - int wmt = context->CLAMP.WMT; + int wms = CLAMP.WMS; + int wmt = CLAMP.WMT; - int minu = (int)context->CLAMP.MINU; - int minv = (int)context->CLAMP.MINV; - int maxu = (int)context->CLAMP.MAXU; - int maxv = (int)context->CLAMP.MAXV; + int minu = (int)CLAMP.MINU; + int minv = (int)CLAMP.MINV; + int maxu = (int)CLAMP.MAXU; + int maxv = (int)CLAMP.MAXV; GSVector4i vr = tr; @@ -619,7 +621,7 @@ void GSRenderer::GetTextureMinMax(GSVector4i& r, bool linear) if(vr.x < uv.x) vr.x = uv.x; if(vr.z > uv.z + 1) vr.z = uv.z + 1; break; - case CLAMP_REGION_REPEAT: // TODO + case CLAMP_REGION_REPEAT: break; default: __assume(0); @@ -635,9 +637,7 @@ void GSRenderer::GetTextureMinMax(GSVector4i& r, bool linear) if(vr.y < uv.y) vr.y = uv.y; if(vr.w > uv.w + 1) vr.w = uv.w + 1; break; - case CLAMP_REGION_REPEAT: // TODO - //Xenosaga 2 and 3 use it - //printf("gsdx: CLAMP_REGION_REPEAT not implemented, please report\n"); + case CLAMP_REGION_REPEAT: break; default: __assume(0); @@ -791,39 +791,6 @@ bool GSRenderer::TryAlphaTest(uint32& fm, uint32& zm) return true; } -bool GSRenderer::IsLinear() -{ - const GIFRegTEX1& TEX1 = m_context->TEX1; - - bool mmin = TEX1.IsMinLinear(); - bool mmag = TEX1.IsMagLinear(); - - if(mmag == mmin || TEX1.MXL == 0) // MXL == 0 => MMIN ignored, tested it on ps2 - { - return mmag; - } - - // if FST => assume Q = 1.0f (should not, but Q is very often bogus, 0 or DEN) - // Fixme : Why should Q be bogus? (it used to be - Gabest) - - if(!TEX1.LCM && !PRIM->FST) - { - float K = (float)TEX1.K / 16; - float f = (float)(1 << TEX1.L) / log(2.0f); - - // TODO: abs(Qmin) may not be <= abs(Qmax), check the sign - - float LODmin = K + log(1.0f / fabs(m_vt.m_max.t.z)) * f; - float LODmax = K + log(1.0f / fabs(m_vt.m_min.t.z)) * f; - - return LODmax <= 0 ? mmag : LODmin > 0 ? mmin : mmag || mmin; - } - else - { - return TEX1.K <= 0 ? mmag : TEX1.K > 0 ? mmin : mmag || mmin; - } -} - bool GSRenderer::IsOpaque() { if(PRIM->AA1) diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h index d7659928c9..975938868b 100644 --- a/plugins/GSdx/GSRenderer.h +++ b/plugins/GSdx/GSRenderer.h @@ -43,6 +43,7 @@ protected: int m_filter; bool m_vsync; bool m_aa1; + bool m_mipmap; bool m_framelimit; virtual GSTexture* GetOutput(int i) = 0; @@ -51,10 +52,9 @@ protected: // following functions need m_vt to be initialized - void GetTextureMinMax(GSVector4i& r, bool linear); + void GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear); void GetAlphaMinMax(); bool TryAlphaTest(uint32& fm, uint32& zm); - bool IsLinear(); bool IsOpaque(); public: diff --git a/plugins/GSdx/GSRendererDX.h b/plugins/GSdx/GSRendererDX.h index fc794cc3f4..9534570ae3 100644 --- a/plugins/GSdx/GSRendererDX.h +++ b/plugins/GSdx/GSRendererDX.h @@ -296,7 +296,7 @@ public: ps_sel.aem = env.TEXA.AEM; ps_sel.tfx = context->TEX0.TFX; ps_sel.tcc = context->TEX0.TCC; - ps_sel.ltf = m_filter == 2 ? IsLinear() : m_filter; + ps_sel.ltf = m_filter == 2 ? m_vt.IsLinear() : m_filter; ps_sel.rt = tex->m_target; int w = tex->m_texture->GetWidth(); diff --git a/plugins/GSdx/GSRendererHW.h b/plugins/GSdx/GSRendererHW.h index 561ac01734..4bb4a5766d 100644 --- a/plugins/GSdx/GSRendererHW.h +++ b/plugins/GSdx/GSRendererHW.h @@ -566,7 +566,7 @@ protected: GSVector4i r; - GetTextureMinMax(r, IsLinear()); + GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear()); tex = m_tc->LookupSource(context->TEX0, env.TEXA, r); diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 9b1c39a6d8..cb07b8600a 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -102,7 +102,7 @@ GSTexture* GSRendererSW::GetOutput(int i) const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; - (m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign(psm.bs), m_output, pitch, m_env.TEXA); + (m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign(psm.bs), m_output, pitch, m_env.TEXA); m_texture[i]->Update(r, m_output, pitch); @@ -136,7 +136,7 @@ void GSRendererSW::Draw() return; } - if(s_dump) + if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0) { uint64 frame = m_perfmon.GetFrame(); @@ -204,7 +204,7 @@ void GSRendererSW::Draw() m_perfmon.Put(GSPerfMon::Prim, stats.prims); m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels); - if(s_dump) + if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0) { uint64 frame = m_perfmon.GetFrame(); @@ -324,7 +324,7 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) gd.sel.tfx = context->TEX0.TFX; gd.sel.tcc = context->TEX0.TCC; gd.sel.fst = PRIM->FST; - gd.sel.ltf = IsLinear(); + gd.sel.ltf = m_vt.IsLinear(); gd.sel.tlu = GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0; gd.sel.wms = context->CLAMP.WMS; gd.sel.wmt = context->CLAMP.WMT; @@ -370,6 +370,134 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) } } + GIFRegTEX0 MIP_TEX0 = context->TEX0; + GIFRegCLAMP MIP_CLAMP = context->CLAMP; + + if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0) + { + int level = (int)(m_vt.m_lod.x + 0.5f); + + // FIXME: onimusa 3 + + level = std::min(level, context->TEX1.MXL); + level = std::min(level, 6); + + if(level > 0) + { + // printf("lvl %d\n", level); + + switch(level) + { + case 1: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP1; + MIP_TEX0.TBW = context->MIPTBP1.TBW1; + break; + case 2: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP2; + MIP_TEX0.TBW = context->MIPTBP1.TBW2; + break; + case 3: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP3; + MIP_TEX0.TBW = context->MIPTBP1.TBW3; + break; + case 4: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP4; + MIP_TEX0.TBW = context->MIPTBP2.TBW4; + break; + case 5: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP5; + MIP_TEX0.TBW = context->MIPTBP2.TBW5; + break; + case 6: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP6; + MIP_TEX0.TBW = context->MIPTBP2.TBW6; + break; + default: + __assume(0); + } + + ASSERT(MIP_TEX0.TBP0 != 0 && MIP_TEX0.TBW != 0); + + int tw = (int)MIP_TEX0.TW - level; + int th = (int)MIP_TEX0.TH - level; + + switch(context->TEX1.MMIN) + { + case 2: case 3: // point (min size 1) + tw = std::max(tw, 0); + th = std::max(th, 0); + break; + case 4: case 5: // linear (min size 8) + tw = std::max(tw, 3); + th = std::max(th, 3); + break; + default: + __assume(0); + } + + // scale down the texture coordinates, including vertex trace + + GSVector4 scale = GSVector4(1.0f) / GSVector4(1 << ((int)MIP_TEX0.TW - tw), 1 << ((int)MIP_TEX0.TH - th), 1, 1); + + GSVertexSW* v = m_vertices; + + for(int i = 0, j = m_count; i < j; i++) + { + v[i].t *= scale; + } + + m_vt.m_min.t *= scale; + m_vt.m_max.t *= scale; + + MIP_TEX0.TW = (uint32)tw; + MIP_TEX0.TH = (uint32)th; + + // this shift is done even for repeat modes + + MIP_CLAMP.MINU >>= level; + MIP_CLAMP.MAXU >>= level; + MIP_CLAMP.MINV >>= level; + MIP_CLAMP.MAXV >>= level; +/* + printf("%d%d%d%d%d L %d K %03x %.2f lod %.2f %.2f q %f %f\n", + m_context->TEX1.MXL, + m_context->TEX1.MMAG, + m_context->TEX1.MMIN, + PRIM->FST, + m_context->TEX1.LCM, + m_context->TEX1.L, + m_context->TEX1.K, + (float)m_context->TEX1.K / 16, + m_context->TEX1.MXL > 0 ? m_vt.m_lod.x : 0, + m_context->TEX1.MXL > 0 ? m_vt.m_lod.y : 0, + 1.0f / m_vt.m_min.t.z, + 1.0f / m_vt.m_max.t.z); +*/ + if(s_dump) + { + uint64 frame = m_perfmon.GetFrame(); + + string s; + + if(s_save && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d_(%d%d%d%d%d %.2f %.2f).bmp", + s_n, frame, (int)MIP_TEX0.TBP0, (int)MIP_TEX0.PSM, + m_context->TEX1.MXL, + m_context->TEX1.MMAG, + m_vt.m_filter.mmag, + m_context->TEX1.MMIN, + m_vt.m_filter.mmin, + m_context->TEX1.MXL > 0 ? m_vt.m_lod.x : 0, + m_context->TEX1.MXL > 0 ? m_vt.m_lod.y : 0 + ); + + m_mem.SaveBMP(s, MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM, 1 << MIP_TEX0.TW, 1 << MIP_TEX0.TH); + } + } + } + } + if(gd.sel.ltf) { if(gd.sel.fst) @@ -389,9 +517,9 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) GSVector4i r; - GetTextureMinMax(r, gd.sel.ltf); + GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, gd.sel.ltf); - const GSTextureCacheSW::GSTexture* t = m_tc->Lookup(context->TEX0, env.TEXA, r); + const GSTextureCacheSW::GSTexture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, r); if(!t) {ASSERT(0); return;} @@ -400,10 +528,10 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) gd.sel.tw = t->m_tw - 3; - uint16 tw = (uint16)(1 << context->TEX0.TW); - uint16 th = (uint16)(1 << context->TEX0.TH); + uint16 tw = (uint16)(1 << MIP_TEX0.TW); + uint16 th = (uint16)(1 << MIP_TEX0.TH); - switch(context->CLAMP.WMS) + switch(MIP_CLAMP.WMS) { case CLAMP_REPEAT: gd.t.min.u16[0] = tw - 1; @@ -416,20 +544,20 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) gd.t.mask.u32[0] = 0; break; case CLAMP_REGION_CLAMP: - gd.t.min.u16[0] = std::min(context->CLAMP.MINU, tw - 1); - gd.t.max.u16[0] = std::min(context->CLAMP.MAXU, tw - 1); + gd.t.min.u16[0] = std::min(MIP_CLAMP.MINU, tw - 1); + gd.t.max.u16[0] = std::min(MIP_CLAMP.MAXU, tw - 1); gd.t.mask.u32[0] = 0; break; case CLAMP_REGION_REPEAT: - gd.t.min.u16[0] = context->CLAMP.MINU; - gd.t.max.u16[0] = context->CLAMP.MAXU; + gd.t.min.u16[0] = MIP_CLAMP.MINU; + gd.t.max.u16[0] = MIP_CLAMP.MAXU; gd.t.mask.u32[0] = 0xffffffff; break; default: __assume(0); } - switch(context->CLAMP.WMT) + switch(MIP_CLAMP.WMT) { case CLAMP_REPEAT: gd.t.min.u16[4] = th - 1; @@ -442,13 +570,13 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) gd.t.mask.u32[2] = 0; break; case CLAMP_REGION_CLAMP: - gd.t.min.u16[4] = std::min(context->CLAMP.MINV, th - 1); - gd.t.max.u16[4] = std::min(context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) + gd.t.min.u16[4] = std::min(MIP_CLAMP.MINV, th - 1); + gd.t.max.u16[4] = std::min(MIP_CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) gd.t.mask.u32[2] = 0; break; case CLAMP_REGION_REPEAT: - gd.t.min.u16[4] = context->CLAMP.MINV; - gd.t.max.u16[4] = context->CLAMP.MAXV; + gd.t.min.u16[4] = MIP_CLAMP.MINV; + gd.t.max.u16[4] = MIP_CLAMP.MAXV; gd.t.mask.u32[2] = 0xffffffff; break; default: diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index ab8ac4d87e..019a228cd4 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -587,6 +587,47 @@ template void GSState::GIFRegHandlerTEX0(const GIFReg* r) if(TEX0.TH > 10) TEX0.TH = 10; ApplyTEX0(i, TEX0); + + if(m_env.CTXT[i].TEX1.MTBA) + { + uint32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp; + + uint32 tbp = TEX0.TBP0; + uint32 tbw = TEX0.TBW; + uint32 th = TEX0.TH; + + if(th >= 3) + { + tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; + tbw = std::max(tbw >> 1, 1); + th--; + + m_env.CTXT[i].MIPTBP1.TBP1 = tbp; + m_env.CTXT[i].MIPTBP1.TBW1 = tbw; + + tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; + tbw = std::max(tbw >> 1, 1); + th--; + + m_env.CTXT[i].MIPTBP1.TBP2 = tbp; + m_env.CTXT[i].MIPTBP1.TBW2 = tbw; + + tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; + tbw = std::max(tbw >> 1, 1); + th--; + + m_env.CTXT[i].MIPTBP1.TBP3 = tbp; + m_env.CTXT[i].MIPTBP1.TBW3 = tbw; + + // NOTE: TEX1.MXL must not be automatically set to 3 here + } + else + { + ASSERT(0); + } + + // printf("MTBA\n"); + } } template void GSState::GIFRegHandlerCLAMP(const GIFReg* r) diff --git a/plugins/GSdx/GSTextureCache.cpp b/plugins/GSdx/GSTextureCache.cpp index 84b1c43ec1..209de07871 100644 --- a/plugins/GSdx/GSTextureCache.cpp +++ b/plugins/GSdx/GSTextureCache.cpp @@ -292,7 +292,7 @@ void GSTextureCache::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rec GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs; - GSVector4i r = rect.ralign(bs); + GSVector4i r = rect.ralign(bs); if(!target) { @@ -881,7 +881,7 @@ void GSTextureCache::Source::Update(const GIFRegTEX0& TEX0, const GIFRegTEXA& TE int tw = std::max(1 << m_TEX0.TW, bs.x); int th = std::max(1 << m_TEX0.TH, bs.y); - GSVector4i r = rect.ralign(bs); + GSVector4i r = rect.ralign(bs); if(r.eq(GSVector4i(0, 0, tw, th))) { diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp index 52af9e756e..19cf4dbaf0 100644 --- a/plugins/GSdx/GSTextureCacheSW.cpp +++ b/plugins/GSdx/GSTextureCacheSW.cpp @@ -66,12 +66,12 @@ const GSTextureCacheSW::GSTexture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TE if(t == NULL) { - t = new GSTexture(m_state); + const GSOffset* o = m_state->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); + + t = new GSTexture(m_state, o); m_textures.insert(t); - const GSOffset* o = m_state->m_context->offset.tex; - GSVector2i bs = (TEX0.TBP0 & 31) == 0 ? psm.pgs : psm.bs; int tw = 1 << TEX0.TW; @@ -132,7 +132,7 @@ void GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs; - GSVector4i r = rect.ralign(bs); + GSVector4i r = rect.ralign(bs); for(int y = r.top; y < r.bottom; y += bs.y) { @@ -209,8 +209,9 @@ void GSTextureCacheSW::IncAge() // -GSTextureCacheSW::GSTexture::GSTexture(GSState* state) +GSTextureCacheSW::GSTexture::GSTexture(GSState* state, const GSOffset* offset) : m_state(state) + , m_offset(offset) , m_buff(NULL) , m_tw(0) , m_age(0) @@ -241,10 +242,25 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX GSVector2i bs = psm.bs; + int shift = psm.pal == 0 ? 2 : 0; + int tw = std::max(1 << TEX0.TW, bs.x); int th = std::max(1 << TEX0.TH, bs.y); - GSVector4i r = rect.ralign(bs); + GSVector4i r = rect; + + bool repeating = m_TEX0.IsRepeating(); + + if(m_TEX0.TBW == 1) // repeating) + { + // FIXME: + // - marking a block prevents fetching it again to a different part of the texture + // - only a real issue for TBW = 1 mipmap levels, where the repeating part is below and often exploited + + r = GSVector4i(0, 0, tw, th); + } + + r = r.ralign(bs); if(r.eq(GSVector4i(0, 0, tw, th))) { @@ -260,20 +276,20 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX return false; } - m_tw = std::max(TEX0.TW, psm.pal > 0 ? 5 : 3); // makes one row 32 bytes at least, matches the smallest block size that is allocated above for m_buff +#ifdef DEBUG + for(uint32 i = 0, j = tw * th * sizeof(uint8); i < j; i++) ((uint8*)m_buff)[i] = 0xff; +#endif + + m_tw = std::max(TEX0.TW, 5 - shift); // makes one row 32 bytes at least, matches the smallest block size that is allocated above for m_buff } GSLocalMemory& mem = m_state->m_mem; - const GSOffset* o = m_state->m_context->offset.tex; - - bool repeating = m_TEX0.IsRepeating(); + const GSOffset* RESTRICT o = m_offset; uint32 blocks = 0; - GSLocalMemory::readTextureBlock rtxb = psm.rtxbP; - - int shift = psm.pal == 0 ? 2 : 0; + GSLocalMemory::readTextureBlock rtxbP = psm.rtxbP; uint32 pitch = (1 << m_tw) << shift; @@ -299,7 +315,7 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX m_valid[row] |= col; } - (mem.*rtxb)(block, &dst[x << shift], pitch, TEXA); + (mem.*rtxbP)(block, &dst[x << shift], pitch, TEXA); blocks++; } diff --git a/plugins/GSdx/GSTextureCacheSW.h b/plugins/GSdx/GSTextureCacheSW.h index fd2a1c031f..3ab4bc9760 100644 --- a/plugins/GSdx/GSTextureCacheSW.h +++ b/plugins/GSdx/GSTextureCacheSW.h @@ -30,6 +30,7 @@ public: { public: GSState* m_state; + const GSOffset* m_offset; GIFRegTEX0 m_TEX0; GIFRegTEXA m_TEXA; void* m_buff; @@ -38,7 +39,7 @@ public: uint32 m_age; bool m_complete; - explicit GSTexture(GSState* state); + explicit GSTexture(GSState* state, const GSOffset* offset); virtual ~GSTexture(); bool Update(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r); diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 5d94535ccc..f5bc6af07b 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -3,7 +3,21 @@ #pragma once -// NOTE: x64 version of the _mm_set_* functions are terrible, first they store components into memory then reload in one piece (VS2008 SP1) +enum Align_Mode +{ + Align_Outside, + Align_Inside, + Align_NegInf, + Align_PosInf +}; + +enum Round_Mode +{ + Round_NearestInt = 8, + Round_NegInf = 9, + Round_PosInf = 10, + Round_Truncate = 11 +}; #pragma pack(push, 1) @@ -196,8 +210,6 @@ public: return sat_i32(a); } - enum RoundMode {Outside, Inside, NegInf, PosInf}; - template __forceinline GSVector4i ralign(const GSVector2i& a) const { // a must be 1 << n @@ -208,10 +220,10 @@ public: switch(mode) { - case Inside: v = *this + mask; break; - case Outside: v = *this + mask.zwxy(); break; - case NegInf: v = *this; break; - case PosInf: v = *this + mask.zwzw(); break; + case Align_Inside: v = *this + mask; break; + case Align_Outside: v = *this + mask.zwxy(); break; + case Align_NegInf: v = *this; break; + case Align_PosInf: v = *this + mask.zwzw(); break; default: ASSERT(0); break; } @@ -1029,15 +1041,19 @@ public: __forceinline bool alltrue() const { - return _mm_movemask_epi8(m) == 0xffff; + return mask() == 0xffff; } __forceinline bool allfalse() const { #if _M_SSE >= 0x401 + return _mm_testz_si128(m, m) != 0; + #else - return _mm_movemask_epi8(m) == 0; + + return mask() == 0; + #endif } @@ -1053,9 +1069,13 @@ public: template __forceinline int extract8() const { #if _M_SSE >= 0x401 + return _mm_extract_epi8(m, i); + #else + return (int)u8[i]; + #endif } @@ -1081,10 +1101,15 @@ public: template __forceinline int extract32() const { if(i == 0) return GSVector4i::store(*this); + #if _M_SSE >= 0x401 + return _mm_extract_epi32(m, i); + #else + return i32[i]; + #endif } @@ -1102,10 +1127,15 @@ public: template __forceinline int64 extract64() const { if(i == 0) return GSVector4i::storeq(*this); + #if _M_SSE >= 0x401 + return _mm_extract_epi64(m, i); + #else + return i64[i]; + #endif } @@ -2388,8 +2418,6 @@ public: return (v + v) - (v * v) * *this; } - enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10, Truncate = 11}; - template __forceinline GSVector4 round() const { #if _M_SSE >= 0x401 @@ -2404,17 +2432,17 @@ public: b = a + b - b; - if((mode & 7) == (NegInf & 7)) + if((mode & 7) == (Round_NegInf & 7)) { return b - ((a < b) & m_x3f800000); } - if((mode & 7) == (PosInf & 7)) + if((mode & 7) == (Round_PosInf & 7)) { return b + ((a > b) & m_x3f800000); } - ASSERT((mode & 7) == (NearestInt & 7)); // other modes aren't implemented + ASSERT((mode & 7) == (Round_NearestInt & 7)); // other modes aren't implemented return b; @@ -2423,12 +2451,62 @@ public: __forceinline GSVector4 floor() const { - return round(); + return round(); } __forceinline GSVector4 ceil() const { - return round(); + return round(); + } + + // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html + + #define LOG_POLY0(x, c0) GSVector4(c0) + #define LOG_POLY1(x, c0, c1) ((LOG_POLY0(x, c1) * x) + GSVector4(c0)) + #define LOG_POLY2(x, c0, c1, c2) ((LOG_POLY1(x, c1, c2) * x) + GSVector4(c0)) + #define LOG_POLY3(x, c0, c1, c2, c3) ((LOG_POLY2(x, c1, c2, c3) * x) + GSVector4(c0)) + #define LOG_POLY4(x, c0, c1, c2, c3, c4) ((LOG_POLY3(x, c1, c2, c3, c4) * x) + GSVector4(c0)) + #define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) ((LOG_POLY4(x, c1, c2, c3, c4, c5) * x) + GSVector4(c0)) + + __forceinline GSVector4 log2(int precision = 5) const + { + // NOTE: sign bit ignored, safe to pass negative numbers + + GSVector4i exp = GSVector4i::xff000000() >> 1; + GSVector4i mant = GSVector4i::x007fffff(); + GSVector4 one(1.0f); + + GSVector4i i = GSVector4i::cast(*this); + + GSVector4 e = GSVector4(((i & exp) >> 23) - GSVector4i::x0000007f()); + GSVector4 m = GSVector4::cast(i & mant) | one; + + GSVector4 p; + + // Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ + + switch(precision) + { + case 3: + p = LOG_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + break; + case 4: + p = LOG_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + break; + default: + case 5: + p = LOG_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + break; + case 6: + p = LOG_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + break; + } + + // This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 + + p = p * (m - one); + + return p + e; } __forceinline GSVector4 mod2x(const GSVector4& f, const int scale = 256) const @@ -2528,7 +2606,7 @@ public: return GSVector4(_mm_max_ps(m, a)); } - __forceinline GSVector4 blend8(const GSVector4& a, const GSVector4& mask) const + __forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const { #if _M_SSE >= 0x401 @@ -2573,16 +2651,19 @@ public: __forceinline bool alltrue() const { - return _mm_movemask_ps(m) == 0xf; + return mask() == 0xf; } __forceinline bool allfalse() const { #if _M_SSE >= 0x401 - __m128i a = _mm_castps_si128(m); - return _mm_testz_si128(a, a) != 0; + + return _mm_testz_ps(m, m) != 0; + #else - return _mm_movemask_ps(m) == 0; + + return mask() == 0; + #endif } @@ -2591,9 +2672,13 @@ public: template __forceinline int extract() const { #if _M_SSE >= 0x401 + return _mm_extract_ps(m, i); + #else + return i32[i]; + #endif } @@ -2861,8 +2946,6 @@ public: VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ #define VECTOR4_SHUFFLE_1(xs, xn) \ - __forceinline GSVector4 xs##4() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ - __forceinline GSVector4 xs##4(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ @@ -2894,8 +2977,6 @@ __forceinline GSVector4 GSVector4::cast(const GSVector4i& v) return GSVector4(_mm_castsi128_ps(v.m)); } -#if _M_SSE >= 0x500 - class GSVector8; __aligned(class, 32) GSVector8i @@ -2915,16 +2996,25 @@ public: uint16 u16[16]; uint32 u32[8]; uint64 u64[4]; + #if _M_SSE >= 0x500 __m256i m; + __m128i m0, m1; + #else + __m128i m[2]; + #endif }; - __forceinline GSVector8i() - { - } + __forceinline GSVector8i() {} + + __forceinline explicit GSVector8i(const GSVector8& v); + + static GSVector8i cast(const GSVector8& v); + + #if _M_SSE >= 0x500 __forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) { - m = _mm256_set_epi32(w0, z0, y0, x0, w0, z0, y0, x0); + m = _mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0); } __forceinline GSVector8i(__m128i m0, __m128i m1) @@ -2952,8 +3042,6 @@ public: this->m = m; } - __forceinline explicit GSVector8i(const GSVector8& v); - __forceinline void operator = (const GSVector8i& v) { m = v.m; @@ -2979,8 +3067,6 @@ public: return m; } - static GSVector8i cast(const GSVector8& v); - // TODO template __forceinline GSVector4i extract() const @@ -3002,7 +3088,7 @@ public: template __forceinline static GSVector8i load(const void* p) { - return GSVector8i(aligned ? _mm_load256i_si256((__m256i*)p) : _mm256i_loadu_si256((__m128i*)p)); + return GSVector8i(aligned ? _mm_load256i_si256((__m256i*)p) : _mm256i_loadu_si256((__m256i*)p)); } template __forceinline static void store(void* p, const GSVector4i& v) @@ -3010,6 +3096,103 @@ public: if(aligned) _mm256i_store_si256((__m256i*)p, v.m); else _mm256i_storeu_si256((__m256i*)p, v.m); } + + #else + + __forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) + { + m[0] = _mm_set_epi32(w0, z0, y0, x0); + m[1] = _mm_set_epi32(w1, z1, y1, x1); + } + + __forceinline GSVector8i(__m128i m0, __m128i m1) + { + m[0] = m0; + m[1] = m1; + } + + __forceinline GSVector8i(const GSVector8i& v) + { + m[0] = v.m[0]; + m[1] = v.m[1]; + } + + __forceinline explicit GSVector8i(int i) + { + m[0] = m[1] = _mm_set1_epi32(i); + } + + __forceinline explicit GSVector8i(__m128i m) + { + this->m[0] = this->m[1] = m; + } + + __forceinline void operator = (const GSVector8i& v) + { + m[0] = v.m[0]; + m[1] = v.m[1]; + } + + __forceinline void operator = (int i) + { + m[0] = m[1] = _mm_set1_epi32(i); + } + + __forceinline void operator = (__m128i m) + { + this->m[0] = this->m[1] = m; + } + + // TODO + + template __forceinline GSVector4i extract() const + { + return GSVector4i(m[i]); + } + + template __forceinline GSVector8i insert(__m128i m) const + { + GSVector8i v = *this; + + v.m[i] = m; + + return v; + } + + __forceinline static GSVector8i zero() + { + GSVector8i v; + + v.m[0] = v.m[1] = _mm_setzero_si128(); + + return v; + } + + // TODO + + template __forceinline static GSVector8i load(const void* p) + { + return GSVector8i( + aligned ? _mm_load_si128((__m128i*)p + 0) : _mm_loadu_si128((__m128i*)p + 0), + aligned ? _mm_load_si128((__m128i*)p + 1) : _mm_loadu_si128((__m128i*)p + 1), + ); + } + + template __forceinline static void store(void* p, const GSVector4i& v) + { + if(aligned) + { + _mm_store_si128((__m128i*)p + 0, v.m[0]); + _mm_store_si128((__m128i*)p + 1, v.m[1]); + } + else + { + _mm_storeu_si128((__m128i*)p + 0, v.m[0]); + _mm_storeu_si128((__m128i*)p + 1, v.m[1]); + } + } + + #endif }; __aligned(class, 32) GSVector8 @@ -3029,25 +3212,97 @@ public: uint16 u16[16]; uint32 u32[8]; uint64 u64[4]; + #if _M_SSE >= 0x500 __m256 m; - - // TODO: _M_SSE < 0x500 => union {__m128 m0, m1;}; and replace each function with a pair of 128 bit intructions + __m128 m0, m1; + #else + __m128 m[2]; + #endif }; - __forceinline GSVector8() + __forceinline GSVector8() {} + + __forceinline explicit GSVector8(const GSVector8i& v); + + __forceinline static GSVector8 cast(const GSVector8i& v); + + __forceinline GSVector8 rcpnr() const { + GSVector8 v = rcp(); + + return (v + v) - (v * v) * *this; } + __forceinline GSVector8 floor() const + { + return round(); + } + + __forceinline GSVector8 ceil() const + { + return round(); + } + + __forceinline GSVector8 operator - () const + { + return neg(); + } + + __forceinline void operator += (float f) + { + *this += GSVector8(f); + } + + __forceinline void operator -= (float f) + { + *this -= GSVector8(f); + } + + __forceinline void operator *= (float f) + { + *this *= GSVector8(f); + } + + __forceinline void operator /= (float f) + { + *this /= GSVector8(f); + } + + __forceinline friend GSVector8 operator + (const GSVector8& v, float f) + { + return v + GSVector8(f); + } + + __forceinline friend GSVector8 operator - (const GSVector8& v, float f) + { + return v - GSVector8(f); + } + + __forceinline friend GSVector8 operator * (const GSVector8& v, float f) + { + return v * GSVector8(f); + } + + __forceinline friend GSVector8 operator / (const GSVector8& v, float f) + { + return v / GSVector8(f); + } + + __forceinline static GSVector8 xffffffff() + { + return zero() == zero(); + } + + #if _M_SSE >= 0x500 + __forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1) { - m = _mm256_set_ps(w0, z0, y0, x0, w0, z0, y0, x0); + m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0); } __forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) { - GSVector8i v(x0, y0, z0, w0, x1, y1, z1, w1); - - m = _mm256_cvtepi32_ps(v); + m = _mm256_cvtepi32_ps(_mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0)); } __forceinline GSVector8(__m128 m0, __m128 m1) @@ -3062,7 +3317,7 @@ public: __forceinline explicit GSVector8(float f) { - m = _mm256_set1_ps(f); // _mm256_broadcast_ss(&f); ? + m = _mm256_set1_ps(f); } __forceinline explicit GSVector8(__m128 m) @@ -3076,8 +3331,6 @@ public: this->m = m; } - __forceinline explicit GSVector8(const GSVector8i& v); - __forceinline void operator = (const GSVector8& v) { m = v.m; @@ -3104,8 +3357,6 @@ public: return m; } - __forceinline static GSVector8 cast(const GSVector8i& v); - __forceinline GSVector8 abs() const { return *this & cast(GSVector8i(GSVector4i::x7fffffff())); // TODO: add GSVector8 consts @@ -3121,30 +3372,11 @@ public: return GSVector8(_mm256_rcp_ps(m)); } - __forceinline GSVector8 rcpnr() const - { - GSVector8 v = rcp(); - - return (v + v) - (v * v) * *this; - } - - enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10, Truncate = 11}; - template __forceinline GSVector8 round() const { return GSVector8(_mm256_round_ps(m, mode)); } - __forceinline GSVector8 floor() const - { - return round(); - } - - __forceinline GSVector8 ceil() const - { - return round(); - } - // TODO __forceinline GSVector8 min(const GSVector8& a) const @@ -3157,7 +3389,12 @@ public: return GSVector8(_mm256_max_ps(m, a)); } - __forceinline GSVector8 blend8(const GSVector8& a, const GSVector8& mask) const + template __forceinline GSVector8 blend32(const GSVector8& a) const + { + return GSVector8(_mm256_blend_ps(m, a, mask)); + } + + __forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask) const { return GSVector8(_mm256_blendv_ps(m, a, mask)); } @@ -3186,12 +3423,12 @@ public: __forceinline GSVector8 l2h() const { - return insert<1>(extract<0>()); + return GSVector8(_mm256_shuffle_ps(m, m, 0x88)); } __forceinline GSVector8 h2l() const { - return insert<0>(extract<1>()); + return GSVector8(_mm256_shuffle_ps(m, m, 0x22)); } __forceinline GSVector8 andnot(const GSVector8& v) const @@ -3206,12 +3443,12 @@ public: __forceinline bool alltrue() const { - return _mm256_movemask_ps(m) == 0xff; + return mask() == 0xff; } __forceinline bool allfalse() const { - return _mm256_movemask_ps(m) == 0; + return _mm256_testz_ps(m, m) != 0; } template __forceinline GSVector4 extract() const @@ -3239,11 +3476,6 @@ public: _mm256_zeroall(); } - __forceinline static GSVector8 xffffffff() - { - return zero() == zero(); - } - // TODO: load low, ss template __forceinline static GSVector8 load(const void* p) @@ -3261,11 +3493,6 @@ public: // TODO - __forceinline GSVector8 operator - () const - { - return neg(); - } - __forceinline void operator += (const GSVector8& v) { m = _mm256_add_ps(m, v); @@ -3286,26 +3513,6 @@ public: m = _mm256_div_ps(m, v); } - __forceinline void operator += (float f) - { - *this += GSVector8(f); - } - - __forceinline void operator -= (float f) - { - *this -= GSVector8(f); - } - - __forceinline void operator *= (float f) - { - *this *= GSVector8(f); - } - - __forceinline void operator /= (float f) - { - *this /= GSVector8(f); - } - __forceinline void operator &= (const GSVector8& v) { m = _mm256_and_ps(m, v); @@ -3341,26 +3548,6 @@ public: return GSVector8(_mm256_div_ps(v1, v2)); } - __forceinline friend GSVector8 operator + (const GSVector8& v, float f) - { - return v + GSVector8(f); - } - - __forceinline friend GSVector8 operator - (const GSVector8& v, float f) - { - return v - GSVector8(f); - } - - __forceinline friend GSVector8 operator * (const GSVector8& v, float f) - { - return v * GSVector8(f); - } - - __forceinline friend GSVector8 operator / (const GSVector8& v, float f) - { - return v / GSVector8(f); - } - __forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2) { return GSVector8(_mm256_and_ps(v1, v2)); @@ -3410,6 +3597,361 @@ public: __forceinline GSVector8 xs##ys() const {return GSVector8(_mm256_permute2f128_ps(m, m, xn | (yn << 4)));} \ __forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, xn | (yn << 4)));} \ + #define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + __forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + __forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + + #else + + __forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1) + { + m[0] = _mm_set_ps(w0, z0, y0, x0); + m[1] = _mm_set_ps(w1, z1, y1, x1); + } + + __forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1) + { + m[0] = _mm_cvtepi32_ps(_mm_set_epi32(w0, z0, y0, x0)); + m[1] = _mm_cvtepi32_ps(_mm_set_epi32(w1, z1, y1, x1)); + } + + __forceinline GSVector8(__m128 m0, __m128 m1) + { + m[0] = m0; + m[1] = m1; + } + + __forceinline GSVector8(const GSVector8& v) + { + m[0] = v.m[0]; + m[1] = v.m[1]; + } + + __forceinline explicit GSVector8(float f) + { + m[0] = m[1] = _mm_set1_ps(f); + } + + __forceinline explicit GSVector8(__m128 m) + { + this->m[0] = this->m[1] = m; + } + + __forceinline void operator = (const GSVector8& v) + { + m[0] = v.m[0]; + m[1] = v.m[1]; + } + + __forceinline void operator = (float f) + { + m[0] = m[1] = _mm_set1_ps(f); + } + + __forceinline void operator = (__m128 m) + { + this->m[0] = this->m[1] = m; + } + + __forceinline GSVector8 abs() const + { + GSVector4 mask = GSVector4::cast(GSVector4i::x7fffffff()); + + return GSVector8(_mm_and_ps(m[0], mask), _mm_and_ps(m[1], mask)); + } + + __forceinline GSVector8 neg() const + { + GSVector4 mask = GSVector4::cast(GSVector4i::x80000000()); + + return GSVector8(_mm_xor_ps(m[0], mask), _mm_xor_ps(m[1], mask)); + } + + __forceinline GSVector8 rcp() const + { + return GSVector8(_mm_rcp_ps(m[0]), _mm_rcp_ps(m[1])); + } + + template __forceinline GSVector8 round() const + { + return GSVector8(_mm_round_ps(m[0], mode), _mm_round_ps(m[1], mode)); + } + + // TODO + + __forceinline GSVector8 min(const GSVector8& a) const + { + return GSVector8(_mm_min_ps(m[0], a.m[0]), _mm_min_ps(m[1], a.m[1])); + } + + __forceinline GSVector8 max(const GSVector8& a) const + { + return GSVector8(_mm_max_ps(m[0], a.m[0]), _mm_max_ps(m[1], a.m[1])); + } + + #if _M_SSE >= 0x401 + + template __forceinline GSVector8 blend32(const GSVector8& a) const + { + return GSVector8(_mm_blend_ps(m[0], a.m[0], mask & 0x0f), _mm_blend_ps(m[1], a.m[1], (mask >> 4) & 0x0f)); + } + + #endif + + __forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask) const + { + return GSVector8(_mm_blendv_ps(m[0], a.m[0], mask.m[0]), _mm_blendv_ps(m[1], a.m[1], mask.m[1])); + } + + __forceinline GSVector8 upl(const GSVector8& a) const + { + return GSVector8(_mm_unpacklo_ps(m[0], a.m[0]), _mm_unpacklo_ps(m[1], a.m[1])); + } + + __forceinline GSVector8 uph(const GSVector8& a) const + { + return GSVector8(_mm_unpackhi_ps(m[0], a.m[0]), _mm_unpackhi_ps(m[1], a.m[1])); + } + + __forceinline GSVector8 upl64(const GSVector8& a) const + { + return GSVector8( + _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m[0]), _mm_castps_pd(a.m[0]))), + _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m[1]), _mm_castps_pd(a.m[1]))) + ); + } + + __forceinline GSVector8 uph64(const GSVector8& a) const + { + return GSVector8( + _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m[0]), _mm_castps_pd(a.m[0]))), + _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m[1]), _mm_castps_pd(a.m[1]))) + ); + } + + // TODO + + __forceinline GSVector8 l2h() const + { + return GSVector8(_mm_movelh_ps(m[0], m[0]), _mm_movelh_ps(m[1], m[1])); + } + + __forceinline GSVector8 h2l() const + { + return GSVector8(_mm_movehl_ps(m[0], m[0]), _mm_movehl_ps(m[1], m[1])); + } + + __forceinline GSVector8 andnot(const GSVector8& v) const + { + return GSVector8(_mm_andnot_ps(v.m[0], m[0]), _mm_andnot_ps(v.m[1], m[1])); + } + + __forceinline int mask() const + { + return _mm_movemask_ps(m[0]) | (_mm_movemask_ps(m[1]) << 4); + } + + __forceinline bool alltrue() const + { + return mask() == 0xff; + } + + __forceinline bool allfalse() const + { + #if _M_SSE >= 0x401 + + return (_mm_testz_ps(m[0], m[0]) & _mm_testz_ps(m[1], m[1])) != 0; + + #else + + return mask() == 0; + + #endif + } + + template __forceinline GSVector4 extract() const + { + return GSVector4(m[i]); + } + + template __forceinline GSVector8 insert(__m128 m) const + { + return GSVector8(i == 0 ? m : this->m[0], i == 1 ? m : this->m[1]); + } + + __forceinline static GSVector8 zero() + { + return GSVector8(_mm_setzero_ps(), _mm_setzero_ps()); + } + + __forceinline static void zeroupper() + { + // N/A + } + + __forceinline static void zeroall() + { + // N/A + } + + // TODO: load low, ss + + template __forceinline static GSVector8 load(const void* p) + { + return GSVector8( + aligned ? _mm_load_ps((const float*)p + 0) : _mm_loadu_ps((const float*)p + 0), + aligned ? _mm_load_ps((const float*)p + 4) : _mm_loadu_ps((const float*)p + 4), + ); + } + + // TODO: store low, ss + + template __forceinline static void store(void* p, const GSVector8& v) + { + if(aligned) + { + _mm_store_ps((float*)p + 0, v.m[0]); + _mm_store_ps((float*)p + 4, v.m[1]); + } + else + { + _mm_storeu_ps((float*)p + 0, v.m[0]); + _mm_storeu_ps((float*)p + 4, v.m[1]); + } + } + + // TODO + + __forceinline void operator += (const GSVector8& v) + { + m[0] = _mm_add_ps(m[0], v.m[0]); + m[1] = _mm_add_ps(m[1], v.m[1]); + } + + __forceinline void operator -= (const GSVector8& v) + { + m[0] = _mm_sub_ps(m[0], v.m[0]); + m[1] = _mm_sub_ps(m[1], v.m[1]); + } + + __forceinline void operator *= (const GSVector8& v) + { + m[0] = _mm_mul_ps(m[0], v.m[0]); + m[1] = _mm_mul_ps(m[1], v.m[1]); + } + + __forceinline void operator /= (const GSVector8& v) + { + m[0] = _mm_div_ps(m[0], v.m[0]); + m[1] = _mm_div_ps(m[1], v.m[1]); + } + + __forceinline void operator &= (const GSVector8& v) + { + m[0] = _mm_and_ps(m[0], v.m[0]); + m[1] = _mm_and_ps(m[1], v.m[1]); + } + + __forceinline void operator |= (const GSVector8& v) + { + m[0] = _mm_or_ps(m[0], v.m[0]); + m[1] = _mm_or_ps(m[1], v.m[1]); + } + + __forceinline void operator ^= (const GSVector8& v) + { + m[0] = _mm_xor_ps(m[0], v.m[0]); + m[1] = _mm_xor_ps(m[1], v.m[1]); + } + + __forceinline friend GSVector8 operator + (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_add_ps(v1.m[0], v2.m[0]), _mm_add_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator - (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_sub_ps(v1.m[0], v2.m[0]), _mm_sub_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator * (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_mul_ps(v1.m[0], v2.m[0]), _mm_mul_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator / (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_div_ps(v1.m[0], v2.m[0]), _mm_div_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_and_ps(v1.m[0], v2.m[0]), _mm_and_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator | (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_or_ps(v1.m[0], v2.m[0]), _mm_or_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator ^ (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_xor_ps(v1.m[0], v2.m[0]), _mm_xor_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator == (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_cmpeq_ps(v1.m[0], v2.m[0]), _mm_cmpeq_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator != (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_cmpneq_ps(v1.m[0], v2.m[0]), _mm_cmpeq_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator > (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_cmpgt_ps(v1.m[0], v2.m[0]), _mm_cmpgt_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator < (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_cmplt_ps(v1.m[0], v2.m[0]), _mm_cmplt_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator >= (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_cmpge_ps(v1.m[0], v2.m[0]), _mm_cmpge_ps(v1.m[1], v2.m[1])); + } + + __forceinline friend GSVector8 operator <= (const GSVector8& v1, const GSVector8& v2) + { + return GSVector8(_mm_cmple_ps(v1.m[0], v2.m[0]), _mm_cmple_ps(v1.m[1], v2.m[1])); + } + + __forceinline static __m128 VECTOR8_SELECT(const GSVector8& v1, const GSVector8& v2, int n) + { + switch(n) + { + case 0: return v1.m[0]; + case 1: return v1.m[1]; + case 2: return v2.m[0]; + case 3: return v2.m[1]; + } + + return _mm_setzero_ps(); + } + + #define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \ + __forceinline GSVector8 xs##ys() const {return GSVector8(VECTOR8_SELECT(*this, *this, xn), VECTOR8_SELECT(*this, *this, yn));} \ + __forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(VECTOR8_SELECT(*this, v, xn), VECTOR8_SELECT(*this, v, yn));} \ + + #define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + __forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm_shuffle_ps(m[0], m[0], _MM_SHUFFLE(wn, zn, yn, xn)), _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(wn, zn, yn, xn)));} \ + __forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm_shuffle_ps(m[0], v.m[0], _MM_SHUFFLE(wn, zn, yn, xn)), _mm_shuffle_ps(m[1], v.m[1], _MM_SHUFFLE(wn, zn, yn, xn)));} \ + + #endif + #define VECTOR8_PERMUTE_1(xs, xn) \ VECTOR8_PERMUTE_2(xs, xn, x, 0) \ VECTOR8_PERMUTE_2(xs, xn, y, 1) \ @@ -3423,10 +3965,6 @@ public: VECTOR8_PERMUTE_1(w, 3) VECTOR8_PERMUTE_1(_, 8) - #define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ - __forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - __forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ - #define VECTOR8_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ @@ -3440,8 +3978,6 @@ public: VECTOR8_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ #define VECTOR8_SHUFFLE_1(xs, xn) \ - __forceinline GSVector8 xs##4() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ - __forceinline GSVector8 xs##4(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \ VECTOR8_SHUFFLE_2(xs, xn, x, 0) \ VECTOR8_SHUFFLE_2(xs, xn, y, 1) \ VECTOR8_SHUFFLE_2(xs, xn, z, 2) \ @@ -3453,6 +3989,8 @@ public: VECTOR8_SHUFFLE_1(w, 3) }; +#if _M_SSE >= 0x500 + __forceinline GSVector8i::GSVector8i(const GSVector8& v) { m = _mm256_cvttps_epi32(v); @@ -3473,6 +4011,40 @@ __forceinline GSVector8 GSVector8::cast(const GSVector8i& v) return GSVector8(_mm256_castsi256_ps(v.m)); } +#else + +__forceinline GSVector8i::GSVector8i(const GSVector8& v) +{ + m[0] = _mm_cvttps_epi32(v.m[0]); + m[1] = _mm_cvttps_epi32(v.m[1]); +} + +__forceinline GSVector8::GSVector8(const GSVector8i& v) +{ + m[0] = _mm_cvtepi32_ps(v.m[0]); + m[1] = _mm_cvtepi32_ps(v.m[1]); +} + +__forceinline GSVector8i GSVector8i::cast(const GSVector8& v) +{ + GSVector8i v2; + + v2.m[0] = _mm_castps_si128(v.m[0]); + v2.m[1] = _mm_castps_si128(v.m[1]); + + return v2; +} + +__forceinline GSVector8 GSVector8::cast(const GSVector8i& v) +{ + GSVector8 v2; + + v2.m[0] = _mm_castsi128_ps(v.m[0]); + v2.m[1] = _mm_castsi128_ps(v.m[1]); + + return v2; +} + #endif #pragma pack(pop) diff --git a/plugins/GSdx/GSVertexSW.h b/plugins/GSdx/GSVertexSW.h index e2d7151ec5..953cd7a672 100644 --- a/plugins/GSdx/GSVertexSW.h +++ b/plugins/GSdx/GSVertexSW.h @@ -30,13 +30,73 @@ __aligned(struct, 16) GSVertexSW __forceinline GSVertexSW() {} __forceinline GSVertexSW(const GSVertexSW& v) {*this = v;} - __forceinline void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;} - __forceinline void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;} + __forceinline static GSVertexSW zero() + { + GSVertexSW v; - friend GSVertexSW operator + (const GSVertexSW& v1, const GSVertexSW& v2); - friend GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2); - friend GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv); - friend GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv); + v.p = GSVector4::zero(); + v.t = GSVector4::zero(); + v.c = GSVector4::zero(); + + return v; + } + __forceinline void operator = (const GSVertexSW& v) + { + p = v.p; + t = v.t; + c = v.c; + } + + __forceinline void operator += (const GSVertexSW& v) + { + p += v.p; + t += v.t; + c += v.c; + } + + __forceinline friend GSVertexSW operator + (const GSVertexSW& a, const GSVertexSW& b) + { + GSVertexSW v; + + v.p = a.p + b.p; + v.t = a.t + b.t; + v.c = a.c + b.c; + + return v; + } + + __forceinline friend GSVertexSW operator - (const GSVertexSW& a, const GSVertexSW& b) + { + GSVertexSW v; + + v.p = a.p - b.p; + v.t = a.t - b.t; + v.c = a.c - b.c; + + return v; + } + + __forceinline friend GSVertexSW operator * (const GSVertexSW& a, const GSVector4& b) + { + GSVertexSW v; + + v.p = a.p * b; + v.t = a.t * b; + v.c = a.c * b; + + return v; + } + + __forceinline friend GSVertexSW operator / (const GSVertexSW& a, const GSVector4& b) + { + GSVertexSW v; + + v.p = a.p / b; + v.t = a.t / b; + v.c = a.c / b; + + return v; + } static bool IsQuad(const GSVertexSW* v, int& tl, int& br) { @@ -122,6 +182,25 @@ __aligned(struct, 16) GSVertexSW br = i; + #if _M_SSE >= 0x500 + + { + // p.z, p.w, t.z, t.w, c.x, c.y, c.z, c.w + + GSVector8 v0 = GSVector8(v[0].p.zwzw(v[0].t), v[0].c); + GSVector8 v1 = GSVector8(v[1].p.zwzw(v[1].t), v[1].c); + GSVector8 v2 = GSVector8(v[2].p.zwzw(v[2].t), v[2].c); + GSVector8 v3 = GSVector8(v[3].p.zwzw(v[3].t), v[3].c); + GSVector8 v4 = GSVector8(v[4].p.zwzw(v[4].t), v[4].c); + GSVector8 v5 = GSVector8(v[5].p.zwzw(v[5].t), v[5].c); + + GSVector8 test = ((v0 == v1) & (v0 == v2)) & ((v0 == v3) & (v0 == v4)) & (v0 == v5); + + return test.alltrue(); + } + + #else + v0 = v[0].p.zwzw(v[0].t); v1 = v[1].p.zwzw(v[1].t); v2 = v[2].p.zwzw(v[2].t); @@ -151,42 +230,7 @@ __aligned(struct, 16) GSVertexSW } return true; + + #endif } -}; - -__forceinline GSVertexSW operator + (const GSVertexSW& v1, const GSVertexSW& v2) -{ - GSVertexSW v0; - v0.c = v1.c + v2.c; - v0.p = v1.p + v2.p; - v0.t = v1.t + v2.t; - return v0; -} - -__forceinline GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2) -{ - GSVertexSW v0; - v0.c = v1.c - v2.c; - v0.p = v1.p - v2.p; - v0.t = v1.t - v2.t; - return v0; -} - -__forceinline GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv) -{ - GSVertexSW v0; - v0.c = v.c * vv; - v0.p = v.p * vv; - v0.t = v.t * vv; - return v0; -} - -__forceinline GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv) -{ - GSVertexSW v0; - v0.c = v.c / vv; - v0.p = v.p / vv; - v0.t = v.t / vv; - return v0; -} - +}; \ No newline at end of file diff --git a/plugins/GSdx/GSVertexTrace.cpp b/plugins/GSdx/GSVertexTrace.cpp index 3c1a6d1ceb..2510c724f7 100644 --- a/plugins/GSdx/GSVertexTrace.cpp +++ b/plugins/GSdx/GSVertexTrace.cpp @@ -48,6 +48,52 @@ uint32 GSVertexTrace::Hash(GS_PRIM_CLASS primclass) return hash; } +void GSVertexTrace::UpdateLOD() +{ + if(!m_state->PRIM->TME) return; + + const GIFRegTEX1& TEX1 = m_state->m_context->TEX1; + + m_filter.mmag = TEX1.IsMagLinear(); + m_filter.mmin = TEX1.IsMinLinear(); + + if(TEX1.MXL == 0) // MXL == 0 => MMIN ignored, tested it on ps2 + { + m_filter.linear = m_filter.mmag; + + return; + } + + float K = (float)TEX1.K / 16; + + if(TEX1.LCM == 0) // && m_state->PRIM->FST == 0 // if FST => assume Q = 1.0f (should not, but Q is very often bogus, 0 or DEN) + { + // LOD = log2(1/|Q|) * (1 << L) + K + + GSVector4::storel(&m_lod, m_max.t.uph(m_min.t).log2(2).neg() * (float)(1 << TEX1.L) + K); + + if(m_lod.x > m_lod.y) {float tmp = m_lod.x; m_lod.x = m_lod.x; m_lod.y = tmp;} + } + else + { + m_lod.x = K; + m_lod.y = K; + } + + if(m_lod.y <= 0) + { + m_filter.linear = m_filter.mmag; + } + else if(m_lod.x > 0) + { + m_filter.linear = m_filter.mmin; + } + else + { + m_filter.linear = m_filter.mmag | m_filter.mmin; + } +} + void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass) { m_map_sw[Hash(primclass)](count, v, m_min, m_max); @@ -55,6 +101,8 @@ void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primcla m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20); m_alpha.valid = false; + + UpdateLOD(); } void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass) @@ -87,6 +135,8 @@ void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primcl m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20); m_alpha.valid = false; + + UpdateLOD(); } void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass) @@ -119,4 +169,7 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20); m_alpha.valid = false; + + UpdateLOD(); } + diff --git a/plugins/GSdx/GSVertexTrace.h b/plugins/GSdx/GSVertexTrace.h index f7eaf35361..d18c73c1cf 100644 --- a/plugins/GSdx/GSVertexTrace.h +++ b/plugins/GSdx/GSVertexTrace.h @@ -32,7 +32,7 @@ class GSState; __aligned(class, 32) GSVertexTrace { public: - struct Vertex {GSVector4i c; GSVector4 p, t;}; + struct Vertex {GSVector4i c; GSVector4 p, t;}; // t.xy * 0x10000 struct VertexAlpha {int min, max; bool valid;}; private: @@ -60,16 +60,23 @@ private: GSCodeGeneratorFunctionMap m_map_hw9; GSCodeGeneratorFunctionMap m_map_hw11; + const GSState* m_state; + uint32 Hash(GS_PRIM_CLASS primclass); - const GSState* m_state; + void UpdateLOD(); static const GSVector4 s_minmax; public: GS_PRIM_CLASS m_primclass; - Vertex m_min, m_max; // t.xy * 0x10000 - VertexAlpha m_alpha; // source alpha range after tfx, GSRenderer::GetAlphaMinMax() updates it + + Vertex m_min; + Vertex m_max; + + // source alpha range after tfx, GSRenderer::GetAlphaMinMax() updates it + + VertexAlpha m_alpha; union { @@ -78,10 +85,19 @@ public: struct {uint32 rgba:16, xyzf:4, stq:4;}; } m_eq; + union + { + struct {uint32 mmag:1, mmin:1, linear:1;}; + } m_filter; + + GSVector2 m_lod; // x = min, y = max + GSVertexTrace(const GSState* state); void Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass); void Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass); void Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass); void Update(const GSVertexNull* v, int count, GS_PRIM_CLASS primclass) {} + + bool IsLinear() const {return m_filter.linear;} }; diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 9695f5c58c..a9a6a3c209 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -107,7 +107,7 @@ using namespace stdext; #define __aligned(t, n) t __attribute__((aligned(n))) #define __fastcall __attribute__((fastcall)) - #define EXPORT_C_(type) extern "C" type + #define EXPORT_C_(type) extern "C" __attribute__((stdcall,externally_visible,visibility("default"))) type #define EXPORT_C EXPORT_C_(void) #ifdef __GNUC__