diff --git a/plugins/GSdx/GPURenderer.h b/plugins/GSdx/GPURenderer.h index 688af6e62c..4a9884d736 100644 --- a/plugins/GSdx/GPURenderer.h +++ b/plugins/GSdx/GPURenderer.h @@ -100,7 +100,7 @@ protected: r.right = r.left + 256; r.bottom = r.top + 256; - Dump(format("da_%d_%d_%d_%d_%d", m_env.STATUS.TP, r).c_str(), m_env.STATUS.TP, r, false); + Dump(format("da_%d_%d_%d_%d_%d", m_env.STATUS.TP, r.left, r.top, r.right, r.bottom).c_str(), m_env.STATUS.TP, r, false); } */ diff --git a/plugins/GSdx/GPUState.cpp b/plugins/GSdx/GPUState.cpp index 354e9fe20a..f023c5020c 100644 --- a/plugins/GSdx/GPUState.cpp +++ b/plugins/GSdx/GPUState.cpp @@ -610,20 +610,23 @@ int GPUState::PH_Read(GPUReg* r, int size) int w = r[2].XY.X; int h = r[2].XY.Y; - GSVector4i r2; + if(w > 0 && h > 0) + { + GSVector4i r2; - r2.left = r[1].XY.X; - r2.top = r[1].XY.Y; - r2.right = r2.left + w; - r2.bottom = r2.top + h; + r2.left = r[1].XY.X; + r2.top = r[1].XY.Y; + r2.right = r2.left + w; + r2.bottom = r2.top + h; - m_read.bytes = ((w * h + 1) & ~1) * 2; - m_read.cur = 0; - m_read.Reserve(m_read.bytes); + m_read.bytes = ((w * h + 1) & ~1) * 2; + m_read.cur = 0; + m_read.Reserve(m_read.bytes); - m_mem.ReadRect(r2, (uint16*)m_read.buff); + m_mem.ReadRect(r2, (uint16*)m_read.buff); - Dump("r"); + Dump("r"); + } m_env.STATUS.IMG = 1; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index e2578b6396..60c2ecd300 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -2764,42 +2764,33 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) if(m_sel.mmin) { - int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; + const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; if(pixels == 4) { vmovdqa(ptr[&m_local.temp.test], xmm7); + } - for(int j = 0; j < 4; j++) + for(int j = 0; j < 4; j++) + { + mov(ebx, ptr[&lod_i->u32[j]]); + mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]); + + for(int i = 0; i < pixels; i++) { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]); - - for(int i = 0; i < 4; i++) - { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } + } + if(pixels == 4) + { vmovdqa(xmm5, xmm7); vmovdqa(xmm7, ptr[&m_local.temp.test]); } - else - { - for(int j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm6, xmm5, j); - } - } } else { - int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - - // TODO: might be faster to read in columns, inserts into the same register would be further from eachother (last one overwrites xmm5, need to use xmm7) + const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; for(int i = 0; i < pixels; i++) { diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 7e0ace9388..2f97cd953d 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -2928,36 +2928,29 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) { #if _M_SSE >= 0x401 - int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; + const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; if(pixels == 4) { movdqa(ptr[&m_local.temp.test], xmm7); + } - for(int j = 0; j < 4; j++) + for(int j = 0; j < 4; j++) + { + mov(ebx, ptr[&lod_i->u32[j]]); + mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]); + + for(int i = 0; i < pixels; i++) { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]); - - for(int i = 0; i < 4; i++) - { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } + } + if(pixels == 4) + { movdqa(xmm5, xmm7); movdqa(xmm7, ptr[&m_local.temp.test]); } - else - { - for(int j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[edx + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(xmm6, xmm5, j); - } - } #else @@ -3082,12 +3075,10 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) } else { - int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; + const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; #if _M_SSE >= 0x401 - // TODO: might be faster to read in columns, inserts into the same register would be further from eachother (last one overwrites xmm5, need to use xmm7) - for(int i = 0; i < pixels; i++) { for(int j = 0; j < 4; j++) @@ -3098,7 +3089,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) #else - int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; + const int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; for(int i = 0; i < pixels; i++) { diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index a457e14843..9489ea05d7 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -86,7 +86,8 @@ void GSRasterizer::Draw(const GSRasterizerData* data) { case GS_POINT_CLASS: m_stats.prims = count; - for(int i = 0; i < count; i++) DrawPoint(&vertices[i]); + if(data->scissor_test) DrawPoint(vertices, count); + else DrawPoint(vertices, count); break; case GS_LINE_CLASS: ASSERT(!(count & 1)); @@ -117,19 +118,23 @@ void GSRasterizer::GetStats(GSRasterizerStats& stats) stats = m_stats; } -void GSRasterizer::DrawPoint(const GSVertexSW* v) +template +void GSRasterizer::DrawPoint(const GSVertexSW* v, int count) { - GSVector4i p(v->p); - - if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom) + for(; count > 0; count--, v++) { - if(IsOneOfMyScanlines(p.y)) + GSVector4i p(v->p); + + if(!scissor_test || m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom) { - m_stats.pixels++; + if(IsOneOfMyScanlines(p.y)) + { + m_stats.pixels++; - m_ds->SetupPrim(v, *v); + m_ds->SetupPrim(v, *v); - m_ds->DrawScanline(1, p.x, p.y, *v); + m_ds->DrawScanline(1, p.x, p.y, *v); + } } } } @@ -206,11 +211,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v) { if(IsOneOfMyScanlines(p.y)) { - *e = edge; - - e->p.i16[0] = (int16)p.x; - e->p.i16[1] = (int16)p.y; - e->p.i16[2] = 1; + AddScanline(e, 1, p.x, p.y, edge); e++; } @@ -243,8 +244,6 @@ static const int s_abc[8][4] = void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) { - // TODO: GSVertexSW::c/t could be merged into a GSVector8 - GSVertexSW v[4]; GSVertexSW dv[3]; GSVertexSW ddv[3]; @@ -338,7 +337,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); l.p = l.p.xxzw(); // r.x => l.y - dl.p = dl.p.upl(ddv[1 - j].p).xyzw(dl.p); // dr.x => dl.y + dl.p = dl.p.insert<0, 1>(ddv[1 - j].p); // dr.x => dl.y l += dl * dy; @@ -351,13 +350,13 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) // v[3] isn't accurate enough, it may leave gaps horizontally if it happens to be on the left side of the triangle // example: previous triangle's scanline ends on 48.9999, this one's starts from 49.0001, the pixel at 49 isn't drawn - GSVertexSW l = v[1 + (1 << j)]; + GSVertexSW l = v[1 + (j << 1)]; GSVertexSW dl = ddv[2 - j]; GSVector4 dy = tbmax.zzzz() - l.p.yyyy(); - l.p = l.p.upl(v[3 - (1 << j)].p).xyzw(l.p); // r.x => l.y - dl.p = dl.p.upl(ddv[1 + j].p).xyzw(dl.p); // dr.x => dl.y + l.p = l.p.insert<0, 1>(v[3 - (j << 1)].p); // r.x => l.y + dl.p = dl.p.insert<0, 1>(ddv[1 + j].p); // dr.x => dl.y l += dl * dy; @@ -375,8 +374,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); - l.p = l.p.upl(v[1 - j].p).xyzw(l.p); // r.x => l.y - dl.p = dl.p.upl(ddv[2 - j].p).xyzw(dl.p); // dr.x => dl.y + l.p = l.p.insert<0, 1>(v[1 - j].p); // r.x => l.y + dl.p = dl.p.insert<0, 1>(ddv[2 - j].p); // dr.x => dl.y l += dl * dy; @@ -395,7 +394,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) GSVector4 dy = tbmax.xxxx() - l.p.yyyy(); l.p = l.p.xxzw(); // r.x => l.y - dl.p = dl.p.upl(ddv[1 - j].p).xyzw(dl.p); // dr.x => dl.y + dl.p = dl.p.insert<0, 1>(ddv[1 - j].p); // dr.x => dl.y l += dl * dy; @@ -417,13 +416,15 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count]; + GSVector4 scissor = m_fscissor.xzxz(); + while(1) { if(IsOneOfMyScanlines(top)) { GSVector4 lrf = l.p.ceil(); - GSVector4 lrmax = lrf.max(m_fscissor.xzxz()); - GSVector4 lrmin = lrf.min(m_fscissor.xzxz()); + GSVector4 lrmax = lrf.max(scissor); + GSVector4 lrmin = lrf.min(scissor); GSVector4i lr = GSVector4i(lrmax.xxyy(lrmin)); int left = lr.extract32<0>(); @@ -435,11 +436,9 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const { m_stats.pixels += pixels; - *e = l + dscan * (lrmax - l.p).xxxx(); + GSVector4 prestep = lrmax - l.p; - e->p.i16[0] = (int16)left; - e->p.i16[1] = (int16)top; - e->p.i16[2] = (int16)pixels; + AddScanline(e, pixels, left, top, l + dscan * prestep.xxxx()); e++; } @@ -496,8 +495,12 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices) dedge.t = (dv.t / dv.p.yyyy()).xyxy(zero).wyww(); dscan.t = (dv.t / dv.p.xxxx()).xyxy(zero).xwww(); - if(scan.p.y < (float)r.top) scan.t += dedge.t * ((float)r.top - scan.p.y); - if(scan.p.x < (float)r.left) scan.t += dscan.t * ((float)r.left - scan.p.x); + GSVector4 prestep = GSVector4(r.left, r.top) - scan.p; + + int m = (prestep == GSVector4::zero()).mask(); + + if((m & 2) == 0) scan.t += dedge.t * prestep.yyyy(); + if((m & 1) == 0) scan.t += dscan.t * prestep.xxxx(); m_ds->SetupPrim(v, dscan); @@ -581,16 +584,10 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) { - m_stats.pixels++; - - *e = edge; + AddScanline(e, 1, xi, top, edge); e->t.u32[3] = (0x10000 - xf) & 0xffff; - e->p.i16[0] = (int16)xi; - e->p.i16[1] = (int16)top; - e->p.i16[2] = 1; - e++; } @@ -609,16 +606,10 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(xi)) { - m_stats.pixels++; - - *e = edge; + AddScanline(e, 1, xi, top, edge); e->t.u32[3] = xf; - e->p.i16[0] = (int16)xi; - e->p.i16[1] = (int16)top; - e->p.i16[2] = 1; - e++; } @@ -678,16 +669,10 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) { - m_stats.pixels++; + AddScanline(e, 1, left, yi, edge); - *e = edge; - e->t.u32[3] = (0x10000 - yf) & 0xffff; - e->p.i16[0] = (int16)left; - e->p.i16[1] = (int16)yi; - e->p.i16[2] = 1; - e++; } @@ -706,16 +691,10 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi)) { - m_stats.pixels++; + AddScanline(e, 1, left, yi, edge); - *e = edge; - e->t.u32[3] = yf; - e->p.i16[0] = (int16)left; - e->p.i16[1] = (int16)yi; - e->p.i16[2] = 1; - e++; } @@ -727,7 +706,20 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS } } - m_edge.count += e - &m_edge.buff[m_edge.count]; + int count = e - &m_edge.buff[m_edge.count]; + + m_stats.pixels += count; + + m_edge.count += count; +} + +void GSRasterizer::AddScanline(GSVertexSW* e, int pixels, int left, int top, const GSVertexSW& scan) +{ + *e = scan; + + e->p.i16[0] = (int16)pixels; + e->p.i16[1] = (int16)left; + e->p.i16[2] = (int16)top; } void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge) @@ -741,18 +733,31 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo m_ds->SetupPrim(vertices, dscan); const GSVertexSW* RESTRICT e = m_edge.buff; - - int i = 0; + const GSVertexSW* RESTRICT ee = e + count; if(!edge) { - do {m_ds->DrawScanline(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);} - while(++i < count); + do + { + int pixels = e->p.i16[0]; + int left = e->p.i16[1]; + int top = e->p.i16[2]; + + m_ds->DrawScanline(pixels, left, top, *e++); + } + while(e < ee); } else { - do {m_ds->DrawEdge(e[i].p.i16[2], e[i].p.i16[0], e[i].p.i16[1], e[i]);} - while(++i < count); + do + { + int pixels = e->p.i16[0]; + int left = e->p.i16[1]; + int top = e->p.i16[2]; + + m_ds->DrawEdge(pixels, left, top, *e++); + } + while(e < ee); } m_edge.count = 0; diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index dfb7725621..a3e977de77 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -31,11 +31,14 @@ __aligned(class, 32) GSRasterizerData { public: GSVector4i scissor; + bool scissor_test; GS_PRIM_CLASS primclass; const GSVertexSW* vertices; int count; uint64 frame; const void* param; + + GSRasterizerData() : scissor_test(true) {} }; class IDrawScanline : public GSAlignedClass<32> @@ -60,8 +63,8 @@ public: virtual void PrintStats() = 0; __forceinline void SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) {m_sp(vertices, dscan);} - __forceinline void DrawScanline(int right, int left, int top, const GSVertexSW& scan) {m_ds(right, left, top, scan);} - __forceinline void DrawEdge(int right, int left, int top, const GSVertexSW& scan) {m_de(right, left, top, scan);} + __forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) {m_ds(pixels, left, top, scan);} + __forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) {m_de(pixels, left, top, scan);} __forceinline void DrawRect(const GSVector4i& r, const GSVertexSW& v) {(this->*m_dr)(r, v);} __forceinline bool IsEdge() const {return m_de != NULL;} @@ -90,18 +93,20 @@ protected: GSVector4 m_fscissor; struct {GSVertexSW* buff; int count;} m_edge; - void DrawPoint(const GSVertexSW* v); + typedef void (GSRasterizer::*DrawPrimPtr)(const GSVertexSW* v, int count); + + template + void DrawPoint(const GSVertexSW* v, int count); void DrawLine(const GSVertexSW* v); void DrawTriangle(const GSVertexSW* v); void DrawSprite(const GSVertexSW* v); - void DrawEdge(const GSVertexSW* v); __forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan); void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, int orientation, int side); __forceinline bool IsOneOfMyScanlines(int scanline) const; - + __forceinline void AddScanline(GSVertexSW* e, int pixels, int left, int top, const GSVertexSW& scan); __forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false); public: diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index c603ed0b63..b59736b234 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -72,7 +72,8 @@ void GSRendererSW::VSync(int field) m_reset = false; } - // if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats(); + // +if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats(); } void GSRendererSW::ResetDevice() @@ -171,18 +172,21 @@ void GSRendererSW::Draw() s_n++; } + GSVector4i scissor(m_context->scissor.in); + GSVector4i bbox = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)); + GSVector4i r = bbox.rintersect(scissor); + GSRasterizerData data; - data.scissor = GSVector4i(m_context->scissor.in); - data.scissor.z = min(data.scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour + data.scissor = scissor; + data.scissor.z = std::min(data.scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour + data.scissor_test = !bbox.eq(r); data.primclass = m_vt.m_primclass; data.vertices = m_vertices; data.count = m_count; data.frame = m_perfmon.GetFrame(); data.param = &gd; - GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(data.scissor); - m_rl.Draw(&data, r.width(), r.height()); if(gd.sel.fwrite) @@ -364,8 +368,6 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) // 100 l round // 101 l tri - // TODO: (int)m_vt.m_lod.x >= mxl => LCM == 1 - if(m_vt.m_lod.x > 0) { gd.sel.ltf = context->TEX1.MMIN >> 2; @@ -383,13 +385,9 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) if(gd.sel.mmin == 2) { - mxl--; + mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?) } - gd.mxl = GSVector4((float)mxl); - gd.l = GSVector4((float)(-0x10000 << context->TEX1.L)); - gd.k = GSVector4((float)k); - if(gd.sel.fst) { ASSERT(gd.sel.lcm == 1); @@ -398,6 +396,13 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) gd.sel.lcm = 1; } + if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) + { + k = (int)m_vt.m_lod.x << 16; + + gd.sel.lcm = 1; + } + if(gd.sel.lcm) { int lod = std::max(std::min(k, mxl), 0); @@ -412,6 +417,12 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) // TODO: lot to optimize when lod is constant } + else + { + gd.mxl = GSVector4((float)mxl); + gd.l = GSVector4((float)(-0x10000 << context->TEX1.L)); + gd.k = GSVector4((float)k); + } GIFRegTEX0 MIP_TEX0 = context->TEX0; GIFRegCLAMP MIP_CLAMP = context->CLAMP; @@ -486,8 +497,6 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) } else { - // TODO: these shortcuts are not compatible with mipmapping, yet - if(gd.sel.fst == 0) { // skip per pixel division if q is constant @@ -507,8 +516,6 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) v[i].t *= w; } } - - // TODO: q is now destoroyed, but since q is constant we should be able to pre-calc gd.lod and change LCM to 1 } else if(primclass == GS_SPRITE_CLASS) { @@ -521,8 +528,6 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) v[i + 0].t *= w; v[i + 1].t *= w; } - - // TODO: preserve q, or if there only one sprite then see the comment above } } diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h index b14fc6708f..6a20d73c62 100644 --- a/plugins/GSdx/GSThread.h +++ b/plugins/GSdx/GSThread.h @@ -66,6 +66,20 @@ public: void Set() {SetEvent(m_hEvent);} bool Wait() {return WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0;} }; +/* +class GSAutoResetEvent +{ +protected: + long m_sync; + +public: + GSAutoResetEvent() {m_sync = 0;} + ~GSAutoResetEvent() {} + + void Set() {_interlockedbittestandset(&m_sync, 0);} + bool Wait() {while(!_interlockedbittestandreset(&m_sync, 0)) _mm_pause(); return true;} +}; +*/ #else diff --git a/plugins/GSdx/GSVector.cpp b/plugins/GSdx/GSVector.cpp index 0ebb6d8eef..eb2769744a 100644 --- a/plugins/GSdx/GSVector.cpp +++ b/plugins/GSdx/GSVector.cpp @@ -26,7 +26,8 @@ const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f); const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f); const GSVector4 GSVector4::m_half(0.5f); const GSVector4 GSVector4::m_one(1.0f); -const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000))); +const GSVector4 GSVector4::m_two(2.0f); +const GSVector4 GSVector4::m_four(4.0f); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); GSVector4i GSVector4i::fit(int arx, int ary) const diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index f6bbcbb00a..334b2a97de 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -2330,8 +2330,8 @@ public: static const GSVector4 m_ps4567; static const GSVector4 m_half; static const GSVector4 m_one; - - static const GSVector4 m_x3f800000; + static const GSVector4 m_two; + static const GSVector4 m_four; static const GSVector4 m_x4b000000; __forceinline GSVector4() @@ -2462,12 +2462,12 @@ public: if((mode & 7) == (Round_NegInf & 7)) { - return b - ((a < b) & m_x3f800000); + return b - ((a < b) & m_one); } if((mode & 7) == (Round_PosInf & 7)) { - return b + ((a > b) & m_x3f800000); + return b + ((a > b) & m_one); } ASSERT((mode & 7) == (Round_NearestInt & 7)); // other modes aren't implemented @@ -2702,7 +2702,66 @@ public: #endif } - // TODO: insert + template __forceinline GSVector4 insert(const GSVector4& v) const + { + #if 0 // _M_SSE >= 0x401 + + // NOTE: it's faster with shuffles... + + return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); + + #else + + switch(dst) + { + case 0: + switch(src) + { + case 0: return v.xxyy(*this).xzzw(*this); + case 1: return v.yyyy(*this).xzzw(*this); + case 2: return v.zzyy(*this).xzzw(*this); + case 3: return v.wwyy(*this).xzzw(*this); + default: __assume(0); + } + break; + case 1: + switch(src) + { + case 0: return v.xxxx(*this).zxzw(*this); + case 1: return v.yyxx(*this).zxzw(*this); + case 2: return v.zzxx(*this).zxzw(*this); + case 3: return v.wwxx(*this).zxzw(*this); + default: __assume(0); + } + break; + case 2: + switch(src) + { + case 0: return xyxz(v.xxww(*this)); + case 1: return xyxz(v.yyww(*this)); + case 2: return xyxz(v.zzww(*this)); + case 3: return xyxz(v.wwww(*this)); + default: __assume(0); + } + break; + case 3: + switch(src) + { + case 0: return xyzx(v.xxzz(*this)); + case 1: return xyzx(v.yyzz(*this)); + case 2: return xyzx(v.zzzz(*this)); + case 3: return xyzx(v.wwzz(*this)); + default: __assume(0); + } + break; + default: + __assume(0); + } + + #endif + + return *this; + } template __forceinline int extract() const {