From f3b6829c74c8d3530c4d6f5ed03e1094e489cb5e Mon Sep 17 00:00:00 2001 From: Jonathan Li Date: Mon, 11 Jan 2016 17:44:17 +0000 Subject: [PATCH 1/5] gsdx-ogl: Restart collision detection algorithm for remaining sprites It's useless on its own, but it prepares for the next commit. --- plugins/GSdx/GSRendererOGL.cpp | 93 +++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 3785a2b4e1..40d5b820f1 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -485,59 +485,68 @@ GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap() // Check intersection of sprite primitive only size_t count = m_vertex.next; - GSVertex* v = &m_vertex.buff[0]; + PRIM_OVERLAP overlap = PRIM_OVERLAP_NO; + GSVertex* v = m_vertex.buff; - // In order to speed up comparaison a boundind-box is accumulated. It removes a - // loop so code is much faster (check game virtua fighter). Besides it allow to check - // properly the Y order. - GSVector4i all; - //FIXME better vector operation - if (v[1].XYZ.Y < v[0].XYZ.Y) { - all.y = v[1].XYZ.Y; - all.w = v[0].XYZ.Y; - } else { - all.y = v[0].XYZ.Y; - all.w = v[1].XYZ.Y; - } - if (v[1].XYZ.X < v[0].XYZ.X) { - all.x = v[1].XYZ.X; - all.z = v[0].XYZ.X; - } else { - all.x = v[0].XYZ.X; - all.z = v[1].XYZ.X; - } - - for(size_t i = 2; i < count; i += 2) { - GSVector4i sprite; + size_t i = 0; + while (i < count) { + // In order to speed up comparison a bounding-box is accumulated. It removes a + // loop so code is much faster (check game virtua fighter). Besides it allow to check + // properly the Y order. 
+ GSVector4i all; //FIXME better vector operation if (v[i+1].XYZ.Y < v[i+0].XYZ.Y) { - sprite.y = v[i+1].XYZ.Y; - sprite.w = v[i+0].XYZ.Y; + all.y = v[i+1].XYZ.Y; + all.w = v[i+0].XYZ.Y; } else { - sprite.y = v[i+0].XYZ.Y; - sprite.w = v[i+1].XYZ.Y; + all.y = v[i+0].XYZ.Y; + all.w = v[i+1].XYZ.Y; } if (v[i+1].XYZ.X < v[i+0].XYZ.X) { - sprite.x = v[i+1].XYZ.X; - sprite.z = v[i+0].XYZ.X; + all.x = v[i+1].XYZ.X; + all.z = v[i+0].XYZ.X; } else { - sprite.x = v[i+0].XYZ.X; - sprite.z = v[i+1].XYZ.X; + all.x = v[i+0].XYZ.X; + all.z = v[i+1].XYZ.X; } - // Be sure to get vertex in good order, otherwise .r* function doesn't - // work as expected. - ASSERT(sprite.x <= sprite.z); - ASSERT(sprite.y <= sprite.w); - ASSERT(all.x <= all.z); - ASSERT(all.y <= all.w); + size_t j = i + 2; + while (j < count) { + GSVector4i sprite; + //FIXME better vector operation + if (v[j+1].XYZ.Y < v[j+0].XYZ.Y) { + sprite.y = v[j+1].XYZ.Y; + sprite.w = v[j+0].XYZ.Y; + } else { + sprite.y = v[j+0].XYZ.Y; + sprite.w = v[j+1].XYZ.Y; + } + if (v[j+1].XYZ.X < v[j+0].XYZ.X) { + sprite.x = v[j+1].XYZ.X; + sprite.z = v[j+0].XYZ.X; + } else { + sprite.x = v[j+0].XYZ.X; + sprite.z = v[j+1].XYZ.X; + } - if (all.rintersect(sprite).rempty()) { - all = all.runion(sprite); - } else { - return PRIM_OVERLAP_YES; + // Be sure to get vertex in good order, otherwise .r* function doesn't + // work as expected. 
+ ASSERT(sprite.x <= sprite.z); + ASSERT(sprite.y <= sprite.w); + ASSERT(all.x <= all.z); + ASSERT(all.y <= all.w); + + if (all.rintersect(sprite).rempty()) { + all = all.runion(sprite); + } else { + overlap = PRIM_OVERLAP_YES; + break; + } + j += 2; } + i = j; } + #if 0 // Old algo: less constraint but O(n^2) instead of O(n) as above @@ -580,7 +589,7 @@ GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap() #endif //fprintf(stderr, "%d: Yes, code can be optimized (draw of %d vertices)\n", s_n, count); - return PRIM_OVERLAP_NO; + return overlap; } GSVector4i GSRendererOGL::ComputeBoundingBox(const GSVector2& rtscale, const GSVector2i& rtsize) From 74ace74d5058b06650d2538ab3fb58856e6bb383 Mon Sep 17 00:00:00 2001 From: Jonathan Li Date: Mon, 11 Jan 2016 17:56:08 +0000 Subject: [PATCH 2/5] gsdx-ogl: Group non-overlapped accurate blend sprite draws Reduces the number of draw calls and barriers when drawing sprites. --- plugins/GSdx/GSRendererOGL.cpp | 13 ++++++++++++- plugins/GSdx/GSRendererOGL.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 40d5b820f1..35415b05f4 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -31,6 +31,9 @@ GSRendererOGL::GSRendererOGL() m_sw_blending = theApp.GetConfig("accurate_blending_unit", 1); + // Hope nothing requires too many draw calls. + m_drawlist.reserve(2048); + UserHacks_TCOffset = theApp.GetConfig("UserHacks_TCOffset", 0); UserHacks_TCO_x = (UserHacks_TCOffset & 0xFFFF) / -1000.0f; UserHacks_TCO_y = ((UserHacks_TCOffset >> 16) & 0xFFFF) / -1000.0f; @@ -488,6 +491,7 @@ GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap() PRIM_OVERLAP overlap = PRIM_OVERLAP_NO; GSVertex* v = m_vertex.buff; + m_drawlist.clear(); size_t i = 0; while (i < count) { // In order to speed up comparison a bounding-box is accumulated. 
It removes a @@ -544,6 +548,7 @@ GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap() } j += 2; } + m_drawlist.push_back((j - i) >> 1); // Sprite count i = j; } @@ -615,6 +620,13 @@ void GSRendererOGL::SendDraw(bool require_barrier) ASSERT(GLLoader::found_GL_ARB_texture_barrier); glTextureBarrier(); dev->DrawIndexedPrimitive(); + } else if (m_vt.m_primclass == GS_SPRITE_CLASS) { + size_t nb_vertex = (GLLoader::found_geometry_shader) ? 2 : 6; + for (size_t count, p = 0, n = 0; n < m_drawlist.size(); p += count, ++n) { + count = m_drawlist[n] * nb_vertex; + glTextureBarrier(); + dev->DrawIndexedPrimitive(p, count); + } } else { // FIXME: Investigate: a dynamic check to pack as many primitives as possibles // I'm nearly sure GSdx already have this kind of code (maybe we can adapt GSDirtyRect) @@ -622,7 +634,6 @@ void GSRendererOGL::SendDraw(bool require_barrier) switch (m_vt.m_primclass) { case GS_TRIANGLE_CLASS: nb_vertex = 3; break; case GS_POINT_CLASS: nb_vertex = 1; break; - case GS_SPRITE_CLASS: nb_vertex = (GLLoader::found_geometry_shader) ? 
2 : 6; break; default: nb_vertex = 2; break; } diff --git a/plugins/GSdx/GSRendererOGL.h b/plugins/GSdx/GSRendererOGL.h index 2f06c18108..7516ab90c6 100644 --- a/plugins/GSdx/GSRendererOGL.h +++ b/plugins/GSdx/GSRendererOGL.h @@ -49,6 +49,7 @@ class GSRendererOGL : public GSRendererHW int m_sw_blending; PRIM_OVERLAP m_prim_overlap; bool m_unsafe_fbmask; + vector m_drawlist; unsigned int UserHacks_TCOffset; float UserHacks_TCO_x, UserHacks_TCO_y; From 2fc3ef812458fb0b7f683becffa35074deef6089 Mon Sep 17 00:00:00 2001 From: Jonathan Li Date: Mon, 11 Jan 2016 23:52:45 +0000 Subject: [PATCH 3/5] gsdx-ogl: Add sprite overlap statistics to debug messages --- plugins/GSdx/GSRendererOGL.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index 35415b05f4..a647d2b449 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -622,11 +622,30 @@ void GSRendererOGL::SendDraw(bool require_barrier) dev->DrawIndexedPrimitive(); } else if (m_vt.m_primclass == GS_SPRITE_CLASS) { size_t nb_vertex = (GLLoader::found_geometry_shader) ? 2 : 6; + + GL_PUSH("Split the draw (SPRITE)"); + +#if defined(_DEBUG) + // Check how draw call is split. 
+ map frequency; + for (const auto& it: m_drawlist) + ++frequency[it]; + + string message; + for (const auto& it: frequency) + message += " " + to_string(it.first) + "(" + to_string(it.second) + ")"; + + GL_PERF("Split single draw (%d sprites) into %zu draws: consecutive draws(frequency):%s", + m_index.tail / nb_vertex, m_drawlist.size(), message.c_str()); +#endif + for (size_t count, p = 0, n = 0; n < m_drawlist.size(); p += count, ++n) { count = m_drawlist[n] * nb_vertex; glTextureBarrier(); dev->DrawIndexedPrimitive(p, count); } + + GL_POP(); } else { // FIXME: Investigate: a dynamic check to pack as many primitives as possibles // I'm nearly sure GSdx already have this kind of code (maybe we can adapt GSDirtyRect) From 14dffa762b0f394390aa4dd6fda9b05d02821568 Mon Sep 17 00:00:00 2001 From: Jonathan Li Date: Thu, 14 Jan 2016 23:20:48 +0000 Subject: [PATCH 4/5] gsdx: Add runion_ordered to GSVector Saves a few instruction cycles when xy and zw are already sorted (min and max rectangle coords). 
--- plugins/GSdx/GSVector.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 835eb0dcdc..cacef41350 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -239,15 +239,7 @@ public: if(i == 0xffff) { - #if _M_SSE >= 0x401 - - return min_i32(a).upl64(max_i32(a).srl<8>()); - - #else - - return GSVector4i(min(x, a.x), min(y, a.y), max(z, a.z), max(w, a.w)); - - #endif + return runion_ordered(a); } if((i & 0x00ff) == 0x00ff) @@ -263,6 +255,19 @@ public: return GSVector4i::zero(); } + __forceinline GSVector4i runion_ordered(const GSVector4i& a) const + { + #if _M_SSE >= 0x401 + + return min_i32(a).upl64(max_i32(a).srl<8>()); + + #else + + return GSVector4i(min(x, a.x), min(y, a.y), max(z, a.z), max(w, a.w)); + + #endif + } + __forceinline GSVector4i rintersect(const GSVector4i& a) const { return sat_i32(a); From 37deeb0d52879cd08e990db3c9eb653b3428fb36 Mon Sep 17 00:00:00 2001 From: Jonathan Li Date: Wed, 13 Jan 2016 21:43:57 +0000 Subject: [PATCH 5/5] gsdx-ogl: Optimise overlap detection algorithm Vectorise coordinate min/max sorting, and use the ordered runion instead. --- plugins/GSdx/GSRendererOGL.cpp | 43 ++++++++-------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/plugins/GSdx/GSRendererOGL.cpp b/plugins/GSdx/GSRendererOGL.cpp index a647d2b449..494ed32318 100644 --- a/plugins/GSdx/GSRendererOGL.cpp +++ b/plugins/GSdx/GSRendererOGL.cpp @@ -497,41 +497,18 @@ GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap() // In order to speed up comparison a bounding-box is accumulated. It removes a // loop so code is much faster (check game virtua fighter). Besides it allow to check // properly the Y order. 
- GSVector4i all; - //FIXME better vector operation - if (v[i+1].XYZ.Y < v[i+0].XYZ.Y) { - all.y = v[i+1].XYZ.Y; - all.w = v[i+0].XYZ.Y; - } else { - all.y = v[i+0].XYZ.Y; - all.w = v[i+1].XYZ.Y; - } - if (v[i+1].XYZ.X < v[i+0].XYZ.X) { - all.x = v[i+1].XYZ.X; - all.z = v[i+0].XYZ.X; - } else { - all.x = v[i+0].XYZ.X; - all.z = v[i+1].XYZ.X; - } + + // .x = min(v[i].XYZ.X, v[i+1].XYZ.X) + // .y = min(v[i].XYZ.Y, v[i+1].XYZ.Y) + // .z = max(v[i].XYZ.X, v[i+1].XYZ.X) + // .w = max(v[i].XYZ.Y, v[i+1].XYZ.Y) + GSVector4i all = GSVector4i(v[i].m[1]).upl16(GSVector4i(v[i+1].m[1])).upl16().xzyw(); + all = all.xyxy().blend(all.zwzw(), all > all.zwxy()); size_t j = i + 2; while (j < count) { - GSVector4i sprite; - //FIXME better vector operation - if (v[j+1].XYZ.Y < v[j+0].XYZ.Y) { - sprite.y = v[j+1].XYZ.Y; - sprite.w = v[j+0].XYZ.Y; - } else { - sprite.y = v[j+0].XYZ.Y; - sprite.w = v[j+1].XYZ.Y; - } - if (v[j+1].XYZ.X < v[j+0].XYZ.X) { - sprite.x = v[j+1].XYZ.X; - sprite.z = v[j+0].XYZ.X; - } else { - sprite.x = v[j+0].XYZ.X; - sprite.z = v[j+1].XYZ.X; - } + GSVector4i sprite = GSVector4i(v[j].m[1]).upl16(GSVector4i(v[j+1].m[1])).upl16().xzyw(); + sprite = sprite.xyxy().blend(sprite.zwzw(), sprite > sprite.zwxy()); // Be sure to get vertex in good order, otherwise .r* function doesn't // work as expected. @@ -541,7 +518,7 @@ GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap() ASSERT(all.y <= all.w); if (all.rintersect(sprite).rempty()) { - all = all.runion(sprite); + all = all.runion_ordered(sprite); } else { overlap = PRIM_OVERLAP_YES; break;