gsdx sw: do const q division in ConvertVertexBuffer

This allows the division to be done before the size multiplication, which avoids a float overflow when T is too big. Old behavior: (T * size) / Q. New behavior: (T / Q) * size. Performance notes: * Rcp was replaced by a slower (but more accurate) division. * At least we avoid a second loop over the vertex buffer. It helps rendering in Pro Soccer Club and Galerians Ash. Note: SPRITE must be handled differently because the 'q' of the first vertex could be invalid.
2017-03-03 17:01:13 +01:00 · 2017-03-03 17:01:13 +01:00 · 6d6ed1a205
parent 611239db5c
commit 6d6ed1a205
2 changed files with 39 additions and 51 deletions
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@ -60,11 +60,15 @@ GSRendererSW::GSRendererSW(int threads)
 		m_tex_pages[i] = 0;
 	}

+	#define InitCVB2(P, Q) \
+		m_cvb[P][0][0][Q] = &GSRendererSW::ConvertVertexBuffer<P, 0, 0, Q>; \
+		m_cvb[P][0][1][Q] = &GSRendererSW::ConvertVertexBuffer<P, 0, 1, Q>; \
+		m_cvb[P][1][0][Q] = &GSRendererSW::ConvertVertexBuffer<P, 1, 0, Q>; \
+		m_cvb[P][1][1][Q] = &GSRendererSW::ConvertVertexBuffer<P, 1, 1, Q>;
+
 	#define InitCVB(P) \
-		m_cvb[P][0][0] = &GSRendererSW::ConvertVertexBuffer<P, 0, 0>; \
-		m_cvb[P][0][1] = &GSRendererSW::ConvertVertexBuffer<P, 0, 1>; \
-		m_cvb[P][1][0] = &GSRendererSW::ConvertVertexBuffer<P, 1, 0>; \
-		m_cvb[P][1][1] = &GSRendererSW::ConvertVertexBuffer<P, 1, 1>; \
+		InitCVB2(P, 0) \
+		InitCVB2(P, 1)

 	InitCVB(GS_POINT_CLASS);
 	InitCVB(GS_LINE_CLASS);
@ -207,9 +211,11 @@ GSTexture* GSRendererSW::GetFeedbackOutput()
 }


-template<uint32 primclass, uint32 tme, uint32 fst>
+template<uint32 primclass, uint32 tme, uint32 fst, uint32 q_div>
 void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
 {
+	// FIXME q_div wasn't added to AVX2 code path.
+
 	#if 0//_M_SSE >= 0x501

 	// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
@ -313,13 +319,28 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 				#if _M_SSE >= 0x401

 				t = GSVector4(xyzuvf.uph16() << (16 - 4));
-					
+
 				#else

 				t = GSVector4(GSVector4i::load(src->UV).upl16() << (16 - 4));

 				#endif
 			}
+			else if(q_div)
+			{
+				// Division is required if number are huge (Pro Soccer Club)
+				if(primclass == GS_SPRITE_CLASS && (i & 1) == 0)
+				{
+					// q(n) isn't valid, you need to take q(n+1)
+					const GSVertex* next = src + 1;
+					GSVector4 stcq1 = GSVector4::load<true>(&next->m[0]); // s t rgba q
+					t = (stcq / stcq1.wwww()) * tsize;
+				}
+				else
+				{
+					t = (stcq / stcq.wwww()) * tsize;
+				}
+			}
 			else
 			{
 				t = stcq.xyww() * tsize;
@ -366,7 +387,12 @@ void GSRendererSW::Draw()
 	sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1));
 	sd->index_count = m_index.tail;

-	(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next);
+	// skip per pixel division if q is constant.
+	// Optimize the division by 1 with a nop. It also means that GS_SPRITE_CLASS must be processed when !m_vt.m_eq.q.
+	// If you have both GS_SPRITE_CLASS && m_vt.m_eq.q, it will depends on the first part of the 'OR'
+	uint32 q_div = !IsMipMapActive() && ((m_vt.m_eq.q && m_vt.m_min.t.z != 1.0f) || (!m_vt.m_eq.q && m_vt.m_primclass == GS_SPRITE_CLASS));
+
+	(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST][q_div])(sd->vertex, m_vertex.buff, m_vertex.next);

 	memcpy(sd->index, m_index.buff, sizeof(uint32) * m_index.tail);

@ -1153,48 +1179,10 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 			}
 			else
 			{
-				if(gd.sel.fst == 0)
-				{
-					// skip per pixel division if q is constant
-
-					GSVertexSW* RESTRICT v = data->vertex;
-
-					if(m_vt.m_eq.q)
-					{
-						gd.sel.fst = 1;
-
-						// Q is flat on sprite (at least GSdx is based on this Hypothesis). So it means
-						// that Q0 could be invalid for example ATV Quad Power Racing 2.
-						const GSVector4& t = v[data->index[primclass == GS_SPRITE_CLASS ? 1 : 0]].t;
-
-						if(t.z != 1.0f)
-						{
-							GSVector4 w = t.zzzz().rcpnr();
-
-							for(int i = 0, j = data->vertex_count; i < j; i++)
-							{
-								GSVector4 t = v[i].t;
-
-								v[i].t = (t * w).xyzw(t);
-							}
-						}
-					}
-					else if(primclass == GS_SPRITE_CLASS)
-					{
-						gd.sel.fst = 1;
-
-						for(int i = 0, j = data->vertex_count; i < j; i += 2)
-						{
-							GSVector4 t0 = v[i + 0].t;
-							GSVector4 t1 = v[i + 1].t;
-
-							GSVector4 w = t1.zzzz().rcpnr();
-
-							v[i + 0].t = (t0 * w).xyzw(t0);
-							v[i + 1].t = (t1 * w).xyzw(t1);
-						}
-					}
-				}
+				// skip per pixel division if q is constant. Sprite uses flat
+				// q, so it's always constant by primitive.
+				// Note: the 'q' division was done in GSRendererSW::ConvertVertexBuffer
+				gd.sel.fst |= (m_vt.m_eq.q || primclass == GS_SPRITE_CLASS);

 				if(gd.sel.ltf && gd.sel.fst)
 				{
--- a/plugins/GSdx/GSRendererSW.h
+++ b/plugins/GSdx/GSRendererSW.h
@ -63,9 +63,9 @@ class GSRendererSW : public GSRenderer

 	typedef void (GSRendererSW::*ConvertVertexBufferPtr)(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);

-	ConvertVertexBufferPtr m_cvb[4][2][2];
+	ConvertVertexBufferPtr m_cvb[4][2][2][2];

-	template<uint32 primclass, uint32 tme, uint32 fst>
+	template<uint32 primclass, uint32 tme, uint32 fst, uint32 q_div>
 	void ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);

 protected: