gsdx sw: do const q division in ConvertVertexBuffer

It allows the division to be done before the size multiplication.
It avoids a float overflow if T is too big.
Old behavior: (T * size) / Q
New behavior: (T / Q) * size

Performance Note:
* Rcp was replaced by a slow division (more accurate)
* At least we avoid a 2nd loop on the vertex buffer

It improves rendering in Pro Soccer Club and Galerians Ash.

Tricky Note:
SPRITE must be handled differently because the 'q' of the first vertex
could be invalid.
This commit is contained in:
Gregory Hainaut 2017-03-03 17:01:13 +01:00
parent 611239db5c
commit 6d6ed1a205
2 changed files with 39 additions and 51 deletions

View File

@ -60,11 +60,15 @@ GSRendererSW::GSRendererSW(int threads)
m_tex_pages[i] = 0;
}
// Fills the m_cvb dispatch table for primitive class P and q_div value Q:
// one ConvertVertexBuffer specialization per (tme, fst) combination, stored
// at m_cvb[P][tme][fst][Q] so the index order matches the template arguments.
#define InitCVB2(P, Q) \
m_cvb[P][0][0][Q] = &GSRendererSW::ConvertVertexBuffer<P, 0, 0, Q>; \
m_cvb[P][0][1][Q] = &GSRendererSW::ConvertVertexBuffer<P, 0, 1, Q>; \
m_cvb[P][1][0][Q] = &GSRendererSW::ConvertVertexBuffer<P, 1, 0, Q>; \
m_cvb[P][1][1][Q] = &GSRendererSW::ConvertVertexBuffer<P, 1, 1, Q>;
#define InitCVB(P) \
m_cvb[P][0][0] = &GSRendererSW::ConvertVertexBuffer<P, 0, 0>; \
m_cvb[P][0][1] = &GSRendererSW::ConvertVertexBuffer<P, 0, 1>; \
m_cvb[P][1][0] = &GSRendererSW::ConvertVertexBuffer<P, 1, 0>; \
m_cvb[P][1][1] = &GSRendererSW::ConvertVertexBuffer<P, 1, 1>; \
InitCVB2(P, 0) \
InitCVB2(P, 1)
InitCVB(GS_POINT_CLASS);
InitCVB(GS_LINE_CLASS);
@ -207,9 +211,11 @@ GSTexture* GSRendererSW::GetFeedbackOutput()
}
template<uint32 primclass, uint32 tme, uint32 fst>
template<uint32 primclass, uint32 tme, uint32 fst, uint32 q_div>
void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
{
// FIXME q_div wasn't added to AVX2 code path.
#if 0//_M_SSE >= 0x501
// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
@ -313,13 +319,28 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
#if _M_SSE >= 0x401
t = GSVector4(xyzuvf.uph16() << (16 - 4));
#else
t = GSVector4(GSVector4i::load(src->UV).upl16() << (16 - 4));
#endif
}
else if(q_div)
{
// Division is required if number are huge (Pro Soccer Club)
if(primclass == GS_SPRITE_CLASS && (i & 1) == 0)
{
// q(n) isn't valid, you need to take q(n+1)
const GSVertex* next = src + 1;
GSVector4 stcq1 = GSVector4::load<true>(&next->m[0]); // s t rgba q
t = (stcq / stcq1.wwww()) * tsize;
}
else
{
t = (stcq / stcq.wwww()) * tsize;
}
}
else
{
t = stcq.xyww() * tsize;
@ -366,7 +387,12 @@ void GSRendererSW::Draw()
sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1));
sd->index_count = m_index.tail;
(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next);
// skip per pixel division if q is constant.
// Optimize the division by 1 with a nop. It also means that GS_SPRITE_CLASS must be processed when !m_vt.m_eq.q.
// If you have both GS_SPRITE_CLASS && m_vt.m_eq.q, it will depends on the first part of the 'OR'
uint32 q_div = !IsMipMapActive() && ((m_vt.m_eq.q && m_vt.m_min.t.z != 1.0f) || (!m_vt.m_eq.q && m_vt.m_primclass == GS_SPRITE_CLASS));
(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST][q_div])(sd->vertex, m_vertex.buff, m_vertex.next);
memcpy(sd->index, m_index.buff, sizeof(uint32) * m_index.tail);
@ -1153,48 +1179,10 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
}
else
{
if(gd.sel.fst == 0)
{
// skip per pixel division if q is constant
GSVertexSW* RESTRICT v = data->vertex;
if(m_vt.m_eq.q)
{
gd.sel.fst = 1;
// Q is flat on sprite (at least GSdx is based on this Hypothesis). So it means
// that Q0 could be invalid for example ATV Quad Power Racing 2.
const GSVector4& t = v[data->index[primclass == GS_SPRITE_CLASS ? 1 : 0]].t;
if(t.z != 1.0f)
{
GSVector4 w = t.zzzz().rcpnr();
for(int i = 0, j = data->vertex_count; i < j; i++)
{
GSVector4 t = v[i].t;
v[i].t = (t * w).xyzw(t);
}
}
}
else if(primclass == GS_SPRITE_CLASS)
{
gd.sel.fst = 1;
for(int i = 0, j = data->vertex_count; i < j; i += 2)
{
GSVector4 t0 = v[i + 0].t;
GSVector4 t1 = v[i + 1].t;
GSVector4 w = t1.zzzz().rcpnr();
v[i + 0].t = (t0 * w).xyzw(t0);
v[i + 1].t = (t1 * w).xyzw(t1);
}
}
}
// skip per pixel division if q is constant. Sprite uses flat
// q, so it's always constant by primitive.
// Note: the 'q' division was done in GSRendererSW::ConvertVertexBuffer
gd.sel.fst |= (m_vt.m_eq.q || primclass == GS_SPRITE_CLASS);
if(gd.sel.ltf && gd.sel.fst)
{

View File

@ -63,9 +63,9 @@ class GSRendererSW : public GSRenderer
typedef void (GSRendererSW::*ConvertVertexBufferPtr)(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
ConvertVertexBufferPtr m_cvb[4][2][2];
ConvertVertexBufferPtr m_cvb[4][2][2][2];
template<uint32 primclass, uint32 tme, uint32 fst>
template<uint32 primclass, uint32 tme, uint32 fst, uint32 q_div>
void ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
protected: