Merge pull request #749 from PCSX2/gsdx-micro-optimization

Gsdx micro optimization
This commit is contained in:
Gregory Hainaut 2015-08-12 08:54:36 +02:00
commit 6046d6c417
10 changed files with 97 additions and 165 deletions

View File

@ -15,6 +15,8 @@ endif()
set(CommonFlags
# GCC-4.6 crashes pcsx2 during the binding of plugins at startup...
# Disable this optimization for the moment
# GCC-4.9 update:
# Crashes when you start a game. Likely a stack corruption/alignment issue
-fno-omit-frame-pointer
# END GCC-4.6
-fno-strict-aliasing

View File

@ -48,10 +48,6 @@ endif()
#Clang doesn't support a few common flags that GCC does.
if(NOT USE_CLANG)
set(GSdxFinalFlags ${GSdxFinalFlags} -fabi-version=6)
if (_M_X86_32 AND NOT USE_ASAN)
# Someone needs to seriously test the build of GSdx without this option
set(GSdxFinalFlags ${GSdxFinalFlags} -mpreferred-stack-boundary=2)
endif()
endif()
set(GSdxSources

View File

@ -28,13 +28,12 @@ namespace GLState {
GSVector4i scissor;
bool blend;
GLenum eq_RGB;
GLenum f_sRGB;
GLenum f_dRGB;
uint16 eq_RGB;
uint16 f_sRGB;
uint16 f_dRGB;
uint8 bf;
uint32 wrgba;
float bf;
bool depth;
GLenum depth_func;
bool depth_mask;
@ -58,14 +57,6 @@ namespace GLState {
GLuint vs;
GLuint program;
bool dirty_prog;
#if 0
struct {
GSVertexBufferStateOGL* vb;
GSDepthStencilOGL* dss;
GSBlendStateOGL* bs;
float bf; // blend factor
} m_state;
#endif
void Clear() {
fbo = 0;
@ -76,8 +67,8 @@ namespace GLState {
eq_RGB = 0;
f_sRGB = 0;
f_dRGB = 0;
bf = 0;
wrgba = 0xF;
bf = 0.0;
depth = false;
depth_func = 0;

View File

@ -30,11 +30,11 @@ namespace GLState {
extern GSVector4i scissor;
extern bool blend;
extern GLenum eq_RGB;
extern GLenum f_sRGB;
extern GLenum f_dRGB;
extern uint16 eq_RGB;
extern uint16 f_sRGB;
extern uint16 f_dRGB;
extern uint8 bf;
extern uint32 wrgba;
extern float bf;
extern bool depth;
extern GLenum depth_func;

View File

@ -353,7 +353,6 @@ bool GSDeviceOGL::Create(GSWnd* wnd)
ASSERT(sizeof(PSSamplerSelector) == 4);
ASSERT(sizeof(OMDepthStencilSelector) == 4);
ASSERT(sizeof(OMColorMaskSelector) == 4);
ASSERT(sizeof(OMBlendSelector) == 4);
return true;
}
@ -673,6 +672,7 @@ GLuint GSDeviceOGL::CompilePS(PSSelector sel)
+ format("#define PS_WRITE_RG %d\n", sel.write_rg)
+ format("#define PS_FBMASK %d\n", sel.fbmask)
+ format("#define PS_HDR %d\n", sel.hdr)
+ format("#define PS_PABE %d\n", sel.pabe);
;
return m_shader->Compile("tfx.glsl", "ps_main", GL_FRAGMENT_SHADER, tfx_fs_all_glsl, macro);
@ -1342,7 +1342,7 @@ void GSDeviceOGL::OMSetColorMaskState(OMColorMaskSelector sel)
}
}
void GSDeviceOGL::OMSetBlendState(int blend_index, float blend_factor, bool is_blend_constant)
void GSDeviceOGL::OMSetBlendState(uint8 blend_index, uint8 blend_factor, bool is_blend_constant)
{
if (blend_index) {
if (!GLState::blend) {
@ -1352,28 +1352,27 @@ void GSDeviceOGL::OMSetBlendState(int blend_index, float blend_factor, bool is_b
if (is_blend_constant && GLState::bf != blend_factor) {
GLState::bf = blend_factor;
gl_BlendColor(blend_factor, blend_factor, blend_factor, 0);
float bf = (float)blend_factor / 128.0f;
gl_BlendColor(bf, bf, bf, bf);
}
// FIXME test to use uint16 (cache friendly)
const GLenum& op = m_blendMapD3D9[blend_index].op;
if (GLState::eq_RGB != op) {
GLState::eq_RGB = op;
const OGLBlend& b = m_blendMapOGL[blend_index];
if (GLState::eq_RGB != b.op) {
GLState::eq_RGB = b.op;
if (gl_BlendEquationSeparateiARB)
gl_BlendEquationSeparateiARB(0, op, GL_FUNC_ADD);
gl_BlendEquationSeparateiARB(0, b.op, GL_FUNC_ADD);
else
gl_BlendEquationSeparate(op, GL_FUNC_ADD);
gl_BlendEquationSeparate(b.op, GL_FUNC_ADD);
}
const GLenum& src = m_blendMapD3D9[blend_index].src;
const GLenum& dst = m_blendMapD3D9[blend_index].dst;
if (GLState::f_sRGB != src || GLState::f_dRGB != dst) {
GLState::f_sRGB = src;
GLState::f_dRGB = dst;
if (GLState::f_sRGB != b.src || GLState::f_dRGB != b.dst) {
GLState::f_sRGB = b.src;
GLState::f_dRGB = b.dst;
if (gl_BlendFuncSeparateiARB)
gl_BlendFuncSeparateiARB(0, src, dst, GL_ONE, GL_ZERO);
gl_BlendFuncSeparateiARB(0, b.src, b.dst, GL_ONE, GL_ZERO);
else
gl_BlendFuncSeparate(src, dst, GL_ONE, GL_ZERO);
gl_BlendFuncSeparate(b.src, b.dst, GL_ONE, GL_ZERO);
}
} else {
@ -1541,7 +1540,7 @@ void GSDeviceOGL::DebugOutputToFile(GLenum gl_source, GLenum gl_type, GLuint id,
const int GSDeviceOGL::m_NO_BLEND = 0;
const int GSDeviceOGL::m_MERGE_BLEND = 3*3*3*3;
const GSDeviceOGL::D3D9Blend GSDeviceOGL::m_blendMapD3D9[3*3*3*3 + 1] =
const GSDeviceOGL::OGLBlend GSDeviceOGL::m_blendMapOGL[3*3*3*3 + 1] =
{
{ BLEND_NO_BAR , D3DBLENDOP_ADD , D3DBLEND_ONE , D3DBLEND_ZERO} , // 0000: (Cs - Cs)*As + Cs ==> Cs
{ 0 , D3DBLENDOP_ADD , D3DBLEND_ZERO , D3DBLEND_ONE} , // 0001: (Cs - Cs)*As + Cd ==> Cd

View File

@ -285,6 +285,7 @@ class GSDeviceOGL : public GSDevice
uint32 blend_c:2;
uint32 blend_d:2;
uint32 clr1:1; // useful?
uint32 pabe:1;
uint32 hdr:1;
uint32 colclip:1;
@ -292,7 +293,7 @@ class GSDeviceOGL : public GSDevice
uint32 tcoffsethack:1;
//uint32 point_sampler:1; Not tested, so keep the bit for blend
uint32 _free2:20;
uint32 _free2:19;
};
uint64 key;
@ -378,44 +379,8 @@ class GSDeviceOGL : public GSDevice
OMColorMaskSelector(uint32 c) { wrgba = c; }
};
struct OMBlendSelector
{
union
{
struct
{
uint32 abe:1;
uint32 a:2;
uint32 b:2;
uint32 c:2;
uint32 d:2;
uint32 _free:23;
};
struct
{
uint32 _abe:1;
uint32 abcd:8;
uint32 _free2:23;
};
uint32 key;
};
operator uint32() {return key;}
OMBlendSelector() : key(0) {}
bool IsCLR1() const
{
return (key & 0x19f) == 0x93; // abe == 1 && a == 1 && b == 2 && d == 1
}
};
struct D3D9Blend {int bogus, op, src, dst;};
static const D3D9Blend m_blendMapD3D9[3*3*3*3 + 1];
struct OGLBlend {uint16 bogus, op, src, dst;};
static const OGLBlend m_blendMapOGL[3*3*3*3 + 1];
static const int m_NO_BLEND;
static const int m_MERGE_BLEND;
@ -562,7 +527,7 @@ class GSDeviceOGL : public GSDevice
void PSSetSamplerState(GLuint ss);
void OMSetDepthStencilState(GSDepthStencilOGL* dss);
void OMSetBlendState(int blend_index = 0, float blend_factor = 0.0f, bool is_blend_constant = false);
void OMSetBlendState(uint8 blend_index = 0, uint8 blend_factor = 0, bool is_blend_constant = false);
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
void OMSetWriteBuffer(GLenum buffer = GL_COLOR_ATTACHMENT0);
void OMSetColorMaskState(OMColorMaskSelector sel = OMColorMaskSelector());

View File

@ -152,7 +152,7 @@ void GSRendererOGL::SetupIA()
dev->IASetPrimitiveTopology(t);
}
bool GSRendererOGL::EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::OMColorMaskSelector& om_csel, GSDeviceOGL::PSConstantBuffer& ps_cb)
bool GSRendererOGL::EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::OMColorMaskSelector& om_csel)
{
bool require_barrier = false;
@ -301,93 +301,77 @@ bool GSRendererOGL::EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_s
return require_barrier;
}
bool GSRendererOGL::EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::PSConstantBuffer& ps_cb, bool DATE_GL42)
bool GSRendererOGL::EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, bool DATE_GL42)
{
GSDeviceOGL* dev = (GSDeviceOGL*)m_dev;
const GIFRegALPHA& ALPHA = m_context->ALPHA;
bool require_barrier = false;
GSDeviceOGL* dev = (GSDeviceOGL*)m_dev;
float afix = (float)m_context->ALPHA.FIX / 0x80;
GSDeviceOGL::OMBlendSelector om_bsel;
bool sw_blending = false;
om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS;
om_bsel.a = ALPHA.A;
om_bsel.b = ALPHA.B;
om_bsel.c = ALPHA.C;
om_bsel.d = ALPHA.D;
// No blending so early exit
if (!(PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS)) {
#ifdef ENABLE_OGL_DEBUG
if (m_env.PABE.PABE) {
GL_INS("!!! ENV PABE without ABE !!!");
}
#endif
dev->OMSetBlendState();
return false;
}
if (m_env.PABE.PABE)
{
GL_INS("!!! ENV PABE not supported !!!");
// FIXME it could be supported with SW blending!
if (om_bsel.a == 0 && om_bsel.b == 1 && om_bsel.c == 0 && om_bsel.d == 1)
{
// this works because with PABE alpha blending is on when alpha >= 0x80, but since the pixel shader
// cannot output anything over 0x80 (== 1.0) blending with 0x80 or turning it off gives the same result
om_bsel.abe = 0;
if (m_sw_blending >= ACC_BLEND_CCLIP_DALPHA) {
ps_sel.pabe = 1;
require_barrier |= (ALPHA.C == 1);
sw_blending = true;
}
else
{
//Breath of Fire Dragon Quarter triggers this in battles. Graphics are fine though.
//ASSERT(0);
}
}
// No blending so early exit
if (!om_bsel.abe) {
dev->OMSetBlendState();
return require_barrier;
}
// Compute the blending equation to detect special case
int blend_sel = ((om_bsel.a * 3 + om_bsel.b) * 3 + om_bsel.c) * 3 + om_bsel.d;
int blend_flag = GSDeviceOGL::m_blendMapD3D9[blend_sel].bogus;
uint8 blend_index = ((ALPHA.A * 3 + ALPHA.B) * 3 + ALPHA.C) * 3 + ALPHA.D;
int blend_flag = GSDeviceOGL::m_blendMapOGL[blend_index].bogus;
// SW Blend is (nearly) free. Let's use it.
bool free_blend = (blend_flag & BLEND_NO_BAR) || (m_prim_overlap == PRIM_OVERLAP_NO);
// We really need SW blending for this one, barely used
bool impossible_blend = (blend_flag & BLEND_A_MAX);
bool impossible_or_free_blend = (blend_flag & (BLEND_NO_BAR|BLEND_A_MAX|BLEND_ACCU))
|| (m_prim_overlap == PRIM_OVERLAP_NO);
// Do the multiplication in shader for blending accumulation: Cs*As + Cd or Cs*Af + Cd
bool accumulation_blend = (blend_flag & BLEND_ACCU);
bool sw_blending_base = m_sw_blending && (free_blend || impossible_blend);
// Warning no break on purpose
switch (m_sw_blending) {
case ACC_BLEND_ULTRA: sw_blending |= true;
case ACC_BLEND_FULL: if (!m_vt.m_alpha.valid && (ALPHA.C == 0)) GetAlphaMinMax();
sw_blending |= (ALPHA.A != ALPHA.B) &&
((ALPHA.C == 0 && m_vt.m_alpha.max > 128u) || (ALPHA.C == 2 && ALPHA.FIX > 128u));
case ACC_BLEND_CCLIP_DALPHA: sw_blending |= (ALPHA.C == 1) || (m_env.COLCLAMP.CLAMP == 0);
case ACC_BLEND_SPRITE: sw_blending |= m_vt.m_primclass == GS_SPRITE_CLASS;
case ACC_BLEND_FREE: sw_blending |= ps_sel.fbmask || impossible_or_free_blend;
default: sw_blending |= accumulation_blend;
}
// SW Blending
// GL42 interacts very badly with sw blending. GL42 uses the primitiveID to find the primitive
// that writes the bad alpha value. Sw blending will force the draw to run primitive by primitive
// (therefore primitiveID will always be 1)
sw_blending &= !DATE_GL42;
// Color clip
if (m_env.COLCLAMP.CLAMP == 0) {
if (accumulation_blend) {
ps_sel.hdr = 1;
GL_INS("COLCLIP Fast HDR mode ENABLED");
} else if (m_sw_blending >= ACC_BLEND_CCLIP_DALPHA || sw_blending_base) {
} else if (sw_blending) {
ps_sel.colclip = 1;
sw_blending_base = true;
GL_INS("COLCLIP SW ENABLED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D);
} else {
GL_INS("Sorry colclip isn't supported");
}
}
// Note: Option is duplicated, one impact the blend unit / the other the shader.
sw_blending_base |= accumulation_blend;
// Warning no break on purpose
bool sw_blending_adv = false;
switch (m_sw_blending) {
case ACC_BLEND_ULTRA: sw_blending_adv |= true;
case ACC_BLEND_FULL: sw_blending_adv |= !( (ALPHA.A == ALPHA.B) || (ALPHA.C == 2 && afix <= 1.002f) );
case ACC_BLEND_CCLIP_DALPHA: sw_blending_adv |= (ALPHA.C == 1);
case ACC_BLEND_SPRITE: sw_blending_adv |= m_vt.m_primclass == GS_SPRITE_CLASS;
default: break;
}
bool sw_blending = sw_blending_base // Free case or Impossible blend
|| sw_blending_adv // complex blending case (for special effect)
|| ps_sel.fbmask; // accurate fbmask
// SW Blending
// GL42 interacts very badly with sw blending. GL42 uses the primitiveID to find the primitive
// that writes the bad alpha value. Sw blending will force the draw to run primitive by primitive
// (therefore primitiveID will always be 1)
sw_blending &= !DATE_GL42;
// Seriously, don't expect me to support this kind of craziness.
// No mix of COLCLIP + accumulation_blend + DATE GL42
// Nor fbmask with GL42
@ -397,18 +381,17 @@ bool GSRendererOGL::EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL
// For stat to optimize accurate option
#if 0
GL_INS("BLEND_INFO: %d/%d/%d/%d. Clamp:%d. Prim:%d number %d (sw %d)",
om_bsel.a, om_bsel.b, om_bsel.c, om_bsel.d, m_env.COLCLAMP.CLAMP, m_vt.m_primclass, m_vertex.next, sw_blending);
ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D, m_env.COLCLAMP.CLAMP, m_vt.m_primclass, m_vertex.next, sw_blending);
#endif
if (sw_blending) {
ps_sel.blend_a = om_bsel.a;
ps_sel.blend_b = om_bsel.b;
ps_sel.blend_c = om_bsel.c;
ps_sel.blend_d = om_bsel.d;
ps_sel.blend_a = ALPHA.A;
ps_sel.blend_b = ALPHA.B;
ps_sel.blend_c = ALPHA.C;
ps_sel.blend_d = ALPHA.D;
if (accumulation_blend) {
// Keep HW blending to do the addition
dev->OMSetBlendState(blend_sel);
om_bsel.abe = 1;
dev->OMSetBlendState(blend_index);
// Remove the addition from the SW blending
ps_sel.blend_d = 2;
} else {
@ -418,19 +401,19 @@ bool GSRendererOGL::EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL
// Requires the fixed alpha value (ALPHA.FIX)
if (ALPHA.C == 2) {
ps_cb.AlphaCoeff.a = afix;
ps_cb.AlphaCoeff.a = (float)ALPHA.FIX / 128.0f;
}
// No need to flush for every primitive
require_barrier |= !(blend_flag & BLEND_NO_BAR) && !accumulation_blend;
} else {
ps_sel.clr1 = om_bsel.IsCLR1();
ps_sel.clr1 = (blend_flag & BLEND_C_CLR);
if (ps_sel.dfmt == 1 && ALPHA.C == 1) {
// 24 bits doesn't have an alpha channel so use 1.0f fix factor as equivalent
int hacked_blend_sel = blend_sel + 3; // +3 <=> +1 on C
dev->OMSetBlendState(hacked_blend_sel, 1.0f, true);
int hacked_blend_index = blend_index + 3; // +3 <=> +1 on C
dev->OMSetBlendState(hacked_blend_index, 128, true);
} else {
dev->OMSetBlendState(blend_sel, afix, (ALPHA.C == 2));
dev->OMSetBlendState(blend_index, ALPHA.FIX, (ALPHA.C == 2));
}
}
@ -577,16 +560,10 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
GSDeviceOGL* dev = (GSDeviceOGL*)m_dev;
dev->s_n = s_n;
// FIXME: optimization, latch ps_cb & vs_cb in the object
// 1/ Avoid a reset every draw
// 2/ potentially less update
GSDeviceOGL::VSSelector vs_sel;
GSDeviceOGL::VSConstantBuffer vs_cb;
GSDeviceOGL::GSSelector gs_sel;
GSDeviceOGL::PSSelector ps_sel;
GSDeviceOGL::PSConstantBuffer ps_cb;
GSDeviceOGL::PSSamplerSelector ps_ssel;
GSDeviceOGL::OMColorMaskSelector om_csel;
@ -604,7 +581,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
m_prim_overlap = PRIM_OVERLAP_UNKNOW;
}
require_barrier |= EmulateTextureShuffleAndFbmask(ps_sel, om_csel, ps_cb);
require_barrier |= EmulateTextureShuffleAndFbmask(ps_sel, om_csel);
// DATE: selection of the algorithm. Must be done before blending because GL42 is not compatible with blending
@ -632,7 +609,7 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
// Blend
if (!IsOpaque() && rt) {
require_barrier |= EmulateBlending(ps_sel, ps_cb, DATE_GL42);
require_barrier |= EmulateBlending(ps_sel, DATE_GL42);
} else {
dev->OMSetBlendState(); // No blending please
}

View File

@ -48,19 +48,21 @@ class GSRendererOGL : public GSRendererHW
GSVector2 m_pixelcenter;
bool m_accurate_date;
int m_sw_blending;
PRIM_OVERLAP m_prim_overlap;
unsigned int UserHacks_TCOffset;
float UserHacks_TCO_x, UserHacks_TCO_y;
PRIM_OVERLAP m_prim_overlap;
GSDeviceOGL::VSConstantBuffer vs_cb;
GSDeviceOGL::PSConstantBuffer ps_cb;
GSVector4i ComputeBoundingBox(const GSVector2& rtscale, const GSVector2i& rtsize);
protected:
void EmulateGS();
void SetupIA();
bool EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::OMColorMaskSelector& om_csel, GSDeviceOGL::PSConstantBuffer& ps_cb);
bool EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::PSConstantBuffer& ps_cb, bool DATE_GL42);
bool EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::OMColorMaskSelector& om_csel);
bool EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, bool DATE_GL42);
public:
GSRendererOGL();

View File

@ -442,7 +442,7 @@ void ps_blend(inout vec4 Color, float As)
// FIXME dithering
// Correct the Color value based on the output format
#if PS_COLCLIP == 0
#if PS_COLCLIP == 0 && PS_HDR == 0
// Standard Clamp
Color.rgb = clamp(Color.rgb, vec3(0.0f), vec3(255.0f));
#endif
@ -457,7 +457,7 @@ void ps_blend(inout vec4 Color, float As)
// In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania
Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xF8));
#elif PS_COLCLIP == 1
#elif PS_COLCLIP == 1 && PS_HDR == 0
Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xFF));
#endif

View File

@ -1301,7 +1301,7 @@ static const char* tfx_fs_all_glsl =
" // FIXME dithering\n"
"\n"
" // Correct the Color value based on the output format\n"
"#if PS_COLCLIP == 0\n"
"#if PS_COLCLIP == 0 && PS_HDR == 0\n"
" // Standard Clamp\n"
" Color.rgb = clamp(Color.rgb, vec3(0.0f), vec3(255.0f));\n"
"#endif\n"
@ -1316,7 +1316,7 @@ static const char* tfx_fs_all_glsl =
" // In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania\n"
"\n"
" Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xF8));\n"
"#elif PS_COLCLIP == 1\n"
"#elif PS_COLCLIP == 1 && PS_HDR == 0\n"
" Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xFF));\n"
"#endif\n"
"\n"