From 3abfa637b4c76d4057528b99688045c9feb7e612 Mon Sep 17 00:00:00 2001
From: rogerman
Date: Mon, 26 Jan 2015 05:22:43 +0000
Subject: [PATCH] Cocoa Port: - Initialize the HQnx LUTs only once, instead of doing it per display window. - Fix issue where the HQnx LUT init code was causing extremely long compile times. (Regression from r5087.)

---
Review notes (free-form text between the three-dash line and the diffstat):
 * LUT storage moves from a local array inside the per-layer init method to
   three file-scope static tables. InitHQnxLUTs() fills them and returns
   early on later calls via a function-local flag. The flag is a plain bool
   and no locking is visible in this patch; confirm that every caller runs
   on the same thread.
 * UploadHQnxLUTs() still generates and uploads the three 3D textures each
   time it is called; only the table generation happens once.
 * Only WRAP_S and WRAP_T are set on the GL_TEXTURE_3D objects in the
   visible hunk; WRAP_R is not touched here.
 * Changed relative to the commit named in the first line: the return
   statement of PackLUTValues() casts p0*31, p1*31 and p2*31 to uint8_t
   explicitly, because the int results would otherwise be narrowed inside
   a braced initializer list.

 desmume/src/cocoa/OGLDisplayOutput.cpp | 275 +++++++++++++------------
 desmume/src/cocoa/OGLDisplayOutput.h   |   2 +-
 2 files changed, 148 insertions(+), 129 deletions(-)

diff --git a/desmume/src/cocoa/OGLDisplayOutput.cpp b/desmume/src/cocoa/OGLDisplayOutput.cpp
index 384f2633d..379392524 100644
--- a/desmume/src/cocoa/OGLDisplayOutput.cpp
+++ b/desmume/src/cocoa/OGLDisplayOutput.cpp
@@ -1465,10 +1465,151 @@ enum OGLVertexAttributeID
 	OGLVertexAttributeID_TexCoord0 = 8
 };
 
+typedef struct
+{
+	uint8_t p0;
+	uint8_t p1;
+	uint8_t p2;
+	uint8_t w0;
+	uint8_t w1;
+	uint8_t w2;
+} LUTValues;
+
+static LUTValues _LQ2xLUT[256*(2*2)*16];
+static LUTValues _HQ2xLUT[256*(2*2)*16];
+static LUTValues _HQ4xLUT[256*(4*4)*16];
+
 static const GLint filterVtxBuffer[8] = {-1, -1, 1, -1, 1, 1, -1, 1};
 static const GLubyte filterElementBuffer[6] = {0, 1, 2, 2, 3, 0};
 static const GLubyte outputElementBuffer[12] = {0, 1, 2, 2, 3, 0, 4, 5, 6, 6, 7, 4};
 
+// Turn off inlining for this function so that we don't get hit with extremely long compile times.
+static NOINLINE LUTValues PackLUTValues(uint8_t p0, uint8_t p1, uint8_t p2, uint8_t w0, uint8_t w1, uint8_t w2)
+{
+	if (w1 == 0 && w2 == 0)
+	{
+		w0 = 255;
+	}
+	else
+	{
+		const uint8_t wR = 256 / (w0 + w1 + w2);
+		w0 *= wR;
+		w1 *= wR;
+		w2 *= wR;
+	}
+
+	return {static_cast<uint8_t>(p0*31), static_cast<uint8_t>(p1*31), static_cast<uint8_t>(p2*31), w0, w1, w2};
+}
+
+static void InitHQnxLUTs()
+{
+	static bool lutValuesInited = false;
+
+	if (lutValuesInited)
+	{
+		return;
+	}
+
+#define MUR (compare & 0x01) // top-right
+#define MDR (compare & 0x02) // bottom-right
+#define MDL (compare & 0x04) // bottom-left
+#define MUL (compare & 0x08) // top-left
+#define IC(p0) PackLUTValues(p0, p0, p0, 1, 0, 0)
+#define I11(p0,p1) PackLUTValues(p0, p1, p0, 1, 1, 0)
+#define I211(p0,p1,p2) PackLUTValues(p0, p1, p2, 2, 1, 1)
+#define I31(p0,p1) PackLUTValues(p0, p1, p0, 3, 1, 0)
+#define I332(p0,p1,p2) PackLUTValues(p0, p1, p2, 3, 3, 2)
+#define I431(p0,p1,p2) PackLUTValues(p0, p1, p2, 4, 3, 1)
+#define I521(p0,p1,p2) PackLUTValues(p0, p1, p2, 5, 2, 1)
+#define I53(p0,p1) PackLUTValues(p0, p1, p0, 5, 3, 0)
+#define I611(p0,p1,p2) PackLUTValues(p0, p1, p2, 6, 1, 1)
+#define I71(p0,p1) PackLUTValues(p0, p1, p0, 7, 1, 0)
+#define I772(p0,p1,p2) PackLUTValues(p0, p1, p2, 7, 7, 2)
+#define I97(p0,p1) PackLUTValues(p0, p1, p0, 9, 7, 0)
+#define I1411(p0,p1,p2) PackLUTValues(p0, p1, p2, 14, 1, 1)
+#define I151(p0,p1) PackLUTValues(p0, p1, p0, 15, 1, 0)
+
+#define P0 _LQ2xLUT[pattern+(256*0)+(1024*compare)]
+#define P1 _LQ2xLUT[pattern+(256*1)+(1024*compare)]
+#define P2 _LQ2xLUT[pattern+(256*2)+(1024*compare)]
+#define P3 _LQ2xLUT[pattern+(256*3)+(1024*compare)]
+	for (size_t compare = 0; compare < 16; compare++)
+	{
+		for (size_t pattern = 0; pattern < 256; pattern++)
+		{
+			switch (pattern)
+			{
+				#include "../filter/lq2x.h"
+			}
+		}
+	}
+#undef P0
+#undef P1
+#undef P2
+#undef P3
+
+#define P0 _HQ2xLUT[pattern+(256*0)+(1024*compare)]
+#define P1 _HQ2xLUT[pattern+(256*1)+(1024*compare)]
+#define P2 _HQ2xLUT[pattern+(256*2)+(1024*compare)]
+#define P3 _HQ2xLUT[pattern+(256*3)+(1024*compare)]
+	for (size_t compare = 0; compare < 16; compare++)
+	{
+		for (size_t pattern = 0; pattern < 256; pattern++)
+		{
+			switch (pattern)
+			{
+				#include "../filter/hq2x.h"
+			}
+		}
+	}
+#undef P0
+#undef P1
+#undef P2
+#undef P3
+
+#define P(a, b) _HQ4xLUT[pattern+(256*((b*4)+a))+(4096*compare)]
+#define I1(p0) PackLUTValues(p0, p0, p0, 1, 0, 0)
+#define I2(i0, i1, p0, p1) PackLUTValues(p0, p1, p0, i0, i1, 0)
+#define I3(i0, i1, i2, p0, p1, p2) PackLUTValues(p0, p1, p2, i0, i1, i2)
+	for (size_t compare = 0; compare < 16; compare++)
+	{
+		for (size_t pattern = 0; pattern < 256; pattern++)
+		{
+			switch (pattern)
+			{
+				#include "../filter/hq4x.dat"
+			}
+		}
+	}
+#undef P
+#undef I1
+#undef I2
+#undef I3
+
+#undef MUR
+#undef MDR
+#undef MDL
+#undef MUL
+#undef IC
+#undef I11
+#undef I211
+#undef I31
+#undef I332
+#undef I431
+#undef I521
+#undef I53
+#undef I611
+#undef I71
+#undef I772
+#undef I97
+#undef I1411
+#undef I151
+
+	lutValuesInited = true;
+}
+
+#pragma mark -
+
 OGLInfo::OGLInfo()
 {
 	_versionMajor = 0;
@@ -2322,7 +2463,7 @@ OGLDisplayLayer::OGLDisplayLayer(OGLVideoOutput *oglVO)
 		OGLShaderProgram *shaderFilterProgram = _shaderFilter->GetProgram();
 		shaderFilterProgram->SetVertexAndFragmentShaderOGL(Sample1x1_VertShader_110, PassthroughFragShader_110);
 
-		InitHQnxPixelScaler();
+		UploadHQnxLUTs();
 	}
 	else
 	{
@@ -2368,157 +2509,35 @@ OGLDisplayLayer::~OGLDisplayLayer()
 	free(_vfMasterDstBuffer);
 }
 
-typedef struct
+void OGLDisplayLayer::UploadHQnxLUTs()
 {
-	GLubyte p0;
-	GLubyte p1;
-	GLubyte p2;
-	GLubyte w0;
-	GLubyte w1;
-	GLubyte w2;
-} LUTValues;
-
-LUTValues PackLUTValues(GLubyte p0, GLubyte p1, GLubyte p2, GLubyte w0, GLubyte w1, GLubyte w2)
-{
-	if (w1 == 0 && w2 == 0)
-	{
-		w0 = 255;
-	}
-	else
-	{
-		const GLubyte wR = 256 / (w0 + w1 + w2);
-		w0 *= wR;
-		w1 *= wR;
-		w2 *= wR;
-	}
-
-	return {p0*31, p1*31, p2*31, w0, w1, w2};
-}
-
-void OGLDisplayLayer::InitHQnxPixelScaler()
-{
-	LUTValues hqnxLUT[256*16*16];
+	InitHQnxLUTs();
 
 	glGenTextures(1, &_texLQ2xLUT);
 	glGenTextures(1, &_texHQ2xLUT);
 	glGenTextures(1, &_texHQ4xLUT);
 	glActiveTexture(GL_TEXTURE0 + 1);
 
-#define MUR (compare & 0x01) // top-right
-#define MDR (compare & 0x02) // bottom-right
-#define MDL (compare & 0x04) // bottom-left
-#define MUL (compare & 0x08) // top-left
-#define IC(p0) PackLUTValues(p0, p0, p0, 1, 0, 0)
-#define I11(p0,p1) PackLUTValues(p0, p1, p0, 1, 1, 0)
-#define I211(p0,p1,p2) PackLUTValues(p0, p1, p2, 2, 1, 1)
-#define I31(p0,p1) PackLUTValues(p0, p1, p0, 3, 1, 0)
-#define I332(p0,p1,p2) PackLUTValues(p0, p1, p2, 3, 3, 2)
-#define I431(p0,p1,p2) PackLUTValues(p0, p1, p2, 4, 3, 1)
-#define I521(p0,p1,p2) PackLUTValues(p0, p1, p2, 5, 2, 1)
-#define I53(p0,p1) PackLUTValues(p0, p1, p0, 5, 3, 0)
-#define I611(p0,p1,p2) PackLUTValues(p0, p1, p2, 6, 1, 1)
-#define I71(p0,p1) PackLUTValues(p0, p1, p0, 7, 1, 0)
-#define I772(p0,p1,p2) PackLUTValues(p0, p1, p2, 7, 7, 2)
-#define I97(p0,p1) PackLUTValues(p0, p1, p0, 9, 7, 0)
-#define I1411(p0,p1,p2) PackLUTValues(p0, p1, p2, 14, 1, 1)
-#define I151(p0,p1) PackLUTValues(p0, p1, p0, 15, 1, 0)
-
-#define P0 hqnxLUT[pattern+(256*0)+(1024*compare)]
-#define P1 hqnxLUT[pattern+(256*1)+(1024*compare)]
-#define P2 hqnxLUT[pattern+(256*2)+(1024*compare)]
-#define P3 hqnxLUT[pattern+(256*3)+(1024*compare)]
-	for (size_t compare = 0; compare < 16; compare++)
-	{
-		for (size_t pattern = 0; pattern < 256; pattern++)
-		{
-			switch (pattern)
-			{
-				#include "../filter/lq2x.h"
-			}
-		}
-	}
-#undef P0
-#undef P1
-#undef P2
-#undef P3
-
 	glBindTexture(GL_TEXTURE_3D, _texLQ2xLUT);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-	glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, 256*2, 4, 16, 0, GL_BGR, GL_UNSIGNED_BYTE, hqnxLUT);
-
-#define P0 hqnxLUT[pattern+(256*0)+(1024*compare)]
-#define P1 hqnxLUT[pattern+(256*1)+(1024*compare)]
-#define P2 hqnxLUT[pattern+(256*2)+(1024*compare)]
-#define P3 hqnxLUT[pattern+(256*3)+(1024*compare)]
-	for (size_t compare = 0; compare < 16; compare++)
-	{
-		for (size_t pattern = 0; pattern < 256; pattern++)
-		{
-			switch (pattern)
-			{
-				#include "../filter/hq2x.h"
-			}
-		}
-	}
-#undef P0
-#undef P1
-#undef P2
-#undef P3
+	glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, 256*2, 4, 16, 0, GL_BGR, GL_UNSIGNED_BYTE, _LQ2xLUT);
 
 	glBindTexture(GL_TEXTURE_3D, _texHQ2xLUT);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-	glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, 256*2, 4, 16, 0, GL_BGR, GL_UNSIGNED_BYTE, hqnxLUT);
-
-#define P(a, b) hqnxLUT[pattern+(256*((b*4)+a))+(4096*compare)]
-#define I1(p0) PackLUTValues(p0, p0, p0, 1, 0, 0)
-#define I2(i0, i1, p0, p1) PackLUTValues(p0, p1, p0, i0, i1, 0)
-#define I3(i0, i1, i2, p0, p1, p2) PackLUTValues(p0, p1, p2, i0, i1, i2)
-	for (size_t compare = 0; compare < 16; compare++)
-	{
-		for (size_t pattern = 0; pattern < 256; pattern++)
-		{
-			switch (pattern)
-			{
-				#include "../filter/hq4x.dat"
-			}
-		}
-	}
-#undef P
-#undef I1
-#undef I2
-#undef I3
-
-#undef MUR
-#undef MDR
-#undef MDL
-#undef MUL
-#undef IC
-#undef I11
-#undef I211
-#undef I31
-#undef I332
-#undef I431
-#undef I521
-#undef I53
-#undef I611
-#undef I71
-#undef I772
-#undef I97
-#undef I1411
-#undef I151
+	glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, 256*2, 4, 16, 0, GL_BGR, GL_UNSIGNED_BYTE, _HQ2xLUT);
 
 	glBindTexture(GL_TEXTURE_3D, _texHQ4xLUT);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
 	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-	glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, 256*2, 16, 16, 0, GL_BGR, GL_UNSIGNED_BYTE, hqnxLUT);
+	glTexImage3D(GL_TEXTURE_3D, 0, GL_RGB, 256*2, 16, 16, 0, GL_BGR, GL_UNSIGNED_BYTE, _HQ4xLUT);
 
 	glBindTexture(GL_TEXTURE_3D, 0);
 	glActiveTexture(GL_TEXTURE0);
diff --git a/desmume/src/cocoa/OGLDisplayOutput.h b/desmume/src/cocoa/OGLDisplayOutput.h
index d538bf7f9..638f52d16 100644
--- a/desmume/src/cocoa/OGLDisplayOutput.h
+++ b/desmume/src/cocoa/OGLDisplayOutput.h
@@ -246,7 +246,7 @@ protected:
 	GLint _uniformFinalOutputScalar;
 	GLint _uniformFinalOutputViewSize;
 
-	void InitHQnxPixelScaler();
+	void UploadHQnxLUTs();
 
 	virtual void UploadVerticesOGL();
 	virtual void UploadTexCoordsOGL();