From 17b7ebfa88dfe16b2c2f8616652cb5d0ad8c3ff6 Mon Sep 17 00:00:00 2001 From: zeromus Date: Sat, 13 Aug 2016 18:24:33 +0000 Subject: [PATCH 1/9] change backup memory whitelist application technique and fix SM64 (KOR) which needs an 0.5KB eeprom apparently (based on its use of WRHI and RDHI commands) --- desmume/src/mc.cpp | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/desmume/src/mc.cpp b/desmume/src/mc.cpp index 98c723f16..2ddae7b26 100644 --- a/desmume/src/mc.cpp +++ b/desmume/src/mc.cpp @@ -1,7 +1,7 @@ /* Copyright (C) 2006 thoduv Copyright (C) 2006-2007 Theo Berkau - Copyright (C) 2008-2015 DeSmuME team + Copyright (C) 2008-2016 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -618,6 +618,21 @@ void BackupDevice::reset() ensure((u32)savesize); //expand properly if necessary addr_size = addr_size_for_old_save_type(savetype); } + + //automatically detect these hardcodes + if(state == DETECTING) + { + if(!memcmp(gameInfo.header.gameCode,"ASMK", 4)) addr_size = 1; //super mario 64 ds (KOR, which is different somehow) + else if(!memcmp(gameInfo.header.gameCode,"ASM", 3)) addr_size = 2; //super mario 64 ds + else if(!memcmp(gameInfo.header.gameCode,"BDE", 3)) addr_size = 2; // Dementium II + else if(!memcmp(gameInfo.header.gameCode,"AL3", 3)) addr_size = 1; //spongebob atlantis squarepantis. + else if(!memcmp(gameInfo.header.gameCode,"AH5", 3)) addr_size = 1; //over the hedge + else if(!memcmp(gameInfo.header.gameCode,"AVH", 3)) addr_size = 1; //over the hedge - Hammy Goes Nuts! 
+ else if(!memcmp(gameInfo.header.gameCode,"AQ3", 3)) addr_size = 1; //spider-man 3 + + //if we found a whitelist match, we dont need to run detection + if(addr_size) state = RUNNING; + } } void BackupDevice::close_rom() @@ -661,36 +676,33 @@ void BackupDevice::detect() addr_size = 1; //choose 1 just to keep the busted savefile from growing too big msgbox->error("Catastrophic error while autodetecting save type.\nIt will need to be specified manually\n"); break; + case 2: //the modern typical case for small eeproms addr_size = 1; break; + case 3: //another modern typical case.. //but unfortunately we select this case on accident sometimes when what it meant to do was present the archaic 1+2 case //(the archaic 1+2 case is: specifying one address byte, and then reading the first two bytes, instead of the first one byte, as most other games would do.) //so, we're gonna hack in checks for the games that are doing this addr_size = 2; - - // TODO: will study a deep, why this happens (wrong detect size) - if(!memcmp(gameInfo.header.gameCode,"AL3", 3)) addr_size = 1; //spongebob atlantis squarepantis. - if(!memcmp(gameInfo.header.gameCode,"AH5", 3)) addr_size = 1; //over the hedge - if(!memcmp(gameInfo.header.gameCode,"AVH", 3)) addr_size = 1; //over the hedge - Hammy Goes Nuts! - if(!memcmp(gameInfo.header.gameCode,"AQ3", 3)) addr_size = 1; //spider-man 3 - break; + case 4: //a modern typical case addr_size = 3; - if(!memcmp(gameInfo.header.gameCode,"ASM", 3)) addr_size = 2; //super mario 64 ds + break; default: //the archaic case: write the address and then some modulo-4 number of bytes //why modulo 4? who knows. 
- //SM64 (KOR) makes it here with autodetect_size=11 and nothing interesting in the buffer addr_size = autodetect_size & 3; - if(!memcmp(gameInfo.header.gameCode,"BDE", 3)) addr_size = 2; // Dementium II + //SM64 (KOR) makes it here with autodetect_size=11 and nothing interesting in the buffer + //we whitelisted it earlier though + break; } From d89fa0c7616d843abb743738e12330bf1ac1f266 Mon Sep 17 00:00:00 2001 From: zeromus Date: Sat, 13 Aug 2016 18:24:45 +0000 Subject: [PATCH 2/9] winport: fix fastbuild flag --- desmume/src/windows/desmume.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/desmume/src/windows/desmume.props b/desmume/src/windows/desmume.props index 1abb27fe2..569cd79d8 100644 --- a/desmume/src/windows/desmume.props +++ b/desmume/src/windows/desmume.props @@ -94,7 +94,7 @@ _DEBUG;%(PreprocessorDefinitions) RELEASE;NDEBUG;%(PreprocessorDefinitions) - RELEASE;NDEBUG;%(PreprocessorDefinitions) + FASTBUILD;RELEASE;NDEBUG;%(PreprocessorDefinitions) From cc2c86cf1182bf8cb0c19074692d1297aa493af5 Mon Sep 17 00:00:00 2001 From: zeromus Date: Sat, 13 Aug 2016 23:48:51 +0000 Subject: [PATCH 3/9] fix #1555 (regression in Kingdom Hearts Re:coded caused by r5440) by changing how wacky nearly-out-of-limits geometry is handled to a possibly more plausible mechanism --- desmume/src/gfx3d.cpp | 25 ++++++++++++++++++------- desmume/src/matrix.cpp | 5 ----- desmume/src/matrix.h | 16 ---------------- desmume/src/types.h | 14 ++------------ 4 files changed, 20 insertions(+), 40 deletions(-) diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 5a6e669bb..9ed7786e0 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -1,6 +1,6 @@ /* Copyright (C) 2006 yopyop - Copyright (C) 2008-2015 DeSmuME team + Copyright (C) 2008-2016 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -659,12 +659,23 @@ static void SetVertex() return; 
if(polylist->count >= POLYLIST_SIZE) return; - - //TODO - think about keeping the clip matrix concatenated, - //so that we only have to multiply one matrix here - //(we could lazy cache the concatenated clip matrix and only generate it - //when we need to) - MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed); + + //games will definitely count on overflowing the matrix math + //scenarios to balance here: + //+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen + //morover in some conditions there will be vertical glitched lines sometimes when drilling at the top center of the screen. + //+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA??????? + //+ SM64: skybox + //+ TBD other things, probably, dragon quest worldmaps? + //At first I tried saturating the math elsewhere, but that couldn't fix all cases + //So after some fooling around, I found this nicely aesthetic way of balancing all the cases. I don't doubt that it's still inaccurate, however + //Note, if <<3 seems weird, it's reasonable if you assume the goal is to end up with 16 integer bits and a sign bit. 
+ MatrixMultVec4x4(mtxCurrent[1],coordTransformed); //modelview + for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<3>>3); //balances everything ok + //for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<4>>4); //breaks SM64 skyboxes + //for(int i=0;i<4;i++) coordTransformed[i] = (((u32)coordTransformed[i])<<4>>4)|(((s32)(coordTransformed[i]&0x80000000))>>3); //another way generally to drop precision (but breaks spectrobes which does seem to need some kind of buggy wrap-around behaviour) + MatrixMultVec4x4(mtxCurrent[0],coordTransformed); //projection + for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<3>>3); //no proof this is needed, but suspected to be similar based on above //printf("%f %f %f\n",s16coord[0]/4096.0f,s16coord[1]/4096.0f,s16coord[2]/4096.0f); //printf("x %f %f %f %f\n",mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f); diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index 666029bbf..35ecf271a 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -427,8 +427,3 @@ void MatrixTranslate(s32 *matrix, const s32 *ptr) }); } -void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr) -{ - MatrixMultVec4x4(matrix+16,vecPtr); - MatrixMultVec4x4(matrix,vecPtr); -} diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index 8aa87c2fe..d060a4d38 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -276,13 +276,6 @@ FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr) _mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr))); } -FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr) -{ - //there are hardly any gains from merging these manually - MatrixMultVec4x4(matrix+16,vecPtr); - MatrixMultVec4x4(matrix,vecPtr); -} - FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr) { const __m128 vec = _mm_load_ps(vecPtr); @@ -355,13 +348,6 
@@ void MatrixMultiply(float * matrix, const float * rightMatrix); void MatrixTranslate(float *matrix, const float *ptr); void MatrixScale(float * matrix, const float * ptr); -FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr) -{ - //there are hardly any gains from merging these manually - MatrixMultVec4x4(matrix+16,vecPtr); - MatrixMultVec4x4(matrix,vecPtr); -} - template FORCEINLINE void vector_fix2float(float* matrix, const float divisor) { @@ -373,8 +359,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor) void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr); -void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr); - void MatrixMultiply(s32* matrix, const s32* rightMatrix); void MatrixScale(s32 *matrix, const s32 *ptr); void MatrixTranslate(s32 *matrix, const s32 *ptr); diff --git a/desmume/src/types.h b/desmume/src/types.h index ce794d5a7..56b225c3b 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -504,21 +504,11 @@ FORCEINLINE s64 fx32_mul(const s32 a, const s32 b) FORCEINLINE s32 fx32_shiftdown(const s64 a) { - s64 shifted; #ifdef _MSC_VER - shifted = __ll_rshift(a,12); + return (s32)__ll_rshift(a,12); #else - shifted = (a>>12); + return (s32)(a>>12); #endif - //either matrix math is happening at higher precision (an extra bit would suffice, I think), or the sums sent to this are saturated. - //tested by: spectrobes beyond the portals excavation blower - //(it sets very large +x,+y in the modelview matrix to push things offscreen, but the +y will overflow and become negative if we're not careful) - //I didnt think very hard about what would be fastest here on 32bit systems - //NOTE: this was intended for use in MatrixMultVec4x4_M2; it may not be appropriate for other uses of fx32_shiftdown. 
- //if this causes problems we should refactor the math routines a bit to take care of saturating in another function - if(shifted>(s32)0x7FFFFFFF) return 0x7FFFFFFF; - else if(shifted<=(s32)0x80000000) return 0x80000000; - else return shifted; } FORCEINLINE s64 fx32_shiftup(const s32 a) From abbfa4a6b7c9dc7031b908f69f663deae0ad0cea Mon Sep 17 00:00:00 2001 From: zeromus Date: Sat, 13 Aug 2016 23:50:25 +0000 Subject: [PATCH 4/9] fix #1134 (american girls julie finds a way) by changing behaviour of box test busy flag relative to fifo --- desmume/src/FIFO.cpp | 9 +++++++++ desmume/src/gfx3d.cpp | 31 +++++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp index 542bae9fc..38f10876b 100644 --- a/desmume/src/FIFO.cpp +++ b/desmume/src/FIFO.cpp @@ -237,6 +237,15 @@ void GFX_FIFOsend(u8 cmd, u32 param) if(IsMatrixStackCommand(cmd)) gxFIFO.matrix_stack_op_size++; + //along the same lines: + //american girls julie finds a way will put a bunch of stuff and then a box test into the fifo and then immediately test the busy flag + //so we need to set the busy flag here. + //does it expect the fifo to be running then? well, it's definitely jammed -- making it unjammed at one point did fix this bug. + //it's still not clear whether we're handling the immediate vs fifo commands properly at all :( + //anyway, here we go, similar treatment. consider this a hack. 
+ if(cmd == 0x70) MMU_new.gxstat.tb = 1; //just set the flag--youre insane if you queue more than one of these anyway + if(cmd == 0x71) MMU_new.gxstat.tb = 1; + if(gxFIFO.size>=HACK_GXIFO_SIZE) { printf("--FIFO FULL-- : %d\n",gxFIFO.size); } diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 9ed7786e0..0af210e57 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -1496,8 +1496,9 @@ static void gfx3d_glViewPort(u32 v) static BOOL gfx3d_glBoxTest(u32 v) { //printf("boxtest\n"); - MMU_new.gxstat.tr = 0; // clear boxtest bit - MMU_new.gxstat.tb = 1; // busy + + //clear result flag. busy flag has been set by fifo component already + MMU_new.gxstat.tr = 0; BTcoords[BTind++] = v & 0xFFFF; BTcoords[BTind++] = v >> 16; @@ -1505,9 +1506,11 @@ static BOOL gfx3d_glBoxTest(u32 v) if (BTind < 5) return FALSE; BTind = 0; - MMU_new.gxstat.tb = 0; // clear busy GFX_DELAY(103); + //now that we're executing this, we're not busy anymore + MMU_new.gxstat.tb = 0; + #if 0 INFO("BoxTEST: x %f y %f width %f height %f depth %f\n", BTcoords[0], BTcoords[1], BTcoords[2], BTcoords[3], BTcoords[4], BTcoords[5]); @@ -1620,27 +1623,31 @@ static BOOL gfx3d_glBoxTest(u32 v) //if any portion of this poly was retained, then the test passes. if (boxtestClipper.clippedPolyCounter > 0) { - //printf("%06d PASS %d\n",boxcounter,gxFIFO.size); + //printf("%06d PASS %d\n",gxFIFO.size, i); MMU_new.gxstat.tr = 1; break; } + else + { + } + + //if(i==5) printf("%06d FAIL\n",gxFIFO.size); } - if (MMU_new.gxstat.tr == 0) - { - //printf("%06d FAIL %d\n",boxcounter,gxFIFO.size); - } - + //printf("%06d RESULT %d\n",gxFIFO.size, MMU_new.gxstat.tr); + return TRUE; } static BOOL gfx3d_glPosTest(u32 v) { - //printf("postest\n"); //this is apparently tested by transformers decepticons and ultimate spiderman - //printf("POSTEST\n"); - MMU_new.gxstat.tb = 1; + //clear result flag. 
busy flag has been set by fifo component already + MMU_new.gxstat.tr = 0; + + //now that we're executing this, we're not busy anymore + MMU_new.gxstat.tb = 0; PTcoords[PTind++] = float16table[v & 0xFFFF]; PTcoords[PTind++] = float16table[v >> 16]; From 7abca6975081b9847cd764ff31000f519922883f Mon Sep 17 00:00:00 2001 From: zeromus Date: Mon, 15 Aug 2016 07:10:42 +0000 Subject: [PATCH 5/9] better fix than r5529 for same bug (which fixes some regressions from that commit) --- desmume/src/gfx3d.cpp | 69 ++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 0af210e57..d2e9c1c81 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -628,6 +628,53 @@ FORCEINLINE s32 vec3dot_fixed32(const s32* a, const s32* b) { return fx32_shiftdown(fx32_mul(a[0],b[0]) + fx32_mul(a[1],b[1]) + fx32_mul(a[2],b[2])); } +//--------------- +//I'm going to start name these functions GE for GEOMETRY ENGINE MATH. +//Pretty much any math function in this file should be explicit about how it's handling precision. +//Handling that stuff generically globally is not a winning proposition. 
+ +FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b) +{ +#ifdef _MSC_VER + return __emul(a,b); +#else + return ((s64)a)*((s64)b); +#endif +} + +static s32 GEM_SaturateAndShiftdown36To32(const s64 val) +{ + if(val>0x000007FFFFFFFFFFLL) return 0x7FFFFFFF; + if(val<0xFFFFF80000000000LL) return 0x80000000; + + return fx32_shiftdown(val); +} + +static void GEM_TransformVertex(const s32 *matrix, s32 *vecPtr) +{ + const s32 x = vecPtr[0]; + const s32 y = vecPtr[1]; + const s32 z = vecPtr[2]; + const s32 w = vecPtr[3]; + + //saturation logic is most carefully tested by: + //+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen + //You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like + //but if it's done wrongly, you can get bugs in: + //+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA??????? 
+ + //other test cases that cropped up during this development, but are probably not actually related to this after all + //+ SM64: outside castle skybox + //+ NSMB: mario head screen wipe + + vecPtr[0] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[0]) + GEM_Mul32x32To64(y,matrix[4]) + GEM_Mul32x32To64(z,matrix [8]) + GEM_Mul32x32To64(w,matrix[12])); + vecPtr[1] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[1]) + GEM_Mul32x32To64(y,matrix[5]) + GEM_Mul32x32To64(z,matrix[ 9]) + GEM_Mul32x32To64(w,matrix[13])); + vecPtr[2] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[2]) + GEM_Mul32x32To64(y,matrix[6]) + GEM_Mul32x32To64(z,matrix[10]) + GEM_Mul32x32To64(w,matrix[14])); + vecPtr[3] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[3]) + GEM_Mul32x32To64(y,matrix[7]) + GEM_Mul32x32To64(z,matrix[11]) + GEM_Mul32x32To64(w,matrix[15])); +} +//--------------- + + #define SUBMITVERTEX(ii, nn) polylist->list[polylist->count].vertIndexes[ii] = tempVertInfo.map[nn]; //Submit a vertex to the GE static void SetVertex() @@ -660,26 +707,8 @@ static void SetVertex() if(polylist->count >= POLYLIST_SIZE) return; - //games will definitely count on overflowing the matrix math - //scenarios to balance here: - //+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen - //morover in some conditions there will be vertical glitched lines sometimes when drilling at the top center of the screen. - //+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA??????? - //+ SM64: skybox - //+ TBD other things, probably, dragon quest worldmaps? - //At first I tried saturating the math elsewhere, but that couldn't fix all cases - //So after some fooling around, I found this nicely aesthetic way of balancing all the cases. 
I don't doubt that it's still inaccurate, however - //Note, if <<3 seems weird, it's reasonable if you assume the goal is to end up with 16 integer bits and a sign bit. - MatrixMultVec4x4(mtxCurrent[1],coordTransformed); //modelview - for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<3>>3); //balances everything ok - //for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<4>>4); //breaks SM64 skyboxes - //for(int i=0;i<4;i++) coordTransformed[i] = (((u32)coordTransformed[i])<<4>>4)|(((s32)(coordTransformed[i]&0x80000000))>>3); //another way generally to drop precision (but breaks spectrobes which does seem to need some kind of buggy wrap-around behaviour) - MatrixMultVec4x4(mtxCurrent[0],coordTransformed); //projection - for(int i=0;i<4;i++) coordTransformed[i] = (((s32)coordTransformed[i])<<3>>3); //no proof this is needed, but suspected to be similar based on above - - //printf("%f %f %f\n",s16coord[0]/4096.0f,s16coord[1]/4096.0f,s16coord[2]/4096.0f); - //printf("x %f %f %f %f\n",mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f); - //printf(" = %f %f %f %f\n",coordTransformed[0]/4096.0f,coordTransformed[1]/4096.0f,coordTransformed[2]/4096.0f,coordTransformed[3]/4096.0f); + GEM_TransformVertex(mtxCurrent[1],coordTransformed); //modelview + GEM_TransformVertex(mtxCurrent[0],coordTransformed); //projection //TODO - culling should be done here. //TODO - viewport transform? From d837653b5fa3a826fd5a606462cbfac07d8185c9 Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 16 Aug 2016 00:12:34 +0000 Subject: [PATCH 6/9] GFX3D: - Fix 3D rendering on non-MSVC builds. 
--- desmume/src/gfx3d.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index d2e9c1c81..d93485685 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -632,20 +632,20 @@ FORCEINLINE s32 vec3dot_fixed32(const s32* a, const s32* b) { //I'm going to start name these functions GE for GEOMETRY ENGINE MATH. //Pretty much any math function in this file should be explicit about how it's handling precision. //Handling that stuff generically globally is not a winning proposition. - -FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b) -{ -#ifdef _MSC_VER - return __emul(a,b); -#else - return ((s64)a)*((s64)b); -#endif + +FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b) +{ +#ifdef _MSC_VER + return __emul(a,b); +#else + return ((s64)a)*((s64)b); +#endif } static s32 GEM_SaturateAndShiftdown36To32(const s64 val) { - if(val>0x000007FFFFFFFFFFLL) return 0x7FFFFFFF; - if(val<0xFFFFF80000000000LL) return 0x80000000; + if(val>(s64)0x000007FFFFFFFFFFULL) return (s32)0x7FFFFFFFU; + if(val<(s64)0xFFFFF80000000000ULL) return (s32)0x80000000U; return fx32_shiftdown(val); } From d8735a803bf3cbc6a75b0c0224a179f6d8a08d49 Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 16 Aug 2016 06:47:22 +0000 Subject: [PATCH 7/9] Colorspace Handler: - Factor out the generic colorspace handling routines out of GPU.cpp/GPU.h into their own separate files. - Add vectorized routines using AVX2 and AltiVec. 
--- desmume/src/GPU.cpp | 356 +------- desmume/src/GPU.h | 414 +--------- desmume/src/Makefile.am | 16 + desmume/src/OGLRender.cpp | 43 +- .../project.pbxproj | 40 + .../project.pbxproj | 40 + desmume/src/cocoa/cocoa_output.mm | 2 +- desmume/src/cocoa/cocoa_rom.mm | 2 +- desmume/src/frontend/modules/ImageOut.cpp | 126 ++- desmume/src/render3D.cpp | 8 +- desmume/src/texcache.cpp | 44 +- desmume/src/types.h | 44 + .../colorspacehandler/colorspacehandler.cpp | 776 ++++++++++++++++++ .../colorspacehandler/colorspacehandler.h | 194 +++++ .../colorspacehandler_AVX2.cpp | 491 +++++++++++ .../colorspacehandler_AVX2.h | 74 ++ .../colorspacehandler_AltiVec.cpp | 345 ++++++++ .../colorspacehandler_AltiVec.h | 64 ++ .../colorspacehandler_SSE2.cpp | 503 ++++++++++++ .../colorspacehandler_SSE2.h | 74 ++ desmume/src/version.cpp | 65 +- desmume/src/windows/DeSmuME.vcxproj | 4 + desmume/src/windows/DeSmuME.vcxproj.filters | 15 + desmume/src/windows/aviout.cpp | 11 +- desmume/src/windows/main.cpp | 2 +- 25 files changed, 2877 insertions(+), 876 deletions(-) create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler.cpp create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler.h create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp create mode 100644 desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index cde834c55..a252d92c1 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -18,6 +18,14 @@ along with the this software. If not, see . 
*/ +#ifdef FASTBUILD + #undef FORCEINLINE + #define FORCEINLINE + //compilation speed hack (cuts time exactly in half by cutting out permutations) + #define DISABLE_MOSAIC + #define DISABLE_COLOREFFECTDISABLEHINT +#endif + #include "GPU.h" #include @@ -40,75 +48,8 @@ #include "matrix.h" #include "emufile.h" -#ifdef FASTBUILD - #undef FORCEINLINE - #define FORCEINLINE - //compilation speed hack (cuts time exactly in half by cutting out permutations) - #define DISABLE_MOSAIC -#endif - u32 Render3DFramesPerSecond; -CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; -CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; -CACHE_ALIGN u32 color_555_to_666[32768]; -CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; -CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; -CACHE_ALIGN u32 color_555_to_888[32768]; - -//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX -CACHE_ALIGN const u32 material_5bit_to_31bit[] = { - 0x00000000, 0x04210842, 0x08421084, 0x0C6318C6, - 0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE, - 0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6, - 0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE, - 0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7, - 0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF, - 0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7, - 0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF -}; - -// 5-bit to 6-bit conversions use this formula -- dst = (src == 0) ? 
0 : (2*src) + 1 -// Reference GBATEK: http://problemkaputt.de/gbatek.htm#ds3dtextureblending -CACHE_ALIGN const u8 material_5bit_to_6bit[] = { - 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, - 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, - 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, - 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F -}; - -CACHE_ALIGN const u8 material_5bit_to_8bit[] = { - 0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39, - 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B, - 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD, - 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF -}; - -CACHE_ALIGN const u8 material_6bit_to_8bit[] = { - 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, - 0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C, - 0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D, - 0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D, - 0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E, - 0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE, - 0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF, - 0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF -}; - -CACHE_ALIGN const u8 material_3bit_to_8bit[] = { - 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF -}; - -//maybe not very precise -CACHE_ALIGN const u8 material_3bit_to_5bit[] = { - 0, 4, 8, 13, 17, 22, 26, 31 -}; - -//TODO - generate this in the static init method more accurately -CACHE_ALIGN const u8 material_3bit_to_6bit[] = { - 0, 8, 16, 26, 34, 44, 52, 63 -}; - //instantiate static instance u16 GPUEngineBase::_brightnessUpTable555[17][0x8000]; FragmentColor GPUEngineBase::_brightnessUpTable666[17][0x8000]; @@ -167,7 +108,7 @@ const CACHE_ALIGN BGLayerSize GPUEngineBase::_BGLayerSizeLUT[8][4] = { {{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext direct }; -static void ExpandLine8(u8 *__restrict dst, const u8 *__restrict src, size_t dstLength) +static FORCEINLINE void ExpandLine8(u8 *__restrict dst, const u8 *__restrict src, size_t dstLength) { #ifdef ENABLE_SSSE3 const bool isIntegerScale = ((dstLength % 
GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0); @@ -1655,11 +1596,11 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - dstColor32.color = ConvertColor555To6665Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); break; case NDSColorFormat_BGR888_Rev: - dstColor32.color = ConvertColor555To8888Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); break; } @@ -1682,11 +1623,11 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - dstColor32.color = ConvertColor555To6665Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); break; case NDSColorFormat_BGR888_Rev: - dstColor32.color = ConvertColor555To8888Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); break; } @@ -1767,11 +1708,11 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - dstColor32.color = ConvertColor555To6665Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); break; case NDSColorFormat_BGR888_Rev: - dstColor32.color = ConvertColor555To8888Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); break; } break; @@ -1833,13 +1774,13 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - srcColor32.color = ConvertColor555To6665Opaque(srcColor16); + srcColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); dstColor32.a = 0x1F; break; case NDSColorFormat_BGR888_Rev: - srcColor32.color = ConvertColor555To8888Opaque(srcColor16); + srcColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, 
blendEVA, blendEVB); dstColor32.a = 0xFF; break; @@ -2132,7 +2073,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D(GPUEngineCompositorInfo &compInfo // Render the pixel using the selected color effect. if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { - const u16 srcColor16 = ConvertColor6665To5551(srcColor32); + const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); switch (selectedEffect) { @@ -2695,13 +2636,13 @@ void GPUEngineBase::_RenderPixelsCustom(GPUEngineCompositorInfo &compInfo) if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) { - ConvertColor555To6665Opaque(src16[0], src[0], src[1]); - ConvertColor555To6665Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); } else { - ConvertColor555To8888Opaque(src16[0], src[0], src[1]); - ConvertColor555To8888Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); } } @@ -2796,13 +2737,13 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(GPUEngineCompositorInfo &compInfo) { if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) { - ConvertColor555To6665Opaque(src16[0], src[0], src[1]); - ConvertColor555To6665Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); } else { - ConvertColor555To8888Opaque(src16[0], src[0], src[1]); - ConvertColor555To8888Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); } } @@ -4502,7 +4443,7 @@ void GPUEngineBase::UpdateVRAM3DUsageProperties_OBJLayer(const size_t bankIndex) } template -void GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void 
GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo) { bool useCustomVRAM = false; @@ -4538,26 +4479,28 @@ void GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo) } template -void GPUEngineBase::_RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo) { this->_RenderLine_LayerBG_Final(compInfo); } template -void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo) { +#ifndef DISABLE_COLOREFFECTDISABLEHINT if (compInfo.renderState.colorEffect == ColorEffect_Disable) { this->_RenderLine_LayerBG_ApplyColorEffectDisabledHint(compInfo); } else +#endif { this->_RenderLine_LayerBG_ApplyColorEffectDisabledHint(compInfo); } } template -void GPUEngineBase::_RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo) { if (ISDEBUGRENDER) { @@ -4951,7 +4894,7 @@ void GPUEngineBase::ResolveCustomRendering() void GPUEngineBase::ResolveRGB666ToRGB888() { - ConvertColorBuffer6665To8888((u32 *)this->renderedBuffer, (u32 *)this->renderedBuffer, this->renderedWidth * this->renderedHeight); + ColorspaceConvertBuffer6665To8888((u32 *)this->renderedBuffer, (u32 *)this->renderedBuffer, this->renderedWidth * this->renderedHeight); } void GPUEngineBase::ResolveToCustomFramebuffer() @@ -5575,12 +5518,12 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) case NDSColorFormat_BGR666_Rev: renderedLineSrcA16 = (u16 *)malloc_alignedCacheLine(compInfo.line.pixelCount * sizeof(u16)); - ConvertColorBuffer6665To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, compInfo.line.pixelCount); + ColorspaceConvertBuffer6665To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, 
compInfo.line.pixelCount); break; case NDSColorFormat_BGR888_Rev: renderedLineSrcA16 = (u16 *)malloc_alignedCacheLine(compInfo.line.pixelCount * sizeof(u16)); - ConvertColorBuffer8888To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, compInfo.line.pixelCount); + ColorspaceConvertBuffer8888To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, compInfo.line.pixelCount); break; } } @@ -6570,7 +6513,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); - ConvertColorBuffer555To6665Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); + ColorspaceConvertBuffer555To6665Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); break; } @@ -6578,7 +6521,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); - ConvertColorBuffer555To8888Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); + ColorspaceConvertBuffer555To8888Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); break; } } @@ -6598,7 +6541,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth); FragmentColor *dst = (FragmentColor *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth); - ConvertColorBuffer555To6665Opaque(src, (u32 *)dst, customPixCount); + ColorspaceConvertBuffer555To6665Opaque(src, (u32 *)dst, customPixCount); break; } @@ -6606,7 +6549,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth); FragmentColor *dst = (FragmentColor *)this->customBuffer + 
(_gpuDstLineIndex[l] * customWidth); - ConvertColorBuffer555To8888Opaque(src, (u32 *)dst, customPixCount); + ColorspaceConvertBuffer555To8888Opaque(src, (u32 *)dst, customPixCount); break; } } @@ -6802,28 +6745,7 @@ void GPUEngineB::RenderLine(const u16 l) GPUSubsystem::GPUSubsystem() { - static bool needInitTables = true; - - if (needInitTables) - { -#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) -#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) -#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) -#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) - - for (size_t i = 0; i < 32768; i++) - { - color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); - color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); - color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); - - color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); - color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); - color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 ); - } - - needInitTables = false; - } + ColorspaceHandlerInit(); _defaultEventHandler = new GPUEventHandlerDefault; _event = _defaultEventHandler; @@ -7581,178 +7503,6 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID) this->_gpu->SetDisplayByID(this->_ID); } -template -void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef 
ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - __m128i src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); - __m128i dstConvertedLo, dstConvertedHi; - ConvertColor555To8888Opaque(src_vec128, dstConvertedLo, dstConvertedHi); - - if (IS_UNALIGNED) - { - _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - else - { - _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor555To8888Opaque(src[i]); - } -} - -template -void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - __m128i src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); - __m128i dstConvertedLo, dstConvertedHi; - ConvertColor555To6665Opaque(src_vec128, dstConvertedLo, dstConvertedHi); - - if (IS_UNALIGNED) - { - _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - else - { - _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor555To6665Opaque(src[i]); - } -} - -template -void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To6665(_mm_load_si128((__m128i *)(src + i))) ); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor8888To6665(src[i]); - } -} - -template -void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To8888(_mm_load_si128((__m128i *)(src + i))) ); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor6665To8888(src[i]); - } -} - -template -void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - if (IS_UNALIGNED) - { - _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_loadu_si128((__m128i *)(src + i)), 
_mm_loadu_si128((__m128i *)(src + i + 4))) ); - } - else - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor8888To5551(src[i]); - } -} - -template -void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - if (IS_UNALIGNED) - { - _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); - } - else - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor6665To5551(src[i]); - } -} - template void GPUEngineBase::ParseReg_BGnHOFS(); template void GPUEngineBase::ParseReg_BGnHOFS(); template void GPUEngineBase::ParseReg_BGnHOFS(); @@ -7774,29 +7524,3 @@ template void GPUEngineBase::ParseReg_BGnY(); template void GPUSubsystem::RenderLine(const u16 l, bool skip); template void GPUSubsystem::RenderLine(const u16 l, bool skip); template void GPUSubsystem::RenderLine(const u16 l, bool skip); - -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); - -template void 
ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); - -template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); - -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 47c1455db..a32857d36 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -25,9 +25,11 @@ #include #include "types.h" +#include "./utils/colorspacehandler/colorspacehandler.h" #ifdef ENABLE_SSE2 #include +#include 
"./utils/colorspacehandler/colorspacehandler_SSE2.h" #endif #ifdef ENABLE_SSSE3 @@ -101,15 +103,6 @@ enum DisplayCaptureSize DisplayCaptureSize_256x192 = 3, }; -union FragmentColor -{ - u32 color; - struct - { - u8 r,g,b,a; - }; -}; - typedef union { u32 value; @@ -1052,61 +1045,6 @@ enum NDSDisplayID NDSDisplayID_Touch = 1 }; -enum NDSColorFormat -{ - // The color format information is packed in a 32-bit value. - // The bits are as follows: - // FFFOOOOO AAAAAABB BBBBGGGG GGRRRRRR - // - // F = Flags (see below) - // O = Color order (see below) - // A = Bit count for alpha [0-63] - // B = Bit count for blue [0-63] - // G = Bit count for green [0-63] - // R = Bit count for red [0-63] - // - // Flags: - // Bit 29: Reverse order flag. - // Set = Bits are in reverse order, usually for little-endian usage. - // Cleared = Bits are in normal order, usually for big-endian usage. - // - // Color order bits, 24-28: - // 0x00 = RGBA, common format - // 0x01 = RGAB - // 0x02 = RBGA - // 0x03 = RBAG - // 0x04 = RAGB - // 0x05 = RABG - // 0x06 = GRBA - // 0x07 = GRAB - // 0x08 = GBRA - // 0x09 = GBAR - // 0x0A = GARB - // 0x0B = GABR - // 0x0C = BRGA - // 0x0D = BRAG - // 0x0E = BGRA, common format - // 0x0F = BGAR - // 0x10 = BARG - // 0x11 = BAGR - // 0x12 = ARGB - // 0x13 = ARBG - // 0x14 = AGRB - // 0x15 = AGBR - // 0x16 = ABRG - // 0x17 = ABGR - - // Color formats used for internal processing. - //NDSColorFormat_ABGR1555_Rev = 0x20045145, - //NDSColorFormat_ABGR5666_Rev = 0x20186186, - //NDSColorFormat_ABGR8888_Rev = 0x20208208, - - // Color formats used by the output framebuffers. 
- NDSColorFormat_BGR555_Rev = 0x20005145, - NDSColorFormat_BGR666_Rev = 0x20006186, - NDSColorFormat_BGR888_Rev = 0x20008208 -}; - struct DISPCAPCNT_parsed { u8 EVA; @@ -1410,9 +1348,9 @@ protected: template bool _IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo); void _PerformWindowTesting(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); + template FORCEINLINE void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo); + template FORCEINLINE void _RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo); + template FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); template void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo); template void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item); @@ -1733,346 +1671,4 @@ public: extern GPUSubsystem *GPU; extern MMU_struct MMU; -extern CACHE_ALIGN const u32 material_5bit_to_31bit[32]; -extern CACHE_ALIGN const u8 material_5bit_to_6bit[32]; -extern CACHE_ALIGN const u8 material_5bit_to_8bit[32]; -extern CACHE_ALIGN const u8 material_6bit_to_8bit[64]; -extern CACHE_ALIGN const u8 material_3bit_to_5bit[8]; -extern CACHE_ALIGN const u8 material_3bit_to_6bit[8]; -extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; - -extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; -extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; -extern CACHE_ALIGN u32 color_555_to_666[32768]; -extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; -extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; -extern CACHE_ALIGN u32 color_555_to_888[32768]; - -#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to 
an opaque sparsely packed 32-bit color containing an RGBA6665 color -#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped -#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color - -#ifdef LOCAL_LE - #define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian -#else - #define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian -#endif - -#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color -#define COLOR555TO8888_OPAQUE_SWAP_RB(col) (color_555_to_8888_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque 32-bit color with R and B components swapped -#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color - -#ifdef LOCAL_LE - #define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian -#else - #define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, big-endian -#endif - -//produce a 15bpp color from individual 5bit components -#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) ) - -//produce a 16bpp color from individual 5bit components -#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) ) - -inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, 
const u8 b, const u8 a) -{ - FragmentColor ret; - ret.r = r; ret.g = g; ret.b = b; ret.a = a; - return ret; -} - -template -FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src) -{ - return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); -} - -template -FORCEINLINE u32 ConvertColor555To6665Opaque(const u16 src) -{ - return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); -} - -template -FORCEINLINE u32 ConvertColor8888To6665(FragmentColor srcColor) -{ - FragmentColor outColor; - outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; - outColor.g = srcColor.g >> 2; - outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b) >> 2; - outColor.a = srcColor.a >> 3; - - return outColor.color; -} - -template -FORCEINLINE u32 ConvertColor8888To6665(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor8888To6665(srcColorComponent); -} - -template -FORCEINLINE u32 ConvertColor6665To8888(FragmentColor srcColor) -{ - FragmentColor outColor; - outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)]; - outColor.g = material_6bit_to_8bit[srcColor.g]; - outColor.b = material_6bit_to_8bit[((SWAP_RB) ? srcColor.r : srcColor.b)]; - outColor.a = material_5bit_to_8bit[srcColor.a]; - - return outColor.color; -} - -template -FORCEINLINE u32 ConvertColor6665To8888(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor6665To8888(srcColorComponent); -} - -template -FORCEINLINE u16 ConvertColor8888To5551(FragmentColor srcColor) -{ - return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 
0x0000 : 0x8000 ); -} - -template -FORCEINLINE u16 ConvertColor8888To5551(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor8888To5551(srcColorComponent); -} - -template -FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor) -{ - return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 0x0000 : 0x8000); -} - -template -FORCEINLINE u16 ConvertColor6665To5551(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor6665To5551(srcColorComponent); -} - -#ifdef ENABLE_SSE2 - -template -FORCEINLINE void ConvertColor555To8888(const __m128i &srcColor, const __m128i &srcAlphaBits32Lo, const __m128i &srcAlphaBits32Hi, __m128i &dstLo, __m128i &dstHi) -{ - __m128i src32; - - // Conversion algorithm: - // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); - dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); - dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); - dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); - - src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); - dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); - dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); - dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); -} - -template -FORCEINLINE void ConvertColor555To6665(const __m128i &srcColor, const __m128i &srcAlphaBits32Lo, const __m128i &srcAlphaBits32Hi, __m128i &dstLo, __m128i &dstHi) -{ - __m128i src32; - - // Conversion algorithm: - // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); - dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); - dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); - dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); - - src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); - dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); - dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); - dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); -} - -template -FORCEINLINE void ConvertColor555To8888Opaque(const __m128i &srcColor, __m128i &dstLo, __m128i &dstHi) -{ - const __m128i srcAlphaBits32 = _mm_set1_epi32(0xFF000000); - ConvertColor555To8888(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); -} - -template -FORCEINLINE void ConvertColor555To6665Opaque(const __m128i &srcColor, __m128i &dstLo, __m128i &dstHi) -{ - const __m128i srcAlphaBits32 = _mm_set1_epi32(0x1F000000); - ConvertColor555To6665(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); -} - -template -FORCEINLINE __m128i ConvertColor8888To6665(const __m128i &src) -{ - // Conversion algorithm: - // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) - // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) - __m128i rgb; - const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); - - if (SWAP_RB) - { -#ifdef ENABLE_SSSE3 - rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); - rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2) ); -#else - rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x003F0000)), 18), _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00003F00)), 2), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x0000003F)), 14)) ); -#endif - } - else - { - rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); - } - - return _mm_or_si128(rgb, a); -} - -template -FORCEINLINE __m128i ConvertColor6665To8888(const __m128i &src) -{ - // 
Conversion algorithm: - // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) - // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) - __m128i rgb = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00FCFCFC)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00030303)) ); - const __m128i a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0xF8000000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x07000000)) ); - - if (SWAP_RB) - { -#ifdef ENABLE_SSSE3 - rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2) ); -#else - rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16)) ); -#endif - } - - return _mm_or_si128(rgb, a); -} - -template -FORCEINLINE __m128i _ConvertColorBaseTo5551(const __m128i &srcLo, const __m128i &srcHi) -{ - if (COLORFORMAT == NDSColorFormat_BGR555_Rev) - { - return srcLo; - } - - __m128i rgbLo; - __m128i rgbHi; - __m128i alpha; - - if (COLORFORMAT == NDSColorFormat_BGR666_Rev) - { - if (SWAP_RB) - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 17), _mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 17), _mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); - } - else - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 1), 
_mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 1), _mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); - } - - // Convert alpha - alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x0000001F)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x0000001F)) ); - alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); - alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); - } - else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) - { - if (SWAP_RB) - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 19), _mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 19), _mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); - } - else - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 3), _mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 3), 
_mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); - } - - // Convert alpha - alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x000000FF)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x000000FF)) ); - alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); - alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); - } - - return _mm_or_si128(_mm_packs_epi32(rgbLo, rgbHi), alpha); -} - -template -FORCEINLINE __m128i ConvertColor8888To5551(const __m128i &srcLo, const __m128i &srcHi) -{ - return _ConvertColorBaseTo5551(srcLo, srcHi); -} - -template -FORCEINLINE __m128i ConvertColor6665To5551(const __m128i &srcLo, const __m128i &srcHi) -{ - return _ConvertColorBaseTo5551(srcLo, srcHi); -} - -#endif - -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); - -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); - #endif diff --git a/desmume/src/Makefile.am b/desmume/src/Makefile.am index 41a8bb95a..f5d8afceb 100644 --- a/desmume/src/Makefile.am +++ b/desmume/src/Makefile.am @@ -52,6 +52,7 @@ libdesmume_a_SOURCES = \ utils/decrypt/decrypt.h utils/decrypt/header.cpp utils/decrypt/header.h \ utils/task.cpp utils/task.h \ utils/vfat.h utils/vfat.cpp \ + utils/colorspacehandler/colorspacehandler.cpp \ utils/dlditool.cpp \ utils/libfat/bit_ops.h \ 
utils/libfat/cache.cpp \ @@ -107,6 +108,21 @@ libdesmume_a_SOURCES = \ libretro-common/rthreads/async_job.c \ libretro-common/rthreads/rsemaphore.c \ libretro-common/rthreads/rthreads.c + +if SUPPORT_SSE2 += \ +libdesmume_a_SOURCES += \ + utils/colorspacehandler/colorspacehandler_SSE2.cpp +endif + +if SUPPORT_AVX2 += \ +libdesmume_a_SOURCES += \ + utils/colorspacehandler/colorspacehandler_AVX2.cpp +endif + +if SUPPORT_ALTIVEC += \ +libdesmume_a_SOURCES += \ + utils/colorspacehandler/colorspacehandler_AltiVec.cpp +endif if HAVE_JIT libdesmume_a_SOURCES += \ diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 96b5f699c..c2f64fedb 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -32,6 +32,7 @@ #ifdef ENABLE_SSE2 #include +#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" #endif typedef struct @@ -990,9 +991,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4)); - _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), ConvertColor8888To6665(srcColorLo) ); - _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665(srcColorHi) ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), ColorspaceConvert8888To6665_SSE2(srcColorLo) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ColorspaceConvert8888To6665_SSE2(srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1001,17 +1002,17 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; i < pixCount; i++) { - dstFramebuffer[i].color = ConvertColor8888To6665(srcFramebuffer[i]); - dstRGBA5551[i] = 
ConvertColor8888To5551(srcFramebuffer[i]); + dstFramebuffer[i].color = ColorspaceConvert8888To6665(srcFramebuffer[i]); + dstRGBA5551[i] = ColorspaceConvert8888To5551(srcFramebuffer[i]); } } else if (dstFramebuffer != NULL) { - ConvertColorBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); + ColorspaceConvertBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } else if (this->_outputFormat == NDSColorFormat_BGR888_Rev) @@ -1027,7 +1028,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), srcColorLo ); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1036,8 +1037,8 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; i < pixCount; i++) { - dstFramebuffer[i].color = ConvertColor8888To6665(srcFramebuffer[i]); - dstRGBA5551[i] = ConvertColor8888To5551(srcFramebuffer[i]); + dstFramebuffer[i].color = ColorspaceConvert8888To6665(srcFramebuffer[i]); + dstRGBA5551[i] = ColorspaceConvert8888To5551(srcFramebuffer[i]); } } else if (dstFramebuffer != NULL) @@ -1046,7 +1047,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } } @@ -1068,9 +1069,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 
0)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4)); - _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), ConvertColor8888To6665(srcColorLo) ); - _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665(srcColorHi) ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), ColorspaceConvert8888To6665_SSE2(srcColorLo) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ColorspaceConvert8888To6665_SSE2(srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1079,8 +1080,8 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; x < pixCount; x++, ir++, iw++) { - dstFramebuffer[iw].color = ConvertColor8888To6665(srcFramebuffer[ir]); - dstRGBA5551[iw] = ConvertColor8888To5551(srcFramebuffer[ir]); + dstFramebuffer[iw].color = ColorspaceConvert8888To6665(srcFramebuffer[ir]); + dstRGBA5551[iw] = ColorspaceConvert8888To5551(srcFramebuffer[ir]); } } } @@ -1088,14 +1089,14 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - ConvertColorBuffer8888To6665((u32 *)srcFramebuffer + ir, (u32 *)dstFramebuffer + iw, pixCount); + ColorspaceConvertBuffer8888To6665((u32 *)srcFramebuffer + ir, (u32 *)dstFramebuffer + iw, pixCount); } } else { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer + ir, 
dstRGBA5551 + iw, pixCount); } } } @@ -1115,7 +1116,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), srcColorLo ); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1125,7 +1126,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor for (; x < pixCount; x++, ir++, iw++) { dstFramebuffer[iw] = srcFramebuffer[ir]; - dstRGBA5551[iw] = ConvertColor8888To5551(srcFramebuffer[ir]); + dstRGBA5551[iw] = ColorspaceConvert8888To5551(srcFramebuffer[ir]); } } } @@ -1146,7 +1147,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); } } } diff --git a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index 79ecf4426..63d240a0b 100644 --- a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -243,6 +243,8 @@ AB564915186E6F67002740F4 /* Image_Piano.png in Resources */ = {isa = PBXBuildFile; fileRef = AB56490B186E6F67002740F4 /* Image_Piano.png */; }; AB5785FD17176AFC002C5FC7 /* OpenEmuBase.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = AB5785FC17176AFC002C5FC7 /* OpenEmuBase.framework */; }; AB58F32D1364F44B0074C376 /* cocoa_file.mm in Sources */ = {isa = 
PBXBuildFile; fileRef = AB58F32C1364F44B0074C376 /* cocoa_file.mm */; }; + AB5FDDAC1D62C89E0094617C /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + AB5FDDAD1D62C8A00094617C /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; AB64987C13ECC73800EE7DD2 /* FileTypeInfo.plist in Resources */ = {isa = PBXBuildFile; fileRef = AB64987B13ECC73800EE7DD2 /* FileTypeInfo.plist */; }; AB68101B187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png in Resources */ = {isa = PBXBuildFile; fileRef = AB681013187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png */; }; AB68101C187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png in Resources */ = {isa = PBXBuildFile; fileRef = AB681013187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png */; }; @@ -974,6 +976,12 @@ ABB97878144E89CC00793FA3 /* Icon_DeSmuME_32x32.png in Resources */ = {isa = PBXBuildFile; fileRef = ABB97875144E89CC00793FA3 /* Icon_DeSmuME_32x32.png */; }; ABBC0F8D1394B1AA0028B6BD /* DefaultUserPrefs.plist in Resources */ = {isa = PBXBuildFile; fileRef = ABBC0F8C1394B1AA0028B6BD /* DefaultUserPrefs.plist */; }; ABBF04A514B515F300E505A0 /* AppIcon_ROMCheats.icns in Resources */ = {isa = PBXBuildFile; fileRef = ABBF04A414B515F300E505A0 /* AppIcon_ROMCheats.icns */; }; + ABBFFF851D6283C0003CD598 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + ABBFFF861D6283C1003CD598 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + ABBFFF871D6283C1003CD598 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + ABBFFF891D6283D2003CD598 /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 
ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; + ABBFFF8A1D6283D3003CD598 /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; + ABBFFF8B1D6283D3003CD598 /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; ABC3AF2F14B7F06900D5B13D /* Icon_VolumeFull_16x16.png in Resources */ = {isa = PBXBuildFile; fileRef = ABC3AF2B14B7F06900D5B13D /* Icon_VolumeFull_16x16.png */; }; ABC3AF3014B7F06900D5B13D /* Icon_VolumeMute_16x16.png in Resources */ = {isa = PBXBuildFile; fileRef = ABC3AF2C14B7F06900D5B13D /* Icon_VolumeMute_16x16.png */; }; ABC3AF3114B7F06900D5B13D /* Icon_VolumeOneThird_16x16.png in Resources */ = {isa = PBXBuildFile; fileRef = ABC3AF2D14B7F06900D5B13D /* Icon_VolumeOneThird_16x16.png */; }; @@ -1534,6 +1542,14 @@ ABBB421516B4A5F30012E5AB /* OGLRender_3_2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = OGLRender_3_2.h; path = ../OGLRender_3_2.h; sourceTree = ""; }; ABBC0F8C1394B1AA0028B6BD /* DefaultUserPrefs.plist */ = {isa = PBXFileReference; lastKnownFileType = file.bplist; path = DefaultUserPrefs.plist; sourceTree = ""; }; ABBF04A414B515F300E505A0 /* AppIcon_ROMCheats.icns */ = {isa = PBXFileReference; lastKnownFileType = image.icns; path = AppIcon_ROMCheats.icns; sourceTree = ""; }; + ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler.cpp; sourceTree = ""; }; + ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler.h; sourceTree = ""; }; + ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_SSE2.cpp; 
sourceTree = ""; }; + ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_SSE2.h; sourceTree = ""; }; + ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX2.cpp; sourceTree = ""; }; + ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX2.h; sourceTree = ""; }; + ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AltiVec.cpp; sourceTree = ""; }; + ABBFFF821D611A36003CD598 /* colorspacehandler_AltiVec.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AltiVec.h; sourceTree = ""; }; ABC3AF2B14B7F06900D5B13D /* Icon_VolumeFull_16x16.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_VolumeFull_16x16.png; path = images/Icon_VolumeFull_16x16.png; sourceTree = ""; }; ABC3AF2C14B7F06900D5B13D /* Icon_VolumeMute_16x16.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_VolumeMute_16x16.png; path = images/Icon_VolumeMute_16x16.png; sourceTree = ""; }; ABC3AF2D14B7F06900D5B13D /* Icon_VolumeOneThird_16x16.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_VolumeOneThird_16x16.png; path = images/Icon_VolumeOneThird_16x16.png; sourceTree = ""; }; @@ -2508,6 +2524,21 @@ path = openemu; sourceTree = ""; }; + ABBFFF6E1D5F9C10003CD598 /* colorspacehandler */ = { + isa = PBXGroup; + children = ( + ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */, + ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */, + ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */, + ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */, + ABBFFF821D611A36003CD598 /* 
colorspacehandler_AltiVec.h */, + ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */, + ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */, + ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */, + ); + path = colorspacehandler; + sourceTree = ""; + }; ABC2ECD613B1C87000FAAA2A /* Images */ = { isa = PBXGroup; children = ( @@ -2759,6 +2790,7 @@ ABD1FF211345ACBF00AF11D1 /* decrypt */, ABD1FF2E1345ACBF00AF11D1 /* libfat */, ABE670241415DE6C00E8E4C9 /* tinyxml */, + ABBFFF6E1D5F9C10003CD598 /* colorspacehandler */, ABD1FF1D1345ACBF00AF11D1 /* ConvertUTF.c */, AB9038A517C5ECFD00F410BD /* advanscene.cpp */, ABD1FF1F1345ACBF00AF11D1 /* datetime.cpp */, @@ -3770,6 +3802,7 @@ ABE6840D189E33BC007FD69C /* OGLDisplayOutput.cpp in Sources */, ABD1FF121345AC9C00AF11D1 /* slot2_none.cpp in Sources */, ABD1FF131345AC9C00AF11D1 /* slot2_paddle.cpp in Sources */, + ABBFFF8A1D6283D3003CD598 /* colorspacehandler_SSE2.cpp in Sources */, ABD1FF141345AC9C00AF11D1 /* slot2_piano.cpp in Sources */, ABD1FF151345AC9C00AF11D1 /* slot2_rumblepak.cpp in Sources */, ABD1041F1346652500AF11D1 /* sndOSX.cpp in Sources */, @@ -3864,6 +3897,7 @@ AB40565E169F5DBB0016AC3E /* virtualmemory.cpp in Sources */, AB405661169F5DBB0016AC3E /* zonememory.cpp in Sources */, AB405679169F5DCC0016AC3E /* x86assembler.cpp in Sources */, + ABBFFF861D6283C1003CD598 /* colorspacehandler.cpp in Sources */, AB40567C169F5DCC0016AC3E /* x86compiler.cpp in Sources */, ABFEA8A41BB4EC1100B08C25 /* sfnt.c in Sources */, ABA731691BB51FDC00B26147 /* type1cid.c in Sources */, @@ -4017,6 +4051,7 @@ AB796D4315CDCBA200C59155 /* version.cpp in Sources */, ABFEA82B1BB4EC1100B08C25 /* ftinit.c in Sources */, AB796D4415CDCBA200C59155 /* vfat.cpp in Sources */, + AB5FDDAC1D62C89E0094617C /* colorspacehandler.cpp in Sources */, AB796D4515CDCBA200C59155 /* videofilter.cpp in Sources */, AB796D4615CDCBA200C59155 /* WavFile.cpp in Sources */, AB796D4715CDCBA200C59155 /* wifi.cpp in Sources */, @@ -4096,6 +4131,7 @@ 
AB26D87C16B5253D00A2305C /* OGLRender_3_2.cpp in Sources */, AB3A655E16CC5421001F5D4A /* EmuControllerDelegate.mm in Sources */, AB3A656116CC5438001F5D4A /* cocoa_GPU.mm in Sources */, + AB5FDDAD1D62C8A00094617C /* colorspacehandler_SSE2.cpp in Sources */, AB8967D916D2ED0700F826F1 /* DisplayWindowController.mm in Sources */, AB29B33116D4BEBF000EF671 /* InputManager.mm in Sources */, AB8B7AAC17CE8C440051CEBF /* slot1comp_protocol.cpp in Sources */, @@ -4272,6 +4308,7 @@ AB2ABA401C9F9CFA00173B15 /* rsemaphore.c in Sources */, AB8F3CF01A53AC2600A80BF6 /* ringbuffer.cpp in Sources */, AB8F3CF11A53AC2600A80BF6 /* arm_jit.cpp in Sources */, + ABBFFF891D6283D2003CD598 /* colorspacehandler_SSE2.cpp in Sources */, AB8F3CF21A53AC2600A80BF6 /* troubleshootingWindowDelegate.mm in Sources */, AB8F3CF31A53AC2600A80BF6 /* assembler.cpp in Sources */, AB8F3CF41A53AC2600A80BF6 /* assert.cpp in Sources */, @@ -4295,6 +4332,7 @@ AB8F3D041A53AC2600A80BF6 /* virtualmemory.cpp in Sources */, AB8F3D051A53AC2600A80BF6 /* zonememory.cpp in Sources */, AB8F3D061A53AC2600A80BF6 /* x86assembler.cpp in Sources */, + ABBFFF851D6283C0003CD598 /* colorspacehandler.cpp in Sources */, AB8F3D071A53AC2600A80BF6 /* x86compiler.cpp in Sources */, AB8F3D081A53AC2600A80BF6 /* x86compilercontext.cpp in Sources */, AB8F3D091A53AC2600A80BF6 /* x86compilerfunc.cpp in Sources */, @@ -4367,6 +4405,7 @@ ABB3C6911501C04F00E0C22E /* SoundTouch.cpp in Sources */, ABB3C6921501C04F00E0C22E /* sse_optimized.cpp in Sources */, ABB3C6931501C04F00E0C22E /* TDStretch.cpp in Sources */, + ABBFFF871D6283C1003CD598 /* colorspacehandler.cpp in Sources */, ABB3C6941501C04F00E0C22E /* WavFile.cpp in Sources */, ABB3C6951501C04F00E0C22E /* metaspu.cpp in Sources */, ABB3C6961501C04F00E0C22E /* SndOut.cpp in Sources */, @@ -4436,6 +4475,7 @@ ABB3C6D11501C04F00E0C22E /* slot1.cpp in Sources */, ABB3C6D31501C04F00E0C22E /* SPU.cpp in Sources */, ABB3C6D41501C04F00E0C22E /* texcache.cpp in Sources */, + ABBFFF8B1D6283D3003CD598 /* 
colorspacehandler_SSE2.cpp in Sources */, AB9038BA17C5ED2200F410BD /* slot1comp_rom.cpp in Sources */, ABB3C6D51501C04F00E0C22E /* thumb_instructions.cpp in Sources */, AB2EE13317D57F5000F68622 /* fsnitro.cpp in Sources */, diff --git a/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj index bb4b92e94..e53060176 100644 --- a/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj +++ b/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj @@ -740,6 +740,14 @@ AB2F56F11704C86900E28885 /* utilities.c in Sources */ = {isa = PBXBuildFile; fileRef = AB2F56EF1704C86900E28885 /* utilities.c */; }; AB2F56F21704C86900E28885 /* utilities.c in Sources */ = {isa = PBXBuildFile; fileRef = AB2F56EF1704C86900E28885 /* utilities.c */; }; AB2F56F31704C86900E28885 /* utilities.c in Sources */ = {isa = PBXBuildFile; fileRef = AB2F56EF1704C86900E28885 /* utilities.c */; }; + AB37E3741D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E3771D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */; }; + AB37E3781D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E37B1D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */; }; + AB37E37C1D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E37D1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */; }; + AB37E3801D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = 
{isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E38A1D61895F004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */; }; AB3ACB7814C2361100D7D192 /* appDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB3ACB6714C2361100D7D192 /* appDelegate.mm */; }; AB3ACB7914C2361100D7D192 /* cheatWindowDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB3ACB6914C2361100D7D192 /* cheatWindowDelegate.mm */; }; AB3ACB7C14C2361100D7D192 /* inputPrefsView.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB3ACB6F14C2361100D7D192 /* inputPrefsView.mm */; }; @@ -1156,6 +1164,8 @@ AB73AA2E1507C9F500A310C8 /* OpenGL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABC570D4134431DA00E7B0B1 /* OpenGL.framework */; }; AB73AA2F1507C9F500A310C8 /* libz.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = AB0A0D1914AACA9600E83E91 /* libz.dylib */; }; AB75226F14C7BB51009B97B3 /* AppIcon_FirmwareConfig.icns in Resources */ = {isa = PBXBuildFile; fileRef = AB75226D14C7BB51009B97B3 /* AppIcon_FirmwareConfig.icns */; }; + AB7BB17F1D62C8CC00A7A6E2 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB7BB1801D62C8CF00A7A6E2 /* colorspacehandler_AltiVec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */; }; AB7DDA6D173DC38F004F3D07 /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABB6AD5C173A3F2B00EC2E8D /* Carbon.framework */; }; AB7DDA6E173DC399004F3D07 /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABB6AD5C173A3F2B00EC2E8D /* Carbon.framework */; }; AB7DDA6F173DC39E004F3D07 /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABB6AD5C173A3F2B00EC2E8D /* Carbon.framework */; }; @@ -1835,6 +1845,12 @@ AB2F56EF1704C86900E28885 /* 
utilities.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = utilities.c; sourceTree = ""; }; AB350BA41478AC96007165AC /* IOKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = IOKit.framework; path = System/Library/Frameworks/IOKit.framework; sourceTree = SDKROOT; }; AB350D38147A1D8D007165AC /* English */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = English; path = translations/English.lproj/HID_usage_strings.plist; sourceTree = ""; }; + AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler.cpp; sourceTree = ""; }; + AB37E36D1D6188BC004A2C0D /* colorspacehandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler.h; sourceTree = ""; }; + AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AltiVec.cpp; sourceTree = ""; }; + AB37E36F1D6188BC004A2C0D /* colorspacehandler_AltiVec.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AltiVec.h; sourceTree = ""; }; + AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_SSE2.cpp; sourceTree = ""; }; + AB37E3731D6188BC004A2C0D /* colorspacehandler_SSE2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_SSE2.h; sourceTree = ""; }; AB3ACB6614C2361100D7D192 /* appDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = appDelegate.h; sourceTree = ""; }; AB3ACB6714C2361100D7D192 /* appDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.objcpp; path = appDelegate.mm; sourceTree = ""; }; AB3ACB6814C2361100D7D192 /* cheatWindowDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cheatWindowDelegate.h; sourceTree = ""; }; @@ -2894,6 +2910,19 @@ path = src; sourceTree = ""; }; + AB37E36B1D6188BC004A2C0D /* colorspacehandler */ = { + isa = PBXGroup; + children = ( + AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */, + AB37E36D1D6188BC004A2C0D /* colorspacehandler.h */, + AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */, + AB37E36F1D6188BC004A2C0D /* colorspacehandler_AltiVec.h */, + AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */, + AB37E3731D6188BC004A2C0D /* colorspacehandler_SSE2.h */, + ); + path = colorspacehandler; + sourceTree = ""; + }; AB3ACB6514C2361100D7D192 /* userinterface */ = { isa = PBXGroup; children = ( @@ -3207,6 +3236,7 @@ isa = PBXGroup; children = ( ABBCE2A115ACB29100A2C965 /* AsmJit */, + AB37E36B1D6188BC004A2C0D /* colorspacehandler */, ABD1FF211345ACBF00AF11D1 /* decrypt */, ABD1FF2E1345ACBF00AF11D1 /* libfat */, ABE670241415DE6C00E8E4C9 /* tinyxml */, @@ -4508,6 +4538,8 @@ AB50200A1D09E712002FA150 /* file_path.c in Sources */, AB50200B1D09E712002FA150 /* retro_dirent.c in Sources */, AB50200C1D09E712002FA150 /* retro_stat.c in Sources */, + AB7BB17F1D62C8CC00A7A6E2 /* colorspacehandler.cpp in Sources */, + AB7BB1801D62C8CF00A7A6E2 /* colorspacehandler_AltiVec.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -4687,6 +4719,8 @@ AB5020161D09E712002FA150 /* file_path.c in Sources */, AB5020171D09E712002FA150 /* retro_dirent.c in Sources */, AB5020181D09E712002FA150 /* retro_stat.c in Sources */, + AB37E3801D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E38A1D61895F004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -4896,6 +4930,8 @@ AB50200D1D09E712002FA150 /* file_path.c in Sources */, 
AB50200E1D09E712002FA150 /* retro_dirent.c in Sources */, AB50200F1D09E712002FA150 /* retro_stat.c in Sources */, + AB37E3741D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E3771D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -5105,6 +5141,8 @@ AB5020101D09E712002FA150 /* file_path.c in Sources */, AB5020111D09E712002FA150 /* retro_dirent.c in Sources */, AB5020121D09E712002FA150 /* retro_stat.c in Sources */, + AB37E3781D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E37B1D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -5284,6 +5322,8 @@ AB5020131D09E712002FA150 /* file_path.c in Sources */, AB5020141D09E712002FA150 /* retro_dirent.c in Sources */, AB5020151D09E712002FA150 /* retro_stat.c in Sources */, + AB37E37C1D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E37D1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/desmume/src/cocoa/cocoa_output.mm b/desmume/src/cocoa/cocoa_output.mm index 5d1ef23bf..0a94dd4ae 100644 --- a/desmume/src/cocoa/cocoa_output.mm +++ b/desmume/src/cocoa/cocoa_output.mm @@ -754,7 +754,7 @@ if (dispInfo.pixelBytes == 2) { - ConvertColorBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); + ColorspaceConvertBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); } else if (dispInfo.pixelBytes == 4) { diff --git a/desmume/src/cocoa/cocoa_rom.mm b/desmume/src/cocoa/cocoa_rom.mm index d671da76c..329fa786c 100644 --- a/desmume/src/cocoa/cocoa_rom.mm +++ b/desmume/src/cocoa/cocoa_rom.mm @@ -692,7 +692,7 @@ void RomIconToRGBA8888(uint32_t *bitmapData) // // The first entry always represents the alpha, so we can just ignore it. 
clut[0] = 0x00000000; - ConvertColorBuffer555To8888Opaque((u16 *)iconClutPtr, &clut[1], 15); + ColorspaceConvertBuffer555To8888Opaque((u16 *)iconClutPtr, &clut[1], 15); // Load the image from the icon pixel data. // diff --git a/desmume/src/frontend/modules/ImageOut.cpp b/desmume/src/frontend/modules/ImageOut.cpp index 16ff5473f..77d4aa294 100644 --- a/desmume/src/frontend/modules/ImageOut.cpp +++ b/desmume/src/frontend/modules/ImageOut.cpp @@ -1,65 +1,63 @@ -/* - Copyright (C) 2008-2015 DeSmuME team - - This file is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - This file is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with the this software. If not, see . 
-*/ - -#include -#include -#include "types.h" -#include "ImageOut.h" -#include "formats/rpng.h" -#include "formats/rbmp.h" -#include "GPU.h" - -static u8* Convert15To24(const u16* src, int width, int height) -{ - u8 *tmp_buffer; - u8 *tmp_inc; - tmp_inc = tmp_buffer = (u8 *)malloc(width * height * 3); - - for(int y=0;y(*src++); - *tmp_inc++ = dst&0xFF; - *tmp_inc++ = (dst>>8)&0xFF; - *tmp_inc++ = (dst>>16)&0xFF; - } - } - return tmp_buffer; -} - -int NDS_WritePNG_15bpp(int width, int height, const u16 *data, const char *filename) -{ - u8* tmp = Convert15To24(data,width,height); - bool ok = rpng_save_image_bgr24(filename,tmp,width,height,width*3); - free(tmp); - return ok?1:0; -} - -int NDS_WriteBMP_15bpp(int width, int height, const u16 *data, const char *filename) -{ - u8* tmp = Convert15To24(data,width,height); - bool ok = rbmp_save_image(filename,tmp,width,height,width*3,RBMP_SOURCE_TYPE_BGR24); - free(tmp); - return ok?1:0; -} - -int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char *filename) -{ - bool ok = rbmp_save_image(filename,buf,width,height,width*4,RBMP_SOURCE_TYPE_ARGB8888); - return ok?1:0; +/* + Copyright (C) 2008-2015 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . 
+*/ + +#include +#include +#include "types.h" +#include "ImageOut.h" +#include "formats/rpng.h" +#include "formats/rbmp.h" +#include "GPU.h" + +static u8* Convert15To24(const u16* src, int width, int height) +{ + u8 *tmp_buffer; + u8 *tmp_inc; + tmp_inc = tmp_buffer = (u8 *)malloc(width * height * 3); + + for (int i = 0; i < width*height; i++) + { + u32 dst = ColorspaceConvert555To8888Opaque(*src++); + *tmp_inc++ = dst & 0xFF; + *tmp_inc++ = (dst >> 8) & 0xFF; + *tmp_inc++ = (dst >> 16) & 0xFF; + } + + return tmp_buffer; +} + +int NDS_WritePNG_15bpp(int width, int height, const u16 *data, const char *filename) +{ + u8* tmp = Convert15To24(data,width,height); + bool ok = rpng_save_image_bgr24(filename,tmp,width,height,width*3); + free(tmp); + return ok?1:0; +} + +int NDS_WriteBMP_15bpp(int width, int height, const u16 *data, const char *filename) +{ + u8* tmp = Convert15To24(data,width,height); + bool ok = rbmp_save_image(filename,tmp,width,height,width*3,RBMP_SOURCE_TYPE_BGR24); + free(tmp); + return ok?1:0; +} + +int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char *filename) +{ + bool ok = rbmp_save_image(filename,buf,width,height,width*4,RBMP_SOURCE_TYPE_ARGB8888); + return ok?1:0; } \ No newline at end of file diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 3f1f5128d..f777b6db1 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -605,11 +605,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) ) { - ConvertColorBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); + ColorspaceConvertBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) ) { - 
ConvertColorBuffer6665To8888((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); + ColorspaceConvertBuffer6665To8888((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) || ((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) ) @@ -622,11 +622,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if (this->_outputFormat == NDSColorFormat_BGR666_Rev) { - ConvertColorBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index a936a5c1e..89f86b1d2 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -31,6 +31,10 @@ #include "MMU.h" #include "NDSSystem.h" +#ifdef ENABLE_SSE2 +#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" +#endif + using std::min; using std::max; @@ -452,13 +456,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + 
ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } // Set converted colors to 0 if the palette index is 0. @@ -518,13 +522,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); @@ -581,13 +585,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } // Set converted colors to 0 if the palette index is 0. 
@@ -647,13 +651,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); @@ -882,11 +886,11 @@ public: tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ConvertColor555To6665(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ConvertColor555To6665(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); } else { @@ -896,11 +900,11 @@ public: tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ConvertColor555To8888(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); tmpAlpha[1] = 
_mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ConvertColor555To8888(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); diff --git a/desmume/src/types.h b/desmume/src/types.h index 56b225c3b..22a575256 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -76,6 +76,18 @@ #ifdef __SSE4_2__ #define ENABLE_SSE4_2 #endif + + #ifdef __AVX__ + #define ENABLE_AVX + #endif + + #ifdef __AVX2__ + #define ENABLE_AVX2 + #endif + + #ifdef __ALTIVEC__ + #define ENABLE_ALTIVEC + #endif #endif #ifdef _MSC_VER @@ -223,6 +235,38 @@ typedef u32 uint32; #define uint32 u32 //uint32 is defined in Leopard somewhere, avoid conflicts #endif +#ifdef ENABLE_ALTIVEC + #ifndef __APPLE_ALTIVEC__ + #include + #endif +typedef vector unsigned char v128u8; +typedef vector signed char v128s8; +typedef vector unsigned short v128u16; +typedef vector signed short v128s16; +typedef vector unsigned int v128u32; +typedef vector signed int v128s32; +#endif + +#ifdef ENABLE_SSE2 +#include +typedef __m128i v128u8; +typedef __m128i v128s8; +typedef __m128i v128u16; +typedef __m128i v128s16; +typedef __m128i v128u32; +typedef __m128i v128s32; +#endif + +#ifdef ENABLE_AVX2 +#include +typedef __m256i v256u8; +typedef __m256i v256s8; +typedef __m256i v256u16; +typedef __m256i v256s16; +typedef __m256i v256u32; +typedef __m256i v256s32; +#endif + /*---------- GPU3D fixed-points types -----------*/ typedef s32 f32; diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp new file mode 100644 index 000000000..d0757d7cc --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -0,0 +1,776 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "colorspacehandler.h" + +#if defined(ENABLE_AVX2) + #include "colorspacehandler_AVX2.h" +#elif defined(ENABLE_SSE2) + #include "colorspacehandler_SSE2.h" +#elif defined(ENABLE_ALTIVEC) + #include "colorspacehandler_AltiVec.h" +#endif + +#if defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) + #define USEVECTORSIZE_128 +#endif + +#if defined(ENABLE_AVX2) + #define USEVECTORSIZE_256 +#endif + +// By default, the hand-coded vectorized code will be used instead of a compiler's built-in +// autovectorization (if supported). However, if USEMANUALVECTORIZATION is not defined, then +// the compiler will use autovectorization (if supported). +#if defined(USEVECTORSIZE_128) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_512) + // Comment out USEMANUALVECTORIZATION to disable the hand-coded vectorized code. 
+ #define USEMANUALVECTORIZATION +#endif + +#ifdef USEMANUALVECTORIZATION + #if defined(ENABLE_AVX2) + static const ColorspaceHandler_AVX2 csh; + #elif defined(ENABLE_SSE2) + static const ColorspaceHandler_SSE2 csh; + #elif defined(ENABLE_ALTIVEC) + static const ColorspaceHandler_AltiVec csh; + #else + static const ColorspaceHandler csh; + #endif +#else + static const ColorspaceHandler csh; +#endif + +CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; +CACHE_ALIGN u32 color_555_to_666[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; +CACHE_ALIGN u32 color_555_to_888[32768]; + +//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX +CACHE_ALIGN const u32 material_5bit_to_31bit[] = { + 0x00000000, 0x04210842, 0x08421084, 0x0C6318C6, + 0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE, + 0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6, + 0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE, + 0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7, + 0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF, + 0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7, + 0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF +}; + +// 5-bit to 6-bit conversions use this formula -- dst = (src == 0) ? 
0 : (2*src) + 1 +// Reference GBATEK: http://problemkaputt.de/gbatek.htm#ds3dtextureblending +CACHE_ALIGN const u8 material_5bit_to_6bit[] = { + 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, + 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, + 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, + 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F +}; + +CACHE_ALIGN const u8 material_5bit_to_8bit[] = { + 0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39, + 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B, + 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD, + 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF +}; + +CACHE_ALIGN const u8 material_6bit_to_8bit[] = { + 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, + 0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C, + 0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D, + 0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D, + 0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E, + 0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE, + 0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF, + 0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF +}; + +CACHE_ALIGN const u8 material_3bit_to_8bit[] = { + 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF +}; + +//maybe not very precise +CACHE_ALIGN const u8 material_3bit_to_5bit[] = { + 0, 4, 8, 13, 17, 22, 26, 31 +}; + +//TODO - generate this in the static init method more accurately +CACHE_ALIGN const u8 material_3bit_to_6bit[] = { + 0, 8, 16, 26, 34, 44, 52, 63 +}; + +void ColorspaceHandlerInit() +{ + static bool needInitTables = true; + + if (needInitTables) + { +#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) +#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) +#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | 
material_5bit_to_8bit[(col)&0x1F] ) +#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) + + for (size_t i = 0; i < 32768; i++) + { + color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); + color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); + color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); + + color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); + color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); + color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 ); + } + } +} + +template +FORCEINLINE u32 ColorspaceConvert555To8888Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); +} + +template +FORCEINLINE u32 ColorspaceConvert555To6665Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); +} + +template +FORCEINLINE u32 ColorspaceConvert8888To6665(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; + outColor.g = srcColor.g >> 2; + outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b) >> 2; + outColor.a = srcColor.a >> 3; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert8888To6665(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert8888To6665(srcColorComponent); +} + +template +FORCEINLINE u32 ColorspaceConvert6665To8888(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)]; + outColor.g = material_6bit_to_8bit[srcColor.g]; + outColor.b = material_6bit_to_8bit[((SWAP_RB) ? 
srcColor.r : srcColor.b)]; + outColor.a = material_5bit_to_8bit[srcColor.a]; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert6665To8888(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert6665To8888(srcColorComponent); +} + +template +FORCEINLINE u16 ColorspaceConvert8888To5551(FragmentColor srcColor) +{ + return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 0x0000 : 0x8000 ); +} + +template +FORCEINLINE u16 ColorspaceConvert8888To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert8888To5551(srcColorComponent); +} + +template +FORCEINLINE u16 ColorspaceConvert6665To5551(FragmentColor srcColor) +{ + return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 
0x0000 : 0x8000); +} + +template +FORCEINLINE u16 ColorspaceConvert6665To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert6665To5551(srcColorComponent); +} + +template +void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To8888Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To8888Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To8888Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + } +} + +template +void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To6665Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if 
(IS_UNALIGNED) + { + i = csh.ConvertBuffer555To6665Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To6665Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + } +} + +template +void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To6665_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To6665_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To6665_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To6665(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } +} + +template +void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To8888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To8888_SwapRB(src, dst, pixCountVector); + } + 
} + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To8888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To8888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } +} + +template +void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To5551_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To5551_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To5551_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To5551(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } +} + +template +void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To5551_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = 
csh.ConvertBuffer6665To5551_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To5551_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To5551(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To8888Opaque(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + } + + return i; +} + +size_t 
ColorspaceHandler::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To6665Opaque(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To6665(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To6665_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To8888(src, dst, pixCount); +} + +size_t 
ColorspaceHandler::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To8888_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To5551(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To5551_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (;i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To5551(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, 
size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(src, dst, pixCount); +} + +template u32 ColorspaceConvert555To8888Opaque(const u16 src); +template u32 ColorspaceConvert555To8888Opaque(const u16 src); + +template u32 ColorspaceConvert555To6665Opaque(const u16 src); +template u32 ColorspaceConvert555To6665Opaque(const u16 src); + +template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); +template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); + +template u32 ColorspaceConvert8888To6665(u32 srcColor); +template u32 ColorspaceConvert8888To6665(u32 srcColor); + +template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); +template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); + +template u32 ColorspaceConvert6665To8888(u32 srcColor); +template u32 ColorspaceConvert6665To8888(u32 srcColor); + +template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); +template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); + +template u16 ColorspaceConvert8888To5551(u32 srcColor); +template u16 ColorspaceConvert8888To5551(u32 srcColor); + +template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); +template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); + +template u16 ColorspaceConvert6665To5551(u32 srcColor); +template u16 ColorspaceConvert6665To5551(u32 srcColor); + +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void 
ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, 
size_t pixCount); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h new file mode 100644 index 000000000..362e975ea --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h @@ -0,0 +1,194 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef COLORSPACEHANDLER_H +#define COLORSPACEHANDLER_H + +#include "types.h" +#include +#include + + +enum NDSColorFormat +{ + // The color format information is packed in a 32-bit value. + // The bits are as follows: + // FFFOOOOO AAAAAABB BBBBGGGG GGRRRRRR + // + // F = Flags (see below) + // O = Color order (see below) + // A = Bit count for alpha [0-63] + // B = Bit count for blue [0-63] + // G = Bit count for green [0-63] + // R = Bit count for red [0-63] + // + // Flags: + // Bit 29: Reverse order flag. + // Set = Bits are in reverse order, usually for little-endian usage. + // Cleared = Bits are in normal order, usually for big-endian usage. 
+ // + // Color order bits, 24-28: + // 0x00 = RGBA, common format + // 0x01 = RGAB + // 0x02 = RBGA + // 0x03 = RBAG + // 0x04 = RAGB + // 0x05 = RABG + // 0x06 = GRBA + // 0x07 = GRAB + // 0x08 = GBRA + // 0x09 = GBAR + // 0x0A = GARB + // 0x0B = GABR + // 0x0C = BRGA + // 0x0D = BRAG + // 0x0E = BGRA, common format + // 0x0F = BGAR + // 0x10 = BARG + // 0x11 = BAGR + // 0x12 = ARGB + // 0x13 = ARBG + // 0x14 = AGRB + // 0x15 = AGBR + // 0x16 = ABRG + // 0x17 = ABGR + + // Color formats used for internal processing. + //NDSColorFormat_ABGR1555_Rev = 0x20045145, + //NDSColorFormat_ABGR5666_Rev = 0x20186186, + //NDSColorFormat_ABGR8888_Rev = 0x20208208, + + // Color formats used by the output framebuffers. + NDSColorFormat_BGR555_Rev = 0x20005145, + NDSColorFormat_BGR666_Rev = 0x20006186, + NDSColorFormat_BGR888_Rev = 0x20008208 +}; + +union FragmentColor +{ + u32 color; + struct + { + u8 r,g,b,a; + }; +}; + +extern CACHE_ALIGN const u32 material_5bit_to_31bit[32]; +extern CACHE_ALIGN const u8 material_5bit_to_6bit[32]; +extern CACHE_ALIGN const u8 material_5bit_to_8bit[32]; +extern CACHE_ALIGN const u8 material_6bit_to_8bit[64]; +extern CACHE_ALIGN const u8 material_3bit_to_5bit[8]; +extern CACHE_ALIGN const u8 material_3bit_to_6bit[8]; +extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; + +extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; +extern CACHE_ALIGN u32 color_555_to_666[32768]; +extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; +extern CACHE_ALIGN u32 color_555_to_888[32768]; + +#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color +#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an 
RGBA6665 color with R and B components swapped +#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color + +#ifdef LOCAL_LE + #define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian +#else + #define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian +#endif + +#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color +#define COLOR555TO8888_OPAQUE_SWAP_RB(col) (color_555_to_8888_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque 32-bit color with R and B components swapped +#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color + +#ifdef LOCAL_LE + #define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian +#else + #define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, big-endian +#endif + +//produce a 15bpp color from individual 5bit components +#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) ) + +//produce a 15bpp color from individual 6bit components +#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) ) + +void ColorspaceHandlerInit(); + +template u32 ColorspaceConvert555To8888Opaque(const u16 src); +template u32 ColorspaceConvert555To6665Opaque(const u16 src); +template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); +template u32 ColorspaceConvert8888To6665(u32 srcColor); +template u32 
ColorspaceConvert6665To8888(FragmentColor srcColor); +template u32 ColorspaceConvert6665To8888(u32 srcColor); +template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); +template u16 ColorspaceConvert8888To5551(u32 srcColor); +template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); +template u16 ColorspaceConvert6665To5551(u32 srcColor); + +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +class ColorspaceHandler +{ +public: + ColorspaceHandler() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 
*__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) +{ + FragmentColor ret; + ret.r = r; ret.g = g; ret.b = b; ret.a = a; + return ret; +} + +#endif /* COLORSPACEHANDLER_H */ diff --git 
a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp new file mode 100644 index 000000000..6682bea12 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -0,0 +1,491 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_AVX2.h" + +#ifndef ENABLE_AVX2 + #error This code requires AVX2 support. +#else + +#include + +template +FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) +{ + v256u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); + dstLo = (SWAP_RB) ? 
_mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); + dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); + dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); + + src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); + dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); + dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) ); + dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) +{ + v256u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); + dstLo = (SWAP_RB) ? 
_mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); + dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); + dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); + + src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); + dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); + dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) ); + dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0xFF000000); + ColorspaceConvert555To8888_AVX2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0x1F000000); + ColorspaceConvert555To6665_AVX2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v256u32 rgb; + const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), 
_mm256_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { + rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); + rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + else + { + rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); + } + + return _mm256_or_si256(rgb, a); +} + +template +FORCEINLINE v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v256u32 rgb = _mm256_or_si256( _mm256_and_si256(_mm256_slli_epi32(src, 2), _mm256_set1_epi32(0x00FCFCFC)), _mm256_and_si256(_mm256_srli_epi32(src, 4), _mm256_set1_epi32(0x00030303)) ); + const v256u32 a = _mm256_or_si256( _mm256_and_si256(_mm256_slli_epi32(src, 3), _mm256_set1_epi32(0xF8000000)), _mm256_and_si256(_mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { + rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm256_or_si256(rgb, a); +} + +template +FORCEINLINE v256u16 _ConvertColorBaseTo5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v256u32 rgbLo; + v256u32 rgbHi; + v256u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 17), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_slli_epi32(srcLo, 9), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi 
= _mm256_and_si256(_mm256_srli_epi32(srcHi, 17), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_slli_epi32(srcHi, 9), _mm256_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 1), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 7), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 1), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 7), _mm256_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm256_packs_epi32( _mm256_and_si256(_mm256_srli_epi32(srcLo, 24), _mm256_set1_epi32(0x0000001F)), _mm256_and_si256(_mm256_srli_epi32(srcHi, 24), _mm256_set1_epi32(0x0000001F)) ); + alpha = _mm256_cmpgt_epi16(alpha, _mm256_setzero_si256()); + alpha = _mm256_and_si256(alpha, _mm256_set1_epi16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 19), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_slli_epi32(srcLo, 7), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 19), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 6), 
_mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_slli_epi32(srcHi, 7), _mm256_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 3), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 9), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 3), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 9), _mm256_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm256_packs_epi32( _mm256_srli_epi32(srcLo, 24), _mm256_srli_epi32(srcHi, 24) ); + alpha = _mm256_cmpgt_epi16(alpha, _mm256_setzero_si256()); + alpha = _mm256_and_si256(alpha, _mm256_set1_epi16(0x8000)); + } + + return _mm256_or_si256(_mm256_packs_epi32(rgbLo, rgbHi), alpha); +} + +template +FORCEINLINE v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); +} + +template +FORCEINLINE v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + v256u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_AVX2(src_vec256, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + v256u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_AVX2(src_vec256, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=8) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), ColorspaceConvert8888To6665_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert8888To6665_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=8) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), 
ColorspaceConvert6665To8888_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert6665To8888_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AVX2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) ); + } + } + + return i; +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AVX2::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); + +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); + +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 
&srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +#endif // ENABLE_AVX2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h new file mode 100644 index 000000000..730bf730f --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h @@ -0,0 +1,74 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_AVX2_H +#define COLORSPACEHANDLER_AVX2_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_AVX2 + #warning This header requires AVX2 support. 
+#else + +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +class ColorspaceHandler_AVX2 : public ColorspaceHandler +{ +public: + ColorspaceHandler_AVX2() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t 
ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +#endif // ENABLE_AVX2 + +#endif /* COLORSPACEHANDLER_AVX2_H */ diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp new file mode 100644 index 000000000..b4b39f751 --- /dev/null +++ 
b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -0,0 +1,345 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_Altivec.h" + +#ifndef ENABLE_ALTIVEC + #error This code requires PowerPC AltiVec support. +#else + +template +FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + dstLo = vec_unpackl((vector pixel)srcColor); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){3,3,3,0, 3,3,3,0, 3,3,3,0, 3,3,3,0})), vec_sr((v128u8)dstLo, ((v128u8){2,2,2,0, 2,2,2,0, 2,2,2,0, 2,2,2,0})) ); + dstLo = vec_sel(dstLo, srcAlphaBits32Lo, vec_splat_u32(0xFF000000)); + + dstHi = vec_unpackh((vector pixel)srcColor); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){3,3,3,0, 3,3,3,0, 3,3,3,0, 3,3,3,0})), vec_sr((v128u8)dstHi, ((v128u8){2,2,2,0, 2,2,2,0, 2,2,2,0, 2,2,2,0})) ); + dstHi = vec_sel(dstHi, srcAlphaBits32Hi, vec_splat_u32(0xFF000000)); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 
1) | ((srcRGB5 >> 4) & 0x01) + dstLo = vec_unpackl((vector pixel)srcColor); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){1,1,1,0, 1,1,1,0, 1,1,1,0, 1,1,1,0})), vec_sr((v128u8)dstLo, ((v128u8){4,4,4,0, 4,4,4,0, 4,4,4,0, 4,4,4,0})) ); + dstLo = vec_sel(dstLo, srcAlphaBits32Lo, vec_splat_u32(0xFF000000)); + + dstHi = vec_unpackh((vector pixel)srcColor); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){1,1,1,0, 1,1,1,0, 1,1,1,0, 1,1,1,0})), vec_sr((v128u8)dstHi, ((v128u8){4,4,4,0, 4,4,4,0, 4,4,4,0, 4,4,4,0})) ); + dstHi = vec_sel(dstHi, srcAlphaBits32Hi, vec_splat_u32(0xFF000000)); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = {0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000}; + ColorspaceConvert555To8888_AltiVec(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = {0x1F000000, 0x1F000000, 0x1F000000, 0x1F000000}; + ColorspaceConvert555To6665_AltiVec(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v128u8 rgba = vec_sr( (v128u8)src, ((v128u8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3}) ); + + if (SWAP_RB) + { + rgba = vec_perm( rgba, rgba, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + } + + return (v128u32)rgba; +} + +template +FORCEINLINE v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u8 rgba = vec_or( 
vec_sl((v128u8)src, ((v128u8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3})), vec_sr((v128u8)src, ((v128u8){4,4,4,2, 4,4,4,2, 4,4,4,2, 4,4,4,2})) ); + + if (SWAP_RB) + { + rgba = vec_perm( rgba, rgba, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + } + + return (v128u32)rgba; +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + + v128u16 dstColor; + v128u16 dstAlpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + // Convert alpha + dstAlpha = vec_packsu( vec_and(vec_sr(srcLo, vec_splat_u32(24)), vec_splat_u32(0x0000001F)), vec_and(vec_sr(srcHi, vec_splat_u32(24)), vec_splat_u32(0x0000001F)) ); + dstAlpha = vec_cmpgt(dstAlpha, vec_splat_u16(0)); + dstAlpha = vec_and(dstAlpha, vec_splat_u16(0x8000)); + + // Convert RGB + if (SWAP_RB) + { + rgbLo = vec_perm( srcLo, srcLo, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + rgbHi = vec_perm( srcHi, srcHi, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + + rgbLo = vec_sl( rgbLo, vec_splat_u32(2) ); + rgbHi = vec_sl( rgbHi, vec_splat_u32(2) ); + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + } + else + { + rgbLo = vec_sl( srcLo, vec_splat_u32(2) ); + rgbHi = vec_sl( srcHi, vec_splat_u32(2) ); + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + } + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + // Convert alpha + dstAlpha = vec_packsu( vec_sr(srcLo, vec_splat_u32(24)), vec_sr(srcHi, vec_splat_u32(24)) ); + dstAlpha = vec_cmpgt(dstAlpha, vec_splat_u16(0)); + dstAlpha = vec_and(dstAlpha, vec_splat_u16(0x8000)); + + // Convert RGB + if (SWAP_RB) + { + rgbLo = vec_perm( srcLo, srcLo, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + rgbHi = vec_perm( srcHi, srcHi, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + } + else + { + dstColor = 
(v128u16)vec_packpx(srcLo, srcHi); + } + } + + dstColor = vec_and(dstColor, vec_splat_u16(0x7FFF)); + return vec_or(dstColor, dstAlpha); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u32 dstConvertedLo, dstConvertedHi; + + ColorspaceConvert555To8888Opaque_AltiVec( vec_ld(0, src+i), dstConvertedLo, dstConvertedHi ); + vec_st(dstConvertedHi, 0, dst+i); + vec_st(dstConvertedLo, 16, dst+i); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u32 dstConvertedLo, dstConvertedHi; + + ColorspaceConvert555To6665Opaque_AltiVec( vec_ld(0, src+i), dstConvertedLo, dstConvertedHi ); + vec_st(dstConvertedHi, 0, dst+i); + vec_st(dstConvertedLo, 16, dst+i); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert8888To6665_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert6665To8888_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AltiVec(const u32 *__restrict src, u16 
*__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceConvert8888To5551_AltiVec(vec_ld(0, src+i), vec_ld(16, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AltiVec(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceConvert6665To5551_AltiVec(vec_ld(0, src+i), vec_ld(16, src+i)), 0, dst+i ); + } + + return i; +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AltiVec(src, dst, pixCount); +} + 
+size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void 
ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +#endif // ENABLE_ALTIVEC diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h new file mode 100644 index 000000000..d26e05eba --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h @@ -0,0 +1,64 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . 
+ */ + +#ifndef COLORSPACEHANDLER_ALTIVEC_H +#define COLORSPACEHANDLER_ALTIVEC_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_ALTIVEC + #warning This header requires PowerPC AltiVec support. +#else + +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +// AltiVec has very poor support for dealing with unaligned addresses (it's possible, just +// very obtuse), so we're not even going to bother dealing with any unaligned addresses. 
+class ColorspaceHandler_AltiVec : public ColorspaceHandler +{ +public: + ColorspaceHandler_AltiVec() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +#endif // ENABLE_ALTIVEC + +#endif /* COLORSPACEHANDLER_ALTIVEC_H */ diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp new file mode 100644 index 000000000..fb4ada420 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -0,0 +1,503 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later 
version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_SSE2.h" + +#ifndef ENABLE_SSE2 + #error This code requires SSE2 support. +#else + +#include + +#ifdef ENABLE_SSSE3 +#include +#endif + +template +FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + v128u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); + dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); + dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); + dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); + + src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); + dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); + dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); + dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + v128u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); + dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); + dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); + dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); + + src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); + dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); + dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); + dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = _mm_set1_epi32(0xFF000000); + ColorspaceConvert555To8888_SSE2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = _mm_set1_epi32(0x1F000000); + ColorspaceConvert555To6665_SSE2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3) + v128u32 rgb; + const v128u32 a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); + rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); +#else + rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FC0000)), 18), _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x0000FC00)), 2), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FC)), 14)) ); +#endif + } + else + { + rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); + } + + return _mm_or_si128(rgb, a); +} + +template +FORCEINLINE v128u32 
ColorspaceConvert6665To8888_SSE2(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u32 rgb = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00FCFCFC)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00030303)) ); + const v128u32 a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0xF8000000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); +#else + rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(rgb, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x000000FF)), 16)) ); +#endif + } + + return _mm_or_si128(rgb, a); +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + v128u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 17), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 17), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = 
_mm_and_si128(_mm_srli_epi32(srcLo, 1), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 1), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x0000001F)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x0000001F)) ); + alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); + alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 19), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 19), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 3), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = 
_mm_and_si128(_mm_srli_epi32(srcHi, 3), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm_packs_epi32( _mm_srli_epi32(srcLo, 24), _mm_srli_epi32(srcHi, 24) ); + alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); + alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); + } + + return _mm_or_si128(_mm_packs_epi32(rgbLo, rgbHi), alpha); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + v128u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_SSE2(src_vec128, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + else + { + _mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + v128u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_SSE2(src_vec128, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + else + { + _mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert8888To6665_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert8888To6665_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert6665To8888_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert6665To8888_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_SSE2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), ColorspaceConvert8888To5551_SSE2(_mm_loadu_si128((v128u32 *)(src+i)), _mm_loadu_si128((v128u32 *)(src+i+4))) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), ColorspaceConvert8888To5551_SSE2(_mm_load_si128((v128u32 *)(src+i)), _mm_load_si128((v128u32 *)(src+i+4))) ); + } + } + + return i; +} + +template +size_t 
ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), ColorspaceConvert6665To5551_SSE2(_mm_loadu_si128((v128u32 *)(src+i)), _mm_loadu_si128((v128u32 *)(src+i+4))) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), ColorspaceConvert6665To5551_SSE2(_mm_load_si128((v128u32 *)(src+i)), _mm_load_si128((v128u32 *)(src+i+4))) ); + } + } + + return i; +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t 
pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + 
return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const 
v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +#endif // ENABLE_SSE2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h new file mode 100644 index 000000000..5b44577ea --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h @@ -0,0 +1,74 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
+ + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef COLORSPACEHANDLER_SSE2_H +#define COLORSPACEHANDLER_SSE2_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_SSE2 + #warning This header requires SSE2 support. +#else + +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +class ColorspaceHandler_SSE2 : public ColorspaceHandler +{ +public: + ColorspaceHandler_SSE2() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict
dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict 
src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +#endif // ENABLE_SSE2 + +#endif /* COLORSPACEHANDLER_SSE2_H */ diff --git a/desmume/src/version.cpp b/desmume/src/version.cpp index 2051e59ae..dbaae6651 100644 --- a/desmume/src/version.cpp +++ b/desmume/src/version.cpp @@ -59,44 +59,41 @@ #define DESMUME_PLATFORM_STRING "" #endif -#define DESMUME_SSE_STRING "" -#define DESMUME_AVX_STRING "" +#define DESMUME_CPUEXT_PRIMARY_STRING "" +#define DESMUME_CPUEXT_SECONDARY_STRING "" -#ifdef ENABLE_SSE - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE" -#endif -#ifdef ENABLE_SSE2 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE2" -#endif -#ifdef ENABLE_SSE3 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE3" -#endif -#ifdef ENABLE_SSSE3 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSSE3" -#endif -#ifdef ENABLE_SSE4_1 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE4.1" -#endif -#ifdef ENABLE_SSE4_2 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE4.2" -#endif -#ifdef ENABLE_AVX - #undef DESMUME_AVX_STRING - #define DESMUME_AVX_STRING "+AVX" -#endif -#ifdef ENABLE_AVX2 - #undef DESMUME_AVX_STRING - #define DESMUME_AVX_STRING "+AVX2" +#if defined(ENABLE_SSE4_2) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.2" +#elif defined(ENABLE_SSE4_1) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.1" +#elif defined(ENABLE_SSSE3) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSSE3" +#elif defined(ENABLE_SSE3) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE3" +#elif defined(ENABLE_SSE2) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE2" +#elif defined(ENABLE_SSE) + #undef 
DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE" +#elif defined(ENABLE_ALTIVEC) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec" #endif -#define DESMUME_CPUEXT_STRING DESMUME_SSE_STRING DESMUME_AVX_STRING +#if defined(ENABLE_AVX2) + #undef DESMUME_CPUEXT_SECONDARY_STRING + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX2" +#elif defined(ENABLE_AVX) + #undef DESMUME_CPUEXT_SECONDARY_STRING + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX" +#endif +#define DESMUME_CPUEXT_STRING DESMUME_CPUEXT_PRIMARY_STRING DESMUME_CPUEXT_SECONDARY_STRING #ifdef DEVELOPER #define DESMUME_FEATURE_STRING " dev+" diff --git a/desmume/src/windows/DeSmuME.vcxproj b/desmume/src/windows/DeSmuME.vcxproj index 5354571f2..53d2c00d8 100644 --- a/desmume/src/windows/DeSmuME.vcxproj +++ b/desmume/src/windows/DeSmuME.vcxproj @@ -171,6 +171,8 @@ + + @@ -442,6 +444,8 @@ + + diff --git a/desmume/src/windows/DeSmuME.vcxproj.filters b/desmume/src/windows/DeSmuME.vcxproj.filters index 2e317d9d3..0a0742ced 100644 --- a/desmume/src/windows/DeSmuME.vcxproj.filters +++ b/desmume/src/windows/DeSmuME.vcxproj.filters @@ -121,6 +121,9 @@ {18cba3ce-aaa6-441d-8111-408d0fcef7d2} + + {db5dc512-2b75-4476-8cac-75fd4acfd85f} + @@ -966,6 +969,12 @@ Core\libretro-common\file + + Core\utils\colorspacehandler + + + Core\utils\colorspacehandler + @@ -1739,6 +1748,12 @@ Core\libretro-common\include\compat + + Core\utils\colorspacehandler + + + Core\utils\colorspacehandler + diff --git a/desmume/src/windows/aviout.cpp b/desmume/src/windows/aviout.cpp index cd34ac559..6a3464fb3 100644 --- a/desmume/src/windows/aviout.cpp +++ b/desmume/src/windows/aviout.cpp @@ -316,13 +316,14 @@ static void do_video_conversion(AVIFile* avi, const u16* buffer) int height = avi->prescaleLevel*384; u8* outbuf = avi_file->convert_buffer + width*(height-1)*3; - for(int y=0;y(*buffer++); - *(u32 *)outbuf = (dst & 0x00FFFFFF) | (*(u32 *)outbuf & 0xFF000000); - outbuf += 3; + u32 dst = 
ColorspaceConvert555To8888Opaque(*buffer++); + *outbuf++ = dst & 0xFF; + *outbuf++ = (dst >> 8) & 0xFF; + *outbuf++ = (dst >> 16) & 0xFF; } outbuf -= width*3*2; diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index aaabebd3a..2e8b4b314 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -1920,7 +1920,7 @@ static void DoDisplay(bool firstTime) //convert pixel format to 32bpp for compositing //why do we do this over and over? well, we are compositing to //filteredbuffer32bpp, and it needs to get refreshed each frame. - ConvertColorBuffer555To8888Opaque((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); + ColorspaceConvertBuffer555To8888Opaque((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); if(firstTime) { From 8c60f5fdf38a97e6309c5a1ac0ad20d8dea36429 Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 16 Aug 2016 20:00:36 +0000 Subject: [PATCH 8/9] GPU: - Use more consistent behavior when forcing the 3D rendering to finish. - Guarantee that 3D rendering will always be finished at the end of V-blank. 
--- desmume/src/GPU.cpp | 24 ++++++++++++++++++++++-- desmume/src/GPU.h | 1 + desmume/src/gfx3d.cpp | 21 +++++---------------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index a252d92c1..eb6cce8c8 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -6879,6 +6879,26 @@ void GPUSubsystem::Reset() osd->clear(); } +// Completes an in-flight 3D render, if the current renderer reports that one still needs finishing. +// While RenderFinish() runs, both framebuffer flush states are forced to willFlush; +// afterwards the previously queried states are restored, the needs-finish flag is cleared, +// and this->_event->DidRender3DEnd() is invoked. Otherwise this function does nothing. +void GPUSubsystem::ForceRender3DFinishAndFlush(bool willFlush) +{ + if (CurrentRenderer->GetRenderNeedsFinish()) + { + bool need3DDisplayFramebuffer; + bool need3DCaptureFramebuffer; + CurrentRenderer->GetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); + + CurrentRenderer->SetFramebufferFlushStates(willFlush, willFlush); + CurrentRenderer->RenderFinish(); + CurrentRenderer->SetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); + CurrentRenderer->SetRenderNeedsFinish(false); + this->_event->DidRender3DEnd(); + } +} + void GPUSubsystem::UpdateRenderProperties() { this->_engineMain->vramBlockOBJIndex = VRAM_NO_3D_USAGE; @@ -7004,7 +7024,7 @@ void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h, void *clientNati return; } - CurrentRenderer->RenderFinish(); + this->ForceRender3DFinishAndFlush(false); const float customWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; const float customHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT; @@ -7146,7 +7166,7 @@ void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h) void GPUSubsystem::SetColorFormat(const NDSColorFormat outputFormat, void *clientNativeBuffer, void *clientCustomBuffer) { - CurrentRenderer->RenderFinish(); + this->ForceRender3DFinishAndFlush(false); this->_displayInfo.colorFormat = outputFormat; this->_displayInfo.pixelBytes = (outputFormat == NDSColorFormat_BGR555_Rev) ?
sizeof(u16) : sizeof(FragmentColor); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index a32857d36..12e6cbc9d 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1627,6 +1627,7 @@ public: GPUEventHandler* GetEventHandler(); void Reset(); + void ForceRender3DFinishAndFlush(bool willFlush); const NDSDisplayInfo& GetDisplayInfo(); // Frontends need to call this whenever they need to read the video buffers from the emulator core void SetDisplayDidCustomRender(NDSDisplayID displayID, bool theState); diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index d93485685..d882b981c 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -527,7 +527,7 @@ void gfx3d_deinit() void gfx3d_reset() { - CurrentRenderer->RenderFinish(); + GPU->ForceRender3DFinishAndFlush(false); #ifdef _SHOW_VTX_COUNTERS max_polys = max_verts = 0; @@ -2300,23 +2300,12 @@ void gfx3d_VBlankSignal() void gfx3d_VBlankEndSignal(bool skipFrame) { + GPU->ForceRender3DFinishAndFlush(false); + if (!drawPending) return; if (skipFrame) return; - - drawPending = FALSE; - if (CurrentRenderer->GetRenderNeedsFinish()) - { - bool need3DDisplayFramebuffer; - bool need3DCaptureFramebuffer; - CurrentRenderer->GetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); - - CurrentRenderer->SetFramebufferFlushStates(false, false); - CurrentRenderer->RenderFinish(); - CurrentRenderer->SetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); - CurrentRenderer->SetRenderNeedsFinish(false); - GPU->GetEventHandler()->DidRender3DEnd(); - } + drawPending = FALSE; GPU->GetEventHandler()->DidRender3DBegin(); @@ -2534,7 +2523,7 @@ void gfx3d_Update3DFramebuffers(FragmentColor *framebufferRGBA6665, u16 *framebu //-------------savestate void gfx3d_savestate(EMUFILE* os) { - CurrentRenderer->RenderFinish(); + GPU->ForceRender3DFinishAndFlush(true); //version write32le(4,os); From 9c1f523a725abca2fcfbf07cd11d12077154a80c Mon Sep 17 00:00:00 2001 From: zeromus Date: 
Tue, 16 Aug 2016 22:00:27 +0000 Subject: [PATCH 9/9] support --3d-render in common commandline parsing and use in windows port as a demo --- desmume/src/commandline.cpp | 16 ++++++++++++++++ desmume/src/commandline.h | 14 +++++++++++++- desmume/src/windows/main.cpp | 7 +++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp index 476076335..049fa7887 100644 --- a/desmume/src/commandline.cpp +++ b/desmume/src/commandline.cpp @@ -66,6 +66,7 @@ CommandLine::CommandLine() , arm7_gdb_port(0) , start_paused(FALSE) , autodetect_method(-1) +, render3d(COMMANDLINE_RENDER3D_DEFAULT) { #ifndef HOST_WINDOWS disable_sound = 0; @@ -92,6 +93,8 @@ static const char* help_string = \ " --num-cores N Override numcores detection and use this many" ENDL " --spu-synch Use SPU synch (crackles; helps streams; default ON)" ENDL " --spu-method N Select SPU synch method: 0:N, 1:Z, 2:P; default 0" ENDL +" --3d-render [NONE|SW|AUTOGL|GL|OLDGL]" ENDL +" Select 3d renderer; default SW" ENDL #ifndef HOST_WINDOWS " --disable-sound Disables the sound output" ENDL " --disable-limiter Disables the 60fps limiter" ENDL @@ -154,6 +157,7 @@ ENDL #define OPT_NUMCORES 1 #define OPT_SPU_METHOD 2 +#define OPT_3D_RENDER 3 #define OPT_JIT_SIZE 100 #define OPT_CONSOLE_TYPE 200 @@ -183,6 +187,8 @@ ENDL bool CommandLine::parse(int argc,char **argv) { + std::string _render3d; + int opt_help = 0; int option_index = 0; for(;;) @@ -197,6 +203,7 @@ bool CommandLine::parse(int argc,char **argv) { "num-cores", required_argument, NULL, OPT_NUMCORES }, { "spu-synch", no_argument, &_spu_sync_mode, 1 }, { "spu-method", required_argument, NULL, OPT_SPU_METHOD }, + { "3d-render", required_argument, NULL, OPT_3D_RENDER }, #ifndef HOST_WINDOWS { "disable-sound", no_argument, &disable_sound, 1}, { "disable-limiter", no_argument, &disable_limiter, 1}, @@ -265,6 +272,7 @@ bool CommandLine::parse(int argc,char **argv) //user settings case OPT_NUMCORES: _num_cores =
atoi(optarg); break; case OPT_SPU_METHOD: _spu_sync_method = atoi(optarg); break; + case OPT_3D_RENDER: _render3d = optarg; break; //sync settings case OPT_JIT_SIZE: _jit_size = atoi(optarg); break; @@ -343,6 +351,14 @@ bool CommandLine::parse(int argc,char **argv) CommonSettings.DebugConsole = true; } + //process 3d renderer + _render3d = strtoupper(_render3d); + if(_render3d == "NONE") render3d = COMMANDLINE_RENDER3D_NONE; + if(_render3d == "SW") render3d = COMMANDLINE_RENDER3D_SW; + if(_render3d == "OLDGL") render3d = COMMANDLINE_RENDER3D_OLDGL; + if(_render3d == "AUTOGL") render3d = COMMANDLINE_RENDER3D_AUTOGL; + if(_render3d == "GL") render3d = COMMANDLINE_RENDER3D_GL; + if (autodetect_method != -1) CommonSettings.autodetectBackupMethod = autodetect_method; diff --git a/desmume/src/commandline.h b/desmume/src/commandline.h index 40ab7bc34..b77239400 100644 --- a/desmume/src/commandline.h +++ b/desmume/src/commandline.h @@ -24,17 +24,29 @@ //hacky commandline options that i didnt want to route through commonoptions extern int _commandline_linux_nojoy; +#define COMMANDLINE_RENDER3D_DEFAULT 0 +#define COMMANDLINE_RENDER3D_NONE 1 +#define COMMANDLINE_RENDER3D_SW 2 +#define COMMANDLINE_RENDER3D_OLDGL 3 +#define COMMANDLINE_RENDER3D_GL 4 +#define COMMANDLINE_RENDER3D_AUTOGL 5 + //this class will also eventually try to take over the responsibility of using the args that it handles //for example: preparing the emulator run by loading the rom, savestate, and/or movie in the correct pattern. //it should also populate CommonSettings with its initial values +//EDIT: not really. combining this with what a frontend wants to do is complicated. 
+//you might design the API so that the frontend sets all those up, but I'm not sure I like that +//Really, this should be a passive structure that just collects the results provided by the shared command line processing, to be used later as appropriate +//(and the CommonSettings setup REMOVED or at least refactored into a separate method) class CommandLine { public: - //actual options: these may move to another sturct + //actual options: these may move to another struct int load_slot; int depth_threshold; int autodetect_method; + int render3d; std::string nds_file; std::string play_movie_file; std::string record_movie_file; diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index 2e8b4b314..9f334fed3 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -3283,6 +3283,13 @@ int _main() cur3DCore = GPU3D_NULL; else if(cur3DCore == GPU3D_NULL) // this value shouldn't be saved anymore cur3DCore = GPU3D_DEFAULT; + + if(cmdline.render3d == COMMANDLINE_RENDER3D_NONE) cur3DCore = GPU3D_NULL; + if(cmdline.render3d == COMMANDLINE_RENDER3D_SW) cur3DCore = GPU3D_SWRAST; + if(cmdline.render3d == COMMANDLINE_RENDER3D_OLDGL) cur3DCore = GPU3D_OPENGL_OLD; + if(cmdline.render3d == COMMANDLINE_RENDER3D_GL) cur3DCore = GPU3D_OPENGL_3_2; //no way of forcing it, at least not right now. I dont care. + if(cmdline.render3d == COMMANDLINE_RENDER3D_AUTOGL) cur3DCore = GPU3D_OPENGL_3_2; //this will fallback i guess + CommonSettings.GFX3D_HighResolutionInterpolateColor = GetPrivateProfileBool("3D", "HighResolutionInterpolateColor", 1, IniName); CommonSettings.GFX3D_EdgeMark = GetPrivateProfileBool("3D", "EnableEdgeMark", 1, IniName); CommonSettings.GFX3D_Fog = GetPrivateProfileBool("3D", "EnableFog", 1, IniName);