diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index dd9da900a..b1993ae37 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -38,11 +38,18 @@ #include #endif +// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the +// shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can +// then substitute the palignr instruction with an SSE2 equivalent. immShiftCount is expanded twice, so pass only a compile-time constant. +#if defined(ENABLE_SSE2) && !defined(ENABLE_SSSE3) + #define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128((a), 16-(immShiftCount)), _mm_srli_si128((b), (immShiftCount))) +#endif + // Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to // pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit // mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it // should work fine for both SSE4.1 and SSE2. -#if !defined(_SMMINTRIN_H) && defined(__EMMINTRIN_H) +#if defined(ENABLE_SSE2) && !defined(ENABLE_SSE4_1) #define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a))) #endif diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp index 624f52e84..15959e45a 100644 --- a/desmume/src/commandline.cpp +++ b/desmume/src/commandline.cpp @@ -26,7 +26,7 @@ #include "slot2.h" #include "NDSSystem.h" #include "utils/xstring.h" -#include "compat/getopt.h" +#include //#include "frontend/modules/mGetOpt.h" //to test with this, make sure global `optind` is initialized to 1 #define printerror(...) 
fprintf(stderr, __VA_ARGS__) @@ -194,9 +194,9 @@ bool CommandLine::parse(int argc,char **argv) { "help", no_argument, &opt_help, 1 }, //user settings - { "num-cores", required_argument, nullptr, OPT_NUMCORES }, + { "num-cores", required_argument, NULL, OPT_NUMCORES }, { "spu-synch", no_argument, &_spu_sync_mode, 1 }, - { "spu-method", required_argument, nullptr, OPT_SPU_METHOD }, + { "spu-method", required_argument, NULL, OPT_SPU_METHOD }, #ifndef HOST_WINDOWS { "disable-sound", no_argument, &disable_sound, 1}, { "disable-limiter", no_argument, &disable_limiter, 1}, @@ -214,41 +214,41 @@ bool CommandLine::parse(int argc,char **argv) { "backupmem-db", no_argument, &autodetect_method, 1}, //system equipment - { "console-type", required_argument, nullptr, OPT_CONSOLE_TYPE }, - { "bios-arm9", required_argument, nullptr, OPT_ARM9}, - { "bios-arm7", required_argument, nullptr, OPT_ARM7}, + { "console-type", required_argument, NULL, OPT_CONSOLE_TYPE }, + { "bios-arm9", required_argument, NULL, OPT_ARM9}, + { "bios-arm7", required_argument, NULL, OPT_ARM7}, { "bios-swi", required_argument, &_bios_swi, 1}, //slot-1 contents - { "slot1", required_argument, nullptr, OPT_SLOT1}, + { "slot1", required_argument, NULL, OPT_SLOT1}, { "preload-rom", no_argument, &_load_to_memory, 1}, - { "slot1-fat-dir", required_argument, nullptr, OPT_SLOT1_FAT_DIR}, + { "slot1-fat-dir", required_argument, NULL, OPT_SLOT1_FAT_DIR}, //slot-2 contents - { "cflash-image", required_argument, nullptr, OPT_SLOT2_CFLASH_IMAGE}, - { "cflash-path", required_argument, nullptr, OPT_SLOT2_CFLASH_DIR}, - { "gbaslot-rom", required_argument, nullptr, OPT_SLOT2_GBAGAME}, + { "cflash-image", required_argument, NULL, OPT_SLOT2_CFLASH_IMAGE}, + { "cflash-path", required_argument, NULL, OPT_SLOT2_CFLASH_DIR}, + { "gbaslot-rom", required_argument, NULL, OPT_SLOT2_GBAGAME}, //commands { "start-paused", no_argument, &start_paused, 1}, - { "load-slot", required_argument, nullptr, OPT_LOAD_SLOT}, - { "play-movie", 
required_argument, nullptr, OPT_PLAY_MOVIE}, - { "record-movie", required_argument, nullptr, OPT_RECORD_MOVIE}, + { "load-slot", required_argument, NULL, OPT_LOAD_SLOT}, + { "play-movie", required_argument, NULL, OPT_PLAY_MOVIE}, + { "record-movie", required_argument, NULL, OPT_RECORD_MOVIE}, //video filters - { "scanline-filter-a", required_argument, nullptr, OPT_SCANLINES_A}, - { "scanline-filter-b", required_argument, nullptr, OPT_SCANLINES_B}, - { "scanline-filter-c", required_argument, nullptr, OPT_SCANLINES_C}, - { "scanline-filter-d", required_argument, nullptr, OPT_SCANLINES_D}, + { "scanline-filter-a", required_argument, NULL, OPT_SCANLINES_A}, + { "scanline-filter-b", required_argument, NULL, OPT_SCANLINES_B}, + { "scanline-filter-c", required_argument, NULL, OPT_SCANLINES_C}, + { "scanline-filter-d", required_argument, NULL, OPT_SCANLINES_D}, //debugging #ifdef GDB_STUB - { "arm9gdb", required_argument, nullptr, OPT_ARM9GDB}, - { "arm7gdb", required_argument, nullptr, OPT_ARM7GDB}, + { "arm9gdb", required_argument, NULL, OPT_ARM9GDB}, + { "arm7gdb", required_argument, NULL, OPT_ARM7GDB}, #endif //utilities - { "advanscene-import", required_argument, nullptr, OPT_ADVANSCENE}, + { "advanscene-import", required_argument, NULL, OPT_ADVANSCENE}, {0,0,0,0} }; @@ -355,7 +355,7 @@ bool CommandLine::parse(int argc,char **argv) free(_bios_arm9); free(_bios_arm7); - _bios_arm9 = _bios_arm7 = nullptr; + _bios_arm9 = _bios_arm7 = NULL; //remaining argument should be an NDS file, and nothing more int remain = argc-optind; diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index a0504cdfd..4a460ed1e 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -904,42 +904,42 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) const __m128i clearColor1 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex1)); const __m128i clearDepth0 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex0)); const __m128i 
clearDepth1 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex1)); -#ifdef ENABLE_SSSE3 + switch (shiftCount) { case 1: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 7 * sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 7 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 14); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 14); break; case 2: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 6 * sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 6 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 12); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 12); break; case 3: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 5 * sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 5 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 10); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 10); break; case 4: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 4 * sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 4 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 8); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 8); break; case 5: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 3 * sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 3 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 6); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 6); break; case 6: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 2 * sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 2 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 4); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 4); break; case 7: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 1 * 
sizeof(u16)); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 1 * sizeof(u16)); + clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 2); + clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 2); break; default: @@ -947,50 +947,6 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) clearDepth_vec128 = _mm_setzero_si128(); break; } -#else - switch (shiftCount) - { - case 1: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 1 * sizeof(u16)), _mm_srli_si128(clearColor0, 7 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 1 * sizeof(u16)), _mm_srli_si128(clearDepth0, 7 * sizeof(u16)) ); - break; - - case 2: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 2 * sizeof(u16)), _mm_srli_si128(clearColor0, 6 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 2 * sizeof(u16)), _mm_srli_si128(clearDepth0, 6 * sizeof(u16)) ); - break; - - case 3: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 3 * sizeof(u16)), _mm_srli_si128(clearColor0, 5 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 3 * sizeof(u16)), _mm_srli_si128(clearDepth0, 5 * sizeof(u16)) ); - break; - - case 4: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 4 * sizeof(u16)), _mm_srli_si128(clearColor0, 4 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 4 * sizeof(u16)), _mm_srli_si128(clearDepth0, 4 * sizeof(u16)) ); - break; - - case 5: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 5 * sizeof(u16)), _mm_srli_si128(clearColor0, 3 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 5 * sizeof(u16)), _mm_srli_si128(clearDepth0, 3 * sizeof(u16)) ); - break; - - case 6: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 6 * sizeof(u16)), _mm_srli_si128(clearColor0, 2 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 6 * 
sizeof(u16)), _mm_srli_si128(clearDepth0, 2 * sizeof(u16)) ); - break; - - case 7: - clearColor = _mm_or_si128( _mm_slli_si128(clearColor1, 7 * sizeof(u16)), _mm_srli_si128(clearColor0, 1 * sizeof(u16)) ); - clearDepth_vec128 = _mm_or_si128( _mm_slli_si128(clearDepth1, 7 * sizeof(u16)), _mm_srli_si128(clearDepth0, 1 * sizeof(u16)) ); - break; - - default: - clearColor = _mm_setzero_si128(); - clearDepth_vec128 = _mm_setzero_si128(); - break; - } -#endif } const __m128i clearDepthValue = _mm_and_si128(clearDepth_vec128, _mm_set1_epi16(0x7FFF));