diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp index 542bae9fc..38f10876b 100644 --- a/desmume/src/FIFO.cpp +++ b/desmume/src/FIFO.cpp @@ -237,6 +237,15 @@ void GFX_FIFOsend(u8 cmd, u32 param) if(IsMatrixStackCommand(cmd)) gxFIFO.matrix_stack_op_size++; + //along the same lines: + //american girls julie finds a way will put a bunch of stuff and then a box test into the fifo and then immediately test the busy flag + //so we need to set the busy flag here. + //does it expect the fifo to be running then? well, it's definitely jammed -- making it unjammed at one point did fix this bug. + //it's still not clear whether we're handling the immediate vs fifo commands properly at all :( + //anyway, here we go, similar treatment. consider this a hack. + if(cmd == 0x70) MMU_new.gxstat.tb = 1; //just set the flag--you're insane if you queue more than one of these anyway + if(cmd == 0x71) MMU_new.gxstat.tb = 1; + if(gxFIFO.size>=HACK_GXIFO_SIZE) { printf("--FIFO FULL-- : %d\n",gxFIFO.size); } diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 01e1ca16e..94c5737a1 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -18,6 +18,14 @@ along with the this software. If not, see . 
*/ +#ifdef FASTBUILD + #undef FORCEINLINE + #define FORCEINLINE + //compilation speed hack (cuts time exactly in half by cutting out permutations) + #define DISABLE_MOSAIC + #define DISABLE_COLOREFFECTDISABLEHINT +#endif + #include "GPU.h" #include @@ -40,75 +48,8 @@ #include "matrix.h" #include "emufile.h" -#ifdef FASTBUILD - #undef FORCEINLINE - #define FORCEINLINE - //compilation speed hack (cuts time exactly in half by cutting out permutations) - #define DISABLE_MOSAIC -#endif - u32 Render3DFramesPerSecond; -CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; -CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; -CACHE_ALIGN u32 color_555_to_666[32768]; -CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; -CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; -CACHE_ALIGN u32 color_555_to_888[32768]; - -//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX -CACHE_ALIGN const u32 material_5bit_to_31bit[] = { - 0x00000000, 0x04210842, 0x08421084, 0x0C6318C6, - 0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE, - 0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6, - 0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE, - 0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7, - 0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF, - 0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7, - 0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF -}; - -// 5-bit to 6-bit conversions use this formula -- dst = (src == 0) ? 
0 : (2*src) + 1 -// Reference GBATEK: http://problemkaputt.de/gbatek.htm#ds3dtextureblending -CACHE_ALIGN const u8 material_5bit_to_6bit[] = { - 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, - 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, - 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, - 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F -}; - -CACHE_ALIGN const u8 material_5bit_to_8bit[] = { - 0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39, - 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B, - 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD, - 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF -}; - -CACHE_ALIGN const u8 material_6bit_to_8bit[] = { - 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, - 0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C, - 0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D, - 0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D, - 0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E, - 0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE, - 0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF, - 0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF -}; - -CACHE_ALIGN const u8 material_3bit_to_8bit[] = { - 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF -}; - -//maybe not very precise -CACHE_ALIGN const u8 material_3bit_to_5bit[] = { - 0, 4, 8, 13, 17, 22, 26, 31 -}; - -//TODO - generate this in the static init method more accurately -CACHE_ALIGN const u8 material_3bit_to_6bit[] = { - 0, 8, 16, 26, 34, 44, 52, 63 -}; - //instantiate static instance u16 GPUEngineBase::_brightnessUpTable555[17][0x8000]; FragmentColor GPUEngineBase::_brightnessUpTable666[17][0x8000]; @@ -167,7 +108,7 @@ const CACHE_ALIGN BGLayerSize GPUEngineBase::_BGLayerSizeLUT[8][4] = { {{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext direct }; -static void ExpandLine8(u8 *__restrict dst, const u8 *__restrict src, size_t dstLength) +static FORCEINLINE void ExpandLine8(u8 *__restrict dst, const u8 *__restrict src, size_t dstLength) { #ifdef ENABLE_SSSE3 const bool isIntegerScale = ((dstLength % 
GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0); @@ -1655,11 +1596,11 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - dstColor32.color = ConvertColor555To6665Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); break; case NDSColorFormat_BGR888_Rev: - dstColor32.color = ConvertColor555To8888Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); break; } @@ -1682,11 +1623,11 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - dstColor32.color = ConvertColor555To6665Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); break; case NDSColorFormat_BGR888_Rev: - dstColor32.color = ConvertColor555To8888Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); break; } @@ -1767,11 +1708,11 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - dstColor32.color = ConvertColor555To6665Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); break; case NDSColorFormat_BGR888_Rev: - dstColor32.color = ConvertColor555To8888Opaque(srcColor16); + dstColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); break; } break; @@ -1833,13 +1774,13 @@ FORCEINLINE void GPUEngineBase::_RenderPixel(GPUEngineCompositorInfo &compInfo, break; case NDSColorFormat_BGR666_Rev: - srcColor32.color = ConvertColor555To6665Opaque(srcColor16); + srcColor32.color = ColorspaceConvert555To6665Opaque(srcColor16); dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, blendEVA, blendEVB); dstColor32.a = 0x1F; break; case NDSColorFormat_BGR888_Rev: - srcColor32.color = ConvertColor555To8888Opaque(srcColor16); + srcColor32.color = ColorspaceConvert555To8888Opaque(srcColor16); dstColor32 = this->_ColorEffectBlend(srcColor32, dstColor32, 
blendEVA, blendEVB); dstColor32.a = 0xFF; break; @@ -2132,7 +2073,7 @@ FORCEINLINE void GPUEngineBase::_RenderPixel3D(GPUEngineCompositorInfo &compInfo // Render the pixel using the selected color effect. if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) { - const u16 srcColor16 = ConvertColor6665To5551(srcColor32); + const u16 srcColor16 = ColorspaceConvert6665To5551(srcColor32); switch (selectedEffect) { @@ -2695,13 +2636,13 @@ void GPUEngineBase::_RenderPixelsCustom(GPUEngineCompositorInfo &compInfo) if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) { - ConvertColor555To6665Opaque(src16[0], src[0], src[1]); - ConvertColor555To6665Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); } else { - ConvertColor555To8888Opaque(src16[0], src[0], src[1]); - ConvertColor555To8888Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); } } @@ -2796,13 +2737,13 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(GPUEngineCompositorInfo &compInfo) { if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) { - ConvertColor555To6665Opaque(src16[0], src[0], src[1]); - ConvertColor555To6665Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To6665Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To6665Opaque_SSE2(src16[1], src[2], src[3]); } else { - ConvertColor555To8888Opaque(src16[0], src[0], src[1]); - ConvertColor555To8888Opaque(src16[1], src[2], src[3]); + ColorspaceConvert555To8888Opaque_SSE2(src16[0], src[0], src[1]); + ColorspaceConvert555To8888Opaque_SSE2(src16[1], src[2], src[3]); } } @@ -4502,7 +4443,7 @@ void GPUEngineBase::UpdateVRAM3DUsageProperties_OBJLayer(const size_t bankIndex) } template -void GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void 
GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo) { bool useCustomVRAM = false; @@ -4538,26 +4479,28 @@ void GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo) } template -void GPUEngineBase::_RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo) { this->_RenderLine_LayerBG_Final(compInfo); } template -void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo) { +#ifndef DISABLE_COLOREFFECTDISABLEHINT if (compInfo.renderState.colorEffect == ColorEffect_Disable) { this->_RenderLine_LayerBG_ApplyColorEffectDisabledHint(compInfo); } else +#endif { this->_RenderLine_LayerBG_ApplyColorEffectDisabledHint(compInfo); } } template -void GPUEngineBase::_RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo) +FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo) { if (ISDEBUGRENDER) { @@ -4951,7 +4894,7 @@ void GPUEngineBase::ResolveCustomRendering() void GPUEngineBase::ResolveRGB666ToRGB888() { - ConvertColorBuffer6665To8888((u32 *)this->renderedBuffer, (u32 *)this->renderedBuffer, this->renderedWidth * this->renderedHeight); + ColorspaceConvertBuffer6665To8888((u32 *)this->renderedBuffer, (u32 *)this->renderedBuffer, this->renderedWidth * this->renderedHeight); } void GPUEngineBase::ResolveToCustomFramebuffer() @@ -5575,12 +5518,12 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) case NDSColorFormat_BGR666_Rev: renderedLineSrcA16 = (u16 *)malloc_alignedCacheLine(compInfo.line.pixelCount * sizeof(u16)); - ConvertColorBuffer6665To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, compInfo.line.pixelCount); + ColorspaceConvertBuffer6665To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, 
compInfo.line.pixelCount); break; case NDSColorFormat_BGR888_Rev: renderedLineSrcA16 = (u16 *)malloc_alignedCacheLine(compInfo.line.pixelCount * sizeof(u16)); - ConvertColorBuffer8888To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, compInfo.line.pixelCount); + ColorspaceConvertBuffer8888To5551((u32 *)compInfo.target.lineColorHead, renderedLineSrcA16, compInfo.line.pixelCount); break; } } @@ -6570,7 +6513,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); - ConvertColorBuffer555To6665Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); + ColorspaceConvertBuffer555To6665Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); break; } @@ -6578,7 +6521,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); FragmentColor *dst = (FragmentColor *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); - ConvertColorBuffer555To8888Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); + ColorspaceConvertBuffer555To8888Opaque(src, (u32 *)dst, GPU_FRAMEBUFFER_NATIVE_WIDTH); break; } } @@ -6598,7 +6541,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth); FragmentColor *dst = (FragmentColor *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth); - ConvertColorBuffer555To6665Opaque(src, (u32 *)dst, customPixCount); + ColorspaceConvertBuffer555To6665Opaque(src, (u32 *)dst, customPixCount); break; } @@ -6606,7 +6549,7 @@ void GPUEngineA::_HandleDisplayModeVRAM(const size_t l) { const u16 *src = this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth); FragmentColor *dst = (FragmentColor *)this->customBuffer + 
(_gpuDstLineIndex[l] * customWidth); - ConvertColorBuffer555To8888Opaque(src, (u32 *)dst, customPixCount); + ColorspaceConvertBuffer555To8888Opaque(src, (u32 *)dst, customPixCount); break; } } @@ -6802,28 +6745,7 @@ void GPUEngineB::RenderLine(const u16 l) GPUSubsystem::GPUSubsystem() { - static bool needInitTables = true; - - if (needInitTables) - { -#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) -#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) -#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) -#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) - - for (size_t i = 0; i < 32768; i++) - { - color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); - color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); - color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); - - color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); - color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); - color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 ); - } - - needInitTables = false; - } + ColorspaceHandlerInit(); _defaultEventHandler = new GPUEventHandlerDefault; _event = _defaultEventHandler; @@ -6957,6 +6879,22 @@ void GPUSubsystem::Reset() osd->clear(); } +void GPUSubsystem::ForceRender3DFinishAndFlush(bool willFlush) +{ + if (CurrentRenderer->GetRenderNeedsFinish()) + { + bool need3DDisplayFramebuffer; + bool need3DCaptureFramebuffer; + 
CurrentRenderer->GetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); + + CurrentRenderer->SetFramebufferFlushStates(willFlush, willFlush); + CurrentRenderer->RenderFinish(); + CurrentRenderer->SetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); + CurrentRenderer->SetRenderNeedsFinish(false); + this->_event->DidRender3DEnd(); + } +} + void GPUSubsystem::UpdateRenderProperties() { this->_engineMain->vramBlockOBJIndex = VRAM_NO_3D_USAGE; @@ -7082,7 +7020,7 @@ void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h, void *clientNati return; } - CurrentRenderer->RenderFinish(); + GPU->ForceRender3DFinishAndFlush(false); const float customWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; const float customHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT; @@ -7224,7 +7162,7 @@ void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h) void GPUSubsystem::SetColorFormat(const NDSColorFormat outputFormat, void *clientNativeBuffer, void *clientCustomBuffer) { - CurrentRenderer->RenderFinish(); + GPU->ForceRender3DFinishAndFlush(false); this->_displayInfo.colorFormat = outputFormat; this->_displayInfo.pixelBytes = (outputFormat == NDSColorFormat_BGR555_Rev) ? sizeof(u16) : sizeof(FragmentColor); @@ -7581,178 +7519,6 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID) this->_gpu->SetDisplayByID(this->_ID); } -template -void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - __m128i src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); - __m128i dstConvertedLo, dstConvertedHi; - ConvertColor555To8888Opaque(src_vec128, dstConvertedLo, dstConvertedHi); - - if (IS_UNALIGNED) - { - _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - else - { - _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor555To8888Opaque(src[i]); - } -} - -template -void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - __m128i src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); - __m128i dstConvertedLo, dstConvertedHi; - ConvertColor555To6665Opaque(src_vec128, dstConvertedLo, dstConvertedHi); - - if (IS_UNALIGNED) - { - _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - else - { - _mm_store_si128((__m128i *)(dst + i + 0), dstConvertedLo); - _mm_store_si128((__m128i *)(dst + i + 4), dstConvertedHi); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor555To6665Opaque(src[i]); - } -} - -template -void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To6665(_mm_load_si128((__m128i *)(src + i))) ); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < 
pixCount; i++) - { - dst[i] = ConvertColor8888To6665(src[i]); - } -} - -template -void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To8888(_mm_load_si128((__m128i *)(src + i))) ); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor6665To8888(src[i]); - } -} - -template -void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - if (IS_UNALIGNED) - { - _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); - } - else - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor8888To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = ConvertColor8888To5551(src[i]); - } -} - -template -void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const size_t ssePixCount = pixCount - (pixCount % 8); - for (; i < ssePixCount; i += 8) - { - if (IS_UNALIGNED) - { - _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); - } - else - { - _mm_store_si128( (__m128i *)(dst + i), ConvertColor6665To5551(_mm_load_si128((__m128i *)(src + i)), _mm_load_si128((__m128i *)(src + i + 4))) ); - } - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < pixCount; i++) - { - dst[i] = 
ConvertColor6665To5551(src[i]); - } -} - template void GPUEngineBase::ParseReg_BGnHOFS(); template void GPUEngineBase::ParseReg_BGnHOFS(); template void GPUEngineBase::ParseReg_BGnHOFS(); @@ -7774,29 +7540,3 @@ template void GPUEngineBase::ParseReg_BGnY(); template void GPUSubsystem::RenderLine(const u16 l, bool skip); template void GPUSubsystem::RenderLine(const u16 l, bool skip); template void GPUSubsystem::RenderLine(const u16 l, bool skip); - -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); - -template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); - -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void 
ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index da5644d94..12e6cbc9d 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -25,9 +25,11 @@ #include #include "types.h" +#include "./utils/colorspacehandler/colorspacehandler.h" #ifdef ENABLE_SSE2 #include +#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" #endif #ifdef ENABLE_SSSE3 @@ -80,7 +82,7 @@ enum PaletteMode enum OBJMode { OBJMode_Normal = 0, - OBJMode_Transparent = 1, + OBJMode_Transparent = 1, OBJMode_Window = 2, OBJMode_Bitmap = 3 }; @@ -89,7 +91,7 @@ enum OBJShape { OBJShape_Square = 0, OBJShape_Horizontal = 1, - OBJShape_Vertical = 2, + OBJShape_Vertical = 2, OBJShape_Prohibited = 3 }; @@ -98,16 +100,7 @@ enum DisplayCaptureSize DisplayCaptureSize_128x128 = 0, DisplayCaptureSize_256x64 = 1, DisplayCaptureSize_256x128 = 2, - DisplayCaptureSize_256x192 = 3 -}; - -union FragmentColor -{ - u32 color; - struct - { - u8 r,g,b,a; - }; + DisplayCaptureSize_256x192 = 3, }; typedef union @@ -116,38 +109,7 @@ typedef union struct { -#ifdef MSB_FIRST - u8 ForceBlank:1; // 7: A+B; - u8 OBJ_BMP_mapping:1; // 6: A+B; 0=2D (128KB), 1=1D (128..256KB) - u8 OBJ_BMP_2D_dim:1; // 5: A+B; 0=128x512, 1=256x256 pixels - u8 OBJ_Tile_mapping:1; // 4: A+B; 0=2D (32KB), 1=1D (32..256KB) - u8 BG0_3D:1; // 3: A ; 0=2D, 1=3D - u8 BG_Mode:3; // 0- 2: A+B; - - u8 
WinOBJ_Enable:1; // 15: A+B; 0=Disable, 1=Enable - u8 Win1_Enable:1; // 14: A+B; 0=Disable, 1=Enable - u8 Win0_Enable:1; // 13: A+B; 0=Disable, 1=Enable - u8 OBJ_Enable:1; // 12: A+B; 0=Disable, 1=Enable - u8 BG3_Enable:1; // 11: A+B; 0=Disable, 1=Enable - u8 BG2_Enable:1; // 10: A+B; 0=Disable, 1=Enable - u8 BG1_Enable:1; // 9: A+B; 0=Disable, 1=Enable - u8 BG0_Enable:1; // 8: A+B; 0=Disable, 1=Enable - - u8 OBJ_HBlank_process:1; // 23: A+B; OBJ processed during HBlank (GBA bit5) - u8 OBJ_BMP_1D_Bound:1; // 22: A ; - u8 OBJ_Tile_1D_Bound:2; // 20-21: A+B; - u8 VRAM_Block:2; // 18-19: A ; VRAM block (0..3=A..D) - u8 DisplayMode:2; // 16-17: A+B; coreA(0..3) coreB(0..1) GBA(Green Swap) - // 0=off (white screen) - // 1=on (normal BG & OBJ layers) - // 2=VRAM display (coreA only) - // 3=RAM display (coreA only, DMA transfers) - - u8 ExOBJPalette_Enable:1; // 31: A+B; 0=Disable, 1=Enable OBJ extended Palette - u8 ExBGxPalette_Enable:1; // 30: A+B; 0=Disable, 1=Enable BG extended Palette - u8 ScreenBase_Block:3; // 27-29: A ; Screen Base (64K step) - u8 CharacBase_Block:3; // 24-26: A ; Character Base (64K step) -#else +#ifdef LOCAL_LE u8 BG_Mode:3; // 0- 2: A+B; u8 BG0_3D:1; // 3: A ; 0=2D, 1=3D u8 OBJ_Tile_mapping:1; // 4: A+B; 0=2D (32KB), 1=1D (32..256KB) @@ -178,6 +140,37 @@ typedef union u8 ScreenBase_Block:3; // 27-29: A ; Screen Base (64K step) u8 ExBGxPalette_Enable:1; // 30: A+B; 0=Disable, 1=Enable BG extended Palette u8 ExOBJPalette_Enable:1; // 31: A+B; 0=Disable, 1=Enable OBJ extended Palette +#else + u8 ForceBlank:1; // 7: A+B; + u8 OBJ_BMP_mapping:1; // 6: A+B; 0=2D (128KB), 1=1D (128..256KB) + u8 OBJ_BMP_2D_dim:1; // 5: A+B; 0=128x512, 1=256x256 pixels + u8 OBJ_Tile_mapping:1; // 4: A+B; 0=2D (32KB), 1=1D (32..256KB) + u8 BG0_3D:1; // 3: A ; 0=2D, 1=3D + u8 BG_Mode:3; // 0- 2: A+B; + + u8 WinOBJ_Enable:1; // 15: A+B; 0=Disable, 1=Enable + u8 Win1_Enable:1; // 14: A+B; 0=Disable, 1=Enable + u8 Win0_Enable:1; // 13: A+B; 0=Disable, 1=Enable + u8 
OBJ_Enable:1; // 12: A+B; 0=Disable, 1=Enable + u8 BG3_Enable:1; // 11: A+B; 0=Disable, 1=Enable + u8 BG2_Enable:1; // 10: A+B; 0=Disable, 1=Enable + u8 BG1_Enable:1; // 9: A+B; 0=Disable, 1=Enable + u8 BG0_Enable:1; // 8: A+B; 0=Disable, 1=Enable + + u8 OBJ_HBlank_process:1; // 23: A+B; OBJ processed during HBlank (GBA bit5) + u8 OBJ_BMP_1D_Bound:1; // 22: A ; + u8 OBJ_Tile_1D_Bound:2; // 20-21: A+B; + u8 VRAM_Block:2; // 18-19: A ; VRAM block (0..3=A..D) + u8 DisplayMode:2; // 16-17: A+B; coreA(0..3) coreB(0..1) GBA(Green Swap) + // 0=off (white screen) + // 1=on (normal BG & OBJ layers) + // 2=VRAM display (coreA only) + // 3=RAM display (coreA only, DMA transfers) + + u8 ExOBJPalette_Enable:1; // 31: A+B; 0=Disable, 1=Enable OBJ extended Palette + u8 ExBGxPalette_Enable:1; // 30: A+B; 0=Disable, 1=Enable BG extended Palette + u8 ScreenBase_Block:3; // 27-29: A ; Screen Base (64K step) + u8 CharacBase_Block:3; // 24-26: A ; Character Base (64K step) #endif }; } IOREG_DISPCNT; // 0x400x000: Display control (Engine A+B) @@ -222,37 +215,36 @@ typedef union struct { -#ifdef MSB_FIRST - u8 PaletteMode:1; // 7: Color/palette mode; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors - u8 Mosaic:1; // 6: Mosaic render: 0=Disable, 1=Enable - u8 CharacBase_Block:4; // 2- 5: individual character base offset (n*16KB) +#ifdef LOCAL_LE u8 Priority:2; // 0- 1: Rendering priority; 0...3, where 0 is highest priority and 3 is lowest priority + u8 CharacBase_Block:4; // 2- 5: individual character base offset (n*16KB) + u8 Mosaic:1; // 6: Mosaic render: 0=Disable, 1=Enable + u8 PaletteMode:1; // 7: Color/palette mode; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors - u8 ScreenSize:2; // 14-15: text : 256x256 512x256 256x512 512x512 - // x/rot/s : 128x128 256x256 512x512 1024x1024 - // bmp : 128x128 256x256 512x256 512x512 - // large : 512x1024 1024x512 - - + u8 ScreenBase_Block:5; // 8-12: individual screen base offset (text n*2KB, BMP n*16KB) u8 
PaletteSet_Wrap:1; // 13: BG0 extended palette set 0=set0, 1=set2 // BG1 extended palette set 0=set1, 1=set3 // BG2 overflow area wraparound 0=off, 1=wrap // BG3 overflow area wraparound 0=off, 1=wrap - u8 ScreenBase_Block:5; // 8-12: individual screen base offset (text n*2KB, BMP n*16KB) + u8 ScreenSize:2; // 14-15: text : 256x256 512x256 256x512 512x512 + // x/rot/s : 128x128 256x256 512x512 1024x1024 + // bmp : 128x128 256x256 512x256 512x512 + // large : 512x1024 1024x512 - - #else - - u8 Priority:2; // 0- 1: Rendering priority; 0...3, where 0 is highest priority and 3 is lowest priority - u8 CharacBase_Block:4; // 2- 5: individual character base offset (n*16KB) - u8 Mosaic:1; // 6: Mosaic render: 0=Disable, 1=Enable u8 PaletteMode:1; // 7: Color/palette mode; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors + u8 Mosaic:1; // 6: Mosaic render: 0=Disable, 1=Enable + u8 CharacBase_Block:4; // 2- 5: individual character base offset (n*16KB) + u8 Priority:2; // 0- 1: Rendering priority; 0...3, where 0 is highest priority and 3 is lowest priority - u8 ScreenBase_Block:5; // 8-12: individual screen base offset (text n*2KB, BMP n*16KB) - u8 PaletteSet_Wrap:1; // 13: BG0 extended palette set 0=set0, 1=set2 - // BG1 extended palette set 0=set1, 1=set3 - // BG2 overflow area wraparound 0=off, 1=wrap - // BG3 overflow area wraparound 0=off, 1=wrap u8 ScreenSize:2; // 14-15: text : 256x256 512x256 256x512 512x512 // x/rot/s : 128x128 256x256 512x512 1024x1024 // bmp : 128x128 256x256 512x256 512x512 // large : 512x1024 1024x512 - - + u8 PaletteSet_Wrap:1; // 13: BG0 extended palette set 0=set0, 1=set2 + // BG1 extended palette set 0=set1, 1=set3 + // BG2 overflow area wraparound 0=off, 1=wrap + // BG3 overflow area wraparound 0=off, 1=wrap + u8 ScreenBase_Block:5; // 8-12: individual screen base offset (text n*2KB, BMP n*16KB) #endif }; } IOREG_BGnCNT; // 0x400x008, 0x400x00A, 0x400x00C, 0x400x00E: BGn layer control (Engine A+B) @@ -310,14 +302,14 @@ typedef 
union struct { -#ifdef MSB_FIRST - s32 :4; - s32 Integer:20; +#ifdef LOCAL_LE u32 Fraction:8; + s32 Integer:20; + s32 :4; #else - u32 Fraction:8; - s32 Integer:20; s32 :4; + s32 Integer:20; + u32 Fraction:8; #endif }; } IOREG_BGnX; // 0x400x028, 0x400x038: BGn X-coordinate (Engine A+B) @@ -409,22 +401,22 @@ typedef union struct { -#ifdef MSB_FIRST - u8 :2; // 6- 7: Unused bits - u8 Effect_Enable:1; // 5: Color special effect; 0=Disable, 1=Enable - u8 OBJ_Enable:1; // 4: Layer OBJ display; 0=Disable, 1=Enable - u8 BG3_Enable:1; // 3: Layer BG3 display; 0=Disable, 1=Enable - u8 BG2_Enable:1; // 2: Layer BG2 display; 0=Disable, 1=Enable - u8 BG1_Enable:1; // 1: Layer BG1 display; 0=Disable, 1=Enable +#ifdef LOCAL_LE u8 BG0_Enable:1; // 0: Layer BG0 display; 0=Disable, 1=Enable + u8 BG1_Enable:1; // 1: Layer BG1 display; 0=Disable, 1=Enable + u8 BG2_Enable:1; // 2: Layer BG2 display; 0=Disable, 1=Enable + u8 BG3_Enable:1; // 3: Layer BG3 display; 0=Disable, 1=Enable + u8 OBJ_Enable:1; // 4: Layer OBJ display; 0=Disable, 1=Enable + u8 Effect_Enable:1; // 5: Color special effect; 0=Disable, 1=Enable + u8 :2; // 6- 7: Unused bits #else - u8 BG0_Enable:1; // 0: Layer BG0 display; 0=Disable, 1=Enable - u8 BG1_Enable:1; // 1: Layer BG1 display; 0=Disable, 1=Enable - u8 BG2_Enable:1; // 2: Layer BG2 display; 0=Disable, 1=Enable - u8 BG3_Enable:1; // 3: Layer BG3 display; 0=Disable, 1=Enable - u8 OBJ_Enable:1; // 4: Layer OBJ display; 0=Disable, 1=Enable - u8 Effect_Enable:1; // 5: Color special effect; 0=Disable, 1=Enable u8 :2; // 6- 7: Unused bits + u8 Effect_Enable:1; // 5: Color special effect; 0=Disable, 1=Enable + u8 OBJ_Enable:1; // 4: Layer OBJ display; 0=Disable, 1=Enable + u8 BG3_Enable:1; // 3: Layer BG3 display; 0=Disable, 1=Enable + u8 BG2_Enable:1; // 2: Layer BG2 display; 0=Disable, 1=Enable + u8 BG1_Enable:1; // 1: Layer BG1 display; 0=Disable, 1=Enable + u8 BG0_Enable:1; // 0: Layer BG0 display; 0=Disable, 1=Enable #endif }; } IOREG_WIN0IN; // 0x400x048: 
Control of inside of Window 0 (highest priority) @@ -438,18 +430,18 @@ typedef union struct { -#ifdef MSB_FIRST - u32 BG_MosaicV:4; // 4- 7: Mosaic pixel height for BG layers; 0...15 +#ifdef LOCAL_LE u32 BG_MosaicH:4; // 0- 3: Mosaic pixel width for BG layers; 0...15 + u32 BG_MosaicV:4; // 4- 7: Mosaic pixel height for BG layers; 0...15 - u32 OBJ_MosaicV:4; // 12-15: Mosaic pixel height for OBJ layer; 0...15 u32 OBJ_MosaicH:4; // 8-11: Mosaic pixel width for OBJ layer; 0...15 + u32 OBJ_MosaicV:4; // 12-15: Mosaic pixel height for OBJ layer; 0...15 #else - u32 BG_MosaicH:4; // 0- 3: Mosaic pixel width for BG layers; 0...15 u32 BG_MosaicV:4; // 4- 7: Mosaic pixel height for BG layers; 0...15 + u32 BG_MosaicH:4; // 0- 3: Mosaic pixel width for BG layers; 0...15 - u32 OBJ_MosaicH:4; // 8-11: Mosaic pixel width for OBJ layer; 0...15 u32 OBJ_MosaicV:4; // 12-15: Mosaic pixel height for OBJ layer; 0...15 + u32 OBJ_MosaicH:4; // 8-11: Mosaic pixel width for OBJ layer; 0...15 #endif u32 :16; // 16-31: Unused bits @@ -462,46 +454,46 @@ typedef union struct { -#ifdef MSB_FIRST +#ifdef LOCAL_LE + u16 BG0_Target1:1; // 0: Select layer BG0 for 1st target; 0=Disable, 1=Enable + u16 BG1_Target1:1; // 1: Select layer BG1 for 1st target; 0=Disable, 1=Enable + u16 BG2_Target1:1; // 2: Select layer BG2 for 1st target; 0=Disable, 1=Enable + u16 BG3_Target1:1; // 3: Select layer BG3 for 1st target; 0=Disable, 1=Enable + u16 OBJ_Target1:1; // 4: Select layer OBJ for 1st target; 0=Disable, 1=Enable + u16 Backdrop_Target1:1; // 5: Select backdrop for 1st target; 0=Disable, 1=Enable u16 ColorEffect:2; // 6- 7: Color effect mode; // 0=Disable // 1=Alpha blend 1st and 2nd target, interacts with BLDALPHA (0x400x052) // 2=Increase brightness, interacts with BLDY (0x400x054) // 3=Decrease brightness, interacts with BLDY (0x400x054) - u16 Backdrop_Target1:1; // 5: Select backdrop for 1st target; 0=Disable, 1=Enable - u16 OBJ_Target1:1; // 4: Select layer OBJ for 1st target; 0=Disable, 1=Enable - 
u16 BG3_Target1:1; // 3: Select layer BG3 for 1st target; 0=Disable, 1=Enable - u16 BG2_Target1:1; // 2: Select layer BG2 for 1st target; 0=Disable, 1=Enable - u16 BG1_Target1:1; // 1: Select layer BG1 for 1st target; 0=Disable, 1=Enable - u16 BG0_Target1:1; // 0: Select layer BG0 for 1st target; 0=Disable, 1=Enable - u16 :2; // 14-15: Unused bits - u16 Backdrop_Target2:1; // 13: Select backdrop for 2nd target; 0=Disable, 1=Enable - u16 OBJ_Target2:1; // 12: Select layer OBJ for 2nd target; 0=Disable, 1=Enable - u16 BG3_Target2:1; // 11: Select layer BG3 for 2nd target; 0=Disable, 1=Enable - u16 BG2_Target2:1; // 10: Select layer BG2 for 2nd target; 0=Disable, 1=Enable - u16 BG1_Target2:1; // 9: Select layer BG1 for 2nd target; 0=Disable, 1=Enable u16 BG0_Target2:1; // 8: Select layer BG0 for 2nd target; 0=Disable, 1=Enable + u16 BG1_Target2:1; // 9: Select layer BG1 for 2nd target; 0=Disable, 1=Enable + u16 BG2_Target2:1; // 10: Select layer BG2 for 2nd target; 0=Disable, 1=Enable + u16 BG3_Target2:1; // 11: Select layer BG3 for 2nd target; 0=Disable, 1=Enable + u16 OBJ_Target2:1; // 12: Select layer OBJ for 2nd target; 0=Disable, 1=Enable + u16 Backdrop_Target2:1; // 13: Select backdrop for 2nd target; 0=Disable, 1=Enable + u16 :2; // 14-15: Unused bits #else - u16 BG0_Target1:1; // 0: Select layer BG0 for 1st target; 0=Disable, 1=Enable - u16 BG1_Target1:1; // 1: Select layer BG1 for 1st target; 0=Disable, 1=Enable - u16 BG2_Target1:1; // 2: Select layer BG2 for 1st target; 0=Disable, 1=Enable - u16 BG3_Target1:1; // 3: Select layer BG3 for 1st target; 0=Disable, 1=Enable - u16 OBJ_Target1:1; // 4: Select layer OBJ for 1st target; 0=Disable, 1=Enable - u16 Backdrop_Target1:1; // 5: Select backdrop for 1st target; 0=Disable, 1=Enable u16 ColorEffect:2; // 6- 7: Color effect mode; // 0=Disable // 1=Alpha blend 1st and 2nd target, interacts with BLDALPHA (0x400x052) // 2=Increase brightness, interacts with BLDY (0x400x054) // 3=Decrease brightness, interacts with 
BLDY (0x400x054) + u16 Backdrop_Target1:1; // 5: Select backdrop for 1st target; 0=Disable, 1=Enable + u16 OBJ_Target1:1; // 4: Select layer OBJ for 1st target; 0=Disable, 1=Enable + u16 BG3_Target1:1; // 3: Select layer BG3 for 1st target; 0=Disable, 1=Enable + u16 BG2_Target1:1; // 2: Select layer BG2 for 1st target; 0=Disable, 1=Enable + u16 BG1_Target1:1; // 1: Select layer BG1 for 1st target; 0=Disable, 1=Enable + u16 BG0_Target1:1; // 0: Select layer BG0 for 1st target; 0=Disable, 1=Enable - u16 BG0_Target2:1; // 8: Select layer BG0 for 2nd target; 0=Disable, 1=Enable - u16 BG1_Target2:1; // 9: Select layer BG1 for 2nd target; 0=Disable, 1=Enable - u16 BG2_Target2:1; // 10: Select layer BG2 for 2nd target; 0=Disable, 1=Enable - u16 BG3_Target2:1; // 11: Select layer BG3 for 2nd target; 0=Disable, 1=Enable - u16 OBJ_Target2:1; // 12: Select layer OBJ for 2nd target; 0=Disable, 1=Enable - u16 Backdrop_Target2:1; // 13: Select backdrop for 2nd target; 0=Disable, 1=Enable u16 :2; // 14-15: Unused bits + u16 Backdrop_Target2:1; // 13: Select backdrop for 2nd target; 0=Disable, 1=Enable + u16 OBJ_Target2:1; // 12: Select layer OBJ for 2nd target; 0=Disable, 1=Enable + u16 BG3_Target2:1; // 11: Select layer BG3 for 2nd target; 0=Disable, 1=Enable + u16 BG2_Target2:1; // 10: Select layer BG2 for 2nd target; 0=Disable, 1=Enable + u16 BG1_Target2:1; // 9: Select layer BG1 for 2nd target; 0=Disable, 1=Enable + u16 BG0_Target2:1; // 8: Select layer BG0 for 2nd target; 0=Disable, 1=Enable #endif }; } IOREG_BLDCNT; // 0x400x050: Color effects selection (Engine A+B) @@ -512,18 +504,18 @@ typedef union struct { -#ifdef MSB_FIRST - u16 :3; // 5- 7: Unused bits +#ifdef LOCAL_LE u16 EVA:5; // 0- 4: Blending coefficient for 1st target; 0...31 (clamped to 16) + u16 :3; // 5- 7: Unused bits - u16 :3; // 13-15: Unused bits u16 EVB:5; // 8-12: Blending coefficient for 2nd target; 0...31 (clamped to 16) + u16 :3; // 13-15: Unused bits #else - u16 EVA:5; // 0- 4: Blending coefficient 
for 1st target; 0...31 (clamped to 16) u16 :3; // 5- 7: Unused bits + u16 EVA:5; // 0- 4: Blending coefficient for 1st target; 0...31 (clamped to 16) - u16 EVB:5; // 8-12: Blending coefficient for 2nd target; 0...31 (clamped to 16) u16 :3; // 13-15: Unused bits + u16 EVB:5; // 8-12: Blending coefficient for 2nd target; 0...31 (clamped to 16) #endif }; } IOREG_BLDALPHA; // 0x400x052: Color effects selection, interacts with BLDCNT (0x400x050) (Engine A+B) @@ -534,12 +526,12 @@ typedef union struct { -#ifdef MSB_FIRST - u16 :3; // 5- 7: Unused bits +#ifdef LOCAL_LE u16 EVY:5; // 0- 4: Blending coefficient for increase/decrease brightness; 0...31 (clamped to 16) + u16 :3; // 5- 7: Unused bits #else - u16 EVY:5; // 0- 4: Blending coefficient for increase/decrease brightness; 0...31 (clamped to 16) u16 :3; // 5- 7: Unused bits + u16 EVY:5; // 0- 4: Blending coefficient for increase/decrease brightness; 0...31 (clamped to 16) #endif u16 :8; // 8-15: Unused bits }; @@ -551,42 +543,42 @@ typedef union struct { -#ifdef MSB_FIRST +#ifdef LOCAL_LE + u8 EnableTexMapping:1; // 0: Apply textures; 0=Disable, 1=Enable + u8 PolygonShading:1; // 1: Polygon shading mode, interacts with POLYGON_ATTR (0x40004A4); 0=Toon Shading, 1=Highlight Shading + u8 EnableAlphaTest:1; // 2: Perform alpha test, interacts with ALPHA_TEST_REF (0x4000340); 0=Disable, 1=Enable + u8 EnableAlphaBlending:1; // 3: Perform alpha blending, interacts with POLYGON_ATTR (0x40004A4); 0=Disable, 1=Enable + u8 EnableAntiAliasing:1; // 4: Render polygon edges with antialiasing; 0=Disable, 1=Enable + u8 EnableEdgeMarking:1; // 5: Perform polygon edge marking, interacts with EDGE_COLOR (0x4000330); 0=Disable, 1=Enable + u8 FogOnlyAlpha:1; // 6: Apply fog to the alpha channel only, interacts with FOG_COLOR (0x4000358) / FOG_TABLE (0x4000360); 0=Color+Alpha, 1=Alpha u8 EnableFog:1; // 7: Perform fog rendering, interacts with FOG_COLOR (0x4000358) / FOG_OFFSET (0x400035C) / FOG_TABLE (0x4000360); // 0=Disable, 1=Enable - 
u8 FogOnlyAlpha:1; // 6: Apply fog to the alpha channel only, interacts with FOG_COLOR (0x4000358) / FOG_TABLE (0x4000360); 0=Color+Alpha, 1=Alpha - u8 EnableEdgeMarking:1; // 5: Perform polygon edge marking, interacts with EDGE_COLOR (0x4000330); 0=Disable, 1=Enable - u8 EnableAntiAliasing:1; // 4: Render polygon edges with antialiasing; 0=Disable, 1=Enable - u8 EnableAlphaBlending:1; // 3: Perform alpha blending, interacts with POLYGON_ATTR (0x40004A4); 0=Disable, 1=Enable - u8 EnableAlphaTest:1; // 2: Perform alpha test, interacts with ALPHA_TEST_REF (0x4000340); 0=Disable, 1=Enable - u8 PolygonShading:1; // 1: Polygon shading mode, interacts with POLYGON_ATTR (0x40004A4); 0=Toon Shading, 1=Highlight Shading - u8 EnableTexMapping:1; // 0: Apply textures; 0=Disable, 1=Enable - u8 :1; // 15: Unused bits + u8 FogShiftSHR:4; // 8-11: SHR-Divider, interacts with FOG_OFFSET (0x400035C); 0...10 + u8 AckColorBufferUnderflow:1; // 12: Color Buffer RDLINES Underflow; 0=None, 1=Underflow/Acknowledge + u8 AckVertexRAMOverflow:1; // 13: Polygon/Vertex RAM Overflow; 0=None, 1=Overflow/Acknowledge u8 RearPlaneMode:1; // 14: Use clear image, interacts with CLEAR_COLOR (0x4000350) / CLEAR_DEPTH (0x4000354) / CLRIMAGE_OFFSET (0x4000356); // 0=Blank, 1=Bitmap - u8 AckVertexRAMOverflow:1; // 13: Polygon/Vertex RAM Overflow; 0=None, 1=Overflow/Acknowledge - u8 AckColorBufferUnderflow:1; // 12: Color Buffer RDLINES Underflow; 0=None, 1=Underflow/Acknowledge - u8 FogShiftSHR:4; // 8-11: SHR-Divider, interacts with FOG_OFFSET (0x400035C); 0...10 + u8 :1; // 15: Unused bits u16 :16; // 16-31: Unused bits #else - u8 EnableTexMapping:1; // 0: Apply textures; 0=Disable, 1=Enable - u8 PolygonShading:1; // 1: Polygon shading mode, interacts with POLYGON_ATTR (0x40004A4); 0=Toon Shading, 1=Highlight Shading - u8 EnableAlphaTest:1; // 2: Perform alpha test, interacts with ALPHA_TEST_REF (0x4000340); 0=Disable, 1=Enable - u8 EnableAlphaBlending:1; // 3: Perform alpha blending, interacts with 
POLYGON_ATTR (0x40004A4); 0=Disable, 1=Enable - u8 EnableAntiAliasing:1; // 4: Render polygon edges with antialiasing; 0=Disable, 1=Enable - u8 EnableEdgeMarking:1; // 5: Perform polygon edge marking, interacts with EDGE_COLOR (0x4000330); 0=Disable, 1=Enable - u8 FogOnlyAlpha:1; // 6: Apply fog to the alpha channel only, interacts with FOG_COLOR (0x4000358) / FOG_TABLE (0x4000360); 0=Color+Alpha, 1=Alpha u8 EnableFog:1; // 7: Perform fog rendering, interacts with FOG_COLOR (0x4000358) / FOG_OFFSET (0x400035C) / FOG_TABLE (0x4000360); // 0=Disable, 1=Enable + u8 FogOnlyAlpha:1; // 6: Apply fog to the alpha channel only, interacts with FOG_COLOR (0x4000358) / FOG_TABLE (0x4000360); 0=Color+Alpha, 1=Alpha + u8 EnableEdgeMarking:1; // 5: Perform polygon edge marking, interacts with EDGE_COLOR (0x4000330); 0=Disable, 1=Enable + u8 EnableAntiAliasing:1; // 4: Render polygon edges with antialiasing; 0=Disable, 1=Enable + u8 EnableAlphaBlending:1; // 3: Perform alpha blending, interacts with POLYGON_ATTR (0x40004A4); 0=Disable, 1=Enable + u8 EnableAlphaTest:1; // 2: Perform alpha test, interacts with ALPHA_TEST_REF (0x4000340); 0=Disable, 1=Enable + u8 PolygonShading:1; // 1: Polygon shading mode, interacts with POLYGON_ATTR (0x40004A4); 0=Toon Shading, 1=Highlight Shading + u8 EnableTexMapping:1; // 0: Apply textures; 0=Disable, 1=Enable - u8 FogShiftSHR:4; // 8-11: SHR-Divider, interacts with FOG_OFFSET (0x400035C); 0...10 - u8 AckColorBufferUnderflow:1; // 12: Color Buffer RDLINES Underflow; 0=None, 1=Underflow/Acknowledge - u8 AckVertexRAMOverflow:1; // 13: Polygon/Vertex RAM Overflow; 0=None, 1=Overflow/Acknowledge + u8 :1; // 15: Unused bits u8 RearPlaneMode:1; // 14: Use clear image, interacts with CLEAR_COLOR (0x4000350) / CLEAR_DEPTH (0x4000354) / CLRIMAGE_OFFSET (0x4000356); // 0=Blank, 1=Bitmap - u8 :1; // 15: Unused bits + u8 AckVertexRAMOverflow:1; // 13: Polygon/Vertex RAM Overflow; 0=None, 1=Overflow/Acknowledge + u8 AckColorBufferUnderflow:1; // 12: Color 
Buffer RDLINES Underflow; 0=None, 1=Underflow/Acknowledge + u8 FogShiftSHR:4; // 8-11: SHR-Divider, interacts with FOG_OFFSET (0x400035C); 0...10 u16 :16; // 16-31: Unused bits #endif @@ -599,46 +591,46 @@ typedef union struct { -#ifdef MSB_FIRST - unsigned :3; // 5- 7: Unused bits +#ifdef LOCAL_LE unsigned EVA:5; // 0- 4: Blending coefficient for SrcA; 0...31 (clamped to 16) + unsigned :3; // 5- 7: Unused bits - unsigned :3; // 13-15: Unused bits unsigned EVB:5; // 8-12: Blending coefficient for SrcB; 0...31 (clamped to 16) + unsigned :3; // 13-15: Unused bits - unsigned :2; // 22-23: Unused bits - unsigned CaptureSize:2; // 20-21: Display capture dimensions; 0=128x128, 1=256x64, 2=256x128, 3=256x192 - unsigned VRAMWriteOffset:2; // 18-19: VRAM write target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB unsigned VRAMWriteBlock:2; // 16-17: VRAM write target block; 0=Block A, 1=Block B, 2=Block C, 3=Block D + unsigned VRAMWriteOffset:2; // 18-19: VRAM write target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB + unsigned CaptureSize:2; // 20-21: Display capture dimensions; 0=128x128, 1=256x64, 2=256x128, 3=256x192 + unsigned :2; // 22-23: Unused bits - unsigned CaptureEnable:1; // 31: Display capture status; 0=Disable/Ready 1=Enable/Busy - unsigned CaptureSrc:2; // 29-30: Select capture target; 0=SrcA, 1=SrcB, 2=SrcA+SrcB blend, 3=SrcA+SrcB blend - unsigned :1; // 28: Unused bit - unsigned VRAMReadOffset:2; // 26-27: VRAM read target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB + unsigned SrcA:1; // 24: SrcA target; 0=Current framebuffer, 1=3D render buffer unsigned SrcB:1; // 25: SrcB target; // 0=VRAM block, interacts with DISPCNT (0x4000000) // 1=Main memory FIFO, interacts with DISP_MMEM_FIFO (0x4000068) - unsigned SrcA:1; // 24: SrcA target; 0=Current framebuffer, 1=3D render buffer + unsigned VRAMReadOffset:2; // 26-27: VRAM read target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB + unsigned :1; // 28: Unused bit + unsigned CaptureSrc:2; // 29-30: Select capture target; 0=SrcA, 1=SrcB, 
2=SrcA+SrcB blend, 3=SrcA+SrcB blend + unsigned CaptureEnable:1; // 31: Display capture status; 0=Disable/Ready 1=Enable/Busy #else - unsigned EVA:5; // 0- 4: Blending coefficient for SrcA; 0...31 (clamped to 16) unsigned :3; // 5- 7: Unused bits + unsigned EVA:5; // 0- 4: Blending coefficient for SrcA; 0...31 (clamped to 16) - unsigned EVB:5; // 8-12: Blending coefficient for SrcB; 0...31 (clamped to 16) unsigned :3; // 13-15: Unused bits + unsigned EVB:5; // 8-12: Blending coefficient for SrcB; 0...31 (clamped to 16) - unsigned VRAMWriteBlock:2; // 16-17: VRAM write target block; 0=Block A, 1=Block B, 2=Block C, 3=Block D - unsigned VRAMWriteOffset:2; // 18-19: VRAM write target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB - unsigned CaptureSize:2; // 20-21: Display capture dimensions; 0=128x128, 1=256x64, 2=256x128, 3=256x192 unsigned :2; // 22-23: Unused bits + unsigned CaptureSize:2; // 20-21: Display capture dimensions; 0=128x128, 1=256x64, 2=256x128, 3=256x192 + unsigned VRAMWriteOffset:2; // 18-19: VRAM write target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB + unsigned VRAMWriteBlock:2; // 16-17: VRAM write target block; 0=Block A, 1=Block B, 2=Block C, 3=Block D - unsigned SrcA:1; // 24: SrcA target; 0=Current framebuffer, 1=3D render buffer + unsigned CaptureEnable:1; // 31: Display capture status; 0=Disable/Ready 1=Enable/Busy + unsigned CaptureSrc:2; // 29-30: Select capture target; 0=SrcA, 1=SrcB, 2=SrcA+SrcB blend, 3=SrcA+SrcB blend + unsigned :1; // 28: Unused bit + unsigned VRAMReadOffset:2; // 26-27: VRAM read target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB unsigned SrcB:1; // 25: SrcB target; // 0=VRAM block, interacts with DISPCNT (0x4000000) // 1=Main memory FIFO, interacts with DISP_MMEM_FIFO (0x4000068) - unsigned VRAMReadOffset:2; // 26-27: VRAM read target offset; 0=0KB, 1=32KB, 2=64KB, 3=96KB - unsigned :1; // 28: Unused bit - unsigned CaptureSrc:2; // 29-30: Select capture target; 0=SrcA, 1=SrcB, 2=SrcA+SrcB blend, 3=SrcA+SrcB blend - unsigned 
CaptureEnable:1; // 31: Display capture status; 0=Disable/Ready 1=Enable/Busy + unsigned SrcA:1; // 24: SrcA target; 0=Current framebuffer, 1=3D render buffer #endif }; @@ -652,20 +644,20 @@ typedef union struct { -#ifdef MSB_FIRST - u32 :3; // 5- 7: Unused bits +#ifdef LOCAL_LE u32 Intensity:5; // 0- 4: Brightness coefficient for increase/decrease brightness; 0...31 (clamped to 16) + u32 :3; // 5- 7: Unused bits - u32 Mode:2; // 14-15: Brightness mode; 0=Disable, 1=Increase, 2=Decrease, 3=Reserved u32 :6; // 8-13: Unused bits + u32 Mode:2; // 14-15: Brightness mode; 0=Disable, 1=Increase, 2=Decrease, 3=Reserved u32 :16; // 16-31: Unused bits #else - u32 Intensity:5; // 0- 4: Brightness coefficient for increase/decrease brightness; 0...31 (clamped to 16) u32 :3; // 5- 7: Unused bits + u32 Intensity:5; // 0- 4: Brightness coefficient for increase/decrease brightness; 0...31 (clamped to 16) - u32 :6; // 8-13: Unused bits u32 Mode:2; // 14-15: Brightness mode; 0=Disable, 1=Increase, 2=Decrease, 3=Reserved + u32 :6; // 8-13: Unused bits u32 :16; // 16-31: Unused bits #endif @@ -774,15 +766,15 @@ typedef struct enum ColorEffect { - ColorEffect_Disable = 0, - ColorEffect_Blend = 1, + ColorEffect_Disable = 0, + ColorEffect_Blend = 1, ColorEffect_IncreaseBrightness = 2, ColorEffect_DecreaseBrightness = 3 }; enum GPUEngineID { - GPUEngineID_Main = 0, + GPUEngineID_Main = 0, GPUEngineID_Sub = 1 }; @@ -800,7 +792,7 @@ enum GPUEngineID #define ADDRESS_STEP_512KB 0x80000 #define ADDRESS_MASK_256KB (ADDRESS_STEP_256KB-1) -#ifdef MSB_FIRST +#ifdef LOCAL_BE struct _TILEENTRY { /*14*/ unsigned Palette:4; @@ -831,7 +823,7 @@ typedef union */ struct _COLOR { // abgr x555 -#ifdef MSB_FIRST +#ifdef LOCAL_BE unsigned alpha:1; // sometimes it is unused (pad) unsigned blue:5; unsigned green:5; @@ -868,7 +860,47 @@ typedef union struct { -#ifdef MSB_FIRST +#ifdef LOCAL_LE + union + { + u16 attr0; + + struct + { + u16 Y:8; // 0- 7: Sprite Y-coordinate; 0...255 + u16 RotScale:1; // 8: 
Perform rotation/scaling; 0=Disable, 1=Enable + u16 Disable:1; // 9: OBJ disable flag, only if Bit8 is cleared; 0=Perform render, 1=Do not perform render + u16 Mode:2; // 10-11: OBJ mode; 0=Normal, 1=Transparent, 2=Window, 3=Bitmap + u16 Mosaic:1; // 12: Mosaic render: 0=Disable, 1=Enable + u16 PaletteMode:1; // 13: Color/palette select; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors + u16 Shape:2; // 14-15: OBJ shape; 0=Square, 1=Horizontal, 2=Vertical, 3=Prohibited + }; + + struct + { + u16 :8; + u16 :1; + u16 DoubleSize:1; // 9: Perform double-size render, only if Bit8 is set; 0=Disable, 1=Enable + u16 :6; + }; + }; + + s16 X:9; // 16-24: Sprite X-coordinate; 0...511 + u16 RotScaleIndex:3; // 25-27: Rotation/scaling parameter selection; 0...31 + u16 HFlip:1; // 28: Flip sprite horizontally; 0=Normal, 1=Flip + u16 VFlip:1; // 29: Flip sprite vertically; 0=Normal, 1=Flip + u16 Size:2; // 30-31: OBJ size, interacts with Bit 14-15 + // + // Size| Square | Horizontal | Vertical + // 0: 8x8 16x8 8x16 + // 1: 16x16 32x8 8x32 + // 2: 32x32 32x16 16x32 + // 3: 64x64 64x32 32x64 + u16 TileIndex:10; // 32-41: Tile index; 0...1023 + + u16 Priority:2; // 42-43: Rendering priority; 0...3, where 0 is highest priority and 3 is lowest priority + u16 PaletteIndex:4; // 44-47: Palette index; 0...15 +#else union { u16 attr0; @@ -910,46 +942,6 @@ typedef union u16 PaletteIndex:4; // 44-47: Palette index; 0...15 u16 Priority:2; // 42-43: Rendering priority; 0...3, where 0 is highest priority and 3 is lowest priority u16 TileIndex:10; // 32-41: Tile index; 0...1023 -#else - union - { - u16 attr0; - - struct - { - u16 Y:8; // 0- 7: Sprite Y-coordinate; 0...255 - u16 RotScale:1; // 8: Perform rotation/scaling; 0=Disable, 1=Enable - u16 Disable:1; // 9: OBJ disable flag, only if Bit8 is cleared; 0=Perform render, 1=Do not perform render - u16 Mode:2; // 10-11: OBJ mode; 0=Normal, 1=Transparent, 2=Window, 3=Bitmap - u16 Mosaic:1; // 12: Mosaic render: 0=Disable, 1=Enable 
- u16 PaletteMode:1; // 13: Color/palette select; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors - u16 Shape:2; // 14-15: OBJ shape; 0=Square, 1=Horizontal, 2=Vertical, 3=Prohibited - }; - - struct - { - u16 :8; - u16 :1; - u16 DoubleSize:1; // 9: Perform double-size render, only if Bit8 is set; 0=Disable, 1=Enable - u16 :6; - }; - }; - - s16 X:9; // 16-24: Sprite X-coordinate; 0...511 - u16 RotScaleIndex:3; // 25-27: Rotation/scaling parameter selection; 0...31 - u16 HFlip:1; // 28: Flip sprite horizontally; 0=Normal, 1=Flip - u16 VFlip:1; // 29: Flip sprite vertically; 0=Normal, 1=Flip - u16 Size:2; // 30-31: OBJ size, interacts with Bit 14-15 - // - // Size| Square | Horizontal | Vertical - // 0: 8x8 16x8 8x16 - // 1: 16x16 32x8 8x32 - // 2: 32x32 32x16 16x32 - // 3: 64x64 64x32 32x64 - u16 TileIndex:10; // 32-41: Tile index; 0...1023 - - u16 Priority:2; // 42-43: Rendering priority; 0...3, where 0 is highest priority and 3 is lowest priority - u16 PaletteIndex:4; // 44-47: Palette index; 0...15 #endif u16 attr3:16; // 48-63: Whenever this is used, you will need to explicitly convert endianness. 
@@ -1007,7 +999,7 @@ enum GPULayerID GPULayerID_BG2 = 2, GPULayerID_BG3 = 3, GPULayerID_OBJ = 4, - GPULayerID_Backdrop = 5 + GPULayerID_Backdrop = 5 }; enum BGType @@ -1018,9 +1010,9 @@ enum BGType BGType_Large8bpp = 3, BGType_AffineExt = 4, - BGType_AffineExt_256x16 = 5, - BGType_AffineExt_256x1 = 6, - BGType_AffineExt_Direct = 7 + BGType_AffineExt_256x16 = 5, + BGType_AffineExt_256x1 = 6, + BGType_AffineExt_Direct = 7 }; enum GPUDisplayMode @@ -1033,7 +1025,7 @@ enum GPUDisplayMode enum GPUMasterBrightMode { - GPUMasterBrightMode_Disable = 0, + GPUMasterBrightMode_Disable = 0, GPUMasterBrightMode_Up = 1, GPUMasterBrightMode_Down = 2, GPUMasterBrightMode_Reserved = 3 @@ -1044,70 +1036,15 @@ enum GPULayerType { GPULayerType_3D = 0, GPULayerType_BG = 1, - GPULayerType_OBJ = 2 + GPULayerType_OBJ = 2 }; enum NDSDisplayID { - NDSDisplayID_Main = 0, + NDSDisplayID_Main = 0, NDSDisplayID_Touch = 1 }; -enum NDSColorFormat -{ - // The color format information is packed in a 32-bit value. - // The bits are as follows: - // FFFOOOOO AAAAAABB BBBBGGGG GGRRRRRR - // - // F = Flags (see below) - // O = Color order (see below) - // A = Bit count for alpha [0-63] - // B = Bit count for blue [0-63] - // G = Bit count for green [0-63] - // R = Bit count for red [0-63] - // - // Flags: - // Bit 29: Reverse order flag. - // Set = Bits are in reverse order, usually for little-endian usage. - // Cleared = Bits are in normal order, usually for big-endian usage. - // - // Color order bits, 24-28: - // 0x00 = RGBA, common format - // 0x01 = RGAB - // 0x02 = RBGA - // 0x03 = RBAG - // 0x04 = RAGB - // 0x05 = RABG - // 0x06 = GRBA - // 0x07 = GRAB - // 0x08 = GBRA - // 0x09 = GBAR - // 0x0A = GARB - // 0x0B = GABR - // 0x0C = BRGA - // 0x0D = BRAG - // 0x0E = BGRA, common format - // 0x0F = BGAR - // 0x10 = BARG - // 0x11 = BAGR - // 0x12 = ARGB - // 0x13 = ARBG - // 0x14 = AGRB - // 0x15 = AGBR - // 0x16 = ABRG - // 0x17 = ABGR - - // Color formats used for internal processing. 
- //NDSColorFormat_ABGR1555_Rev = 0x20045145, - //NDSColorFormat_ABGR5666_Rev = 0x20186186, - //NDSColorFormat_ABGR8888_Rev = 0x20208208, - - // Color formats used by the output framebuffers. - NDSColorFormat_BGR555_Rev = 0x20005145, - NDSColorFormat_BGR666_Rev = 0x20006186, - NDSColorFormat_BGR888_Rev = 0x20008208 -}; - struct DISPCAPCNT_parsed { u8 EVA; @@ -1411,9 +1348,9 @@ protected: template bool _IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo); void _PerformWindowTesting(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo); - template void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); + template FORCEINLINE void _RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo); + template FORCEINLINE void _RenderLine_LayerBG_ApplyColorEffectDisabledHint(GPUEngineCompositorInfo &compInfo); + template FORCEINLINE void _RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo); template void _RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo); template void _RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item); @@ -1690,6 +1627,7 @@ public: GPUEventHandler* GetEventHandler(); void Reset(); + void ForceRender3DFinishAndFlush(bool willFlush); const NDSDisplayInfo& GetDisplayInfo(); // Frontends need to call this whenever they need to read the video buffers from the emulator core void SetDisplayDidCustomRender(NDSDisplayID displayID, bool theState); @@ -1734,346 +1672,4 @@ public: extern GPUSubsystem *GPU; extern MMU_struct MMU; -extern CACHE_ALIGN const u32 material_5bit_to_31bit[32]; -extern CACHE_ALIGN const u8 material_5bit_to_6bit[32]; -extern CACHE_ALIGN const u8 material_5bit_to_8bit[32]; -extern CACHE_ALIGN const u8 material_6bit_to_8bit[64]; -extern CACHE_ALIGN const u8 material_3bit_to_5bit[8]; -extern CACHE_ALIGN 
const u8 material_3bit_to_6bit[8]; -extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; - -extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; -extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; -extern CACHE_ALIGN u32 color_555_to_666[32768]; -extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; -extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; -extern CACHE_ALIGN u32 color_555_to_888[32768]; - -#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color -#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped -#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color - -#ifdef MSB_FIRST - #define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian -#else - #define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian -#endif - -#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color -#define COLOR555TO8888_OPAQUE_SWAP_RB(col) (color_555_to_8888_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque 32-bit color with R and B components swapped -#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color - -#ifdef MSB_FIRST - #define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with 
user-defined alpha, big-endian -#else - #define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian -#endif - -//produce a 15bpp color from individual 5bit components -#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) ) - -//produce a 16bpp color from individual 5bit components -#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) ) - -inline FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) -{ - FragmentColor ret; - ret.r = r; ret.g = g; ret.b = b; ret.a = a; - return ret; -} - -template -FORCEINLINE u32 ConvertColor555To8888Opaque(const u16 src) -{ - return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); -} - -template -FORCEINLINE u32 ConvertColor555To6665Opaque(const u16 src) -{ - return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); -} - -template -FORCEINLINE u32 ConvertColor8888To6665(FragmentColor srcColor) -{ - FragmentColor outColor; - outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; - outColor.g = srcColor.g >> 2; - outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b) >> 2; - outColor.a = srcColor.a >> 3; - - return outColor.color; -} - -template -FORCEINLINE u32 ConvertColor8888To6665(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor8888To6665(srcColorComponent); -} - -template -FORCEINLINE u32 ConvertColor6665To8888(FragmentColor srcColor) -{ - FragmentColor outColor; - outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)]; - outColor.g = material_6bit_to_8bit[srcColor.g]; - outColor.b = material_6bit_to_8bit[((SWAP_RB) ? 
srcColor.r : srcColor.b)]; - outColor.a = material_5bit_to_8bit[srcColor.a]; - - return outColor.color; -} - -template -FORCEINLINE u32 ConvertColor6665To8888(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor6665To8888(srcColorComponent); -} - -template -FORCEINLINE u16 ConvertColor8888To5551(FragmentColor srcColor) -{ - return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 0x0000 : 0x8000 ); -} - -template -FORCEINLINE u16 ConvertColor8888To5551(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor8888To5551(srcColorComponent); -} - -template -FORCEINLINE u16 ConvertColor6665To5551(FragmentColor srcColor) -{ - return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 0x0000 : 0x8000); -} - -template -FORCEINLINE u16 ConvertColor6665To5551(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ConvertColor6665To5551(srcColorComponent); -} - -#ifdef ENABLE_SSE2 - -template -FORCEINLINE void ConvertColor555To8888(const __m128i &srcColor, const __m128i &srcAlphaBits32Lo, const __m128i &srcAlphaBits32Hi, __m128i &dstLo, __m128i &dstHi) -{ - __m128i src32; - - // Conversion algorithm: - // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) - src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); - dstLo = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); - dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); - dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); - - src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); - dstHi = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); - dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); - dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); -} - -template -FORCEINLINE void ConvertColor555To6665(const __m128i &srcColor, const __m128i &srcAlphaBits32Lo, const __m128i &srcAlphaBits32Hi, __m128i &dstLo, __m128i &dstHi) -{ - __m128i src32; - - // Conversion algorithm: - // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) - src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); - dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); - dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); - dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); - dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); - - src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); - dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); - dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); - dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); - dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); -} - -template -FORCEINLINE void ConvertColor555To8888Opaque(const __m128i &srcColor, __m128i &dstLo, __m128i &dstHi) -{ - const __m128i srcAlphaBits32 = _mm_set1_epi32(0xFF000000); - ConvertColor555To8888(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); -} - -template -FORCEINLINE void ConvertColor555To6665Opaque(const __m128i &srcColor, __m128i &dstLo, __m128i &dstHi) -{ - const __m128i srcAlphaBits32 = _mm_set1_epi32(0x1F000000); - ConvertColor555To6665(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); -} - -template -FORCEINLINE __m128i ConvertColor8888To6665(const __m128i &src) -{ - // Conversion algorithm: - // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) - // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) - __m128i rgb; - const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); - - if (SWAP_RB) - { -#ifdef ENABLE_SSSE3 - rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); - rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2) ); -#else - rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x003F0000)), 18), _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00003F00)), 2), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x0000003F)), 14)) ); -#endif - } - else - { - rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); - } - - return _mm_or_si128(rgb, a); -} - -template -FORCEINLINE __m128i ConvertColor6665To8888(const __m128i &src) -{ - // 
Conversion algorithm: - // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) - // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) - __m128i rgb = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00FCFCFC)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00030303)) ); - const __m128i a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0xF8000000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x07000000)) ); - - if (SWAP_RB) - { -#ifdef ENABLE_SSSE3 - rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2) ); -#else - rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16)) ); -#endif - } - - return _mm_or_si128(rgb, a); -} - -template -FORCEINLINE __m128i _ConvertColorBaseTo5551(const __m128i &srcLo, const __m128i &srcHi) -{ - if (COLORFORMAT == NDSColorFormat_BGR555_Rev) - { - return srcLo; - } - - __m128i rgbLo; - __m128i rgbHi; - __m128i alpha; - - if (COLORFORMAT == NDSColorFormat_BGR666_Rev) - { - if (SWAP_RB) - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 17), _mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 17), _mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); - } - else - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 1), 
_mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 1), _mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); - } - - // Convert alpha - alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x0000001F)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x0000001F)) ); - alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); - alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); - } - else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) - { - if (SWAP_RB) - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 19), _mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 19), _mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); - } - else - { - // Convert color from low bits - rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 3), _mm_set1_epi32(0x0000001F)); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); - rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); - - // Convert color from high bits - rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 3), 
_mm_set1_epi32(0x0000001F)); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); - rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); - } - - // Convert alpha - alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x000000FF)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x000000FF)) ); - alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); - alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); - } - - return _mm_or_si128(_mm_packs_epi32(rgbLo, rgbHi), alpha); -} - -template -FORCEINLINE __m128i ConvertColor8888To5551(const __m128i &srcLo, const __m128i &srcHi) -{ - return _ConvertColorBaseTo5551(srcLo, srcHi); -} - -template -FORCEINLINE __m128i ConvertColor6665To5551(const __m128i &srcLo, const __m128i &srcHi) -{ - return _ConvertColorBaseTo5551(srcLo, srcHi); -} - -#endif - -template void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); - -template void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); -template void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); - -template void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); -template void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); - #endif diff --git a/desmume/src/Makefile.am b/desmume/src/Makefile.am index 4b8150d31..6bbce87be 100644 --- a/desmume/src/Makefile.am +++ b/desmume/src/Makefile.am @@ -52,6 +52,7 @@ libdesmume_a_SOURCES = \ utils/decrypt/decrypt.h utils/decrypt/header.cpp utils/decrypt/header.h \ utils/task.cpp utils/task.h \ utils/vfat.h utils/vfat.cpp \ + utils/colorspacehandler/colorspacehandler.cpp \ utils/dlditool.cpp \ utils/libfat/bit_ops.h \ 
utils/libfat/cache.cpp \ @@ -109,6 +110,22 @@ libdesmume_a_SOURCES = \ libretro-common/rthreads/async_job.c \ libretro-common/rthreads/rsemaphore.c \ libretro-common/rthreads/rthreads.c + +# SIMD-specific colorspace sources: each one is appended only when its Automake conditional is true. +if SUPPORT_SSE2 +libdesmume_a_SOURCES += \ + utils/colorspacehandler/colorspacehandler_SSE2.cpp +endif + +if SUPPORT_AVX2 +libdesmume_a_SOURCES += \ + utils/colorspacehandler/colorspacehandler_AVX2.cpp +endif + +if SUPPORT_ALTIVEC +libdesmume_a_SOURCES += \ + utils/colorspacehandler/colorspacehandler_AltiVec.cpp +endif if HAVE_JIT libdesmume_a_SOURCES += \ diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 96b5f699c..c2f64fedb 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -32,6 +32,7 @@ #ifdef ENABLE_SSE2 #include +#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" #endif typedef struct @@ -990,9 +991,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + i + 0)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + i + 4)); - _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), ConvertColor8888To6665(srcColorLo) ); - _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665(srcColorHi) ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), ColorspaceConvert8888To6665_SSE2(srcColorLo) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ColorspaceConvert8888To6665_SSE2(srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1001,17 +1002,17 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; i < pixCount; i++) { - dstFramebuffer[i].color = ConvertColor8888To6665(srcFramebuffer[i]); - dstRGBA5551[i] = 
ConvertColor8888To5551(srcFramebuffer[i]); + dstFramebuffer[i].color = ColorspaceConvert8888To6665(srcFramebuffer[i]); + dstRGBA5551[i] = ColorspaceConvert8888To5551(srcFramebuffer[i]); } } else if (dstFramebuffer != NULL) { - ConvertColorBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); + ColorspaceConvertBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } else if (this->_outputFormat == NDSColorFormat_BGR888_Rev) @@ -1027,7 +1028,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + i + 0), srcColorLo ); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1036,8 +1037,8 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; i < pixCount; i++) { - dstFramebuffer[i].color = ConvertColor8888To6665(srcFramebuffer[i]); - dstRGBA5551[i] = ConvertColor8888To5551(srcFramebuffer[i]); + dstFramebuffer[i].color = ColorspaceConvert8888To6665(srcFramebuffer[i]); + dstRGBA5551[i] = ColorspaceConvert8888To5551(srcFramebuffer[i]); } } else if (dstFramebuffer != NULL) @@ -1046,7 +1047,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor } else { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } } } @@ -1068,9 +1069,9 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor const __m128i srcColorLo = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 
0)); const __m128i srcColorHi = _mm_load_si128((__m128i *)(srcFramebuffer + ir + 4)); - _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), ConvertColor8888To6665(srcColorLo) ); - _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665(srcColorHi) ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), ColorspaceConvert8888To6665_SSE2(srcColorLo) ); + _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ColorspaceConvert8888To6665_SSE2(srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1079,8 +1080,8 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor #endif for (; x < pixCount; x++, ir++, iw++) { - dstFramebuffer[iw].color = ConvertColor8888To6665(srcFramebuffer[ir]); - dstRGBA5551[iw] = ConvertColor8888To5551(srcFramebuffer[ir]); + dstFramebuffer[iw].color = ColorspaceConvert8888To6665(srcFramebuffer[ir]); + dstRGBA5551[iw] = ColorspaceConvert8888To5551(srcFramebuffer[ir]); } } } @@ -1088,14 +1089,14 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - ConvertColorBuffer8888To6665((u32 *)srcFramebuffer + ir, (u32 *)dstFramebuffer + iw, pixCount); + ColorspaceConvertBuffer8888To6665((u32 *)srcFramebuffer + ir, (u32 *)dstFramebuffer + iw, pixCount); } } else { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer + ir, 
dstRGBA5551 + iw, pixCount); } } } @@ -1115,7 +1116,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 0), srcColorLo ); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi ); - _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551(srcColorLo, srcColorHi) ); + _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ColorspaceConvert8888To5551_SSE2(srcColorLo, srcColorHi) ); } #endif @@ -1125,7 +1126,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor for (; x < pixCount; x++, ir++, iw++) { dstFramebuffer[iw] = srcFramebuffer[ir]; - dstRGBA5551[iw] = ConvertColor8888To5551(srcFramebuffer[ir]); + dstRGBA5551[iw] = ColorspaceConvert8888To5551(srcFramebuffer[ir]); } } } @@ -1146,7 +1147,7 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor { for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, ir += this->_framebufferWidth, iw -= this->_framebufferWidth) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); + ColorspaceConvertBuffer8888To5551((u32 *)srcFramebuffer + ir, dstRGBA5551 + iw, pixCount); } } } diff --git a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index fa5bb43e7..d64c3c5f6 100644 --- a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -243,6 +243,8 @@ AB564915186E6F67002740F4 /* Image_Piano.png in Resources */ = {isa = PBXBuildFile; fileRef = AB56490B186E6F67002740F4 /* Image_Piano.png */; }; AB5785FD17176AFC002C5FC7 /* OpenEmuBase.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = AB5785FC17176AFC002C5FC7 /* OpenEmuBase.framework */; }; AB58F32D1364F44B0074C376 /* cocoa_file.mm in Sources */ = {isa = 
PBXBuildFile; fileRef = AB58F32C1364F44B0074C376 /* cocoa_file.mm */; }; + AB5FDDAC1D62C89E0094617C /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + AB5FDDAD1D62C8A00094617C /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; AB64987C13ECC73800EE7DD2 /* FileTypeInfo.plist in Resources */ = {isa = PBXBuildFile; fileRef = AB64987B13ECC73800EE7DD2 /* FileTypeInfo.plist */; }; AB68101B187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png in Resources */ = {isa = PBXBuildFile; fileRef = AB681013187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png */; }; AB68101C187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png in Resources */ = {isa = PBXBuildFile; fileRef = AB681013187D4AEF0049F2C2 /* Icon_GuitarGrip_Button_Blue_512x512.png */; }; @@ -974,6 +976,12 @@ ABB97878144E89CC00793FA3 /* Icon_DeSmuME_32x32.png in Resources */ = {isa = PBXBuildFile; fileRef = ABB97875144E89CC00793FA3 /* Icon_DeSmuME_32x32.png */; }; ABBC0F8D1394B1AA0028B6BD /* DefaultUserPrefs.plist in Resources */ = {isa = PBXBuildFile; fileRef = ABBC0F8C1394B1AA0028B6BD /* DefaultUserPrefs.plist */; }; ABBF04A514B515F300E505A0 /* AppIcon_ROMCheats.icns in Resources */ = {isa = PBXBuildFile; fileRef = ABBF04A414B515F300E505A0 /* AppIcon_ROMCheats.icns */; }; + ABBFFF851D6283C0003CD598 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + ABBFFF861D6283C1003CD598 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + ABBFFF871D6283C1003CD598 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */; }; + ABBFFF891D6283D2003CD598 /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 
ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; + ABBFFF8A1D6283D3003CD598 /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; + ABBFFF8B1D6283D3003CD598 /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */; }; ABC3AF2F14B7F06900D5B13D /* Icon_VolumeFull_16x16.png in Resources */ = {isa = PBXBuildFile; fileRef = ABC3AF2B14B7F06900D5B13D /* Icon_VolumeFull_16x16.png */; }; ABC3AF3014B7F06900D5B13D /* Icon_VolumeMute_16x16.png in Resources */ = {isa = PBXBuildFile; fileRef = ABC3AF2C14B7F06900D5B13D /* Icon_VolumeMute_16x16.png */; }; ABC3AF3114B7F06900D5B13D /* Icon_VolumeOneThird_16x16.png in Resources */ = {isa = PBXBuildFile; fileRef = ABC3AF2D14B7F06900D5B13D /* Icon_VolumeOneThird_16x16.png */; }; @@ -1534,6 +1542,14 @@ ABBB421516B4A5F30012E5AB /* OGLRender_3_2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = OGLRender_3_2.h; path = ../OGLRender_3_2.h; sourceTree = ""; }; ABBC0F8C1394B1AA0028B6BD /* DefaultUserPrefs.plist */ = {isa = PBXFileReference; lastKnownFileType = file.bplist; path = DefaultUserPrefs.plist; sourceTree = ""; }; ABBF04A414B515F300E505A0 /* AppIcon_ROMCheats.icns */ = {isa = PBXFileReference; lastKnownFileType = image.icns; path = AppIcon_ROMCheats.icns; sourceTree = ""; }; + ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler.cpp; sourceTree = ""; }; + ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler.h; sourceTree = ""; }; + ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_SSE2.cpp; 
sourceTree = ""; }; + ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_SSE2.h; sourceTree = ""; }; + ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX2.cpp; sourceTree = ""; }; + ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX2.h; sourceTree = ""; }; + ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AltiVec.cpp; sourceTree = ""; }; + ABBFFF821D611A36003CD598 /* colorspacehandler_AltiVec.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AltiVec.h; sourceTree = ""; }; ABC3AF2B14B7F06900D5B13D /* Icon_VolumeFull_16x16.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_VolumeFull_16x16.png; path = images/Icon_VolumeFull_16x16.png; sourceTree = ""; }; ABC3AF2C14B7F06900D5B13D /* Icon_VolumeMute_16x16.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_VolumeMute_16x16.png; path = images/Icon_VolumeMute_16x16.png; sourceTree = ""; }; ABC3AF2D14B7F06900D5B13D /* Icon_VolumeOneThird_16x16.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_VolumeOneThird_16x16.png; path = images/Icon_VolumeOneThird_16x16.png; sourceTree = ""; }; @@ -2507,6 +2523,21 @@ path = openemu; sourceTree = ""; }; + ABBFFF6E1D5F9C10003CD598 /* colorspacehandler */ = { + isa = PBXGroup; + children = ( + ABBFFF811D611A36003CD598 /* colorspacehandler_AltiVec.cpp */, + ABBFFF7B1D610457003CD598 /* colorspacehandler_AVX2.cpp */, + ABBFFF751D5FD2ED003CD598 /* colorspacehandler_SSE2.cpp */, + ABBFFF6F1D5F9C52003CD598 /* colorspacehandler.cpp */, + ABBFFF821D611A36003CD598 /* 
colorspacehandler_AltiVec.h */, + ABBFFF7C1D610457003CD598 /* colorspacehandler_AVX2.h */, + ABBFFF761D5FD2ED003CD598 /* colorspacehandler_SSE2.h */, + ABBFFF701D5F9C52003CD598 /* colorspacehandler.h */, + ); + path = colorspacehandler; + sourceTree = ""; + }; ABC2ECD613B1C87000FAAA2A /* Images */ = { isa = PBXGroup; children = ( @@ -2757,6 +2788,7 @@ ABD1FF211345ACBF00AF11D1 /* decrypt */, ABD1FF2E1345ACBF00AF11D1 /* libfat */, ABE670241415DE6C00E8E4C9 /* tinyxml */, + ABBFFF6E1D5F9C10003CD598 /* colorspacehandler */, ABD1FF1D1345ACBF00AF11D1 /* ConvertUTF.c */, AB9038A517C5ECFD00F410BD /* advanscene.cpp */, ABD1FF1F1345ACBF00AF11D1 /* datetime.cpp */, @@ -3768,6 +3800,7 @@ ABE6840D189E33BC007FD69C /* OGLDisplayOutput.cpp in Sources */, ABD1FF121345AC9C00AF11D1 /* slot2_none.cpp in Sources */, ABD1FF131345AC9C00AF11D1 /* slot2_paddle.cpp in Sources */, + ABBFFF8A1D6283D3003CD598 /* colorspacehandler_SSE2.cpp in Sources */, ABD1FF141345AC9C00AF11D1 /* slot2_piano.cpp in Sources */, ABD1FF151345AC9C00AF11D1 /* slot2_rumblepak.cpp in Sources */, ABD1041F1346652500AF11D1 /* sndOSX.cpp in Sources */, @@ -3862,6 +3895,7 @@ AB40565E169F5DBB0016AC3E /* virtualmemory.cpp in Sources */, AB405661169F5DBB0016AC3E /* zonememory.cpp in Sources */, AB405679169F5DCC0016AC3E /* x86assembler.cpp in Sources */, + ABBFFF861D6283C1003CD598 /* colorspacehandler.cpp in Sources */, AB40567C169F5DCC0016AC3E /* x86compiler.cpp in Sources */, ABFEA8A41BB4EC1100B08C25 /* sfnt.c in Sources */, ABA731691BB51FDC00B26147 /* type1cid.c in Sources */, @@ -4015,6 +4049,7 @@ AB796D4315CDCBA200C59155 /* version.cpp in Sources */, ABFEA82B1BB4EC1100B08C25 /* ftinit.c in Sources */, AB796D4415CDCBA200C59155 /* vfat.cpp in Sources */, + AB5FDDAC1D62C89E0094617C /* colorspacehandler.cpp in Sources */, AB796D4515CDCBA200C59155 /* videofilter.cpp in Sources */, AB796D4615CDCBA200C59155 /* WavFile.cpp in Sources */, AB796D4715CDCBA200C59155 /* wifi.cpp in Sources */, @@ -4094,6 +4129,7 @@ 
AB26D87C16B5253D00A2305C /* OGLRender_3_2.cpp in Sources */, AB3A655E16CC5421001F5D4A /* EmuControllerDelegate.mm in Sources */, AB3A656116CC5438001F5D4A /* cocoa_GPU.mm in Sources */, + AB5FDDAD1D62C8A00094617C /* colorspacehandler_SSE2.cpp in Sources */, AB8967D916D2ED0700F826F1 /* DisplayWindowController.mm in Sources */, AB29B33116D4BEBF000EF671 /* InputManager.mm in Sources */, AB8B7AAC17CE8C440051CEBF /* slot1comp_protocol.cpp in Sources */, @@ -4270,6 +4306,7 @@ AB2ABA401C9F9CFA00173B15 /* rsemaphore.c in Sources */, AB8F3CF01A53AC2600A80BF6 /* ringbuffer.cpp in Sources */, AB8F3CF11A53AC2600A80BF6 /* arm_jit.cpp in Sources */, + ABBFFF891D6283D2003CD598 /* colorspacehandler_SSE2.cpp in Sources */, AB8F3CF21A53AC2600A80BF6 /* troubleshootingWindowDelegate.mm in Sources */, AB8F3CF31A53AC2600A80BF6 /* assembler.cpp in Sources */, AB8F3CF41A53AC2600A80BF6 /* assert.cpp in Sources */, @@ -4293,6 +4330,7 @@ AB8F3D041A53AC2600A80BF6 /* virtualmemory.cpp in Sources */, AB8F3D051A53AC2600A80BF6 /* zonememory.cpp in Sources */, AB8F3D061A53AC2600A80BF6 /* x86assembler.cpp in Sources */, + ABBFFF851D6283C0003CD598 /* colorspacehandler.cpp in Sources */, AB8F3D071A53AC2600A80BF6 /* x86compiler.cpp in Sources */, AB8F3D081A53AC2600A80BF6 /* x86compilercontext.cpp in Sources */, AB8F3D091A53AC2600A80BF6 /* x86compilerfunc.cpp in Sources */, @@ -4365,6 +4403,7 @@ ABB3C6911501C04F00E0C22E /* SoundTouch.cpp in Sources */, ABB3C6921501C04F00E0C22E /* sse_optimized.cpp in Sources */, ABB3C6931501C04F00E0C22E /* TDStretch.cpp in Sources */, + ABBFFF871D6283C1003CD598 /* colorspacehandler.cpp in Sources */, ABB3C6941501C04F00E0C22E /* WavFile.cpp in Sources */, ABB3C6951501C04F00E0C22E /* metaspu.cpp in Sources */, ABB3C6961501C04F00E0C22E /* SndOut.cpp in Sources */, @@ -4434,6 +4473,7 @@ ABB3C6D11501C04F00E0C22E /* slot1.cpp in Sources */, ABB3C6D31501C04F00E0C22E /* SPU.cpp in Sources */, ABB3C6D41501C04F00E0C22E /* texcache.cpp in Sources */, + ABBFFF8B1D6283D3003CD598 /* 
colorspacehandler_SSE2.cpp in Sources */, AB9038BA17C5ED2200F410BD /* slot1comp_rom.cpp in Sources */, ABB3C6D51501C04F00E0C22E /* thumb_instructions.cpp in Sources */, AB2EE13317D57F5000F68622 /* fsnitro.cpp in Sources */, diff --git a/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj index 3d5d94cff..5ebd32aa8 100644 --- a/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj +++ b/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj @@ -740,6 +740,14 @@ AB2F56F11704C86900E28885 /* utilities.c in Sources */ = {isa = PBXBuildFile; fileRef = AB2F56EF1704C86900E28885 /* utilities.c */; }; AB2F56F21704C86900E28885 /* utilities.c in Sources */ = {isa = PBXBuildFile; fileRef = AB2F56EF1704C86900E28885 /* utilities.c */; }; AB2F56F31704C86900E28885 /* utilities.c in Sources */ = {isa = PBXBuildFile; fileRef = AB2F56EF1704C86900E28885 /* utilities.c */; }; + AB37E3741D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E3771D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */; }; + AB37E3781D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E37B1D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */; }; + AB37E37C1D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E37D1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */; }; + AB37E3801D6188BC004A2C0D /* colorspacehandler.cpp in Sources */ = 
{isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB37E38A1D61895F004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */; }; AB3ACB7814C2361100D7D192 /* appDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB3ACB6714C2361100D7D192 /* appDelegate.mm */; }; AB3ACB7914C2361100D7D192 /* cheatWindowDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB3ACB6914C2361100D7D192 /* cheatWindowDelegate.mm */; }; AB3ACB7C14C2361100D7D192 /* inputPrefsView.mm in Sources */ = {isa = PBXBuildFile; fileRef = AB3ACB6F14C2361100D7D192 /* inputPrefsView.mm */; }; @@ -1156,6 +1164,8 @@ AB73AA2E1507C9F500A310C8 /* OpenGL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABC570D4134431DA00E7B0B1 /* OpenGL.framework */; }; AB73AA2F1507C9F500A310C8 /* libz.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = AB0A0D1914AACA9600E83E91 /* libz.dylib */; }; AB75226F14C7BB51009B97B3 /* AppIcon_FirmwareConfig.icns in Resources */ = {isa = PBXBuildFile; fileRef = AB75226D14C7BB51009B97B3 /* AppIcon_FirmwareConfig.icns */; }; + AB7BB17F1D62C8CC00A7A6E2 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */; }; + AB7BB1801D62C8CF00A7A6E2 /* colorspacehandler_AltiVec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */; }; AB7DDA6D173DC38F004F3D07 /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABB6AD5C173A3F2B00EC2E8D /* Carbon.framework */; }; AB7DDA6E173DC399004F3D07 /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABB6AD5C173A3F2B00EC2E8D /* Carbon.framework */; }; AB7DDA6F173DC39E004F3D07 /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ABB6AD5C173A3F2B00EC2E8D /* Carbon.framework */; }; @@ -1835,6 +1845,12 @@ AB2F56EF1704C86900E28885 /* 
utilities.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = utilities.c; sourceTree = ""; }; AB350BA41478AC96007165AC /* IOKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = IOKit.framework; path = System/Library/Frameworks/IOKit.framework; sourceTree = SDKROOT; }; AB350D38147A1D8D007165AC /* English */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = English; path = translations/English.lproj/HID_usage_strings.plist; sourceTree = ""; }; + AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler.cpp; sourceTree = ""; }; + AB37E36D1D6188BC004A2C0D /* colorspacehandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler.h; sourceTree = ""; }; + AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AltiVec.cpp; sourceTree = ""; }; + AB37E36F1D6188BC004A2C0D /* colorspacehandler_AltiVec.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AltiVec.h; sourceTree = ""; }; + AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_SSE2.cpp; sourceTree = ""; }; + AB37E3731D6188BC004A2C0D /* colorspacehandler_SSE2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_SSE2.h; sourceTree = ""; }; AB3ACB6614C2361100D7D192 /* appDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = appDelegate.h; sourceTree = ""; }; AB3ACB6714C2361100D7D192 /* appDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.objcpp; path = appDelegate.mm; sourceTree = ""; }; AB3ACB6814C2361100D7D192 /* cheatWindowDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cheatWindowDelegate.h; sourceTree = ""; }; @@ -2893,6 +2909,19 @@ path = src; sourceTree = ""; }; + AB37E36B1D6188BC004A2C0D /* colorspacehandler */ = { + isa = PBXGroup; + children = ( + AB37E36C1D6188BC004A2C0D /* colorspacehandler.cpp */, + AB37E36D1D6188BC004A2C0D /* colorspacehandler.h */, + AB37E36E1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp */, + AB37E36F1D6188BC004A2C0D /* colorspacehandler_AltiVec.h */, + AB37E3721D6188BC004A2C0D /* colorspacehandler_SSE2.cpp */, + AB37E3731D6188BC004A2C0D /* colorspacehandler_SSE2.h */, + ); + path = colorspacehandler; + sourceTree = ""; + }; AB3ACB6514C2361100D7D192 /* userinterface */ = { isa = PBXGroup; children = ( @@ -3205,6 +3234,7 @@ isa = PBXGroup; children = ( ABBCE2A115ACB29100A2C965 /* AsmJit */, + AB37E36B1D6188BC004A2C0D /* colorspacehandler */, ABD1FF211345ACBF00AF11D1 /* decrypt */, ABD1FF2E1345ACBF00AF11D1 /* libfat */, ABE670241415DE6C00E8E4C9 /* tinyxml */, @@ -4506,6 +4536,8 @@ AB50200A1D09E712002FA150 /* file_path.c in Sources */, AB50200B1D09E712002FA150 /* retro_dirent.c in Sources */, AB50200C1D09E712002FA150 /* retro_stat.c in Sources */, + AB7BB17F1D62C8CC00A7A6E2 /* colorspacehandler.cpp in Sources */, + AB7BB1801D62C8CF00A7A6E2 /* colorspacehandler_AltiVec.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -4685,6 +4717,8 @@ AB5020161D09E712002FA150 /* file_path.c in Sources */, AB5020171D09E712002FA150 /* retro_dirent.c in Sources */, AB5020181D09E712002FA150 /* retro_stat.c in Sources */, + AB37E3801D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E38A1D61895F004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -4894,6 +4928,8 @@ AB50200D1D09E712002FA150 /* file_path.c in Sources */, 
AB50200E1D09E712002FA150 /* retro_dirent.c in Sources */, AB50200F1D09E712002FA150 /* retro_stat.c in Sources */, + AB37E3741D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E3771D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -5103,6 +5139,8 @@ AB5020101D09E712002FA150 /* file_path.c in Sources */, AB5020111D09E712002FA150 /* retro_dirent.c in Sources */, AB5020121D09E712002FA150 /* retro_stat.c in Sources */, + AB37E3781D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E37B1D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -5282,6 +5320,8 @@ AB5020131D09E712002FA150 /* file_path.c in Sources */, AB5020141D09E712002FA150 /* retro_dirent.c in Sources */, AB5020151D09E712002FA150 /* retro_stat.c in Sources */, + AB37E37C1D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, + AB37E37D1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/desmume/src/cocoa/cocoa_output.mm b/desmume/src/cocoa/cocoa_output.mm index cf75e67bd..384c61c9c 100644 --- a/desmume/src/cocoa/cocoa_output.mm +++ b/desmume/src/cocoa/cocoa_output.mm @@ -754,7 +754,7 @@ if (dispInfo.pixelBytes == 2) { - ConvertColorBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); + ColorspaceConvertBuffer555To8888Opaque((u16 *)displayBuffer, bitmapData, (w * h)); } else if (dispInfo.pixelBytes == 4) { diff --git a/desmume/src/cocoa/cocoa_rom.mm b/desmume/src/cocoa/cocoa_rom.mm index ff21c1fa4..2a9e8ebdc 100644 --- a/desmume/src/cocoa/cocoa_rom.mm +++ b/desmume/src/cocoa/cocoa_rom.mm @@ -692,7 +692,7 @@ void RomIconToRGBA8888(uint32_t *bitmapData) // // The first entry always represents the alpha, so we can just ignore it. 
clut[0] = 0x00000000; - ConvertColorBuffer555To8888Opaque((u16 *)iconClutPtr, &clut[1], 15); + ColorspaceConvertBuffer555To8888Opaque((u16 *)iconClutPtr, &clut[1], 15); // Load the image from the icon pixel data. // diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp index 476076335..049fa7887 100644 --- a/desmume/src/commandline.cpp +++ b/desmume/src/commandline.cpp @@ -66,6 +66,7 @@ CommandLine::CommandLine() , arm7_gdb_port(0) , start_paused(FALSE) , autodetect_method(-1) +, render3d(COMMANDLINE_RENDER3D_DEFAULT) { #ifndef HOST_WINDOWS disable_sound = 0; @@ -92,6 +93,8 @@ static const char* help_string = \ " --num-cores N Override numcores detection and use this many" ENDL " --spu-synch Use SPU synch (crackles; helps streams; default ON)" ENDL " --spu-method N Select SPU synch method: 0:N, 1:Z, 2:P; default 0" ENDL +" --3d-render [SW|AUTOGL|GL|OLDGL]" ENDL +" Select 3d renderer; default SW" ENDL #ifndef HOST_WINDOWS " --disable-sound Disables the sound output" ENDL " --disable-limiter Disables the 60fps limiter" ENDL @@ -154,6 +157,7 @@ ENDL #define OPT_NUMCORES 1 #define OPT_SPU_METHOD 2 +#define OPT_3D_RENDER 3 #define OPT_JIT_SIZE 100 #define OPT_CONSOLE_TYPE 200 @@ -183,6 +187,8 @@ ENDL bool CommandLine::parse(int argc,char **argv) { + std::string _render3d; + int opt_help = 0; int option_index = 0; for(;;) @@ -197,6 +203,7 @@ bool CommandLine::parse(int argc,char **argv) { "num-cores", required_argument, NULL, OPT_NUMCORES }, { "spu-synch", no_argument, &_spu_sync_mode, 1 }, { "spu-method", required_argument, NULL, OPT_SPU_METHOD }, + { "3d-render", required_argument, NULL, OPT_3D_RENDER }, #ifndef HOST_WINDOWS { "disable-sound", no_argument, &disable_sound, 1}, { "disable-limiter", no_argument, &disable_limiter, 1}, @@ -265,6 +272,7 @@ bool CommandLine::parse(int argc,char **argv) //user settings case OPT_NUMCORES: _num_cores = atoi(optarg); break; case OPT_SPU_METHOD: _spu_sync_method = atoi(optarg); break; + case OPT_3D_RENDER: 
_render3d = optarg; break; //sync settings case OPT_JIT_SIZE: _jit_size = atoi(optarg); break; @@ -343,6 +351,14 @@ bool CommandLine::parse(int argc,char **argv) CommonSettings.DebugConsole = true; } + //process 3d renderer + _render3d = strtoupper(_render3d); + if(_render3d == "NONE") render3d = COMMANDLINE_RENDER3D_NONE; + if(_render3d == "SW") render3d = COMMANDLINE_RENDER3D_SW; + if(_render3d == "OLDGL") render3d = COMMANDLINE_RENDER3D_OLDGL; + if(_render3d == "AUTOGL") render3d = COMMANDLINE_RENDER3D_AUTOGL; + if(_render3d == "GL") render3d = COMMANDLINE_RENDER3D_GL; + if (autodetect_method != -1) CommonSettings.autodetectBackupMethod = autodetect_method; diff --git a/desmume/src/commandline.h b/desmume/src/commandline.h index 40ab7bc34..b77239400 100644 --- a/desmume/src/commandline.h +++ b/desmume/src/commandline.h @@ -24,17 +24,29 @@ //hacky commandline options that i didnt want to route through commonoptions extern int _commandline_linux_nojoy; +#define COMMANDLINE_RENDER3D_DEFAULT 0 +#define COMMANDLINE_RENDER3D_NONE 1 +#define COMMANDLINE_RENDER3D_SW 2 +#define COMMANDLINE_RENDER3D_OLDGL 3 +#define COMMANDLINE_RENDER3D_GL 4 +#define COMMANDLINE_RENDER3D_AUTOGL 5 + //this class will also eventually try to take over the responsibility of using the args that it handles //for example: preparing the emulator run by loading the rom, savestate, and/or movie in the correct pattern. //it should also populate CommonSettings with its initial values +//EDIT: not really. combining this with what a frontend wants to do is complicated. 
+//you might design the API so that the frontend sets all those up, but I'm not sure I like that +//Really, this should be a passive structure that just collects the results provided by the shared command line processing, to be used later as appropriate +//(and the CommonSettings setup REMOVED or at least refactored into a separate method) class CommandLine { public: - //actual options: these may move to another sturct + //actual options: these may move to another struct int load_slot; int depth_threshold; int autodetect_method; + int render3d; std::string nds_file; std::string play_movie_file; std::string record_movie_file; diff --git a/desmume/src/frontend/modules/ImageOut.cpp b/desmume/src/frontend/modules/ImageOut.cpp index 16ff5473f..77d4aa294 100644 --- a/desmume/src/frontend/modules/ImageOut.cpp +++ b/desmume/src/frontend/modules/ImageOut.cpp @@ -1,65 +1,63 @@ -/* - Copyright (C) 2008-2015 DeSmuME team - - This file is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - This file is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with the this software. If not, see . 
-*/ - -#include -#include -#include "types.h" -#include "ImageOut.h" -#include "formats/rpng.h" -#include "formats/rbmp.h" -#include "GPU.h" - -static u8* Convert15To24(const u16* src, int width, int height) -{ - u8 *tmp_buffer; - u8 *tmp_inc; - tmp_inc = tmp_buffer = (u8 *)malloc(width * height * 3); - - for(int y=0;y(*src++); - *tmp_inc++ = dst&0xFF; - *tmp_inc++ = (dst>>8)&0xFF; - *tmp_inc++ = (dst>>16)&0xFF; - } - } - return tmp_buffer; -} - -int NDS_WritePNG_15bpp(int width, int height, const u16 *data, const char *filename) -{ - u8* tmp = Convert15To24(data,width,height); - bool ok = rpng_save_image_bgr24(filename,tmp,width,height,width*3); - free(tmp); - return ok?1:0; -} - -int NDS_WriteBMP_15bpp(int width, int height, const u16 *data, const char *filename) -{ - u8* tmp = Convert15To24(data,width,height); - bool ok = rbmp_save_image(filename,tmp,width,height,width*3,RBMP_SOURCE_TYPE_BGR24); - free(tmp); - return ok?1:0; -} - -int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char *filename) -{ - bool ok = rbmp_save_image(filename,buf,width,height,width*4,RBMP_SOURCE_TYPE_ARGB8888); - return ok?1:0; +/* + Copyright (C) 2008-2015 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . 
+*/ + +#include +#include +#include "types.h" +#include "ImageOut.h" +#include "formats/rpng.h" +#include "formats/rbmp.h" +#include "GPU.h" + +static u8* Convert15To24(const u16* src, int width, int height) +{ + u8 *tmp_buffer; + u8 *tmp_inc; + tmp_inc = tmp_buffer = (u8 *)malloc(width * height * 3); + + for (int i = 0; i < width*height; i++) + { + u32 dst = ColorspaceConvert555To8888Opaque(*src++); + *tmp_inc++ = dst & 0xFF; + *tmp_inc++ = (dst >> 8) & 0xFF; + *tmp_inc++ = (dst >> 16) & 0xFF; + } + + return tmp_buffer; +} + +int NDS_WritePNG_15bpp(int width, int height, const u16 *data, const char *filename) +{ + u8* tmp = Convert15To24(data,width,height); + bool ok = rpng_save_image_bgr24(filename,tmp,width,height,width*3); + free(tmp); + return ok?1:0; +} + +int NDS_WriteBMP_15bpp(int width, int height, const u16 *data, const char *filename) +{ + u8* tmp = Convert15To24(data,width,height); + bool ok = rbmp_save_image(filename,tmp,width,height,width*3,RBMP_SOURCE_TYPE_BGR24); + free(tmp); + return ok?1:0; +} + +int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char *filename) +{ + bool ok = rbmp_save_image(filename,buf,width,height,width*4,RBMP_SOURCE_TYPE_ARGB8888); + return ok?1:0; } \ No newline at end of file diff --git a/desmume/src/frontend/windows/DeSmuME.vcxproj b/desmume/src/frontend/windows/DeSmuME.vcxproj index a3956a7c1..53d2c00d8 100644 --- a/desmume/src/frontend/windows/DeSmuME.vcxproj +++ b/desmume/src/frontend/windows/DeSmuME.vcxproj @@ -56,174 +56,174 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -315,7 +315,7 @@ - + @@ -341,158 +341,160 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -600,10 +602,10 @@ - - - - + + + + diff --git a/desmume/src/frontend/windows/DeSmuME.vcxproj.filters b/desmume/src/frontend/windows/DeSmuME.vcxproj.filters index f551f4dc0..0a0742ced 100644 --- a/desmume/src/frontend/windows/DeSmuME.vcxproj.filters +++ b/desmume/src/frontend/windows/DeSmuME.vcxproj.filters @@ -121,207 +121,207 @@ {18cba3ce-aaa6-441d-8111-408d0fcef7d2} - - {37e11fb9-8dec-43bd-8242-a721d69a740f} + + {db5dc512-2b75-4476-8cac-75fd4acfd85f} - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core\addons - + Core\addons - + Core\addons - + Core\addons - + Core\addons - + Core\gdbstub - + Core\utils - + Core\utils - + Core\utils - + Core\utils - + Core\utils - + Core\utils\decrypt - + Core\utils\decrypt - + Core\utils\decrypt - + Core\metaspu - + Core\metaspu - + Core\metaspu - + Core\metaspu\win32 - + Core\metaspu\SoundTouch - + 
Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch @@ -360,6 +360,9 @@ Windows + + Windows + Windows @@ -426,241 +429,244 @@ Windows\tools - + Core\addons - + Core\utils - + Core\addons - + Core\addons - + Core\addons - + Core\utils - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + + Core\utils\libfat + + Core - + Core\addons - + Core\utils - + Core\utils - + Core\addons - + Core\utils\tinyxml - + Core\utils\tinyxml - + Core\utils\tinyxml - + Core\utils\tinyxml - + Core\filter - + Core\filter - + Core\filter - + Core\filter - + Core\filter - + Core\filter - + Core\filter Windows - + Core Windows - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core Windows - + Core - + Core\addons - + Core\addons - + Core\addons - + Core\addons - + Core\utils - + Core\addons - + Core\addons - + Core\utils Windows\tools - + Core\addons - + Core\addons - + Core\filter @@ 
-891,276 +897,276 @@ Windows\File_Extractor\unrar - + Core\libretro-common\rthreads - + Core\frontend\modules - + Core\libretro-common\formats\png - + Core\libretro-common\formats\png - + Core\libretro-common\formats\bmp - + Core\libretro-common\file\nbio - + Core\libretro-common\compat - + Core\libretro-common\compat - + Core\libretro-common\compat - + Core\libretro-common\compat - + Core\libretro-common\compat - + Core\libretro-common\file - + Core\libretro-common\file - + Core\libretro-common\file - + Core\libretro-common\hash - + Core\libretro-common\rthreads - + Core\libretro-common\compat - + Core\libretro-common\include\streams - + Core\libretro-common\include\streams - + Core\libretro-common\file - + Core\libretro-common\lists - + Core\libretro-common\lists - + Core\libretro-common\lists - + Core\libretro-common\file - - Core\utils\libfat + + Core\utils\colorspacehandler - - Core\libretro-common\features - - - Core - - - Windows + + Core\utils\colorspacehandler - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + Core - + + Core + + + Core + + Core\gdbstub - + Core\utils - + Core\utils - + Core\utils - + Core\utils - + Core\utils - + Core\utils\decrypt - + Core\utils\decrypt - + Core\utils\decrypt - + Core\metaspu - + Core\metaspu - + Core\metaspu\win32 - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch - + Core\metaspu\SoundTouch @@ -1265,247 +1271,247 @@ Windows\tools - + Core\utils - + Core\utils\libfat - + Core\utils\libfat - + 
Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils\libfat - + Core\utils - + Core\utils - + Core\utils - + Core\utils\tinyxml - + Core\utils\tinyxml - + Core\filter - + Core\filter - + Core\filter - + Core\filter Windows - + Core - + Core - + Core Windows - + Core\utils\AsmJit - + Core\utils\AsmJit - + Core\utils\AsmJit - + Core\utils\AsmJit - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\core - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core\utils\AsmJit\x86 - + Core Windows - + Core - + Core\addons - + Core\addons - + Core\utils - + Core\addons - + Core\utils Windows\tools - + Core - + Core\filter @@ -1709,51 +1715,51 @@ Windows\File_Extractor\unrar - + Core\libretro-common\include\rthreads - + Core\libretro-common\include - + Core\libretro-common\include - + Core\libretro-common\include - + Core\frontend\modules - + + Core\libretro-common\formats\png + + Core\libretro-common\include\formats - + 
Core\libretro-common\include\formats - + Core\libretro-common\include\formats - + Core\libretro-common\include\compat - + Core\libretro-common\include\compat - - Core + + Core\utils\colorspacehandler - - Core\utils - - - Core\utils + + Core\utils\colorspacehandler - + Core - + Core @@ -1762,10 +1768,10 @@ Windows\resources - + Core\filter - + Core\utils\AsmJit diff --git a/desmume/src/frontend/windows/aviout.cpp b/desmume/src/frontend/windows/aviout.cpp index 0e9c9a2db..5bcb86e51 100644 --- a/desmume/src/frontend/windows/aviout.cpp +++ b/desmume/src/frontend/windows/aviout.cpp @@ -316,13 +316,14 @@ static void do_video_conversion(AVIFile* avi, const u16* buffer) int height = avi->prescaleLevel*384; u8* outbuf = avi_file->convert_buffer + width*(height-1)*3; - for(int y=0;y(*buffer++); - *(u32 *)outbuf = (dst & 0x00FFFFFF) | (*(u32 *)outbuf & 0xFF000000); - outbuf += 3; + u32 dst = ColorspaceConvert555To8888Opaque(*buffer++); + *outbuf++ = dst & 0xFF; + *outbuf++ = (dst >> 8) & 0xFF; + *outbuf++ = (dst >> 16) & 0xFF; } outbuf -= width*3*2; diff --git a/desmume/src/frontend/windows/desmume.props b/desmume/src/frontend/windows/desmume.props index 8e778e61e..d5862129e 100644 --- a/desmume/src/frontend/windows/desmume.props +++ b/desmume/src/frontend/windows/desmume.props @@ -94,7 +94,7 @@ _DEBUG;%(PreprocessorDefinitions) RELEASE;NDEBUG;%(PreprocessorDefinitions) - RELEASE;NDEBUG;%(PreprocessorDefinitions) + FASTBUILD;RELEASE;NDEBUG;%(PreprocessorDefinitions) diff --git a/desmume/src/frontend/windows/main.cpp b/desmume/src/frontend/windows/main.cpp index 0040c7c4d..04c7f01cd 100644 --- a/desmume/src/frontend/windows/main.cpp +++ b/desmume/src/frontend/windows/main.cpp @@ -1919,7 +1919,7 @@ static void DoDisplay(bool firstTime) //convert pixel format to 32bpp for compositing //why do we do this over and over? well, we are compositing to //filteredbuffer32bpp, and it needs to get refreshed each frame. 
- ConvertColorBuffer555To8888Opaque((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); + ColorspaceConvertBuffer555To8888Opaque((u16 *)video.srcBuffer, video.buffer, video.srcBufferSize / sizeof(u16)); if(firstTime) { @@ -3282,6 +3282,13 @@ int _main() cur3DCore = GPU3D_NULL; else if(cur3DCore == GPU3D_NULL) // this value shouldn't be saved anymore cur3DCore = GPU3D_DEFAULT; + + if(cmdline.render3d == COMMANDLINE_RENDER3D_NONE) cur3DCore = GPU3D_NULL; + if(cmdline.render3d == COMMANDLINE_RENDER3D_SW) cur3DCore = GPU3D_SWRAST; + if(cmdline.render3d == COMMANDLINE_RENDER3D_OLDGL) cur3DCore = GPU3D_OPENGL_OLD; + if(cmdline.render3d == COMMANDLINE_RENDER3D_GL) cur3DCore = GPU3D_OPENGL_3_2; //no way of forcing it, at least not right now. I dont care. + if(cmdline.render3d == COMMANDLINE_RENDER3D_AUTOGL) cur3DCore = GPU3D_OPENGL_3_2; //this will fallback i guess + CommonSettings.GFX3D_HighResolutionInterpolateColor = GetPrivateProfileBool("3D", "HighResolutionInterpolateColor", 1, IniName); CommonSettings.GFX3D_EdgeMark = GetPrivateProfileBool("3D", "EnableEdgeMark", 1, IniName); CommonSettings.GFX3D_Fog = GetPrivateProfileBool("3D", "EnableFog", 1, IniName); diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 9ec2fd43b..a1afe371d 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -526,7 +526,7 @@ void gfx3d_deinit() void gfx3d_reset() { - CurrentRenderer->RenderFinish(); + GPU->ForceRender3DFinishAndFlush(false); #ifdef _SHOW_VTX_COUNTERS max_polys = max_verts = 0; @@ -627,6 +627,53 @@ FORCEINLINE s32 vec3dot_fixed32(const s32* a, const s32* b) { return sfx32_shiftdown(fx32_mul(a[0],b[0]) + fx32_mul(a[1],b[1]) + fx32_mul(a[2],b[2])); } +//--------------- +//I'm going to start name these functions GE for GEOMETRY ENGINE MATH. +//Pretty much any math function in this file should be explicit about how it's handling precision. +//Handling that stuff generically globally is not a winning proposition. 
+ +FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b) +{ +#ifdef _MSC_VER + return __emul(a,b); +#else + return ((s64)a)*((s64)b); +#endif +} + +static s32 GEM_SaturateAndShiftdown36To32(const s64 val) +{ + if(val>(s64)0x000007FFFFFFFFFFULL) return (s32)0x7FFFFFFFU; + if(val<(s64)0xFFFFF80000000000ULL) return (s32)0x80000000U; + + return fx32_shiftdown(val); +} + +static void GEM_TransformVertex(const s32 *matrix, s32 *vecPtr) +{ + const s32 x = vecPtr[0]; + const s32 y = vecPtr[1]; + const s32 z = vecPtr[2]; + const s32 w = vecPtr[3]; + + //saturation logic is most carefully tested by: + //+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen + //You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like + //but if it's done wrongly, you can get bugs in: + //+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA??????? 
+ + //other test cases that cropped up during this development, but are probably not actually related to this after all + //+ SM64: outside castle skybox + //+ NSMB: mario head screen wipe + + vecPtr[0] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[0]) + GEM_Mul32x32To64(y,matrix[4]) + GEM_Mul32x32To64(z,matrix [8]) + GEM_Mul32x32To64(w,matrix[12])); + vecPtr[1] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[1]) + GEM_Mul32x32To64(y,matrix[5]) + GEM_Mul32x32To64(z,matrix[ 9]) + GEM_Mul32x32To64(w,matrix[13])); + vecPtr[2] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[2]) + GEM_Mul32x32To64(y,matrix[6]) + GEM_Mul32x32To64(z,matrix[10]) + GEM_Mul32x32To64(w,matrix[14])); + vecPtr[3] = GEM_SaturateAndShiftdown36To32(GEM_Mul32x32To64(x,matrix[3]) + GEM_Mul32x32To64(y,matrix[7]) + GEM_Mul32x32To64(z,matrix[11]) + GEM_Mul32x32To64(w,matrix[15])); +} +//--------------- + + #define SUBMITVERTEX(ii, nn) polylist->list[polylist->count].vertIndexes[ii] = tempVertInfo.map[nn]; //Submit a vertex to the GE static void SetVertex() @@ -658,16 +705,9 @@ static void SetVertex() return; if(polylist->count >= POLYLIST_SIZE) return; - - //TODO - think about keeping the clip matrix concatenated, - //so that we only have to multiply one matrix here - //(we could lazy cache the concatenated clip matrix and only generate it - //when we need to) - MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed); - //printf("%f %f %f\n",s16coord[0]/4096.0f,s16coord[1]/4096.0f,s16coord[2]/4096.0f); - //printf("x %f %f %f %f\n",mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f); - //printf(" = %f %f %f %f\n",coordTransformed[0]/4096.0f,coordTransformed[1]/4096.0f,coordTransformed[2]/4096.0f,coordTransformed[3]/4096.0f); + GEM_TransformVertex(mtxCurrent[1],coordTransformed); //modelview + GEM_TransformVertex(mtxCurrent[0],coordTransformed); //projection //TODO - culling should be done here. //TODO - viewport transform? 
@@ -1484,8 +1524,9 @@ static void gfx3d_glViewPort(u32 v) static BOOL gfx3d_glBoxTest(u32 v) { //printf("boxtest\n"); - MMU_new.gxstat.tr = 0; // clear boxtest bit - MMU_new.gxstat.tb = 1; // busy + + //clear result flag. busy flag has been set by fifo component already + MMU_new.gxstat.tr = 0; BTcoords[BTind++] = v & 0xFFFF; BTcoords[BTind++] = v >> 16; @@ -1493,9 +1534,11 @@ static BOOL gfx3d_glBoxTest(u32 v) if (BTind < 5) return FALSE; BTind = 0; - MMU_new.gxstat.tb = 0; // clear busy GFX_DELAY(103); + //now that we're executing this, we're not busy anymore + MMU_new.gxstat.tb = 0; + #if 0 INFO("BoxTEST: x %f y %f width %f height %f depth %f\n", BTcoords[0], BTcoords[1], BTcoords[2], BTcoords[3], BTcoords[4], BTcoords[5]); @@ -1608,27 +1651,31 @@ static BOOL gfx3d_glBoxTest(u32 v) //if any portion of this poly was retained, then the test passes. if (boxtestClipper.clippedPolyCounter > 0) { - //printf("%06d PASS %d\n",boxcounter,gxFIFO.size); + //printf("%06d PASS %d\n",gxFIFO.size, i); MMU_new.gxstat.tr = 1; break; } + else + { + } + + //if(i==5) printf("%06d FAIL\n",gxFIFO.size); } - if (MMU_new.gxstat.tr == 0) - { - //printf("%06d FAIL %d\n",boxcounter,gxFIFO.size); - } - + //printf("%06d RESULT %d\n",gxFIFO.size, MMU_new.gxstat.tr); + return TRUE; } static BOOL gfx3d_glPosTest(u32 v) { - //printf("postest\n"); //this is apparently tested by transformers decepticons and ultimate spiderman - //printf("POSTEST\n"); - MMU_new.gxstat.tb = 1; + //clear result flag. 
busy flag has been set by fifo component already + MMU_new.gxstat.tr = 0; + + //now that we're executing this, we're not busy anymore + MMU_new.gxstat.tb = 0; PTcoords[PTind++] = float16table[v & 0xFFFF]; PTcoords[PTind++] = float16table[v >> 16]; @@ -2252,23 +2299,12 @@ void gfx3d_VBlankSignal() void gfx3d_VBlankEndSignal(bool skipFrame) { + GPU->ForceRender3DFinishAndFlush(false); + if (!drawPending) return; if (skipFrame) return; - - drawPending = FALSE; - if (CurrentRenderer->GetRenderNeedsFinish()) - { - bool need3DDisplayFramebuffer; - bool need3DCaptureFramebuffer; - CurrentRenderer->GetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); - - CurrentRenderer->SetFramebufferFlushStates(false, false); - CurrentRenderer->RenderFinish(); - CurrentRenderer->SetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); - CurrentRenderer->SetRenderNeedsFinish(false); - GPU->GetEventHandler()->DidRender3DEnd(); - } + drawPending = FALSE; GPU->GetEventHandler()->DidRender3DBegin(); @@ -2486,7 +2522,7 @@ void gfx3d_Update3DFramebuffers(FragmentColor *framebufferRGBA6665, u16 *framebu //-------------savestate void gfx3d_savestate(EMUFILE* os) { - CurrentRenderer->RenderFinish(); + GPU->ForceRender3DFinishAndFlush(true); //version write32le(4,os); diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index 330747c69..168525ef4 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -427,8 +427,3 @@ void MatrixTranslate(s32 *matrix, const s32 *ptr) }); } -void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr) -{ - MatrixMultVec4x4(matrix+16,vecPtr); - MatrixMultVec4x4(matrix,vecPtr); -} diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index 8aa87c2fe..d060a4d38 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -276,13 +276,6 @@ FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr) _mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr))); } 
-FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr) -{ - //there are hardly any gains from merging these manually - MatrixMultVec4x4(matrix+16,vecPtr); - MatrixMultVec4x4(matrix,vecPtr); -} - FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr) { const __m128 vec = _mm_load_ps(vecPtr); @@ -355,13 +348,6 @@ void MatrixMultiply(float * matrix, const float * rightMatrix); void MatrixTranslate(float *matrix, const float *ptr); void MatrixScale(float * matrix, const float * ptr); -FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr) -{ - //there are hardly any gains from merging these manually - MatrixMultVec4x4(matrix+16,vecPtr); - MatrixMultVec4x4(matrix,vecPtr); -} - template FORCEINLINE void vector_fix2float(float* matrix, const float divisor) { @@ -373,8 +359,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor) void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr); -void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr); - void MatrixMultiply(s32* matrix, const s32* rightMatrix); void MatrixScale(s32 *matrix, const s32 *ptr); void MatrixTranslate(s32 *matrix, const s32 *ptr); diff --git a/desmume/src/mc.cpp b/desmume/src/mc.cpp index dbf97f297..6f52c75e4 100644 --- a/desmume/src/mc.cpp +++ b/desmume/src/mc.cpp @@ -619,6 +619,21 @@ void BackupDevice::reset() ensure((u32)savesize); //expand properly if necessary addr_size = addr_size_for_old_save_type(savetype); } + + //automatically detect these hardcodes + if(state == DETECTING) + { + if(!memcmp(gameInfo.header.gameCode,"ASMK", 4)) addr_size = 1; //super mario 64 ds (KOR, which is different somehow) + else if(!memcmp(gameInfo.header.gameCode,"ASM", 3)) addr_size = 2; //super mario 64 ds + else if(!memcmp(gameInfo.header.gameCode,"BDE", 3)) addr_size = 2; // Dementium II + else if(!memcmp(gameInfo.header.gameCode,"AL3", 3)) addr_size = 1; //spongebob atlantis squarepantis. 
+ else if(!memcmp(gameInfo.header.gameCode,"AH5", 3)) addr_size = 1; //over the hedge + else if(!memcmp(gameInfo.header.gameCode,"AVH", 3)) addr_size = 1; //over the hedge - Hammy Goes Nuts! + else if(!memcmp(gameInfo.header.gameCode,"AQ3", 3)) addr_size = 1; //spider-man 3 + + //if we found a whitelist match, we dont need to run detection + if(addr_size) state = RUNNING; + } } void BackupDevice::close_rom() @@ -662,36 +677,33 @@ void BackupDevice::detect() addr_size = 1; //choose 1 just to keep the busted savefile from growing too big msgbox->error("Catastrophic error while autodetecting save type.\nIt will need to be specified manually\n"); break; + case 2: //the modern typical case for small eeproms addr_size = 1; break; + case 3: //another modern typical case.. //but unfortunately we select this case on accident sometimes when what it meant to do was present the archaic 1+2 case //(the archaic 1+2 case is: specifying one address byte, and then reading the first two bytes, instead of the first one byte, as most other games would do.) //so, we're gonna hack in checks for the games that are doing this addr_size = 2; - - // TODO: will study a deep, why this happens (wrong detect size) - if(!memcmp(gameInfo.header.gameCode,"AL3", 3)) addr_size = 1; //spongebob atlantis squarepantis. - if(!memcmp(gameInfo.header.gameCode,"AH5", 3)) addr_size = 1; //over the hedge - if(!memcmp(gameInfo.header.gameCode,"AVH", 3)) addr_size = 1; //over the hedge - Hammy Goes Nuts! - if(!memcmp(gameInfo.header.gameCode,"AQ3", 3)) addr_size = 1; //spider-man 3 - break; + case 4: //a modern typical case addr_size = 3; - if(!memcmp(gameInfo.header.gameCode,"ASM", 3)) addr_size = 2; //super mario 64 ds + break; default: //the archaic case: write the address and then some modulo-4 number of bytes //why modulo 4? who knows. 
- //SM64 (KOR) makes it here with autodetect_size=11 and nothing interesting in the buffer addr_size = autodetect_size & 3; - if(!memcmp(gameInfo.header.gameCode,"BDE", 3)) addr_size = 2; // Dementium II + //SM64 (KOR) makes it here with autodetect_size=11 and nothing interesting in the buffer + //we whitelisted it earlier though + break; } diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 725a1536b..a730530c6 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -604,11 +604,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if ( (this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev) ) { - ConvertColorBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); + ColorspaceConvertBuffer8888To6665((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else if ( (this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev) ) { - ConvertColorBuffer6665To8888((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); + ColorspaceConvertBuffer6665To8888((u32 *)srcFramebuffer, (u32 *)dstFramebuffer, pixCount); } else if ( ((this->_internalRenderingFormat == NDSColorFormat_BGR666_Rev) && (this->_outputFormat == NDSColorFormat_BGR666_Rev)) || ((this->_internalRenderingFormat == NDSColorFormat_BGR888_Rev) && (this->_outputFormat == NDSColorFormat_BGR888_Rev)) ) @@ -621,11 +621,11 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram { if (this->_outputFormat == NDSColorFormat_BGR666_Rev) { - ConvertColorBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer6665To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); } else if (this ->_outputFormat == NDSColorFormat_BGR888_Rev) { - ConvertColorBuffer8888To5551((u32 *)srcFramebuffer, dstRGBA5551, pixCount); + ColorspaceConvertBuffer8888To5551((u32 
*)srcFramebuffer, dstRGBA5551, pixCount); } } diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 712ec7dae..f7931e23b 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -30,6 +30,10 @@ #include "MMU.h" #include "NDSSystem.h" +#ifdef ENABLE_SSE2 +#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" +#endif + using std::min; using std::max; @@ -451,13 +455,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } // Set converted colors to 0 if the palette index is 0. 
@@ -517,13 +521,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); @@ -580,13 +584,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } // Set converted colors to 0 if the palette index is 0. 
@@ -646,13 +650,13 @@ public: if (TEXFORMAT == TexFormat_15bpp) { - ConvertColor555To6665Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To6665Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } else { - ConvertColor555To8888Opaque(palColor0, convertedColor[0], convertedColor[1]); - ConvertColor555To8888Opaque(palColor1, convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); @@ -881,11 +885,11 @@ public: tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ConvertColor555To6665(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ConvertColor555To6665(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); } else { @@ -895,11 +899,11 @@ public: tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ConvertColor555To8888(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); tmpAlpha[1] = 
_mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ConvertColor555To8888(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); } _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); diff --git a/desmume/src/types.h b/desmume/src/types.h index 192fc943d..22a575256 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -19,10 +19,6 @@ #ifndef TYPES_HPP #define TYPES_HPP -#include -#include -#include - //analyze microsoft compilers #ifdef _MSC_VER #define HOST_WINDOWS @@ -80,6 +76,18 @@ #ifdef __SSE4_2__ #define ENABLE_SSE4_2 #endif + + #ifdef __AVX__ + #define ENABLE_AVX + #endif + + #ifdef __AVX2__ + #define ENABLE_AVX2 + #endif + + #ifdef __ALTIVEC__ + #define ENABLE_ALTIVEC + #endif #endif #ifdef _MSC_VER @@ -148,6 +156,14 @@ #define _CDECL_ #endif +#ifndef INLINE + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define INLINE _inline + #else + #define INLINE inline + #endif +#endif + #ifndef FORCEINLINE #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #define FORCEINLINE __forceinline @@ -219,6 +235,38 @@ typedef u32 uint32; #define uint32 u32 //uint32 is defined in Leopard somewhere, avoid conflicts #endif +#ifdef ENABLE_ALTIVEC + #ifndef __APPLE_ALTIVEC__ + #include + #endif +typedef vector unsigned char v128u8; +typedef vector signed char v128s8; +typedef vector unsigned short v128u16; +typedef vector signed short v128s16; +typedef vector unsigned int v128u32; +typedef vector signed int v128s32; +#endif + +#ifdef ENABLE_SSE2 +#include +typedef __m128i v128u8; +typedef __m128i v128s8; +typedef __m128i v128u16; +typedef __m128i v128s16; +typedef __m128i v128u32; +typedef __m128i v128s32; +#endif + +#ifdef ENABLE_AVX2 +#include +typedef __m256i v256u8; +typedef __m256i v256s8; +typedef __m256i v256u16; +typedef __m256i v256s16; +typedef __m256i v256u32; +typedef __m256i v256s32; +#endif + /*---------- GPU3D 
fixed-points types -----------*/ typedef s32 f32; @@ -266,8 +314,20 @@ typedef int desmume_BOOL; #define FALSE 0 #endif +#ifdef __BIG_ENDIAN__ +#ifndef WORDS_BIGENDIAN +#define WORDS_BIGENDIAN +#endif +#endif + +#ifdef WORDS_BIGENDIAN +# define LOCAL_BE 1 +#else +# define LOCAL_LE 1 +#endif + /* little endian (ds' endianess) to local endianess convert macros */ -#ifdef MSB_FIRST /* local arch is big endian */ +#ifdef LOCAL_BE /* local arch is big endian */ # define LE_TO_LOCAL_16(x) ((((x)&0xff)<<8)|(((x)>>8)&0xff)) # define LE_TO_LOCAL_32(x) ((((x)&0xff)<<24)|(((x)&0xff00)<<8)|(((x)>>8)&0xff00)|(((x)>>24)&0xff)) # define LE_TO_LOCAL_64(x) ((((x)&0xff)<<56)|(((x)&0xff00)<<40)|(((x)&0xff0000)<<24)|(((x)&0xff000000)<<8)|(((x)>>8)&0xff000000)|(((x)>>24)&0xff0000)|(((x)>>40)&0xff00)|(((x)>>56)&0xff)) @@ -287,6 +347,8 @@ typedef int desmume_BOOL; #define MB(x) ((x)*1024*1024) #define KB(x) ((x)*1024) +#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) + #define CPU_STR(c) ((c==ARM9)?"ARM9":"ARM7") typedef enum { @@ -294,6 +356,28 @@ typedef enum ARM7 = 1 } cpu_id_t; +///endian-flips count bytes. count should be even and nonzero. +inline void FlipByteOrder(u8 *src, u32 count) +{ + u8 *start=src; + u8 *end=src+count-1; + + if((count&1) || !count) return; /* This shouldn't happen. 
*/ + + while(count--) + { + u8 tmp; + + tmp=*end; + *end=*start; + *start=tmp; + end--; + start++; + } +} + + + inline u64 double_to_u64(double d) { union { u64 a; @@ -312,6 +396,68 @@ inline double u64_to_double(u64 u) { return fuxor.b; } +inline u32 float_to_u32(float f) { + union { + u32 a; + float b; + } fuxor; + fuxor.b = f; + return fuxor.a; +} + +inline float u32_to_float(u32 u) { + union { + u32 a; + float b; + } fuxor; + fuxor.a = u; + return fuxor.b; +} + + +///stores a 32bit value into the provided byte array in guaranteed little endian form +inline void en32lsb(u8 *buf, u32 morp) +{ + buf[0]=(u8)(morp); + buf[1]=(u8)(morp>>8); + buf[2]=(u8)(morp>>16); + buf[3]=(u8)(morp>>24); +} + +inline void en16lsb(u8* buf, u16 morp) +{ + buf[0]=(u8)morp; + buf[1]=(u8)(morp>>8); +} + +///unpacks a 64bit little endian value from the provided byte array into host byte order +inline u64 de64lsb(u8 *morp) +{ + return morp[0]|(morp[1]<<8)|(morp[2]<<16)|(morp[3]<<24)|((u64)morp[4]<<32)|((u64)morp[5]<<40)|((u64)morp[6]<<48)|((u64)morp[7]<<56); +} + +///unpacks a 32bit little endian value from the provided byte array into host byte order +inline u32 de32lsb(u8 *morp) +{ + return morp[0]|(morp[1]<<8)|(morp[2]<<16)|(morp[3]<<24); +} + +///unpacks a 16bit little endian value from the provided byte array into host byte order +inline u16 de16lsb(u8 *morp) +{ + return morp[0]|(morp[1]<<8); +} + +#ifndef ARRAY_SIZE +//taken from winnt.h +extern "C++" // templates cannot be declared to have 'C' linkage +template +char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N]; + +#define ARRAY_SIZE(A) (sizeof(*BLAHBLAHBLAH(A))) +#endif + + //fairly standard for loop macros #define MACRODO1(TRICK,TODO) { const size_t X = TRICK; TODO; } #define MACRODO2(X,TODO) { MACRODO1((X),TODO) MACRODO1(((X)+1),TODO) } @@ -385,30 +531,37 @@ template inline void reconstruct(T* t) { new(t) T(); } -/* fixed point speedup macros */ +//-------------fixed point speedup macros -FORCEINLINE s32 sfx32_shiftdown(const s64 a) 
+#ifdef _MSC_VER +#include +#endif + +FORCEINLINE s64 fx32_mul(const s32 a, const s32 b) { - s64 shifted = fx32_shiftdown(a); +#ifdef _MSC_VER + return __emul(a,b); +#else + return ((s64)a)*((s64)b); +#endif +} - /*either matrix math is happening at higher precision (an extra bit would suffice, - * I think), or the sums sent to this are saturated. - * - *tested by: spectrobes beyond the portals excavation blower - *(it sets very large +x,+y in the modelview matrix to push things offscreen, - *but the +y will overflow and become negative if we're not careful) - * - *I didnt think very hard about what would be fastest here on 32bit systems - *NOTE: this was intended for use in MatrixMultVec4x4_M2; it may not be appropriate for - * other uses of fx32_shiftdown. - *if this causes problems we should refactor the math routines a bit to take care of - * saturating in another function - */ - if(shifted>(s32)0x7FFFFFFF) - return 0x7FFFFFFF; - if(shifted<=(s32)0x80000000) - return 0x80000000; - return shifted; +FORCEINLINE s32 fx32_shiftdown(const s64 a) +{ +#ifdef _MSC_VER + return (s32)__ll_rshift(a,12); +#else + return (s32)(a>>12); +#endif +} + +FORCEINLINE s64 fx32_shiftup(const s32 a) +{ +#ifdef _MSC_VER + return __ll_lshift(a,12); +#else + return ((s64)a)<<12; +#endif } #endif diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp new file mode 100644 index 000000000..d0757d7cc --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -0,0 +1,776 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
+ + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . +*/ + +#include "colorspacehandler.h" + +#if defined(ENABLE_AVX2) + #include "colorspacehandler_AVX2.h" +#elif defined(ENABLE_SSE2) + #include "colorspacehandler_SSE2.h" +#elif defined(ENABLE_ALTIVEC) + #include "colorspacehandler_AltiVec.h" +#endif + +#if defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) + #define USEVECTORSIZE_128 +#endif + +#if defined(ENABLE_AVX2) + #define USEVECTORSIZE_256 +#endif + +// By default, the hand-coded vectorized code will be used instead of a compiler's built-in +// autovectorization (if supported). However, if USEMANUALVECTORIZATION is not defined, then +// the compiler will use autovectorization (if supported). +#if defined(USEVECTORSIZE_128) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_512) + // Comment out USEMANUALVECTORIZATION to disable the hand-coded vectorized code. + #define USEMANUALVECTORIZATION +#endif + +#ifdef USEMANUALVECTORIZATION + #if defined(ENABLE_AVX2) + static const ColorspaceHandler_AVX2 csh; + #elif defined(ENABLE_SSE2) + static const ColorspaceHandler_SSE2 csh; + #elif defined(ENABLE_ALTIVEC) + static const ColorspaceHandler_AltiVec csh; + #else + static const ColorspaceHandler csh; + #endif +#else + static const ColorspaceHandler csh; +#endif + +CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; +CACHE_ALIGN u32 color_555_to_666[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; +CACHE_ALIGN u32 color_555_to_888[32768]; + +//is this a crazy idea? 
this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX +CACHE_ALIGN const u32 material_5bit_to_31bit[] = { + 0x00000000, 0x04210842, 0x08421084, 0x0C6318C6, + 0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE, + 0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6, + 0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE, + 0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7, + 0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF, + 0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7, + 0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF +}; + +// 5-bit to 6-bit conversions use this formula -- dst = (src == 0) ? 0 : (2*src) + 1 +// Reference GBATEK: http://problemkaputt.de/gbatek.htm#ds3dtextureblending +CACHE_ALIGN const u8 material_5bit_to_6bit[] = { + 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, + 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, + 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, + 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F +}; + +CACHE_ALIGN const u8 material_5bit_to_8bit[] = { + 0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39, + 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B, + 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD, + 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF +}; + +CACHE_ALIGN const u8 material_6bit_to_8bit[] = { + 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, + 0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C, + 0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D, + 0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D, + 0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E, + 0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE, + 0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF, + 0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF +}; + +CACHE_ALIGN const u8 material_3bit_to_8bit[] = { + 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF +}; + +//maybe not very precise +CACHE_ALIGN const u8 material_3bit_to_5bit[] = { + 0, 4, 8, 13, 17, 22, 26, 31 +}; + +//TODO - generate this in the static init method more accurately +CACHE_ALIGN const u8 material_3bit_to_6bit[] = { + 0, 8, 16, 
26, 34, 44, 52, 63 +}; + +void ColorspaceHandlerInit() +{ + static bool needInitTables = true; + + if (needInitTables) + { +#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) +#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) +#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) +#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) + + for (size_t i = 0; i < 32768; i++) + { + color_555_to_666[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) ); + color_555_to_6665_opaque[i] = LE_TO_LOCAL_32( RGB15TO18_BITLOGIC(i) | 0x1F000000 ); + color_555_to_6665_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000 ); + + color_555_to_888[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) ); + color_555_to_8888_opaque[i] = LE_TO_LOCAL_32( RGB15TO24_BITLOGIC(i) | 0xFF000000 ); + color_555_to_8888_opaque_swap_rb[i] = LE_TO_LOCAL_32( RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000 ); + } + } +} + +template +FORCEINLINE u32 ColorspaceConvert555To8888Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); +} + +template +FORCEINLINE u32 ColorspaceConvert555To6665Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); +} + +template +FORCEINLINE u32 ColorspaceConvert8888To6665(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; + outColor.g = srcColor.g >> 2; + outColor.b = ((SWAP_RB) ? 
srcColor.r : srcColor.b) >> 2; + outColor.a = srcColor.a >> 3; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert8888To6665(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert8888To6665(srcColorComponent); +} + +template +FORCEINLINE u32 ColorspaceConvert6665To8888(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)]; + outColor.g = material_6bit_to_8bit[srcColor.g]; + outColor.b = material_6bit_to_8bit[((SWAP_RB) ? srcColor.r : srcColor.b)]; + outColor.a = material_5bit_to_8bit[srcColor.a]; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert6665To8888(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert6665To8888(srcColorComponent); +} + +template +FORCEINLINE u16 ColorspaceConvert8888To5551(FragmentColor srcColor) +{ + return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 0x0000 : 0x8000 ); +} + +template +FORCEINLINE u16 ColorspaceConvert8888To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert8888To5551(srcColorComponent); +} + +template +FORCEINLINE u16 ColorspaceConvert6665To5551(FragmentColor srcColor) +{ + return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 
0x0000 : 0x8000); +} + +template +FORCEINLINE u16 ColorspaceConvert6665To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert6665To5551(srcColorComponent); +} + +template +void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To8888Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To8888Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To8888Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + } +} + +template +void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To6665Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if 
(IS_UNALIGNED) + { + i = csh.ConvertBuffer555To6665Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To6665Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + } +} + +template +void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To6665_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To6665_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To6665_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To6665(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } +} + +template +void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 4); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 16); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To8888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To8888_SwapRB(src, dst, pixCountVector); + } + 
} + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To8888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To8888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } +} + +template +void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To5551_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To5551_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To5551_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To5551(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } +} + +template +void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + +#if defined(USEVECTORSIZE_128) + const size_t pixCountVector = pixCount - (pixCount % 8); +#elif defined(USEVECTORSIZE_256) + const size_t pixCountVector = pixCount - (pixCount % 16); +#elif defined(USEVECTORSIZE_512) + const size_t pixCountVector = pixCount - (pixCount % 32); +#endif + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To5551_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = 
csh.ConvertBuffer6665To5551_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To5551_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To5551(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE + +#endif // USEMANUALVECTORIZATION + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To8888Opaque(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + } + + return i; +} + +size_t 
ColorspaceHandler::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To6665Opaque(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To6665(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To6665_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To8888(src, dst, pixCount); +} + +size_t 
ColorspaceHandler::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To8888_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To5551(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To5551_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (;i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To5551(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, 
size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(src, dst, pixCount); +} + +template u32 ColorspaceConvert555To8888Opaque(const u16 src); +template u32 ColorspaceConvert555To8888Opaque(const u16 src); + +template u32 ColorspaceConvert555To6665Opaque(const u16 src); +template u32 ColorspaceConvert555To6665Opaque(const u16 src); + +template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); +template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); + +template u32 ColorspaceConvert8888To6665(u32 srcColor); +template u32 ColorspaceConvert8888To6665(u32 srcColor); + +template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); +template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); + +template u32 ColorspaceConvert6665To8888(u32 srcColor); +template u32 ColorspaceConvert6665To8888(u32 srcColor); + +template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); +template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); + +template u16 ColorspaceConvert8888To5551(u32 srcColor); +template u16 ColorspaceConvert8888To5551(u32 srcColor); + +template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); +template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); + +template u16 ColorspaceConvert6665To5551(u32 srcColor); +template u16 ColorspaceConvert6665To5551(u32 srcColor); + +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void 
ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, 
size_t pixCount); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h new file mode 100644 index 000000000..362e975ea --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h @@ -0,0 +1,194 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . +*/ + +#ifndef COLORSPACEHANDLER_H +#define COLORSPACEHANDLER_H + +#include "types.h" +#include +#include + + +enum NDSColorFormat +{ + // The color format information is packed in a 32-bit value. + // The bits are as follows: + // FFFOOOOO AAAAAABB BBBBGGGG GGRRRRRR + // + // F = Flags (see below) + // O = Color order (see below) + // A = Bit count for alpha [0-63] + // B = Bit count for blue [0-63] + // G = Bit count for green [0-63] + // R = Bit count for red [0-63] + // + // Flags: + // Bit 29: Reverse order flag. + // Set = Bits are in reverse order, usually for little-endian usage. + // Cleared = Bits are in normal order, usually for big-endian usage. 
+ // + // Color order bits, 24-28: + // 0x00 = RGBA, common format + // 0x01 = RGAB + // 0x02 = RBGA + // 0x03 = RBAG + // 0x04 = RAGB + // 0x05 = RABG + // 0x06 = GRBA + // 0x07 = GRAB + // 0x08 = GBRA + // 0x09 = GBAR + // 0x0A = GARB + // 0x0B = GABR + // 0x0C = BRGA + // 0x0D = BRAG + // 0x0E = BGRA, common format + // 0x0F = BGAR + // 0x10 = BARG + // 0x11 = BAGR + // 0x12 = ARGB + // 0x13 = ARBG + // 0x14 = AGRB + // 0x15 = AGBR + // 0x16 = ABRG + // 0x17 = ABGR + + // Color formats used for internal processing. + //NDSColorFormat_ABGR1555_Rev = 0x20045145, + //NDSColorFormat_ABGR5666_Rev = 0x20186186, + //NDSColorFormat_ABGR8888_Rev = 0x20208208, + + // Color formats used by the output framebuffers. + NDSColorFormat_BGR555_Rev = 0x20005145, + NDSColorFormat_BGR666_Rev = 0x20006186, + NDSColorFormat_BGR888_Rev = 0x20008208 +}; + +union FragmentColor +{ + u32 color; + struct + { + u8 r,g,b,a; + }; +}; + +extern CACHE_ALIGN const u32 material_5bit_to_31bit[32]; +extern CACHE_ALIGN const u8 material_5bit_to_6bit[32]; +extern CACHE_ALIGN const u8 material_5bit_to_8bit[32]; +extern CACHE_ALIGN const u8 material_6bit_to_8bit[64]; +extern CACHE_ALIGN const u8 material_3bit_to_5bit[8]; +extern CACHE_ALIGN const u8 material_3bit_to_6bit[8]; +extern CACHE_ALIGN const u8 material_3bit_to_8bit[8]; + +extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; +extern CACHE_ALIGN u32 color_555_to_666[32768]; +extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; +extern CACHE_ALIGN u32 color_555_to_888[32768]; + +#define COLOR555TO6665_OPAQUE(col) (color_555_to_6665_opaque[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color +#define COLOR555TO6665_OPAQUE_SWAP_RB(col) (color_555_to_6665_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an 
RGBA6665 color with R and B components swapped +#define COLOR555TO666(col) (color_555_to_666[(col)]) // Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color + +#ifdef LOCAL_LE + #define COLOR555TO6665(col,alpha5) (((alpha5)<<24) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, little-endian +#else + #define COLOR555TO6665(col,alpha5) ((alpha5) | color_555_to_666[(col)]) // Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha, big-endian +#endif + +#define COLOR555TO8888_OPAQUE(col) (color_555_to_8888_opaque[(col)]) // Convert a 15-bit color to an opaque 32-bit color +#define COLOR555TO8888_OPAQUE_SWAP_RB(col) (color_555_to_8888_opaque_swap_rb[(col)]) // Convert a 15-bit color to an opaque 32-bit color with R and B components swapped +#define COLOR555TO888(col) (color_555_to_888[(col)]) // Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color + +#ifdef LOCAL_LE + #define COLOR555TO8888(col,alpha8) (((alpha8)<<24) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, little-endian +#else + #define COLOR555TO8888(col,alpha8) ((alpha8) | color_555_to_888[(col)]) // Convert a 15-bit color to a 32-bit color with user-defined alpha, big-endian +#endif + +//produce a 15bpp color from individual 5bit components +#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) ) + +//produce a 16bpp color from individual 5bit components +#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) ) + +void ColorspaceHandlerInit(); + +template u32 ColorspaceConvert555To8888Opaque(const u16 src); +template u32 ColorspaceConvert555To6665Opaque(const u16 src); +template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); +template u32 ColorspaceConvert8888To6665(u32 srcColor); +template u32 
ColorspaceConvert6665To8888(FragmentColor srcColor); +template u32 ColorspaceConvert6665To8888(u32 srcColor); +template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); +template u16 ColorspaceConvert8888To5551(u32 srcColor); +template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); +template u16 ColorspaceConvert6665To5551(u32 srcColor); + +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +class ColorspaceHandler +{ +public: + ColorspaceHandler() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 
*__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) +{ + FragmentColor ret; + ret.r = r; ret.g = g; ret.b = b; ret.a = a; + return ret; +} + +#endif /* COLORSPACEHANDLER_H */ diff --git 
a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp new file mode 100644 index 000000000..6682bea12 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp @@ -0,0 +1,491 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_AVX2.h" + +#ifndef ENABLE_AVX2 + #error This code requires AVX2 support. +#else + +#include + +template +FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) +{ + v256u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); + dstLo = (SWAP_RB) ? 
_mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); + dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x00F800F8) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00070707)) ); + dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); + + src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); + dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 19), _mm256_srli_epi32(src32, 7)) : _mm256_or_si256(_mm256_slli_epi32(src32, 3), _mm256_slli_epi32(src32, 9)); + dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x00F800F8) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 6), _mm256_set1_epi32(0x0000F800)) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00070707)) ); + dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi) +{ + v256u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + src32 = _mm256_unpacklo_epi16(srcColor, _mm256_setzero_si256()); + dstLo = (SWAP_RB) ? 
_mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); + dstLo = _mm256_and_si256( dstLo, _mm256_set1_epi32(0x003E003E) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); + dstLo = _mm256_or_si256( dstLo, _mm256_and_si256(_mm256_srli_epi32(dstLo, 5), _mm256_set1_epi32(0x00010101)) ); + dstLo = _mm256_or_si256( dstLo, srcAlphaBits32Lo ); + + src32 = _mm256_unpackhi_epi16(srcColor, _mm256_setzero_si256()); + dstHi = (SWAP_RB) ? _mm256_or_si256(_mm256_slli_epi32(src32, 17), _mm256_srli_epi32(src32, 9)) : _mm256_or_si256(_mm256_slli_epi32(src32, 1), _mm256_slli_epi32(src32, 7)); + dstHi = _mm256_and_si256( dstHi, _mm256_set1_epi32(0x003E003E) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_slli_epi32(src32, 4), _mm256_set1_epi32(0x00003E00)) ); + dstHi = _mm256_or_si256( dstHi, _mm256_and_si256(_mm256_srli_epi32(dstHi, 5), _mm256_set1_epi32(0x00010101)) ); + dstHi = _mm256_or_si256( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0xFF000000); + ColorspaceConvert555To8888_AVX2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + const v256u32 srcAlphaBits32 = _mm256_set1_epi32(0x1F000000); + ColorspaceConvert555To6665_AVX2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v256u32 rgb; + const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), 
_mm256_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { + rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); + rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + else + { + rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); + } + + return _mm256_or_si256(rgb, a); +} + +template +FORCEINLINE v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v256u32 rgb = _mm256_or_si256( _mm256_and_si256(_mm256_slli_epi32(src, 2), _mm256_set1_epi32(0x00FCFCFC)), _mm256_and_si256(_mm256_srli_epi32(src, 4), _mm256_set1_epi32(0x00030303)) ); + const v256u32 a = _mm256_or_si256( _mm256_and_si256(_mm256_slli_epi32(src, 3), _mm256_set1_epi32(0xF8000000)), _mm256_and_si256(_mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { + rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm256_or_si256(rgb, a); +} + +template +FORCEINLINE v256u16 _ConvertColorBaseTo5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v256u32 rgbLo; + v256u32 rgbHi; + v256u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 17), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_slli_epi32(srcLo, 9), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi 
= _mm256_and_si256(_mm256_srli_epi32(srcHi, 17), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_slli_epi32(srcHi, 9), _mm256_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 1), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 7), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 1), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 7), _mm256_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm256_packs_epi32( _mm256_and_si256(_mm256_srli_epi32(srcLo, 24), _mm256_set1_epi32(0x0000001F)), _mm256_and_si256(_mm256_srli_epi32(srcHi, 24), _mm256_set1_epi32(0x0000001F)) ); + alpha = _mm256_cmpgt_epi16(alpha, _mm256_setzero_si256()); + alpha = _mm256_and_si256(alpha, _mm256_set1_epi16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 19), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_slli_epi32(srcLo, 7), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 19), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 6), 
_mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_slli_epi32(srcHi, 7), _mm256_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 3), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 9), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 3), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 9), _mm256_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm256_packs_epi32( _mm256_srli_epi32(srcLo, 24), _mm256_srli_epi32(srcHi, 24) ); + alpha = _mm256_cmpgt_epi16(alpha, _mm256_setzero_si256()); + alpha = _mm256_and_si256(alpha, _mm256_set1_epi16(0x8000)); + } + + return _mm256_or_si256(_mm256_packs_epi32(rgbLo, rgbHi), alpha); +} + +template +FORCEINLINE v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); +} + +template +FORCEINLINE v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + v256u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_AVX2(src_vec256, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + v256u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_AVX2(src_vec256, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i+0), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+8), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=8) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), ColorspaceConvert8888To6665_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert8888To6665_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=8) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), 
ColorspaceConvert6665To8888_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert6665To8888_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AVX2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=16) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+8))) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+8))) ); + } + } + + return i; +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AVX2::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); + +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); + +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 
&srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +#endif // ENABLE_AVX2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h new file mode 100644 index 000000000..730bf730f --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.h @@ -0,0 +1,74 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_AVX2_H +#define COLORSPACEHANDLER_AVX2_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_AVX2 + #warning This header requires AVX2 support. 
+#else + +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u32 &srcAlphaBits32Lo, const v256u32 &srcAlphaBits32Hi, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +class ColorspaceHandler_AVX2 : public ColorspaceHandler +{ +public: + ColorspaceHandler_AVX2() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t 
ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +#endif // ENABLE_AVX2 + +#endif /* COLORSPACEHANDLER_AVX2_H */ diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp new file mode 100644 index 000000000..b4b39f751 --- /dev/null +++ 
b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -0,0 +1,345 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_AltiVec.h" + +#ifndef ENABLE_ALTIVEC + #error This code requires PowerPC AltiVec support. +#else + +template +FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + dstLo = vec_unpackl((vector pixel)srcColor); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){3,3,3,0, 3,3,3,0, 3,3,3,0, 3,3,3,0})), vec_sr((v128u8)dstLo, ((v128u8){2,2,2,0, 2,2,2,0, 2,2,2,0, 2,2,2,0})) ); + dstLo = vec_sel(dstLo, srcAlphaBits32Lo, vec_splat_u32(0xFF000000)); + + dstHi = vec_unpackh((vector pixel)srcColor); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){3,3,3,0, 3,3,3,0, 3,3,3,0, 3,3,3,0})), vec_sr((v128u8)dstHi, ((v128u8){2,2,2,0, 2,2,2,0, 2,2,2,0, 2,2,2,0})) ); + dstHi = vec_sel(dstHi, srcAlphaBits32Hi, vec_splat_u32(0xFF000000)); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 
1) | ((srcRGB5 >> 4) & 0x01) + dstLo = vec_unpackl((vector pixel)srcColor); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){1,1,1,0, 1,1,1,0, 1,1,1,0, 1,1,1,0})), vec_sr((v128u8)dstLo, ((v128u8){4,4,4,0, 4,4,4,0, 4,4,4,0, 4,4,4,0})) ); + dstLo = vec_sel(dstLo, srcAlphaBits32Lo, vec_splat_u32(0xFF000000)); + + dstHi = vec_unpackh((vector pixel)srcColor); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){1,1,1,0, 1,1,1,0, 1,1,1,0, 1,1,1,0})), vec_sr((v128u8)dstHi, ((v128u8){4,4,4,0, 4,4,4,0, 4,4,4,0, 4,4,4,0})) ); + dstHi = vec_sel(dstHi, srcAlphaBits32Hi, vec_splat_u32(0xFF000000)); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = {0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000}; + ColorspaceConvert555To8888_AltiVec(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = {0x1F000000, 0x1F000000, 0x1F000000, 0x1F000000}; + ColorspaceConvert555To6665_AltiVec(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v128u8 rgba = vec_sr( (v128u8)src, ((v128u8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3}) ); + + if (SWAP_RB) + { + rgba = vec_perm( rgba, rgba, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + } + + return (v128u32)rgba; +} + +template +FORCEINLINE v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u8 rgba = vec_or( 
vec_sl((v128u8)src, ((v128u8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3})), vec_sr((v128u8)src, ((v128u8){4,4,4,2, 4,4,4,2, 4,4,4,2, 4,4,4,2})) ); + + if (SWAP_RB) + { + rgba = vec_perm( rgba, rgba, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + } + + return (v128u32)rgba; +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + + v128u16 dstColor; + v128u16 dstAlpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + // Convert alpha + dstAlpha = vec_packsu( vec_and(vec_sr(srcLo, vec_splat_u32(24)), vec_splat_u32(0x0000001F)), vec_and(vec_sr(srcHi, vec_splat_u32(24)), vec_splat_u32(0x0000001F)) ); + dstAlpha = vec_cmpgt(dstAlpha, vec_splat_u16(0)); + dstAlpha = vec_and(dstAlpha, vec_splat_u16(0x8000)); + + // Convert RGB + if (SWAP_RB) + { + rgbLo = vec_perm( srcLo, srcLo, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + rgbHi = vec_perm( srcHi, srcHi, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + + rgbLo = vec_sl( rgbLo, vec_splat_u32(2) ); + rgbHi = vec_sl( rgbHi, vec_splat_u32(2) ); + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + } + else + { + rgbLo = vec_sl( srcLo, vec_splat_u32(2) ); + rgbHi = vec_sl( srcHi, vec_splat_u32(2) ); + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + } + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + // Convert alpha + dstAlpha = vec_packsu( vec_sr(srcLo, vec_splat_u32(24)), vec_sr(srcHi, vec_splat_u32(24)) ); + dstAlpha = vec_cmpgt(dstAlpha, vec_splat_u16(0)); + dstAlpha = vec_and(dstAlpha, vec_splat_u16(0x8000)); + + // Convert RGB + if (SWAP_RB) + { + rgbLo = vec_perm( srcLo, srcLo, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + rgbHi = vec_perm( srcHi, srcHi, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + } + else + { + dstColor = 
(v128u16)vec_packpx(srcLo, srcHi); + } + } + + dstColor = vec_and(dstColor, vec_splat_u16(0x7FFF)); + return vec_or(dstColor, dstAlpha); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u32 dstConvertedLo, dstConvertedHi; + + ColorspaceConvert555To8888Opaque_AltiVec( vec_ld(0, src+i), dstConvertedLo, dstConvertedHi ); + vec_st(dstConvertedHi, 0, dst+i); + vec_st(dstConvertedLo, 16, dst+i); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u32 dstConvertedLo, dstConvertedHi; + + ColorspaceConvert555To6665Opaque_AltiVec( vec_ld(0, src+i), dstConvertedLo, dstConvertedHi ); + vec_st(dstConvertedHi, 0, dst+i); + vec_st(dstConvertedLo, 16, dst+i); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert8888To6665_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert6665To8888_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AltiVec(const u32 *__restrict src, u16 
*__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceConvert8888To5551_AltiVec(vec_ld(0, src+i), vec_ld(16, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AltiVec(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceConvert6665To5551_AltiVec(vec_ld(0, src+i), vec_ld(16, src+i)), 0, dst+i ); + } + + return i; +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AltiVec(src, dst, pixCount); +} + 
+size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void 
ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +#endif // ENABLE_ALTIVEC diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h new file mode 100644 index 000000000..d26e05eba --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AltiVec.h @@ -0,0 +1,64 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . 
+ */ + +#ifndef COLORSPACEHANDLER_ALTIVEC_H +#define COLORSPACEHANDLER_ALTIVEC_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_ALTIVEC + #warning This header requires PowerPC AltiVec support. +#else + +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +// AltiVec has very poor support for dealing with unaligned addresses (it's possible, just +// very obtuse), so we're not even going to bother dealing with any unaligned addresses. 
+class ColorspaceHandler_AltiVec : public ColorspaceHandler +{ +public: + ColorspaceHandler_AltiVec() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +#endif // ENABLE_ALTIVEC + +#endif /* COLORSPACEHANDLER_ALTIVEC_H */ diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp new file mode 100644 index 000000000..fb4ada420 --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp @@ -0,0 +1,503 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later 
version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_SSE2.h" + +#ifndef ENABLE_SSE2 + #error This code requires SSE2 support. +#else + +#include + +#ifdef ENABLE_SSSE3 +#include +#endif + +template +FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + v128u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); + dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); + dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x00F800F8) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00070707)) ); + dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); + + src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); + dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 19), _mm_srli_epi32(src32, 7)) : _mm_or_si128(_mm_slli_epi32(src32, 3), _mm_slli_epi32(src32, 9)); + dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x00F800F8) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 6), _mm_set1_epi32(0x0000F800)) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00070707)) ); + dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi) +{ + v128u32 src32; + + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + src32 = _mm_unpacklo_epi16(srcColor, _mm_setzero_si128()); + dstLo = (SWAP_RB) ? _mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); + dstLo = _mm_and_si128( dstLo, _mm_set1_epi32(0x003E003E) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); + dstLo = _mm_or_si128( dstLo, _mm_and_si128(_mm_srli_epi32(dstLo, 5), _mm_set1_epi32(0x00010101)) ); + dstLo = _mm_or_si128( dstLo, srcAlphaBits32Lo ); + + src32 = _mm_unpackhi_epi16(srcColor, _mm_setzero_si128()); + dstHi = (SWAP_RB) ? 
_mm_or_si128(_mm_slli_epi32(src32, 17), _mm_srli_epi32(src32, 9)) : _mm_or_si128(_mm_slli_epi32(src32, 1), _mm_slli_epi32(src32, 7)); + dstHi = _mm_and_si128( dstHi, _mm_set1_epi32(0x003E003E) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_slli_epi32(src32, 4), _mm_set1_epi32(0x00003E00)) ); + dstHi = _mm_or_si128( dstHi, _mm_and_si128(_mm_srli_epi32(dstHi, 5), _mm_set1_epi32(0x00010101)) ); + dstHi = _mm_or_si128( dstHi, srcAlphaBits32Hi ); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = _mm_set1_epi32(0xFF000000); + ColorspaceConvert555To8888_SSE2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u32 srcAlphaBits32 = _mm_set1_epi32(0x1F000000); + ColorspaceConvert555To6665_SSE2(srcColor, srcAlphaBits32, srcAlphaBits32, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3) + v128u32 rgb; + const v128u32 a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); + rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); +#else + rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FC0000)), 18), _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x0000FC00)), 2), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FC)), 14)) ); // select the top 6 bits of each 8-bit channel before moving it, giving the same result as the SSSE3 path +#endif + } + else + { + rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); + } + + return _mm_or_si128(rgb, a); +} + +template +FORCEINLINE v128u32 
ColorspaceConvert6665To8888_SSE2(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u32 rgb = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00FCFCFC)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00030303)) ); + const v128u32 a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0xF8000000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); +#else + rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(rgb, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x000000FF)), 16)) ); // swap the already-expanded 8-bit channels in rgb, not the raw 6-bit src, giving the same result as the SSSE3 path +#endif + } + + return _mm_or_si128(rgb, a); +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + v128u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 17), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 17), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = 
_mm_and_si128(_mm_srli_epi32(srcLo, 1), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 1), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x0000001F)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x0000001F)) ); + alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); + alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 19), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 19), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 3), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = 
_mm_and_si128(_mm_srli_epi32(srcHi, 3), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm_packs_epi32( _mm_srli_epi32(srcLo, 24), _mm_srli_epi32(srcHi, 24) ); + alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); + alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); + } + + return _mm_or_si128(_mm_packs_epi32(rgbLo, rgbHi), alpha); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + v128u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_SSE2(src_vec128, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + else + { + _mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + v128u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_SSE2(src_vec128, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + else + { + _mm_store_si128((v128u32 *)(dst+i+0), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+4), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert8888To6665_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert8888To6665_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert6665To8888_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert6665To8888_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_SSE2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), ColorspaceConvert8888To5551_SSE2(_mm_loadu_si128((v128u32 *)(src+i)), _mm_loadu_si128((v128u32 *)(src+i+4))) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), ColorspaceConvert8888To5551_SSE2(_mm_load_si128((v128u32 *)(src+i)), _mm_load_si128((v128u32 *)(src+i+4))) ); + } + } + + return i; +} + +template +size_t 
ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), ColorspaceConvert6665To5551_SSE2(_mm_loadu_si128((v128u32 *)(src+i)), _mm_loadu_si128((v128u32 *)(src+i+4))) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), ColorspaceConvert6665To5551_SSE2(_mm_load_si128((v128u32 *)(src+i)), _mm_load_si128((v128u32 *)(src+i+4))) ); + } + } + + return i; +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t 
pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + 
return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const 
v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +#endif // ENABLE_SSE2 diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h new file mode 100644 index 000000000..5b44577ea --- /dev/null +++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.h @@ -0,0 +1,74 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. 
+ + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_SSE2_H +#define COLORSPACEHANDLER_SSE2_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_SSE2 + #warning This header requires SSE2 support. +#else + +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u32 &srcAlphaBits32Lo, const v128u32 &srcAlphaBits32Hi, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +class ColorspaceHandler_SSE2 : public ColorspaceHandler +{ +public: + ColorspaceHandler_SSE2() {}; + + size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict 
dst, size_t pixCount) const; + + size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict 
src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; +}; + +#endif // ENABLE_SSE2 + +#endif /* COLORSPACEHANDLER_SSE2_H */ diff --git a/desmume/src/version.cpp b/desmume/src/version.cpp index e97b71b4b..acc10b9a0 100644 --- a/desmume/src/version.cpp +++ b/desmume/src/version.cpp @@ -59,44 +59,41 @@ #define DESMUME_PLATFORM_STRING "" #endif -#define DESMUME_SSE_STRING "" -#define DESMUME_AVX_STRING "" +#define DESMUME_CPUEXT_PRIMARY_STRING "" +#define DESMUME_CPUEXT_SECONDARY_STRING "" -#ifdef ENABLE_SSE - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE" -#endif -#ifdef ENABLE_SSE2 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE2" -#endif -#ifdef ENABLE_SSE3 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE3" -#endif -#ifdef ENABLE_SSSE3 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSSE3" -#endif -#ifdef ENABLE_SSE4_1 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE4.1" -#endif -#ifdef ENABLE_SSE4_2 - #undef DESMUME_SSE_STRING - #define DESMUME_SSE_STRING " SSE4.2" -#endif -#ifdef ENABLE_AVX - #undef DESMUME_AVX_STRING - #define DESMUME_AVX_STRING "+AVX" -#endif -#ifdef ENABLE_AVX2 - #undef DESMUME_AVX_STRING - #define DESMUME_AVX_STRING "+AVX2" +#if defined(ENABLE_SSE4_2) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.2" +#elif defined(ENABLE_SSE4_1) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE4.1" +#elif defined(ENABLE_SSSE3) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSSE3" +#elif defined(ENABLE_SSE3) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE3" +#elif defined(ENABLE_SSE2) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE2" +#elif defined(ENABLE_SSE) + #undef 
DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " SSE" +#elif defined(ENABLE_ALTIVEC) + #undef DESMUME_CPUEXT_PRIMARY_STRING + #define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec" #endif -#define DESMUME_CPUEXT_STRING DESMUME_SSE_STRING DESMUME_AVX_STRING +#if defined(ENABLE_AVX2) + #undef DESMUME_CPUEXT_SECONDARY_STRING + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX2" +#elif defined(ENABLE_AVX) + #undef DESMUME_CPUEXT_SECONDARY_STRING + #define DESMUME_CPUEXT_SECONDARY_STRING "+AVX" +#endif +#define DESMUME_CPUEXT_STRING DESMUME_CPUEXT_PRIMARY_STRING DESMUME_CPUEXT_SECONDARY_STRING #ifdef DEVELOPER #define DESMUME_FEATURE_STRING " dev+"