From 84a007451af936583660fc5b5f524deb52b64f5f Mon Sep 17 00:00:00 2001 From: rogerman Date: Thu, 18 Aug 2016 04:14:45 +0000 Subject: [PATCH 01/41] Colorspace Handler: - Generic color conversion functions are now inlined at the header in order to keep up performance for compilers without LTO. --- .../colorspacehandler/colorspacehandler.cpp | 114 ------------------ .../colorspacehandler/colorspacehandler.h | 93 ++++++++++++-- 2 files changed, 83 insertions(+), 124 deletions(-) diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp index d0757d7cc..f6b1cf995 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.cpp +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.cpp @@ -139,90 +139,6 @@ void ColorspaceHandlerInit() } } -template -FORCEINLINE u32 ColorspaceConvert555To8888Opaque(const u16 src) -{ - return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); -} - -template -FORCEINLINE u32 ColorspaceConvert555To6665Opaque(const u16 src) -{ - return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); -} - -template -FORCEINLINE u32 ColorspaceConvert8888To6665(FragmentColor srcColor) -{ - FragmentColor outColor; - outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; - outColor.g = srcColor.g >> 2; - outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b) >> 2; - outColor.a = srcColor.a >> 3; - - return outColor.color; -} - -template -FORCEINLINE u32 ColorspaceConvert8888To6665(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ColorspaceConvert8888To6665(srcColorComponent); -} - -template -FORCEINLINE u32 ColorspaceConvert6665To8888(FragmentColor srcColor) -{ - FragmentColor outColor; - outColor.r = material_6bit_to_8bit[((SWAP_RB) ? 
srcColor.b : srcColor.r)]; - outColor.g = material_6bit_to_8bit[srcColor.g]; - outColor.b = material_6bit_to_8bit[((SWAP_RB) ? srcColor.r : srcColor.b)]; - outColor.a = material_5bit_to_8bit[srcColor.a]; - - return outColor.color; -} - -template -FORCEINLINE u32 ColorspaceConvert6665To8888(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ColorspaceConvert6665To8888(srcColorComponent); -} - -template -FORCEINLINE u16 ColorspaceConvert8888To5551(FragmentColor srcColor) -{ - return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 0x0000 : 0x8000 ); -} - -template -FORCEINLINE u16 ColorspaceConvert8888To5551(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ColorspaceConvert8888To5551(srcColorComponent); -} - -template -FORCEINLINE u16 ColorspaceConvert6665To5551(FragmentColor srcColor) -{ - return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 
0x0000 : 0x8000); -} - -template -FORCEINLINE u16 ColorspaceConvert6665To5551(u32 srcColor) -{ - FragmentColor srcColorComponent; - srcColorComponent.color = srcColor; - - return ColorspaceConvert6665To5551(srcColorComponent); -} - template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) { @@ -715,36 +631,6 @@ size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 * return this->ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(src, dst, pixCount); } -template u32 ColorspaceConvert555To8888Opaque(const u16 src); -template u32 ColorspaceConvert555To8888Opaque(const u16 src); - -template u32 ColorspaceConvert555To6665Opaque(const u16 src); -template u32 ColorspaceConvert555To6665Opaque(const u16 src); - -template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); -template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); - -template u32 ColorspaceConvert8888To6665(u32 srcColor); -template u32 ColorspaceConvert8888To6665(u32 srcColor); - -template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); -template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); - -template u32 ColorspaceConvert6665To8888(u32 srcColor); -template u32 ColorspaceConvert6665To8888(u32 srcColor); - -template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); -template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); - -template u16 ColorspaceConvert8888To5551(u32 srcColor); -template u16 ColorspaceConvert8888To5551(u32 srcColor); - -template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); -template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); - -template u16 ColorspaceConvert6665To5551(u32 srcColor); -template u16 ColorspaceConvert6665To5551(u32 srcColor); - template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict 
src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler.h b/desmume/src/utils/colorspacehandler/colorspacehandler.h index 362e975ea..7bcb4be3a 100644 --- a/desmume/src/utils/colorspacehandler/colorspacehandler.h +++ b/desmume/src/utils/colorspacehandler/colorspacehandler.h @@ -130,16 +130,89 @@ extern CACHE_ALIGN u32 color_555_to_888[32768]; void ColorspaceHandlerInit(); -template u32 ColorspaceConvert555To8888Opaque(const u16 src); -template u32 ColorspaceConvert555To6665Opaque(const u16 src); -template u32 ColorspaceConvert8888To6665(FragmentColor srcColor); -template u32 ColorspaceConvert8888To6665(u32 srcColor); -template u32 ColorspaceConvert6665To8888(FragmentColor srcColor); -template u32 ColorspaceConvert6665To8888(u32 srcColor); -template u16 ColorspaceConvert8888To5551(FragmentColor srcColor); -template u16 ColorspaceConvert8888To5551(u32 srcColor); -template u16 ColorspaceConvert6665To5551(FragmentColor srcColor); -template u16 ColorspaceConvert6665To5551(u32 srcColor); +template +FORCEINLINE u32 ColorspaceConvert555To8888Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF); +} + +template +FORCEINLINE u32 ColorspaceConvert555To6665Opaque(const u16 src) +{ + return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF); +} + +template +FORCEINLINE u32 ColorspaceConvert8888To6665(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2; + outColor.g = srcColor.g >> 2; + outColor.b = ((SWAP_RB) ? 
srcColor.r : srcColor.b) >> 2; + outColor.a = srcColor.a >> 3; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert8888To6665(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert8888To6665(srcColorComponent); +} + +template +FORCEINLINE u32 ColorspaceConvert6665To8888(FragmentColor srcColor) +{ + FragmentColor outColor; + outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)]; + outColor.g = material_6bit_to_8bit[srcColor.g]; + outColor.b = material_6bit_to_8bit[((SWAP_RB) ? srcColor.r : srcColor.b)]; + outColor.a = material_5bit_to_8bit[srcColor.a]; + + return outColor.color; +} + +template +FORCEINLINE u32 ColorspaceConvert6665To8888(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert6665To8888(srcColorComponent); +} + +template +FORCEINLINE u16 ColorspaceConvert8888To5551(FragmentColor srcColor) +{ + return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 0x0000 : 0x8000 ); +} + +template +FORCEINLINE u16 ColorspaceConvert8888To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert8888To5551(srcColorComponent); +} + +template +FORCEINLINE u16 ColorspaceConvert6665To5551(FragmentColor srcColor) +{ + return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 
0x0000 : 0x8000); +} + +template +FORCEINLINE u16 ColorspaceConvert6665To5551(u32 srcColor) +{ + FragmentColor srcColorComponent; + srcColorComponent.color = srcColor; + + return ColorspaceConvert6665To5551(srcColorComponent); +} template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); From c5cae26246ccecfd37735f1d9b046ee40880cd54 Mon Sep 17 00:00:00 2001 From: rogerman Date: Thu, 18 Aug 2016 04:38:59 +0000 Subject: [PATCH 02/41] GPU: - Do some additional tweaks to how GPU events are handled. (Related to r5534.) --- desmume/src/GPU.cpp | 50 ++++++++++++++++++++++++++------------- desmume/src/GPU.h | 3 +++ desmume/src/NDSSystem.cpp | 9 ++++--- desmume/src/gfx3d.cpp | 23 ++++++++++++++---- 4 files changed, 61 insertions(+), 24 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index eb6cce8c8..f76dac39e 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -6760,6 +6760,7 @@ GPUSubsystem::GPUSubsystem() _displayTouch = new NDSDisplay(NDSDisplayID_Touch); _displayTouch->SetEngine(_engineSub); + _frameNeedsFinish = false; _willAutoResolveToCustomBuffer = true; OSDCLASS *previousOSD = osd; @@ -6880,19 +6881,30 @@ void GPUSubsystem::Reset() } void GPUSubsystem::ForceRender3DFinishAndFlush(bool willFlush) +{ + bool need3DDisplayFramebuffer; + bool need3DCaptureFramebuffer; + CurrentRenderer->GetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); + + CurrentRenderer->SetFramebufferFlushStates(willFlush, willFlush); + CurrentRenderer->RenderFinish(); + CurrentRenderer->SetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); +} + +void GPUSubsystem::ForceFrameStop() { if (CurrentRenderer->GetRenderNeedsFinish()) { - bool need3DDisplayFramebuffer; - bool need3DCaptureFramebuffer; - 
CurrentRenderer->GetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); - - CurrentRenderer->SetFramebufferFlushStates(willFlush, willFlush); - CurrentRenderer->RenderFinish(); - CurrentRenderer->SetFramebufferFlushStates(need3DDisplayFramebuffer, need3DCaptureFramebuffer); + this->ForceRender3DFinishAndFlush(true); CurrentRenderer->SetRenderNeedsFinish(false); this->_event->DidRender3DEnd(); } + + if (this->_frameNeedsFinish) + { + this->_frameNeedsFinish = false; + this->_event->DidFrameEnd(false); + } } void GPUSubsystem::UpdateRenderProperties() @@ -7020,8 +7032,6 @@ void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h, void *clientNati return; } - GPU->ForceRender3DFinishAndFlush(false); - const float customWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; const float customHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT; const float newGpuLargestDstLineCount = (size_t)ceilf(customHeightScale); @@ -7162,8 +7172,6 @@ void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h) void GPUSubsystem::SetColorFormat(const NDSColorFormat outputFormat, void *clientNativeBuffer, void *clientCustomBuffer) { - GPU->ForceRender3DFinishAndFlush(false); - this->_displayInfo.colorFormat = outputFormat; this->_displayInfo.pixelBytes = (outputFormat == NDSColorFormat_BGR555_Rev) ? sizeof(u16) : sizeof(FragmentColor); @@ -7305,10 +7313,14 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) const bool isFramebufferRenderNeeded[2] = {(CommonSettings.showGpu.main && !this->_engineMain->GetIsMasterBrightFullIntensity()) || isDisplayCaptureNeeded, CommonSettings.showGpu.sub && !this->_engineSub->GetIsMasterBrightFullIntensity() }; - if (l == 0) + if (!this->_frameNeedsFinish) { this->_event->DidFrameBegin(isFrameSkipRequested); - + this->_frameNeedsFinish = true; + } + + if (l == 0) + { // Clear displays to black if they are turned off by the user. 
if (!isFrameSkipRequested) { @@ -7384,6 +7396,9 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) if (l == 191) { + this->_engineMain->FramebufferPostprocess(); + this->_engineSub->FramebufferPostprocess(); + if (!isFrameSkipRequested) { if (this->_displayInfo.isCustomSizeRequested) @@ -7425,9 +7440,6 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) } } - this->_engineMain->FramebufferPostprocess(); - this->_engineSub->FramebufferPostprocess(); - gfx3d._videoFrameCount++; if (gfx3d._videoFrameCount == 60) { @@ -7436,7 +7448,11 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) gfx3d._videoFrameCount = 0; } - this->_event->DidFrameEnd(isFrameSkipRequested); + if (this->_frameNeedsFinish) + { + this->_frameNeedsFinish = false; + this->_event->DidFrameEnd(isFrameSkipRequested); + } } } diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 12e6cbc9d..b983fed69 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1608,6 +1608,7 @@ private: NDSDisplay *_displayMain; NDSDisplay *_displayTouch; + bool _frameNeedsFinish; bool _willAutoResolveToCustomBuffer; u16 *_customVRAM; u16 *_customVRAMBlank; @@ -1628,6 +1629,8 @@ public: void Reset(); void ForceRender3DFinishAndFlush(bool willFlush); + void ForceFrameStop(); + const NDSDisplayInfo& GetDisplayInfo(); // Frontends need to call this whenever they need to read the video buffers from the emulator core void SetDisplayDidCustomRender(NDSDisplayID displayID, bool theState); diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index a79ad0801..74f9b82fc 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -1411,8 +1411,8 @@ static void execHardware_hstart_vblankStart() triggerDma(EDMAMode_VBlank); //tracking for arm9 load average - nds.runCycleCollector[0][nds.idleFrameCounter] = 1120380-nds.idleCycles[0]; - nds.runCycleCollector[1][nds.idleFrameCounter] = 1120380-nds.idleCycles[1]; + 
nds.runCycleCollector[ARMCPU_ARM9][nds.idleFrameCounter] = 1120380-nds.idleCycles[0]; + nds.runCycleCollector[ARMCPU_ARM7][nds.idleFrameCounter] = 1120380-nds.idleCycles[1]; nds.idleFrameCounter++; nds.idleFrameCounter &= 15; nds.idleCycles[0] = 0; @@ -2973,9 +2973,12 @@ void NDS_swapScreen() } -void emu_halt() { +void emu_halt() +{ //printf("halting emu: ARM9 PC=%08X/%08X, ARM7 PC=%08X/%08X\n", NDS_ARM9.R[15], NDS_ARM9.instruct_adr, NDS_ARM7.R[15], NDS_ARM7.instruct_adr); execute = false; + GPU->ForceFrameStop(); + #ifdef LOG_ARM9 if (fp_dis9) { diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index d882b981c..bc5de5392 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -527,7 +527,10 @@ void gfx3d_deinit() void gfx3d_reset() { - GPU->ForceRender3DFinishAndFlush(false); + if (CurrentRenderer->GetRenderNeedsFinish()) + { + GPU->ForceRender3DFinishAndFlush(false); + } #ifdef _SHOW_VTX_COUNTERS max_polys = max_verts = 0; @@ -2300,7 +2303,12 @@ void gfx3d_VBlankSignal() void gfx3d_VBlankEndSignal(bool skipFrame) { - GPU->ForceRender3DFinishAndFlush(false); + if (CurrentRenderer->GetRenderNeedsFinish()) + { + GPU->ForceRender3DFinishAndFlush(false); + CurrentRenderer->SetRenderNeedsFinish(false); + GPU->GetEventHandler()->DidRender3DEnd(); + } if (!drawPending) return; if (skipFrame) return; @@ -2308,10 +2316,10 @@ void gfx3d_VBlankEndSignal(bool skipFrame) drawPending = FALSE; GPU->GetEventHandler()->DidRender3DBegin(); + CurrentRenderer->SetRenderNeedsFinish(true); if (CommonSettings.showGpu.main) { - CurrentRenderer->SetRenderNeedsFinish(true); CurrentRenderer->SetTextureProcessingProperties(CommonSettings.GFX3D_Renderer_TextureScalingFactor, CommonSettings.GFX3D_Renderer_TextureDeposterize, CommonSettings.GFX3D_Renderer_TextureSmoothing); @@ -2523,7 +2531,10 @@ void gfx3d_Update3DFramebuffers(FragmentColor *framebufferRGBA6665, u16 *framebu //-------------savestate void gfx3d_savestate(EMUFILE* os) { - 
GPU->ForceRender3DFinishAndFlush(true); + if (CurrentRenderer->GetRenderNeedsFinish()) + { + GPU->ForceRender3DFinishAndFlush(true); + } //version write32le(4,os); @@ -2556,6 +2567,10 @@ bool gfx3d_loadstate(EMUFILE* is, int size) if (read32le(&version,is) != 1) return false; if (size == 8) version = 0; + if (CurrentRenderer->GetRenderNeedsFinish()) + { + GPU->ForceRender3DFinishAndFlush(false); + } gfx3d_glPolygonAttrib_cache(); gfx3d_glTexImage_cache(); From 44b227d0b8fff4f464a76a0e1c8bd8a85e54efa3 Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 20 Aug 2016 19:20:27 +0000 Subject: [PATCH 03/41] GPU: - If a GPU engine is disabled or has master brightness at full intensity, fill the output framebuffer on line 191 instead of on line 0. - Replace global variable Render3DFramesPerSecond with accessor method GPUSubsystem::GetFPSRender3D(). --- desmume/src/GPU.cpp | 73 +++++++++++++++++-------------- desmume/src/GPU.h | 4 ++ desmume/src/cocoa/cocoa_output.mm | 2 +- desmume/src/gfx3d.cpp | 4 -- desmume/src/gfx3d.h | 3 -- desmume/src/gtk/main.cpp | 8 ++-- desmume/src/windows/main.cpp | 2 +- 7 files changed, 50 insertions(+), 46 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index f76dac39e..82a5da1a2 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -48,7 +48,6 @@ #include "matrix.h" #include "emufile.h" -u32 Render3DFramesPerSecond; //instantiate static instance u16 GPUEngineBase::_brightnessUpTable555[17][0x8000]; @@ -6760,6 +6759,8 @@ GPUSubsystem::GPUSubsystem() _displayTouch = new NDSDisplay(NDSDisplayID_Touch); _displayTouch->SetEngine(_engineSub); + _videoFrameCount = 0; + _render3DFrameCount = 0; _frameNeedsFinish = false; _willAutoResolveToCustomBuffer = true; @@ -6853,6 +6854,9 @@ void GPUSubsystem::Reset() this->SetCustomFramebufferSize(this->_displayInfo.customWidth, this->_displayInfo.customHeight); } + this->_videoFrameCount = 0; + this->_render3DFrameCount = 0; + this->ClearWithColor(0xFFFF); 
this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main] = false; @@ -6990,6 +6994,11 @@ const NDSDisplayInfo& GPUSubsystem::GetDisplayInfo() return this->_displayInfo; } +u32 GPUSubsystem::GetFPSRender3D() const +{ + return this->_render3DFrameCount; +} + void GPUSubsystem::SetDisplayDidCustomRender(NDSDisplayID displayID, bool theState) { this->_displayInfo.didPerformCustomRender[displayID] = theState; @@ -7325,30 +7334,6 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) if (!isFrameSkipRequested) { this->UpdateRenderProperties(); - - if (!isFramebufferRenderNeeded[GPUEngineID_Main]) - { - if (!CommonSettings.showGpu.main) - { - memset(this->_engineMain->renderedBuffer, 0, this->_engineMain->renderedWidth * this->_engineMain->renderedHeight * this->_displayInfo.pixelBytes); - } - else if (this->_engineMain->GetIsMasterBrightFullIntensity()) - { - this->_engineMain->ApplyMasterBrightness(); - } - } - - if (!isFramebufferRenderNeeded[GPUEngineID_Sub]) - { - if (!CommonSettings.showGpu.sub) - { - memset(this->_engineSub->renderedBuffer, 0, this->_engineSub->renderedWidth * this->_engineSub->renderedHeight * this->_displayInfo.pixelBytes); - } - else if (this->_engineSub->GetIsMasterBrightFullIntensity()) - { - this->_engineSub->ApplyMasterBrightness(); - } - } } } @@ -7399,6 +7384,14 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) this->_engineMain->FramebufferPostprocess(); this->_engineSub->FramebufferPostprocess(); + this->_videoFrameCount++; + if (this->_videoFrameCount == 60) + { + this->_render3DFrameCount = gfx3d.render3DFrameCount; + gfx3d.render3DFrameCount = 0; + this->_videoFrameCount = 0; + } + if (!isFrameSkipRequested) { if (this->_displayInfo.isCustomSizeRequested) @@ -7421,11 +7414,33 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) { this->_engineMain->ApplyMasterBrightness(); } + else + { + if (!CommonSettings.showGpu.main) + { + memset(this->_engineMain->renderedBuffer, 0, 
this->_engineMain->renderedWidth * this->_engineMain->renderedHeight * this->_displayInfo.pixelBytes); + } + else if (this->_engineMain->GetIsMasterBrightFullIntensity()) + { + this->_engineMain->ApplyMasterBrightness(); + } + } if (isFramebufferRenderNeeded[GPUEngineID_Sub]) { this->_engineSub->ApplyMasterBrightness(); } + else + { + if (!CommonSettings.showGpu.sub) + { + memset(this->_engineSub->renderedBuffer, 0, this->_engineSub->renderedWidth * this->_engineSub->renderedHeight * this->_displayInfo.pixelBytes); + } + else if (this->_engineSub->GetIsMasterBrightFullIntensity()) + { + this->_engineSub->ApplyMasterBrightness(); + } + } if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) { @@ -7440,14 +7455,6 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) } } - gfx3d._videoFrameCount++; - if (gfx3d._videoFrameCount == 60) - { - Render3DFramesPerSecond = gfx3d.render3DFrameCount; - gfx3d.render3DFrameCount = 0; - gfx3d._videoFrameCount = 0; - } - if (this->_frameNeedsFinish) { this->_frameNeedsFinish = false; diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index b983fed69..00883a3ce 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1608,6 +1608,8 @@ private: NDSDisplay *_displayMain; NDSDisplay *_displayTouch; + u32 _videoFrameCount; // Internal variable that increments when a video frame is completed. Resets every 60 video frames. + u32 _render3DFrameCount; // The current 3D rendering frame count, saved to this variable once every 60 video frames. 
bool _frameNeedsFinish; bool _willAutoResolveToCustomBuffer; u16 *_customVRAM; @@ -1632,6 +1634,8 @@ public: void ForceFrameStop(); const NDSDisplayInfo& GetDisplayInfo(); // Frontends need to call this whenever they need to read the video buffers from the emulator core + u32 GetFPSRender3D() const; + void SetDisplayDidCustomRender(NDSDisplayID displayID, bool theState); GPUEngineA* GetEngineMain(); diff --git a/desmume/src/cocoa/cocoa_output.mm b/desmume/src/cocoa/cocoa_output.mm index 0a94dd4ae..5fff9e35b 100644 --- a/desmume/src/cocoa/cocoa_output.mm +++ b/desmume/src/cocoa/cocoa_output.mm @@ -869,7 +869,7 @@ [super handleEmuFrameProcessed]; NDSFrameInfo frameInfo; - frameInfo.render3DFPS = Render3DFramesPerSecond; + frameInfo.render3DFPS = GPU->GetFPSRender3D(); frameInfo.frameIndex = currFrameCounter; frameInfo.lagFrameCount = TotalLagFrames; diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index bc5de5392..04a43064b 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -504,9 +504,7 @@ void gfx3d_init() gfx3d.state.fogDensityTable = MMU.ARM9_REG+0x0360; gfx3d.state.edgeMarkColorTable = (u16 *)(MMU.ARM9_REG+0x0330); - gfx3d._videoFrameCount = 0; gfx3d.render3DFrameCount = 0; - Render3DFramesPerSecond = 0; makeTables(); Render3D_Init(); @@ -607,9 +605,7 @@ void gfx3d_reset() GFX_PIPEclear(); GFX_FIFOclear(); - gfx3d._videoFrameCount = 0; gfx3d.render3DFrameCount = 0; - Render3DFramesPerSecond = 0; CurrentRenderer->Reset(); } diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h index 0a4da3273..5a5bea1f0 100644 --- a/desmume/src/gfx3d.h +++ b/desmume/src/gfx3d.h @@ -667,7 +667,6 @@ struct GFX3D GFX3D() : polylist(0) , vertlist(0) - , _videoFrameCount(0) , render3DFrameCount(0) { } @@ -681,11 +680,9 @@ struct GFX3D VERTLIST* vertlist; INDEXLIST indexlist; - u32 _videoFrameCount; // Internal variable that increments when a video frame is completed. Resets every 60 video frames. 
u32 render3DFrameCount; // Increments when gfx3d_doFlush() is called. Resets every 60 video frames. }; extern GFX3D gfx3d; -extern u32 Render3DFramesPerSecond; // save the current 3D rendering frame count to here every 60 video frames //--------------------- diff --git a/desmume/src/gtk/main.cpp b/desmume/src/gtk/main.cpp index 2e9fa593d..5b335cafb 100644 --- a/desmume/src/gtk/main.cpp +++ b/desmume/src/gtk/main.cpp @@ -2413,7 +2413,7 @@ gboolean EmuLoop(gpointer data) // HUD display things (copied from Windows main.cpp) #ifdef HAVE_LIBAGG - Hud.fps3d = Render3DFramesPerSecond; + Hud.fps3d = GPU->GetFPSRender3D(); if(nds.idleFrameCounter==0 || oneSecond) { @@ -2446,7 +2446,7 @@ gboolean EmuLoop(gpointer data) for (i = 0; i < Frameskip; i++) { NDS_SkipNextFrame(); #ifdef HAVE_LIBAGG - Hud.fps3d = Render3DFramesPerSecond; + Hud.fps3d = GPU->GetFPSRender3D(); #endif desmume_cycle(); skipped_frames++; @@ -2459,7 +2459,7 @@ gboolean EmuLoop(gpointer data) for (i = 0; i < Frameskip; i++) { NDS_SkipNextFrame(); #ifdef HAVE_LIBAGG - Hud.fps3d = Render3DFramesPerSecond; + Hud.fps3d = GPU->GetFPSRender3D(); #endif desmume_cycle(); skipped_frames++; @@ -2481,7 +2481,7 @@ gboolean EmuLoop(gpointer data) // Aggressively skip frames to avoid delay NDS_SkipNextFrame(); #ifdef HAVE_LIBAGG - Hud.fps3d = Render3DFramesPerSecond; + Hud.fps3d = GPU->GetFPSRender3D(); #endif desmume_cycle(); skipped_frames++; diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index 9f334fed3..1f7b29b1f 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -2185,7 +2185,7 @@ static void StepRunLoop_User() const int kFramesPerToolUpdate = 1; Hud.fps = mainLoopData.fps; - Hud.fps3d = Render3DFramesPerSecond; + Hud.fps3d = GPU->GetFPSRender3D(); Display(); From 5b2691b40e8195dc7f6cc86cabc3ef0be9709e7a Mon Sep 17 00:00:00 2001 From: zeromus Date: Sun, 21 Aug 2016 01:16:51 +0000 Subject: [PATCH 04/41] winport: experimental fix for hanging softrasterizer code. 
extremely finicky and untested thread synchronization code in here! --- .../src/libretro-common/rthreads/rthreads.c | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index 7d6767cd0..109ba8bf9 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -80,7 +80,11 @@ struct slock struct scond { #ifdef USE_WIN32_THREADS + /* this might could be done with a semaphore? I'm not sure. */ HANDLE event; + int waiters; + bool waiting_ack; + HANDLE ack; #else pthread_cond_t cond; #endif @@ -311,6 +315,9 @@ scond_t *scond_new(void) #ifdef USE_WIN32_THREADS cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); + cond->waiters = 0; + cond->ack = CreateEvent(NULL, FALSE, FALSE, NULL); + cond->waiting_ack = false; event_created = !!cond->event; #else event_created = (pthread_cond_init(&cond->cond, NULL) == 0); @@ -339,6 +346,7 @@ void scond_free(scond_t *cond) #ifdef USE_WIN32_THREADS CloseHandle(cond->event); + CloseHandle(cond->ack); #else pthread_cond_destroy(&cond->cond); #endif @@ -355,10 +363,22 @@ void scond_free(scond_t *cond) void scond_wait(scond_t *cond, slock_t *lock) { #ifdef USE_WIN32_THREADS - WaitForSingleObject(cond->event, 0); - - SignalObjectAndWait(lock->lock, cond->event, INFINITE, FALSE); - slock_lock(lock); + /* remember: we currently have mutex so this will be safe */ + cond->waiters++; + ReleaseMutex(lock->lock); + + /* wait for a signaller */ + WaitForSingleObject(cond->event, INFINITE); + /* the algorithm hinges on this uncontrolled variable access. It's too hard to explain why it's safe. (..erm.. 
I hope it is) */ + cond->waiting_ack = false; + + /* reacquire mutex and finish up */ + WaitForSingleObject(lock->lock, INFINITE); + cond->waiters--; + + /* only when the waiter is COMPLETELY FINISHED do we ack a signaller */ + SetEvent(cond->ack); + #else pthread_cond_wait(&cond->cond, &lock->lock); #endif @@ -393,7 +413,23 @@ int scond_broadcast(scond_t *cond) void scond_signal(scond_t *cond) { #ifdef USE_WIN32_THREADS + + /* remember: we currently have mutex */ + if(cond->waiters == 0) return; + + /* OK, someone is waiting for a signal */ + + /* if we're waiting for an ack, we can't proceed until we receive an ack (signifies cond->event is freed up) */ + if(cond->waiting_ack) + WaitForSingleObject(cond->ack,INFINITE); + + /* so someone set the ack event; a waiter is proceeding. we can wait for another ack now... */ + cond->waiting_ack = true; + + /* ...and set an event to wake up a waiter so he can actually set that ack... */ + /* but definitely not right now, since we still have the mutex. 
So it may take a while */ SetEvent(cond->event); + #else pthread_cond_signal(&cond->cond); #endif From a7a9371b3a3d957274440e2e4ad84215a1cf19b1 Mon Sep 17 00:00:00 2001 From: jsteffens Date: Sun, 21 Aug 2016 21:52:29 +0000 Subject: [PATCH 05/41] Makefile.am: Fix syntax errors --- desmume/src/Makefile.am | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/desmume/src/Makefile.am b/desmume/src/Makefile.am index f5d8afceb..be5652c99 100644 --- a/desmume/src/Makefile.am +++ b/desmume/src/Makefile.am @@ -108,18 +108,18 @@ libdesmume_a_SOURCES = \ libretro-common/rthreads/async_job.c \ libretro-common/rthreads/rsemaphore.c \ libretro-common/rthreads/rthreads.c - -if SUPPORT_SSE2 += \ + +if SUPPORT_SSE2 libdesmume_a_SOURCES += \ utils/colorspacehandler/colorspacehandler_SSE2.cpp endif -if SUPPORT_AVX2 += \ +if SUPPORT_AVX2 libdesmume_a_SOURCES += \ utils/colorspacehandler/colorspacehandler_AVX2.cpp endif -if SUPPORT_ALTIVEC += \ +if SUPPORT_ALTIVEC libdesmume_a_SOURCES += \ utils/colorspacehandler/colorspacehandler_AltiVec.cpp endif From 701bfdde80619331180b9d56669171834b560088 Mon Sep 17 00:00:00 2001 From: jsteffens Date: Sun, 21 Aug 2016 22:41:56 +0000 Subject: [PATCH 06/41] configure.ac: Add missing conditionals --- desmume/configure.ac | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/desmume/configure.ac b/desmume/configure.ac index 05959007c..ca4a8212a 100644 --- a/desmume/configure.ac +++ b/desmume/configure.ac @@ -302,6 +302,15 @@ AS_CASE([$host_cpu], [AC_DEFINE(HOST_32)] ) +AC_CHECK_DECL([__SSE2__]) +AM_CONDITIONAL([SUPPORT_SSE2], [test "x$ac_cv_have_decl___SSE2__" = xyes]) + +AC_CHECK_DECL([__AVX2__]) +AM_CONDITIONAL([SUPPORT_AVX2], [test "x$ac_cv_have_decl___AVX2__" = xyes]) + +AC_CHECK_DECL([__ALTIVEC__]) +AM_CONDITIONAL([SUPPORT_ALTIVEC], [test "x$ac_cv_have_decl___ALTIVEC__" = xyes]) + AC_SUBST(UI_DIR) AC_SUBST(PO_DIR) From b4759f854cd1f0805b3678536d3dc209075c12e2 Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 22 Aug 2016 
21:04:03 +0000 Subject: [PATCH 07/41] GPU: - Add some functions to control when master brightness and RGB666-to-RGB888 conversions occur internally or not. --- desmume/src/GPU.cpp | 73 ++++++++++++++++++++++++++++++--------------- desmume/src/GPU.h | 22 ++++++++++++++ 2 files changed, 71 insertions(+), 24 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 82a5da1a2..cbf098329 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -6762,6 +6762,8 @@ GPUSubsystem::GPUSubsystem() _videoFrameCount = 0; _render3DFrameCount = 0; _frameNeedsFinish = false; + _willAutoApplyMasterBrightness = true; + _willAutoConvertRGB666ToRGB888 = true; _willAutoResolveToCustomBuffer = true; OSDCLASS *previousOSD = osd; @@ -7305,6 +7307,26 @@ u16* GPUSubsystem::GetCustomVRAMAddressUsingMappedAddress(const u32 mappedAddr) return (this->GetEngineMain()->GetCustomVRAMBlockPtr(blockID) + (_gpuCaptureLineIndex[blockLine] * this->_displayInfo.customWidth) + _gpuDstPitchIndex[linePixel]); } +bool GPUSubsystem::GetWillAutoApplyMasterBrightness() const +{ + return this->_willAutoApplyMasterBrightness; +} + +void GPUSubsystem::SetWillAutoApplyMasterBrightness(const bool willAutoApply) +{ + this->_willAutoApplyMasterBrightness = willAutoApply; +} + +bool GPUSubsystem::GetWillAutoConvertRGB666ToRGB888() const +{ + return this->_willAutoConvertRGB666ToRGB888; +} + +void GPUSubsystem::SetWillAutoConvertRGB666ToRGB888(const bool willAutoConvert) +{ + this->_willAutoConvertRGB666ToRGB888 = willAutoConvert; +} + bool GPUSubsystem::GetWillAutoResolveToCustomBuffer() const { return this->_willAutoResolveToCustomBuffer; @@ -7319,7 +7341,7 @@ template void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) { const bool isDisplayCaptureNeeded = this->_engineMain->WillDisplayCapture(l); - const bool isFramebufferRenderNeeded[2] = {(CommonSettings.showGpu.main && !this->_engineMain->GetIsMasterBrightFullIntensity()) || isDisplayCaptureNeeded, + const bool 
isFramebufferRenderNeeded[2] = { CommonSettings.showGpu.main && !this->_engineMain->GetIsMasterBrightFullIntensity(), CommonSettings.showGpu.sub && !this->_engineSub->GetIsMasterBrightFullIntensity() }; if (!this->_frameNeedsFinish) @@ -7337,7 +7359,7 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) } } - if (isFramebufferRenderNeeded[GPUEngineID_Main] && !isFrameSkipRequested) + if ( (isFramebufferRenderNeeded[GPUEngineID_Main] || isDisplayCaptureNeeded) && !isFrameSkipRequested ) { // GPUEngineA:WillRender3DLayer() and GPUEngineA:WillCapture3DLayerDirect() both rely on register // states that might change on a per-line basis. Therefore, we need to check these states on a @@ -7410,39 +7432,42 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = this->_displayTouch->GetEngine()->renderedWidth; this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = this->_displayTouch->GetEngine()->renderedHeight; - if (isFramebufferRenderNeeded[GPUEngineID_Main]) + if (this->_willAutoApplyMasterBrightness) { - this->_engineMain->ApplyMasterBrightness(); - } - else - { - if (!CommonSettings.showGpu.main) + if (CommonSettings.showGpu.main) + { + if (this->_engineMain->GetIsMasterBrightFullIntensity()) + { + this->_engineMain->ApplyMasterBrightness(); + } + else + { + this->_engineMain->ApplyMasterBrightness(); + } + } + else { memset(this->_engineMain->renderedBuffer, 0, this->_engineMain->renderedWidth * this->_engineMain->renderedHeight * this->_displayInfo.pixelBytes); } - else if (this->_engineMain->GetIsMasterBrightFullIntensity()) + + if (CommonSettings.showGpu.sub) { - this->_engineMain->ApplyMasterBrightness(); + if (this->_engineSub->GetIsMasterBrightFullIntensity()) + { + this->_engineSub->ApplyMasterBrightness(); + } + else + { + this->_engineSub->ApplyMasterBrightness(); + } } - } - - if (isFramebufferRenderNeeded[GPUEngineID_Sub]) - { - 
this->_engineSub->ApplyMasterBrightness(); - } - else - { - if (!CommonSettings.showGpu.sub) + else { memset(this->_engineSub->renderedBuffer, 0, this->_engineSub->renderedWidth * this->_engineSub->renderedHeight * this->_displayInfo.pixelBytes); } - else if (this->_engineSub->GetIsMasterBrightFullIntensity()) - { - this->_engineSub->ApplyMasterBrightness(); - } } - if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + if ( (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) && this->_willAutoConvertRGB666ToRGB888 ) { this->_engineMain->ResolveRGB666ToRGB888(); this->_engineSub->ResolveRGB666ToRGB888(); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 00883a3ce..7d194481b 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1611,6 +1611,8 @@ private: u32 _videoFrameCount; // Internal variable that increments when a video frame is completed. Resets every 60 video frames. u32 _render3DFrameCount; // The current 3D rendering frame count, saved to this variable once every 60 video frames. bool _frameNeedsFinish; + bool _willAutoApplyMasterBrightness; + bool _willAutoConvertRGB666ToRGB888; bool _willAutoResolveToCustomBuffer; u16 *_customVRAM; u16 *_customVRAMBlank; @@ -1656,6 +1658,26 @@ public: void UpdateRenderProperties(); + // By default, the output framebuffer will have the master brightness applied before + // the DidFrameEnd event. The master brightness is applied using the CPU. + // + // To turn off this behavior, call SetWillAutoApplyMasterBrightness() and pass a value + // of "false". This can be useful if the client wants to apply the master brightness + // itself, for example, if a client applies it on the GPU. + bool GetWillAutoApplyMasterBrightness() const; + void SetWillAutoApplyMasterBrightness(const bool willAutoApply); + + // By default, if the output framebuffer is in RGB666 format, then the framebuffers will + // automatically be converted to the much more common RGB888 format. This conversion is + // performed on the CPU. 
+ // + // To turn off this behavior, call SetWillAutoConvertRGB666ToRGB888() and pass a value + // of "false". This can be useful if the client wants to do its own post-processing + // while the color format is still RGB666, or if the client wants to do its own custom + // conversion (such as converting the framebuffer later on the GPU). + bool GetWillAutoConvertRGB666ToRGB888() const; + void SetWillAutoConvertRGB666ToRGB888(const bool willAutoConvert); + // Normally, the GPUs will automatically resolve their native buffers to the master // custom framebuffer at the end of V-blank so that all rendered graphics are contained // within a single common buffer. This is necessary for when someone wants to read From ae92918d275d81a7cd4c76237defa8b657f04d81 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 23 Aug 2016 21:12:49 +0000 Subject: [PATCH 08/41] fix bugs in libretro's scond for win32, hopefully. --- .../src/libretro-common/rthreads/rthreads.c | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index 109ba8bf9..229a871ae 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -80,10 +80,9 @@ struct slock struct scond { #ifdef USE_WIN32_THREADS - /* this might could be done with a semaphore? I'm not sure. 
*/ HANDLE event; - int waiters; - bool waiting_ack; + volatile int waiters; + volatile bool waiting_ack; HANDLE ack; #else pthread_cond_t cond; @@ -314,9 +313,12 @@ scond_t *scond_new(void) return NULL; #ifdef USE_WIN32_THREADS - cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); - cond->waiters = 0; + /* this is very complex because recreating condition variable semantics with win32 parts is not easy (or maybe it is and I just havent seen how) */ + /* the main problem is that a condition variable can be used to wake up a thread, but only if the thread is already waiting. */ + /* whereas a win32 event will 'wake up' a thread in advance (the event will be set in advance, so a 'waiter' wont even have to wait on it) */ + cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); cond->ack = CreateEvent(NULL, FALSE, FALSE, NULL); + cond->waiters = 0; cond->waiting_ack = false; event_created = !!cond->event; #else @@ -365,20 +367,31 @@ void scond_wait(scond_t *cond, slock_t *lock) #ifdef USE_WIN32_THREADS /* remember: we currently have mutex so this will be safe */ cond->waiters++; + if(cond->waiting_ack) + WaitForSingleObject(cond->ack,INFINITE); + ReleaseMutex(lock->lock); /* wait for a signaller */ WaitForSingleObject(cond->event, INFINITE); - /* the algorithm hinges on this uncontrolled variable access. It's too hard to explain why it's safe. (..erm.. I hope it is) */ - cond->waiting_ack = false; + + /* the algorithm hinges on this doing this stuff outside of the mutex */ + /* suppose several people signal right now. Actually, only one of them can. He'll be waiting on an ack signal! *inside the mutex* */ + /* we need to clear waiting_ack before we release him, otherwise it might race to set it to true and beat us */ + /* also: suppose several people are waiting right now (in the above wait on `event`). 
*/ + /* well, only one of them is going to get freed by a signal; it must have been us */ + /* notice that both of the waits for ack are inside the mutex; this guarantees only one of them can be waiting at a time */ + /* that's essential for making this safe */ + //if(cond->waiting_ack) + { + cond->waiting_ack = false; + SetEvent(cond->ack); + } /* reacquire mutex and finish up */ WaitForSingleObject(lock->lock, INFINITE); cond->waiters--; - /* only when the waiter is COMPLETELY FINISHED do we ack a signaller */ - SetEvent(cond->ack); - #else pthread_cond_wait(&cond->cond, &lock->lock); #endif @@ -419,15 +432,14 @@ void scond_signal(scond_t *cond) /* OK, someone is waiting for a signal */ - /* if we're waiting for an ack, we can't proceed until we receive an ack (signifies cond->event is freed up) */ + /* if we're waiting for an ack, we can't proceed until we receive an ack (signifies that the event is freed up from the waiter destined to be waked by it) */ if(cond->waiting_ack) WaitForSingleObject(cond->ack,INFINITE); - /* so someone set the ack event; a waiter is proceeding. we can wait for another ack now... */ + /* before any further waits or signals, we'll need to wait for a waiter to wake up */ cond->waiting_ack = true; - /* ...and set an event to wake up a waiter so he can actually set that ack... */ - /* but definitely not right now, since we still have the mutex. So it may take a while */ + /* the main wakeup event. the winning waiter definitely won't wake up this moment since we're in a mutex. 
*/ SetEvent(cond->event); #else From fc77539bda812e2c69a45285fafbac99eea724f7 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 23 Aug 2016 21:13:29 +0000 Subject: [PATCH 09/41] reimplement task to not be buggy --- desmume/src/utils/task.cpp | 183 +++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 90 deletions(-) diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index bc51c26b7..b4e3ba1d4 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -53,9 +53,10 @@ int getOnlineCores (void) class Task::Impl { private: - sthread_t* _thread; - bool _isThreadRunning; - + sthread_t* thread; + friend void thunkTaskProc(void* arg); + void taskProc(); + public: Impl(); ~Impl(); @@ -64,142 +65,144 @@ public: void execute(const TWork &work, void *param); void* finish(); void shutdown(); + void initialize(); slock_t *mutex; - scond_t *condWork; + scond_t *workCond; + bool workFlag, finishFlag; TWork workFunc; void *workFuncParam; void *ret; bool exitThread; + bool started; }; -static void taskProc(void *arg) +static void thunkTaskProc(void *arg) { Task::Impl *ctx = (Task::Impl *)arg; + ctx->taskProc(); +} - do { - slock_lock(ctx->mutex); +void Task::Impl::taskProc() +{ + for(;;) + { + slock_lock(mutex); - while (ctx->workFunc == NULL && !ctx->exitThread) { - scond_wait(ctx->condWork, ctx->mutex); - } + if(!workFlag) + scond_wait(workCond, mutex); + workFlag = false; - if (ctx->workFunc != NULL) { - ctx->ret = ctx->workFunc(ctx->workFuncParam); - } else { - ctx->ret = NULL; - } + ret = workFunc(workFuncParam); - ctx->workFunc = NULL; - scond_signal(ctx->condWork); + finishFlag = true; + scond_signal(workCond); - slock_unlock(ctx->mutex); + slock_unlock(mutex); - } while(!ctx->exitThread); + if(exitThread) + break; + } +} + +static void* killTask(void* task) +{ + ((Task::Impl*)task)->exitThread = true; + return 0; } Task::Impl::Impl() + : started(false) { - _isThreadRunning = false; - workFunc = NULL; - workFuncParam = 
NULL; - ret = NULL; - exitThread = false; - mutex = slock_new(); - condWork = scond_new(); } Task::Impl::~Impl() { shutdown(); - slock_free(mutex); - scond_free(condWork); +} + +void Task::Impl::initialize() +{ + thread = NULL; + workFunc = NULL; + workCond = NULL; + workFlag = finishFlag = false; + workFunc = NULL; + workFuncParam = NULL; + ret = NULL; + exitThread = false; + started = false; } void Task::Impl::start(bool spinlock) { - slock_lock(this->mutex); + initialize(); + mutex = slock_new(); + workCond = scond_new(); - if (this->_isThreadRunning) { - slock_unlock(this->mutex); - return; - } + slock_lock(mutex); - this->workFunc = NULL; - this->workFuncParam = NULL; - this->ret = NULL; - this->exitThread = false; - this->_thread = sthread_create(&taskProc,this); - this->_isThreadRunning = true; + thread = sthread_create(&thunkTaskProc,this); + started = true; + + slock_unlock(mutex); +} + +void Task::Impl::shutdown() +{ + if(!started) return; + + execute(killTask,this); + finish(); + + started = false; + + sthread_join(thread); + slock_free(mutex); + scond_free(workCond); +} + +void* Task::Impl::finish() +{ + //no work running; nothing to do (it's kind of lame that we call this under the circumstances) + if(!workFunc) + return NULL; + + slock_lock(mutex); + + if(!finishFlag) + scond_wait(workCond, mutex); + finishFlag = false; slock_unlock(this->mutex); + + workFunc = NULL; + + return ret; } void Task::Impl::execute(const TWork &work, void *param) { slock_lock(this->mutex); - if (work == NULL || !this->_isThreadRunning) { - slock_unlock(this->mutex); - return; - } - - this->workFunc = work; - this->workFuncParam = param; - scond_signal(this->condWork); + workFunc = work; + workFuncParam = param; + workFlag = true; + scond_signal(workCond); slock_unlock(this->mutex); } -void* Task::Impl::finish() -{ - void *returnValue = NULL; - slock_lock(this->mutex); - - if (!this->_isThreadRunning) { - slock_unlock(this->mutex); - return returnValue; - } - - while 
(this->workFunc != NULL) { - scond_wait(this->condWork, this->mutex); - } - - returnValue = this->ret; - - slock_unlock(this->mutex); - - return returnValue; -} - -void Task::Impl::shutdown() -{ - slock_lock(this->mutex); - - if (!this->_isThreadRunning) { - slock_unlock(this->mutex); - return; - } - - this->workFunc = NULL; - this->exitThread = true; - scond_signal(this->condWork); - - slock_unlock(this->mutex); - - sthread_join(this->_thread); - - slock_lock(this->mutex); - this->_isThreadRunning = false; - slock_unlock(this->mutex); -} void Task::start(bool spinlock) { impl->start(spinlock); } void Task::shutdown() { impl->shutdown(); } Task::Task() : impl(new Task::Impl()) {} -Task::~Task() { delete impl; } +Task::~Task() +{ + delete impl; +} void Task::execute(const TWork &work, void* param) { impl->execute(work,param); } void* Task::finish() { return impl->finish(); } From a1a6c47aac500d346851e1fd5e48e0ed17f5d3fe Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 23 Aug 2016 21:55:59 +0000 Subject: [PATCH 10/41] task.cpp: - Fix compiling for non-MSVC compilers. --- desmume/src/utils/task.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index b4e3ba1d4..999679526 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -77,7 +77,7 @@ public: bool started; }; -static void thunkTaskProc(void *arg) +void thunkTaskProc(void *arg) { Task::Impl *ctx = (Task::Impl *)arg; ctx->taskProc(); From 07e3612e4da862e96e6f247dca99a1865b5e5472 Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 23 Aug 2016 23:17:10 +0000 Subject: [PATCH 11/41] task.cpp: - When shutting down, ensure that the existing task is finished if it's running before continuing with the shutdown process. - Explicitly declare thunkTaskProc() as static.
--- desmume/src/utils/task.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index 999679526..447958992 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -51,6 +51,8 @@ int getOnlineCores (void) #endif } +static void thunkTaskProc(void *arg); + class Task::Impl { private: sthread_t* thread; @@ -77,7 +79,7 @@ public: bool started; }; -void thunkTaskProc(void *arg) +static void thunkTaskProc(void *arg) { Task::Impl *ctx = (Task::Impl *)arg; ctx->taskProc(); @@ -153,6 +155,7 @@ void Task::Impl::shutdown() { if(!started) return; + finish(); // Ensure that any previous tasks are finished before calling killTask(). execute(killTask,this); finish(); From 166365ab0d714c9831ace97f5622dba3e2700cdc Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 23 Aug 2016 23:32:44 +0000 Subject: [PATCH 12/41] task cleanup and add some volatiles which i reasoned were necessary --- desmume/src/utils/task.cpp | 46 +++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index 447958992..e28fa0214 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2009-2015 DeSmuME team + Copyright (C) 2009-2016 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ */ #include +#include #include "types.h" #include "task.h" @@ -71,11 +72,10 @@ public: slock_t *mutex; scond_t *workCond; - bool workFlag, finishFlag; - TWork workFunc; - void *workFuncParam; - void *ret; - bool exitThread; + volatile bool workFlag, finishFlag, exitFlag; + volatile TWork workFunc; + void * volatile workFuncParam; + void * volatile ret; bool started; }; @@ -102,21 +102,20 @@ void Task::Impl::taskProc() slock_unlock(mutex); - if(exitThread) + if(exitFlag) break; } 
} static void* killTask(void* task) { - ((Task::Impl*)task)->exitThread = true; - return 0; + ((Task::Impl*)task)->exitFlag = true; + return NULL; } Task::Impl::Impl() : started(false) { - } Task::Impl::~Impl() @@ -129,16 +128,20 @@ void Task::Impl::initialize() thread = NULL; workFunc = NULL; workCond = NULL; - workFlag = finishFlag = false; workFunc = NULL; workFuncParam = NULL; + workFlag = finishFlag = exitFlag = false; ret = NULL; - exitThread = false; started = false; } void Task::Impl::start(bool spinlock) { + //check user error + assert(!started); + + if(started) shutdown(); + initialize(); mutex = slock_new(); workCond = scond_new(); @@ -155,20 +158,26 @@ void Task::Impl::shutdown() { if(!started) return; - finish(); // Ensure that any previous tasks are finished before calling killTask(). + //nobody should shutdown while a task is still running; + //it would imply that we're in some kind of shutdown pricess, and datastructures might be getting freed while someone is still working on it + //nonetheless, _troublingly_, it seems like we do that now, so for now let's try to let that work finish instead of blowing up when it isn't finished + //assert(!workFunc); + finish(); + + //a new task which sets the kill flag execute(killTask,this); finish(); started = false; sthread_join(thread); - slock_free(mutex); scond_free(workCond); + slock_free(mutex); } void* Task::Impl::finish() { - //no work running; nothing to do (it's kind of lame that we call this under the circumstances) + //no work running; nothing to do if(!workFunc) return NULL; @@ -197,15 +206,10 @@ void Task::Impl::execute(const TWork &work, void *param) slock_unlock(this->mutex); } - - void Task::start(bool spinlock) { impl->start(spinlock); } void Task::shutdown() { impl->shutdown(); } Task::Task() : impl(new Task::Impl()) {} -Task::~Task() -{ - delete impl; -} +Task::~Task() { delete impl; } void Task::execute(const TWork &work, void* param) { impl->execute(work,param); } void* Task::finish() { 
return impl->finish(); } From 66bc2d1d71477602201b33767c7447ffe9e35eb1 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 23 Aug 2016 23:35:06 +0000 Subject: [PATCH 13/41] etc --- desmume/src/utils/task.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index e28fa0214..fa4cf13e8 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -159,8 +159,8 @@ void Task::Impl::shutdown() if(!started) return; //nobody should shutdown while a task is still running; - //it would imply that we're in some kind of shutdown pricess, and datastructures might be getting freed while someone is still working on it - //nonetheless, _troublingly_, it seems like we do that now, so for now let's try to let that work finish instead of blowing up when it isn't finished + //it would imply that we're in some kind of shutdown process, and datastructures might be getting freed while a worker is still working on it. + //nonetheless, _troublingly_, it seems like we do that now, so for now let's try to let that work finish instead of blowing up when it isn't finished. //assert(!workFunc); finish(); From 212c23f30e16d019a7c0816dff3dfbdd388bb276 Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 24 Aug 2016 21:17:39 +0000 Subject: [PATCH 14/41] task.cpp: - EXPERIMENTAL: Revert task.cpp and rthreads.c to what they were back in r5538, but change scond_wait() to explicitly unlock the mutex before calling WaitForSingleObject().
--- .../src/libretro-common/rthreads/rthreads.c | 55 +---- desmume/src/utils/task.cpp | 204 +++++++++--------- 2 files changed, 100 insertions(+), 159 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index 229a871ae..097a58927 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -81,9 +81,6 @@ struct scond { #ifdef USE_WIN32_THREADS HANDLE event; - volatile int waiters; - volatile bool waiting_ack; - HANDLE ack; #else pthread_cond_t cond; #endif @@ -313,13 +310,7 @@ scond_t *scond_new(void) return NULL; #ifdef USE_WIN32_THREADS - /* this is very complex because recreating condition variable semantics with win32 parts is not easy (or maybe it is and I just havent seen how) */ - /* the main problem is that a condition variable can be used to wake up a thread, but only if the thread is already waiting. */ - /* whereas a win32 event will 'wake up' a thread in advance (the event will be set in advance, so a 'waiter' wont even have to wait on it) */ - cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); - cond->ack = CreateEvent(NULL, FALSE, FALSE, NULL); - cond->waiters = 0; - cond->waiting_ack = false; + cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); event_created = !!cond->event; #else event_created = (pthread_cond_init(&cond->cond, NULL) == 0); @@ -348,7 +339,6 @@ void scond_free(scond_t *cond) #ifdef USE_WIN32_THREADS CloseHandle(cond->event); - CloseHandle(cond->ack); #else pthread_cond_destroy(&cond->cond); #endif @@ -365,33 +355,9 @@ void scond_free(scond_t *cond) void scond_wait(scond_t *cond, slock_t *lock) { #ifdef USE_WIN32_THREADS - /* remember: we currently have mutex so this will be safe */ - cond->waiters++; - if(cond->waiting_ack) - WaitForSingleObject(cond->ack,INFINITE); - - ReleaseMutex(lock->lock); - - /* wait for a signaller */ + slock_unlock(lock); WaitForSingleObject(cond->event, INFINITE); - - /* the 
algorithm hinges on this doing this stuff outside of the mutex */ - /* suppose several people signal right now. Actually, only one of them can. He'll be waiting on an ack signal! *inside the mutex* */ - /* we need to clear waiting_ack before we release him, otherwise it might race to set it to true and beat us */ - /* also: suppose several people are waiting right now (in the above wait on `event`). */ - /* well, only one of them is going to get freed by a signal; it must have been us */ - /* notice that both of the waits for ack are inside the mutex; this guarantees only one of them can be waiting at a time */ - /* that's essential for making this safe */ - //if(cond->waiting_ack) - { - cond->waiting_ack = false; - SetEvent(cond->ack); - } - - /* reacquire mutex and finish up */ - WaitForSingleObject(lock->lock, INFINITE); - cond->waiters--; - + slock_lock(lock); #else pthread_cond_wait(&cond->cond, &lock->lock); #endif @@ -426,22 +392,7 @@ int scond_broadcast(scond_t *cond) void scond_signal(scond_t *cond) { #ifdef USE_WIN32_THREADS - - /* remember: we currently have mutex */ - if(cond->waiters == 0) return; - - /* OK, someone is waiting for a signal */ - - /* if we're waiting for an ack, we can't proceed until we receive an ack (signifies that the event is freed up from the waiter destined to be waked by it) */ - if(cond->waiting_ack) - WaitForSingleObject(cond->ack,INFINITE); - - /* before any further waits or signals, we'll need to wait for a waiter to wake up */ - cond->waiting_ack = true; - - /* the main wakeup event. the winning waiter definitely won't wake up this moment since we're in a mutex. 
*/ SetEvent(cond->event); - #else pthread_cond_signal(&cond->cond); #endif diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index fa4cf13e8..bc51c26b7 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2009-2016 DeSmuME team + Copyright (C) 2009-2015 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,7 +16,6 @@ */ #include -#include #include "types.h" #include "task.h" @@ -52,14 +51,11 @@ int getOnlineCores (void) #endif } -static void thunkTaskProc(void *arg); - class Task::Impl { private: - sthread_t* thread; - friend void thunkTaskProc(void* arg); - void taskProc(); - + sthread_t* _thread; + bool _isThreadRunning; + public: Impl(); ~Impl(); @@ -68,141 +64,135 @@ public: void execute(const TWork &work, void *param); void* finish(); void shutdown(); - void initialize(); slock_t *mutex; - scond_t *workCond; - volatile bool workFlag, finishFlag, exitFlag; - volatile TWork workFunc; - void * volatile workFuncParam; - void * volatile ret; - bool started; + scond_t *condWork; + TWork workFunc; + void *workFuncParam; + void *ret; + bool exitThread; }; -static void thunkTaskProc(void *arg) +static void taskProc(void *arg) { Task::Impl *ctx = (Task::Impl *)arg; - ctx->taskProc(); -} -void Task::Impl::taskProc() -{ - for(;;) - { - slock_lock(mutex); + do { + slock_lock(ctx->mutex); - if(!workFlag) - scond_wait(workCond, mutex); - workFlag = false; + while (ctx->workFunc == NULL && !ctx->exitThread) { + scond_wait(ctx->condWork, ctx->mutex); + } - ret = workFunc(workFuncParam); + if (ctx->workFunc != NULL) { + ctx->ret = ctx->workFunc(ctx->workFuncParam); + } else { + ctx->ret = NULL; + } - finishFlag = true; - scond_signal(workCond); + ctx->workFunc = NULL; + scond_signal(ctx->condWork); - slock_unlock(mutex); + slock_unlock(ctx->mutex); - if(exitFlag) - break; - } -} - -static void* 
killTask(void* task) -{ - ((Task::Impl*)task)->exitFlag = true; - return NULL; + } while(!ctx->exitThread); } Task::Impl::Impl() - : started(false) { + _isThreadRunning = false; + workFunc = NULL; + workFuncParam = NULL; + ret = NULL; + exitThread = false; + + mutex = slock_new(); + condWork = scond_new(); } Task::Impl::~Impl() { shutdown(); -} - -void Task::Impl::initialize() -{ - thread = NULL; - workFunc = NULL; - workCond = NULL; - workFunc = NULL; - workFuncParam = NULL; - workFlag = finishFlag = exitFlag = false; - ret = NULL; - started = false; + slock_free(mutex); + scond_free(condWork); } void Task::Impl::start(bool spinlock) { - //check user error - assert(!started); + slock_lock(this->mutex); - if(started) shutdown(); + if (this->_isThreadRunning) { + slock_unlock(this->mutex); + return; + } - initialize(); - mutex = slock_new(); - workCond = scond_new(); - - slock_lock(mutex); - - thread = sthread_create(&thunkTaskProc,this); - started = true; - - slock_unlock(mutex); -} - -void Task::Impl::shutdown() -{ - if(!started) return; - - //nobody should shutdown while a task is still running; - //it would imply that we're in some kind of shutdown process, and datastructures might be getting freed while a worker is still working on it. - //nonetheless, _troublingly_, it seems like we do that now, so for now let's try to let that work finish instead of blowing up when it isn't finished. 
- //assert(!workFunc); - finish(); - - //a new task which sets the kill flag - execute(killTask,this); - finish(); - - started = false; - - sthread_join(thread); - scond_free(workCond); - slock_free(mutex); -} - -void* Task::Impl::finish() -{ - //no work running; nothing to do - if(!workFunc) - return NULL; - - slock_lock(mutex); - - if(!finishFlag) - scond_wait(workCond, mutex); - finishFlag = false; + this->workFunc = NULL; + this->workFuncParam = NULL; + this->ret = NULL; + this->exitThread = false; + this->_thread = sthread_create(&taskProc,this); + this->_isThreadRunning = true; slock_unlock(this->mutex); - - workFunc = NULL; - - return ret; } void Task::Impl::execute(const TWork &work, void *param) { slock_lock(this->mutex); - workFunc = work; - workFuncParam = param; - workFlag = true; - scond_signal(workCond); + if (work == NULL || !this->_isThreadRunning) { + slock_unlock(this->mutex); + return; + } + this->workFunc = work; + this->workFuncParam = param; + scond_signal(this->condWork); + + slock_unlock(this->mutex); +} + +void* Task::Impl::finish() +{ + void *returnValue = NULL; + + slock_lock(this->mutex); + + if (!this->_isThreadRunning) { + slock_unlock(this->mutex); + return returnValue; + } + + while (this->workFunc != NULL) { + scond_wait(this->condWork, this->mutex); + } + + returnValue = this->ret; + + slock_unlock(this->mutex); + + return returnValue; +} + +void Task::Impl::shutdown() +{ + slock_lock(this->mutex); + + if (!this->_isThreadRunning) { + slock_unlock(this->mutex); + return; + } + + this->workFunc = NULL; + this->exitThread = true; + scond_signal(this->condWork); + + slock_unlock(this->mutex); + + sthread_join(this->_thread); + + slock_lock(this->mutex); + this->_isThreadRunning = false; slock_unlock(this->mutex); } From 3ae591be7a991cb956a124a08c834a7761df1388 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 26 Aug 2016 03:45:42 +0000 Subject: [PATCH 15/41] task.cpp: - Add additional checks for workFunc in Task::Impl::execute() and 
Task::Impl::finish() to make their reentrancy more robust on Windows. - Add a last resort execution of workFunc in Task::Impl::finish() in the case where taskProc() misses the wake up signal from Task::Impl::execute() when running on Windows. --- desmume/src/utils/task.cpp | 50 +++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index bc51c26b7..eed37c137 100644 --- a/desmume/src/utils/task.cpp +++ b/desmume/src/utils/task.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2009-2015 DeSmuME team + Copyright (C) 2009-2016 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -71,6 +71,7 @@ public: void *workFuncParam; void *ret; bool exitThread; + bool isTaskWaiting; }; static void taskProc(void *arg) @@ -81,7 +82,9 @@ static void taskProc(void *arg) slock_lock(ctx->mutex); while (ctx->workFunc == NULL && !ctx->exitThread) { + ctx->isTaskWaiting = true; scond_wait(ctx->condWork, ctx->mutex); + ctx->isTaskWaiting = false; } if (ctx->workFunc != NULL) { @@ -101,6 +104,7 @@ static void taskProc(void *arg) Task::Impl::Impl() { _isThreadRunning = false; + isTaskWaiting = false; workFunc = NULL; workFuncParam = NULL; ret = NULL; @@ -130,6 +134,7 @@ void Task::Impl::start(bool spinlock) this->workFuncParam = NULL; this->ret = NULL; this->exitThread = false; + this->isTaskWaiting = false; this->_thread = sthread_create(&taskProc,this); this->_isThreadRunning = true; @@ -140,7 +145,8 @@ void Task::Impl::execute(const TWork &work, void *param) { slock_lock(this->mutex); - if (work == NULL || !this->_isThreadRunning) { + if ((work == NULL) || (this->workFunc != NULL) || !this->_isThreadRunning) + { slock_unlock(this->mutex); return; } @@ -158,12 +164,48 @@ void* Task::Impl::finish() slock_lock(this->mutex); - if (!this->_isThreadRunning) { + if ((this->workFunc == NULL) || 
!this->_isThreadRunning) { slock_unlock(this->mutex); return returnValue; } - while (this->workFunc != NULL) { + // As a last resort, we need to ensure that taskProc() actually executed, and if + // it didn't, do something about it right now. + // + // Normally, calling execute() will wake up taskProc(), but on certain systems, + // the signal from execute() might get missed by taskProc(). If this signal is + // missed, then this method's scond_wait() will hang, since taskProc() will never + // clear workFunc and signal back when its finished (taskProc() was never woken + // up in the first place). + // + // This situation is only possible on systems where scond_wait() does not have + // immediate lock/unlock mechanics with the wait state, such as on Windows. + // Signals can get lost in scond_wait() since a thread's wait state might start + // at a much later time from releasing the mutex, causing the signalling thread + // to send its signal before the wait state is set. All of this is possible + // because of the fact that switching the wait state and switching the mutex + // state are performed as two separate operations. In common parlance, this is + // known as the "lost wakeup problem". + // + // On systems that do have immediate lock/unlock mechanics with the wait state, + // such as systems that natively support pthread_cond_wait(), it is impossible + // for this situation to occur since both the thread wait state and the mutex + // state will switch simultaneously, thus never missing a signal due to the + // constant protection of the mutex. +#if defined(WIN32) + if (this->isTaskWaiting) + { + // In the event where the signal was missed by taskProc(), just do the work + // right now in this thread. Hopefully, signal misses don't happen to often, + // because if they do, it would completely defeat the purpose of having the + // extra task thread in the first place. 
+ this->ret = this->workFunc(workFuncParam); + this->workFunc = NULL; + } +#endif + + while (this->workFunc != NULL) + { scond_wait(this->condWork, this->mutex); } From 386d9bad96e460482e7a2e72cfaf5ef6460624e7 Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 26 Aug 2016 10:57:20 +0000 Subject: [PATCH 16/41] task.cpp: - Revert the last resort execution of workFunc in Task::Impl::finish(). Windows now has much better compliance with the behavior of pthread_cond_wait(), so the last resort execution is no longer necessary. --- .../src/libretro-common/rthreads/rthreads.c | 108 ++++++++++++++++-- desmume/src/utils/task.cpp | 40 ------- 2 files changed, 98 insertions(+), 50 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index 097a58927..794b09909 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -77,10 +77,21 @@ struct slock #endif }; +#ifdef USE_WIN32_THREADS +//TODO - there's actually no need for this struct. we could do it all with ugly pointer syntax. save that for later. +struct ConceptualBlock +{ + struct ConceptualBlock* next; +}; +#endif + struct scond { #ifdef USE_WIN32_THREADS - HANDLE event; + HANDLE event, hot_potato; + volatile struct ConceptualBlock* volatile root; //the root of the queue; NULL if queue is empty + volatile int waiters; //equivalent to the queue length + volatile int wakens; #else pthread_cond_t cond; #endif @@ -310,7 +321,14 @@ scond_t *scond_new(void) return NULL; #ifdef USE_WIN32_THREADS - cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); + /* this is very complex because recreating condition variable semantics with win32 parts is not easy (or maybe it is and I just havent seen how) */ + /* the main problem is that a condition variable can be used to wake up a thread, but only if the thread is already waiting. 
*/ + /* whereas a win32 event will 'wake up' a thread in advance (the event will be set in advance, so a 'waiter' wont even have to wait on it) */ + cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); + cond->hot_potato = CreateEvent(NULL, FALSE, FALSE, NULL); + cond->waiters = 0; + cond->wakens = 0; + cond->root = NULL; event_created = !!cond->event; #else event_created = (pthread_cond_init(&cond->cond, NULL) == 0); @@ -339,6 +357,7 @@ void scond_free(scond_t *cond) #ifdef USE_WIN32_THREADS CloseHandle(cond->event); + CloseHandle(cond->hot_potato); #else pthread_cond_destroy(&cond->cond); #endif @@ -355,9 +374,66 @@ void scond_free(scond_t *cond) void scond_wait(scond_t *cond, slock_t *lock) { #ifdef USE_WIN32_THREADS - slock_unlock(lock); - WaitForSingleObject(cond->event, INFINITE); - slock_lock(lock); + + //setup a queue (linked list) of blocked threads + volatile struct ConceptualBlock myblock; + myblock.next = NULL; + volatile struct ConceptualBlock* volatile * ptr = &cond->root; + while(*ptr != NULL) + ptr = &((*ptr)->next); + *ptr = &myblock; + + //now the conceptual lock release and condition block are supposed to be atomic. + //we can't do that in windows, but we can simulate the effects by using the queue, by the following analysis: + //What happens if they aren't atomic? + //1. a signaller can rush in and signal, expecting a waiter to get it; but the waiter wouldn't, because he isn't blocked yet + //solution: win32 events make this easy. the event will sit there enabled + //2. a signaller can rush in and signal, and then turn right around and wait + //solution: the signaller will get queued behind the waiter, who's enqueued before he releases the mutex + + for(;;) + { + bool myturn = (cond->root == &myblock); + + if(!myturn) + { + //well, someone else needs to get to go, maybe it's their turn + //NOTE: this depends on good fair behavour of thread blocking in the OS. 
I think it's OK + SetEvent(cond->hot_potato); + } + + ReleaseMutex(lock->lock); + + if(myturn) + { + WaitForSingleObject(cond->event, INFINITE); + break; + } + else + { + WaitForSingleObject(cond->hot_potato, INFINITE); + + //re-acquire mutex just for interrogating the queue + WaitForSingleObject(lock->lock, INFINITE); + } + } + + //re-acquire mutex + WaitForSingleObject(lock->lock, INFINITE); + + //remove ourselves from the queue + cond->root = myblock.next; + + //if we have any more wakening to do, chain it here + cond->wakens--; + if(cond->wakens>0) + SetEvent(cond->event); + + cond->waiters--; + + //always leave this set. because--TBD: explain later + SetEvent(cond->hot_potato); + #else pthread_cond_wait(&cond->cond, &lock->lock); #endif @@ -373,9 +449,14 @@ void scond_wait(scond_t *cond, slock_t *lock) int scond_broadcast(scond_t *cond) { #ifdef USE_WIN32_THREADS - /* FIXME _- check how this function should differ - * from scond_signal implementation. */ - SetEvent(cond->event); + + /* remember: we currently have mutex */ + if(cond->root == NULL) return 0; + + //awaken everything which is currently queued up + if(cond->wakens == 0) SetEvent(cond->event); + cond->wakens = cond->waiters; + return 0; #else return pthread_cond_broadcast(&cond->cond); @@ -392,7 +473,14 @@ int scond_broadcast(scond_t *cond) void scond_signal(scond_t *cond) { #ifdef USE_WIN32_THREADS - SetEvent(cond->event); + + /* remember: we currently have mutex */ + if(cond->root == NULL) return; + + //wake up the next thing in the queue + if(cond->wakens == 0) SetEvent(cond->event); + cond->wakens++; + #else pthread_cond_signal(&cond->cond); #endif @@ -465,4 +553,4 @@ bool scond_wait_timeout(scond_t *cond, slock_t *lock, int64_t timeout_us) ret = pthread_cond_timedwait(&cond->cond, &lock->lock, &now); return (ret == 0); #endif -} +} \ No newline at end of file diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp index eed37c137..da860e9d6 100644 --- a/desmume/src/utils/task.cpp 
+++ b/desmume/src/utils/task.cpp @@ -71,7 +71,6 @@ public: void *workFuncParam; void *ret; bool exitThread; - bool isTaskWaiting; }; static void taskProc(void *arg) @@ -82,9 +81,7 @@ static void taskProc(void *arg) slock_lock(ctx->mutex); while (ctx->workFunc == NULL && !ctx->exitThread) { - ctx->isTaskWaiting = true; scond_wait(ctx->condWork, ctx->mutex); - ctx->isTaskWaiting = false; } if (ctx->workFunc != NULL) { @@ -104,7 +101,6 @@ static void taskProc(void *arg) Task::Impl::Impl() { _isThreadRunning = false; - isTaskWaiting = false; workFunc = NULL; workFuncParam = NULL; ret = NULL; @@ -134,7 +130,6 @@ void Task::Impl::start(bool spinlock) this->workFuncParam = NULL; this->ret = NULL; this->exitThread = false; - this->isTaskWaiting = false; this->_thread = sthread_create(&taskProc,this); this->_isThreadRunning = true; @@ -169,41 +164,6 @@ void* Task::Impl::finish() return returnValue; } - // As a last resort, we need to ensure that taskProc() actually executed, and if - // it didn't, do something about it right now. - // - // Normally, calling execute() will wake up taskProc(), but on certain systems, - // the signal from execute() might get missed by taskProc(). If this signal is - // missed, then this method's scond_wait() will hang, since taskProc() will never - // clear workFunc and signal back when its finished (taskProc() was never woken - // up in the first place). - // - // This situation is only possible on systems where scond_wait() does not have - // immediate lock/unlock mechanics with the wait state, such as on Windows. - // Signals can get lost in scond_wait() since a thread's wait state might start - // at a much later time from releasing the mutex, causing the signalling thread - // to send its signal before the wait state is set. All of this is possible - // because of the fact that switching the wait state and switching the mutex - // state are performed as two separate operations. 
In common parlance, this is - // known as the "lost wakeup problem". - // - // On systems that do have immediate lock/unlock mechanics with the wait state, - // such as systems that natively support pthread_cond_wait(), it is impossible - // for this situation to occur since both the thread wait state and the mutex - // state will switch simultaneously, thus never missing a signal due to the - // constant protection of the mutex. -#if defined(WIN32) - if (this->isTaskWaiting) - { - // In the event where the signal was missed by taskProc(), just do the work - // right now in this thread. Hopefully, signal misses don't happen to often, - // because if they do, it would completely defeat the purpose of having the - // extra task thread in the first place. - this->ret = this->workFunc(workFuncParam); - this->workFunc = NULL; - } -#endif - while (this->workFunc != NULL) { scond_wait(this->condWork, this->mutex); From 5d66422b94970025a21c6f8fa95e2909b00e6aa0 Mon Sep 17 00:00:00 2001 From: zeromus Date: Mon, 29 Aug 2016 04:54:30 +0000 Subject: [PATCH 17/41] win32 scond: remove volatile, fix busy loops in some scenarios, tidy code. Except for scond_wait_timeout, this may be done --- .../src/libretro-common/rthreads/rthreads.c | 1146 +++++++++-------- 1 file changed, 591 insertions(+), 555 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index 794b09909..edf3829c4 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -1,556 +1,592 @@ -/* Copyright (C) 2010-2016 The RetroArch team - * - * --------------------------------------------------------------------------------------- - * The following license statement only applies to this file (rthreads.c). 
- * --------------------------------------------------------------------------------------- - * - * Permission is hereby granted, free of charge, - * to any person obtaining a copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifdef __unix__ -#define _POSIX_C_SOURCE 199309 -#endif - -#include - -#include -#include - -/* with RETRO_WIN32_USE_PTHREADS, pthreads can be used even on win32. 
Maybe only supported in MSVC>=2005 */ - -#if defined(_WIN32) && !defined(RETRO_WIN32_USE_PTHREADS) -#define USE_WIN32_THREADS -#ifdef _XBOX -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#endif -#elif defined(GEKKO) -#include "gx_pthread.h" -#elif defined(PSP) -#include "psp_pthread.h" -#else -#include -#include -#endif - -#ifdef __MACH__ -#include -#include -#endif - -struct thread_data -{ - void (*func)(void*); - void *userdata; -}; - -struct sthread -{ -#ifdef USE_WIN32_THREADS - HANDLE thread; -#else - pthread_t id; -#endif -}; - -struct slock -{ -#ifdef USE_WIN32_THREADS - HANDLE lock; -#else - pthread_mutex_t lock; -#endif -}; - -#ifdef USE_WIN32_THREADS -//TODO - there's actually no need for this struct. we could do it all with ugly pointer syntax. save that for later. -struct ConceptualBlock -{ - struct ConceptualBlock* next; -}; -#endif - -struct scond -{ -#ifdef USE_WIN32_THREADS - HANDLE event, hot_potato; - volatile struct ConceptualBlock* volatile root; //the root of the queue; NULL if queue is empty - volatile int waiters; //equivalent to the queue length - volatile int wakens; -#else - pthread_cond_t cond; -#endif -}; - -#ifdef USE_WIN32_THREADS -static DWORD CALLBACK thread_wrap(void *data_) -#else -static void *thread_wrap(void *data_) -#endif -{ - struct thread_data *data = (struct thread_data*)data_; - if (!data) - return 0; - data->func(data->userdata); - free(data); - return 0; -} - -/** - * sthread_create: - * @start_routine : thread entry callback function - * @userdata : pointer to userdata that will be made - * available in thread entry callback function - * - * Create a new thread. - * - * Returns: pointer to new thread if successful, otherwise NULL. 
- */ -sthread_t *sthread_create(void (*thread_func)(void*), void *userdata) -{ - bool thread_created = false; - struct thread_data *data = NULL; - sthread_t *thread = (sthread_t*)calloc(1, sizeof(*thread)); - - if (!thread) - return NULL; - - data = (struct thread_data*)calloc(1, sizeof(*data)); - if (!data) - goto error; - - data->func = thread_func; - data->userdata = userdata; - -#ifdef USE_WIN32_THREADS - thread->thread = CreateThread(NULL, 0, thread_wrap, data, 0, NULL); - thread_created = !!thread->thread; -#else - thread_created = pthread_create(&thread->id, NULL, thread_wrap, data) == 0; -#endif - - if (!thread_created) - goto error; - - return thread; - -error: - if (data) - free(data); - free(thread); - return NULL; -} - -/** - * sthread_detach: - * @thread : pointer to thread object - * - * Detach a thread. When a detached thread terminates, its - * resource sare automatically released back to the system - * without the need for another thread to join with the - * terminated thread. - * - * Returns: 0 on success, otherwise it returns a non-zero error number. - */ -int sthread_detach(sthread_t *thread) -{ -#ifdef USE_WIN32_THREADS - CloseHandle(thread->thread); - free(thread); - return 0; -#else - return pthread_detach(thread->id); -#endif -} - -/** - * sthread_join: - * @thread : pointer to thread object - * - * Join with a terminated thread. Waits for the thread specified by - * @thread to terminate. If that thread has already terminated, then - * it will return immediately. The thread specified by @thread must - * be joinable. - * - * Returns: 0 on success, otherwise it returns a non-zero error number. - */ -void sthread_join(sthread_t *thread) -{ -#ifdef USE_WIN32_THREADS - WaitForSingleObject(thread->thread, INFINITE); - CloseHandle(thread->thread); -#else - pthread_join(thread->id, NULL); -#endif - free(thread); -} - -/** - * sthread_isself: - * @thread : pointer to thread object - * - * Join with a terminated thread. 
Waits for the thread specified by - * @thread to terminate. If that thread has already terminated, then - * it will return immediately. The thread specified by @thread must - * be joinable. - * - * Returns: true (1) if calling thread is the specified thread - */ -bool sthread_isself(sthread_t *thread) -{ -#ifdef USE_WIN32_THREADS - return GetCurrentThread() == thread->thread; -#else - return pthread_equal(pthread_self(),thread->id); -#endif -} - -/** - * slock_new: - * - * Create and initialize a new mutex. Must be manually - * freed. - * - * Returns: pointer to a new mutex if successful, otherwise NULL. - **/ -slock_t *slock_new(void) -{ - bool mutex_created = false; - slock_t *lock = (slock_t*)calloc(1, sizeof(*lock)); - if (!lock) - return NULL; - -#ifdef USE_WIN32_THREADS - lock->lock = CreateMutex(NULL, FALSE, NULL); - mutex_created = !!lock->lock; -#else - mutex_created = (pthread_mutex_init(&lock->lock, NULL) == 0); -#endif - - if (!mutex_created) - goto error; - - return lock; - -error: - free(lock); - return NULL; -} - -/** - * slock_free: - * @lock : pointer to mutex object - * - * Frees a mutex. - **/ -void slock_free(slock_t *lock) -{ - if (!lock) - return; - -#ifdef USE_WIN32_THREADS - CloseHandle(lock->lock); -#else - pthread_mutex_destroy(&lock->lock); -#endif - free(lock); -} - -/** - * slock_lock: - * @lock : pointer to mutex object - * - * Locks a mutex. If a mutex is already locked by - * another thread, the calling thread shall block until - * the mutex becomes available. -**/ -void slock_lock(slock_t *lock) -{ -#ifdef USE_WIN32_THREADS - WaitForSingleObject(lock->lock, INFINITE); -#else - pthread_mutex_lock(&lock->lock); -#endif -} - -/** - * slock_unlock: - * @lock : pointer to mutex object - * - * Unlocks a mutex. - **/ -void slock_unlock(slock_t *lock) -{ -#ifdef USE_WIN32_THREADS - ReleaseMutex(lock->lock); -#else - pthread_mutex_unlock(&lock->lock); -#endif -} - -/** - * scond_new: - * - * Creates and initializes a condition variable. 
Must - * be manually freed. - * - * Returns: pointer to new condition variable on success, - * otherwise NULL. - **/ -scond_t *scond_new(void) -{ - bool event_created = false; - scond_t *cond = (scond_t*)calloc(1, sizeof(*cond)); - - if (!cond) - return NULL; - -#ifdef USE_WIN32_THREADS - /* this is very complex because recreating condition variable semantics with win32 parts is not easy (or maybe it is and I just havent seen how) */ - /* the main problem is that a condition variable can be used to wake up a thread, but only if the thread is already waiting. */ - /* whereas a win32 event will 'wake up' a thread in advance (the event will be set in advance, so a 'waiter' wont even have to wait on it) */ - cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); - cond->hot_potato = CreateEvent(NULL, FALSE, FALSE, NULL); - cond->waiters = 0; - cond->wakens = 0; - cond->root = NULL; - event_created = !!cond->event; -#else - event_created = (pthread_cond_init(&cond->cond, NULL) == 0); -#endif - - if (!event_created) - goto error; - - return cond; - -error: - free(cond); - return NULL; -} - -/** - * scond_free: - * @cond : pointer to condition variable object - * - * Frees a condition variable. -**/ -void scond_free(scond_t *cond) -{ - if (!cond) - return; - -#ifdef USE_WIN32_THREADS - CloseHandle(cond->event); - CloseHandle(cond->hot_potato); -#else - pthread_cond_destroy(&cond->cond); -#endif - free(cond); -} - -/** - * scond_wait: - * @cond : pointer to condition variable object - * @lock : pointer to mutex object - * - * Block on a condition variable (i.e. wait on a condition). 
- **/ -void scond_wait(scond_t *cond, slock_t *lock) -{ -#ifdef USE_WIN32_THREADS - - //setup a queue (linked list) of blocked threads - volatile struct ConceptualBlock myblock; - myblock.next = NULL; - volatile struct ConceptualBlock* volatile * ptr = &cond->root; - while(*ptr != NULL) - ptr = &((*ptr)->next); - *ptr = &myblock; - - //now the conceptual lock release and condition block are supposed to be atomic. - //we can't do that in windows, but we can simulate the effects by using the queue, by the following analysis: - //What happens if they aren't atomic? - //1. a signaller can rush in and signal, expecting a waiter to get it; but the waiter wouldn't, because he isn't blocked yet - //solution: win32 events make this easy. the event will sit there enabled - //2. a signaller can rush in and signal, and then turn right around and wait - //solution: the signaller will get queued behind the waiter, who's enqueued before he releases the mutex - - for(;;) - { - bool myturn = (cond->root == &myblock); - - if(!myturn) - { - //well, someone else needs to get to go, maybe it's their turn - //NOTE: this depends on good fair behavour of thread blocking in the OS. I think it's OK - SetEvent(cond->hot_potato); - } - - ReleaseMutex(lock->lock); - - if(myturn) - { - WaitForSingleObject(cond->event, INFINITE); - break; - } - else - { - WaitForSingleObject(cond->hot_potato, INFINITE); - - //re-acquire mutex just for interrogating the queue - WaitForSingleObject(lock->lock, INFINITE); - } - } - - //re-acquire mutex - WaitForSingleObject(lock->lock, INFINITE); - - //remove ourselves from the queue - cond->root = myblock.next; - - //if we have any more wakening to do, chain it here - cond->wakens--; - if(cond->wakens>0) - SetEvent(cond->event); - - cond->waiters--; - - //always leave this set. 
because--TBD: explain later - SetEvent(cond->hot_potato); - -#else - pthread_cond_wait(&cond->cond, &lock->lock); -#endif -} - -/** - * scond_broadcast: - * @cond : pointer to condition variable object - * - * Broadcast a condition. Unblocks all threads currently blocked - * on the specified condition variable @cond. - **/ -int scond_broadcast(scond_t *cond) -{ -#ifdef USE_WIN32_THREADS - - /* remember: we currently have mutex */ - if(cond->root == NULL) return 0; - - //awaken everything which is currently queued up - if(cond->wakens == 0) SetEvent(cond->event); - cond->wakens = cond->waiters; - - return 0; -#else - return pthread_cond_broadcast(&cond->cond); -#endif -} - -/** - * scond_signal: - * @cond : pointer to condition variable object - * - * Signal a condition. Unblocks at least one of the threads currently blocked - * on the specified condition variable @cond. - **/ -void scond_signal(scond_t *cond) -{ -#ifdef USE_WIN32_THREADS - - /* remember: we currently have mutex */ - if(cond->root == NULL) return; - - //wake up the next thing in the queue - if(cond->wakens == 0) SetEvent(cond->event); - cond->wakens++; - -#else - pthread_cond_signal(&cond->cond); -#endif -} - -/** - * scond_wait_timeout: - * @cond : pointer to condition variable object - * @lock : pointer to mutex object - * @timeout_us : timeout (in microseconds) - * - * Try to block on a condition variable (i.e. wait on a condition) until - * @timeout_us elapses. - * - * Returns: false (0) if timeout elapses before condition variable is - * signaled or broadcast, otherwise true (1). 
- **/ -bool scond_wait_timeout(scond_t *cond, slock_t *lock, int64_t timeout_us) -{ -#ifdef USE_WIN32_THREADS - DWORD ret; - - WaitForSingleObject(cond->event, 0); - ret = SignalObjectAndWait(lock->lock, cond->event, - (DWORD)(timeout_us) / 1000, FALSE); - - slock_lock(lock); - return ret == WAIT_OBJECT_0; -#else - int ret; - int64_t seconds, remainder; - struct timespec now = {0}; - -#ifdef __MACH__ - /* OSX doesn't have clock_gettime. */ - clock_serv_t cclock; - mach_timespec_t mts; - - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &mts); - mach_port_deallocate(mach_task_self(), cclock); - now.tv_sec = mts.tv_sec; - now.tv_nsec = mts.tv_nsec; -#elif defined(__CELLOS_LV2__) - sys_time_sec_t s; - sys_time_nsec_t n; - - sys_time_get_current_time(&s, &n); - now.tv_sec = s; - now.tv_nsec = n; -#elif defined(__mips__) - struct timeval tm; - - gettimeofday(&tm, NULL); - now.tv_sec = tm.tv_sec; - now.tv_nsec = tm.tv_usec * 1000; -#elif defined(RETRO_WIN32_USE_PTHREADS) - _ftime64_s(&now); -#elif !defined(GEKKO) - /* timeout on libogc is duration, not end time. */ - clock_gettime(CLOCK_REALTIME, &now); -#endif - - seconds = timeout_us / INT64_C(1000000); - remainder = timeout_us % INT64_C(1000000); - - now.tv_sec += seconds; - now.tv_nsec += remainder * INT64_C(1000); - - ret = pthread_cond_timedwait(&cond->cond, &lock->lock, &now); - return (ret == 0); -#endif +/* Copyright (C) 2010-2016 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (rthreads.c). 
+ * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef __unix__ +#define _POSIX_C_SOURCE 199309 +#endif + +#include + +#include +#include + +/* with RETRO_WIN32_USE_PTHREADS, pthreads can be used even on win32. 
Maybe only supported in MSVC>=2005 */ + +#if defined(_WIN32) && !defined(RETRO_WIN32_USE_PTHREADS) +#define USE_WIN32_THREADS +#ifdef _XBOX +#include +#else +#define WIN32_LEAN_AND_MEAN +#include +#endif +#elif defined(GEKKO) +#include "gx_pthread.h" +#elif defined(PSP) +#include "psp_pthread.h" +#else +#include +#include +#endif + +#ifdef __MACH__ +#include +#include +#endif + +struct thread_data +{ + void (*func)(void*); + void *userdata; +}; + +struct sthread +{ +#ifdef USE_WIN32_THREADS + HANDLE thread; +#else + pthread_t id; +#endif +}; + +struct slock +{ +#ifdef USE_WIN32_THREADS + HANDLE lock; +#else + pthread_mutex_t lock; +#endif +}; + +struct scond +{ +#ifdef USE_WIN32_THREADS + + /* The syntax we'll use is mind-bending unless we use a struct. Plus, we might want to store more info later */ + /* This will be used as a linked list immplementing a queue of waiting threads */ + struct QueueEntry + { + struct QueueEntry* next; + }; + + /* With this implementation of scond, we don't have any way of waking (or even identifying) specific threads */ + /* But we need to wake them in the order indicated by the queue. */ + /* This potato token will get get passed around every waiter. The bearer can test whether he's next, and hold onto the potato if he is. */ + /* When he's done he can then put it back into play to progress the queue further */ + HANDLE hot_potato; + + /* The primary signalled event. Hot potatoes are passed until this is set. 
*/ + HANDLE event; + + /* the head of the queue; NULL if queue is empty */ + struct QueueEntry* head; + + /* equivalent to the queue length */ + int waiters; + + /* how many waiters in the queue have been conceptually wakened by signals (even if we haven't managed to actually wake them yet */ + int wakens; + +#else + pthread_cond_t cond; +#endif +}; + +#ifdef USE_WIN32_THREADS +static DWORD CALLBACK thread_wrap(void *data_) +#else +static void *thread_wrap(void *data_) +#endif +{ + struct thread_data *data = (struct thread_data*)data_; + if (!data) + return 0; + data->func(data->userdata); + free(data); + return 0; +} + +/** + * sthread_create: + * @start_routine : thread entry callback function + * @userdata : pointer to userdata that will be made + * available in thread entry callback function + * + * Create a new thread. + * + * Returns: pointer to new thread if successful, otherwise NULL. + */ +sthread_t *sthread_create(void (*thread_func)(void*), void *userdata) +{ + bool thread_created = false; + struct thread_data *data = NULL; + sthread_t *thread = (sthread_t*)calloc(1, sizeof(*thread)); + + if (!thread) + return NULL; + + data = (struct thread_data*)calloc(1, sizeof(*data)); + if (!data) + goto error; + + data->func = thread_func; + data->userdata = userdata; + +#ifdef USE_WIN32_THREADS + thread->thread = CreateThread(NULL, 0, thread_wrap, data, 0, NULL); + thread_created = !!thread->thread; +#else + thread_created = pthread_create(&thread->id, NULL, thread_wrap, data) == 0; +#endif + + if (!thread_created) + goto error; + + return thread; + +error: + if (data) + free(data); + free(thread); + return NULL; +} + +/** + * sthread_detach: + * @thread : pointer to thread object + * + * Detach a thread. When a detached thread terminates, its + * resource sare automatically released back to the system + * without the need for another thread to join with the + * terminated thread. + * + * Returns: 0 on success, otherwise it returns a non-zero error number. 
+ */ +int sthread_detach(sthread_t *thread) +{ +#ifdef USE_WIN32_THREADS + CloseHandle(thread->thread); + free(thread); + return 0; +#else + return pthread_detach(thread->id); +#endif +} + +/** + * sthread_join: + * @thread : pointer to thread object + * + * Join with a terminated thread. Waits for the thread specified by + * @thread to terminate. If that thread has already terminated, then + * it will return immediately. The thread specified by @thread must + * be joinable. + * + * Returns: 0 on success, otherwise it returns a non-zero error number. + */ +void sthread_join(sthread_t *thread) +{ +#ifdef USE_WIN32_THREADS + WaitForSingleObject(thread->thread, INFINITE); + CloseHandle(thread->thread); +#else + pthread_join(thread->id, NULL); +#endif + free(thread); +} + +/** + * sthread_isself: + * @thread : pointer to thread object + * + * Join with a terminated thread. Waits for the thread specified by + * @thread to terminate. If that thread has already terminated, then + * it will return immediately. The thread specified by @thread must + * be joinable. + * + * Returns: true (1) if calling thread is the specified thread + */ +bool sthread_isself(sthread_t *thread) +{ +#ifdef USE_WIN32_THREADS + return GetCurrentThread() == thread->thread; +#else + return pthread_equal(pthread_self(),thread->id); +#endif +} + +/** + * slock_new: + * + * Create and initialize a new mutex. Must be manually + * freed. + * + * Returns: pointer to a new mutex if successful, otherwise NULL. 
+ **/ +slock_t *slock_new(void) +{ + bool mutex_created = false; + slock_t *lock = (slock_t*)calloc(1, sizeof(*lock)); + if (!lock) + return NULL; + +#ifdef USE_WIN32_THREADS + lock->lock = CreateMutex(NULL, FALSE, NULL); + mutex_created = !!lock->lock; +#else + mutex_created = (pthread_mutex_init(&lock->lock, NULL) == 0); +#endif + + if (!mutex_created) + goto error; + + return lock; + +error: + free(lock); + return NULL; +} + +/** + * slock_free: + * @lock : pointer to mutex object + * + * Frees a mutex. + **/ +void slock_free(slock_t *lock) +{ + if (!lock) + return; + +#ifdef USE_WIN32_THREADS + CloseHandle(lock->lock); +#else + pthread_mutex_destroy(&lock->lock); +#endif + free(lock); +} + +/** + * slock_lock: + * @lock : pointer to mutex object + * + * Locks a mutex. If a mutex is already locked by + * another thread, the calling thread shall block until + * the mutex becomes available. +**/ +void slock_lock(slock_t *lock) +{ +#ifdef USE_WIN32_THREADS + WaitForSingleObject(lock->lock, INFINITE); +#else + pthread_mutex_lock(&lock->lock); +#endif +} + +/** + * slock_unlock: + * @lock : pointer to mutex object + * + * Unlocks a mutex. + **/ +void slock_unlock(slock_t *lock) +{ +#ifdef USE_WIN32_THREADS + ReleaseMutex(lock->lock); +#else + pthread_mutex_unlock(&lock->lock); +#endif +} + +/** + * scond_new: + * + * Creates and initializes a condition variable. Must + * be manually freed. + * + * Returns: pointer to new condition variable on success, + * otherwise NULL. 
+ **/ +scond_t *scond_new(void) +{ + scond_t *cond = (scond_t*)calloc(1, sizeof(*cond)); + + if (!cond) + return NULL; + +#ifdef USE_WIN32_THREADS + /* This is very complex because recreating condition variable semantics with win32 parts is not easy */ + /* The main problem is that a condition variable can be used to wake up a thread, but only if the thread is already waiting; */ + /* whereas a win32 event will 'wake up' a thread in advance (the event will be set in advance, so a 'waiter' wont even have to wait on it) */ + /* So at the very least, we need to do something clever. But there's bigger problems. */ + /* We don't even have a straightforward way in win32 to satisfy pthread_cond_wait's atomicity requirement. The bulk of this algorithm is solving that. */ + /* Note: We might could simplify this using vista+ condition variables, but we wanted an XP compatible solution. */ + cond->event = CreateEvent(NULL, FALSE, FALSE, NULL); + if(!cond->event) goto error; + cond->hot_potato = CreateEvent(NULL, FALSE, FALSE, NULL); + if(!cond->hot_potato) + { + CloseHandle(cond->event); + goto error; + } + cond->waiters = cond->wakens = 0; + cond->head = NULL; + +#else + if(pthread_cond_init(&cond->cond, NULL) != 0) + goto error; +#endif + + return cond; + +error: + free(cond); + return NULL; +} + +/** + * scond_free: + * @cond : pointer to condition variable object + * + * Frees a condition variable. +**/ +void scond_free(scond_t *cond) +{ + if (!cond) + return; + +#ifdef USE_WIN32_THREADS + CloseHandle(cond->event); + CloseHandle(cond->hot_potato); +#else + pthread_cond_destroy(&cond->cond); +#endif + free(cond); +} + +/** + * scond_wait: + * @cond : pointer to condition variable object + * @lock : pointer to mutex object + * + * Block on a condition variable (i.e. wait on a condition). 
+ **/
+void scond_wait(scond_t *cond, slock_t *lock)
+{
+#ifdef USE_WIN32_THREADS
+
+   /* add ourselves to a queue of waiting threads */
+   struct QueueEntry myentry;
+   myentry.next = NULL;
+   struct QueueEntry** ptr = &cond->head;
+   while(*ptr) /* walk to the end of the linked list */
+      ptr = &((*ptr)->next);
+   *ptr = &myentry;
+
+   cond->waiters++;
+
+   /* now the conceptual lock release and condition block are supposed to be atomic. */
+   /* we can't do that in windows, but we can simulate the effects by using the queue, by the following analysis: */
+   /* What happens if they aren't atomic? */
+   /* 1. a signaller can rush in and signal, expecting a waiter to get it; but the waiter wouldn't, because he isn't blocked yet */
+   /* solution: win32 events make this easy. the event will sit there enabled */
+   /* 2. a signaller can rush in and signal, and then turn right around and wait */
+   /* solution: the signaller will get queued behind the waiter, who's enqueued before he releases the mutex */
+
+   for(;;)
+   {
+      /* We're always in the mutex here*/
+
+      /* It's my turn if I'm the head of the queue */
+      bool myturn = (cond->head == &myentry);
+
+      if(!myturn)
+      {
+         /* As long as someone is even going to be able to wake up when they receive the potato, keep it going round */
+         if(cond->wakens>0)
+            SetEvent(cond->hot_potato);
+      }
+
+      /* If it's my turn, I hold the potato */
+
+      ReleaseMutex(lock->lock);
+
+      if(myturn)
+      {
+         WaitForSingleObject(cond->event, INFINITE);
+         break;
+      }
+      else
+      {
+         /* Wait to catch the hot potato before checking for my turn again */
+         WaitForSingleObject(cond->hot_potato, INFINITE);
+
+         /* Re-acquire the mutex just for interrogating the queue */
+         WaitForSingleObject(lock->lock, INFINITE);
+      }
+   }
+
+   /* Reacquire the main mutex */
+   WaitForSingleObject(lock->lock, INFINITE);
+
+   /* Remove ourselves from the queue */
+   cond->head = myentry.next;
+   cond->waiters--; /* this thread no longer counts as queued */
+
+   /* If any other wakenings are pending, go ahead and set it up */
+   /* There may actually be no waiters. That's OK. The first waiter will come in, find it's his turn, and immediately get the signaled event */
+   cond->wakens--; /* consume the waken that released this thread */
+   if(cond->wakens>0)
+   {
+      SetEvent(cond->event); /* pre-signal the event for whoever is, or next becomes, head of the queue */
+
+      /* Progress the queue: Put the hot potato back into play. It'll be tossed around until next in line gets it */
+      SetEvent(cond->hot_potato);
+   }
+
+#else
+   pthread_cond_wait(&cond->cond, &lock->lock); /* return code is not inspected */
+#endif
+}
+
+/**
+ * scond_broadcast:
+ * @cond : pointer to condition variable object
+ *
+ * Broadcast a condition. Unblocks all threads currently blocked
+ * on the specified condition variable @cond. Returns 0 on the Win32 path (also when nobody is waiting); otherwise the pthread_cond_broadcast() result.
+ **/
+int scond_broadcast(scond_t *cond)
+{
+#ifdef USE_WIN32_THREADS
+
+   /* remember: we currently have mutex (the counters below are only safe to touch while the caller holds it) */
+   if(cond->waiters == 0) return 0;
+
+   /* awaken everything which is currently queued up */
+   if(cond->wakens == 0) SetEvent(cond->event); /* only start the chain if no earlier waken is still pending */
+   cond->wakens = cond->waiters; /* one pending waken per thread queued right now */
+
+   /* Since there is now at least one pending waken, the potato must be in play */
+   SetEvent(cond->hot_potato);
+
+   return 0;
+#else
+   return pthread_cond_broadcast(&cond->cond);
+#endif
+}
+
+/**
+ * scond_signal:
+ * @cond : pointer to condition variable object
+ *
+ * Signal a condition. Unblocks at least one of the threads currently blocked
+ * on the specified condition variable @cond. Does nothing on the Win32 path when no thread is queued.
+ **/
+void scond_signal(scond_t *cond)
+{
+#ifdef USE_WIN32_THREADS
+
+   /* remember: we currently have mutex (the counters below are only safe to touch while the caller holds it) */
+   if(cond->waiters == 0) return;
+
+   /* wake up the next thing in the queue */
+   if(cond->wakens == 0) SetEvent(cond->event); /* only start the chain if no earlier waken is still pending */
+   cond->wakens++; /* NOTE(review): not capped at cond->waiters, so repeated signals can leave the event pre-signaled for a later waiter - presumably tolerated as a spurious wakeup; confirm */
+
+   /* Since there is now at least one pending waken, the potato must be in play */
+   SetEvent(cond->hot_potato);
+
+#else
+   pthread_cond_signal(&cond->cond);
+#endif
+}
+
+/**
+ * scond_wait_timeout:
+ * @cond : pointer to condition variable object
+ * @lock : pointer to mutex object
+ * @timeout_us : timeout (in microseconds)
+ *
+ * Try to block on a condition variable (i.e. wait on a condition) until
+ * @timeout_us elapses.
+ *
+ * Returns: false (0) if timeout elapses before condition variable is
+ * signaled or broadcast, otherwise true (1). @timeout_us is relative to the call; @lock must be held on entry and is held again on return.
+ **/
+bool scond_wait_timeout(scond_t *cond, slock_t *lock, int64_t timeout_us)
+{
+#ifdef USE_WIN32_THREADS
+   DWORD ret;
+
+   /* TODO: this is woefully inadequate. It needs to be solved with the newer approach used above */
+   WaitForSingleObject(cond->event, 0); /* zero-timeout wait: drains a signal already pending on the event */
+   ret = SignalObjectAndWait(lock->lock, cond->event,
+         (DWORD)(timeout_us / 1000), FALSE); /* convert us -> ms in 64 bits first, then narrow; casting before dividing wrapped timeouts above ~4294 s */
+
+   slock_lock(lock); /* SignalObjectAndWait released the mutex; take it back before returning */
+   return ret == WAIT_OBJECT_0;
+#else
+   int ret;
+   int64_t seconds, remainder;
+   struct timespec now = {0};
+
+#ifdef __MACH__
+   /* OSX doesn't have clock_gettime. */
+   clock_serv_t cclock;
+   mach_timespec_t mts;
+
+   host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+   clock_get_time(cclock, &mts);
+   mach_port_deallocate(mach_task_self(), cclock);
+   now.tv_sec = mts.tv_sec;
+   now.tv_nsec = mts.tv_nsec;
+#elif defined(__CELLOS_LV2__)
+   sys_time_sec_t s;
+   sys_time_nsec_t n;
+
+   sys_time_get_current_time(&s, &n);
+   now.tv_sec = s;
+   now.tv_nsec = n;
+#elif defined(__mips__)
+   struct timeval tm;
+
+   gettimeofday(&tm, NULL);
+   now.tv_sec = tm.tv_sec;
+   now.tv_nsec = tm.tv_usec * 1000;
+#elif defined(RETRO_WIN32_USE_PTHREADS)
+   _ftime64_s(&now); /* NOTE(review): _ftime64_s() is documented to take a struct __timeb64 *, not a timespec * - confirm this branch builds */
+#elif !defined(GEKKO)
+   /* timeout on libogc is duration, not end time. */
+   clock_gettime(CLOCK_REALTIME, &now);
+#endif
+
+   seconds = timeout_us / INT64_C(1000000);
+   remainder = now.tv_nsec + (timeout_us % INT64_C(1000000)) * INT64_C(1000); /* total nanoseconds; can reach or exceed one second */
+
+   now.tv_sec += seconds + remainder / INT64_C(1000000000); /* carry whole seconds out of the nanosecond sum */
+   now.tv_nsec = remainder % INT64_C(1000000000); /* keep tv_nsec below 1e9, otherwise pthread_cond_timedwait fails with EINVAL */
+
+   ret = pthread_cond_timedwait(&cond->cond, &lock->lock, &now);
+   return (ret == 0);
+#endif
 }
\ No newline at end of file
From 538442861baede759c0ac0c66c49fc4f8f668a1b Mon Sep 17 00:00:00 2001
From: rogerman
Date: Mon, 29 Aug 2016 20:08:07 +0000
Subject: [PATCH 18/41] rthreads.c: - In scond_wait(), make use of SignalObjectAndWait() as an optimization.
--- .../src/libretro-common/rthreads/rthreads.c | 46 ++++++------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index edf3829c4..4122ed4a8 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -411,42 +411,22 @@ void scond_wait(scond_t *cond, slock_t *lock) /* solution: win32 events make this easy. the event will sit there enabled */ /* 2. a signaller can rush in and signal, and then turn right around and wait */ /* solution: the signaller will get queued behind the waiter, who's enqueued before he releases the mutex */ - - for(;;) + + /* It's my turn if I'm the head of the queue. Check to see if it's my turn. */ + while (cond->head != &myentry) { - /* We're always in the mutex here*/ + /* As long as someone is even going to be able to wake up when they receive the potato, keep it going round */ + if (cond->wakens > 0) + SetEvent(cond->hot_potato); - /* It's my turn if I'm the head of the queue */ - bool myturn = (cond->head == &myentry); - - if(!myturn) - { - /* As long as someone is even going to be able to wake up when they receive the potato, keep it going round */ - if(cond->wakens>0) - SetEvent(cond->hot_potato); - } - - /* If it's my turn, I hold the potato */ - - ReleaseMutex(lock->lock); - - if(myturn) - { - WaitForSingleObject(cond->event, INFINITE); - break; - } - else - { - /* Wait to catch the hot potato before checking for my turn again */ - WaitForSingleObject(cond->hot_potato, INFINITE); - - /* Re-acquire the mutex just for interrogating the queue */ - WaitForSingleObject(lock->lock, INFINITE); - } + /* Wait to catch the hot potato before checking for my turn again */ + SignalObjectAndWait(lock->lock, cond->hot_potato, INFINITE, FALSE); + slock_lock(lock); } - - /* Reacquire the main mutex */ - WaitForSingleObject(lock->lock, INFINITE); + + /* It's my turn now -- I 
hold the potato */ + SignalObjectAndWait(lock->lock, cond->event, INFINITE, FALSE); + slock_lock(lock); /* Remove ourselves from the queue */ cond->head = myentry.next; From 8f6bfd280829e6e0182866061c42036d42d3a858 Mon Sep 17 00:00:00 2001 From: zeromus Date: Fri, 2 Sep 2016 01:15:26 +0000 Subject: [PATCH 19/41] w32 scond: reorder variable declarations and statements for ancient compilers --- desmume/src/libretro-common/rthreads/rthreads.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/desmume/src/libretro-common/rthreads/rthreads.c b/desmume/src/libretro-common/rthreads/rthreads.c index 4122ed4a8..31892c372 100644 --- a/desmume/src/libretro-common/rthreads/rthreads.c +++ b/desmume/src/libretro-common/rthreads/rthreads.c @@ -396,11 +396,11 @@ void scond_wait(scond_t *cond, slock_t *lock) /* add ourselves to a queue of waiting threads */ struct QueueEntry myentry; - myentry.next = NULL; - struct QueueEntry** ptr = &cond->head; + struct QueueEntry** ptr = &cond->head; while(*ptr) /* walk to the end of the linked list */ ptr = &((*ptr)->next); *ptr = &myentry; + myentry.next = NULL; cond->waiters++; From df7f081cb52486aa505ea4fe31fbef8a34db7ec0 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 6 Sep 2016 18:09:00 +0000 Subject: [PATCH 20/41] placement of fastbuild hacks in gpu.cpp breaks some build types. tired of fixing it locally. not a safe hack anyway. --- desmume/src/GPU.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index cbf098329..772eb2f5e 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -18,14 +18,6 @@ along with the this software. If not, see . 
*/ -#ifdef FASTBUILD - #undef FORCEINLINE - #define FORCEINLINE - //compilation speed hack (cuts time exactly in half by cutting out permutations) - #define DISABLE_MOSAIC - #define DISABLE_COLOREFFECTDISABLEHINT -#endif - #include "GPU.h" #include @@ -48,6 +40,13 @@ #include "matrix.h" #include "emufile.h" +#ifdef FASTBUILD + #undef FORCEINLINE + #define FORCEINLINE + //compilation speed hack (cuts time exactly in half by cutting out permutations) + #define DISABLE_MOSAIC + #define DISABLE_COLOREFFECTDISABLEHINT +#endif //instantiate static instance u16 GPUEngineBase::_brightnessUpTable555[17][0x8000]; From de91bcf3699b756fa0c26ad1db3109f0131c8378 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 6 Sep 2016 21:17:32 +0000 Subject: [PATCH 21/41] winport: fix #1590 1 pixel black line on the right side of the emulator --- desmume/src/windows/CWindow.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/desmume/src/windows/CWindow.cpp b/desmume/src/windows/CWindow.cpp index d6af59235..b9edfe540 100644 --- a/desmume/src/windows/CWindow.cpp +++ b/desmume/src/windows/CWindow.cpp @@ -621,8 +621,10 @@ static void MyAdjustWindowRectEx(RECT* rect, HWND hwnd) ZeroMemory(&mbi, sizeof(mbi)); mbi.cbSize = sizeof(mbi); GetMenuBarInfo(hwnd, OBJID_MENU, 0, &mbi); - //int menuHeight = (mbi.rcBar.bottom - mbi.rcBar.top + 1); //zero 07-aug-2016 - why did I do this? it isn't normal in windows and in the case of no menu bar it was making a 1 instead of a 0 (r3184 in 2009) + + //if the menubar exists, its height is off by 1 (frame between bar and client area?) 
int menuHeight = (mbi.rcBar.bottom - mbi.rcBar.top); + if(menuHeight != 0) menuHeight++; rect->bottom -= cymenu; rect->bottom += menuHeight; From ebff84cd456c03f4ce22179979978172c5946999 Mon Sep 17 00:00:00 2001 From: zeromus Date: Thu, 8 Sep 2016 22:31:31 +0000 Subject: [PATCH 22/41] try to apply patches from #1593 --- desmume/src/commandline.cpp | 15 ++++++++++----- desmume/src/commandline.h | 1 + desmume/src/gtk/main.cpp | 22 ++++++++++++++-------- desmume/src/windows/main.cpp | 3 +++ 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp index 049fa7887..ccc07ea53 100644 --- a/desmume/src/commandline.cpp +++ b/desmume/src/commandline.cpp @@ -67,6 +67,7 @@ CommandLine::CommandLine() , start_paused(FALSE) , autodetect_method(-1) , render3d(COMMANDLINE_RENDER3D_DEFAULT) +, language(-1) { #ifndef HOST_WINDOWS disable_sound = 0; @@ -117,6 +118,7 @@ ENDL " --bios-arm9 BIN_FILE Uses the ARM9 BIOS provided at the specified path" ENDL " --bios-arm7 BIN_FILE Uses the ARM7 BIOS provided at the specified path" ENDL " --bios-swi Uses SWI from the provided bios files (else HLE)" ENDL +" --lang N Pick firmware language (can affect game translations)" ENDL ENDL "Arguments affecting contents of SLOT-1:" ENDL " --slot1 [RETAIL|RETAILAUTO|R4|RETAILNAND|RETAILMCDROM|RETAILDEBUG]" ENDL @@ -163,6 +165,7 @@ ENDL #define OPT_CONSOLE_TYPE 200 #define OPT_ARM9 201 #define OPT_ARM7 202 +#define OPT_LANGUAGE 203 #define OPT_SLOT1 300 #define OPT_SLOT1_FAT_DIR 301 @@ -224,7 +227,8 @@ bool CommandLine::parse(int argc,char **argv) { "console-type", required_argument, NULL, OPT_CONSOLE_TYPE }, { "bios-arm9", required_argument, NULL, OPT_ARM9}, { "bios-arm7", required_argument, NULL, OPT_ARM7}, - { "bios-swi", required_argument, &_bios_swi, 1}, + { "bios-swi", no_argument, &_bios_swi, 1}, + { "lang", required_argument, NULL, OPT_LANGUAGE}, //slot-1 contents { "slot1", required_argument, NULL, OPT_SLOT1}, @@ -308,6 +312,7 @@ 
bool CommandLine::parse(int argc,char **argv) //utilities case OPT_ADVANSCENE: CommonSettings.run_advanscene_import = optarg; break; + case OPT_LANGUAGE: language = atoi(optarg); break; } } //arg parsing loop @@ -354,10 +359,10 @@ bool CommandLine::parse(int argc,char **argv) //process 3d renderer _render3d = strtoupper(_render3d); if(_render3d == "NONE") render3d = COMMANDLINE_RENDER3D_NONE; - if(_render3d == "SW") render3d = COMMANDLINE_RENDER3D_SW; - if(_render3d == "OLDGL") render3d = COMMANDLINE_RENDER3D_OLDGL; - if(_render3d == "AUTOGL") render3d = COMMANDLINE_RENDER3D_AUTOGL; - if(_render3d == "GL") render3d = COMMANDLINE_RENDER3D_GL; + else if(_render3d == "SW") render3d = COMMANDLINE_RENDER3D_SW; + else if(_render3d == "OLDGL") render3d = COMMANDLINE_RENDER3D_OLDGL; + else if(_render3d == "AUTOGL") render3d = COMMANDLINE_RENDER3D_AUTOGL; + else if(_render3d == "GL") render3d = COMMANDLINE_RENDER3D_GL; if (autodetect_method != -1) CommonSettings.autodetectBackupMethod = autodetect_method; diff --git a/desmume/src/commandline.h b/desmume/src/commandline.h index b77239400..3e095bc0e 100644 --- a/desmume/src/commandline.h +++ b/desmume/src/commandline.h @@ -47,6 +47,7 @@ public: int depth_threshold; int autodetect_method; int render3d; + int language; std::string nds_file; std::string play_movie_file; std::string record_movie_file; diff --git a/desmume/src/gtk/main.cpp b/desmume/src/gtk/main.cpp index 5b335cafb..8bde91193 100644 --- a/desmume/src/gtk/main.cpp +++ b/desmume/src/gtk/main.cpp @@ -1,6 +1,6 @@ /* main.cpp - this file is part of DeSmuME * - * Copyright (C) 2006-2015 DeSmuME Team + * Copyright (C) 2006-2016 DeSmuME Team * Copyright (C) 2007 Pascal Giard (evilynux) * * This file is free software; you can redistribute it and/or modify @@ -649,24 +649,30 @@ public: }; static void -init_configured_features( class configured_features *config) +init_configured_features( class configured_features *config ) { - config->engine_3d = 1; + if(config->render3d == 
COMMANDLINE_RENDER3D_GL || config->render3d == COMMANDLINE_RENDER3D_OLDGL || config->render3d == COMMANDLINE_RENDER3D_AUTOGL) + config->engine_3d = 2; + else + config->engine_3d = 1; config->savetype = 0; config->timeout = 0; /* use the default language */ - config->firmware_language = -1; + config->firmware_language = -1; + + /* If specified by --lang option the lang will change to choosed one */ + config->firmware_language = config->language; } static int fill_configured_features( class configured_features *config, - int argc, char ** argv) + char ** argv) { GOptionEntry options[] = { - { "3d-engine", 0, 0, G_OPTION_ARG_INT, &config->engine_3d, "Select 3d rendering engine. Available engines:\n" + { "3d-render", 0, 0, G_OPTION_ARG_INT, &config->engine_3d, "Select 3D rendering engine. Available engines:\n" "\t\t\t\t 0 = 3d disabled\n" "\t\t\t\t 1 = internal rasterizer (default)\n" #if defined(HAVE_LIBOSMESA) || defined(HAVE_GL_GLX) @@ -696,7 +702,6 @@ fill_configured_features( class configured_features *config, //g_option_context_add_main_entries (config->ctx, options, "options"); //g_option_context_add_group (config->ctx, gtk_get_option_group (TRUE)); - config->parse(argc,argv); if(!config->validate()) goto error; @@ -3252,6 +3257,7 @@ int main (int argc, char *argv[]) // The global menu screws up the window size... 
unsetenv("UBUNTU_MENUPROXY"); + my_config.parse(argc, argv); init_configured_features( &my_config); if (!g_thread_supported()) @@ -3259,7 +3265,7 @@ int main (int argc, char *argv[]) gtk_init(&argc, &argv); - if ( !fill_configured_features( &my_config, argc, argv)) { + if ( !fill_configured_features( &my_config, argv)) { exit(0); } diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index 1f7b29b1f..abc51e050 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -3383,6 +3383,9 @@ int _main() } } + if(cmdline.language != -1) + CommonSettings.fw_config.language = cmdline.language; + cmdline.process_movieCommands(); if(cmdline.load_slot != -1) From 14458da8f8f1476be52313191f290f40a83f2570 Mon Sep 17 00:00:00 2001 From: zeromus Date: Thu, 8 Sep 2016 22:39:31 +0000 Subject: [PATCH 23/41] do a better job on the language setting, probably --- desmume/src/commandline.cpp | 6 ++++-- desmume/src/windows/main.cpp | 4 ++-- desmume/src/windows/main.h | 9 --------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp index ccc07ea53..8551db797 100644 --- a/desmume/src/commandline.cpp +++ b/desmume/src/commandline.cpp @@ -67,7 +67,7 @@ CommandLine::CommandLine() , start_paused(FALSE) , autodetect_method(-1) , render3d(COMMANDLINE_RENDER3D_DEFAULT) -, language(-1) +, language(1) //english by default { #ifndef HOST_WINDOWS disable_sound = 0; @@ -118,7 +118,9 @@ ENDL " --bios-arm9 BIN_FILE Uses the ARM9 BIOS provided at the specified path" ENDL " --bios-arm7 BIN_FILE Uses the ARM7 BIOS provided at the specified path" ENDL " --bios-swi Uses SWI from the provided bios files (else HLE)" ENDL -" --lang N Pick firmware language (can affect game translations)" ENDL +" --lang N Firmware language (can affect game translations)" ENDL +" 0 = Japanese, 1 = English (default), 2 = French" ENDL +" 3 = German, 4 = Italian, 5 = Spanish" ENDL ENDL "Arguments affecting contents of 
SLOT-1:" ENDL " --slot1 [RETAIL|RETAILAUTO|R4|RETAILNAND|RETAILMCDROM|RETAILDEBUG]" ENDL diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index abc51e050..33b1a864b 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -3383,8 +3383,8 @@ int _main() } } - if(cmdline.language != -1) - CommonSettings.fw_config.language = cmdline.language; + //not supported; use the GUI + //if(cmdline.language != -1) CommonSettings.fw_config.language = cmdline.language; cmdline.process_movieCommands(); diff --git a/desmume/src/windows/main.h b/desmume/src/windows/main.h index a216cb1d4..d15c32604 100644 --- a/desmume/src/windows/main.h +++ b/desmume/src/windows/main.h @@ -57,15 +57,6 @@ extern bool ShowLagFrameCounter; #define GPU3D_SWRAST 2 #define GPU3D_OPENGL_OLD 3 -static const int LANGUAGE_ENGLISH = 0; -static const int LANGUAGE_FRENCH = 1; -static const int LANGUAGE_CHINESE = 3; -static const int LANGUAGE_ITALIAN = 4; -static const int LANGUAGE_JAPANESE = 5; -static const int LANGUAGE_SPANISH = 6; -static const int LANGUAGE_KOREAN = 7; -static const int LANGUAGE_BRAZILIAN = 8; - extern void Change3DCoreWithFallbackAndSave(int newCore); extern int backupmemorytype; From 59088e1ff84cdbac20a3cfd7df131b7edc1842f0 Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 14 Sep 2016 21:49:47 +0000 Subject: [PATCH 24/41] =?UTF-8?q?Cocoa=20Port:=20-=20OS=20X=20App=20Debug?= =?UTF-8?q?=20builds=20now=20use=20Xcode=208=E2=80=99s=20new=20Incremental?= =?UTF-8?q?=20LTO=20feature.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index 63d240a0b..edcc944f1 100644 --- a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ 
b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -4577,13 +4577,13 @@ buildSettings = { GCC_OPTIMIZATION_LEVEL = fast; GCC_UNROLL_LOOPS = YES; + LLVM_LTO = YES_THIN; }; name = Debug; }; AB796D6F15CDCBA200C59155 /* Release */ = { isa = XCBuildConfiguration; buildSettings = { - GCC_OPTIMIZATION_LEVEL = fast; GCC_UNROLL_LOOPS = YES; LLVM_LTO = YES; }; From b39f9ef9e499c8437ae8f3cc1a04d75bc605560e Mon Sep 17 00:00:00 2001 From: zeromus Date: Sun, 25 Sep 2016 22:09:13 +0000 Subject: [PATCH 25/41] fix bugs in MC import. I dont want to talk about it. --- desmume/src/emufile.h | 1 + desmume/src/mc.cpp | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/desmume/src/emufile.h b/desmume/src/emufile.h index b2b3474cc..3c9bf2d4f 100644 --- a/desmume/src/emufile.h +++ b/desmume/src/emufile.h @@ -280,6 +280,7 @@ private: { mPositionCacheEnabled = false; mCondition = eCondition_Clean; + mFilePosition = 0; fp = fopen(fname,mode); if(!fp) failbit = true; diff --git a/desmume/src/mc.cpp b/desmume/src/mc.cpp index 2ddae7b26..973eb95de 100644 --- a/desmume/src/mc.cpp +++ b/desmume/src/mc.cpp @@ -294,7 +294,9 @@ BackupDevice::BackupDevice() else { printf("BackupDevice: Converting old raw .sav file.\n"); - sz = trim(buf, sz); + //dont TRIM this! it will wreck the searchFileSaveType below. + //was this intended for egregiously over-sized save files? too bad. 
+ //sz = trim(buf, sz); } if (fpOut->fwrite(buf, sz) == sz) @@ -305,6 +307,7 @@ BackupDevice::BackupDevice() info.type = (res + 1); addr_size = info.addr_size = save_types[info.type].addr_size; info.size = fsize = sz; + fpMC = fpOut; //so ensure() works ensure(sz, fpOut); fsize = 0; } @@ -1069,8 +1072,10 @@ bool BackupDevice::importData(const char *filename, u32 force_size) bool res = false; if (strlen(filename) < 4) return res; - if ((memcmp(filename + strlen(filename) - 4, ".duc", 4) == 0) || - (memcmp(filename + strlen(filename) - 4, ".dss", 4) == 0)) + std::string ext = strright(filename,4); + bool isDuc = strncasecmp(ext.c_str(), ".duc", 4) == 0; + bool isDss = strncasecmp(ext.c_str(), ".dss", 4) == 0; + if(isDuc || isDss) res = import_duc(filename, force_size); else if (import_no_gba(filename, force_size)) From 4af90cd902a7483df447cdf2e4954a3d259f7161 Mon Sep 17 00:00:00 2001 From: rogerman Date: Thu, 29 Sep 2016 00:58:04 +0000 Subject: [PATCH 26/41] Filters: - Remove the Deposterize texture filter from render3D.cpp and make it a general-purpose standalone filter. 
--- desmume/src/Makefile.am | 42 +++- desmume/src/OGLRender.cpp | 8 +- desmume/src/OGLRender_3_2.cpp | 4 +- .../project.pbxproj | 10 + .../project.pbxproj | 12 ++ desmume/src/filter/deposterize.cpp | 186 ++++++++++++++++ desmume/src/filter/filter.h | 35 ++-- desmume/src/render3D.cpp | 198 +++--------------- desmume/src/render3D.h | 8 +- desmume/src/windows/DeSmuME.vcxproj | 1 + desmume/src/windows/DeSmuME.vcxproj.filters | 3 + 11 files changed, 312 insertions(+), 195 deletions(-) create mode 100644 desmume/src/filter/deposterize.cpp diff --git a/desmume/src/Makefile.am b/desmume/src/Makefile.am index be5652c99..2f137db59 100644 --- a/desmume/src/Makefile.am +++ b/desmume/src/Makefile.am @@ -88,15 +88,45 @@ libdesmume_a_SOURCES = \ utils/tinyxml/tinyxmlerror.cpp \ utils/tinyxml/tinyxmlparser.cpp \ utils/glcorearb.h \ - addons/slot2_auto.cpp addons/slot2_mpcf.cpp addons/slot2_paddle.cpp addons/slot2_gbagame.cpp addons/slot2_none.cpp addons/slot2_rumblepak.cpp addons/slot2_guitarGrip.cpp addons/slot2_expMemory.cpp addons/slot2_piano.cpp addons/slot2_passme.cpp addons/slot1_none.cpp addons/slot1_r4.cpp addons/slot1_retail_nand.cpp addons/slot1_retail_auto.cpp addons/slot1_retail_mcrom.cpp addons/slot1_retail_mcrom_debug.cpp addons/slot1comp_mc.cpp addons/slot1comp_mc.h addons/slot1comp_rom.h addons/slot1comp_rom.cpp addons/slot1comp_protocol.h addons/slot1comp_protocol.cpp \ + addons/slot2_auto.cpp \ + addons/slot2_mpcf.cpp \ + addons/slot2_paddle.cpp \ + addons/slot2_gbagame.cpp \ + addons/slot2_none.cpp \ + addons/slot2_rumblepak.cpp \ + addons/slot2_guitarGrip.cpp \ + addons/slot2_expMemory.cpp \ + addons/slot2_piano.cpp \ + addons/slot2_passme.cpp \ + addons/slot1_none.cpp \ + addons/slot1_r4.cpp \ + addons/slot1_retail_nand.cpp \ + addons/slot1_retail_auto.cpp \ + addons/slot1_retail_mcrom.cpp \ + addons/slot1_retail_mcrom_debug.cpp \ + addons/slot1comp_mc.cpp \ + addons/slot1comp_mc.h \ + addons/slot1comp_rom.h \ + addons/slot1comp_rom.cpp \ + 
addons/slot1comp_protocol.h \ + addons/slot1comp_protocol.cpp \ cheatSystem.cpp cheatSystem.h \ texcache.cpp texcache.h rasterize.cpp rasterize.h \ metaspu/metaspu.cpp metaspu/metaspu.h \ - filter/2xsai.cpp filter/bilinear.cpp filter/epx.cpp filter/filter.h \ - filter/hq2x.cpp filter/hq2x.h \ - filter/hq3x.cpp filter/hq3x.dat \ - filter/hq4x.cpp filter/hq4x.dat \ - filter/interp.h filter/lq2x.cpp filter/lq2x.h filter/scanline.cpp \ + filter/2xsai.cpp \ + filter/bilinear.cpp \ + filter/deposterize.cpp \ + filter/epx.cpp \ + filter/filter.h \ + filter/hq2x.cpp \ + filter/hq2x.h \ + filter/hq3x.cpp \ + filter/hq3x.dat \ + filter/hq4x.cpp \ + filter/hq4x.dat \ + filter/interp.h \ + filter/lq2x.cpp filter/lq2x.h \ + filter/scanline.cpp \ filter/videofilter.cpp filter/videofilter.h \ filter/xbrz.cpp filter/xbrz.h \ version.cpp version.h \ diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index c2f64fedb..bb5bfcb05 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -2993,10 +2993,10 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT size_t texWidth = this->currTexture->sizeX; size_t texHeight = this->currTexture->sizeY; - if (this->_textureDeposterizeBuffer != NULL) + if (this->_textureDeposterizeDstSurface.Surface != NULL) { this->TextureDeposterize(textureSrc, texWidth, texHeight); - textureSrc = this->_textureDeposterizeBuffer; + textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; } switch (this->_textureScalingFactor) @@ -4644,10 +4644,10 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT size_t texWidth = this->currTexture->sizeX; size_t texHeight = this->currTexture->sizeY; - if (this->_textureDeposterizeBuffer != NULL) + if (this->_textureDeposterizeDstSurface.Surface != NULL) { this->TextureDeposterize(textureSrc, texWidth, texHeight); - textureSrc = this->_textureDeposterizeBuffer; + textureSrc = (u32 
*)this->_textureDeposterizeDstSurface.Surface; } switch (this->_textureScalingFactor) diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 7c48721b9..e218fc448 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1721,10 +1721,10 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT size_t texWidth = this->currTexture->sizeX; size_t texHeight = this->currTexture->sizeY; - if (this->_textureDeposterizeBuffer != NULL) + if (this->_textureDeposterizeDstSurface.Surface != NULL) { this->TextureDeposterize(textureSrc, texWidth, texHeight); - textureSrc = this->_textureDeposterizeBuffer; + textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; } switch (this->_textureScalingFactor) diff --git a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj index edcc944f1..1a7713810 100644 --- a/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj +++ b/desmume/src/cocoa/DeSmuME (Latest).xcodeproj/project.pbxproj @@ -112,6 +112,10 @@ AB2EE13117D57F5000F68622 /* fsnitro.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB2EE13017D57F5000F68622 /* fsnitro.cpp */; }; AB2EE13217D57F5000F68622 /* fsnitro.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB2EE13017D57F5000F68622 /* fsnitro.cpp */; }; AB2EE13317D57F5000F68622 /* fsnitro.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB2EE13017D57F5000F68622 /* fsnitro.cpp */; }; + AB301BDF1D9C8BAC00246A93 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB301BDE1D9C8BAC00246A93 /* deposterize.cpp */; }; + AB301BE01D9C8BCD00246A93 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB301BDE1D9C8BAC00246A93 /* deposterize.cpp */; }; + AB301BE11D9C8BCE00246A93 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AB301BDE1D9C8BAC00246A93 /* deposterize.cpp */; }; + AB301BE21D9C8BCF00246A93 /* deposterize.cpp in Sources 
*/ = {isa = PBXBuildFile; fileRef = AB301BDE1D9C8BAC00246A93 /* deposterize.cpp */; }; AB350BA51478AC96007165AC /* IOKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = AB350BA41478AC96007165AC /* IOKit.framework */; }; AB350D3B147A1D93007165AC /* HID_usage_strings.plist in Resources */ = {isa = PBXBuildFile; fileRef = AB350D3A147A1D93007165AC /* HID_usage_strings.plist */; }; AB3701E5173A3FBF006E573E /* Carbon.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = AB74EC891738499C0026C41E /* Carbon.framework */; }; @@ -1323,6 +1327,7 @@ AB2EE12B17D57ED500F68622 /* slot1_retail_mcrom_debug.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slot1_retail_mcrom_debug.cpp; sourceTree = ""; }; AB2EE12F17D57F5000F68622 /* fsnitro.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fsnitro.h; sourceTree = ""; }; AB2EE13017D57F5000F68622 /* fsnitro.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fsnitro.cpp; sourceTree = ""; }; + AB301BDE1D9C8BAC00246A93 /* deposterize.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deposterize.cpp; sourceTree = ""; }; AB350BA41478AC96007165AC /* IOKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = IOKit.framework; path = System/Library/Frameworks/IOKit.framework; sourceTree = SDKROOT; }; AB350D38147A1D8D007165AC /* English */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = English; path = translations/English.lproj/HID_usage_strings.plist; sourceTree = ""; }; AB3A655C16CC5416001F5D4A /* EmuControllerDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = EmuControllerDelegate.h; sourceTree = ""; }; @@ -2926,6 +2931,7 @@ children = ( ABFE14FA14C92FF5005D6699 /* 2xsai.cpp */, ABFE14FB14C92FF5005D6699 /* 
bilinear.cpp */, + AB301BDE1D9C8BAC00246A93 /* deposterize.cpp */, ABFE14FC14C92FF5005D6699 /* epx.cpp */, ABFE14FE14C92FF5005D6699 /* hq2x.cpp */, AB4C81E31B21676C00ACECD5 /* hq3x.cpp */, @@ -3847,6 +3853,7 @@ ABD1041C1346652500AF11D1 /* cocoa_input.mm in Sources */, AB3E34C9134AF4500056477A /* cocoa_output.mm in Sources */, ABFEA8CB1BB4EC1100B08C25 /* smooth.c in Sources */, + AB301BE11D9C8BCE00246A93 /* deposterize.cpp in Sources */, ABD1041E1346652500AF11D1 /* cocoa_rom.mm in Sources */, AB80E04D142BC4A800A52038 /* cocoa_util.mm in Sources */, ABE5DFE5143FB1DA00835AD8 /* cocoa_videofilter.mm in Sources */, @@ -3951,6 +3958,7 @@ AB796CF815CDCBA200C59155 /* cp15.cpp in Sources */, AB796CF915CDCBA200C59155 /* cpu_detect_x86_gcc.cpp in Sources */, AB796CFA15CDCBA200C59155 /* crc.cpp in Sources */, + AB301BDF1D9C8BAC00246A93 /* deposterize.cpp in Sources */, AB796CFB15CDCBA200C59155 /* datetime.cpp in Sources */, AB796CFC15CDCBA200C59155 /* debug.cpp in Sources */, ABFEA82E1BB4EC1100B08C25 /* ftlcdfil.c in Sources */, @@ -4157,6 +4165,7 @@ ABFEA8361BB4EC1100B08C25 /* ftmm.c in Sources */, ABFEA81E1BB4EC1000B08C25 /* ftfstype.c in Sources */, ABA731601BB51E7000B26147 /* pshinter.c in Sources */, + AB301BE01D9C8BCD00246A93 /* deposterize.cpp in Sources */, ABFEA8211BB4EC1000B08C25 /* ftgasp.c in Sources */, ABFEA83C1BB4EC1100B08C25 /* ftotval.c in Sources */, ABFEA8181BB4EC1000B08C25 /* ftdebug.c in Sources */, @@ -4448,6 +4457,7 @@ ABB3C6B81501C04F00E0C22E /* common.cpp in Sources */, ABB3C6B91501C04F00E0C22E /* cp15.cpp in Sources */, AB407F371A6206FB00313213 /* xbrz.cpp in Sources */, + AB301BE21D9C8BCF00246A93 /* deposterize.cpp in Sources */, ABB3C6BA1501C04F00E0C22E /* debug.cpp in Sources */, ABB3C6BB1501C04F00E0C22E /* Disassembler.cpp in Sources */, ABB3C6BC1501C04F00E0C22E /* driver.cpp in Sources */, diff --git a/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj b/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj index 
e53060176..011440e6e 100644 --- a/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj +++ b/desmume/src/cocoa/DeSmuME (XCode 3).xcodeproj/project.pbxproj @@ -1418,6 +1418,11 @@ ABB9212317CEB4110049D4C5 /* slot1comp_protocol.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABB9212017CEB4110049D4C5 /* slot1comp_protocol.cpp */; }; ABB9212417CEB4110049D4C5 /* slot1comp_protocol.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABB9212017CEB4110049D4C5 /* slot1comp_protocol.cpp */; }; ABB9212517CEB4110049D4C5 /* slot1comp_protocol.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABB9212017CEB4110049D4C5 /* slot1comp_protocol.cpp */; }; + ABBB4ACD1D9C927C00794E08 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */; }; + ABBB4ACE1D9C927C00794E08 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */; }; + ABBB4ACF1D9C927C00794E08 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */; }; + ABBB4AD01D9C927C00794E08 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */; }; + ABBB4AD11D9C927C00794E08 /* deposterize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */; }; ABBCE29715ACB1FF00A2C965 /* arm_jit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBCE29515ACB1FF00A2C965 /* arm_jit.cpp */; }; ABBCE29815ACB1FF00A2C965 /* arm_jit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ABBCE29515ACB1FF00A2C965 /* arm_jit.cpp */; }; ABBF04A614B515F300E505A0 /* AppIcon_ROMCheats.icns in Resources */ = {isa = PBXBuildFile; fileRef = ABBF04A414B515F300E505A0 /* AppIcon_ROMCheats.icns */; }; @@ -1953,6 +1958,7 @@ ABB97873144E89CC00793FA3 /* Icon_ActionReplay_32x32.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_ActionReplay_32x32.png; path = 
Images/Icon_ActionReplay_32x32.png; sourceTree = ""; }; ABB97874144E89CC00793FA3 /* Icon_CodeBreaker_32x32.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_CodeBreaker_32x32.png; path = Images/Icon_CodeBreaker_32x32.png; sourceTree = ""; }; ABB97875144E89CC00793FA3 /* Icon_DeSmuME_32x32.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_DeSmuME_32x32.png; path = Images/Icon_DeSmuME_32x32.png; sourceTree = ""; }; + ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deposterize.cpp; sourceTree = ""; }; ABBC0F8C1394B1AA0028B6BD /* DefaultUserPrefs.plist */ = {isa = PBXFileReference; lastKnownFileType = file.bplist; path = DefaultUserPrefs.plist; sourceTree = ""; }; ABBCE29415ACB1E600A2C965 /* arm_jit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = arm_jit.h; path = ../arm_jit.h; sourceTree = SOURCE_ROOT; }; ABBCE29515ACB1FF00A2C965 /* arm_jit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = arm_jit.cpp; path = ../arm_jit.cpp; sourceTree = SOURCE_ROOT; }; @@ -3453,6 +3459,7 @@ children = ( ABFE14FA14C92FF5005D6699 /* 2xsai.cpp */, ABFE14FB14C92FF5005D6699 /* bilinear.cpp */, + ABBB4ACC1D9C927C00794E08 /* deposterize.cpp */, ABFE14FC14C92FF5005D6699 /* epx.cpp */, ABFE14FE14C92FF5005D6699 /* hq2x.cpp */, ABAAEFFE1B22361800E1269D /* hq3x.cpp */, @@ -4540,6 +4547,7 @@ AB50200C1D09E712002FA150 /* retro_stat.c in Sources */, AB7BB17F1D62C8CC00A7A6E2 /* colorspacehandler.cpp in Sources */, AB7BB1801D62C8CF00A7A6E2 /* colorspacehandler_AltiVec.cpp in Sources */, + ABBB4AD11D9C927C00794E08 /* deposterize.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -4721,6 +4729,7 @@ AB5020181D09E712002FA150 /* retro_stat.c in Sources */, AB37E3801D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, AB37E38A1D61895F004A2C0D /* 
colorspacehandler_AltiVec.cpp in Sources */, + ABBB4AD01D9C927C00794E08 /* deposterize.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -4932,6 +4941,7 @@ AB50200F1D09E712002FA150 /* retro_stat.c in Sources */, AB37E3741D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, AB37E3771D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */, + ABBB4ACD1D9C927C00794E08 /* deposterize.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -5143,6 +5153,7 @@ AB5020121D09E712002FA150 /* retro_stat.c in Sources */, AB37E3781D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, AB37E37B1D6188BC004A2C0D /* colorspacehandler_SSE2.cpp in Sources */, + ABBB4ACE1D9C927C00794E08 /* deposterize.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -5324,6 +5335,7 @@ AB5020151D09E712002FA150 /* retro_stat.c in Sources */, AB37E37C1D6188BC004A2C0D /* colorspacehandler.cpp in Sources */, AB37E37D1D6188BC004A2C0D /* colorspacehandler_AltiVec.cpp in Sources */, + ABBB4ACF1D9C927C00794E08 /* deposterize.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/desmume/src/filter/deposterize.cpp b/desmume/src/filter/deposterize.cpp new file mode 100644 index 000000000..eeb0a94bb --- /dev/null +++ b/desmume/src/filter/deposterize.cpp @@ -0,0 +1,186 @@ +/* + Copyright (C) 2016 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . 
+ */ + +#include "../types.h" +#include "filter.h" + +static u32 Deposterize_InterpLTE(const u32 pixA, const u32 pixB, const u32 threshold) +{ + const u32 aB = (pixB & 0xFF000000) >> 24; + if (aB == 0) + { + return pixA; + } + + const u32 rA = (pixA & 0x000000FF); + const u32 gA = (pixA & 0x0000FF00) >> 8; + const u32 bA = (pixA & 0x00FF0000) >> 16; + const u32 aA = (pixA & 0xFF000000) >> 24; + + const u32 rB = (pixB & 0x000000FF); + const u32 gB = (pixB & 0x0000FF00) >> 8; + const u32 bB = (pixB & 0x00FF0000) >> 16; + + const u32 rC = ( (rB - rA <= threshold) || (rA - rB <= threshold) ) ? ( ((rA+rB)>>1) ) : rA; + const u32 gC = ( (gB - gA <= threshold) || (gA - gB <= threshold) ) ? ( ((gA+gB)>>1) ) : gA; + const u32 bC = ( (bB - bA <= threshold) || (bA - bB <= threshold) ) ? ( ((bA+bB)>>1) ) : bA; + const u32 aC = ( (bB - aA <= threshold) || (aA - aB <= threshold) ) ? ( ((aA+aB)>>1) ) : aA; + + return (rC | (gC << 8) | (bC << 16) | (aC << 24)); +} + +static u32 Deposterize_Blend(const u32 pixA, const u32 pixB, const u32 weightA, const u32 weightB) +{ + const u32 aB = (pixB & 0xFF000000) >> 24; + if (aB == 0) + { + return pixA; + } + + const u32 weightSum = weightA + weightB; + + const u32 rbA = pixA & 0x00FF00FF; + const u32 gA = pixA & 0x0000FF00; + const u32 aA = (pixA & 0xFF000000) >> 24; + + const u32 rbB = pixB & 0x00FF00FF; + const u32 gB = pixB & 0x0000FF00; + + const u32 rbC = ( ((rbA * weightA) + (rbB * weightB)) / weightSum ) & 0x00FF00FF; + const u32 gC = ( (( gA * weightA) + ( gB * weightB)) / weightSum ) & 0x0000FF00; + const u32 aC = ( (( aA * weightA) + ( aB * weightB)) / weightSum ) << 24; + + return (rbC | gC | aC); +} + +void RenderDeposterize(SSurface Src, SSurface Dst) +{ + //---------------------------------------\n\ + // Input Pixel Mapping: 06|07|08 + // 05|00|01 + // 04|03|02 + // + // Output Pixel Mapping: 00 + + const int w = Src.Width; + const int h = Src.Height; + + u32 color[9]; + u32 blend[9]; + u32 *src = (u32 *)Src.Surface; + u32 
*workingDst = (u32 *)Dst.workingSurface[0]; + u32 *finalDst = (u32 *)Dst.Surface; + u32 threshold = *(u32 *)Dst.userData; + + int i = 0; + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++, i++) + { + if ((src[i] & 0xFF000000) == 0) + { + workingDst[i] = src[i]; + continue; + } + + color[0] = src[i]; + color[1] = (x < w-1) ? src[i+1] : src[i]; + color[2] = ((x < w-1) && (y < h-1)) ? src[i+w+1] : src[i]; + color[3] = (y < h-1) ? src[i+w] : src[i]; + color[4] = ((x > 0) && (y < h-1)) ? src[i+w-1] : src[i]; + color[5] = (x > 0) ? src[i-1] : src[i]; + color[6] = ((x > 0) && (y > 0)) ? src[i-w-1] : src[i]; + color[7] = (y > 0) ? src[i-w] : src[i]; + color[8] = ((x < w-1) && (y > 0)) ? src[i-w+1] : src[i]; + + blend[0] = color[0]; + blend[1] = Deposterize_InterpLTE(color[0], color[1], threshold); + blend[2] = Deposterize_InterpLTE(color[0], color[2], threshold); + blend[3] = Deposterize_InterpLTE(color[0], color[3], threshold); + blend[4] = Deposterize_InterpLTE(color[0], color[4], threshold); + blend[5] = Deposterize_InterpLTE(color[0], color[5], threshold); + blend[6] = Deposterize_InterpLTE(color[0], color[6], threshold); + blend[7] = Deposterize_InterpLTE(color[0], color[7], threshold); + blend[8] = Deposterize_InterpLTE(color[0], color[8], threshold); + + workingDst[i] = Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[5], 1, 7), + Deposterize_Blend(blend[0], blend[1], 1, 7), + 1, 1), + Deposterize_Blend(Deposterize_Blend(blend[0], blend[7], 1, 7), + Deposterize_Blend(blend[0], blend[3], 1, 7), + 1, 1), + 1, 1), + Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[6], 7, 9), + Deposterize_Blend(blend[0], blend[2], 7, 9), + 1, 1), + Deposterize_Blend(Deposterize_Blend(blend[0], blend[8], 7, 9), + Deposterize_Blend(blend[0], blend[4], 7, 9), + 1, 1), + 1, 1), + 3, 1); + } + } + + i = 0; + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++, i++) + { + if ((src[i] & 0xFF000000) == 0) + { + 
finalDst[i] = src[i]; + continue; + } + + color[0] = workingDst[i]; + color[1] = (x < w-1) ? workingDst[i+1] : workingDst[i]; + color[2] = ((x < w-1) && (y < h-1)) ? workingDst[i+w+1] : workingDst[i]; + color[3] = (y < h-1) ? workingDst[i+w] : workingDst[i]; + color[4] = ((x > 0) && (y < h-1)) ? workingDst[i+w-1] : workingDst[i]; + color[5] = (x > 0) ? workingDst[i-1] : workingDst[i]; + color[6] = ((x > 0) && (y > 0)) ? workingDst[i-w-1] : workingDst[i]; + color[7] = (y > 0) ? workingDst[i-w] : workingDst[i]; + color[8] = ((x < w-1) && (y > 0)) ? workingDst[i-w+1] : workingDst[i]; + + blend[0] = color[0]; + blend[1] = Deposterize_InterpLTE(color[0], color[1], threshold); + blend[2] = Deposterize_InterpLTE(color[0], color[2], threshold); + blend[3] = Deposterize_InterpLTE(color[0], color[3], threshold); + blend[4] = Deposterize_InterpLTE(color[0], color[4], threshold); + blend[5] = Deposterize_InterpLTE(color[0], color[5], threshold); + blend[6] = Deposterize_InterpLTE(color[0], color[6], threshold); + blend[7] = Deposterize_InterpLTE(color[0], color[7], threshold); + blend[8] = Deposterize_InterpLTE(color[0], color[8], threshold); + + finalDst[i] = Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[5], 1, 7), + Deposterize_Blend(blend[0], blend[1], 1, 7), + 1, 1), + Deposterize_Blend(Deposterize_Blend(blend[0], blend[7], 1, 7), + Deposterize_Blend(blend[0], blend[3], 1, 7), + 1, 1), + 1, 1), + Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[6], 7, 9), + Deposterize_Blend(blend[0], blend[2], 7, 9), + 1, 1), + Deposterize_Blend(Deposterize_Blend(blend[0], blend[8], 7, 9), + Deposterize_Blend(blend[0], blend[4], 7, 9), + 1, 1), + 1, 1), + 3, 1); + } + } +} diff --git a/desmume/src/filter/filter.h b/desmume/src/filter/filter.h index 03be9e93e..f21e08ec2 100644 --- a/desmume/src/filter/filter.h +++ b/desmume/src/filter/filter.h @@ -1,19 +1,22 @@ /* -Copyright (C) 2009-2014 DeSmuME team + Copyright (C) 2009-2016 
DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ -This file is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 2 of the License, or -(at your option) any later version. - -This file is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with the this software. If not, see . 
-*/ +#ifndef _IMAGE_FILTER_ +#define _IMAGE_FILTER_ #define FILTER_MAX_WORKING_SURFACE_COUNT 8 @@ -27,6 +30,8 @@ typedef struct { void *userData; } SSurface; +void RenderDeposterize(SSurface Src, SSurface Dst); + void RenderNearest2X (SSurface Src, SSurface Dst); void RenderLQ2X (SSurface Src, SSurface Dst); void RenderLQ2XS (SSurface Src, SSurface Dst); @@ -52,3 +57,5 @@ void Render3xBRZ(SSurface Src, SSurface Dst); void Render4xBRZ(SSurface Src, SSurface Dst); void Render5xBRZ(SSurface Src, SSurface Dst); void Render6xBRZ(SSurface Src, SSurface Dst); + +#endif // _IMAGE_FILTER_ diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index f777b6db1..c5360d660 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -29,6 +29,7 @@ #include "gfx3d.h" #include "MMU.h" #include "texcache.h" +#include "./filter/filter.h" #include "./filter/xbrz.h" #define TEXTURE_DEPOSTERIZE_THRESHOLD 21 // Possible values are [0-255], where lower a value prevents blending and a higher value allows for more blending @@ -127,55 +128,6 @@ void Render3DBaseDestroy() } } -static u32 TextureDeposterize_InterpLTE(const u32 pixA, const u32 pixB, const u32 threshold) -{ - const u32 aB = (pixB & 0xFF000000) >> 24; - if (aB == 0) - { - return pixA; - } - - const u32 rA = (pixA & 0x000000FF); - const u32 gA = (pixA & 0x0000FF00) >> 8; - const u32 bA = (pixA & 0x00FF0000) >> 16; - const u32 aA = (pixA & 0xFF000000) >> 24; - - const u32 rB = (pixB & 0x000000FF); - const u32 gB = (pixB & 0x0000FF00) >> 8; - const u32 bB = (pixB & 0x00FF0000) >> 16; - - const u32 rC = ( (rB - rA <= threshold) || (rA - rB <= threshold) ) ? ( ((rA+rB)>>1) ) : rA; - const u32 gC = ( (gB - gA <= threshold) || (gA - gB <= threshold) ) ? ( ((gA+gB)>>1) ) : gA; - const u32 bC = ( (bB - bA <= threshold) || (bA - bB <= threshold) ) ? ( ((bA+bB)>>1) ) : bA; - const u32 aC = ( (bB - aA <= threshold) || (aA - aB <= threshold) ) ? 
( ((aA+aB)>>1) ) : aA; - - return (rC | (gC << 8) | (bC << 16) | (aC << 24)); -} - -static u32 TextureDeposterize_Blend(const u32 pixA, const u32 pixB, const u32 weightA, const u32 weightB) -{ - const u32 aB = (pixB & 0xFF000000) >> 24; - if (aB == 0) - { - return pixA; - } - - const u32 weightSum = weightA + weightB; - - const u32 rbA = pixA & 0x00FF00FF; - const u32 gA = pixA & 0x0000FF00; - const u32 aA = (pixA & 0xFF000000) >> 24; - - const u32 rbB = pixB & 0x00FF00FF; - const u32 gB = pixB & 0x0000FF00; - - const u32 rbC = ( ((rbA * weightA) + (rbB * weightB)) / weightSum ) & 0x00FF00FF; - const u32 gC = ( (( gA * weightA) + ( gB * weightB)) / weightSum ) & 0x0000FF00; - const u32 aC = ( (( aA * weightA) + ( aB * weightB)) / weightSum ) << 24; - - return (rbC | gC | aC); -} - FragmentAttributesBuffer::FragmentAttributesBuffer(size_t newCount) { count = newCount; @@ -285,15 +237,28 @@ Render3D::Render3D() _textureScalingFactor = 1; _textureSmooth = false; - _textureDeposterizeBuffer = NULL; _textureUpscaleBuffer = NULL; + _textureDeposterizeThreshold = TEXTURE_DEPOSTERIZE_THRESHOLD; + + memset(&_textureDeposterizeSrcSurface, 0, sizeof(_textureDeposterizeSrcSurface)); + memset(&_textureDeposterizeDstSurface, 0, sizeof(_textureDeposterizeDstSurface)); + + _textureDeposterizeSrcSurface.Width = _textureDeposterizeDstSurface.Width = 1; + _textureDeposterizeSrcSurface.Height = _textureDeposterizeDstSurface.Height = 1; + _textureDeposterizeSrcSurface.Pitch = _textureDeposterizeDstSurface.Pitch = 1; + _textureDeposterizeDstSurface.userData = &_textureDeposterizeThreshold; Reset(); } Render3D::~Render3D() { - // Do nothing. 
+ if (this->_textureDeposterizeDstSurface.Surface != NULL) + { + free_aligned(this->_textureDeposterizeDstSurface.Surface); + this->_textureDeposterizeDstSurface.Surface = NULL; + this->_textureDeposterizeDstSurface.workingSurface[0] = NULL; + } } const Render3DDeviceInfo& Render3D::GetDeviceInfo() @@ -385,20 +350,24 @@ void Render3D::SetTextureProcessingProperties(size_t scalingFactor, bool willDep const size_t newScalingFactor = (isScaleValid) ? scalingFactor : 1; bool needTexCacheReset = false; - if ( willDeposterize && (this->_textureDeposterizeBuffer == NULL) ) + if ( willDeposterize && (this->_textureDeposterizeDstSurface.Surface == NULL) ) { // 1024x1024 texels is the largest possible texture size. // We need two buffers, one for each deposterize stage. const size_t bufferSize = 1024 * 1024 * 2 * sizeof(u32); - this->_textureDeposterizeBuffer = (u32 *)malloc_alignedCacheLine(bufferSize); - memset(this->_textureDeposterizeBuffer, 0, bufferSize); + + this->_textureDeposterizeDstSurface.Surface = (unsigned char *)malloc_alignedCacheLine(bufferSize); + this->_textureDeposterizeDstSurface.workingSurface[0] = (unsigned char *)((u32 *)this->_textureDeposterizeDstSurface.Surface + (1024 * 1024)); + + memset(this->_textureDeposterizeDstSurface.Surface, 0, bufferSize); needTexCacheReset = true; } - else if ( !willDeposterize && (this->_textureDeposterizeBuffer != NULL) ) + else if ( !willDeposterize && (this->_textureDeposterizeDstSurface.Surface != NULL) ) { - free_aligned(this->_textureDeposterizeBuffer); - this->_textureDeposterizeBuffer = NULL; + free_aligned(this->_textureDeposterizeDstSurface.Surface); + this->_textureDeposterizeDstSurface.Surface = NULL; + this->_textureDeposterizeDstSurface.workingSurface[0] = NULL; needTexCacheReset = true; } @@ -429,118 +398,11 @@ void Render3D::SetTextureProcessingProperties(size_t scalingFactor, bool willDep Render3DError Render3D::TextureDeposterize(const u32 *src, const size_t srcTexWidth, const size_t srcTexHeight) { - 
//---------------------------------------\n\ - // Input Pixel Mapping: 06|07|08 - // 05|00|01 - // 04|03|02 - // - // Output Pixel Mapping: 00 + this->_textureDeposterizeSrcSurface.Width = this->_textureDeposterizeDstSurface.Width = srcTexWidth; + this->_textureDeposterizeSrcSurface.Height = this->_textureDeposterizeDstSurface.Height = srcTexHeight; + this->_textureDeposterizeSrcSurface.Surface = (unsigned char *)src; - const int w = srcTexWidth; - const int h = srcTexHeight; - - u32 color[9]; - u32 blend[9]; - u32 *dst = this->_textureDeposterizeBuffer + (1024 * 1024); - u32 *finalDst = this->_textureDeposterizeBuffer; - - size_t i = 0; - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++, i++) - { - if ((src[i] & 0xFF000000) == 0) - { - dst[i] = src[i]; - continue; - } - - color[0] = src[i]; - color[1] = (x < w-1) ? src[i+1] : src[i]; - color[2] = ((x < w-1) && (y < h-1)) ? src[i+w+1] : src[i]; - color[3] = (y < h-1) ? src[i+w] : src[i]; - color[4] = ((x > 0) && (y < h-1)) ? src[i+w-1] : src[i]; - color[5] = (x > 0) ? src[i-1] : src[i]; - color[6] = ((x > 0) && (y > 0)) ? src[i-w-1] : src[i]; - color[7] = (y > 0) ? src[i-w] : src[i]; - color[8] = ((x < w-1) && (y > 0)) ? 
src[i-w+1] : src[i]; - - blend[0] = color[0]; - blend[1] = TextureDeposterize_InterpLTE(color[0], color[1], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[2] = TextureDeposterize_InterpLTE(color[0], color[2], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[3] = TextureDeposterize_InterpLTE(color[0], color[3], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[4] = TextureDeposterize_InterpLTE(color[0], color[4], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[5] = TextureDeposterize_InterpLTE(color[0], color[5], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[6] = TextureDeposterize_InterpLTE(color[0], color[6], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[7] = TextureDeposterize_InterpLTE(color[0], color[7], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[8] = TextureDeposterize_InterpLTE(color[0], color[8], TEXTURE_DEPOSTERIZE_THRESHOLD); - - dst[i] = TextureDeposterize_Blend(TextureDeposterize_Blend(TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[5], 1, 7), - TextureDeposterize_Blend(blend[0], blend[1], 1, 7), - 1, 1), - TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[7], 1, 7), - TextureDeposterize_Blend(blend[0], blend[3], 1, 7), - 1, 1), - 1, 1), - TextureDeposterize_Blend(TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[6], 7, 9), - TextureDeposterize_Blend(blend[0], blend[2], 7, 9), - 1, 1), - TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[8], 7, 9), - TextureDeposterize_Blend(blend[0], blend[4], 7, 9), - 1, 1), - 1, 1), - 3, 1); - } - } - - i = 0; - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++, i++) - { - if ((src[i] & 0xFF000000) == 0) - { - finalDst[i] = src[i]; - continue; - } - - color[0] = dst[i]; - color[1] = (x < w-1) ? dst[i+1] : dst[i]; - color[2] = ((x < w-1) && (y < h-1)) ? dst[i+w+1] : dst[i]; - color[3] = (y < h-1) ? dst[i+w] : dst[i]; - color[4] = ((x > 0) && (y < h-1)) ? dst[i+w-1] : dst[i]; - color[5] = (x > 0) ? dst[i-1] : dst[i]; - color[6] = ((x > 0) && (y > 0)) ? 
dst[i-w-1] : dst[i]; - color[7] = (y > 0) ? dst[i-w] : dst[i]; - color[8] = ((x < w-1) && (y > 0)) ? dst[i-w+1] : dst[i]; - - blend[0] = color[0]; - blend[1] = TextureDeposterize_InterpLTE(color[0], color[1], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[2] = TextureDeposterize_InterpLTE(color[0], color[2], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[3] = TextureDeposterize_InterpLTE(color[0], color[3], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[4] = TextureDeposterize_InterpLTE(color[0], color[4], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[5] = TextureDeposterize_InterpLTE(color[0], color[5], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[6] = TextureDeposterize_InterpLTE(color[0], color[6], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[7] = TextureDeposterize_InterpLTE(color[0], color[7], TEXTURE_DEPOSTERIZE_THRESHOLD); - blend[8] = TextureDeposterize_InterpLTE(color[0], color[8], TEXTURE_DEPOSTERIZE_THRESHOLD); - - finalDst[i] = TextureDeposterize_Blend(TextureDeposterize_Blend(TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[5], 1, 7), - TextureDeposterize_Blend(blend[0], blend[1], 1, 7), - 1, 1), - TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[7], 1, 7), - TextureDeposterize_Blend(blend[0], blend[3], 1, 7), - 1, 1), - 1, 1), - TextureDeposterize_Blend(TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[6], 7, 9), - TextureDeposterize_Blend(blend[0], blend[2], 7, 9), - 1, 1), - TextureDeposterize_Blend(TextureDeposterize_Blend(blend[0], blend[8], 7, 9), - TextureDeposterize_Blend(blend[0], blend[4], 7, 9), - 1, 1), - 1, 1), - 3, 1); - } - } + RenderDeposterize(this->_textureDeposterizeSrcSurface, this->_textureDeposterizeDstSurface); return RENDER3DERROR_NOERR; } diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index dd64ff99a..a98225a4e 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -21,6 +21,7 @@ #include "gfx3d.h" #include "types.h" +#include "./filter/filter.h" #define kUnsetTranslucentPolyID 255 @@ 
-130,7 +131,12 @@ protected: size_t _textureScalingFactor; bool _textureSmooth; - u32 *_textureDeposterizeBuffer; + + SSurface _textureDeposterizeSrcSurface; + SSurface _textureDeposterizeDstSurface; + u32 _textureDeposterizeThreshold; + + //u32 *_textureDeposterizeBuffer; u32 *_textureUpscaleBuffer; CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; diff --git a/desmume/src/windows/DeSmuME.vcxproj b/desmume/src/windows/DeSmuME.vcxproj index 53d2c00d8..27264e586 100644 --- a/desmume/src/windows/DeSmuME.vcxproj +++ b/desmume/src/windows/DeSmuME.vcxproj @@ -87,6 +87,7 @@ + diff --git a/desmume/src/windows/DeSmuME.vcxproj.filters b/desmume/src/windows/DeSmuME.vcxproj.filters index 0a0742ced..6d951c804 100644 --- a/desmume/src/windows/DeSmuME.vcxproj.filters +++ b/desmume/src/windows/DeSmuME.vcxproj.filters @@ -975,6 +975,9 @@ Core\utils\colorspacehandler + + Core\filter + From 284119237e5790f4475f979a8f556bdbcba528b1 Mon Sep 17 00:00:00 2001 From: zeromus Date: Fri, 30 Sep 2016 05:08:23 +0000 Subject: [PATCH 27/41] support importing newer style ardsi duc files --- desmume/src/mc.cpp | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/desmume/src/mc.cpp b/desmume/src/mc.cpp index 973eb95de..2abab90c9 100644 --- a/desmume/src/mc.cpp +++ b/desmume/src/mc.cpp @@ -1504,26 +1504,43 @@ u32 BackupDevice::get_save_duc_size(const char* fname) bool BackupDevice::import_duc(const char* filename, u32 force_size) { u32 size; - char id[16]; + u8 id16[16] = {0}, id4[4] = {0}, id2[2] = {0}; FILE* file = fopen(filename, "rb"); if(!file) return false; - fseek(file, 0, SEEK_END); - size = (u32)ftell(file) - 500; - fseek(file, 0, SEEK_SET); + int version = 0; - // Make sure we really have the right file - fread((void *)id, sizeof(char), 16, file); + //ID version 1 + fread(id16, 1, 16, file); + if(!memcmp(id16, "ARDS000000000001", 16)) version = 1; - if (memcmp(id, "ARDS000000000001", 16) 
!= 0) + //ID version 2 + fseek(file,0xA2,SEEK_SET); + fread(id2,1,2,file); + if(!memcmp(id16,"\0\0\0\0",4) && !memcmp(id2,"\x04\xC0",2)) version = 2; + + if(version == 0) { printf("Not recognized as a valid DUC file\n"); fclose(file); return false; } - // Skip the rest of the header since we don't need it - fseek(file, 500, SEEK_SET); + + fseek(file, 0, SEEK_END); + size = (u32)ftell(file); + + //skip to raw data + if(version == 1) + { + size -= 500; + fseek(file, 500, SEEK_SET); + } + if(version == 2) + { + size -= 0xA4; + fseek(file, 0xA4, SEEK_SET); + } u32 left = 0; if (force_size > 0) From b9c33745c5688336106ebe9f0c93b07be811525d Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 30 Sep 2016 05:41:00 +0000 Subject: [PATCH 28/41] Filters: - Simplify the functionality of the Deposterize filter by making the threshold a constant value. - Increase the Deposterize threshold from 21 to 23. --- desmume/src/filter/deposterize.cpp | 46 ++++++++++++++++-------------- desmume/src/render3D.cpp | 3 -- desmume/src/render3D.h | 2 -- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/desmume/src/filter/deposterize.cpp b/desmume/src/filter/deposterize.cpp index eeb0a94bb..84ebebb34 100644 --- a/desmume/src/filter/deposterize.cpp +++ b/desmume/src/filter/deposterize.cpp @@ -18,7 +18,10 @@ #include "../types.h" #include "filter.h" -static u32 Deposterize_InterpLTE(const u32 pixA, const u32 pixB, const u32 threshold) +#define DEPOSTERIZE_THRESHOLD 23 // Possible values are [0-255], where lower a value prevents blending and a higher value allows for more blending + + +static u32 Deposterize_InterpLTE(const u32 pixA, const u32 pixB) { const u32 aB = (pixB & 0xFF000000) >> 24; if (aB == 0) @@ -35,10 +38,10 @@ static u32 Deposterize_InterpLTE(const u32 pixA, const u32 pixB, const u32 thres const u32 gB = (pixB & 0x0000FF00) >> 8; const u32 bB = (pixB & 0x00FF0000) >> 16; - const u32 rC = ( (rB - rA <= threshold) || (rA - rB <= threshold) ) ? 
( ((rA+rB)>>1) ) : rA; - const u32 gC = ( (gB - gA <= threshold) || (gA - gB <= threshold) ) ? ( ((gA+gB)>>1) ) : gA; - const u32 bC = ( (bB - bA <= threshold) || (bA - bB <= threshold) ) ? ( ((bA+bB)>>1) ) : bA; - const u32 aC = ( (bB - aA <= threshold) || (aA - aB <= threshold) ) ? ( ((aA+aB)>>1) ) : aA; + const u32 rC = ( (rB - rA <= DEPOSTERIZE_THRESHOLD) || (rA - rB <= DEPOSTERIZE_THRESHOLD) ) ? ( ((rA+rB)>>1) ) : rA; + const u32 gC = ( (gB - gA <= DEPOSTERIZE_THRESHOLD) || (gA - gB <= DEPOSTERIZE_THRESHOLD) ) ? ( ((gA+gB)>>1) ) : gA; + const u32 bC = ( (bB - bA <= DEPOSTERIZE_THRESHOLD) || (bA - bB <= DEPOSTERIZE_THRESHOLD) ) ? ( ((bA+bB)>>1) ) : bA; + const u32 aC = ( (bB - aA <= DEPOSTERIZE_THRESHOLD) || (aA - aB <= DEPOSTERIZE_THRESHOLD) ) ? ( ((aA+aB)>>1) ) : aA; return (rC | (gC << 8) | (bC << 16) | (aC << 24)); } @@ -84,7 +87,6 @@ void RenderDeposterize(SSurface Src, SSurface Dst) u32 *src = (u32 *)Src.Surface; u32 *workingDst = (u32 *)Dst.workingSurface[0]; u32 *finalDst = (u32 *)Dst.Surface; - u32 threshold = *(u32 *)Dst.userData; int i = 0; for (int y = 0; y < h; y++) @@ -108,14 +110,14 @@ void RenderDeposterize(SSurface Src, SSurface Dst) color[8] = ((x < w-1) && (y > 0)) ? 
src[i-w+1] : src[i]; blend[0] = color[0]; - blend[1] = Deposterize_InterpLTE(color[0], color[1], threshold); - blend[2] = Deposterize_InterpLTE(color[0], color[2], threshold); - blend[3] = Deposterize_InterpLTE(color[0], color[3], threshold); - blend[4] = Deposterize_InterpLTE(color[0], color[4], threshold); - blend[5] = Deposterize_InterpLTE(color[0], color[5], threshold); - blend[6] = Deposterize_InterpLTE(color[0], color[6], threshold); - blend[7] = Deposterize_InterpLTE(color[0], color[7], threshold); - blend[8] = Deposterize_InterpLTE(color[0], color[8], threshold); + blend[1] = Deposterize_InterpLTE(color[0], color[1]); + blend[2] = Deposterize_InterpLTE(color[0], color[2]); + blend[3] = Deposterize_InterpLTE(color[0], color[3]); + blend[4] = Deposterize_InterpLTE(color[0], color[4]); + blend[5] = Deposterize_InterpLTE(color[0], color[5]); + blend[6] = Deposterize_InterpLTE(color[0], color[6]); + blend[7] = Deposterize_InterpLTE(color[0], color[7]); + blend[8] = Deposterize_InterpLTE(color[0], color[8]); workingDst[i] = Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[5], 1, 7), Deposterize_Blend(blend[0], blend[1], 1, 7), @@ -157,14 +159,14 @@ void RenderDeposterize(SSurface Src, SSurface Dst) color[8] = ((x < w-1) && (y > 0)) ? 
workingDst[i-w+1] : workingDst[i]; blend[0] = color[0]; - blend[1] = Deposterize_InterpLTE(color[0], color[1], threshold); - blend[2] = Deposterize_InterpLTE(color[0], color[2], threshold); - blend[3] = Deposterize_InterpLTE(color[0], color[3], threshold); - blend[4] = Deposterize_InterpLTE(color[0], color[4], threshold); - blend[5] = Deposterize_InterpLTE(color[0], color[5], threshold); - blend[6] = Deposterize_InterpLTE(color[0], color[6], threshold); - blend[7] = Deposterize_InterpLTE(color[0], color[7], threshold); - blend[8] = Deposterize_InterpLTE(color[0], color[8], threshold); + blend[1] = Deposterize_InterpLTE(color[0], color[1]); + blend[2] = Deposterize_InterpLTE(color[0], color[2]); + blend[3] = Deposterize_InterpLTE(color[0], color[3]); + blend[4] = Deposterize_InterpLTE(color[0], color[4]); + blend[5] = Deposterize_InterpLTE(color[0], color[5]); + blend[6] = Deposterize_InterpLTE(color[0], color[6]); + blend[7] = Deposterize_InterpLTE(color[0], color[7]); + blend[8] = Deposterize_InterpLTE(color[0], color[8]); finalDst[i] = Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[5], 1, 7), Deposterize_Blend(blend[0], blend[1], 1, 7), diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index c5360d660..23174307d 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -32,7 +32,6 @@ #include "./filter/filter.h" #include "./filter/xbrz.h" -#define TEXTURE_DEPOSTERIZE_THRESHOLD 21 // Possible values are [0-255], where lower a value prevents blending and a higher value allows for more blending int cur3DCore = GPU3D_NULL; @@ -238,7 +237,6 @@ Render3D::Render3D() _textureScalingFactor = 1; _textureSmooth = false; _textureUpscaleBuffer = NULL; - _textureDeposterizeThreshold = TEXTURE_DEPOSTERIZE_THRESHOLD; memset(&_textureDeposterizeSrcSurface, 0, sizeof(_textureDeposterizeSrcSurface)); memset(&_textureDeposterizeDstSurface, 0, sizeof(_textureDeposterizeDstSurface)); @@ -246,7 +244,6 @@ 
Render3D::Render3D() _textureDeposterizeSrcSurface.Width = _textureDeposterizeDstSurface.Width = 1; _textureDeposterizeSrcSurface.Height = _textureDeposterizeDstSurface.Height = 1; _textureDeposterizeSrcSurface.Pitch = _textureDeposterizeDstSurface.Pitch = 1; - _textureDeposterizeDstSurface.userData = &_textureDeposterizeThreshold; Reset(); } diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index a98225a4e..3ffd0df61 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -134,9 +134,7 @@ protected: SSurface _textureDeposterizeSrcSurface; SSurface _textureDeposterizeDstSurface; - u32 _textureDeposterizeThreshold; - //u32 *_textureDeposterizeBuffer; u32 *_textureUpscaleBuffer; CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; From 66613008f11da8d23feb97a385c6019835689532 Mon Sep 17 00:00:00 2001 From: zeromus Date: Mon, 3 Oct 2016 01:48:05 +0000 Subject: [PATCH 29/41] update path format tooltip --- desmume/src/windows/pathsettings.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/desmume/src/windows/pathsettings.cpp b/desmume/src/windows/pathsettings.cpp index d394464f7..1699f0bd6 100644 --- a/desmume/src/windows/pathsettings.cpp +++ b/desmume/src/windows/pathsettings.cpp @@ -185,14 +185,17 @@ BOOL PathSettings_OnInitDialog(HWND hDlg, HWND hwndFocus, LPARAM lParam) ti.uFlags = TTF_SUBCLASS | TTF_IDISHWND; ti.uId = (UINT_PTR)hwnd; ti.lpszText = - "The format a screenshot should be saved in.\r\n" + "The string format a screenshot should be saved with (google strftime).\r\n" "%f\t\tFilename\r\n" "%r\t\tRandom: 0 ~ RAND_MAX\r\n" "%t\t\tTick: Reset on startup\r\n" "%Y\t\tYear:Four Digit\r\n" + "%y\t\tYear:Two Digit\r\n" "%m\t\tMonth:Two Digit\r\n" - "%D\t\tDay:Two Digit\r\n" - "%H\t\tHour:Two Digit\r\n" + "%d\t\tDay:Two Digit\r\n" + "%H\t\tHour (24):Two Digit\r\n" + "%I\t\tHour (12):Two Digit\r\n" + "%p\t\tAM/PM\r\n" "%M\t\tMinute: Two Digit\r\n" "%S\t\tSecond: Two 
Digit\r\n"; GetClientRect(hwnd, &ti.rect); From 77918f13ee1df5de71f92e5719d36b82a84e3f30 Mon Sep 17 00:00:00 2001 From: zeromus Date: Wed, 5 Oct 2016 08:21:38 +0000 Subject: [PATCH 30/41] fix support importing newer style ardsi duc files --- desmume/src/mc.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/desmume/src/mc.cpp b/desmume/src/mc.cpp index 2abab90c9..d5c264312 100644 --- a/desmume/src/mc.cpp +++ b/desmume/src/mc.cpp @@ -1504,7 +1504,7 @@ u32 BackupDevice::get_save_duc_size(const char* fname) bool BackupDevice::import_duc(const char* filename, u32 force_size) { u32 size; - u8 id16[16] = {0}, id4[4] = {0}, id2[2] = {0}; + u8 id16[16] = {0}, id4[4] = {0}, id3[3] = {0}; FILE* file = fopen(filename, "rb"); if(!file) return false; @@ -1516,12 +1516,13 @@ bool BackupDevice::import_duc(const char* filename, u32 force_size) if(!memcmp(id16, "ARDS000000000001", 16)) version = 1; //ID version 2 - fseek(file,0xA2,SEEK_SET); - fread(id2,1,2,file); - if(!memcmp(id16,"\0\0\0\0",4) && !memcmp(id2,"\x04\xC0",2)) version = 2; + fseek(file,0xA1,SEEK_SET); + fread(id3,1,3,file); + if(!memcmp(id16,"\0\0\0\0",4) && id3[2] == 0xC0) version = 2; if(version == 0) { + INVALID_DUC: printf("Not recognized as a valid DUC file\n"); fclose(file); return false; @@ -1540,6 +1541,11 @@ bool BackupDevice::import_duc(const char* filename, u32 force_size) { size -= 0xA4; fseek(file, 0xA4, SEEK_SET); + + //validate size + int specifiedSize = (id3[0]<<8)+(id3[1]<<16); + if(specifiedSize != size) + goto INVALID_DUC; } u32 left = 0; From 318613e783c06b46241e75d520354bbbaae0a2a2 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 25 Oct 2016 06:02:33 +0000 Subject: [PATCH 31/41] add savetype hardcode for puzzler world --- desmume/src/mc.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/desmume/src/mc.cpp b/desmume/src/mc.cpp index d5c264312..4ae4ce71b 100644 --- a/desmume/src/mc.cpp +++ b/desmume/src/mc.cpp @@ -632,6 +632,7 @@ void BackupDevice::reset() else 
if(!memcmp(gameInfo.header.gameCode,"AH5", 3)) addr_size = 1; //over the hedge else if(!memcmp(gameInfo.header.gameCode,"AVH", 3)) addr_size = 1; //over the hedge - Hammy Goes Nuts! else if(!memcmp(gameInfo.header.gameCode,"AQ3", 3)) addr_size = 1; //spider-man 3 + else if(!memcmp(gameInfo.header.gameCode,"BPV", 3)) addr_size = 2; //puzzler world (should be eeprom 64KBits) //if we found a whitelist match, we dont need to run detection if(addr_size) state = RUNNING; From 95db2317b91f9938d1f9c1e9190bea528cac3fc8 Mon Sep 17 00:00:00 2001 From: rogerman Date: Sun, 30 Oct 2016 23:16:49 +0000 Subject: [PATCH 32/41] Texture Handler: - Do some heavy cleanup and code refactoring. - Add SSE2-enabled unpacking function for direct 16-bit color textures. --- desmume/src/OGLRender.cpp | 31 +- desmume/src/OGLRender_3_2.cpp | 12 +- desmume/src/rasterize.cpp | 12 +- desmume/src/render3D.cpp | 6 +- desmume/src/texcache.cpp | 2237 +++++++++++++++++++++------------ desmume/src/texcache.h | 131 +- 6 files changed, 1543 insertions(+), 886 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index bb5bfcb05..a99db0263 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -1196,7 +1196,8 @@ OpenGLRenderer_1_2::~OpenGLRenderer_1_2() DestroyMultisampledFBO(); // Kill the texture cache now before all of our texture IDs disappear. 
- TexCache_Reset(); + texCache.Reset(); + texCache.Reset(); while(!ref->freeTextureIDs.empty()) { @@ -2694,7 +2695,7 @@ Render3DError OpenGLRenderer_1_2::RenderGeometry(const GFX3D_State &renderState, Render3DError OpenGLRenderer_1_2::EndRender(const u64 frameCount) { //needs to happen before endgl because it could free some textureids for expired cache items - TexCache_EvictFrame(); + texCache.Evict(TEXCACHE_MAX_SIZE); this->ReadBackPixels(); @@ -2967,21 +2968,21 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT glEnable(GL_TEXTURE_2D); } - TexCacheItem *newTexture = TexCache_SetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); - if(newTexture != this->currTexture) + TexCacheItem *newTexture = texCache.GetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); + if (newTexture != this->currTexture) { this->currTexture = newTexture; //has the ogl renderer initialized the texture? - if(this->currTexture->GetDeleteCallback() == NULL) + if (this->currTexture->GetDeleteCallback() == NULL) { this->currTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); - if(OGLRef.freeTextureIDs.empty()) + if (OGLRef.freeTextureIDs.empty()) { this->ExpandFreeTextures(); } - this->currTexture->texid = (u64)OGLRef.freeTextureIDs.front(); + this->currTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); @@ -2989,7 +2990,7 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); const NDSTextureFormat texFormat = this->currTexture->GetTextureFormat(); - const u32 *textureSrc = (u32 *)this->currTexture->decoded; + const u32 *textureSrc = this->currTexture->unpackData; size_t texWidth = this->currTexture->sizeX; size_t texHeight = this->currTexture->sizeY; @@ -3133,7 +3134,7 @@ Render3DError OpenGLRenderer_1_2::Reset() memset(this->clearImagePolyIDBuffer, 0, sizeof(this->clearImagePolyIDBuffer)); memset(this->clearImageFogBuffer, 0, sizeof(this->clearImageFogBuffer)); - TexCache_Reset(); + texCache.Reset(); return OGLERROR_NOERR; } @@ -4618,21 +4619,21 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT glUniform1i(OGLRef.uniformPolyEnableTexture, GL_TRUE); glUniform1i(OGLRef.uniformTexSingleBitAlpha, (params.texFormat != TEXMODE_A3I5 && params.texFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); - TexCacheItem *newTexture = TexCache_SetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); - if(newTexture != this->currTexture) + TexCacheItem *newTexture = texCache.GetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); + if (newTexture != this->currTexture) { this->currTexture = newTexture; //has the ogl renderer initialized the texture? - if(this->currTexture->GetDeleteCallback() == NULL) + if (this->currTexture->GetDeleteCallback() == NULL) { this->currTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); - if(OGLRef.freeTextureIDs.empty()) + if (OGLRef.freeTextureIDs.empty()) { this->ExpandFreeTextures(); } - this->currTexture->texid = (u64)OGLRef.freeTextureIDs.front(); + this->currTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); @@ -4640,7 +4641,7 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? 
(params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); const NDSTextureFormat texFormat = this->currTexture->GetTextureFormat(); - const u32 *textureSrc = (u32 *)this->currTexture->decoded; + const u32 *textureSrc = this->currTexture->unpackData; size_t texWidth = this->currTexture->sizeX; size_t texHeight = this->currTexture->sizeY; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index e218fc448..384ac3686 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1695,21 +1695,21 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } - TexCacheItem *newTexture = TexCache_SetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); - if(newTexture != this->currTexture) + TexCacheItem *newTexture = texCache.GetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); + if (newTexture != this->currTexture) { this->currTexture = newTexture; //has the ogl renderer initialized the texture? - if(this->currTexture->GetDeleteCallback() == NULL) + if (this->currTexture->GetDeleteCallback() == NULL) { this->currTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); - if(OGLRef.freeTextureIDs.empty()) + if (OGLRef.freeTextureIDs.empty()) { this->ExpandFreeTextures(); } - this->currTexture->texid = (u64)OGLRef.freeTextureIDs.front(); + this->currTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); @@ -1717,7 +1717,7 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); const NDSTextureFormat texFormat = this->currTexture->GetTextureFormat(); - const u32 *textureSrc = (u32 *)this->currTexture->decoded; + const u32 *textureSrc = this->currTexture->unpackData; size_t texWidth = this->currTexture->sizeX; size_t texHeight = this->currTexture->sizeY; diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 62a384e40..9bff7089d 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -461,7 +461,7 @@ public: sampler.dowrap(iu, iv); FragmentColor color; - color.color = ((u32*)lastTexKey->decoded)[(iv<unpackData[(iv<_clippedPolyCount; i++) { @@ -1386,7 +1386,7 @@ void SoftRasterizerRenderer::setupTextures() //and then it won't be safe. if (lastTexParams != thePoly.texParam || lastTexPalette != thePoly.texPalette) { - lastTexKey = TexCache_SetTexture(TexFormat_15bpp, thePoly.texParam, thePoly.texPalette); + lastTexKey = texCache.GetTexture(TexFormat_15bpp, thePoly.texParam, thePoly.texPalette); lastTexParams = thePoly.texParam; lastTexPalette = thePoly.texPalette; } @@ -1537,7 +1537,7 @@ Render3DError SoftRasterizerRenderer::RenderGeometry(const GFX3D_State &renderSt { rasterizerUnit[0].mainLoop(); this->_renderGeometryNeedsFinish = false; - TexCache_EvictFrame(); // Since we're finishing geometry rendering here and now, also check the texture cache now. + texCache.Evict(TEXCACHE_MAX_SIZE); // Since we're finishing geometry rendering here and now, also check the texture cache now. 
} // printf("rendered %d of %d polys after backface culling\n",gfx3d.polylist->count-culled,gfx3d.polylist->count); @@ -1888,7 +1888,7 @@ Render3DError SoftRasterizerRenderer::Reset() memset(this->clearImagePolyIDBuffer, 0, sizeof(this->clearImagePolyIDBuffer)); memset(this->clearImageFogBuffer, 0, sizeof(this->clearImageFogBuffer)); - TexCache_Reset(); + texCache.Reset(); return RENDER3DERROR_NOERR; } @@ -1947,7 +1947,7 @@ Render3DError SoftRasterizerRenderer::RenderFinish() } // Now that geometry rendering is finished on all threads, check the texture cache. - TexCache_EvictFrame(); + texCache.Evict(TEXCACHE_MAX_SIZE); // Do multithreaded post-processing. if (this->currentRenderState->enableEdgeMarking || this->currentRenderState->enableFog) diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 23174307d..0e6008a6d 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -389,7 +389,7 @@ void Render3D::SetTextureProcessingProperties(size_t scalingFactor, bool willDep if (needTexCacheReset) { - TexCache_Reset(); + texCache.Reset(); } } @@ -616,7 +616,7 @@ Render3DError Render3D::Reset() this->_willFlushFramebufferRGBA6665 = true; this->_willFlushFramebufferRGBA5551 = true; - TexCache_Reset(); + texCache.Reset(); return RENDER3DERROR_NOERR; } @@ -658,7 +658,7 @@ Render3DError Render3D::RenderFinish() Render3DError Render3D::VramReconfigureSignal() { - TexCache_Invalidate(); + texCache.Invalidate(); return RENDER3DERROR_NOERR; } diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 89f86b1d2..f02e171e3 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include "texcache.h" @@ -41,7 +40,11 @@ using std::max; //only dump this from ogl renderer. 
for now, softrasterizer creates things in an incompatible pixel format //#define DEBUG_DUMP_TEXTURE -#define CONVERT(color) ((TEXFORMAT == TexFormat_32bpp)?(COLOR555TO8888_OPAQUE(color)):COLOR555TO6665_OPAQUE(color)) +#if defined(DEBUG_DUMP_TEXTURE) && defined(WIN32) + #define DO_DEBUG_DUMP_TEXTURE +#endif + +#define CONVERT(color) ((TEXCACHEFORMAT == TexFormat_32bpp)?(COLOR555TO8888_OPAQUE(color)):COLOR555TO6665_OPAQUE(color)) //This class represents a number of regions of memory which should be viewed as contiguous class MemSpan @@ -86,7 +89,7 @@ public: //dumps the memspan to the specified buffer //you may set size to limit the size to be copied - int dump(void* buf, int size=-1) + int dump(void* buf, int size=-1) const { if(size==-1) size = this->size; size = min(this->size,size); @@ -108,7 +111,7 @@ public: // this function does the same than dump // but works for both little and big endian // when buf is an u16 array - int dump16(void* buf, int size=-1) + int dump16(void* buf, int size=-1) const { if(size==-1) size = this->size; size = min(this->size,size); @@ -192,842 +195,1464 @@ static MemSpan MemSpan_TexPalette(u32 ofs, u32 len, bool silent) return ret; } -#if defined (DEBUG_DUMP_TEXTURE) && defined (WIN32) -#define DO_DEBUG_DUMP_TEXTURE -static void DebugDumpTexture(TexCacheItem* item) +//for each texformat, number of palette entries +static const u32 paletteSizeList[] = {0, 32, 4, 16, 256, 0, 8, 0}; + +TexCache texCache; + +TexCache::TexCache() +{ + cache_size = 0; + memset(paletteDump, 0, sizeof(paletteDump)); +} + +void TexCache::list_remove(TexCacheItem *item) +{ + this->index.erase(item->iterator); + this->cache_size -= item->unpackSize; +} + +void TexCache::list_push_front(TexCacheItem *item) +{ + item->iterator = this->index.insert(std::make_pair(item->textureAttributes, item)); + this->cache_size += item->unpackSize; +} + +void TexCache::Invalidate() +{ + //check whether the palette memory changed + //TODO - we should handle this instead by 
setting dirty flags in the vram memory mapping and noting whether palette memory was dirty. + //but this will work for now + MemSpan mspal = MemSpan_TexPalette(0, PALETTE_DUMP_SIZE, true); + bool paletteDirty = mspal.memcmp(paletteDump); + if (paletteDirty) + { + mspal.dump(paletteDump); + } + + for (TTexCacheItemMultimap::iterator it(this->index.begin()); it != this->index.end(); ++it) + { + it->second->suspectedInvalid = true; + + //when the palette changes, we assume all 4x4 textures are dirty. + //this is because each 4x4 item doesnt carry along with it a copy of the entire palette, for verification + //instead, we just use the one paletteDump for verifying of all 4x4 textures; and if paletteDirty is set, verification has failed + if( (it->second->GetTextureFormat() == TEXMODE_4X4) && paletteDirty ) + { + it->second->assumedInvalid = true; + } + } +} + +void TexCache::Evict(u32 target) +{ + //debug print + //printf("%d %d/%d\n",index.size(),cache_size/1024,target/1024); + + //dont do anything unless we're over the target + if (cache_size < target) return; + + //aim at cutting the cache to half of the max size + target /= 2; + + //evicts items in an arbitrary order until it is less than the max cache size + //TODO - do this based on age and not arbitrarily + while (this->cache_size > target) + { + if (this->index.size() == 0) break; //just in case.. doesnt seem possible, cache_size wouldve been 0 + + TexCacheItem *item = this->index.begin()->second; + this->list_remove(item); + //printf("evicting! 
totalsize:%d\n",cache_size); + delete item; + } +} + +void TexCache::Reset() +{ + this->Evict(0); +} + +TexCacheItem* TexCache::GetTexture(TexCache_TexFormat texCacheFormat, u32 texAttributes, u32 palAttributes) +{ + //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) + static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; + + NDSTextureFormat texPackFormat = (NDSTextureFormat)((texAttributes>>26)&0x07); + u32 sizeX = (8 << ((texAttributes>>20)&0x07)); + u32 sizeY = (8 << ((texAttributes>>23)&0x07)); + u32 imageSize = sizeX*sizeY; + u32 palAddress; + + switch (texPackFormat) + { + case TEXMODE_I2: + palAddress = palAttributes << 3; + break; + + case TEXMODE_A3I5: + case TEXMODE_I4: + case TEXMODE_I8: + case TEXMODE_A5I3: + case TEXMODE_16BPP: + case TEXMODE_4X4: + default: + palAddress = palAttributes << 4; + break; + } + + //analyze the texture memory mapping and the specifications of this texture + u32 texSize = (imageSize*texSizes[texPackFormat]) >> 2; //shifted because the texSizes multiplier is fixed point + MemSpan currentPackedTexDataMS = MemSpan_TexMem((texAttributes&0xFFFF)<<3, texSize); + + //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. + //this isnt such a problem with texture memory, because we read sequentially from it. + //however, we read randomly from palette memory, so the mapping is more costly. 
+ u32 palSize = paletteSizeList[texPackFormat] * sizeof(u16); + MemSpan currentPaletteMS = MemSpan_TexPalette(palAddress, palSize, false); + + CACHE_ALIGN u16 currentPalette[256]; +#ifdef WORDS_BIGENDIAN + currentPaletteMS.dump16(currentPalette); +#else + currentPaletteMS.dump(currentPalette); +#endif + + //determine the location for 4x4 index data + u32 indexBase; + if ((texAttributes & 0xc000) == 0x8000) indexBase = 0x30000; + else indexBase = 0x20000; + + u32 indexOffset = (texAttributes & 0x3FFF) << 2; + int indexSize = 0; + MemSpan currentPackedTexIndexMS; + if (texPackFormat == TEXMODE_4X4) + { + indexSize = imageSize >> 3; + currentPackedTexIndexMS = MemSpan_TexMem(indexOffset+indexBase, indexSize); + } + + //TODO - as a special optimization, keep the last item returned and check it first + + TexCacheItem *cachedTexture = NULL; + + for(std::pair + iters = index.equal_range(texAttributes); + iters.first != iters.second; + ++iters.first) + { + cachedTexture = iters.first->second; + + //conditions where we reject matches: + //when the teximage or texpal params dont match + //(this is our key for identifying textures in the cache) + //NEW: due to using format as a key we dont need to check this anymore + //if(curr->texAttributes != texAttributes) continue; + if (cachedTexture->paletteAttributes != palAttributes) continue; + + //we're being asked for a different format than what we had cached. + //TODO - this could be done at the entire cache level instead of checking repeatedly + if (cachedTexture->unpackFormat != texCacheFormat) goto REJECT; + + //if the texture is assumed invalid, reject it + if (cachedTexture->assumedInvalid) goto REJECT; + + //the texture matches params, but isnt suspected invalid. accept it. + if (!cachedTexture->suspectedInvalid) return cachedTexture; + + //we suspect the texture may be invalid. 
we need to do a byte-for-byte comparison to re-establish that it is valid: + + //when the palettes dont match: + //note that we are considering 4x4 textures to have a palette size of 0. + //they really have a potentially HUGE palette, too big for us to handle like a normal palette, + //so they go through a different system + if (currentPaletteMS.size != 0 && memcmp(cachedTexture->paletteColorTable, currentPalette, currentPaletteMS.size)) goto REJECT; + + //when the texture data doesn't match + if (currentPackedTexDataMS.memcmp(cachedTexture->packData, cachedTexture->packSize)) goto REJECT; + + //if the texture is 4x4 then the index data must match + if (texPackFormat == TEXMODE_4X4) + { + if (currentPackedTexIndexMS.memcmp(cachedTexture->packIndexData, cachedTexture->packIndexSize)) goto REJECT; + } + + //we found a match. just return it + //REMINDER to make it primary/newest when we have smarter code + //list_remove(curr); + //list_push_front(curr); + cachedTexture->suspectedInvalid = false; + return cachedTexture; + + REJECT: + //we found a cached item for the current address, but the data is stale. + //for a variety of complicated reasons, we need to throw it out right this instant. + this->list_remove(cachedTexture); + delete cachedTexture; + break; + } + + //item was not found. 
recruit an existing one (the oldest), or create a new one + //evict(); //reduce the size of the cache if necessary + //TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs + //to support separate cache and read passes + TexCacheItem *newTexture = new TexCacheItem(); + newTexture->SetTextureData(texAttributes, currentPackedTexDataMS, currentPackedTexIndexMS); + newTexture->SetTexturePalette(palAttributes, currentPalette); + newTexture->unpackFormat = texCacheFormat; + + this->list_push_front(newTexture); + //printf("allocating: up to %d with %d items\n",cache_size,index.size()); + + switch (texCacheFormat) + { + case TexFormat_32bpp: newTexture->Unpack(currentPackedTexDataMS); break; + case TexFormat_15bpp: newTexture->Unpack(currentPackedTexDataMS); break; + default: assert(false); return NULL; + } + + return newTexture; +} + +TexCacheItem::TexCacheItem() +{ + _deleteCallback = NULL; + _deleteCallbackParam1 = NULL; + _deleteCallbackParam2 = NULL; + + packFormat = TEXMODE_NONE; + packSize = 0; + packData = NULL; + paletteColorTable = NULL; + + unpackFormat = TexFormat_None; + unpackSize = 0; + unpackData = NULL; + + suspectedInvalid = false; + assumedInvalid = false; + + textureAttributes = 0; + paletteAttributes = 0; + paletteAddress = 0; + paletteSize = 0; + sizeX = 0; + sizeY = 0; + invSizeX = 0.0f; + invSizeY = 0.0f; + + packIndexData = NULL; + packSizeFirstSlot = 0; + packIndexSize = 0; + + texid = 0; +} + +TexCacheItem::~TexCacheItem() +{ + free_aligned(this->packData); + free_aligned(this->unpackData); + free_aligned(this->paletteColorTable); + free_aligned(this->packIndexData); + if (this->_deleteCallback != NULL) this->_deleteCallback(this, this->_deleteCallbackParam1, this->_deleteCallbackParam2); +} + +TexCacheItemDeleteCallback TexCacheItem::GetDeleteCallback() const +{ + return this->_deleteCallback; +} + +void TexCacheItem::SetDeleteCallback(TexCacheItemDeleteCallback callbackFunc, void *inParam1, void *inParam2) 
+{ + this->_deleteCallback = callbackFunc; + this->_deleteCallbackParam1 = inParam1; + this->_deleteCallbackParam2 = inParam2; +} + +NDSTextureFormat TexCacheItem::GetTextureFormat() const +{ + return this->packFormat; +} + +void TexCacheItem::SetTextureData(const u32 attr, const MemSpan &packedData, const MemSpan &packedIndexData) +{ + u8 *oldPackData = this->packData; + u32 *oldUnpackData = this->unpackData; + u32 w = (8 << ((attr >> 20) & 0x07)); + u32 h = (8 << ((attr >> 23) & 0x07)); + + this->textureAttributes = attr; + this->packFormat = (NDSTextureFormat)((attr >> 26) & 0x07); + + this->sizeX = w; + this->sizeY = h; + this->invSizeX = 1.0f / (float)w; + this->invSizeY = 1.0f / (float)h; + this->unpackSize = w * h * sizeof(u32); + this->unpackData = (u32 *)malloc_alignedCacheLine(this->unpackSize); + + //dump texture and 4x4 index data for cache keying + this->packSize = packedData.size; + this->packIndexSize = packedIndexData.size; + this->packData = (u8 *)malloc_alignedCacheLine(this->packSize); + this->packSizeFirstSlot = packedData.items[0].len; + + packedData.dump(this->packData); + + if (this->packFormat == TEXMODE_4X4) + { + u8 *oldPackIndexData = this->packIndexData; + this->packIndexData = (u8 *)malloc_alignedCacheLine(this->packIndexSize); + packedIndexData.dump(this->packIndexData, this->packIndexSize); + free_aligned(oldPackIndexData); + } + + free_aligned(oldPackData); + free_aligned(oldUnpackData); +} + +void TexCacheItem::SetTexturePalette(const u32 attr, const u16 *paletteBuffer) +{ + const u32 oldPaletteSize = this->paletteSize; + + this->paletteAttributes = attr; + this->paletteAddress = (this->packFormat == TEXMODE_I2) ? 
attr << 3 : attr << 4; + this->paletteSize = paletteSizeList[this->packFormat] * sizeof(u16); + + if (this->paletteSize > 0) + { + if (this->paletteSize != oldPaletteSize) + { + u16 *oldPaletteColorTable = this->paletteColorTable; + this->paletteColorTable = (u16 *)malloc_alignedCacheLine(this->paletteSize); + memcpy(this->paletteColorTable, paletteBuffer, this->paletteSize); + free_aligned(oldPaletteColorTable); + } + else + { + memcpy(this->paletteColorTable, paletteBuffer, this->paletteSize); + } + } + else + { + free_aligned(this->paletteColorTable); + this->paletteColorTable = NULL; + } +} + +template +void TexCacheItem::Unpack(const MemSpan &packedData) +{ + // Whenever a 1-bit alpha or no-alpha texture is unpacked (this means any texture + // format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D + // renderers can assume that the transparent color is 0 during texture sampling. + + bool isPalZeroTransparent; + + switch (this->packFormat) + { + case TEXMODE_A3I5: + NDSTextureUnpackA3I5(this->packSize, this->packData, this->paletteColorTable, this->unpackData); + break; + + case TEXMODE_I2: + isPalZeroTransparent = ( ((this->textureAttributes >> 29) & 1) != 0 ); + NDSTextureUnpackI2(this->packSize, this->packData, this->paletteColorTable, isPalZeroTransparent, this->unpackData); + break; + + case TEXMODE_I4: + isPalZeroTransparent = ( ((this->textureAttributes >> 29) & 1) != 0 ); + NDSTextureUnpackI4(this->packSize, this->packData, this->paletteColorTable, isPalZeroTransparent, this->unpackData); + break; + + case TEXMODE_I8: + isPalZeroTransparent = ( ((this->textureAttributes >> 29) & 1) != 0 ); + NDSTextureUnpackI8(this->packSize, this->packData, this->paletteColorTable, isPalZeroTransparent, this->unpackData); + break; + + case TEXMODE_4X4: + { + if (this->packSize > this->packSizeFirstSlot) + { + PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); + } + + NDSTextureUnpack4x4(this->packSizeFirstSlot, this->packData, 
this->packIndexData, this->paletteAddress, this->textureAttributes, this->sizeX, this->sizeY, this->unpackData); + break; + } + + case TEXMODE_A5I3: + NDSTextureUnpackA5I3(this->packSize, this->packData, this->paletteColorTable, this->unpackData); + break; + + case TEXMODE_16BPP: + NDSTextureUnpackDirect16Bit(this->packSize, this->packData, this->unpackData); + break; + + default: + break; + } + +#ifdef DO_DEBUG_DUMP_TEXTURE + this->DebugDump(); +#endif +} + +#ifdef DO_DEBUG_DUMP_TEXTURE +void TexCacheItem::DebugDump() { static int ctr=0; char fname[100]; sprintf(fname,"c:\\dump\\%d.bmp", ctr); ctr++; - - NDS_WriteBMP_32bppBuffer(item->sizeX,item->sizeY,item->decoded,fname); + + NDS_WriteBMP_32bppBuffer(this->sizeX, this->sizeY, this->unpackData, fname); } #endif -class TexCache +// TODO: Delete these MemSpan based functions after testing confirms that using the dumped texture data works properly. +template +void NDSTextureUnpackI2(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer) { -public: - TexCache() - : cache_size(0) + u8 *adr; + +#ifdef ENABLE_SSSE3 + const __m128i pal_vec128 = _mm_loadl_epi64((__m128i *)pal); +#endif + if (isPalZeroTransparent) { - memset(paletteDump,0,sizeof(paletteDump)); - } - - TTexCacheItemMultimap index; - - //this ought to be enough for anyone - //static const u32 kMaxCacheSize = 64*1024*1024; - //changed by zeromus on 15-dec. 
I couldnt find any games that were getting anywhere NEAR 64 - static const u32 kMaxCacheSize = 16*1024*1024; - //metal slug burns through sprites so fast, it can test it pretty quickly though - - //this is not really precise, it is off by a constant factor - u32 cache_size; - - void list_remove(TexCacheItem* item) - { - index.erase(item->iterator); - cache_size -= item->decode_len; - } - - void list_push_front(TexCacheItem* item) - { - item->iterator = index.insert(std::make_pair(item->texformat,item)); - cache_size += item->decode_len; - } - - template - TexCacheItem* scan(u32 format, u32 texpal) - { - //for each texformat, number of palette entries - static const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0}; - - //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) - static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; - - //used to hold a copy of the palette specified for this texture - CACHE_ALIGN u16 pal[256]; - - NDSTextureFormat textureMode = (NDSTextureFormat)((format>>26)&0x07); - u32 sizeX=(8 << ((format>>20)&0x07)); - u32 sizeY=(8 << ((format>>23)&0x07)); - u32 imageSize = sizeX*sizeY; - - u8 *adr; - - u32 paletteAddress; - - switch (textureMode) + for (size_t j = 0; j < ms.numItems; j++) { - case TEXMODE_I2: - paletteAddress = texpal<<3; - break; + adr = ms.items[j].ptr; +#ifdef ENABLE_SSSE3 + for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dstBuffer+=16) + { + __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); + idx = _mm_slli_epi16(idx, 1); - case TEXMODE_A3I5: - case TEXMODE_I4: - case TEXMODE_I8: - case TEXMODE_A5I3: - case TEXMODE_16BPP: - 
case TEXMODE_4X4: - default: - paletteAddress = texpal<<4; - break; + __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); + const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + // Set converted colors to 0 if the palette index is 0. + idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); + idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); + convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); + convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); + convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); + convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + u8 idx; + + idx = *adr & 0x03; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); + + idx = (*adr >> 2) & 0x03; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); + + idx = (*adr >> 4) & 0x03; + *dstBuffer++ = (idx == 0) ? 
0 : CONVERT(pal[idx] & 0x7FFF); + + idx = (*adr >> 6) & 0x03; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); + } +#endif } - - //analyze the texture memory mapping and the specifications of this texture - int palSize = palSizes[textureMode]; - int texSize = (imageSize*texSizes[textureMode])>>2; //shifted because the texSizes multiplier is fixed point - MemSpan ms = MemSpan_TexMem((format&0xFFFF)<<3,texSize); - MemSpan mspal = MemSpan_TexPalette(paletteAddress,palSize*2,false); - - //determine the location for 4x4 index data - u32 indexBase; - if((format & 0xc000) == 0x8000) indexBase = 0x30000; - else indexBase = 0x20000; - - u32 indexOffset = (format&0x3FFF)<<2; - - int indexSize = 0; - MemSpan msIndex; - if(textureMode == TEXMODE_4X4) + } + else + { + for (size_t j = 0; j < ms.numItems; j++) { - indexSize = imageSize>>3; - msIndex = MemSpan_TexMem(indexOffset+indexBase,indexSize); + adr = ms.items[j].ptr; +#ifdef ENABLE_SSSE3 + for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dstBuffer+=16) + { + __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); + idx = _mm_slli_epi16(idx, 1); + + const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); + const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + 
ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + *dstBuffer++ = CONVERT(pal[ *adr & 0x03] & 0x7FFF); + *dstBuffer++ = CONVERT(pal[(*adr >> 2) & 0x03] & 0x7FFF); + *dstBuffer++ = CONVERT(pal[(*adr >> 4) & 0x03] & 0x7FFF); + *dstBuffer++ = CONVERT(pal[(*adr >> 6) & 0x03] & 0x7FFF); + } +#endif } + } +} - - //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. - //this isnt such a problem with texture memory, because we read sequentially from it. - //however, we read randomly from palette memory, so the mapping is more costly. 
- #ifdef WORDS_BIGENDIAN - mspal.dump16(pal); - #else - mspal.dump(pal); - #endif - - //TODO - as a special optimization, keep the last item returned and check it first - - for(std::pair - iters = index.equal_range(format); - iters.first != iters.second; - ++iters.first) +template +void NDSTextureUnpackI4(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer) +{ + u8 *adr; + +#ifdef ENABLE_SSSE3 + const __m128i palLo = _mm_load_si128((__m128i *)pal + 0); + const __m128i palHi = _mm_load_si128((__m128i *)pal + 1); +#endif + if (isPalZeroTransparent) + { + for (size_t j = 0; j < ms.numItems; j++) { - TexCacheItem* curr = iters.first->second; + adr = ms.items[j].ptr; +#ifdef ENABLE_SSSE3 + for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dstBuffer+=16) + { + __m128i idx = _mm_loadl_epi64((__m128i *)adr); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); + idx = _mm_slli_epi16(idx, 1); + + __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); + const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); + const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); + const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); + const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); + + const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); + const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + 
ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + // Set converted colors to 0 if the palette index is 0. + idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); + idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); + convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); + convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); + convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); + convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + u8 idx; + + idx = *adr & 0xF; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); + + idx = *adr >> 4; + *dstBuffer++ = (idx == 0) ? 
0 : CONVERT(pal[idx] & 0x7FFF); + } +#endif + } + } + else + { + for (size_t j = 0; j < ms.numItems; j++) + { + adr = ms.items[j].ptr; +#ifdef ENABLE_SSSE3 + for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dstBuffer+=16) + { + __m128i idx = _mm_loadl_epi64((__m128i *)adr); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); + idx = _mm_slli_epi16(idx, 1); + + const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); + const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); + const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); + const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); + const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); + + const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); + const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t x = 0; x < 
ms.items[j].len; x++, adr++) + { + *dstBuffer++ = CONVERT(pal[*adr & 0x0F] & 0x7FFF); + *dstBuffer++ = CONVERT(pal[*adr >> 4] & 0x7FFF); + } +#endif + } + } +} + +template +void NDSTextureUnpackI8(const MemSpan &ms, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) +{ + u8 *adr; + + if (isPalZeroTransparent) + { + for (size_t j = 0; j < ms.numItems; j++) + { + adr = ms.items[j].ptr; + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + *dstBuffer++ = (*adr == 0) ? 0 : CONVERT(srcPal[*adr] & 0x7FFF); + } + } + } + else + { + for (size_t j = 0; j < ms.numItems; j++) + { + adr = ms.items[j].ptr; + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + *dstBuffer++ = CONVERT(srcPal[*adr] & 0x7FFF); + } + } + } +} + +template +void NDSTextureUnpackA3I5(const MemSpan &ms, const u16 *pal, u32 *dstBuffer) +{ + u8 *adr; + + for (size_t j = 0; j < ms.numItems; j++) + { + adr = ms.items[j].ptr; + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + const u16 c = pal[*adr & 0x1F] & 0x7FFF; + const u8 alpha = *adr >> 5; + *dstBuffer++ = (TEXCACHEFORMAT == TexFormat_15bpp) ? 
COLOR555TO6665(c, material_3bit_to_5bit[alpha]) : COLOR555TO8888(c, material_3bit_to_8bit[alpha]); + } + } +} + +template +void NDSTextureUnpackA5I3(const MemSpan &ms, const u16 *pal, u32 *dstBuffer) +{ + u8 *adr; + +#ifdef ENABLE_SSSE3 + const __m128i pal_vec128 = _mm_load_si128((__m128i *)pal); +#endif + for (size_t j = 0; j < ms.numItems; j++) + { + adr = ms.items[j].ptr; +#ifdef ENABLE_SSSE3 + for (size_t x = 0; x < ms.items[j].len; x+=16, adr+=16, dstBuffer+=16) + { + const __m128i bits = _mm_loadu_si128((__m128i *)adr); - //conditions where we reject matches: - //when the teximage or texpal params dont match - //(this is our key for identifying textures in the cache) - //NEW: due to using format as a key we dont need to check this anymore - //if(curr->texformat != format) continue; - if(curr->texpal != texpal) continue; - - //we're being asked for a different format than what we had cached. - //TODO - this could be done at the entire cache level instead of checking repeatedly - if(curr->cacheFormat != TEXFORMAT) goto REJECT; - - //if the texture is assumed invalid, reject it - if(curr->assumedInvalid) goto REJECT; - - //the texture matches params, but isnt suspected invalid. accept it. - if(!curr->suspectedInvalid) return curr; - - //we suspect the texture may be invalid. we need to do a byte-for-byte comparison to re-establish that it is valid: - - //when the palettes dont match: - //note that we are considering 4x4 textures to have a palette size of 0. 
- //they really have a potentially HUGE palette, too big for us to handle like a normal palette, - //so they go through a different system - if(mspal.size != 0 && memcmp(curr->dump.palette,pal,mspal.size)) goto REJECT; - - //when the texture data doesn't match - if(ms.memcmp(&curr->dump.texture[0],curr->dump.textureSize)) goto REJECT; - - //if the texture is 4x4 then the index data must match - if(textureMode == TEXMODE_4X4) - { - if(msIndex.memcmp(curr->dump.texture + curr->dump.textureSize,curr->dump.indexSize)) goto REJECT; - } - - //we found a match. just return it - //REMINDER to make it primary/newest when we have smarter code - //list_remove(curr); - //list_push_front(curr); - curr->suspectedInvalid = false; - return curr; - - REJECT: - //we found a cached item for the current address, but the data is stale. - //for a variety of complicated reasons, we need to throw it out right this instant. - list_remove(curr); - delete curr; - break; - } - - //item was not found. recruit an existing one (the oldest), or create a new one - //evict(); //reduce the size of the cache if necessary - //TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs - //to support separate cache and read passes - TexCacheItem* newitem = new TexCacheItem(); - newitem->suspectedInvalid = false; - newitem->texformat = format; - newitem->cacheFormat = TEXFORMAT; - newitem->texpal = texpal; - newitem->sizeX=sizeX; - newitem->sizeY=sizeY; - newitem->invSizeX=1.0f/((float)(sizeX)); - newitem->invSizeY=1.0f/((float)(sizeY)); - newitem->decode_len = sizeX*sizeY*4; - newitem->format = textureMode; - newitem->decoded = (u8 *)malloc_alignedCacheLine(newitem->decode_len); - list_push_front(newitem); - //printf("allocating: up to %d with %d items\n",cache_size,index.size()); - - u32 *dwdst = (u32*)newitem->decoded; - - //dump palette data for cache keying - if(palSize) - { - memcpy(newitem->dump.palette, pal, palSize*2); - } - - //dump texture and 4x4 index data 
for cache keying - const int texsize = newitem->dump.textureSize = ms.size; - const int indexsize = newitem->dump.indexSize = msIndex.size; - newitem->dump.texture = new u8[texsize+indexsize]; - ms.dump(&newitem->dump.texture[0],newitem->dump.maxTextureSize); //dump texture - if(textureMode == TEXMODE_4X4) - msIndex.dump(newitem->dump.texture+newitem->dump.textureSize,newitem->dump.indexSize); //dump 4x4 - - - //============================================================================ - //Texture conversion - //============================================================================ - - // Whenever a 1-bit alpha or no-alpha texture is unpacked (this means any texture - // format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D - // renderers can assume that the transparent color is 0 during texture sampling. - - const bool isPalZeroTransparent = ( ((format >> 29) & 1) != 0 ); - - switch (newitem->format) - { - case TEXMODE_A3I5: - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - const u16 c = pal[*adr & 31] & 0x7FFF; - const u8 alpha = *adr >> 5; - *dwdst++ = (TEXFORMAT == TexFormat_15bpp) ? 
COLOR555TO6665(c, material_3bit_to_5bit[alpha]) : COLOR555TO8888(c, material_3bit_to_8bit[alpha]); - } - } - break; - } - - case TEXMODE_I2: - { -#ifdef ENABLE_SSSE3 - const __m128i pal_vec128 = _mm_loadl_epi64((__m128i *)pal); -#endif - if (isPalZeroTransparent) - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dwdst+=16) - { - __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); - idx = _mm_slli_epi16(idx, 1); - - __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); - const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - - __m128i convertedColor[4]; - - if (TEXFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - - // Set converted colors to 0 if the palette index is 0. 
- idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); - idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); - convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); - convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); - convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); - convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); - - _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - u8 idx; - - idx = *adr & 0x03; - *dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = (*adr >> 2) & 0x03; - *dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = (*adr >> 4) & 0x03; - *dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = (*adr >> 6) & 0x03; - *dwdst++ = (idx == 0) ? 
0 : CONVERT(pal[idx] & 0x7FFF); - } -#endif - } - } - else - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dwdst+=16) - { - __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); - idx = _mm_slli_epi16(idx, 1); - - const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); - const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - - __m128i convertedColor[4]; - - if (TEXFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - - _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dwdst++ = CONVERT(pal[ *adr & 0x03] & 0x7FFF); - *dwdst++ = CONVERT(pal[(*adr >> 2) & 0x03] & 0x7FFF); - *dwdst++ = CONVERT(pal[(*adr >> 4) & 0x03] & 0x7FFF); - *dwdst++ = CONVERT(pal[(*adr >> 6) & 0x03] & 0x7FFF); - 
} -#endif - } - } - break; - } - - case TEXMODE_I4: - { -#ifdef ENABLE_SSSE3 - const __m128i palLo = _mm_load_si128((__m128i *)pal + 0); - const __m128i palHi = _mm_load_si128((__m128i *)pal + 1); -#endif - if (isPalZeroTransparent) - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dwdst+=16) - { - __m128i idx = _mm_loadl_epi64((__m128i *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); - idx = _mm_slli_epi16(idx, 1); - - __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); - const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); - const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); - const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); - const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); - - const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); - const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); - - __m128i convertedColor[4]; - - if (TEXFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - - // Set converted colors to 0 if the palette index is 0. 
- idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); - idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); - convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); - convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); - convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); - convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); - - _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - u8 idx; - - idx = *adr & 0xF; - *dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = *adr >> 4; - *dwdst++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - } -#endif - } - - } - else - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dwdst+=16) - { - __m128i idx = _mm_loadl_epi64((__m128i *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); - idx = _mm_slli_epi16(idx, 1); - - const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); - const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); - const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); - const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); - const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); - - const __m128i palColor0 = _mm_blendv_epi8( 
palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); - const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); - - __m128i convertedColor[4]; - - if (TEXFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - - _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dwdst++ = CONVERT(pal[*adr & 0x0F] & 0x7FFF); - *dwdst++ = CONVERT(pal[*adr >> 4] & 0x7FFF); - } -#endif - } - } - break; - } - - case TEXMODE_I8: - { - if (isPalZeroTransparent) - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dwdst++ = (*adr == 0) ? 
0 : CONVERT(pal[*adr] & 0x7FFF); - } - } - } - else - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dwdst++ = CONVERT(pal[*adr] & 0x7FFF); - } - } - } - break; - } - - case TEXMODE_4X4: - { - if (ms.numItems != 1) - { - PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); - } - //this check isnt necessary since the addressing is tied to the texture data which will also run out: - //if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n"); - - #define PAL4X4(offset) ( LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((paletteAddress + (offset)*2)>>14)&0x7] + ((paletteAddress + (offset)*2)&0x3FFF) ) ) & 0x7FFF ) - - u16* slot1; - u32* map = (u32*)ms.items[0].ptr; - u32 limit = ms.items[0].len<<2; - u32 d = 0; - if ( (format & 0xc000) == 0x8000) - // texel are in slot 2 - slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((format & 0x3FFF)<<2)+0x010000]; - else - slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(format & 0x3FFF)<<2]; - - u16 yTmpSize = (sizeY>>2); - u16 xTmpSize = (sizeX>>2); - - //this is flagged whenever a 4x4 overruns its slot. 
- //i am guessing we just generate black in that case - bool dead = false; - - for (size_t y = 0; y < yTmpSize; y++) - { - u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX, - ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; - for (size_t x = 0; x < xTmpSize; x++, d++) - { - if (d >= limit) - dead = true; - - if (dead) - { - for (int sy = 0; sy < 4; sy++) - { - const u32 currentPos = (x<<2) + tmpPos[sy]; - dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0; - } - continue; - } - - const u32 currBlock = LE_TO_LOCAL_32(map[d]); - const u16 pal1 = LE_TO_LOCAL_16(slot1[d]); - const u16 pal1offset = (pal1 & 0x3FFF)<<1; - const u8 mode = pal1>>14; - u32 tmp_col[4]; - - tmp_col[0] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset) ); - tmp_col[1] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+1) ); - - switch (mode) - { - case 0: - tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); - tmp_col[3] = 0x00000000; - break; - - case 1: -#ifdef LOCAL_BE - tmp_col[2] = ( (((tmp_col[0] & 0xFF000000) >> 1)+((tmp_col[1] & 0xFF000000) >> 1)) & 0xFF000000 ) | - ( (((tmp_col[0] & 0x00FF0000) + (tmp_col[1] & 0x00FF0000)) >> 1) & 0x00FF0000 ) | - ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | - 0x000000FF; - tmp_col[3] = 0x00000000; -#else - tmp_col[2] = ( (((tmp_col[0] & 0x00FF00FF) + (tmp_col[1] & 0x00FF00FF)) >> 1) & 0x00FF00FF ) | - ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | - 0xFF000000; - tmp_col[3] = 0x00000000; -#endif - break; - - case 2: - tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); - tmp_col[3] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+3) ); - break; - - case 3: - { -#ifdef LOCAL_BE - const u32 r0 = (tmp_col[0]>>24) & 0x000000FF; - const u32 r1 = (tmp_col[1]>>24) & 0x000000FF; - const u32 g0 = (tmp_col[0]>>16) & 0x000000FF; - const u32 g1 = (tmp_col[1]>>16) & 0x000000FF; - const u32 b0 = (tmp_col[0]>> 8) & 0x000000FF; - const u32 b1 = (tmp_col[1]>> 8) & 0x000000FF; -#else 
- const u32 r0 = tmp_col[0] & 0x000000FF; - const u32 r1 = tmp_col[1] & 0x000000FF; - const u32 g0 = (tmp_col[0]>> 8) & 0x000000FF; - const u32 g1 = (tmp_col[1]>> 8) & 0x000000FF; - const u32 b0 = (tmp_col[0]>>16) & 0x000000FF; - const u32 b1 = (tmp_col[1]>>16) & 0x000000FF; -#endif - - const u16 tmp1 = ( (r0*5 + r1*3)>>6) | - ( ((g0*5 + g1*3)>>6) << 5 ) | - ( ((b0*5 + b1*3)>>6) << 10 ); - const u16 tmp2 = ( (r0*3 + r1*5)>>6) | - ( ((g0*3 + g1*5)>>6) << 5 ) | - ( ((b0*3 + b1*5)>>6) << 10 ); - - tmp_col[2] = COLOR555TO8888_OPAQUE(tmp1); - tmp_col[3] = COLOR555TO8888_OPAQUE(tmp2); - break; - } - } - - if (TEXFORMAT==TexFormat_15bpp) - { - for (size_t i = 0; i < 4; i++) - { -#ifdef LOCAL_BE - const u32 a = (tmp_col[i] >> 3) & 0x0000001F; - tmp_col[i] >>= 2; - tmp_col[i] &= 0x3F3F3F00; - tmp_col[i] |= a; -#else - const u32 a = (tmp_col[i] >> 3) & 0x1F000000; - tmp_col[i] >>= 2; - tmp_col[i] &= 0x003F3F3F; - tmp_col[i] |= a; -#endif - } - } - - //TODO - this could be more precise for 32bpp mode (run it through the color separation table) - - //set all 16 texels - for (size_t sy = 0; sy < 4; sy++) - { - // Texture offset - const u32 currentPos = (x<<2) + tmpPos[sy]; - const u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); - - dwdst[currentPos ] = tmp_col[ currRow &3]; - dwdst[currentPos+1] = tmp_col[(currRow>>2)&3]; - dwdst[currentPos+2] = tmp_col[(currRow>>4)&3]; - dwdst[currentPos+3] = tmp_col[(currRow>>6)&3]; - } - } - } - break; - } - - case TEXMODE_A5I3: - { -#ifdef ENABLE_SSSE3 - const __m128i pal_vec128 = _mm_load_si128((__m128i *)pal); -#endif - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=16, adr+=16, dwdst+=16) - { - const __m128i bits = _mm_loadu_si128((__m128i *)adr); - - const __m128i idx = _mm_slli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0x07)), 1 ); - const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - const __m128i idx1 = 
_mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); - const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - - __m128i tmpAlpha[2]; - __m128i convertedColor[4]; - - if (TEXFORMAT == TexFormat_15bpp) - { - const __m128i alpha = _mm_srli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), 3 ); - const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); - const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); - } - else - { - const __m128i alpha = _mm_or_si128( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), _mm_srli_epi16(_mm_and_si128(bits, _mm_set1_epi8(0xE0)), 5) ); - const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); - const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); - } - - _mm_store_si128((__m128i *)(dwdst + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dwdst + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dwdst + 8), convertedColor[2]); 
- _mm_store_si128((__m128i *)(dwdst + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - const u16 c = pal[*adr&0x07] & 0x7FFF; - const u8 alpha = (*adr>>3); - *dwdst++ = (TEXFORMAT == TexFormat_15bpp) ? COLOR555TO6665(c, alpha) : COLOR555TO8888(c, material_5bit_to_8bit[alpha]); - } -#endif - } - break; - } - - case TEXMODE_16BPP: - { - for (size_t j = 0; j < ms.numItems; j++) - { - const u16 *map = (u16*)ms.items[j].ptr; - const size_t len = ms.items[j].len >> 1; - - for (size_t x = 0; x < len; x++) - { - const u16 c = LOCAL_TO_LE_16(map[x]); - *dwdst++ = (c & 0x8000) ? CONVERT(c & 0x7FFF) : 0; - } - } - break; - } - - default: - break; - } //switch(texture format) - -#ifdef DO_DEBUG_DUMP_TEXTURE - DebugDumpTexture(newitem); -#endif - - return newitem; - } //scan() - - static const int PALETTE_DUMP_SIZE = (64+16+16)*1024; - u8 paletteDump[PALETTE_DUMP_SIZE]; - - void invalidate() - { - //check whether the palette memory changed - //TODO - we should handle this instead by setting dirty flags in the vram memory mapping and noting whether palette memory was dirty. - //but this will work for now - MemSpan mspal = MemSpan_TexPalette(0,PALETTE_DUMP_SIZE,true); - bool paletteDirty = mspal.memcmp(paletteDump); - if (paletteDirty) - { - mspal.dump(paletteDump); - } - - for (TTexCacheItemMultimap::iterator it(index.begin()); it != index.end(); ++it) - { - it->second->suspectedInvalid = true; + const __m128i idx = _mm_slli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0x07)), 1 ); + const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - //when the palette changes, we assume all 4x4 textures are dirty. 
- //this is because each 4x4 item doesnt carry along with it a copy of the entire palette, for verification - //instead, we just use the one paletteDump for verifying of all 4x4 textures; and if paletteDirty is set, verification has failed - if( (it->second->GetTextureFormat() == TEXMODE_4X4) && paletteDirty ) + const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); + const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); + + __m128i tmpAlpha[2]; + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) { - it->second->assumedInvalid = true; + const __m128i alpha = _mm_srli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), 3 ); + const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); + const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); + ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); + ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + } + else + { + const __m128i alpha = _mm_or_si128( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), _mm_srli_epi16(_mm_and_si128(bits, _mm_set1_epi8(0xE0)), 5) ); + const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); + const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); + ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); + 
ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + } + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t x = 0; x < ms.items[j].len; x++, adr++) + { + const u16 c = pal[*adr&0x07] & 0x7FFF; + const u8 alpha = (*adr>>3); + *dstBuffer++ = (TEXCACHEFORMAT == TexFormat_15bpp) ? COLOR555TO6665(c, alpha) : COLOR555TO8888(c, material_5bit_to_8bit[alpha]); + } +#endif + } +} + +#define PAL4X4(offset) ( LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((palAddress + (offset)*2)>>14)&0x7] + ((palAddress + (offset)*2)&0x3FFF) ) ) & 0x7FFF ) + +template +void NDSTextureUnpack4x4(const MemSpan &ms, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer) +{ + if (ms.numItems != 1) + { + PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); + } + //this check isnt necessary since the addressing is tied to the texture data which will also run out: + //if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n"); + + u16* slot1; + u32* map = (u32*)ms.items[0].ptr; + u32 limit = ms.items[0].len<<2; + u32 d = 0; + if ( (texAttributes & 0xc000) == 0x8000) + // texel are in slot 2 + slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((texAttributes & 0x3FFF)<<2)+0x010000]; + else + slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(texAttributes & 0x3FFF)<<2]; + + u16 yTmpSize = (sizeY>>2); + u16 xTmpSize = (sizeX>>2); + + //this is flagged whenever a 4x4 overruns its slot. 
+ //i am guessing we just generate black in that case + bool dead = false; + + for (size_t y = 0; y < yTmpSize; y++) + { + u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX, + ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; + for (size_t x = 0; x < xTmpSize; x++, d++) + { + if (d >= limit) + dead = true; + + if (dead) + { + for (int sy = 0; sy < 4; sy++) + { + const u32 currentPos = (x<<2) + tmpPos[sy]; + dstBuffer[currentPos] = dstBuffer[currentPos+1] = dstBuffer[currentPos+2] = dstBuffer[currentPos+3] = 0; + } + continue; + } + + const u32 currBlock = LE_TO_LOCAL_32(map[d]); + const u16 pal1 = LE_TO_LOCAL_16(slot1[d]); + const u16 pal1offset = (pal1 & 0x3FFF)<<1; + const u8 mode = pal1>>14; + u32 tmp_col[4]; + + tmp_col[0] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset) ); + tmp_col[1] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+1) ); + + switch (mode) + { + case 0: + tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); + tmp_col[3] = 0x00000000; + break; + + case 1: +#ifdef LOCAL_BE + tmp_col[2] = ( (((tmp_col[0] & 0xFF000000) >> 1)+((tmp_col[1] & 0xFF000000) >> 1)) & 0xFF000000 ) | + ( (((tmp_col[0] & 0x00FF0000) + (tmp_col[1] & 0x00FF0000)) >> 1) & 0x00FF0000 ) | + ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | + 0x000000FF; + tmp_col[3] = 0x00000000; +#else + tmp_col[2] = ( (((tmp_col[0] & 0x00FF00FF) + (tmp_col[1] & 0x00FF00FF)) >> 1) & 0x00FF00FF ) | + ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | + 0xFF000000; + tmp_col[3] = 0x00000000; +#endif + break; + + case 2: + tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); + tmp_col[3] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+3) ); + break; + + case 3: + { +#ifdef LOCAL_BE + const u32 r0 = (tmp_col[0]>>24) & 0x000000FF; + const u32 r1 = (tmp_col[1]>>24) & 0x000000FF; + const u32 g0 = (tmp_col[0]>>16) & 0x000000FF; + const u32 g1 = (tmp_col[1]>>16) & 0x000000FF; + const u32 b0 = (tmp_col[0]>> 8) & 0x000000FF; + const u32 b1 = (tmp_col[1]>> 8) & 
0x000000FF; +#else + const u32 r0 = tmp_col[0] & 0x000000FF; + const u32 r1 = tmp_col[1] & 0x000000FF; + const u32 g0 = (tmp_col[0]>> 8) & 0x000000FF; + const u32 g1 = (tmp_col[1]>> 8) & 0x000000FF; + const u32 b0 = (tmp_col[0]>>16) & 0x000000FF; + const u32 b1 = (tmp_col[1]>>16) & 0x000000FF; +#endif + + const u16 tmp1 = ( (r0*5 + r1*3)>>6) | + ( ((g0*5 + g1*3)>>6) << 5 ) | + ( ((b0*5 + b1*3)>>6) << 10 ); + const u16 tmp2 = ( (r0*3 + r1*5)>>6) | + ( ((g0*3 + g1*5)>>6) << 5 ) | + ( ((b0*3 + b1*5)>>6) << 10 ); + + tmp_col[2] = COLOR555TO8888_OPAQUE(tmp1); + tmp_col[3] = COLOR555TO8888_OPAQUE(tmp2); + break; + } + } + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + for (size_t i = 0; i < 4; i++) + { +#ifdef LOCAL_BE + const u32 a = (tmp_col[i] >> 3) & 0x0000001F; + tmp_col[i] >>= 2; + tmp_col[i] &= 0x3F3F3F00; + tmp_col[i] |= a; +#else + const u32 a = (tmp_col[i] >> 3) & 0x1F000000; + tmp_col[i] >>= 2; + tmp_col[i] &= 0x003F3F3F; + tmp_col[i] |= a; +#endif + } + } + + //TODO - this could be more precise for 32bpp mode (run it through the color separation table) + + //set all 16 texels + for (size_t sy = 0; sy < 4; sy++) + { + // Texture offset + const u32 currentPos = (x<<2) + tmpPos[sy]; + const u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); + + dstBuffer[currentPos ] = tmp_col[ currRow &3]; + dstBuffer[currentPos+1] = tmp_col[(currRow>>2)&3]; + dstBuffer[currentPos+2] = tmp_col[(currRow>>4)&3]; + dstBuffer[currentPos+3] = tmp_col[(currRow>>6)&3]; } } } +} - void evict(u32 target = kMaxCacheSize) +template +void NDSTextureUnpackDirect16Bit(const MemSpan &ms, u32 *dstBuffer) +{ + for (size_t j = 0; j < ms.numItems; j++) { - //debug print - //printf("%d %d/%d\n",index.size(),cache_size/1024,target/1024); - - //dont do anything unless we're over the target - if(cache_size target) + const u16 *map = (u16 *)ms.items[j].ptr; + const size_t len = ms.items[j].len >> 1; + + for (size_t x = 0; x < len; x++) { - if(index.size()==0) break; //just in case.. 
doesnt seem possible, cache_size wouldve been 0 - - TexCacheItem* item = index.begin()->second; - list_remove(item); - //printf("evicting! totalsize:%d\n",cache_size); - delete item; + const u16 c = LOCAL_TO_LE_16(map[x]); + *dstBuffer++ = (c & 0x8000) ? CONVERT(c & 0x7FFF) : 0; } } -} texCache; - -void TexCache_Reset() -{ - texCache.evict(0); } -void TexCache_Invalidate() +template +void NDSTextureUnpackI2(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) { - //note that this gets called whether texdata or texpalette gets reconfigured. - texCache.invalidate(); -} - -TexCacheItem* TexCache_SetTexture(TexCache_TexFormat TEXFORMAT, u32 format, u32 texpal) -{ - switch(TEXFORMAT) +#ifdef ENABLE_SSSE3 + const __m128i pal_vec128 = _mm_loadl_epi64((__m128i *)srcPal); +#endif + if (isPalZeroTransparent) { - case TexFormat_32bpp: return texCache.scan(format,texpal); - case TexFormat_15bpp: return texCache.scan(format,texpal); - default: assert(false); return NULL; +#ifdef ENABLE_SSSE3 + for (size_t i = 0; i < srcSize; i+=4, srcData+=4, dstBuffer+=16) + { + __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)srcData); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); + idx = _mm_slli_epi16(idx, 1); + + __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); + const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + 
ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + // Set converted colors to 0 if the palette index is 0. + idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); + idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); + convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); + convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); + convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); + convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t i = 0; i < srcSize; i++, srcData++) + { + u8 idx; + + idx = *srcData & 0x03; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(srcPal[idx] & 0x7FFF); + + idx = (*srcData >> 2) & 0x03; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(srcPal[idx] & 0x7FFF); + + idx = (*srcData >> 4) & 0x03; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(srcPal[idx] & 0x7FFF); + + idx = (*srcData >> 6) & 0x03; + *dstBuffer++ = (idx == 0) ? 
0 : CONVERT(srcPal[idx] & 0x7FFF); + } +#endif + } + else + { +#ifdef ENABLE_SSSE3 + for (size_t i = 0; i < srcSize; i+=4, srcData+=4, dstBuffer+=16) + { + __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)srcData); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); + idx = _mm_slli_epi16(idx, 1); + + const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); + const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t i = 0; i < srcSize; i++, srcData++) + { + *dstBuffer++ = CONVERT(srcPal[ *srcData & 0x03] & 0x7FFF); + *dstBuffer++ = CONVERT(srcPal[(*srcData >> 2) & 0x03] & 0x7FFF); + *dstBuffer++ = CONVERT(srcPal[(*srcData >> 4) & 0x03] & 0x7FFF); + *dstBuffer++ = CONVERT(srcPal[(*srcData >> 6) & 0x03] & 0x7FFF); + } 
+#endif } } -//call this periodically to keep the tex cache clean -void TexCache_EvictFrame() +template +void NDSTextureUnpackI4(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) { - texCache.evict(); +#ifdef ENABLE_SSSE3 + const __m128i palLo = _mm_load_si128((__m128i *)srcPal + 0); + const __m128i palHi = _mm_load_si128((__m128i *)srcPal + 1); +#endif + if (isPalZeroTransparent) + { +#ifdef ENABLE_SSSE3 + for (size_t i = 0; i < srcSize; i+=8, srcData+=8, dstBuffer+=16) + { + __m128i idx = _mm_loadl_epi64((__m128i *)srcData); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); + idx = _mm_slli_epi16(idx, 1); + + __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); + const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); + const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); + const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); + const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); + + const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); + const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + // Set 
converted colors to 0 if the palette index is 0. + idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); + idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); + convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); + convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); + convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); + convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t i = 0; i < srcSize; i++, srcData++) + { + u8 idx; + + idx = *srcData & 0x0F; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(srcPal[idx] & 0x7FFF); + + idx = *srcData >> 4; + *dstBuffer++ = (idx == 0) ? 
0 : CONVERT(srcPal[idx] & 0x7FFF); + } +#endif + } + else + { +#ifdef ENABLE_SSSE3 + for (size_t i = 0; i < srcSize; i+=8, srcData+=8, dstBuffer+=16) + { + __m128i idx = _mm_loadl_epi64((__m128i *)srcData); + idx = _mm_unpacklo_epi8(idx, idx); + idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); + idx = _mm_slli_epi16(idx, 1); + + const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); + const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); + const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); + const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); + const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); + + const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); + const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); + + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); + ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); + } + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t i = 0; i < srcSize; i++, srcData++) + { + *dstBuffer++ = CONVERT(srcPal[*srcData & 0x0F] & 
0x7FFF); + *dstBuffer++ = CONVERT(srcPal[*srcData >> 4] & 0x7FFF); + } +#endif + } +} + +template +void NDSTextureUnpackI8(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) +{ + if (isPalZeroTransparent) + { + for (size_t i = 0; i < srcSize; i++, srcData++) + { + const u8 idx = *srcData; + *dstBuffer++ = (idx == 0) ? 0 : CONVERT(srcPal[idx] & 0x7FFF); + } + } + else + { + for (size_t i = 0; i < srcSize; i++, srcData++) + { + *dstBuffer++ = CONVERT(srcPal[*srcData] & 0x7FFF); + } + } +} + +template +void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer) +{ + for (size_t i = 0; i < srcSize; i++, srcData++) + { + const u16 c = srcPal[*srcData & 0x1F] & 0x7FFF; + const u8 alpha = *srcData >> 5; + *dstBuffer++ = (TEXCACHEFORMAT == TexFormat_15bpp) ? COLOR555TO6665(c, material_3bit_to_5bit[alpha]) : COLOR555TO8888(c, material_3bit_to_8bit[alpha]); + } +} + +template +void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer) +{ +#ifdef ENABLE_SSSE3 + const __m128i pal_vec128 = _mm_load_si128((__m128i *)srcPal); + + for (size_t i = 0; i < srcSize; i+=16, srcData+=16, dstBuffer+=16) + { + const __m128i bits = _mm_loadu_si128((__m128i *)srcData); + + const __m128i idx = _mm_slli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0x07)), 1 ); + const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); + + const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); + const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); + + __m128i tmpAlpha[2]; + __m128i convertedColor[4]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + const __m128i alpha = _mm_srli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), 3 ); + const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); + const __m128i alphaHi = 
_mm_unpackhi_epi8(_mm_setzero_si128(), alpha); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); + ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); + ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + } + else + { + const __m128i alpha = _mm_or_si128( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), _mm_srli_epi16(_mm_and_si128(bits, _mm_set1_epi8(0xE0)), 5) ); + const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); + const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); + ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); + + tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); + tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); + ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); + } + + _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); + _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); + _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); + } +#else + for (size_t i = 0; i < srcSize; i++, srcData++) + { + const u16 c = srcPal[*srcData & 0x07] & 0x7FFF; + const u8 alpha = (*srcData >> 3); + *dstBuffer++ = (TEXCACHEFORMAT == TexFormat_15bpp) ? 
COLOR555TO6665(c, alpha) : COLOR555TO8888(c, material_5bit_to_8bit[alpha]); + } +#endif +} + +template +void NDSTextureUnpack4x4(const size_t srcSize, const u8 *srcData, const u8 *srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer) +{ + u16 *slot1; + u32 *map = (u32 *)srcData; + u32 limit = srcSize * sizeof(u32); + u32 d = 0; + if ( (texAttributes & 0xc000) == 0x8000) + // texel are in slot 2 + slot1 = (u16 *)&MMU.texInfo.textureSlotAddr[1][((texAttributes & 0x3FFF)<<2)+0x010000]; + else + slot1 = (u16 *)&MMU.texInfo.textureSlotAddr[1][(texAttributes & 0x3FFF)<<2]; + + u16 xTmpSize = sizeX >> 2; + u16 yTmpSize = sizeY >> 2; + + //this is flagged whenever a 4x4 overruns its slot. + //i am guessing we just generate black in that case + bool dead = false; + + for (size_t y = 0; y < yTmpSize; y++) + { + u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX, + ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; + for (size_t x = 0; x < xTmpSize; x++, d++) + { + if (d >= limit) + dead = true; + + if (dead) + { + for (int sy = 0; sy < 4; sy++) + { + const u32 currentPos = (x<<2) + tmpPos[sy]; + dstBuffer[currentPos] = dstBuffer[currentPos+1] = dstBuffer[currentPos+2] = dstBuffer[currentPos+3] = 0; + } + continue; + } + + const u32 currBlock = LE_TO_LOCAL_32(map[d]); + const u16 pal1 = LE_TO_LOCAL_16(slot1[d]); + const u16 pal1offset = (pal1 & 0x3FFF)<<1; + const u8 mode = pal1>>14; + u32 tmp_col[4]; + + tmp_col[0] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset) ); + tmp_col[1] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+1) ); + + switch (mode) + { + case 0: + tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); + tmp_col[3] = 0x00000000; + break; + + case 1: +#ifdef LOCAL_BE + tmp_col[2] = ( (((tmp_col[0] & 0xFF000000) >> 1)+((tmp_col[1] & 0xFF000000) >> 1)) & 0xFF000000 ) | + ( (((tmp_col[0] & 0x00FF0000) + (tmp_col[1] & 0x00FF0000)) >> 1) & 0x00FF0000 ) | + ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | 
+ 0x000000FF; + tmp_col[3] = 0x00000000; +#else + tmp_col[2] = ( (((tmp_col[0] & 0x00FF00FF) + (tmp_col[1] & 0x00FF00FF)) >> 1) & 0x00FF00FF ) | + ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | + 0xFF000000; + tmp_col[3] = 0x00000000; +#endif + break; + + case 2: + tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); + tmp_col[3] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+3) ); + break; + + case 3: + { +#ifdef LOCAL_BE + const u32 r0 = (tmp_col[0]>>24) & 0x000000FF; + const u32 r1 = (tmp_col[1]>>24) & 0x000000FF; + const u32 g0 = (tmp_col[0]>>16) & 0x000000FF; + const u32 g1 = (tmp_col[1]>>16) & 0x000000FF; + const u32 b0 = (tmp_col[0]>> 8) & 0x000000FF; + const u32 b1 = (tmp_col[1]>> 8) & 0x000000FF; +#else + const u32 r0 = tmp_col[0] & 0x000000FF; + const u32 r1 = tmp_col[1] & 0x000000FF; + const u32 g0 = (tmp_col[0]>> 8) & 0x000000FF; + const u32 g1 = (tmp_col[1]>> 8) & 0x000000FF; + const u32 b0 = (tmp_col[0]>>16) & 0x000000FF; + const u32 b1 = (tmp_col[1]>>16) & 0x000000FF; +#endif + + const u16 tmp1 = ( (r0*5 + r1*3)>>6) | + ( ((g0*5 + g1*3)>>6) << 5 ) | + ( ((b0*5 + b1*3)>>6) << 10 ); + const u16 tmp2 = ( (r0*3 + r1*5)>>6) | + ( ((g0*3 + g1*5)>>6) << 5 ) | + ( ((b0*3 + b1*5)>>6) << 10 ); + + tmp_col[2] = COLOR555TO8888_OPAQUE(tmp1); + tmp_col[3] = COLOR555TO8888_OPAQUE(tmp2); + break; + } + } + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + for (size_t i = 0; i < 4; i++) + { +#ifdef LOCAL_BE + const u32 a = (tmp_col[i] >> 3) & 0x0000001F; + tmp_col[i] >>= 2; + tmp_col[i] &= 0x3F3F3F00; + tmp_col[i] |= a; +#else + const u32 a = (tmp_col[i] >> 3) & 0x1F000000; + tmp_col[i] >>= 2; + tmp_col[i] &= 0x003F3F3F; + tmp_col[i] |= a; +#endif + } + } + + //TODO - this could be more precise for 32bpp mode (run it through the color separation table) + + //set all 16 texels + for (size_t sy = 0; sy < 4; sy++) + { + // Texture offset + const u32 currentPos = (x<<2) + tmpPos[sy]; + const u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); 
+ + dstBuffer[currentPos ] = tmp_col[ currRow &3]; + dstBuffer[currentPos+1] = tmp_col[(currRow>>2)&3]; + dstBuffer[currentPos+2] = tmp_col[(currRow>>4)&3]; + dstBuffer[currentPos+3] = tmp_col[(currRow>>6)&3]; + } + } + } +} + +template +void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *srcData, u32 *dstBuffer) +{ + const u16 *srcData16 = (const u16 *)srcData; + const size_t pixCount = srcSize >> 1; + size_t i = 0; + +#ifdef ENABLE_SSE2 + const size_t pixCountVec128 = pixCount - (pixCount % 8); + for (; i < pixCountVec128; i+=8, srcData16+=8, dstBuffer+=8) + { + const v128u16 c = _mm_load_si128((v128u16 *)srcData16); + const v128u16 alpha = _mm_cmpeq_epi16(_mm_srli_epi16(c, 15), _mm_set1_epi16(1)); + v128u32 convertedColor[2]; + + if (TEXCACHEFORMAT == TexFormat_15bpp) + { + ColorspaceConvert555To6665Opaque_SSE2(c, convertedColor[0], convertedColor[1]); + } + else + { + ColorspaceConvert555To8888Opaque_SSE2(c, convertedColor[0], convertedColor[1]); + } + + convertedColor[0] = _mm_blendv_epi8(_mm_setzero_si128(), convertedColor[0], _mm_unpacklo_epi16(alpha, alpha)); + convertedColor[1] = _mm_blendv_epi8(_mm_setzero_si128(), convertedColor[1], _mm_unpackhi_epi16(alpha, alpha)); + + _mm_store_si128((v128u32 *)(dstBuffer + 0), convertedColor[0]); + _mm_store_si128((v128u32 *)(dstBuffer + 4), convertedColor[1]); + } +#endif + + for (; i < pixCount; i++, srcData16++) + { + const u16 c = LOCAL_TO_LE_16(*srcData16); + *dstBuffer++ = (c & 0x8000) ? CONVERT(c & 0x7FFF) : 0; + } } diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index 02b88df19..643bb97cb 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -26,6 +26,14 @@ #include "common.h" #include "gfx3d.h" +//this ought to be enough for anyone +//#define TEXCACHE_MAX_SIZE (64*1024*1024); +//changed by zeromus on 15-dec. 
I couldnt find any games that were getting anywhere NEAR 64 +//metal slug burns through sprites so fast, it can test it pretty quickly though +#define TEXCACHE_MAX_SIZE (16*1024*1024) + +#define PALETTE_DUMP_SIZE ((64+16+16)*1024) + enum TexCache_TexFormat { TexFormat_None, //used when nothing yet is cached @@ -33,11 +41,31 @@ enum TexCache_TexFormat TexFormat_15bpp //used by rasterizer }; +class MemSpan; class TexCacheItem; typedef std::multimap TTexCacheItemMultimap; typedef void (*TexCacheItemDeleteCallback)(TexCacheItem *texItem, void *param1, void *param2); +class TexCache +{ +public: + TexCache(); + + TTexCacheItemMultimap index; + u32 cache_size; //this is not really precise, it is off by a constant factor + u8 paletteDump[PALETTE_DUMP_SIZE]; + + void list_remove(TexCacheItem *item); + void list_push_front(TexCacheItem *item); + + void Invalidate(); + void Evict(u32 target); + void Reset(); + + TexCacheItem* GetTexture(TexCache_TexFormat texCacheFormat, u32 texAttributes, u32 palAttributes); +}; + class TexCacheItem { private: @@ -46,65 +74,68 @@ private: void *_deleteCallbackParam2; public: - TexCacheItem() - : decode_len(0) - , decoded(NULL) - , suspectedInvalid(false) - , assumedInvalid(false) - , _deleteCallback(NULL) - , _deleteCallbackParam1(NULL) - , _deleteCallbackParam2(NULL) - , cacheFormat(TexFormat_None) - {} + TexCacheItem(); + ~TexCacheItem(); + + NDSTextureFormat packFormat; + u32 packSize; + u8 *packData; + u16 *paletteColorTable; + + TexCache_TexFormat unpackFormat; + u32 unpackSize; + u32 *unpackData; - ~TexCacheItem() - { - free_aligned(this->decoded); - if (this->_deleteCallback != NULL) this->_deleteCallback(this, this->_deleteCallbackParam1, this->_deleteCallbackParam2); - } - u32 decode_len; - NDSTextureFormat format; - u8* decoded; //decoded texture data bool suspectedInvalid; bool assumedInvalid; TTexCacheItemMultimap::iterator iterator; - NDSTextureFormat GetTextureFormat() const { return this->format; } - - u32 texformat, texpal; - 
u32 sizeX, sizeY; - float invSizeX, invSizeY; - - u32 texid; //used by ogl renderer for the texid - TexCache_TexFormat cacheFormat; - - struct Dump { - ~Dump() { - delete[] texture; - } - int textureSize, indexSize; - static const int maxTextureSize=128*1024; - u8* texture; - u8 palette[256*2]; - } dump; + u32 textureAttributes; + u32 paletteAttributes; + u32 paletteAddress; + u32 paletteSize; + u32 sizeX; + u32 sizeY; + float invSizeX; + float invSizeY; - TexCacheItemDeleteCallback GetDeleteCallback() - { - return this->_deleteCallback; - } + // Only used by 4x4 formatted textures + u8 *packIndexData; + u32 packSizeFirstSlot; + u32 packIndexSize; - void SetDeleteCallback(TexCacheItemDeleteCallback callbackFunc, void *inParam1, void *inParam2) - { - this->_deleteCallback = callbackFunc; - this->_deleteCallbackParam1 = inParam1; - this->_deleteCallbackParam2 = inParam2; - } + // Only used by the OpenGL renderer for the texture ID + u32 texid; + + TexCacheItemDeleteCallback GetDeleteCallback() const; + void SetDeleteCallback(TexCacheItemDeleteCallback callbackFunc, void *inParam1, void *inParam2); + + NDSTextureFormat GetTextureFormat() const; + void SetTextureData(const u32 attr, const MemSpan &packedData, const MemSpan &packedIndexData); + void SetTexturePalette(const u32 attr, const u16 *paletteBuffer); + + template void Unpack(const MemSpan &packedData); + + void DebugDump(); }; -void TexCache_Invalidate(); -void TexCache_Reset(); -void TexCache_EvictFrame(); +// TODO: Delete these MemSpan based functions after testing confirms that using the dumped texture data works properly. 
+template void NDSTextureUnpackI2(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer); +template void NDSTextureUnpackI4(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer); +template void NDSTextureUnpackI8(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer); +template void NDSTextureUnpackA3I5(const MemSpan &ms, const u16 *pal, u32 *dstBuffer); +template void NDSTextureUnpackA5I3(const MemSpan &ms, const u16 *pal, u32 *dstBuffer); +template void NDSTextureUnpack4x4(const MemSpan &ms, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer); +template void NDSTextureUnpackDirect16Bit(const MemSpan &ms, u32 *dstBuffer); -TexCacheItem* TexCache_SetTexture(TexCache_TexFormat TEXFORMAT, u32 format, u32 texpal); +template void NDSTextureUnpackI2(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer); +template void NDSTextureUnpackI4(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer); +template void NDSTextureUnpackI8(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer); +template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer); +template void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer); +template void NDSTextureUnpack4x4(const size_t srcSize, const u8 *srcData, const u8 *srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer); +template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *srcData, u32 *dstBuffer); + +extern TexCache texCache; #endif From 2c82d4b7b41ef2618fc6ff02fac60609f9f9a569 Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 1 Nov 2016 21:07:17 +0000 Subject: [PATCH 33/41] Texture 
Handler: - Texture items in cache are now searched using std::map instead of std::multimap. - Texture item search keys now ignore the render-specific bits of the texture attributes (repeat mode, flip mode, and coordinate transformation mode bits are ignored). This is to help reduce the number of duplicate textures in the cache. - Searching a texture and unpacking a texture are now performed as separate operations. - Texture unpacking functions now use restrict pointers instead of normal pointers. --- desmume/src/OGLRender.cpp | 15 +- desmume/src/OGLRender_3_2.cpp | 7 +- desmume/src/rasterize.cpp | 16 +- desmume/src/texcache.cpp | 330 +++++++++++++++++++--------------- desmume/src/texcache.h | 29 +-- 5 files changed, 237 insertions(+), 160 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index a99db0263..b855baf8f 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -1197,7 +1197,6 @@ OpenGLRenderer_1_2::~OpenGLRenderer_1_2() // Kill the texture cache now before all of our texture IDs disappear. texCache.Reset(); - texCache.Reset(); while(!ref->freeTextureIDs.empty()) { @@ -2968,7 +2967,12 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT glEnable(GL_TEXTURE_2D); } - TexCacheItem *newTexture = texCache.GetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); + TexCacheItem *newTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (newTexture->unpackFormat != TexFormat_32bpp) + { + newTexture->Unpack(); + } + if (newTexture != this->currTexture) { this->currTexture = newTexture; @@ -4619,7 +4623,12 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT glUniform1i(OGLRef.uniformPolyEnableTexture, GL_TRUE); glUniform1i(OGLRef.uniformTexSingleBitAlpha, (params.texFormat != TEXMODE_A3I5 && params.texFormat != TEXMODE_A5I3) ? 
GL_TRUE : GL_FALSE); - TexCacheItem *newTexture = texCache.GetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); + TexCacheItem *newTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (newTexture->unpackFormat != TexFormat_32bpp) + { + newTexture->Unpack(); + } + if (newTexture != this->currTexture) { this->currTexture = newTexture; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 384ac3686..ed7ddc159 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1695,7 +1695,12 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } - TexCacheItem *newTexture = texCache.GetTexture(TexFormat_32bpp, thePoly.texParam, thePoly.texPalette); + TexCacheItem *newTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (newTexture->unpackFormat != TexFormat_32bpp) + { + newTexture->Unpack(); + } + if (newTexture != this->currTexture) { this->currTexture = newTexture; diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 9bff7089d..bb9850f6f 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1373,7 +1373,12 @@ void SoftRasterizerRenderer::setupTextures() const POLY &firstPoly = *firstClippedPoly.poly; u32 lastTexParams = firstPoly.texParam; u32 lastTexPalette = firstPoly.texPalette; - TexCacheItem *lastTexKey = texCache.GetTexture(TexFormat_15bpp, firstPoly.texParam, firstPoly.texPalette); + + TexCacheItem *lastTexItem = texCache.GetTexture(firstPoly.texParam, firstPoly.texPalette); + if (lastTexItem->unpackFormat != TexFormat_15bpp) + { + lastTexItem->Unpack(); + } for (size_t i = 0; i < this->_clippedPolyCount; i++) { @@ -1386,13 +1391,18 @@ void SoftRasterizerRenderer::setupTextures() //and then it won't be safe. 
if (lastTexParams != thePoly.texParam || lastTexPalette != thePoly.texPalette) { - lastTexKey = texCache.GetTexture(TexFormat_15bpp, thePoly.texParam, thePoly.texPalette); + lastTexItem = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (lastTexItem->unpackFormat != TexFormat_15bpp) + { + lastTexItem->Unpack(); + } + lastTexParams = thePoly.texParam; lastTexPalette = thePoly.texPalette; } //printf("%08X %d\n",poly->texParam,rasterizerUnit[0].textures.currentNum); - polyTexKeys[i] = lastTexKey; + polyTexKeys[i] = lastTexItem; } } diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index f02e171e3..68afd4995 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -202,19 +202,22 @@ TexCache texCache; TexCache::TexCache() { + cacheTable.clear(); cache_size = 0; memset(paletteDump, 0, sizeof(paletteDump)); } void TexCache::list_remove(TexCacheItem *item) { - this->index.erase(item->iterator); + const TexCacheKey key = TexCache::GenerateKey(item->textureAttributes, item->paletteAttributes); + this->cacheTable.erase(key); this->cache_size -= item->unpackSize; } void TexCache::list_push_front(TexCacheItem *item) { - item->iterator = this->index.insert(std::make_pair(item->textureAttributes, item)); + const TexCacheKey key = TexCache::GenerateKey(item->textureAttributes, item->paletteAttributes); + this->cacheTable[key] = item; this->cache_size += item->unpackSize; } @@ -224,13 +227,13 @@ void TexCache::Invalidate() //TODO - we should handle this instead by setting dirty flags in the vram memory mapping and noting whether palette memory was dirty. 
//but this will work for now MemSpan mspal = MemSpan_TexPalette(0, PALETTE_DUMP_SIZE, true); - bool paletteDirty = mspal.memcmp(paletteDump); + bool paletteDirty = mspal.memcmp(this->paletteDump); if (paletteDirty) { - mspal.dump(paletteDump); + mspal.dump(this->paletteDump); } - for (TTexCacheItemMultimap::iterator it(this->index.begin()); it != this->index.end(); ++it) + for (TexCacheTable::iterator it(this->cacheTable.begin()); it != this->cacheTable.end(); ++it) { it->second->suspectedInvalid = true; @@ -259,9 +262,9 @@ void TexCache::Evict(u32 target) //TODO - do this based on age and not arbitrarily while (this->cache_size > target) { - if (this->index.size() == 0) break; //just in case.. doesnt seem possible, cache_size wouldve been 0 + if (this->cacheTable.size() == 0) break; //just in case.. doesnt seem possible, cache_size wouldve been 0 - TexCacheItem *item = this->index.begin()->second; + TexCacheItem *item = this->cacheTable.begin()->second; this->list_remove(item); //printf("evicting! 
totalsize:%d\n",cache_size); delete item; @@ -273,15 +276,54 @@ void TexCache::Reset() this->Evict(0); } -TexCacheItem* TexCache::GetTexture(TexCache_TexFormat texCacheFormat, u32 texAttributes, u32 palAttributes) +TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) { //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; - NDSTextureFormat texPackFormat = (NDSTextureFormat)((texAttributes>>26)&0x07); - u32 sizeX = (8 << ((texAttributes>>20)&0x07)); - u32 sizeY = (8 << ((texAttributes>>23)&0x07)); - u32 imageSize = sizeX*sizeY; + bool needLoadTexData = false; + bool needLoadPalette = false; + + //conditions where we reject matches: + //when the teximage or texpal params dont match + //(this is our key for identifying textures in the cache) + TexCacheItem *theTexture = NULL; + const TexCacheKey key = TexCache::GenerateKey(texAttributes, palAttributes); + const TexCacheTable::iterator cachedTexture = this->cacheTable.find(key); + + if (cachedTexture == this->cacheTable.end()) + { + needLoadTexData = true; + needLoadPalette = true; + } + else + { + theTexture = cachedTexture->second; + + //if the texture is assumed invalid, reject it + if (theTexture->assumedInvalid) + { + needLoadTexData = true; + needLoadPalette = true; + } + + //the texture matches params, but isnt suspected invalid. accept it. + if (!theTexture->suspectedInvalid) + { + return theTexture; + } + } + + //we suspect the texture may be invalid. we need to do a byte-for-byte comparison to re-establish that it is valid: + const NDSTextureFormat texPackFormat = (NDSTextureFormat)((texAttributes>>26)&0x07); + const u32 sizeX = (8 << ((texAttributes>>20)&0x07)); + const u32 sizeY = (8 << ((texAttributes>>23)&0x07)); + const u32 imageSize = sizeX*sizeY; + + //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. 
+ //this isnt such a problem with texture memory, because we read sequentially from it. + //however, we read randomly from palette memory, so the mapping is more costly. + const u32 palSize = paletteSizeList[texPackFormat] * sizeof(u16); u32 palAddress; switch (texPackFormat) @@ -301,14 +343,6 @@ TexCacheItem* TexCache::GetTexture(TexCache_TexFormat texCacheFormat, u32 texAtt break; } - //analyze the texture memory mapping and the specifications of this texture - u32 texSize = (imageSize*texSizes[texPackFormat]) >> 2; //shifted because the texSizes multiplier is fixed point - MemSpan currentPackedTexDataMS = MemSpan_TexMem((texAttributes&0xFFFF)<<3, texSize); - - //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. - //this isnt such a problem with texture memory, because we read sequentially from it. - //however, we read randomly from palette memory, so the mapping is more costly. - u32 palSize = paletteSizeList[texPackFormat] * sizeof(u16); MemSpan currentPaletteMS = MemSpan_TexPalette(palAddress, palSize, false); CACHE_ALIGN u16 currentPalette[256]; @@ -318,100 +352,96 @@ TexCacheItem* TexCache::GetTexture(TexCache_TexFormat texCacheFormat, u32 texAtt currentPaletteMS.dump(currentPalette); #endif - //determine the location for 4x4 index data - u32 indexBase; - if ((texAttributes & 0xc000) == 0x8000) indexBase = 0x30000; - else indexBase = 0x20000; + //when the palettes dont match: + //note that we are considering 4x4 textures to have a palette size of 0. 
+ //they really have a potentially HUGE palette, too big for us to handle like a normal palette, + //so they go through a different system + if (theTexture != NULL) + { + if ( (palSize > 0) && memcmp(theTexture->paletteColorTable, currentPalette, palSize) ) + { + needLoadPalette = true; + } + } - u32 indexOffset = (texAttributes & 0x3FFF) << 2; - int indexSize = 0; + //analyze the texture memory mapping and the specifications of this texture + const u32 texSize = (imageSize*texSizes[texPackFormat]) >> 2; //shifted because the texSizes multiplier is fixed point + MemSpan currentPackedTexDataMS = MemSpan_TexMem((texAttributes&0xFFFF)<<3, texSize); + + //when the texture data doesn't match + if (theTexture != NULL) + { + if (currentPackedTexDataMS.memcmp(theTexture->packData, theTexture->packSize)) + { + needLoadTexData = true; + } + } + + //if the texture is 4x4 then the index data must match MemSpan currentPackedTexIndexMS; if (texPackFormat == TEXMODE_4X4) { - indexSize = imageSize >> 3; + //determine the location for 4x4 index data + const u32 indexBase = ((texAttributes & 0xc000) == 0x8000) ? 
0x30000 : 0x20000; + const u32 indexOffset = (texAttributes & 0x3FFF) << 2; + const int indexSize = imageSize >> 3; + currentPackedTexIndexMS = MemSpan_TexMem(indexOffset+indexBase, indexSize); + + if (theTexture != NULL) + { + if (currentPackedTexIndexMS.memcmp(theTexture->packIndexData, theTexture->packIndexSize)) + { + needLoadTexData = true; + needLoadPalette = true; + } + } } - //TODO - as a special optimization, keep the last item returned and check it first - - TexCacheItem *cachedTexture = NULL; - - for(std::pair - iters = index.equal_range(texAttributes); - iters.first != iters.second; - ++iters.first) + if (needLoadTexData || needLoadPalette) { - cachedTexture = iters.first->second; - - //conditions where we reject matches: - //when the teximage or texpal params dont match - //(this is our key for identifying textures in the cache) - //NEW: due to using format as a key we dont need to check this anymore - //if(curr->texAttributes != texAttributes) continue; - if (cachedTexture->paletteAttributes != palAttributes) continue; - - //we're being asked for a different format than what we had cached. - //TODO - this could be done at the entire cache level instead of checking repeatedly - if (cachedTexture->unpackFormat != texCacheFormat) goto REJECT; - - //if the texture is assumed invalid, reject it - if (cachedTexture->assumedInvalid) goto REJECT; - - //the texture matches params, but isnt suspected invalid. accept it. - if (!cachedTexture->suspectedInvalid) return cachedTexture; - - //we suspect the texture may be invalid. we need to do a byte-for-byte comparison to re-establish that it is valid: - - //when the palettes dont match: - //note that we are considering 4x4 textures to have a palette size of 0. 
- //they really have a potentially HUGE palette, too big for us to handle like a normal palette, - //so they go through a different system - if (currentPaletteMS.size != 0 && memcmp(cachedTexture->paletteColorTable, currentPalette, currentPaletteMS.size)) goto REJECT; - - //when the texture data doesn't match - if (currentPackedTexDataMS.memcmp(cachedTexture->packData, cachedTexture->packSize)) goto REJECT; - - //if the texture is 4x4 then the index data must match - if (texPackFormat == TEXMODE_4X4) + if (theTexture != NULL) { - if (currentPackedTexIndexMS.memcmp(cachedTexture->packIndexData, cachedTexture->packIndexSize)) goto REJECT; + //we found a cached item for the current address, but the data is stale. + //for a variety of complicated reasons, we need to throw it out right this instant. + this->list_remove(theTexture); + delete theTexture; + theTexture = NULL; } - //we found a match. just return it - //REMINDER to make it primary/newest when we have smarter code - //list_remove(curr); - //list_push_front(curr); - cachedTexture->suspectedInvalid = false; - return cachedTexture; + //item was not found. recruit an existing one (the oldest), or create a new one + //evict(); //reduce the size of the cache if necessary + //TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs + //to support separate cache and read passes + TexCacheItem *newTexture = new TexCacheItem(); + newTexture->SetTextureData(texAttributes, currentPackedTexDataMS, currentPackedTexIndexMS); + newTexture->SetTexturePalette(palAttributes, currentPalette); - REJECT: - //we found a cached item for the current address, but the data is stale. - //for a variety of complicated reasons, we need to throw it out right this instant. 
- this->list_remove(cachedTexture); - delete cachedTexture; - break; + this->list_push_front(newTexture); + //printf("allocating: up to %d with %d items\n",cache_size,index.size()); + + theTexture = newTexture; } - - //item was not found. recruit an existing one (the oldest), or create a new one - //evict(); //reduce the size of the cache if necessary - //TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs - //to support separate cache and read passes - TexCacheItem *newTexture = new TexCacheItem(); - newTexture->SetTextureData(texAttributes, currentPackedTexDataMS, currentPackedTexIndexMS); - newTexture->SetTexturePalette(palAttributes, currentPalette); - newTexture->unpackFormat = texCacheFormat; - - this->list_push_front(newTexture); - //printf("allocating: up to %d with %d items\n",cache_size,index.size()); - - switch (texCacheFormat) + else { - case TexFormat_32bpp: newTexture->Unpack(currentPackedTexDataMS); break; - case TexFormat_15bpp: newTexture->Unpack(currentPackedTexDataMS); break; - default: assert(false); return NULL; + if (theTexture != NULL) + { + //we found a match. just return it + //REMINDER to make it primary/newest when we have smarter code + //list_remove(curr); + //list_push_front(curr); + theTexture->suspectedInvalid = false; + } } - return newTexture; + return theTexture; +} + +TexCacheKey TexCache::GenerateKey(const u32 texAttributes, const u32 palAttributes) +{ + // Since the repeat, flip, and coordinate transformation modes are render settings + // and not data settings, we can mask out those bits to help reduce duplicate entries. 
+ return (TexCacheKey)( ((u64)palAttributes << 32) | (u64)(texAttributes & 0x3FF0FFFF) ); } TexCacheItem::TexCacheItem() @@ -424,6 +454,7 @@ TexCacheItem::TexCacheItem() packSize = 0; packData = NULL; paletteColorTable = NULL; + isPalZeroTransparent = false; unpackFormat = TexFormat_None; unpackSize = 0; @@ -476,10 +507,8 @@ NDSTextureFormat TexCacheItem::GetTextureFormat() const void TexCacheItem::SetTextureData(const u32 attr, const MemSpan &packedData, const MemSpan &packedIndexData) { - u8 *oldPackData = this->packData; - u32 *oldUnpackData = this->unpackData; - u32 w = (8 << ((attr >> 20) & 0x07)); - u32 h = (8 << ((attr >> 23) & 0x07)); + const u32 w = (8 << ((attr >> 20) & 0x07)); + const u32 h = (8 << ((attr >> 23) & 0x07)); this->textureAttributes = attr; this->packFormat = (NDSTextureFormat)((attr >> 26) & 0x07); @@ -488,67 +517,88 @@ void TexCacheItem::SetTextureData(const u32 attr, const MemSpan &packedData, con this->sizeY = h; this->invSizeX = 1.0f / (float)w; this->invSizeY = 1.0f / (float)h; - this->unpackSize = w * h * sizeof(u32); - this->unpackData = (u32 *)malloc_alignedCacheLine(this->unpackSize); //dump texture and 4x4 index data for cache keying - this->packSize = packedData.size; - this->packIndexSize = packedIndexData.size; - this->packData = (u8 *)malloc_alignedCacheLine(this->packSize); this->packSizeFirstSlot = packedData.items[0].len; + if (this->packSize != packedData.size) + { + u8 *oldPackData = this->packData; + this->packSize = packedData.size; + this->packData = (u8 *)malloc_alignedCacheLine(this->packSize); + free_aligned(oldPackData); + } + packedData.dump(this->packData); - if (this->packFormat == TEXMODE_4X4) + if ( (this->packFormat == TEXMODE_I2) || (this->packFormat == TEXMODE_I4) || (this->packFormat == TEXMODE_I8) ) { - u8 *oldPackIndexData = this->packIndexData; - this->packIndexData = (u8 *)malloc_alignedCacheLine(this->packIndexSize); - packedIndexData.dump(this->packIndexData, this->packIndexSize); - 
free_aligned(oldPackIndexData); + this->isPalZeroTransparent = ( ((attr >> 29) & 1) != 0 ); + } + else + { + this->isPalZeroTransparent = false; + + if (this->packFormat == TEXMODE_4X4) + { + if (this->packIndexSize != packedIndexData.size) + { + u8 *oldPackIndexData = this->packIndexData; + this->packIndexSize = packedIndexData.size; + this->packIndexData = (u8 *)malloc_alignedCacheLine(this->packIndexSize); + free_aligned(oldPackIndexData); + } + + packedIndexData.dump(this->packIndexData, this->packIndexSize); + } } - free_aligned(oldPackData); - free_aligned(oldUnpackData); + const u32 currentUnpackSize = w * h * sizeof(u32); + if (this->unpackSize != currentUnpackSize) + { + u32 *oldUnpackData = this->unpackData; + this->unpackSize = currentUnpackSize; + this->unpackData = (u32 *)malloc_alignedCacheLine(currentUnpackSize); + free_aligned(oldUnpackData); + } } void TexCacheItem::SetTexturePalette(const u32 attr, const u16 *paletteBuffer) { - const u32 oldPaletteSize = this->paletteSize; + const u32 newPaletteSize = paletteSizeList[this->packFormat] * sizeof(u16); this->paletteAttributes = attr; this->paletteAddress = (this->packFormat == TEXMODE_I2) ? 
attr << 3 : attr << 4; - this->paletteSize = paletteSizeList[this->packFormat] * sizeof(u16); - if (this->paletteSize > 0) + if (newPaletteSize > 0) { - if (this->paletteSize != oldPaletteSize) + if (this->paletteSize != newPaletteSize) { u16 *oldPaletteColorTable = this->paletteColorTable; - this->paletteColorTable = (u16 *)malloc_alignedCacheLine(this->paletteSize); - memcpy(this->paletteColorTable, paletteBuffer, this->paletteSize); + this->paletteSize = newPaletteSize; + this->paletteColorTable = (u16 *)malloc_alignedCacheLine(newPaletteSize); free_aligned(oldPaletteColorTable); } - else - { - memcpy(this->paletteColorTable, paletteBuffer, this->paletteSize); - } + + memcpy(this->paletteColorTable, paletteBuffer, newPaletteSize); } else { free_aligned(this->paletteColorTable); + this->paletteSize = 0; this->paletteColorTable = NULL; } } template -void TexCacheItem::Unpack(const MemSpan &packedData) +void TexCacheItem::Unpack() { + this->unpackFormat = TEXCACHEFORMAT; + // Whenever a 1-bit alpha or no-alpha texture is unpacked (this means any texture // format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D // renderers can assume that the transparent color is 0 during texture sampling. 
- bool isPalZeroTransparent; - switch (this->packFormat) { case TEXMODE_A3I5: @@ -556,18 +606,15 @@ void TexCacheItem::Unpack(const MemSpan &packedData) break; case TEXMODE_I2: - isPalZeroTransparent = ( ((this->textureAttributes >> 29) & 1) != 0 ); - NDSTextureUnpackI2(this->packSize, this->packData, this->paletteColorTable, isPalZeroTransparent, this->unpackData); + NDSTextureUnpackI2(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, this->unpackData); break; case TEXMODE_I4: - isPalZeroTransparent = ( ((this->textureAttributes >> 29) & 1) != 0 ); - NDSTextureUnpackI4(this->packSize, this->packData, this->paletteColorTable, isPalZeroTransparent, this->unpackData); + NDSTextureUnpackI4(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, this->unpackData); break; case TEXMODE_I8: - isPalZeroTransparent = ( ((this->textureAttributes >> 29) & 1) != 0 ); - NDSTextureUnpackI8(this->packSize, this->packData, this->paletteColorTable, isPalZeroTransparent, this->unpackData); + NDSTextureUnpackI8(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, this->unpackData); break; case TEXMODE_4X4: @@ -1149,7 +1196,7 @@ void NDSTextureUnpackDirect16Bit(const MemSpan &ms, u32 *dstBuffer) } template -void NDSTextureUnpackI2(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) +void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { #ifdef ENABLE_SSSE3 const __m128i pal_vec128 = _mm_loadl_epi64((__m128i *)srcPal); @@ -1264,7 +1311,7 @@ void NDSTextureUnpackI2(const size_t srcSize, const u8 *srcData, const u16 *srcP } template -void NDSTextureUnpackI4(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) +void NDSTextureUnpackI4(const size_t srcSize, const u8 
*__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { #ifdef ENABLE_SSSE3 const __m128i palLo = _mm_load_si128((__m128i *)srcPal + 0); @@ -1382,7 +1429,7 @@ void NDSTextureUnpackI4(const size_t srcSize, const u8 *srcData, const u16 *srcP } template -void NDSTextureUnpackI8(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) +void NDSTextureUnpackI8(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { if (isPalZeroTransparent) { @@ -1402,7 +1449,7 @@ void NDSTextureUnpackI8(const size_t srcSize, const u8 *srcData, const u16 *srcP } template -void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer) +void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer) { for (size_t i = 0; i < srcSize; i++, srcData++) { @@ -1413,7 +1460,7 @@ void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *srcData, const u16 *sr } template -void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer) +void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer) { #ifdef ENABLE_SSSE3 const __m128i pal_vec128 = _mm_load_si128((__m128i *)srcPal); @@ -1477,7 +1524,7 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *srcData, const u16 *sr } template -void NDSTextureUnpack4x4(const size_t srcSize, const u8 *srcData, const u8 *srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer) +void NDSTextureUnpack4x4(const size_t srcSize, const u8 *__restrict srcData, const u8 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer) { 
u16 *slot1; u32 *map = (u32 *)srcData; @@ -1619,7 +1666,7 @@ void NDSTextureUnpack4x4(const size_t srcSize, const u8 *srcData, const u8 *srcI } template -void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *srcData, u32 *dstBuffer) +void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *__restrict srcData, u32 *__restrict dstBuffer) { const u16 *srcData16 = (const u16 *)srcData; const size_t pixCount = srcSize >> 1; @@ -1656,3 +1703,6 @@ void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *srcData, u32 *d *dstBuffer++ = (c & 0x8000) ? CONVERT(c & 0x7FFF) : 0; } } + +template void TexCacheItem::Unpack(); +template void TexCacheItem::Unpack(); diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index 643bb97cb..da2b1dca5 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -44,7 +44,8 @@ enum TexCache_TexFormat class MemSpan; class TexCacheItem; -typedef std::multimap TTexCacheItemMultimap; +typedef u64 TexCacheKey; +typedef std::map TexCacheTable; typedef void (*TexCacheItemDeleteCallback)(TexCacheItem *texItem, void *param1, void *param2); class TexCache @@ -52,7 +53,7 @@ class TexCache public: TexCache(); - TTexCacheItemMultimap index; + TexCacheTable cacheTable; u32 cache_size; //this is not really precise, it is off by a constant factor u8 paletteDump[PALETTE_DUMP_SIZE]; @@ -63,7 +64,9 @@ public: void Evict(u32 target); void Reset(); - TexCacheItem* GetTexture(TexCache_TexFormat texCacheFormat, u32 texAttributes, u32 palAttributes); + TexCacheItem* GetTexture(u32 texAttributes, u32 palAttributes); + + static TexCacheKey GenerateKey(const u32 texAttributes, const u32 palAttributes); }; class TexCacheItem @@ -81,6 +84,7 @@ public: u32 packSize; u8 *packData; u16 *paletteColorTable; + bool isPalZeroTransparent; TexCache_TexFormat unpackFormat; u32 unpackSize; @@ -88,8 +92,7 @@ public: bool suspectedInvalid; bool assumedInvalid; - TTexCacheItemMultimap::iterator iterator; - + u32 textureAttributes; u32 
paletteAttributes; u32 paletteAddress; @@ -114,7 +117,7 @@ public: void SetTextureData(const u32 attr, const MemSpan &packedData, const MemSpan &packedIndexData); void SetTexturePalette(const u32 attr, const u16 *paletteBuffer); - template void Unpack(const MemSpan &packedData); + template void Unpack(); void DebugDump(); }; @@ -128,13 +131,13 @@ template void NDSTextureUnpackA5I3(const MemS template void NDSTextureUnpack4x4(const MemSpan &ms, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer); template void NDSTextureUnpackDirect16Bit(const MemSpan &ms, u32 *dstBuffer); -template void NDSTextureUnpackI2(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer); -template void NDSTextureUnpackI4(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer); -template void NDSTextureUnpackI8(const size_t srcSize, const u8 *srcData, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer); -template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer); -template void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *srcData, const u16 *srcPal, u32 *dstBuffer); -template void NDSTextureUnpack4x4(const size_t srcSize, const u8 *srcData, const u8 *srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer); -template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *srcData, u32 *dstBuffer); +template void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); +template void NDSTextureUnpackI4(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); +template void NDSTextureUnpackI8(const size_t srcSize, const u8 
*__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); +template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); +template void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); +template void NDSTextureUnpack4x4(const size_t srcSize, const u8 *__restrict srcData, const u8 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer); +template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *__restrict srcData, u32 *__restrict dstBuffer); extern TexCache texCache; From df60214b26e565de08ad4c1c5704e2cd397a11ed Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 2 Nov 2016 07:25:11 +0000 Subject: [PATCH 34/41] Texture Handler: - Rework TexCacheItem::GetTexture() so that instantiating a new object, dumping the packed data, and dumping the palette are performed as separate operations. - Invalid OpenGL textures are now updated instead of being completely replaced. - NDSTextureUnpack4x4() now uses the srcIndex pointer parameter instead of recalculating the palette address. - Delete the now obsolete MemSpan-based texture unpacking functions. 
--- desmume/src/OGLRender.cpp | 396 ++++++++------- desmume/src/OGLRender.h | 5 +- desmume/src/OGLRender_3_2.cpp | 180 ++++--- desmume/src/texcache.cpp | 878 ++++++---------------------------- desmume/src/texcache.h | 50 +- 5 files changed, 502 insertions(+), 1007 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index b855baf8f..07fb03899 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -871,7 +871,6 @@ OpenGLRenderer::OpenGLRenderer() ref->fboPostprocessID = 0; ref->selectedRenderingFBO = 0; - currTexture = NULL; _mappedFramebuffer = NULL; _pixelReadNeedsFinish = false; _currentPolyIndex = 0; @@ -2418,10 +2417,6 @@ Render3DError OpenGLRenderer_1_2::ReadBackPixels() Render3DError OpenGLRenderer_1_2::DeleteTexture(const TexCacheItem *item) { this->ref->freeTextureIDs.push((GLuint)item->texid); - if(this->currTexture == item) - { - this->currTexture = NULL; - } return OGLERROR_NOERR; } @@ -2956,125 +2951,149 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } + TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + // Enable textures if they weren't already enabled if (this->isShaderSupported) { glUniform1i(OGLRef.uniformPolyEnableTexture, GL_TRUE); - glUniform1i(OGLRef.uniformTexSingleBitAlpha, (params.texFormat != TEXMODE_A3I5 && params.texFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); + glUniform1i(OGLRef.uniformTexSingleBitAlpha, (theTexture->packFormat != TEXMODE_A3I5 && theTexture->packFormat != TEXMODE_A5I3) ? 
GL_TRUE : GL_FALSE); + glUniform2f(OGLRef.uniformPolyTexScale, theTexture->invSizeX, theTexture->invSizeY); } else { glEnable(GL_TEXTURE_2D); + glMatrixMode(GL_TEXTURE); + glLoadIdentity(); + glScalef(theTexture->invSizeX, theTexture->invSizeY, 1.0f); } - TexCacheItem *newTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); - if (newTexture->unpackFormat != TexFormat_32bpp) + if (theTexture->unpackFormat != TexFormat_32bpp) { - newTexture->Unpack(); - } - - if (newTexture != this->currTexture) - { - this->currTexture = newTexture; + theTexture->Unpack(); + //has the ogl renderer initialized the texture? - if (this->currTexture->GetDeleteCallback() == NULL) + const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); + if (isNewTexture) { - this->currTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); + theTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); if (OGLRef.freeTextureIDs.empty()) { this->ExpandFreeTextures(); } - this->currTexture->texid = (u32)OGLRef.freeTextureIDs.front(); + theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); - - glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - - const NDSTextureFormat texFormat = this->currTexture->GetTextureFormat(); - const u32 *textureSrc = this->currTexture->unpackData; - size_t texWidth = this->currTexture->sizeX; - size_t texHeight = this->currTexture->sizeY; - - if (this->_textureDeposterizeDstSurface.Surface != NULL) - { - this->TextureDeposterize(textureSrc, texWidth, texHeight); - textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; - } - - switch (this->_textureScalingFactor) - { - case 1: - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; - - case 2: - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); - - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, this->currTexture->sizeX, this->currTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; - } - - case 4: - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); - - this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - texWidth = this->currTexture->sizeX; - texHeight = this->currTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, this->currTexture->sizeX, this->currTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_BYTE, 
textureSrc); - break; - } - - default: - break; - } - - if (this->_textureSmooth) - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, (this->_textureScalingFactor > 1) ? GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, this->_deviceInfo.maxAnisotropy); - } - else - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); - } - } - else - { - //otherwise, just bind it - glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); } - if (this->isShaderSupported) + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + + const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); + const u32 *textureSrc = theTexture->unpackData; + size_t texWidth = theTexture->sizeX; + size_t texHeight = theTexture->sizeY; + + if (this->_textureDeposterizeDstSurface.Surface != NULL) { - glUniform2f(OGLRef.uniformPolyTexScale, this->currTexture->invSizeX, this->currTexture->invSizeY); + this->TextureDeposterize(textureSrc, texWidth, texHeight); + textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; + } + + switch (this->_textureScalingFactor) + { + case 1: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; + } + + case 2: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); + + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; + } + + case 4: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); + + this->TextureUpscale<4>(texFormat, 
textureSrc, texWidth, texHeight); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + texWidth = theTexture->sizeX; + texHeight = theTexture->sizeY; + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + texWidth = theTexture->sizeX; + texHeight = theTexture->sizeY; + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; + } + + default: + break; + } + + if (this->_textureSmooth) + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, (this->_textureScalingFactor > 1) ? 
GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, this->_deviceInfo.maxAnisotropy); } else { - glMatrixMode(GL_TEXTURE); - glLoadIdentity(); - glScalef(this->currTexture->invSizeX, this->currTexture->invSizeY, 1.0f); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); } } + else + { + //otherwise, just bind it + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); + } return OGLERROR_NOERR; } @@ -3126,7 +3145,6 @@ Render3DError OpenGLRenderer_1_2::Reset() memset(OGLRef.vertIndexBuffer, 0, OGLRENDER_VERT_INDEX_BUFFER_COUNT * sizeof(GLushort)); } - this->currTexture = NULL; this->_currentPolyIndex = 0; OGLRef.vtxPtrPosition = (GLvoid *)offsetof(VERT, coord); @@ -4620,107 +4638,135 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } + TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + glUniform1i(OGLRef.uniformPolyEnableTexture, GL_TRUE); - glUniform1i(OGLRef.uniformTexSingleBitAlpha, (params.texFormat != TEXMODE_A3I5 && params.texFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); + glUniform1i(OGLRef.uniformTexSingleBitAlpha, (theTexture->packFormat != TEXMODE_A3I5 && theTexture->packFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); + glUniform2f(OGLRef.uniformPolyTexScale, theTexture->invSizeX, theTexture->invSizeY); - TexCacheItem *newTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); - if (newTexture->unpackFormat != TexFormat_32bpp) + if (theTexture->unpackFormat != TexFormat_32bpp) { - newTexture->Unpack(); - } - - if (newTexture != this->currTexture) - { - this->currTexture = newTexture; + theTexture->Unpack(); + //has the ogl renderer initialized the texture? 
- if (this->currTexture->GetDeleteCallback() == NULL) + const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); + if (isNewTexture) { - this->currTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); + theTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); if (OGLRef.freeTextureIDs.empty()) { this->ExpandFreeTextures(); } - this->currTexture->texid = (u32)OGLRef.freeTextureIDs.front(); + theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); - - glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - - const NDSTextureFormat texFormat = this->currTexture->GetTextureFormat(); - const u32 *textureSrc = this->currTexture->unpackData; - size_t texWidth = this->currTexture->sizeX; - size_t texHeight = this->currTexture->sizeY; - - if (this->_textureDeposterizeDstSurface.Surface != NULL) + } + + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + + const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); + const u32 *textureSrc = theTexture->unpackData; + size_t texWidth = theTexture->sizeX; + size_t texHeight = theTexture->sizeY; + + if (this->_textureDeposterizeDstSurface.Surface != NULL) + { + this->TextureDeposterize(textureSrc, texWidth, texHeight); + textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; + } + + switch (this->_textureScalingFactor) + { + case 1: { - this->TextureDeposterize(textureSrc, texWidth, texHeight); - textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; - } - - switch (this->_textureScalingFactor) - { - case 1: - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; - - case 2: + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); + + if (isNewTexture) { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); - - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, this->currTexture->sizeX, this->currTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } - - case 4: + else { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); - - this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, 
this->_textureUpscaleBuffer); - - texWidth = this->currTexture->sizeX; - texHeight = this->currTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, this->currTexture->sizeX, this->currTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } + break; + } + + case 2: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; + } + + case 4: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); + + this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - default: - break; - } - - if (this->_textureSmooth) - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, (this->_textureScalingFactor > 1) ? 
GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, this->_deviceInfo.maxAnisotropy); - } - else - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); + texWidth = theTexture->sizeX; + texHeight = theTexture->sizeY; + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + texWidth = theTexture->sizeX; + texHeight = theTexture->sizeY; + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; } + + default: + break; + } + + if (this->_textureSmooth) + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, (this->_textureScalingFactor > 1) ? 
GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, this->_deviceInfo.maxAnisotropy); } else { - //otherwise, just bind it - glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); } - - glUniform2f(OGLRef.uniformPolyTexScale, this->currTexture->invSizeX, this->currTexture->invSizeY); + } + else + { + //otherwise, just bind it + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); } return OGLERROR_NOERR; diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index a5cbf305d..9ede77cfc 100644 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -587,10 +587,7 @@ protected: bool isVAOSupported; bool willFlipFramebufferOnGPU; bool willConvertFramebufferOnGPU; - - // Textures - TexCacheItem *currTexture; - + FragmentColor *_mappedFramebuffer; bool _pixelReadNeedsFinish; size_t _currentPolyIndex; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index ed7ddc159..84acb6a13 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1695,103 +1695,133 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } - TexCacheItem *newTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); - if (newTexture->unpackFormat != TexFormat_32bpp) + TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (theTexture->unpackFormat != TexFormat_32bpp) { - newTexture->Unpack(); - } - - if (newTexture != this->currTexture) - { - this->currTexture = newTexture; + theTexture->Unpack(); + //has the ogl renderer initialized the texture? 
- if (this->currTexture->GetDeleteCallback() == NULL) + const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); + if (isNewTexture) { - this->currTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); + theTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); if (OGLRef.freeTextureIDs.empty()) { this->ExpandFreeTextures(); } - this->currTexture->texid = (u32)OGLRef.freeTextureIDs.front(); + theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); - - glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - - const NDSTextureFormat texFormat = this->currTexture->GetTextureFormat(); - const u32 *textureSrc = this->currTexture->unpackData; - size_t texWidth = this->currTexture->sizeX; - size_t texHeight = this->currTexture->sizeY; - - if (this->_textureDeposterizeDstSurface.Surface != NULL) + } + + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + + const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); + const u32 *textureSrc = theTexture->unpackData; + size_t texWidth = theTexture->sizeX; + size_t texHeight = theTexture->sizeY; + + if (this->_textureDeposterizeDstSurface.Surface != NULL) + { + this->TextureDeposterize(textureSrc, texWidth, texHeight); + textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; + } + + switch (this->_textureScalingFactor) + { + case 1: { - this->TextureDeposterize(textureSrc, texWidth, texHeight); - textureSrc = (u32 *)this->_textureDeposterizeDstSurface.Surface; - } - - switch (this->_textureScalingFactor) - { - case 1: - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; - - case 2: + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); + + if (isNewTexture) { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); - - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, this->currTexture->sizeX, this->currTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } - - case 4: + else { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); - - this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, 
this->_textureUpscaleBuffer); - - texWidth = this->currTexture->sizeX; - texHeight = this->currTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, this->_textureUpscaleBuffer); - - glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, this->currTexture->sizeX, this->currTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_BYTE, textureSrc); - break; + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } + break; + } + + case 2: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); + + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; + } + + case 4: + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); + + this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); + + if (isNewTexture) + { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - default: - break; - } - - if (this->_textureSmooth) - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, (this->_textureScalingFactor > 1) ? 
GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, this->_deviceInfo.maxAnisotropy); - } - else - { - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); + texWidth = theTexture->sizeX; + texHeight = theTexture->sizeY; + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + else + { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + texWidth = theTexture->sizeX; + texHeight = theTexture->sizeY; + this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); + + glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + } + break; } + + default: + break; + } + + if (this->_textureSmooth) + { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, (this->_textureScalingFactor > 1) ? 
GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, this->_deviceInfo.maxAnisotropy); } else { - //otherwise, just bind it - glBindTexture(GL_TEXTURE_2D, (GLuint)this->currTexture->texid); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); } } + else + { + //otherwise, just bind it + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); + } return OGLERROR_NOERR; } diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 68afd4995..2df64d257 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -195,9 +195,6 @@ static MemSpan MemSpan_TexPalette(u32 ofs, u32 len, bool silent) return ret; } -//for each texformat, number of palette entries -static const u32 paletteSizeList[] = {0, 32, 4, 16, 256, 0, 8, 0}; - TexCache texCache; TexCache::TexCache() @@ -278,21 +275,21 @@ void TexCache::Reset() TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) { - //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) - static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; - + TexCacheItem *theTexture = NULL; + bool didCreateNewTexture = false; bool needLoadTexData = false; bool needLoadPalette = false; //conditions where we reject matches: //when the teximage or texpal params dont match //(this is our key for identifying textures in the cache) - TexCacheItem *theTexture = NULL; const TexCacheKey key = TexCache::GenerateKey(texAttributes, palAttributes); const TexCacheTable::iterator cachedTexture = this->cacheTable.find(key); if (cachedTexture == this->cacheTable.end()) { + theTexture = new TexCacheItem(texAttributes, palAttributes); + didCreateNewTexture = true; needLoadTexData = true; needLoadPalette = true; } @@ -315,35 +312,11 @@ 
TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) } //we suspect the texture may be invalid. we need to do a byte-for-byte comparison to re-establish that it is valid: - const NDSTextureFormat texPackFormat = (NDSTextureFormat)((texAttributes>>26)&0x07); - const u32 sizeX = (8 << ((texAttributes>>20)&0x07)); - const u32 sizeY = (8 << ((texAttributes>>23)&0x07)); - const u32 imageSize = sizeX*sizeY; //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. //this isnt such a problem with texture memory, because we read sequentially from it. //however, we read randomly from palette memory, so the mapping is more costly. - const u32 palSize = paletteSizeList[texPackFormat] * sizeof(u16); - u32 palAddress; - - switch (texPackFormat) - { - case TEXMODE_I2: - palAddress = palAttributes << 3; - break; - - case TEXMODE_A3I5: - case TEXMODE_I4: - case TEXMODE_I8: - case TEXMODE_A5I3: - case TEXMODE_16BPP: - case TEXMODE_4X4: - default: - palAddress = palAttributes << 4; - break; - } - - MemSpan currentPaletteMS = MemSpan_TexPalette(palAddress, palSize, false); + MemSpan currentPaletteMS = MemSpan_TexPalette(theTexture->paletteAddress, theTexture->paletteSize, false); CACHE_ALIGN u16 currentPalette[256]; #ifdef WORDS_BIGENDIAN @@ -356,84 +329,61 @@ TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) //note that we are considering 4x4 textures to have a palette size of 0. 
//they really have a potentially HUGE palette, too big for us to handle like a normal palette, //so they go through a different system - if (theTexture != NULL) + if ( !didCreateNewTexture && (theTexture->paletteSize > 0) && memcmp(theTexture->paletteColorTable, currentPalette, theTexture->paletteSize) ) { - if ( (palSize > 0) && memcmp(theTexture->paletteColorTable, currentPalette, palSize) ) - { - needLoadPalette = true; - } + needLoadPalette = true; } //analyze the texture memory mapping and the specifications of this texture - const u32 texSize = (imageSize*texSizes[texPackFormat]) >> 2; //shifted because the texSizes multiplier is fixed point - MemSpan currentPackedTexDataMS = MemSpan_TexMem((texAttributes&0xFFFF)<<3, texSize); + MemSpan currentPackedTexDataMS = MemSpan_TexMem(theTexture->packAddress, theTexture->packSize); //when the texture data doesn't match - if (theTexture != NULL) + if ( !didCreateNewTexture && (theTexture->packSize > 0) && currentPackedTexDataMS.memcmp(theTexture->packData, theTexture->packSize) ) { - if (currentPackedTexDataMS.memcmp(theTexture->packData, theTexture->packSize)) - { - needLoadTexData = true; - } + needLoadTexData = true; } //if the texture is 4x4 then the index data must match MemSpan currentPackedTexIndexMS; - if (texPackFormat == TEXMODE_4X4) + if (theTexture->packFormat == TEXMODE_4X4) { //determine the location for 4x4 index data - const u32 indexBase = ((texAttributes & 0xc000) == 0x8000) ? 
0x30000 : 0x20000; - const u32 indexOffset = (texAttributes & 0x3FFF) << 2; - const int indexSize = imageSize >> 3; + currentPackedTexIndexMS = MemSpan_TexMem(theTexture->packIndexAddress, theTexture->packIndexSize); - currentPackedTexIndexMS = MemSpan_TexMem(indexOffset+indexBase, indexSize); - - if (theTexture != NULL) + if ( !didCreateNewTexture && (theTexture->packIndexSize > 0) && currentPackedTexIndexMS.memcmp(theTexture->packIndexData, theTexture->packIndexSize) ) { - if (currentPackedTexIndexMS.memcmp(theTexture->packIndexData, theTexture->packIndexSize)) - { - needLoadTexData = true; - needLoadPalette = true; - } + needLoadTexData = true; + needLoadPalette = true; } } - if (needLoadTexData || needLoadPalette) + if (!needLoadTexData && !needLoadPalette) { - if (theTexture != NULL) - { - //we found a cached item for the current address, but the data is stale. - //for a variety of complicated reasons, we need to throw it out right this instant. - this->list_remove(theTexture); - delete theTexture; - theTexture = NULL; - } - - //item was not found. recruit an existing one (the oldest), or create a new one - //evict(); //reduce the size of the cache if necessary - //TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs - //to support separate cache and read passes - TexCacheItem *newTexture = new TexCacheItem(); - newTexture->SetTextureData(texAttributes, currentPackedTexDataMS, currentPackedTexIndexMS); - newTexture->SetTexturePalette(palAttributes, currentPalette); - - this->list_push_front(newTexture); + //we found a match. 
just return it + theTexture->suspectedInvalid = false; + return theTexture; + } + + if (needLoadTexData) + { + theTexture->SetTextureData(currentPackedTexDataMS, currentPackedTexIndexMS); + theTexture->unpackFormat = TexFormat_None; + } + + if (needLoadPalette) + { + theTexture->SetTexturePalette(currentPalette); + theTexture->unpackFormat = TexFormat_None; + } + + if (didCreateNewTexture) + { + this->list_push_front(theTexture); //printf("allocating: up to %d with %d items\n",cache_size,index.size()); - - theTexture = newTexture; - } - else - { - if (theTexture != NULL) - { - //we found a match. just return it - //REMINDER to make it primary/newest when we have smarter code - //list_remove(curr); - //list_push_front(curr); - theTexture->suspectedInvalid = false; - } } + theTexture->assumedInvalid = false; + theTexture->suspectedInvalid = false; return theTexture; } @@ -450,35 +400,112 @@ TexCacheItem::TexCacheItem() _deleteCallbackParam1 = NULL; _deleteCallbackParam2 = NULL; + textureAttributes = 0; + paletteAttributes = 0; + + sizeX = 0; + sizeY = 0; + invSizeX = 0.0f; + invSizeY = 0.0f; + isPalZeroTransparent = false; + + suspectedInvalid = false; + assumedInvalid = false; + packFormat = TEXMODE_NONE; + packAddress = 0; packSize = 0; packData = NULL; + + paletteAddress = 0; + paletteSize = 0; paletteColorTable = NULL; - isPalZeroTransparent = false; unpackFormat = TexFormat_None; unpackSize = 0; unpackData = NULL; - suspectedInvalid = false; - assumedInvalid = false; - - textureAttributes = 0; - paletteAttributes = 0; - paletteAddress = 0; - paletteSize = 0; - sizeX = 0; - sizeY = 0; - invSizeX = 0.0f; - invSizeY = 0.0f; - + packIndexAddress = 0; + packIndexSize = 0; packIndexData = NULL; packSizeFirstSlot = 0; - packIndexSize = 0; texid = 0; } +TexCacheItem::TexCacheItem(const u32 texAttributes, const u32 palAttributes) +{ + //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) + static const u32 texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; + 
+ //for each texformat, number of palette entries + static const u32 paletteSizeList[] = {0, 32, 4, 16, 256, 0, 8, 0}; + + _deleteCallback = NULL; + _deleteCallbackParam1 = NULL; + _deleteCallbackParam2 = NULL; + + texid = 0; + + textureAttributes = texAttributes; + paletteAttributes = palAttributes; + + sizeX = (8 << ((texAttributes >> 20) & 0x07)); + sizeY = (8 << ((texAttributes >> 23) & 0x07)); + invSizeX = 1.0f / (float)sizeX; + invSizeY = 1.0f / (float)sizeY; + + packFormat = (NDSTextureFormat)((texAttributes >> 26) & 0x07); + packAddress = (texAttributes & 0xFFFF) << 3; + packSize = (sizeX*sizeY*texSizes[packFormat]) >> 2; //shifted because the texSizes multiplier is fixed point + packData = (u8 *)malloc_alignedCacheLine(packSize); + + if ( (packFormat == TEXMODE_I2) || (packFormat == TEXMODE_I4) || (packFormat == TEXMODE_I8) ) + { + isPalZeroTransparent = ( ((texAttributes >> 29) & 1) != 0 ); + } + else + { + isPalZeroTransparent = false; + } + + paletteSize = paletteSizeList[packFormat] * sizeof(u16); + if (paletteSize > 0) + { + paletteAddress = (packFormat == TEXMODE_I2) ? palAttributes << 3 : palAttributes << 4; + paletteColorTable = (u16 *)malloc_alignedCacheLine(paletteSize); + } + else + { + paletteAddress = 0; + paletteColorTable = NULL; + } + + unpackFormat = TexFormat_None; + unpackSize = 0; + unpackData = NULL; + + if (packFormat == TEXMODE_4X4) + { + const u32 indexBase = ((texAttributes & 0xC000) == 0x8000) ? 
0x30000 : 0x20000; + const u32 indexOffset = (texAttributes & 0x3FFF) << 2; + packIndexAddress = indexBase + indexOffset; + packIndexSize = (sizeX * sizeY) >> 3; + packIndexData = (u8 *)malloc_alignedCacheLine(packIndexSize); + packSizeFirstSlot = 0; + } + else + { + packIndexAddress = 0; + packIndexSize = 0; + packIndexData = NULL; + packSizeFirstSlot = 0; + } + + suspectedInvalid = true; + assumedInvalid = true; +} + TexCacheItem::~TexCacheItem() { free_aligned(this->packData); @@ -505,55 +532,19 @@ NDSTextureFormat TexCacheItem::GetTextureFormat() const return this->packFormat; } -void TexCacheItem::SetTextureData(const u32 attr, const MemSpan &packedData, const MemSpan &packedIndexData) +void TexCacheItem::SetTextureData(const MemSpan &packedData, const MemSpan &packedIndexData) { - const u32 w = (8 << ((attr >> 20) & 0x07)); - const u32 h = (8 << ((attr >> 23) & 0x07)); - - this->textureAttributes = attr; - this->packFormat = (NDSTextureFormat)((attr >> 26) & 0x07); - - this->sizeX = w; - this->sizeY = h; - this->invSizeX = 1.0f / (float)w; - this->invSizeY = 1.0f / (float)h; - //dump texture and 4x4 index data for cache keying this->packSizeFirstSlot = packedData.items[0].len; - if (this->packSize != packedData.size) - { - u8 *oldPackData = this->packData; - this->packSize = packedData.size; - this->packData = (u8 *)malloc_alignedCacheLine(this->packSize); - free_aligned(oldPackData); - } - packedData.dump(this->packData); - if ( (this->packFormat == TEXMODE_I2) || (this->packFormat == TEXMODE_I4) || (this->packFormat == TEXMODE_I8) ) + if (this->packFormat == TEXMODE_4X4) { - this->isPalZeroTransparent = ( ((attr >> 29) & 1) != 0 ); - } - else - { - this->isPalZeroTransparent = false; - - if (this->packFormat == TEXMODE_4X4) - { - if (this->packIndexSize != packedIndexData.size) - { - u8 *oldPackIndexData = this->packIndexData; - this->packIndexSize = packedIndexData.size; - this->packIndexData = (u8 *)malloc_alignedCacheLine(this->packIndexSize); - 
free_aligned(oldPackIndexData); - } - - packedIndexData.dump(this->packIndexData, this->packIndexSize); - } + packedIndexData.dump(this->packIndexData, this->packIndexSize); } - const u32 currentUnpackSize = w * h * sizeof(u32); + const u32 currentUnpackSize = this->sizeX * this->sizeY * sizeof(u32); if (this->unpackSize != currentUnpackSize) { u32 *oldUnpackData = this->unpackData; @@ -563,30 +554,11 @@ void TexCacheItem::SetTextureData(const u32 attr, const MemSpan &packedData, con } } -void TexCacheItem::SetTexturePalette(const u32 attr, const u16 *paletteBuffer) +void TexCacheItem::SetTexturePalette(const u16 *paletteBuffer) { - const u32 newPaletteSize = paletteSizeList[this->packFormat] * sizeof(u16); - - this->paletteAttributes = attr; - this->paletteAddress = (this->packFormat == TEXMODE_I2) ? attr << 3 : attr << 4; - - if (newPaletteSize > 0) + if (this->paletteSize > 0) { - if (this->paletteSize != newPaletteSize) - { - u16 *oldPaletteColorTable = this->paletteColorTable; - this->paletteSize = newPaletteSize; - this->paletteColorTable = (u16 *)malloc_alignedCacheLine(newPaletteSize); - free_aligned(oldPaletteColorTable); - } - - memcpy(this->paletteColorTable, paletteBuffer, newPaletteSize); - } - else - { - free_aligned(this->paletteColorTable); - this->paletteSize = 0; - this->paletteColorTable = NULL; + memcpy(this->paletteColorTable, paletteBuffer, this->paletteSize); } } @@ -624,7 +596,7 @@ void TexCacheItem::Unpack() PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); } - NDSTextureUnpack4x4(this->packSizeFirstSlot, this->packData, this->packIndexData, this->paletteAddress, this->textureAttributes, this->sizeX, this->sizeY, this->unpackData); + NDSTextureUnpack4x4(this->packSizeFirstSlot, (u32 *)this->packData, (u16 *)this->packIndexData, this->paletteAddress, this->textureAttributes, this->sizeX, this->sizeY, this->unpackData); break; } @@ -633,7 +605,7 @@ void TexCacheItem::Unpack() break; case TEXMODE_16BPP: - 
NDSTextureUnpackDirect16Bit(this->packSize, this->packData, this->unpackData); + NDSTextureUnpackDirect16Bit(this->packSize, (u16 *)this->packData, this->unpackData); break; default: @@ -657,544 +629,6 @@ void TexCacheItem::DebugDump() } #endif -// TODO: Delete these MemSpan based functions after testing confirms that using the dumped texture data works properly. -template -void NDSTextureUnpackI2(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer) -{ - u8 *adr; - -#ifdef ENABLE_SSSE3 - const __m128i pal_vec128 = _mm_loadl_epi64((__m128i *)pal); -#endif - if (isPalZeroTransparent) - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dstBuffer+=16) - { - __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); - idx = _mm_slli_epi16(idx, 1); - - __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); - const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - - __m128i convertedColor[4]; - - if (TEXCACHEFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], 
convertedColor[3]); - } - - // Set converted colors to 0 if the palette index is 0. - idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); - idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); - convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); - convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); - convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); - convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); - - _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - u8 idx; - - idx = *adr & 0x03; - *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = (*adr >> 2) & 0x03; - *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = (*adr >> 4) & 0x03; - *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = (*adr >> 6) & 0x03; - *dstBuffer++ = (idx == 0) ? 
0 : CONVERT(pal[idx] & 0x7FFF); - } -#endif - } - } - else - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=4, adr+=4, dstBuffer+=16) - { - __m128i idx = _mm_set_epi32(0, 0, 0, *(u32 *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi32(0x00000003)), _mm_and_si128(_mm_srli_epi32(idx, 2), _mm_set1_epi32(0x00000300)) ), _mm_and_si128(_mm_srli_epi32(idx, 4), _mm_set1_epi32(0x00030000)) ), _mm_and_si128(_mm_srli_epi32(idx, 6), _mm_set1_epi32(0x03000000)) ); - idx = _mm_slli_epi16(idx, 1); - - const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); - const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - - __m128i convertedColor[4]; - - if (TEXCACHEFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - - _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dstBuffer++ = CONVERT(pal[ *adr & 0x03] & 0x7FFF); - *dstBuffer++ = CONVERT(pal[(*adr >> 2) & 0x03] & 0x7FFF); - *dstBuffer++ = CONVERT(pal[(*adr >> 4) & 0x03] & 0x7FFF); - *dstBuffer++ = 
CONVERT(pal[(*adr >> 6) & 0x03] & 0x7FFF); - } -#endif - } - } -} - -template -void NDSTextureUnpackI4(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer) -{ - u8 *adr; - -#ifdef ENABLE_SSSE3 - const __m128i palLo = _mm_load_si128((__m128i *)pal + 0); - const __m128i palHi = _mm_load_si128((__m128i *)pal + 1); -#endif - if (isPalZeroTransparent) - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dstBuffer+=16) - { - __m128i idx = _mm_loadl_epi64((__m128i *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); - idx = _mm_slli_epi16(idx, 1); - - __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); - const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); - const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); - const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); - const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); - - const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); - const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); - - __m128i convertedColor[4]; - - if (TEXCACHEFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - 
- // Set converted colors to 0 if the palette index is 0. - idx0 = _mm_cmpeq_epi16(idx0, _mm_set1_epi16(0x0100)); - idx1 = _mm_cmpeq_epi16(idx1, _mm_set1_epi16(0x0100)); - convertedColor[0] = _mm_andnot_si128(_mm_unpacklo_epi16(idx0, idx0), convertedColor[0]); - convertedColor[1] = _mm_andnot_si128(_mm_unpackhi_epi16(idx0, idx0), convertedColor[1]); - convertedColor[2] = _mm_andnot_si128(_mm_unpacklo_epi16(idx1, idx1), convertedColor[2]); - convertedColor[3] = _mm_andnot_si128(_mm_unpackhi_epi16(idx1, idx1), convertedColor[3]); - - _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - u8 idx; - - idx = *adr & 0xF; - *dstBuffer++ = (idx == 0) ? 0 : CONVERT(pal[idx] & 0x7FFF); - - idx = *adr >> 4; - *dstBuffer++ = (idx == 0) ? 
0 : CONVERT(pal[idx] & 0x7FFF); - } -#endif - } - } - else - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=8, adr+=8, dstBuffer+=16) - { - __m128i idx = _mm_loadl_epi64((__m128i *)adr); - idx = _mm_unpacklo_epi8(idx, idx); - idx = _mm_or_si128( _mm_and_si128(idx, _mm_set1_epi16(0x000F)), _mm_and_si128(_mm_srli_epi16(idx, 4), _mm_set1_epi16(0x0F00)) ); - idx = _mm_slli_epi16(idx, 1); - - const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palMask = _mm_cmpeq_epi8( _mm_and_si128(idx, _mm_set1_epi8(0x10)), _mm_setzero_si128() ); - const __m128i palColor0A = _mm_shuffle_epi8(palLo, idx0); - const __m128i palColor0B = _mm_shuffle_epi8(palHi, idx0); - const __m128i palColor1A = _mm_shuffle_epi8(palLo, idx1); - const __m128i palColor1B = _mm_shuffle_epi8(palHi, idx1); - - const __m128i palColor0 = _mm_blendv_epi8( palColor0B, palColor0A, _mm_unpacklo_epi8(palMask, palMask) ); - const __m128i palColor1 = _mm_blendv_epi8( palColor1B, palColor1A, _mm_unpackhi_epi8(palMask, palMask) ); - - __m128i convertedColor[4]; - - if (TEXCACHEFORMAT == TexFormat_15bpp) - { - ColorspaceConvert555To6665Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To6665Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - else - { - ColorspaceConvert555To8888Opaque_SSE2(palColor0, convertedColor[0], convertedColor[1]); - ColorspaceConvert555To8888Opaque_SSE2(palColor1, convertedColor[2], convertedColor[3]); - } - - _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < 
ms.items[j].len; x++, adr++) - { - *dstBuffer++ = CONVERT(pal[*adr & 0x0F] & 0x7FFF); - *dstBuffer++ = CONVERT(pal[*adr >> 4] & 0x7FFF); - } -#endif - } - } -} - -template -void NDSTextureUnpackI8(const MemSpan &ms, const u16 *srcPal, const bool isPalZeroTransparent, u32 *dstBuffer) -{ - u8 *adr; - - if (isPalZeroTransparent) - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dstBuffer++ = (*adr == 0) ? 0 : CONVERT(srcPal[*adr] & 0x7FFF); - } - } - } - else - { - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - *dstBuffer++ = CONVERT(srcPal[*adr] & 0x7FFF); - } - } - } -} - -template -void NDSTextureUnpackA3I5(const MemSpan &ms, const u16 *pal, u32 *dstBuffer) -{ - u8 *adr; - - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - const u16 c = pal[*adr & 0x1F] & 0x7FFF; - const u8 alpha = *adr >> 5; - *dstBuffer++ = (TEXCACHEFORMAT == TexFormat_15bpp) ? 
COLOR555TO6665(c, material_3bit_to_5bit[alpha]) : COLOR555TO8888(c, material_3bit_to_8bit[alpha]); - } - } -} - -template -void NDSTextureUnpackA5I3(const MemSpan &ms, const u16 *pal, u32 *dstBuffer) -{ - u8 *adr; - -#ifdef ENABLE_SSSE3 - const __m128i pal_vec128 = _mm_load_si128((__m128i *)pal); -#endif - for (size_t j = 0; j < ms.numItems; j++) - { - adr = ms.items[j].ptr; -#ifdef ENABLE_SSSE3 - for (size_t x = 0; x < ms.items[j].len; x+=16, adr+=16, dstBuffer+=16) - { - const __m128i bits = _mm_loadu_si128((__m128i *)adr); - - const __m128i idx = _mm_slli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0x07)), 1 ); - const __m128i idx0 = _mm_add_epi8( _mm_unpacklo_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - const __m128i idx1 = _mm_add_epi8( _mm_unpackhi_epi8(idx, idx), _mm_set1_epi16(0x0100) ); - - const __m128i palColor0 = _mm_shuffle_epi8(pal_vec128, idx0); - const __m128i palColor1 = _mm_shuffle_epi8(pal_vec128, idx1); - - __m128i tmpAlpha[2]; - __m128i convertedColor[4]; - - if (TEXCACHEFORMAT == TexFormat_15bpp) - { - const __m128i alpha = _mm_srli_epi16( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), 3 ); - const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); - const __m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ColorspaceConvert555To6665_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ColorspaceConvert555To6665_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); - } - else - { - const __m128i alpha = _mm_or_si128( _mm_and_si128(bits, _mm_set1_epi8(0xF8)), _mm_srli_epi16(_mm_and_si128(bits, _mm_set1_epi8(0xE0)), 5) ); - const __m128i alphaLo = _mm_unpacklo_epi8(_mm_setzero_si128(), alpha); - const 
__m128i alphaHi = _mm_unpackhi_epi8(_mm_setzero_si128(), alpha); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaLo); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaLo); - ColorspaceConvert555To8888_SSE2(palColor0, tmpAlpha[0], tmpAlpha[1], convertedColor[0], convertedColor[1]); - - tmpAlpha[0] = _mm_unpacklo_epi16(_mm_setzero_si128(), alphaHi); - tmpAlpha[1] = _mm_unpackhi_epi16(_mm_setzero_si128(), alphaHi); - ColorspaceConvert555To8888_SSE2(palColor1, tmpAlpha[0], tmpAlpha[1], convertedColor[2], convertedColor[3]); - } - - _mm_store_si128((__m128i *)(dstBuffer + 0), convertedColor[0]); - _mm_store_si128((__m128i *)(dstBuffer + 4), convertedColor[1]); - _mm_store_si128((__m128i *)(dstBuffer + 8), convertedColor[2]); - _mm_store_si128((__m128i *)(dstBuffer + 12), convertedColor[3]); - } -#else - for (size_t x = 0; x < ms.items[j].len; x++, adr++) - { - const u16 c = pal[*adr&0x07] & 0x7FFF; - const u8 alpha = (*adr>>3); - *dstBuffer++ = (TEXCACHEFORMAT == TexFormat_15bpp) ? 
COLOR555TO6665(c, alpha) : COLOR555TO8888(c, material_5bit_to_8bit[alpha]); - } -#endif - } -} - -#define PAL4X4(offset) ( LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((palAddress + (offset)*2)>>14)&0x7] + ((palAddress + (offset)*2)&0x3FFF) ) ) & 0x7FFF ) - -template -void NDSTextureUnpack4x4(const MemSpan &ms, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer) -{ - if (ms.numItems != 1) - { - PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); - } - //this check isnt necessary since the addressing is tied to the texture data which will also run out: - //if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n"); - - u16* slot1; - u32* map = (u32*)ms.items[0].ptr; - u32 limit = ms.items[0].len<<2; - u32 d = 0; - if ( (texAttributes & 0xc000) == 0x8000) - // texel are in slot 2 - slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((texAttributes & 0x3FFF)<<2)+0x010000]; - else - slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(texAttributes & 0x3FFF)<<2]; - - u16 yTmpSize = (sizeY>>2); - u16 xTmpSize = (sizeX>>2); - - //this is flagged whenever a 4x4 overruns its slot. 
- //i am guessing we just generate black in that case - bool dead = false; - - for (size_t y = 0; y < yTmpSize; y++) - { - u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX, - ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; - for (size_t x = 0; x < xTmpSize; x++, d++) - { - if (d >= limit) - dead = true; - - if (dead) - { - for (int sy = 0; sy < 4; sy++) - { - const u32 currentPos = (x<<2) + tmpPos[sy]; - dstBuffer[currentPos] = dstBuffer[currentPos+1] = dstBuffer[currentPos+2] = dstBuffer[currentPos+3] = 0; - } - continue; - } - - const u32 currBlock = LE_TO_LOCAL_32(map[d]); - const u16 pal1 = LE_TO_LOCAL_16(slot1[d]); - const u16 pal1offset = (pal1 & 0x3FFF)<<1; - const u8 mode = pal1>>14; - u32 tmp_col[4]; - - tmp_col[0] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset) ); - tmp_col[1] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+1) ); - - switch (mode) - { - case 0: - tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); - tmp_col[3] = 0x00000000; - break; - - case 1: -#ifdef LOCAL_BE - tmp_col[2] = ( (((tmp_col[0] & 0xFF000000) >> 1)+((tmp_col[1] & 0xFF000000) >> 1)) & 0xFF000000 ) | - ( (((tmp_col[0] & 0x00FF0000) + (tmp_col[1] & 0x00FF0000)) >> 1) & 0x00FF0000 ) | - ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | - 0x000000FF; - tmp_col[3] = 0x00000000; -#else - tmp_col[2] = ( (((tmp_col[0] & 0x00FF00FF) + (tmp_col[1] & 0x00FF00FF)) >> 1) & 0x00FF00FF ) | - ( (((tmp_col[0] & 0x0000FF00) + (tmp_col[1] & 0x0000FF00)) >> 1) & 0x0000FF00 ) | - 0xFF000000; - tmp_col[3] = 0x00000000; -#endif - break; - - case 2: - tmp_col[2] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+2) ); - tmp_col[3] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+3) ); - break; - - case 3: - { -#ifdef LOCAL_BE - const u32 r0 = (tmp_col[0]>>24) & 0x000000FF; - const u32 r1 = (tmp_col[1]>>24) & 0x000000FF; - const u32 g0 = (tmp_col[0]>>16) & 0x000000FF; - const u32 g1 = (tmp_col[1]>>16) & 0x000000FF; - const u32 b0 = (tmp_col[0]>> 8) & 0x000000FF; - const u32 b1 = (tmp_col[1]>> 8) & 
0x000000FF; -#else - const u32 r0 = tmp_col[0] & 0x000000FF; - const u32 r1 = tmp_col[1] & 0x000000FF; - const u32 g0 = (tmp_col[0]>> 8) & 0x000000FF; - const u32 g1 = (tmp_col[1]>> 8) & 0x000000FF; - const u32 b0 = (tmp_col[0]>>16) & 0x000000FF; - const u32 b1 = (tmp_col[1]>>16) & 0x000000FF; -#endif - - const u16 tmp1 = ( (r0*5 + r1*3)>>6) | - ( ((g0*5 + g1*3)>>6) << 5 ) | - ( ((b0*5 + b1*3)>>6) << 10 ); - const u16 tmp2 = ( (r0*3 + r1*5)>>6) | - ( ((g0*3 + g1*5)>>6) << 5 ) | - ( ((b0*3 + b1*5)>>6) << 10 ); - - tmp_col[2] = COLOR555TO8888_OPAQUE(tmp1); - tmp_col[3] = COLOR555TO8888_OPAQUE(tmp2); - break; - } - } - - if (TEXCACHEFORMAT == TexFormat_15bpp) - { - for (size_t i = 0; i < 4; i++) - { -#ifdef LOCAL_BE - const u32 a = (tmp_col[i] >> 3) & 0x0000001F; - tmp_col[i] >>= 2; - tmp_col[i] &= 0x3F3F3F00; - tmp_col[i] |= a; -#else - const u32 a = (tmp_col[i] >> 3) & 0x1F000000; - tmp_col[i] >>= 2; - tmp_col[i] &= 0x003F3F3F; - tmp_col[i] |= a; -#endif - } - } - - //TODO - this could be more precise for 32bpp mode (run it through the color separation table) - - //set all 16 texels - for (size_t sy = 0; sy < 4; sy++) - { - // Texture offset - const u32 currentPos = (x<<2) + tmpPos[sy]; - const u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); - - dstBuffer[currentPos ] = tmp_col[ currRow &3]; - dstBuffer[currentPos+1] = tmp_col[(currRow>>2)&3]; - dstBuffer[currentPos+2] = tmp_col[(currRow>>4)&3]; - dstBuffer[currentPos+3] = tmp_col[(currRow>>6)&3]; - } - } - } -} - -template -void NDSTextureUnpackDirect16Bit(const MemSpan &ms, u32 *dstBuffer) -{ - for (size_t j = 0; j < ms.numItems; j++) - { - const u16 *map = (u16 *)ms.items[j].ptr; - const size_t len = ms.items[j].len >> 1; - - for (size_t x = 0; x < len; x++) - { - const u16 c = LOCAL_TO_LE_16(map[x]); - *dstBuffer++ = (c & 0x8000) ? 
CONVERT(c & 0x7FFF) : 0; - } - } -} - template void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { @@ -1523,27 +957,20 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co #endif } +#define PAL4X4(offset) ( LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((palAddress + (offset)*2)>>14)&0x7] + ((palAddress + (offset)*2)&0x3FFF) ) ) & 0x7FFF ) + template -void NDSTextureUnpack4x4(const size_t srcSize, const u8 *__restrict srcData, const u8 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer) +void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, const u16 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer) { - u16 *slot1; - u32 *map = (u32 *)srcData; - u32 limit = srcSize * sizeof(u32); - u32 d = 0; - if ( (texAttributes & 0xc000) == 0x8000) - // texel are in slot 2 - slot1 = (u16 *)&MMU.texInfo.textureSlotAddr[1][((texAttributes & 0x3FFF)<<2)+0x010000]; - else - slot1 = (u16 *)&MMU.texInfo.textureSlotAddr[1][(texAttributes & 0x3FFF)<<2]; - - u16 xTmpSize = sizeX >> 2; - u16 yTmpSize = sizeY >> 2; + const u32 limit = srcSize * sizeof(u32); + const u16 xTmpSize = sizeX >> 2; + const u16 yTmpSize = sizeY >> 2; //this is flagged whenever a 4x4 overruns its slot. 
//i am guessing we just generate black in that case bool dead = false; - for (size_t y = 0; y < yTmpSize; y++) + for (size_t y = 0, d = 0; y < yTmpSize; y++) { u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX, ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; @@ -1562,8 +989,8 @@ void NDSTextureUnpack4x4(const size_t srcSize, const u8 *__restrict srcData, con continue; } - const u32 currBlock = LE_TO_LOCAL_32(map[d]); - const u16 pal1 = LE_TO_LOCAL_16(slot1[d]); + const u32 currBlock = LE_TO_LOCAL_32(srcData[d]); + const u16 pal1 = LE_TO_LOCAL_16(srcIndex[d]); const u16 pal1offset = (pal1 & 0x3FFF)<<1; const u8 mode = pal1>>14; u32 tmp_col[4]; @@ -1666,17 +1093,16 @@ void NDSTextureUnpack4x4(const size_t srcSize, const u8 *__restrict srcData, con } template -void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *__restrict srcData, u32 *__restrict dstBuffer) +void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict srcData, u32 *__restrict dstBuffer) { - const u16 *srcData16 = (const u16 *)srcData; const size_t pixCount = srcSize >> 1; size_t i = 0; #ifdef ENABLE_SSE2 const size_t pixCountVec128 = pixCount - (pixCount % 8); - for (; i < pixCountVec128; i+=8, srcData16+=8, dstBuffer+=8) + for (; i < pixCountVec128; i+=8, srcData+=8, dstBuffer+=8) { - const v128u16 c = _mm_load_si128((v128u16 *)srcData16); + const v128u16 c = _mm_load_si128((v128u16 *)srcData); const v128u16 alpha = _mm_cmpeq_epi16(_mm_srli_epi16(c, 15), _mm_set1_epi16(1)); v128u32 convertedColor[2]; @@ -1697,9 +1123,9 @@ void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *__restrict srcD } #endif - for (; i < pixCount; i++, srcData16++) + for (; i < pixCount; i++, srcData++) { - const u16 c = LOCAL_TO_LE_16(*srcData16); + const u16 c = LOCAL_TO_LE_16(*srcData); *dstBuffer++ = (c & 0x8000) ? 
CONVERT(c & 0x7FFF) : 0; } } diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index da2b1dca5..199a9406e 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -78,34 +78,39 @@ private: public: TexCacheItem(); + TexCacheItem(const u32 texAttributes, const u32 palAttributes); ~TexCacheItem(); + u32 textureAttributes; + u32 paletteAttributes; + + u32 sizeX; + u32 sizeY; + float invSizeX; + float invSizeY; + bool isPalZeroTransparent; + + bool suspectedInvalid; + bool assumedInvalid; + NDSTextureFormat packFormat; + u32 packAddress; u32 packSize; u8 *packData; + + u32 paletteAddress; + u32 paletteSize; u16 *paletteColorTable; - bool isPalZeroTransparent; TexCache_TexFormat unpackFormat; u32 unpackSize; u32 *unpackData; - bool suspectedInvalid; - bool assumedInvalid; - - u32 textureAttributes; - u32 paletteAttributes; - u32 paletteAddress; - u32 paletteSize; - u32 sizeX; - u32 sizeY; - float invSizeX; - float invSizeY; - // Only used by 4x4 formatted textures + u32 packIndexAddress; + u32 packIndexSize; u8 *packIndexData; u32 packSizeFirstSlot; - u32 packIndexSize; // Only used by the OpenGL renderer for the texture ID u32 texid; @@ -114,30 +119,21 @@ public: void SetDeleteCallback(TexCacheItemDeleteCallback callbackFunc, void *inParam1, void *inParam2); NDSTextureFormat GetTextureFormat() const; - void SetTextureData(const u32 attr, const MemSpan &packedData, const MemSpan &packedIndexData); - void SetTexturePalette(const u32 attr, const u16 *paletteBuffer); + void SetTextureData(const MemSpan &packedData, const MemSpan &packedIndexData); + void SetTexturePalette(const u16 *paletteBuffer); template void Unpack(); void DebugDump(); }; -// TODO: Delete these MemSpan based functions after testing confirms that using the dumped texture data works properly. 
-template void NDSTextureUnpackI2(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer); -template void NDSTextureUnpackI4(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer); -template void NDSTextureUnpackI8(const MemSpan &ms, const u16 *pal, const bool isPalZeroTransparent, u32 *dstBuffer); -template void NDSTextureUnpackA3I5(const MemSpan &ms, const u16 *pal, u32 *dstBuffer); -template void NDSTextureUnpackA5I3(const MemSpan &ms, const u16 *pal, u32 *dstBuffer); -template void NDSTextureUnpack4x4(const MemSpan &ms, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *dstBuffer); -template void NDSTextureUnpackDirect16Bit(const MemSpan &ms, u32 *dstBuffer); - template void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); template void NDSTextureUnpackI4(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); template void NDSTextureUnpackI8(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); template void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); -template void NDSTextureUnpack4x4(const size_t srcSize, const u8 *__restrict srcData, const u8 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer); -template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u8 *__restrict srcData, u32 *__restrict dstBuffer); +template void NDSTextureUnpack4x4(const size_t srcSize, const 
u32 *__restrict srcData, const u16 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer); +template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict srcData, u32 *__restrict dstBuffer); extern TexCache texCache; From b0e649c9bbfd84919d320ae03f1ff0061aeae045 Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 2 Nov 2016 09:37:59 +0000 Subject: [PATCH 35/41] Texture Handler: - Fix a bug where 4x4 formatted textures were being read incorrectly. (Regression from r5569.) --- desmume/src/texcache.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 2df64d257..28403dbb9 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -469,17 +469,9 @@ TexCacheItem::TexCacheItem(const u32 texAttributes, const u32 palAttributes) isPalZeroTransparent = false; } + paletteAddress = (packFormat == TEXMODE_I2) ? palAttributes << 3 : palAttributes << 4; paletteSize = paletteSizeList[packFormat] * sizeof(u16); - if (paletteSize > 0) - { - paletteAddress = (packFormat == TEXMODE_I2) ? palAttributes << 3 : palAttributes << 4; - paletteColorTable = (u16 *)malloc_alignedCacheLine(paletteSize); - } - else - { - paletteAddress = 0; - paletteColorTable = NULL; - } + paletteColorTable = (paletteSize > 0) ? (u16 *)malloc_alignedCacheLine(paletteSize) : NULL; unpackFormat = TexFormat_None; unpackSize = 0; From 158f0c561bac695a3ba287f9aa4d337127dd55e1 Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 2 Nov 2016 18:49:36 +0000 Subject: [PATCH 36/41] Texture Handler: - Increase TEXCACHE_MAX_SIZE to 32MB. Fixes severe performance problems with continuously evicting/reloading the texture cache in Umihara Kawase Shun. 
--- desmume/src/texcache.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index 199a9406e..5161760a7 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -27,10 +27,15 @@ #include "gfx3d.h" //this ought to be enough for anyone -//#define TEXCACHE_MAX_SIZE (64*1024*1024); +//#define TEXCACHE_MAX_SIZE (64*1024*1024) + //changed by zeromus on 15-dec. I couldnt find any games that were getting anywhere NEAR 64 //metal slug burns through sprites so fast, it can test it pretty quickly though -#define TEXCACHE_MAX_SIZE (16*1024*1024) +//#define TEXCACHE_MAX_SIZE (16*1024*1024) + +// rogerman, 2016-11-02: Increase this to 32MB for games that use many large textures, such +// as Umihara Kawase Shun, which can cache over 20MB in the first level. +#define TEXCACHE_MAX_SIZE (32*1024*1024) #define PALETTE_DUMP_SIZE ((64+16+16)*1024) From 812cabb752d407e55ee8c48df98212147928626e Mon Sep 17 00:00:00 2001 From: rogerman Date: Thu, 3 Nov 2016 00:39:02 +0000 Subject: [PATCH 37/41] =?UTF-8?q?Texture=20Handler:=20-=20The=203D=20rende?= =?UTF-8?q?rers=20are=20now=20responsible=20for=20managing=20the=20texture?= =?UTF-8?q?=20unpack=20buffers=20instead=20of=20relying=20on=20the=20TexCa?= =?UTF-8?q?cheItem=20itself=20to=20do=20it.=20-=20The=20OpenGL=203D=20rend?= =?UTF-8?q?erer=20now=20uses=20a=20fixed=204MB=20buffer=20for=20unpacking?= =?UTF-8?q?=20textures,=20instead=20of=20maintaining=20extra=20copies=20of?= =?UTF-8?q?=20each=20unpacked=20texture=20in=20main=20memory=20even=20afte?= =?UTF-8?q?r=20they=E2=80=99ve=20been=20uploaded=20to=20the=20GPU.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- desmume/src/OGLRender.cpp | 27 +++++++++----- desmume/src/OGLRender.h | 3 +- desmume/src/OGLRender_3_2.cpp | 11 +++--- desmume/src/rasterize.cpp | 28 +++++++++++++-- desmume/src/texcache.cpp | 67 +++++++++++++++-------------------- desmume/src/texcache.h | 12 
+++---- 6 files changed, 87 insertions(+), 61 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 07fb03899..0f20a018b 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -636,7 +636,7 @@ static void OGLGetDriverVersion(const char *oglVersionString, } } -void texDeleteCallback(TexCacheItem *texItem, void *param1, void *param2) +void OGLTextureDeleteCallback(TexCacheItem *texItem, void *param1, void *param2) { OpenGLRenderer *oglRenderer = (OpenGLRenderer *)param1; oglRenderer->DeleteTexture(texItem); @@ -872,6 +872,7 @@ OpenGLRenderer::OpenGLRenderer() ref->selectedRenderingFBO = 0; _mappedFramebuffer = NULL; + _workingTextureUnpackBuffer = (FragmentColor *)malloc_alignedCacheLine(1024 * 1024 * sizeof(FragmentColor)); _pixelReadNeedsFinish = false; _currentPolyIndex = 0; _shadowPolyID.reserve(POLYLIST_SIZE); @@ -880,6 +881,7 @@ OpenGLRenderer::OpenGLRenderer() OpenGLRenderer::~OpenGLRenderer() { free_aligned(_framebufferColor); + free_aligned(_workingTextureUnpackBuffer); // Destroy OpenGL rendering states delete ref; @@ -2417,6 +2419,7 @@ Render3DError OpenGLRenderer_1_2::ReadBackPixels() Render3DError OpenGLRenderer_1_2::DeleteTexture(const TexCacheItem *item) { this->ref->freeTextureIDs.push((GLuint)item->texid); + texCache.cache_size -= item->unpackSize; return OGLERROR_NOERR; } @@ -2970,13 +2973,11 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT if (theTexture->unpackFormat != TexFormat_32bpp) { - theTexture->Unpack(); - //has the ogl renderer initialized the texture? 
const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); if (isNewTexture) { - theTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); + theTexture->SetDeleteCallback(&OGLTextureDeleteCallback, this, NULL); if (OGLRef.freeTextureIDs.empty()) { @@ -2985,14 +2986,19 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); + + theTexture->unpackSize = theTexture->GetUnpackSizeUsingFormat(TexFormat_32bpp); + texCache.cache_size += theTexture->unpackSize; } + theTexture->Unpack((u32 *)this->_workingTextureUnpackBuffer); + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); - const u32 *textureSrc = theTexture->unpackData; + const u32 *textureSrc = (u32 *)this->_workingTextureUnpackBuffer; size_t texWidth = theTexture->sizeX; size_t texHeight = theTexture->sizeY; @@ -4646,13 +4652,11 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT if (theTexture->unpackFormat != TexFormat_32bpp) { - theTexture->Unpack(); - //has the ogl renderer initialized the texture? 
const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); if (isNewTexture) { - theTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); + theTexture->SetDeleteCallback(&OGLTextureDeleteCallback, this, NULL); if (OGLRef.freeTextureIDs.empty()) { @@ -4661,14 +4665,19 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); + + theTexture->unpackSize = theTexture->GetUnpackSizeUsingFormat(TexFormat_32bpp); + texCache.cache_size += theTexture->unpackSize; } + theTexture->Unpack((u32 *)this->_workingTextureUnpackBuffer); + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); - const u32 *textureSrc = theTexture->unpackData; + const u32 *textureSrc = (u32 *)this->_workingTextureUnpackBuffer; size_t texWidth = theTexture->sizeX; size_t texHeight = theTexture->sizeY; diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 9ede77cfc..464ec100a 100644 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -526,7 +526,7 @@ extern CACHE_ALIGN const GLfloat divide6bitBy63_LUT[64]; extern const GLfloat PostprocessVtxBuffer[16]; extern const GLubyte PostprocessElementBuffer[6]; -extern void texDeleteCallback(TexCacheItem *texItem, void *param1, void *param2); +extern void OGLTextureDeleteCallback(TexCacheItem *texItem, void *param1, void *param2); //This is called by OGLRender whenever it initializes. //Platforms, please be sure to set this up. 
@@ -589,6 +589,7 @@ protected: bool willConvertFramebufferOnGPU; FragmentColor *_mappedFramebuffer; + FragmentColor *_workingTextureUnpackBuffer; bool _pixelReadNeedsFinish; size_t _currentPolyIndex; std::vector _shadowPolyID; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 84acb6a13..2ef6b2a8a 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1698,13 +1698,11 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); if (theTexture->unpackFormat != TexFormat_32bpp) { - theTexture->Unpack(); - //has the ogl renderer initialized the texture? const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); if (isNewTexture) { - theTexture->SetDeleteCallback(&texDeleteCallback, this, NULL); + theTexture->SetDeleteCallback(&OGLTextureDeleteCallback, this, NULL); if (OGLRef.freeTextureIDs.empty()) { @@ -1713,14 +1711,19 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); OGLRef.freeTextureIDs.pop(); + + theTexture->unpackSize = theTexture->GetUnpackSizeUsingFormat(TexFormat_32bpp); + texCache.cache_size += theTexture->unpackSize; } + theTexture->Unpack((u32 *)this->_workingTextureUnpackBuffer); + glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); - const u32 *textureSrc = theTexture->unpackData; + const u32 *textureSrc = (u32 *)this->_workingTextureUnpackBuffer; size_t texWidth = theTexture->sizeX; size_t texHeight = theTexture->sizeY; diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index bb9850f6f..9e33f8490 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1147,6 +1147,12 @@ static void SoftRasterizerRendererDestroy() } } +void SoftRasterizerTextureDeleteCallback(TexCacheItem *texItem, void *param1, void *param2) +{ + free_aligned(texItem->unpackData); + texCache.cache_size -= texItem->unpackSize; +} + GPU3DInterface gpu3DRasterize = { "SoftRasterizer", SoftRasterizerRendererCreate, @@ -1377,7 +1383,16 @@ void SoftRasterizerRenderer::setupTextures() TexCacheItem *lastTexItem = texCache.GetTexture(firstPoly.texParam, firstPoly.texPalette); if (lastTexItem->unpackFormat != TexFormat_15bpp) { - lastTexItem->Unpack(); + const bool isNewTexture = (lastTexItem->GetDeleteCallback() == NULL); + if (isNewTexture) + { + lastTexItem->SetDeleteCallback(&SoftRasterizerTextureDeleteCallback, this, NULL); + lastTexItem->unpackSize = lastTexItem->GetUnpackSizeUsingFormat(TexFormat_15bpp); + lastTexItem->unpackData = (u32 *)malloc_alignedCacheLine(lastTexItem->unpackSize); + texCache.cache_size += lastTexItem->unpackSize; + } + + lastTexItem->Unpack(lastTexItem->unpackData); } for (size_t i = 0; i < this->_clippedPolyCount; i++) @@ -1394,7 +1409,16 @@ void SoftRasterizerRenderer::setupTextures() lastTexItem = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); if (lastTexItem->unpackFormat != TexFormat_15bpp) { - lastTexItem->Unpack(); + const bool isNewTexture = (lastTexItem->GetDeleteCallback() == NULL); + if (isNewTexture) + { + lastTexItem->SetDeleteCallback(&SoftRasterizerTextureDeleteCallback, this, NULL); + lastTexItem->unpackSize = 
lastTexItem->GetUnpackSizeUsingFormat(TexFormat_15bpp); + lastTexItem->unpackData = (u32 *)malloc_alignedCacheLine(lastTexItem->unpackSize); + texCache.cache_size += lastTexItem->unpackSize; + } + + lastTexItem->Unpack(lastTexItem->unpackData); } lastTexParams = thePoly.texParam; diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 28403dbb9..a8fe7cad0 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -204,20 +204,6 @@ TexCache::TexCache() memset(paletteDump, 0, sizeof(paletteDump)); } -void TexCache::list_remove(TexCacheItem *item) -{ - const TexCacheKey key = TexCache::GenerateKey(item->textureAttributes, item->paletteAttributes); - this->cacheTable.erase(key); - this->cache_size -= item->unpackSize; -} - -void TexCache::list_push_front(TexCacheItem *item) -{ - const TexCacheKey key = TexCache::GenerateKey(item->textureAttributes, item->paletteAttributes); - this->cacheTable[key] = item; - this->cache_size += item->unpackSize; -} - void TexCache::Invalidate() { //check whether the palette memory changed @@ -244,7 +230,7 @@ void TexCache::Invalidate() } } -void TexCache::Evict(u32 target) +void TexCache::Evict(size_t target) { //debug print //printf("%d %d/%d\n",index.size(),cache_size/1024,target/1024); @@ -262,7 +248,9 @@ void TexCache::Evict(u32 target) if (this->cacheTable.size() == 0) break; //just in case.. doesnt seem possible, cache_size wouldve been 0 TexCacheItem *item = this->cacheTable.begin()->second; - this->list_remove(item); + const TexCacheKey key = TexCache::GenerateKey(item->textureAttributes, item->paletteAttributes); + this->cacheTable.erase(key); + //printf("evicting! 
totalsize:%d\n",cache_size); delete item; } @@ -270,7 +258,15 @@ void TexCache::Evict(u32 target) void TexCache::Reset() { - this->Evict(0); + for (TexCacheTable::iterator it(this->cacheTable.begin()); it != this->cacheTable.end(); ++it) + { + TexCacheItem *item = it->second; + delete item; + } + + this->cacheTable.clear(); + this->cache_size = 0; + memset(this->paletteDump, 0, sizeof(paletteDump)); } TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) @@ -378,7 +374,7 @@ TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) if (didCreateNewTexture) { - this->list_push_front(theTexture); + this->cacheTable[key] = theTexture; //printf("allocating: up to %d with %d items\n",cache_size,index.size()); } @@ -501,7 +497,6 @@ TexCacheItem::TexCacheItem(const u32 texAttributes, const u32 palAttributes) TexCacheItem::~TexCacheItem() { free_aligned(this->packData); - free_aligned(this->unpackData); free_aligned(this->paletteColorTable); free_aligned(this->packIndexData); if (this->_deleteCallback != NULL) this->_deleteCallback(this, this->_deleteCallbackParam1, this->_deleteCallbackParam2); @@ -535,15 +530,6 @@ void TexCacheItem::SetTextureData(const MemSpan &packedData, const MemSpan &pack { packedIndexData.dump(this->packIndexData, this->packIndexSize); } - - const u32 currentUnpackSize = this->sizeX * this->sizeY * sizeof(u32); - if (this->unpackSize != currentUnpackSize) - { - u32 *oldUnpackData = this->unpackData; - this->unpackSize = currentUnpackSize; - this->unpackData = (u32 *)malloc_alignedCacheLine(currentUnpackSize); - free_aligned(oldUnpackData); - } } void TexCacheItem::SetTexturePalette(const u16 *paletteBuffer) @@ -554,8 +540,13 @@ void TexCacheItem::SetTexturePalette(const u16 *paletteBuffer) } } +size_t TexCacheItem::GetUnpackSizeUsingFormat(const TexCache_TexFormat texCacheFormat) const +{ + return (this->sizeX * this->sizeY * sizeof(u32)); +} + template -void TexCacheItem::Unpack() +void TexCacheItem::Unpack(u32 
*unpackBuffer) { this->unpackFormat = TEXCACHEFORMAT; @@ -566,19 +557,19 @@ void TexCacheItem::Unpack() switch (this->packFormat) { case TEXMODE_A3I5: - NDSTextureUnpackA3I5(this->packSize, this->packData, this->paletteColorTable, this->unpackData); + NDSTextureUnpackA3I5(this->packSize, this->packData, this->paletteColorTable, unpackBuffer); break; case TEXMODE_I2: - NDSTextureUnpackI2(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, this->unpackData); + NDSTextureUnpackI2(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, unpackBuffer); break; case TEXMODE_I4: - NDSTextureUnpackI4(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, this->unpackData); + NDSTextureUnpackI4(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, unpackBuffer); break; case TEXMODE_I8: - NDSTextureUnpackI8(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, this->unpackData); + NDSTextureUnpackI8(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, unpackBuffer); break; case TEXMODE_4X4: @@ -588,16 +579,16 @@ void TexCacheItem::Unpack() PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); } - NDSTextureUnpack4x4(this->packSizeFirstSlot, (u32 *)this->packData, (u16 *)this->packIndexData, this->paletteAddress, this->textureAttributes, this->sizeX, this->sizeY, this->unpackData); + NDSTextureUnpack4x4(this->packSizeFirstSlot, (u32 *)this->packData, (u16 *)this->packIndexData, this->paletteAddress, this->textureAttributes, this->sizeX, this->sizeY, unpackBuffer); break; } case TEXMODE_A5I3: - NDSTextureUnpackA5I3(this->packSize, this->packData, this->paletteColorTable, this->unpackData); + NDSTextureUnpackA5I3(this->packSize, this->packData, this->paletteColorTable, unpackBuffer); break; case TEXMODE_16BPP: - NDSTextureUnpackDirect16Bit(this->packSize, (u16 *)this->packData, 
this->unpackData); + NDSTextureUnpackDirect16Bit(this->packSize, (u16 *)this->packData, unpackBuffer); break; default: @@ -1122,5 +1113,5 @@ void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict src } } -template void TexCacheItem::Unpack(); -template void TexCacheItem::Unpack(); +template void TexCacheItem::Unpack(u32 *unpackBuffer); +template void TexCacheItem::Unpack(u32 *unpackBuffer); diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index 5161760a7..7d8bb5eae 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -59,14 +59,11 @@ public: TexCache(); TexCacheTable cacheTable; - u32 cache_size; //this is not really precise, it is off by a constant factor + size_t cache_size; //this is not really precise, it is off by a constant factor u8 paletteDump[PALETTE_DUMP_SIZE]; - - void list_remove(TexCacheItem *item); - void list_push_front(TexCacheItem *item); - + void Invalidate(); - void Evict(u32 target); + void Evict(size_t target); void Reset(); TexCacheItem* GetTexture(u32 texAttributes, u32 palAttributes); @@ -127,7 +124,8 @@ public: void SetTextureData(const MemSpan &packedData, const MemSpan &packedIndexData); void SetTexturePalette(const u16 *paletteBuffer); - template void Unpack(); + size_t GetUnpackSizeUsingFormat(const TexCache_TexFormat texCacheFormat) const; + template void Unpack(u32 *unpackBuffer); void DebugDump(); }; From 6e0077ecb9d1f3b8e83e4d52fa8861a5929acfa5 Mon Sep 17 00:00:00 2001 From: zeromus Date: Thu, 10 Nov 2016 07:02:54 +0000 Subject: [PATCH 38/41] fix retriggering sounds in AC:WW --- desmume/src/SPU.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/desmume/src/SPU.cpp b/desmume/src/SPU.cpp index 34a90f066..44046f264 100644 --- a/desmume/src/SPU.cpp +++ b/desmume/src/SPU.cpp @@ -795,8 +795,11 @@ void SPU_struct::WriteByte(u32 addr, u8 val) regs.ctl_ch1bypass = (val >> 4) & 1; regs.ctl_ch3bypass = (val >> 5) & 1; regs.masteren = (val >> 7) & 1; - for(u8 i=0; i<16; 
i++) - KeyProbe(i); + //from r4925 - after changing 'masteren', we retrigger any sounds? doubtful. + //maybe we STOP sounds here, but we don't enable them (this would retrigger any previous sounds that had finished; glitched AC:WW) + //(probably broken in r3299) + //after commenting this out, I checked bug #1356. seems unrelated. + //for(int i=0; i<16; i++) KeyProbe(i); break; //SOUNDBIAS From 2801ee5d198e763be9bda8a1a9955471efd51595 Mon Sep 17 00:00:00 2001 From: zeromus Date: Fri, 11 Nov 2016 02:33:33 +0000 Subject: [PATCH 39/41] fix vfat (broken in r5438). file sizes weren't counted right. only ~36MB would get allocated and then it would get blown when loading the files. --- desmume/src/utils/vfat.cpp | 105 +++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 38 deletions(-) diff --git a/desmume/src/utils/vfat.cpp b/desmume/src/utils/vfat.cpp index 17853c08a..d631736f4 100644 --- a/desmume/src/utils/vfat.cpp +++ b/desmume/src/utils/vfat.cpp @@ -43,6 +43,9 @@ enum EListCallbackArg { typedef void (*ListCallback)(RDIR* rdir, EListCallbackArg); // List all files and subdirectories recursively +//TODO: clunky architecture. we've combined the callbacks into one handler. 
+//we could merge the callback and list_files function, or refactor the callback into one for each enum which receives a unit of work after +//the more detailed recursing logic (caused by libretro-common integration) is handled in the lister static void list_files(const char *filepath, ListCallback list_callback) { void * hFind; @@ -64,6 +67,7 @@ static void list_files(const char *filepath, ListCallback list_callback) const char* fname = retro_dirent_get_name(rdir); list_callback(rdir,EListCallbackArg_Item); + printf("cflash added %s\n",fname); if(retro_dirent_is_dir(rdir) && (strcmp(fname, ".")) && (strcmp(fname, ".."))) { @@ -76,28 +80,24 @@ static void list_files(const char *filepath, ListCallback list_callback) retro_closedir(rdir); } -static u64 dataSectors = 0; -void count_ListCallback(RDIR* rdir, EListCallbackArg arg) +enum eCallbackType { - if(arg == EListCallbackArg_Pop) return; - u32 sectors = 1; - if(retro_dirent_is_dir(rdir)) - { - } - else - { - //allocate sectors for file - int32_t fileSize = path_get_size(retro_dirent_get_name(rdir)); - sectors += (fileSize+511)/512 + 1; - } - dataSectors += sectors; -} + eCallbackType_Count, eCallbackType_Build +}; +static eCallbackType callbackType; + +//for eCallbackType_Count: +static bool count_failed = false; +static u64 dataSectors = 0; + +//recursing related.. 
really ought to be merged with list_files functionality static std::string currPath; static std::stack pathStack; static std::stack virtPathStack; static std::string currVirtPath; -void build_ListCallback(RDIR* rdir, EListCallbackArg arg) + +static void DirectoryListCallback(RDIR* rdir, EListCallbackArg arg) { const char* fname = retro_dirent_get_name(rdir); @@ -119,10 +119,18 @@ void build_ListCallback(RDIR* rdir, EListCallbackArg arg) virtPathStack.push(currVirtPath); currVirtPath = currVirtPath + "/" + fname; - bool ok = LIBFAT::MkDir(currVirtPath.c_str()); - if(!ok) - printf("ERROR adding dir %s via libfat\n",currVirtPath.c_str()); + if(callbackType == eCallbackType_Build) + { + bool ok = LIBFAT::MkDir(currVirtPath.c_str()); + + if(!ok) + printf("ERROR adding dir %s via libfat\n",currVirtPath.c_str()); + } + else + { + dataSectors++; //directories take one sector + } currPath = currPath + path_default_slash() + fname; return; @@ -131,38 +139,56 @@ void build_ListCallback(RDIR* rdir, EListCallbackArg arg) { std::string path = currPath + path_default_slash() + fname; - FILE* inf = fopen(path.c_str(),"rb"); - if(inf) + if(callbackType == eCallbackType_Build) { - fseek(inf,0,SEEK_END); - long len = ftell(inf); - fseek(inf,0,SEEK_SET); - u8 *buf = new u8[len]; - fread(buf,1,len,inf); - fclose(inf); + FILE* inf = fopen(path.c_str(),"rb"); + if(inf) + { + fseek(inf,0,SEEK_END); + long len = ftell(inf); + fseek(inf,0,SEEK_SET); + u8 *buf = new u8[len]; + fread(buf,1,len,inf); + fclose(inf); - std::string path = currVirtPath + "/" + fname; - printf("FAT + (%10.2f KB) %s \n",len/1024.f,path.c_str()); - bool ok = LIBFAT::WriteFile(path.c_str(),buf,len); - if(!ok) - printf("ERROR adding file to fat\n"); - delete[] buf; - } else printf("ERROR opening file for fat\n"); + std::string path = currVirtPath + "/" + fname; + printf("FAT + (%10.2f KB) %s \n",len/1024.f,path.c_str()); + bool ok = LIBFAT::WriteFile(path.c_str(),buf,len); + if(!ok) + printf("ERROR adding file to 
fat\n"); + delete[] buf; + } else printf("ERROR opening file for fat\n"); + } + else + { + //allocate sectors for file + int32_t fileSize = path_get_size(path.c_str()); + if(fileSize == -1) { count_failed = true; dataSectors = 0; } + else dataSectors += (fileSize+511)/512 + 1; + } } } - - bool VFAT::build(const char* path, int extra_MB) { dataSectors = 0; currVirtPath = ""; currPath = path; - list_files(path, count_ListCallback); + + count_failed = false; + callbackType = eCallbackType_Count; + list_files(path, DirectoryListCallback); + + if(count_failed) + { + printf("FAILED enumerating files for fat\n"); + return false; + } dataSectors += 8; //a few for reserved sectors, etc. + dataSectors += extra_MB*1024*1024/512; //add extra write space //dataSectors += 16*1024*1024/512; //add 16MB worth of write space. this is probably enough for anyone, but maybe it should be configurable. //we could always suggest to users to add a big file to their directory to overwrite (that would cause the image to get padded) @@ -171,6 +197,8 @@ bool VFAT::build(const char* path, int extra_MB) if(dataSectors<36*1024*1024/512) dataSectors = 36*1024*1024/512; + printf("dataSectors: %lld\n",dataSectors); + if(dataSectors>=(0x80000000>>9)) { printf("error allocating memory for fat (%d KBytes)\n",(dataSectors*512)/1024); @@ -207,7 +235,8 @@ bool VFAT::build(const char* path, int extra_MB) //setup libfat and write all the files through it LIBFAT::Init(memf->buf(),memf->size()); - list_files(path, build_ListCallback); + callbackType = eCallbackType_Build; + list_files(path, DirectoryListCallback); LIBFAT::Shutdown(); return true; From c75b9ed62bcec5e6dfd6f3932958f4361249f5e9 Mon Sep 17 00:00:00 2001 From: zeromus Date: Fri, 11 Nov 2016 02:34:02 +0000 Subject: [PATCH 40/41] oops, remove unneeded printf --- desmume/src/utils/vfat.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/desmume/src/utils/vfat.cpp b/desmume/src/utils/vfat.cpp index d631736f4..cd0e8cd6b 100644 --- 
a/desmume/src/utils/vfat.cpp +++ b/desmume/src/utils/vfat.cpp @@ -197,8 +197,6 @@ bool VFAT::build(const char* path, int extra_MB) if(dataSectors<36*1024*1024/512) dataSectors = 36*1024*1024/512; - printf("dataSectors: %lld\n",dataSectors); - if(dataSectors>=(0x80000000>>9)) { printf("error allocating memory for fat (%d KBytes)\n",(dataSectors*512)/1024); From 02645310b4031ef36c328059f23493d4de8d3f0d Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 23 Nov 2016 20:41:07 +0000 Subject: [PATCH 41/41] =?UTF-8?q?Texture=20Handler:=20-=20Finish=20refacto?= =?UTF-8?q?ring=20and=20cleaning=20up=20TexCache=20(now=20renamed=20to=20?= =?UTF-8?q?=E2=80=9CTextureCache=E2=80=9D)=20and=20TexCacheItem=20(now=20r?= =?UTF-8?q?enamed=20to=20=E2=80=9CTextureStore=E2=80=9D).=20-=20TextureCac?= =?UTF-8?q?he=20items=20are=20now=20evicted=20based=20on=20age=20and=20usa?= =?UTF-8?q?ge=20instead=20of=20arbitrarily.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- desmume/src/OGLRender.cpp | 240 +++++----- desmume/src/OGLRender.h | 30 +- desmume/src/OGLRender_3_2.cpp | 78 ++- desmume/src/rasterize.cpp | 139 ++++-- desmume/src/rasterize.h | 25 +- desmume/src/texcache.cpp | 865 +++++++++++++++++++++------------- desmume/src/texcache.h | 184 +++++--- 7 files changed, 921 insertions(+), 640 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 0f20a018b..bf703e5da 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -636,10 +636,42 @@ static void OGLGetDriverVersion(const char *oglVersionString, } } -void OGLTextureDeleteCallback(TexCacheItem *texItem, void *param1, void *param2) +OpenGLTexture::OpenGLTexture() { - OpenGLRenderer *oglRenderer = (OpenGLRenderer *)param1; - oglRenderer->DeleteTexture(texItem); + _cacheSize = GetUnpackSizeUsingFormat(TexFormat_32bpp); + _invSizeS = 0.0f; + _invSizeT = 0.0f; + + glGenTextures(1, &_texID); +} + +OpenGLTexture::OpenGLTexture(u32 texAttributes, u32 
palAttributes) : TextureStore(texAttributes, palAttributes) +{ + _cacheSize = GetUnpackSizeUsingFormat(TexFormat_32bpp); + _invSizeS = 1.0f / (float)_sizeS; + _invSizeT = 1.0f / (float)_sizeT; + + glGenTextures(1, &_texID); +} + +OpenGLTexture::~OpenGLTexture() +{ + glDeleteTextures(1, &this->_texID); +} + +GLuint OpenGLTexture::GetID() const +{ + return this->_texID; +} + +GLfloat OpenGLTexture::GetInvWidth() const +{ + return this->_invSizeS; +} + +GLfloat OpenGLTexture::GetInvHeight() const +{ + return this->_invSizeT; } template @@ -1199,13 +1231,6 @@ OpenGLRenderer_1_2::~OpenGLRenderer_1_2() // Kill the texture cache now before all of our texture IDs disappear. texCache.Reset(); - while(!ref->freeTextureIDs.empty()) - { - GLuint temp = ref->freeTextureIDs.front(); - ref->freeTextureIDs.pop(); - glDeleteTextures(1, &temp); - } - glFinish(); } @@ -1352,7 +1377,6 @@ Render3DError OpenGLRenderer_1_2::InitExtensions() INFO("OpenGL: Multisampled FBOs are unsupported. Multisample antialiasing will be disabled.\n"); } - this->InitTextures(); this->InitFinalRenderStates(&oglExtensionSet); // This must be done last return OGLERROR_NOERR; @@ -2063,13 +2087,6 @@ Render3DError OpenGLRenderer_1_2::InitFinalRenderStates(const std::setExpandFreeTextures(); - - return OGLERROR_NOERR; -} - Render3DError OpenGLRenderer_1_2::InitTables() { static bool needTableInit = true; @@ -2228,20 +2245,6 @@ void OpenGLRenderer_1_2::GetExtensionSet(std::set *oglExtensionSet) } } -Render3DError OpenGLRenderer_1_2::ExpandFreeTextures() -{ - static const GLsizei kInitTextures = 128; - GLuint oglTempTextureID[kInitTextures]; - glGenTextures(kInitTextures, oglTempTextureID); - - for(GLsizei i = 0; i < kInitTextures; i++) - { - this->ref->freeTextureIDs.push(oglTempTextureID[i]); - } - - return OGLERROR_NOERR; -} - Render3DError OpenGLRenderer_1_2::EnableVertexAttributes() { OGLRenderRef &OGLRef = *this->ref; @@ -2416,14 +2419,6 @@ Render3DError OpenGLRenderer_1_2::ReadBackPixels() return 
OGLERROR_NOERR; } -Render3DError OpenGLRenderer_1_2::DeleteTexture(const TexCacheItem *item) -{ - this->ref->freeTextureIDs.push((GLuint)item->texid); - texCache.cache_size -= item->unpackSize; - - return OGLERROR_NOERR; -} - Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) { OGLRenderRef &OGLRef = *this->ref; @@ -2692,7 +2687,7 @@ Render3DError OpenGLRenderer_1_2::RenderGeometry(const GFX3D_State &renderState, Render3DError OpenGLRenderer_1_2::EndRender(const u64 frameCount) { //needs to happen before endgl because it could free some textureids for expired cache items - texCache.Evict(TEXCACHE_MAX_SIZE); + texCache.Evict(); this->ReadBackPixels(); @@ -2953,54 +2948,44 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } + + OpenGLTexture *theTexture = (OpenGLTexture *)texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + const bool isNewTexture = (theTexture == NULL); - TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (isNewTexture) + { + theTexture = new OpenGLTexture(thePoly.texParam, thePoly.texPalette); + texCache.Add(theTexture); + } + + const NDSTextureFormat packFormat = theTexture->GetPackFormat(); // Enable textures if they weren't already enabled if (this->isShaderSupported) { glUniform1i(OGLRef.uniformPolyEnableTexture, GL_TRUE); - glUniform1i(OGLRef.uniformTexSingleBitAlpha, (theTexture->packFormat != TEXMODE_A3I5 && theTexture->packFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); - glUniform2f(OGLRef.uniformPolyTexScale, theTexture->invSizeX, theTexture->invSizeY); + glUniform1i(OGLRef.uniformTexSingleBitAlpha, (packFormat != TEXMODE_A3I5 && packFormat != TEXMODE_A5I3) ? 
GL_TRUE : GL_FALSE); + glUniform2f(OGLRef.uniformPolyTexScale, theTexture->GetInvWidth(), theTexture->GetInvHeight()); } else { glEnable(GL_TEXTURE_2D); glMatrixMode(GL_TEXTURE); glLoadIdentity(); - glScalef(theTexture->invSizeX, theTexture->invSizeY, 1.0f); + glScalef(theTexture->GetInvWidth(), theTexture->GetInvHeight(), 1.0f); } - if (theTexture->unpackFormat != TexFormat_32bpp) + glBindTexture(GL_TEXTURE_2D, theTexture->GetID()); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + + if (theTexture->IsLoadNeeded()) { - //has the ogl renderer initialized the texture? - const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); - if (isNewTexture) - { - theTexture->SetDeleteCallback(&OGLTextureDeleteCallback, this, NULL); - - if (OGLRef.freeTextureIDs.empty()) - { - this->ExpandFreeTextures(); - } - - theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); - OGLRef.freeTextureIDs.pop(); - - theTexture->unpackSize = theTexture->GetUnpackSizeUsingFormat(TexFormat_32bpp); - texCache.cache_size += theTexture->unpackSize; - } - theTexture->Unpack((u32 *)this->_workingTextureUnpackBuffer); - glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
OGLRef.stateTexMirroredRepeat : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - - const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); const u32 *textureSrc = (u32 *)this->_workingTextureUnpackBuffer; - size_t texWidth = theTexture->sizeX; - size_t texHeight = theTexture->sizeY; + size_t texWidth = theTexture->GetWidth(); + size_t texHeight = theTexture->GetHeight(); if (this->_textureDeposterizeDstSurface.Surface != NULL) { @@ -3031,17 +3016,17 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); if (isNewTexture) { glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->GetWidth(), theTexture->GetHeight(), 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->GetWidth(), theTexture->GetHeight(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } break; } @@ -3051,29 +3036,29 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); - this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); + this->TextureUpscale<4>(packFormat, textureSrc, texWidth, 
texHeight); if (isNewTexture) { glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - texWidth = theTexture->sizeX; - texHeight = theTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + texWidth = theTexture->GetWidth(); + texHeight = theTexture->GetHeight(); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->GetWidth(), theTexture->GetHeight(), 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - texWidth = theTexture->sizeX; - texHeight = theTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + texWidth = theTexture->GetWidth(); + texHeight = theTexture->GetHeight(); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->GetWidth(), theTexture->GetHeight(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } break; } @@ -3095,11 +3080,9 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); } } - else - { - //otherwise, just bind it - glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); - } + + 
theTexture->ResetCacheAge(); + theTexture->IncreaseCacheUsageCount(1); return OGLERROR_NOERR; } @@ -3867,7 +3850,6 @@ Render3DError OpenGLRenderer_2_0::InitExtensions() INFO("OpenGL: Multisampled FBOs are unsupported. Multisample antialiasing will be disabled.\n"); } - this->InitTextures(); this->InitFinalRenderStates(&oglExtensionSet); // This must be done last return OGLERROR_NOERR; @@ -4644,42 +4626,32 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } - TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + OpenGLTexture *theTexture = (OpenGLTexture *)texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + const bool isNewTexture = (theTexture == NULL); + + if (isNewTexture) + { + theTexture = new OpenGLTexture(thePoly.texParam, thePoly.texPalette); + texCache.Add(theTexture); + } + + const NDSTextureFormat packFormat = theTexture->GetPackFormat(); glUniform1i(OGLRef.uniformPolyEnableTexture, GL_TRUE); - glUniform1i(OGLRef.uniformTexSingleBitAlpha, (theTexture->packFormat != TEXMODE_A3I5 && theTexture->packFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); - glUniform2f(OGLRef.uniformPolyTexScale, theTexture->invSizeX, theTexture->invSizeY); + glUniform1i(OGLRef.uniformTexSingleBitAlpha, (packFormat != TEXMODE_A3I5 && packFormat != TEXMODE_A5I3) ? GL_TRUE : GL_FALSE); + glUniform2f(OGLRef.uniformPolyTexScale, theTexture->GetInvWidth(), theTexture->GetInvHeight()); - if (theTexture->unpackFormat != TexFormat_32bpp) + glBindTexture(GL_TEXTURE_2D, theTexture->GetID()); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + + if (theTexture->IsLoadNeeded()) { - //has the ogl renderer initialized the texture? 
- const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); - if (isNewTexture) - { - theTexture->SetDeleteCallback(&OGLTextureDeleteCallback, this, NULL); - - if (OGLRef.freeTextureIDs.empty()) - { - this->ExpandFreeTextures(); - } - - theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); - OGLRef.freeTextureIDs.pop(); - - theTexture->unpackSize = theTexture->GetUnpackSizeUsingFormat(TexFormat_32bpp); - texCache.cache_size += theTexture->unpackSize; - } - theTexture->Unpack((u32 *)this->_workingTextureUnpackBuffer); - glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - - const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); const u32 *textureSrc = (u32 *)this->_workingTextureUnpackBuffer; - size_t texWidth = theTexture->sizeX; - size_t texHeight = theTexture->sizeY; + size_t texWidth = theTexture->GetWidth(); + size_t texHeight = theTexture->GetHeight(); if (this->_textureDeposterizeDstSurface.Surface != NULL) { @@ -4710,15 +4682,17 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); + if (isNewTexture) { glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->GetWidth(), theTexture->GetHeight(), 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, 
textureSrc); } else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->GetWidth(), theTexture->GetHeight(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } break; } @@ -4728,29 +4702,29 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); - this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); + this->TextureUpscale<4>(packFormat, textureSrc, texWidth, texHeight); if (isNewTexture) { glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - texWidth = theTexture->sizeX; - texHeight = theTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + texWidth = theTexture->GetWidth(); + texHeight = theTexture->GetHeight(); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->GetWidth(), theTexture->GetHeight(), 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - texWidth = theTexture->sizeX; - texHeight = theTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + texWidth = theTexture->GetWidth(); + texHeight = theTexture->GetHeight(); + 
this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->GetWidth(), theTexture->GetHeight(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } break; } @@ -4772,11 +4746,9 @@ Render3DError OpenGLRenderer_2_0::SetupTexture(const POLY &thePoly, bool enableT glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); } } - else - { - //otherwise, just bind it - glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); - } + + theTexture->ResetCacheAge(); + theTexture->IncreaseCacheUsageCount(1); return OGLERROR_NOERR; } diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 464ec100a..976a811d0 100644 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -24,6 +24,7 @@ #include #include #include "render3D.h" +#include "texcache.h" #include "types.h" #ifndef OGLRENDER_3_2_H @@ -491,9 +492,6 @@ struct OGLRenderRef GLuint vaoGeometryStatesID; GLuint vaoPostprocessStatesID; - // Textures - std::queue freeTextureIDs; - // Client-side Buffers GLfloat *color4fBuffer; GLushort *vertIndexBuffer; @@ -526,8 +524,6 @@ extern CACHE_ALIGN const GLfloat divide6bitBy63_LUT[64]; extern const GLfloat PostprocessVtxBuffer[16]; extern const GLubyte PostprocessElementBuffer[6]; -extern void OGLTextureDeleteCallback(TexCacheItem *texItem, void *param1, void *param2); - //This is called by OGLRender whenever it initializes. //Platforms, please be sure to set this up. //return true if you successfully init. 
@@ -559,6 +555,23 @@ extern void (*OGLCreateRenderer_3_2_Func)(OpenGLRenderer **rendererPtr); bool IsVersionSupported(unsigned int checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision); +class OpenGLTexture : public TextureStore +{ +private: + GLuint _texID; + GLfloat _invSizeS; + GLfloat _invSizeT; + +public: + OpenGLTexture(); + OpenGLTexture(u32 texAttributes, u32 palAttributes); + virtual ~OpenGLTexture(); + + GLuint GetID() const; + GLfloat GetInvWidth() const; + GLfloat GetInvHeight() const; +}; + #if defined(ENABLE_SSE2) class OpenGLRenderer : public Render3D_SSE2 #else @@ -609,7 +622,6 @@ protected: virtual void DestroyGeometryProgram() = 0; virtual Render3DError CreateVAOs() = 0; virtual void DestroyVAOs() = 0; - virtual Render3DError InitTextures() = 0; virtual Render3DError InitFinalRenderStates(const std::set *oglExtensionSet) = 0; virtual Render3DError InitTables() = 0; virtual Render3DError InitPostprocessingPrograms(const std::string &edgeMarkVtxShader, @@ -635,7 +647,6 @@ protected: virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) = 0; virtual void GetExtensionSet(std::set *oglExtensionSet) = 0; - virtual Render3DError ExpandFreeTextures() = 0; virtual Render3DError EnableVertexAttributes() = 0; virtual Render3DError DisableVertexAttributes() = 0; virtual Render3DError DownsampleFBO() = 0; @@ -648,7 +659,6 @@ public: virtual ~OpenGLRenderer(); virtual Render3DError InitExtensions() = 0; - virtual Render3DError DeleteTexture(const TexCacheItem *item) = 0; bool IsExtensionPresent(const std::set *oglExtensionSet, const std::string extensionName) const; bool ValidateShaderCompile(GLuint theShader) const; @@ -673,7 +683,6 @@ protected: virtual void DestroyMultisampledFBO(); virtual Render3DError CreateVAOs(); virtual void DestroyVAOs(); - virtual Render3DError InitTextures(); virtual Render3DError 
InitFinalRenderStates(const std::set *oglExtensionSet); virtual Render3DError InitTables(); @@ -702,7 +711,6 @@ protected: virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual void GetExtensionSet(std::set *oglExtensionSet); - virtual Render3DError ExpandFreeTextures(); virtual Render3DError EnableVertexAttributes(); virtual Render3DError DisableVertexAttributes(); virtual Render3DError DownsampleFBO(); @@ -729,8 +737,6 @@ public: virtual Render3DError Reset(); virtual Render3DError RenderFinish(); virtual Render3DError SetFramebufferSize(size_t w, size_t h); - - virtual Render3DError DeleteTexture(const TexCacheItem *item); }; class OpenGLRenderer_1_3 : public OpenGLRenderer_1_2 diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 2ef6b2a8a..5626b677a 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -609,7 +609,6 @@ Render3DError OpenGLRenderer_3_2::InitExtensions() } } - this->InitTextures(); this->InitFinalRenderStates(&oglExtensionSet); // This must be done last return OGLERROR_NOERR; @@ -1686,7 +1685,6 @@ Render3DError OpenGLRenderer_3_2::SetupPolygon(const POLY &thePoly) Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableTexturing) { - OGLRenderRef &OGLRef = *this->ref; const PolygonTexParams params = thePoly.getTexParams(); // Check if we need to use textures @@ -1695,37 +1693,27 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT return OGLERROR_NOERR; } - TexCacheItem *theTexture = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); - if (theTexture->unpackFormat != TexFormat_32bpp) + OpenGLTexture *theTexture = (OpenGLTexture *)texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + const bool isNewTexture = (theTexture == NULL); + + if (isNewTexture) + { + theTexture = new 
OpenGLTexture(thePoly.texParam, thePoly.texPalette); + texCache.Add(theTexture); + } + + glBindTexture(GL_TEXTURE_2D, theTexture->GetID()); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); + + if (theTexture->IsLoadNeeded()) { - //has the ogl renderer initialized the texture? - const bool isNewTexture = (theTexture->GetDeleteCallback() == NULL); - if (isNewTexture) - { - theTexture->SetDeleteCallback(&OGLTextureDeleteCallback, this, NULL); - - if (OGLRef.freeTextureIDs.empty()) - { - this->ExpandFreeTextures(); - } - - theTexture->texid = (u32)OGLRef.freeTextureIDs.front(); - OGLRef.freeTextureIDs.pop(); - - theTexture->unpackSize = theTexture->GetUnpackSizeUsingFormat(TexFormat_32bpp); - texCache.cache_size += theTexture->unpackSize; - } - theTexture->Unpack((u32 *)this->_workingTextureUnpackBuffer); - glBindTexture(GL_TEXTURE_2D, (GLuint)theTexture->texid); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (params.enableRepeatS ? (params.enableMirroredRepeatS ? GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (params.enableRepeatT ? (params.enableMirroredRepeatT ? 
GL_MIRRORED_REPEAT : GL_REPEAT) : GL_CLAMP_TO_EDGE)); - - const NDSTextureFormat texFormat = theTexture->GetTextureFormat(); const u32 *textureSrc = (u32 *)this->_workingTextureUnpackBuffer; - size_t texWidth = theTexture->sizeX; - size_t texHeight = theTexture->sizeY; + const NDSTextureFormat packFormat = theTexture->GetPackFormat(); + size_t texWidth = theTexture->GetWidth(); + size_t texHeight = theTexture->GetHeight(); if (this->_textureDeposterizeDstSurface.Surface != NULL) { @@ -1756,17 +1744,17 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 1); - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); if (isNewTexture) { glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, theTexture->GetWidth(), theTexture->GetHeight(), 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, theTexture->GetWidth(), theTexture->GetHeight(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } break; } @@ -1776,29 +1764,29 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 2); - this->TextureUpscale<4>(texFormat, textureSrc, texWidth, texHeight); + 
this->TextureUpscale<4>(packFormat, textureSrc, texWidth, texHeight); if (isNewTexture) { glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - texWidth = theTexture->sizeX; - texHeight = theTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + texWidth = theTexture->GetWidth(); + texHeight = theTexture->GetHeight(); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, texWidth, texHeight, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->sizeX, theTexture->sizeY, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexImage2D(GL_TEXTURE_2D, 2, GL_RGBA, theTexture->GetWidth(), theTexture->GetHeight(), 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } else { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - texWidth = theTexture->sizeX; - texHeight = theTexture->sizeY; - this->TextureUpscale<2>(texFormat, textureSrc, texWidth, texHeight); + texWidth = theTexture->GetWidth(); + texHeight = theTexture->GetHeight(); + this->TextureUpscale<2>(packFormat, textureSrc, texWidth, texHeight); glTexSubImage2D(GL_TEXTURE_2D, 1, 0, 0, texWidth, texHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, this->_textureUpscaleBuffer); - glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->sizeX, theTexture->sizeY, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); + glTexSubImage2D(GL_TEXTURE_2D, 2, 0, 0, theTexture->GetWidth(), theTexture->GetHeight(), GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, textureSrc); } break; } @@ -1820,11 +1808,9 @@ Render3DError OpenGLRenderer_3_2::SetupTexture(const POLY &thePoly, bool enableT glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 1.0f); } } - else - { - //otherwise, just bind it - glBindTexture(GL_TEXTURE_2D, 
(GLuint)theTexture->texid); - } + + theTexture->ResetCacheAge(); + theTexture->IncreaseCacheUsageCount(1); return OGLERROR_NOERR; } diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 9e33f8490..4b975dc34 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -54,7 +54,6 @@ #include "matrix.h" #include "render3D.h" #include "gfx3d.h" -#include "texcache.h" #include "MMU.h" #include "NDSSystem.h" #include "utils/task.h" @@ -331,7 +330,7 @@ class RasterizerUnit { protected: SoftRasterizerRenderer *_softRender; - TexCacheItem *lastTexKey; + SoftRasterizerTexture *lastTexKey; VERT* verts[MAX_CLIPPED_VERTS]; int polynum; @@ -351,19 +350,16 @@ public: int width, height; s32 wmask, hmask; int wrap; - int wshift; - int texFormat; - void setup(u32 texParam) + void setup(SoftRasterizerTexture *theTexture, u32 texParam) { - texFormat = (texParam>>26)&7; - wshift = ((texParam>>20)&0x07) + 3; - width=(1 << wshift); - height=(8 << ((texParam>>23)&0x07)); - wmask = width-1; - hmask = height-1; + width = theTexture->GetRenderWidth(); + height = theTexture->GetRenderHeight(); + wmask = theTexture->GetRenderWidthMask(); + hmask = theTexture->GetRenderHeightMask(); + wrap = (texParam>>16)&0xF; - enabled = gfx3d.renderState.enableTexturing && (texFormat!=0); + enabled = gfx3d.renderState.enableTexturing && (theTexture->GetPackFormat() != TEXMODE_NONE); } FORCEINLINE void clamp(s32 &val, const int size, const s32 sizemask) @@ -461,7 +457,10 @@ public: sampler.dowrap(iu, iv); FragmentColor color; - color.color = lastTexKey->unpackData[(iv<GetUnpackData(); + + color.color = textureData[( iv << lastTexKey->GetRenderWidthShift() ) + iu]; + return color; } @@ -1006,15 +1005,15 @@ public: const size_t dstWidth = this->_softRender->GetFramebufferWidth(); const size_t dstHeight = this->_softRender->GetFramebufferHeight(); - lastTexKey = NULL; - const GFX3D_Clipper::TClippedPoly &firstClippedPoly = this->_softRender->clippedPolys[0]; const POLY &firstPoly 
= *firstClippedPoly.poly; PolygonAttributes polyAttr = firstPoly.getAttributes(); u32 lastPolyAttr = firstPoly.polyAttr; u32 lastTexParams = firstPoly.texParam; u32 lastTexPalette = firstPoly.texPalette; - sampler.setup(firstPoly.texParam); + + lastTexKey = this->_softRender->polyTexKeys[0]; + sampler.setup(lastTexKey, firstPoly.texParam); //iterate over polys for (size_t i = 0; i < polyCount; i++) @@ -1035,13 +1034,15 @@ public: if (lastTexParams != thePoly.texParam || lastTexPalette != thePoly.texPalette) { - sampler.setup(thePoly.texParam); lastTexParams = thePoly.texParam; lastTexPalette = thePoly.texPalette; + + lastTexKey = this->_softRender->polyTexKeys[i]; + sampler.setup(lastTexKey, thePoly.texParam); + lastTexKey->ResetCacheAge(); + lastTexKey->IncreaseCacheUsageCount(1); } - lastTexKey = this->_softRender->polyTexKeys[i]; - for (int j = 0; j < type; j++) this->verts[j] = &clippedPoly.clipVerts[j]; for (int j = type; j < MAX_CLIPPED_VERTS; j++) @@ -1147,10 +1148,58 @@ static void SoftRasterizerRendererDestroy() } } -void SoftRasterizerTextureDeleteCallback(TexCacheItem *texItem, void *param1, void *param2) +SoftRasterizerTexture::SoftRasterizerTexture(u32 texAttributes, u32 palAttributes) : TextureStore(texAttributes, palAttributes) { - free_aligned(texItem->unpackData); - texCache.cache_size -= texItem->unpackSize; + _cacheSize = GetUnpackSizeUsingFormat(TexFormat_15bpp); + _unpackData = (u32 *)malloc_alignedCacheLine(_cacheSize); + _renderWidth = _sizeS; + _renderHeight = _sizeT; + _renderWidthMask = _renderWidth - 1; + _renderHeightMask = _renderHeight - 1; + + _renderWidthShift = 0; + + u32 tempWidth = _renderWidth; + while ( (tempWidth & 1) == 0) + { + tempWidth >>= 1; + _renderWidthShift++; + } +} + +SoftRasterizerTexture::~SoftRasterizerTexture() +{ + free_aligned(this->_unpackData); +} + +u32* SoftRasterizerTexture::GetUnpackData() +{ + return this->_unpackData; +} + +u32 SoftRasterizerTexture::GetRenderWidth() const +{ + return 
this->_renderWidth; +} + +u32 SoftRasterizerTexture::GetRenderHeight() const +{ + return this->_renderHeight; +} + +u32 SoftRasterizerTexture::GetRenderWidthMask() const +{ + return this->_renderWidthMask; +} + +u32 SoftRasterizerTexture::GetRenderHeightMask() const +{ + return this->_renderHeightMask; +} + +u32 SoftRasterizerTexture::GetRenderWidthShift() const +{ + return this->_renderWidthShift; } GPU3DInterface gpu3DRasterize = { @@ -1380,19 +1429,16 @@ void SoftRasterizerRenderer::setupTextures() u32 lastTexParams = firstPoly.texParam; u32 lastTexPalette = firstPoly.texPalette; - TexCacheItem *lastTexItem = texCache.GetTexture(firstPoly.texParam, firstPoly.texPalette); - if (lastTexItem->unpackFormat != TexFormat_15bpp) + SoftRasterizerTexture *lastTexItem = (SoftRasterizerTexture *)texCache.GetTexture(firstPoly.texParam, firstPoly.texPalette); + if (lastTexItem == NULL) { - const bool isNewTexture = (lastTexItem->GetDeleteCallback() == NULL); - if (isNewTexture) - { - lastTexItem->SetDeleteCallback(&SoftRasterizerTextureDeleteCallback, this, NULL); - lastTexItem->unpackSize = lastTexItem->GetUnpackSizeUsingFormat(TexFormat_15bpp); - lastTexItem->unpackData = (u32 *)malloc_alignedCacheLine(lastTexItem->unpackSize); - texCache.cache_size += lastTexItem->unpackSize; - } - - lastTexItem->Unpack(lastTexItem->unpackData); + lastTexItem = new SoftRasterizerTexture(firstPoly.texParam, firstPoly.texPalette); + texCache.Add(lastTexItem); + } + + if (lastTexItem->IsLoadNeeded()) + { + lastTexItem->Unpack(lastTexItem->GetUnpackData()); } for (size_t i = 0; i < this->_clippedPolyCount; i++) @@ -1406,19 +1452,16 @@ void SoftRasterizerRenderer::setupTextures() //and then it won't be safe. 
if (lastTexParams != thePoly.texParam || lastTexPalette != thePoly.texPalette) { - lastTexItem = texCache.GetTexture(thePoly.texParam, thePoly.texPalette); - if (lastTexItem->unpackFormat != TexFormat_15bpp) + lastTexItem = (SoftRasterizerTexture *)texCache.GetTexture(thePoly.texParam, thePoly.texPalette); + if (lastTexItem == NULL) { - const bool isNewTexture = (lastTexItem->GetDeleteCallback() == NULL); - if (isNewTexture) - { - lastTexItem->SetDeleteCallback(&SoftRasterizerTextureDeleteCallback, this, NULL); - lastTexItem->unpackSize = lastTexItem->GetUnpackSizeUsingFormat(TexFormat_15bpp); - lastTexItem->unpackData = (u32 *)malloc_alignedCacheLine(lastTexItem->unpackSize); - texCache.cache_size += lastTexItem->unpackSize; - } - - lastTexItem->Unpack(lastTexItem->unpackData); + lastTexItem = new SoftRasterizerTexture(thePoly.texParam, thePoly.texPalette); + texCache.Add(lastTexItem); + } + + if (lastTexItem->IsLoadNeeded()) + { + lastTexItem->Unpack(lastTexItem->GetUnpackData()); } lastTexParams = thePoly.texParam; @@ -1571,7 +1614,7 @@ Render3DError SoftRasterizerRenderer::RenderGeometry(const GFX3D_State &renderSt { rasterizerUnit[0].mainLoop(); this->_renderGeometryNeedsFinish = false; - texCache.Evict(TEXCACHE_MAX_SIZE); // Since we're finishing geometry rendering here and now, also check the texture cache now. + texCache.Evict(); // Since we're finishing geometry rendering here and now, also check the texture cache now. } // printf("rendered %d of %d polys after backface culling\n",gfx3d.polylist->count-culled,gfx3d.polylist->count); @@ -1981,7 +2024,7 @@ Render3DError SoftRasterizerRenderer::RenderFinish() } // Now that geometry rendering is finished on all threads, check the texture cache. - texCache.Evict(TEXCACHE_MAX_SIZE); + texCache.Evict(); // Do multithreaded post-processing. 
if (this->currentRenderState->enableEdgeMarking || this->currentRenderState->enableFog) diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index c56129626..1582c2a27 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -20,6 +20,7 @@ #include "render3D.h" #include "gfx3d.h" +#include "texcache.h" #define SOFTRASTERIZER_DEPTH_EQUAL_TEST_TOLERANCE 0x200 @@ -39,6 +40,28 @@ struct SoftRasterizerPostProcessParams bool fogAlphaOnly; }; +class SoftRasterizerTexture : public TextureStore +{ +protected: + u32 *_unpackData; + u32 _renderWidth; + u32 _renderHeight; + u32 _renderWidthMask; + u32 _renderHeightMask; + u32 _renderWidthShift; + +public: + SoftRasterizerTexture(u32 texAttributes, u32 palAttributes); + virtual ~SoftRasterizerTexture(); + + u32* GetUnpackData(); + u32 GetRenderWidth() const; + u32 GetRenderHeight() const; + u32 GetRenderWidthMask() const; + u32 GetRenderHeightMask() const; + u32 GetRenderWidthShift() const; +}; + #if defined(ENABLE_SSE2) class SoftRasterizerRenderer : public Render3D_SSE2 #else @@ -75,7 +98,7 @@ public: FragmentColor toonColor32LUT[32]; GFX3D_Clipper::TClippedPoly *clippedPolys; FragmentAttributesBuffer *_framebufferAttributes; - TexCacheItem *polyTexKeys[POLYLIST_SIZE]; + SoftRasterizerTexture *polyTexKeys[POLYLIST_SIZE]; bool polyVisible[POLYLIST_SIZE]; bool polyBackfacing[POLYLIST_SIZE]; GFX3D_State *currentRenderState; diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index a8fe7cad0..30e76cbcb 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -195,124 +195,570 @@ static MemSpan MemSpan_TexPalette(u32 ofs, u32 len, bool silent) return ret; } -TexCache texCache; - -TexCache::TexCache() +static bool TextureLRUCompare(TextureStore *tex1, TextureStore *tex2) { - cacheTable.clear(); - cache_size = 0; - memset(paletteDump, 0, sizeof(paletteDump)); + const size_t cacheAge1 = tex1->GetCacheAge(); + const size_t cacheAge2 = tex2->GetCacheAge(); + + if (cacheAge1 == 
cacheAge2) + { + return ( tex1->GetCacheUseCount() > tex2->GetCacheUseCount() ); + } + + return (cacheAge1 < cacheAge2); } -void TexCache::Invalidate() +TextureCache texCache; + +TextureCache::TextureCache() +{ + _texCacheMap.clear(); + _texCacheList.reserve(4096); + _actualCacheSize = 0; + _cacheSizeThreshold = TEXCACHE_DEFAULT_THRESHOLD; + memset(_paletteDump, 0, sizeof(_paletteDump)); +} + +size_t TextureCache::GetActualCacheSize() const +{ + return this->_actualCacheSize; +} + +size_t TextureCache::GetCacheSizeThreshold() const +{ + return this->_cacheSizeThreshold; +} + +void TextureCache::SetCacheSizeThreshold(size_t newThreshold) +{ + this->_cacheSizeThreshold = newThreshold; +} + +void TextureCache::Invalidate() { //check whether the palette memory changed //TODO - we should handle this instead by setting dirty flags in the vram memory mapping and noting whether palette memory was dirty. //but this will work for now MemSpan mspal = MemSpan_TexPalette(0, PALETTE_DUMP_SIZE, true); - bool paletteDirty = mspal.memcmp(this->paletteDump); + const bool paletteDirty = mspal.memcmp(this->_paletteDump); if (paletteDirty) { - mspal.dump(this->paletteDump); + mspal.dump(this->_paletteDump); } - for (TexCacheTable::iterator it(this->cacheTable.begin()); it != this->cacheTable.end(); ++it) + for (TextureCacheMap::iterator it(this->_texCacheMap.begin()); it != this->_texCacheMap.end(); ++it) { - it->second->suspectedInvalid = true; + it->second->SetSuspectedInvalid(); //when the palette changes, we assume all 4x4 textures are dirty. 
//this is because each 4x4 item doesnt carry along with it a copy of the entire palette, for verification //instead, we just use the one paletteDump for verifying of all 4x4 textures; and if paletteDirty is set, verification has failed - if( (it->second->GetTextureFormat() == TEXMODE_4X4) && paletteDirty ) + if( (it->second->GetPackFormat() == TEXMODE_4X4) && paletteDirty ) { - it->second->assumedInvalid = true; + it->second->SetAssumedInvalid(); } } } -void TexCache::Evict(size_t target) +void TextureCache::Evict() { //debug print //printf("%d %d/%d\n",index.size(),cache_size/1024,target/1024); //dont do anything unless we're over the target - if (cache_size < target) return; + if (this->_actualCacheSize <= this->_cacheSizeThreshold) + { + for (size_t i = 0; i < this->_texCacheList.size(); i++) + { + this->_texCacheList[i]->IncreaseCacheAge(1); + } + + return; + } //aim at cutting the cache to half of the max size - target /= 2; + size_t targetCacheSize = this->_cacheSizeThreshold / 2; - //evicts items in an arbitrary order until it is less than the max cache size - //TODO - do this based on age and not arbitrarily - while (this->cache_size > target) + // Sort the textures in cache by age and usage count. Textures that we want to keep in + // cache are placed in the front of the list, while textures we want to evict are sorted + // to the back of the list. + std::sort(this->_texCacheList.begin(), this->_texCacheList.end(), &TextureLRUCompare); + + while (this->_actualCacheSize > targetCacheSize) { - if (this->cacheTable.size() == 0) break; //just in case.. doesnt seem possible, cache_size wouldve been 0 + if (this->_texCacheMap.size() == 0) break; //just in case.. 
doesnt seem possible, cache_size wouldve been 0 - TexCacheItem *item = this->cacheTable.begin()->second; - const TexCacheKey key = TexCache::GenerateKey(item->textureAttributes, item->paletteAttributes); - this->cacheTable.erase(key); + TextureStore *item = this->_texCacheList.back(); + this->Remove(item); + this->_texCacheList.pop_back(); //printf("evicting! totalsize:%d\n",cache_size); delete item; } + + for (size_t i = 0; i < this->_texCacheList.size(); i++) + { + this->_texCacheList[i]->IncreaseCacheAge(1); + } } -void TexCache::Reset() +void TextureCache::Reset() { - for (TexCacheTable::iterator it(this->cacheTable.begin()); it != this->cacheTable.end(); ++it) + for (size_t i = 0; i < this->_texCacheList.size(); i++) { - TexCacheItem *item = it->second; - delete item; + delete this->_texCacheList[i]; } - this->cacheTable.clear(); - this->cache_size = 0; - memset(this->paletteDump, 0, sizeof(paletteDump)); + this->_texCacheMap.clear(); + this->_texCacheList.clear(); + this->_actualCacheSize = 0; + memset(this->_paletteDump, 0, sizeof(this->_paletteDump)); } -TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) +TextureStore* TextureCache::GetTexture(u32 texAttributes, u32 palAttributes) { - TexCacheItem *theTexture = NULL; - bool didCreateNewTexture = false; - bool needLoadTexData = false; - bool needLoadPalette = false; + TextureStore *theTexture = NULL; + const TextureCacheKey key = TextureCache::GenerateKey(texAttributes, palAttributes); + const TextureCacheMap::iterator cachedTexture = this->_texCacheMap.find(key); - //conditions where we reject matches: - //when the teximage or texpal params dont match - //(this is our key for identifying textures in the cache) - const TexCacheKey key = TexCache::GenerateKey(texAttributes, palAttributes); - const TexCacheTable::iterator cachedTexture = this->cacheTable.find(key); - - if (cachedTexture == this->cacheTable.end()) + if (cachedTexture == this->_texCacheMap.end()) { - theTexture = new 
TexCacheItem(texAttributes, palAttributes); - didCreateNewTexture = true; - needLoadTexData = true; - needLoadPalette = true; + return theTexture; } else { theTexture = cachedTexture->second; - //if the texture is assumed invalid, reject it - if (theTexture->assumedInvalid) + if (theTexture->IsAssumedInvalid()) { - needLoadTexData = true; - needLoadPalette = true; + theTexture->Update(); } - - //the texture matches params, but isnt suspected invalid. accept it. - if (!theTexture->suspectedInvalid) + else if (theTexture->IsSuspectedInvalid()) { - return theTexture; + theTexture->VRAMCompareAndUpdate(); } } - //we suspect the texture may be invalid. we need to do a byte-for-byte comparison to re-establish that it is valid: + return theTexture; +} + +void TextureCache::Add(TextureStore *texItem) +{ + const TextureCacheKey key = texItem->GetCacheKey(); + this->_texCacheMap[key] = texItem; + this->_texCacheList.push_back(texItem); + this->_actualCacheSize += texItem->GetCacheSize(); + //printf("allocating: up to %d with %d items\n", this->cache_size, this->cacheTable.size()); +} + +void TextureCache::Remove(TextureStore *texItem) +{ + const TextureCacheKey key = texItem->GetCacheKey(); + this->_texCacheMap.erase(key); + this->_actualCacheSize -= texItem->GetCacheSize(); +} + +TextureCacheKey TextureCache::GenerateKey(const u32 texAttributes, const u32 palAttributes) +{ + // Since the repeat, flip, and coordinate transformation modes are render settings + // and not data settings, we can mask out those bits to help reduce duplicate entries. 
+ return (TextureCacheKey)( ((u64)palAttributes << 32) | (u64)(texAttributes & 0x3FF0FFFF) ); +} + +TextureStore::TextureStore() +{ + _textureAttributes = 0; + _paletteAttributes = 0; + _cacheKey = 0; + + _sizeS = 0; + _sizeT = 0; + _isPalZeroTransparent = false; + + _packFormat = TEXMODE_NONE; + _packAddress = 0; + _packSize = 0; + _packData = NULL; + + _paletteAddress = 0; + _paletteSize = 0; + _paletteColorTable = NULL; + + _packIndexAddress = 0; + _packIndexSize = 0; + _packIndexData = NULL; + _packSizeFirstSlot = 0; + + _suspectedInvalid = false; + _assumedInvalid = false; + _isLoadNeeded = false; + + _cacheSize = 0; + _cacheAge = 0; + _cacheUsageCount = 0; +} + +TextureStore::TextureStore(const u32 texAttributes, const u32 palAttributes) +{ + //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) + static const u32 texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; + + //for each texformat, number of palette entries + static const u32 paletteSizeList[] = {0, 32, 4, 16, 256, 0, 8, 0}; + + _textureAttributes = texAttributes; + _paletteAttributes = palAttributes; + _cacheKey = TextureCache::GenerateKey(texAttributes, palAttributes); + + _sizeS = (8 << ((texAttributes >> 20) & 0x07)); + _sizeT = (8 << ((texAttributes >> 23) & 0x07)); + + _packFormat = (NDSTextureFormat)((texAttributes >> 26) & 0x07); + _packAddress = (texAttributes & 0xFFFF) << 3; + _packSize = (_sizeS * _sizeT * texSizes[_packFormat]) >> 2; //shifted because the texSizes multiplier is fixed point + + if ( (_packFormat == TEXMODE_I2) || (_packFormat == TEXMODE_I4) || (_packFormat == TEXMODE_I8) ) + { + _isPalZeroTransparent = ( ((texAttributes >> 29) & 1) != 0 ); + } + else + { + _isPalZeroTransparent = false; + } + + _paletteAddress = (_packFormat == TEXMODE_I2) ? palAttributes << 3 : palAttributes << 4; + _paletteSize = paletteSizeList[_packFormat] * sizeof(u16); + + if (_packFormat == TEXMODE_4X4) + { + const u32 indexBase = ((texAttributes & 0xC000) == 0x8000) ? 
0x30000 : 0x20000; + const u32 indexOffset = (texAttributes & 0x3FFF) << 2; + _packIndexAddress = indexBase + indexOffset; + _packIndexSize = (_sizeS * _sizeT) >> 3; + + _packData = (u8 *)malloc_alignedCacheLine(_packSize + _packIndexSize + _paletteSize); + _packIndexData = _packData + _packSize; + _paletteColorTable = (u16 *)(_packData + _packSize + _packIndexSize); + + MemSpan currentPackedTexIndexMS = MemSpan_TexMem(_packIndexAddress, _packIndexSize); + currentPackedTexIndexMS.dump(_packIndexData, _packIndexSize); + } + else + { + _packIndexAddress = 0; + _packIndexSize = 0; + _packIndexData = NULL; + + _packData = (u8 *)malloc_alignedCacheLine(_packSize + _paletteSize); + _packIndexData = NULL; + _paletteColorTable = (u16 *)(_packData + _packSize); + } + + if (_paletteSize > 0) + { + MemSpan currentPaletteMS = MemSpan_TexPalette(_paletteAddress, _paletteSize, false); + +#ifdef WORDS_BIGENDIAN + currentPaletteMS.dump16(_paletteColorTable); +#else + currentPaletteMS.dump(_paletteColorTable); +#endif + } + else + { + _paletteColorTable = NULL; + } + + MemSpan currentPackedTexDataMS = MemSpan_TexMem(_packAddress, _packSize); + currentPackedTexDataMS.dump(_packData); + _packSizeFirstSlot = currentPackedTexDataMS.items[0].len; + + _suspectedInvalid = false; + _assumedInvalid = false; + _isLoadNeeded = true; + + _cacheSize = _packSize + _paletteSize + _packIndexSize; + _cacheAge = 0; + _cacheUsageCount = 0; +} + +TextureStore::~TextureStore() +{ + free_aligned(this->_packData); +} + +u32 TextureStore::GetTextureAttributes() const +{ + return this->_textureAttributes; +} + +u32 TextureStore::GetPaletteAttributes() const +{ + return this->_paletteAttributes; +} + +u32 TextureStore::GetWidth() const +{ + return this->_sizeS; +} + +u32 TextureStore::GetHeight() const +{ + return this->_sizeT; +} + +bool TextureStore::IsPalZeroTransparent() const +{ + return this->_isPalZeroTransparent; +} + +NDSTextureFormat TextureStore::GetPackFormat() const +{ + return 
this->_packFormat; +} + +u32 TextureStore::GetPackAddress() const +{ + return this->_packAddress; +} + +u32 TextureStore::GetPackSize() const +{ + return this->_packSize; +} + +u8* TextureStore::GetPackData() +{ + return this->_packData; +} + +u32 TextureStore::GetPaletteAddress() const +{ + return this->_paletteAddress; +} + +u32 TextureStore::GetPaletteSize() const +{ + return this->_paletteSize; +} + +u16* TextureStore::GetPaletteColorTable() const +{ + return this->_paletteColorTable; +} + +u32 TextureStore::GetPackIndexAddress() const +{ + return this->_packIndexAddress; +} + +u32 TextureStore::GetPackIndexSize() const +{ + return this->_packIndexSize; +} + +u8* TextureStore::GetPackIndexData() +{ + return this->_packIndexData; +} + +void TextureStore::SetTextureData(const MemSpan &packedData, const MemSpan &packedIndexData) +{ + //dump texture and 4x4 index data for cache keying + this->_packSizeFirstSlot = packedData.items[0].len; + + packedData.dump(this->_packData); + + if (this->_packFormat == TEXMODE_4X4) + { + packedIndexData.dump(this->_packIndexData, this->_packIndexSize); + } +} + +void TextureStore::SetTexturePalette(const MemSpan &packedPalette) +{ + if (this->_paletteSize > 0) + { +#ifdef WORDS_BIGENDIAN + packedPalette.dump16(this->_paletteColorTable); +#else + packedPalette.dump(this->_paletteColorTable); +#endif + } +} + +void TextureStore::SetTexturePalette(const u16 *paletteBuffer) +{ + if (this->_paletteSize > 0) + { + memcpy(this->_paletteColorTable, paletteBuffer, this->_paletteSize); + } +} + +size_t TextureStore::GetUnpackSizeUsingFormat(const TextureStoreUnpackFormat texCacheFormat) const +{ + return (this->_sizeS * this->_sizeT * sizeof(u32)); +} + +template +void TextureStore::Unpack(u32 *unpackBuffer) +{ + // Whenever a 1-bit alpha or no-alpha texture is unpacked (this means any texture + // format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D + // renderers can assume that the transparent color is 0 during 
texture sampling. + + switch (this->_packFormat) + { + case TEXMODE_A3I5: + NDSTextureUnpackA3I5(this->_packSize, this->_packData, this->_paletteColorTable, unpackBuffer); + break; + + case TEXMODE_I2: + NDSTextureUnpackI2(this->_packSize, this->_packData, this->_paletteColorTable, this->_isPalZeroTransparent, unpackBuffer); + break; + + case TEXMODE_I4: + NDSTextureUnpackI4(this->_packSize, this->_packData, this->_paletteColorTable, this->_isPalZeroTransparent, unpackBuffer); + break; + + case TEXMODE_I8: + NDSTextureUnpackI8(this->_packSize, this->_packData, this->_paletteColorTable, this->_isPalZeroTransparent, unpackBuffer); + break; + + case TEXMODE_4X4: + { + if (this->_packSize > this->_packSizeFirstSlot) + { + PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); + } + + NDSTextureUnpack4x4(this->_packSizeFirstSlot, (u32 *)this->_packData, (u16 *)this->_packIndexData, this->_paletteAddress, this->_textureAttributes, this->_sizeS, this->_sizeT, unpackBuffer); + break; + } + + case TEXMODE_A5I3: + NDSTextureUnpackA5I3(this->_packSize, this->_packData, this->_paletteColorTable, unpackBuffer); + break; + + case TEXMODE_16BPP: + NDSTextureUnpackDirect16Bit(this->_packSize, (u16 *)this->_packData, unpackBuffer); + break; + + default: + break; + } + +#ifdef DO_DEBUG_DUMP_TEXTURE + this->DebugDump(); +#endif + + this->_isLoadNeeded = false; +} + +bool TextureStore::IsSuspectedInvalid() const +{ + return this->_suspectedInvalid; +} + +void TextureStore::SetSuspectedInvalid() +{ + this->_suspectedInvalid = true; +} + +bool TextureStore::IsAssumedInvalid() const +{ + return this->_assumedInvalid; +} + +void TextureStore::SetAssumedInvalid() +{ + this->_assumedInvalid = true; +} + +void TextureStore::SetLoadNeeded() +{ + this->_isLoadNeeded = true; +} + +bool TextureStore::IsLoadNeeded() const +{ + return this->_isLoadNeeded; +} + +TextureCacheKey TextureStore::GetCacheKey() const +{ + return this->_cacheKey; +} + +size_t TextureStore::GetCacheSize() const +{ + 
return this->_cacheSize; +} + +void TextureStore::SetCacheSize(size_t cacheSize) +{ + this->_cacheSize = cacheSize; +} + +size_t TextureStore::GetCacheAge() const +{ + return this->_cacheAge; +} + +void TextureStore::IncreaseCacheAge(const size_t ageAmount) +{ + this->_cacheAge += ageAmount; +} + +void TextureStore::ResetCacheAge() +{ + this->_cacheAge = 0; +} + +size_t TextureStore::GetCacheUseCount() const +{ + return this->_cacheUsageCount; +} + +void TextureStore::IncreaseCacheUsageCount(const size_t usageCount) +{ + this->_cacheUsageCount += usageCount; +} + +void TextureStore::ResetCacheUsageCount() +{ + this->_cacheUsageCount = 0; +} + +void TextureStore::Update() +{ + MemSpan currentPaletteMS = MemSpan_TexPalette(this->_paletteAddress, this->_paletteSize, false); + MemSpan currentPackedTexDataMS = MemSpan_TexMem(this->_packAddress, this->_packSize); + + MemSpan currentPackedTexIndexMS; + if (this->_packFormat == TEXMODE_4X4) + { + //determine the location for 4x4 index data + currentPackedTexIndexMS = MemSpan_TexMem(this->_packIndexAddress, this->_packIndexSize); + } + + this->SetTextureData(currentPackedTexDataMS, currentPackedTexIndexMS); + this->SetTexturePalette(currentPaletteMS); + + this->_assumedInvalid = false; + this->_suspectedInvalid = false; + this->_isLoadNeeded = true; +} + +void TextureStore::VRAMCompareAndUpdate() +{ + bool needUpdateTexData = false; + bool needUpdatePalette = false; //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. //this isnt such a problem with texture memory, because we read sequentially from it. //however, we read randomly from palette memory, so the mapping is more costly. 
- MemSpan currentPaletteMS = MemSpan_TexPalette(theTexture->paletteAddress, theTexture->paletteSize, false); + MemSpan currentPaletteMS = MemSpan_TexPalette(this->_paletteAddress, this->_paletteSize, false); CACHE_ALIGN u16 currentPalette[256]; #ifdef WORDS_BIGENDIAN @@ -325,283 +771,52 @@ TexCacheItem* TexCache::GetTexture(u32 texAttributes, u32 palAttributes) //note that we are considering 4x4 textures to have a palette size of 0. //they really have a potentially HUGE palette, too big for us to handle like a normal palette, //so they go through a different system - if ( !didCreateNewTexture && (theTexture->paletteSize > 0) && memcmp(theTexture->paletteColorTable, currentPalette, theTexture->paletteSize) ) + if ( (this->_paletteSize > 0) && memcmp(this->_paletteColorTable, currentPalette, this->_paletteSize) ) { - needLoadPalette = true; + needUpdatePalette = true; } //analyze the texture memory mapping and the specifications of this texture - MemSpan currentPackedTexDataMS = MemSpan_TexMem(theTexture->packAddress, theTexture->packSize); + MemSpan currentPackedTexDataMS = MemSpan_TexMem(this->_packAddress, this->_packSize); //when the texture data doesn't match - if ( !didCreateNewTexture && (theTexture->packSize > 0) && currentPackedTexDataMS.memcmp(theTexture->packData, theTexture->packSize) ) + if ( (this->_packSize > 0) && currentPackedTexDataMS.memcmp(this->_packData, this->_packSize) ) { - needLoadTexData = true; + needUpdateTexData = true; } //if the texture is 4x4 then the index data must match MemSpan currentPackedTexIndexMS; - if (theTexture->packFormat == TEXMODE_4X4) + if (this->GetPackFormat() == TEXMODE_4X4) { //determine the location for 4x4 index data - currentPackedTexIndexMS = MemSpan_TexMem(theTexture->packIndexAddress, theTexture->packIndexSize); + currentPackedTexIndexMS = MemSpan_TexMem(this->_packIndexAddress, this->_packIndexSize); - if ( !didCreateNewTexture && (theTexture->packIndexSize > 0) && 
currentPackedTexIndexMS.memcmp(theTexture->packIndexData, theTexture->packIndexSize) ) + if ( (this->_packIndexSize > 0) && currentPackedTexIndexMS.memcmp(this->_packIndexData, this->_packIndexSize) ) { - needLoadTexData = true; - needLoadPalette = true; + needUpdateTexData = true; + needUpdatePalette = true; } } - if (!needLoadTexData && !needLoadPalette) + if (needUpdateTexData) { - //we found a match. just return it - theTexture->suspectedInvalid = false; - return theTexture; + this->SetTextureData(currentPackedTexDataMS, currentPackedTexIndexMS); + this->_isLoadNeeded = true; } - if (needLoadTexData) + if (needUpdatePalette) { - theTexture->SetTextureData(currentPackedTexDataMS, currentPackedTexIndexMS); - theTexture->unpackFormat = TexFormat_None; + this->SetTexturePalette(currentPalette); + this->_isLoadNeeded = true; } - if (needLoadPalette) - { - theTexture->SetTexturePalette(currentPalette); - theTexture->unpackFormat = TexFormat_None; - } - - if (didCreateNewTexture) - { - this->cacheTable[key] = theTexture; - //printf("allocating: up to %d with %d items\n",cache_size,index.size()); - } - - theTexture->assumedInvalid = false; - theTexture->suspectedInvalid = false; - return theTexture; -} - -TexCacheKey TexCache::GenerateKey(const u32 texAttributes, const u32 palAttributes) -{ - // Since the repeat, flip, and coordinate transformation modes are render settings - // and not data settings, we can mask out those bits to help reduce duplicate entries. 
- return (TexCacheKey)( ((u64)palAttributes << 32) | (u64)(texAttributes & 0x3FF0FFFF) ); -} - -TexCacheItem::TexCacheItem() -{ - _deleteCallback = NULL; - _deleteCallbackParam1 = NULL; - _deleteCallbackParam2 = NULL; - - textureAttributes = 0; - paletteAttributes = 0; - - sizeX = 0; - sizeY = 0; - invSizeX = 0.0f; - invSizeY = 0.0f; - isPalZeroTransparent = false; - - suspectedInvalid = false; - assumedInvalid = false; - - packFormat = TEXMODE_NONE; - packAddress = 0; - packSize = 0; - packData = NULL; - - paletteAddress = 0; - paletteSize = 0; - paletteColorTable = NULL; - - unpackFormat = TexFormat_None; - unpackSize = 0; - unpackData = NULL; - - packIndexAddress = 0; - packIndexSize = 0; - packIndexData = NULL; - packSizeFirstSlot = 0; - - texid = 0; -} - -TexCacheItem::TexCacheItem(const u32 texAttributes, const u32 palAttributes) -{ - //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) - static const u32 texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; - - //for each texformat, number of palette entries - static const u32 paletteSizeList[] = {0, 32, 4, 16, 256, 0, 8, 0}; - - _deleteCallback = NULL; - _deleteCallbackParam1 = NULL; - _deleteCallbackParam2 = NULL; - - texid = 0; - - textureAttributes = texAttributes; - paletteAttributes = palAttributes; - - sizeX = (8 << ((texAttributes >> 20) & 0x07)); - sizeY = (8 << ((texAttributes >> 23) & 0x07)); - invSizeX = 1.0f / (float)sizeX; - invSizeY = 1.0f / (float)sizeY; - - packFormat = (NDSTextureFormat)((texAttributes >> 26) & 0x07); - packAddress = (texAttributes & 0xFFFF) << 3; - packSize = (sizeX*sizeY*texSizes[packFormat]) >> 2; //shifted because the texSizes multiplier is fixed point - packData = (u8 *)malloc_alignedCacheLine(packSize); - - if ( (packFormat == TEXMODE_I2) || (packFormat == TEXMODE_I4) || (packFormat == TEXMODE_I8) ) - { - isPalZeroTransparent = ( ((texAttributes >> 29) & 1) != 0 ); - } - else - { - isPalZeroTransparent = false; - } - - paletteAddress = (packFormat == 
TEXMODE_I2) ? palAttributes << 3 : palAttributes << 4; - paletteSize = paletteSizeList[packFormat] * sizeof(u16); - paletteColorTable = (paletteSize > 0) ? (u16 *)malloc_alignedCacheLine(paletteSize) : NULL; - - unpackFormat = TexFormat_None; - unpackSize = 0; - unpackData = NULL; - - if (packFormat == TEXMODE_4X4) - { - const u32 indexBase = ((texAttributes & 0xC000) == 0x8000) ? 0x30000 : 0x20000; - const u32 indexOffset = (texAttributes & 0x3FFF) << 2; - packIndexAddress = indexBase + indexOffset; - packIndexSize = (sizeX * sizeY) >> 3; - packIndexData = (u8 *)malloc_alignedCacheLine(packIndexSize); - packSizeFirstSlot = 0; - } - else - { - packIndexAddress = 0; - packIndexSize = 0; - packIndexData = NULL; - packSizeFirstSlot = 0; - } - - suspectedInvalid = true; - assumedInvalid = true; -} - -TexCacheItem::~TexCacheItem() -{ - free_aligned(this->packData); - free_aligned(this->paletteColorTable); - free_aligned(this->packIndexData); - if (this->_deleteCallback != NULL) this->_deleteCallback(this, this->_deleteCallbackParam1, this->_deleteCallbackParam2); -} - -TexCacheItemDeleteCallback TexCacheItem::GetDeleteCallback() const -{ - return this->_deleteCallback; -} - -void TexCacheItem::SetDeleteCallback(TexCacheItemDeleteCallback callbackFunc, void *inParam1, void *inParam2) -{ - this->_deleteCallback = callbackFunc; - this->_deleteCallbackParam1 = inParam1; - this->_deleteCallbackParam2 = inParam2; -} - -NDSTextureFormat TexCacheItem::GetTextureFormat() const -{ - return this->packFormat; -} - -void TexCacheItem::SetTextureData(const MemSpan &packedData, const MemSpan &packedIndexData) -{ - //dump texture and 4x4 index data for cache keying - this->packSizeFirstSlot = packedData.items[0].len; - - packedData.dump(this->packData); - - if (this->packFormat == TEXMODE_4X4) - { - packedIndexData.dump(this->packIndexData, this->packIndexSize); - } -} - -void TexCacheItem::SetTexturePalette(const u16 *paletteBuffer) -{ - if (this->paletteSize > 0) - { - 
memcpy(this->paletteColorTable, paletteBuffer, this->paletteSize); - } -} - -size_t TexCacheItem::GetUnpackSizeUsingFormat(const TexCache_TexFormat texCacheFormat) const -{ - return (this->sizeX * this->sizeY * sizeof(u32)); -} - -template -void TexCacheItem::Unpack(u32 *unpackBuffer) -{ - this->unpackFormat = TEXCACHEFORMAT; - - // Whenever a 1-bit alpha or no-alpha texture is unpacked (this means any texture - // format that is not A3I5 or A5I3), set all transparent pixels to 0 so that 3D - // renderers can assume that the transparent color is 0 during texture sampling. - - switch (this->packFormat) - { - case TEXMODE_A3I5: - NDSTextureUnpackA3I5(this->packSize, this->packData, this->paletteColorTable, unpackBuffer); - break; - - case TEXMODE_I2: - NDSTextureUnpackI2(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, unpackBuffer); - break; - - case TEXMODE_I4: - NDSTextureUnpackI4(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, unpackBuffer); - break; - - case TEXMODE_I8: - NDSTextureUnpackI8(this->packSize, this->packData, this->paletteColorTable, this->isPalZeroTransparent, unpackBuffer); - break; - - case TEXMODE_4X4: - { - if (this->packSize > this->packSizeFirstSlot) - { - PROGINFO("Your 4x4 texture has overrun its texture slot.\n"); - } - - NDSTextureUnpack4x4(this->packSizeFirstSlot, (u32 *)this->packData, (u16 *)this->packIndexData, this->paletteAddress, this->textureAttributes, this->sizeX, this->sizeY, unpackBuffer); - break; - } - - case TEXMODE_A5I3: - NDSTextureUnpackA5I3(this->packSize, this->packData, this->paletteColorTable, unpackBuffer); - break; - - case TEXMODE_16BPP: - NDSTextureUnpackDirect16Bit(this->packSize, (u16 *)this->packData, unpackBuffer); - break; - - default: - break; - } - -#ifdef DO_DEBUG_DUMP_TEXTURE - this->DebugDump(); -#endif + this->_assumedInvalid = false; + this->_suspectedInvalid = false; } #ifdef DO_DEBUG_DUMP_TEXTURE -void TexCacheItem::DebugDump() 
+void TextureStore::DebugDump() { static int ctr=0; char fname[100]; @@ -612,7 +827,7 @@ void TexCacheItem::DebugDump() } #endif -template +template void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { #ifdef ENABLE_SSSE3 @@ -727,7 +942,7 @@ void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, cons } } -template +template void NDSTextureUnpackI4(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { #ifdef ENABLE_SSSE3 @@ -845,7 +1060,7 @@ void NDSTextureUnpackI4(const size_t srcSize, const u8 *__restrict srcData, cons } } -template +template void NDSTextureUnpackI8(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer) { if (isPalZeroTransparent) @@ -865,7 +1080,7 @@ void NDSTextureUnpackI8(const size_t srcSize, const u8 *__restrict srcData, cons } } -template +template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer) { for (size_t i = 0; i < srcSize; i++, srcData++) @@ -876,7 +1091,7 @@ void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, co } } -template +template void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer) { #ifdef ENABLE_SSSE3 @@ -942,7 +1157,7 @@ void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, co #define PAL4X4(offset) ( LE_TO_LOCAL_16( *(u16*)( MMU.texInfo.texPalSlot[((palAddress + (offset)*2)>>14)&0x7] + ((palAddress + (offset)*2)&0x3FFF) ) ) & 0x7FFF ) -template +template void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, const u16 *__restrict srcIndex, const u32 palAddress, const u32 
texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer) { const u32 limit = srcSize * sizeof(u32); @@ -976,7 +1191,7 @@ void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, co const u16 pal1 = LE_TO_LOCAL_16(srcIndex[d]); const u16 pal1offset = (pal1 & 0x3FFF)<<1; const u8 mode = pal1>>14; - u32 tmp_col[4]; + CACHE_ALIGN u32 tmp_col[4]; tmp_col[0] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset) ); tmp_col[1] = COLOR555TO8888_OPAQUE( PAL4X4(pal1offset+1) ); @@ -1041,20 +1256,10 @@ void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, co if (TEXCACHEFORMAT == TexFormat_15bpp) { - for (size_t i = 0; i < 4; i++) - { -#ifdef LOCAL_BE - const u32 a = (tmp_col[i] >> 3) & 0x0000001F; - tmp_col[i] >>= 2; - tmp_col[i] &= 0x3F3F3F00; - tmp_col[i] |= a; -#else - const u32 a = (tmp_col[i] >> 3) & 0x1F000000; - tmp_col[i] >>= 2; - tmp_col[i] &= 0x003F3F3F; - tmp_col[i] |= a; -#endif - } + tmp_col[0] = ColorspaceConvert8888To6665(tmp_col[0]); + tmp_col[1] = ColorspaceConvert8888To6665(tmp_col[1]); + tmp_col[2] = ColorspaceConvert8888To6665(tmp_col[2]); + tmp_col[3] = ColorspaceConvert8888To6665(tmp_col[3]); } //TODO - this could be more precise for 32bpp mode (run it through the color separation table) @@ -1075,7 +1280,7 @@ void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, co } } -template +template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict srcData, u32 *__restrict dstBuffer) { const size_t pixCount = srcSize >> 1; @@ -1113,5 +1318,5 @@ void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict src } } -template void TexCacheItem::Unpack(u32 *unpackBuffer); -template void TexCacheItem::Unpack(u32 *unpackBuffer); +template void TextureStore::Unpack(u32 *unpackBuffer); +template void TextureStore::Unpack(u32 *unpackBuffer); diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index 7d8bb5eae..448d58fad 100644 --- 
a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -21,25 +21,26 @@ #define _TEXCACHE_H_ #include +#include #include "types.h" #include "common.h" #include "gfx3d.h" //this ought to be enough for anyone -//#define TEXCACHE_MAX_SIZE (64*1024*1024) +//#define TEXCACHE_DEFAULT_THRESHOLD (64*1024*1024) //changed by zeromus on 15-dec. I couldnt find any games that were getting anywhere NEAR 64 //metal slug burns through sprites so fast, it can test it pretty quickly though -//#define TEXCACHE_MAX_SIZE (16*1024*1024) +//#define TEXCACHE_DEFAULT_THRESHOLD (16*1024*1024) // rogerman, 2016-11-02: Increase this to 32MB for games that use many large textures, such // as Umihara Kawase Shun, which can cache over 20MB in the first level. -#define TEXCACHE_MAX_SIZE (32*1024*1024) +#define TEXCACHE_DEFAULT_THRESHOLD (32*1024*1024) #define PALETTE_DUMP_SIZE ((64+16+16)*1024) -enum TexCache_TexFormat +enum TextureStoreUnpackFormat { TexFormat_None, //used when nothing yet is cached TexFormat_32bpp, //used by ogl renderer @@ -47,97 +48,142 @@ enum TexCache_TexFormat }; class MemSpan; -class TexCacheItem; +class TextureStore; -typedef u64 TexCacheKey; -typedef std::map TexCacheTable; -typedef void (*TexCacheItemDeleteCallback)(TexCacheItem *texItem, void *param1, void *param2); +typedef u64 TextureCacheKey; +typedef std::map TextureCacheMap; // Key = A TextureCacheKey that includes a combination of the texture's NDS texture attributes and palette attributes; Value = Pointer to the texture item +typedef std::vector TextureCacheList; +//typedef u32 TextureFingerprint; -class TexCache +class TextureCache { -public: - TexCache(); +protected: + TextureCacheMap _texCacheMap; // Used to quickly find a texture item by using a key of type TextureCacheKey + TextureCacheList _texCacheList; // Used to sort existing texture items for various operations + size_t _actualCacheSize; + size_t _cacheSizeThreshold; + u8 _paletteDump[PALETTE_DUMP_SIZE]; + +public: + TextureCache(); + + size_t 
GetActualCacheSize() const; + size_t GetCacheSizeThreshold() const; + void SetCacheSizeThreshold(size_t newThreshold); - TexCacheTable cacheTable; - size_t cache_size; //this is not really precise, it is off by a constant factor - u8 paletteDump[PALETTE_DUMP_SIZE]; - void Invalidate(); - void Evict(size_t target); + void Evict(); void Reset(); - TexCacheItem* GetTexture(u32 texAttributes, u32 palAttributes); + TextureStore* GetTexture(u32 texAttributes, u32 palAttributes); - static TexCacheKey GenerateKey(const u32 texAttributes, const u32 palAttributes); + void Add(TextureStore *texItem); + void Remove(TextureStore *texItem); + + static TextureCacheKey GenerateKey(const u32 texAttributes, const u32 palAttributes); }; -class TexCacheItem +class TextureStore { -private: - TexCacheItemDeleteCallback _deleteCallback; - void *_deleteCallbackParam1; - void *_deleteCallbackParam2; +protected: + u32 _textureAttributes; + u32 _paletteAttributes; -public: - TexCacheItem(); - TexCacheItem(const u32 texAttributes, const u32 palAttributes); - ~TexCacheItem(); + u32 _sizeS; + u32 _sizeT; + bool _isPalZeroTransparent; - u32 textureAttributes; - u32 paletteAttributes; + NDSTextureFormat _packFormat; + u32 _packAddress; + u32 _packSize; + u8 *_packData; - u32 sizeX; - u32 sizeY; - float invSizeX; - float invSizeY; - bool isPalZeroTransparent; - - bool suspectedInvalid; - bool assumedInvalid; - - NDSTextureFormat packFormat; - u32 packAddress; - u32 packSize; - u8 *packData; - - u32 paletteAddress; - u32 paletteSize; - u16 *paletteColorTable; - - TexCache_TexFormat unpackFormat; - u32 unpackSize; - u32 *unpackData; + u32 _paletteAddress; + u32 _paletteSize; + u16 *_paletteColorTable; // Only used by 4x4 formatted textures - u32 packIndexAddress; - u32 packIndexSize; - u8 *packIndexData; - u32 packSizeFirstSlot; + u32 _packIndexAddress; + u32 _packIndexSize; + u8 *_packIndexData; + u32 _packSizeFirstSlot; - // Only used by the OpenGL renderer for the texture ID - u32 texid; + bool 
_suspectedInvalid; + bool _assumedInvalid; + bool _isLoadNeeded; - TexCacheItemDeleteCallback GetDeleteCallback() const; - void SetDeleteCallback(TexCacheItemDeleteCallback callbackFunc, void *inParam1, void *inParam2); + TextureCacheKey _cacheKey; + size_t _cacheSize; + size_t _cacheAge; // A value of 0 means the texture was just used. The higher this value, the older the texture. + size_t _cacheUsageCount; + +public: + TextureStore(); + TextureStore(const u32 texAttributes, const u32 palAttributes); + virtual ~TextureStore(); + + u32 GetTextureAttributes() const; + u32 GetPaletteAttributes() const; + + u32 GetWidth() const; + u32 GetHeight() const; + bool IsPalZeroTransparent() const; + + NDSTextureFormat GetPackFormat() const; + u32 GetPackAddress() const; + u32 GetPackSize() const; + u8* GetPackData(); + + u32 GetPaletteAddress() const; + u32 GetPaletteSize() const; + u16* GetPaletteColorTable() const; + + u32 GetPackIndexAddress() const; + u32 GetPackIndexSize() const; + u8* GetPackIndexData(); - NDSTextureFormat GetTextureFormat() const; void SetTextureData(const MemSpan &packedData, const MemSpan &packedIndexData); + void SetTexturePalette(const MemSpan &packedPalette); void SetTexturePalette(const u16 *paletteBuffer); - size_t GetUnpackSizeUsingFormat(const TexCache_TexFormat texCacheFormat) const; - template void Unpack(u32 *unpackBuffer); + size_t GetUnpackSizeUsingFormat(const TextureStoreUnpackFormat texCacheFormat) const; + template void Unpack(u32 *unpackBuffer); + bool IsSuspectedInvalid() const; + void SetSuspectedInvalid(); + + bool IsAssumedInvalid() const; + void SetAssumedInvalid(); + + void SetLoadNeeded(); + bool IsLoadNeeded() const; + + TextureCacheKey GetCacheKey() const; + + size_t GetCacheSize() const; + void SetCacheSize(size_t cacheSize); + + size_t GetCacheAge() const; + void IncreaseCacheAge(const size_t ageAmount); + void ResetCacheAge(); + + size_t GetCacheUseCount() const; + void IncreaseCacheUsageCount(const size_t usageCount); + 
void ResetCacheUsageCount(); + + void Update(); + void VRAMCompareAndUpdate(); void DebugDump(); }; -template void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); -template void NDSTextureUnpackI4(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); -template void NDSTextureUnpackI8(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); -template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); -template void NDSTextureUnpackA5I3(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); -template void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, const u16 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer); -template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict srcData, u32 *__restrict dstBuffer); +template void NDSTextureUnpackI2(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); +template void NDSTextureUnpackI4(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); +template void NDSTextureUnpackI8(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, const bool isPalZeroTransparent, u32 *__restrict dstBuffer); +template void NDSTextureUnpackA3I5(const size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); +template void NDSTextureUnpackA5I3(const 
size_t srcSize, const u8 *__restrict srcData, const u16 *__restrict srcPal, u32 *__restrict dstBuffer); +template void NDSTextureUnpack4x4(const size_t srcSize, const u32 *__restrict srcData, const u16 *__restrict srcIndex, const u32 palAddress, const u32 texAttributes, const u32 sizeX, const u32 sizeY, u32 *__restrict dstBuffer); +template void NDSTextureUnpackDirect16Bit(const size_t srcSize, const u16 *__restrict srcData, u32 *__restrict dstBuffer); -extern TexCache texCache; +extern TextureCache texCache; #endif