diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index edd0e14a1..cefd8c15d 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -345,8 +345,9 @@ void GPUEngineBase::Reset() this->_BGLayer[GPULayerID_BG2].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][GPULayerID_BG2]; this->_BGLayer[GPULayerID_BG3].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][GPULayerID_BG3]; - this->_needUpdateWINH[0] = true; - this->_needUpdateWINH[1] = true; + this->_prevWINState.value = 0xFFFF; + this->_prevWINCoord.value = 0xFFFFFFFFFFFFFFFFULL; + this->_prevWINCtrl.value = 0xFFFFFFFF; this->_vramBlockOBJAddress = 0; @@ -396,9 +397,7 @@ void GPUEngineBase::Reset() memset(&renderState.WINOBJ_enable[0], 0, sizeof(renderState.WINOBJ_enable[0]) * 6); memset(&renderState.dstBlendEnableVecLookup, 0, sizeof(renderState.dstBlendEnableVecLookup)); - renderState.WIN0_ENABLED = false; - renderState.WIN1_ENABLED = false; - renderState.WINOBJ_ENABLED = false; + renderState.windowState.value = 0; renderState.isAnyWindowEnabled = false; memset(&renderState.srcEffectEnable[0], 0, sizeof(renderState.srcEffectEnable[0]) * 6); @@ -425,6 +424,7 @@ void GPUEngineBase::Reset() void GPUEngineBase::_ResortBGLayers() { const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT; + GPUEngineRenderState &renderState = this->_currentRenderState; int i, prio; itemsForPriority_t *item; @@ -441,6 +441,12 @@ void GPUEngineBase::_ResortBGLayers() this->_isAnyBGLayerShown = this->_isBGLayerShown[GPULayerID_BG0] || this->_isBGLayerShown[GPULayerID_BG1] || this->_isBGLayerShown[GPULayerID_BG2] || this->_isBGLayerShown[GPULayerID_BG3]; + renderState.windowState.BG0_Shown = (this->_isBGLayerShown[GPULayerID_BG0]) ? 1 : 0; + renderState.windowState.BG1_Shown = (this->_isBGLayerShown[GPULayerID_BG1]) ? 1 : 0; + renderState.windowState.BG2_Shown = (this->_isBGLayerShown[GPULayerID_BG2]) ? 1 : 0; + renderState.windowState.BG3_Shown = (this->_isBGLayerShown[GPULayerID_BG3]) ? 1 : 0; + renderState.windowState.OBJ_Shown = (this->_isBGLayerShown[GPULayerID_OBJ]) ? 1 : 0; + // KISS ! lower priority first, if same then lower num for (i = 0; i < NB_PRIORITIES; i++) { @@ -495,10 +501,10 @@ void GPUEngineBase::ParseReg_DISPCNT() renderState.displayOutputMode = (this->_engineID == GPUEngineID_Main) ? (GPUDisplayMode)DISPCNT.DisplayMode : (GPUDisplayMode)(DISPCNT.DisplayMode & 0x01); - renderState.WIN0_ENABLED = (DISPCNT.Win0_Enable != 0); - renderState.WIN1_ENABLED = (DISPCNT.Win1_Enable != 0); - renderState.WINOBJ_ENABLED = (DISPCNT.WinOBJ_Enable != 0); - renderState.isAnyWindowEnabled = (renderState.WIN0_ENABLED || renderState.WIN1_ENABLED || renderState.WINOBJ_ENABLED); + renderState.windowState.WIN0_ENABLED = DISPCNT.Win0_Enable; + renderState.windowState.WIN1_ENABLED = DISPCNT.Win1_Enable; + renderState.windowState.WINOBJ_ENABLED = DISPCNT.WinOBJ_Enable; + renderState.isAnyWindowEnabled = ( (DISPCNT.Win0_Enable != 0) || (DISPCNT.Win1_Enable != 0) || (DISPCNT.WinOBJ_Enable != 0) ); if (DISPCNT.OBJ_Tile_mapping) { @@ -1338,6 +1344,9 @@ void GPUEngineBase::_CompositeNativeLineOBJ(GPUEngineCompositorInfo &compInfo, c template void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) { + const u8 *windowTest = (compInfo.line.widthCustom == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID] : this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID]; + const u8 *colorEffectEnable = (compInfo.line.widthCustom == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? this->_enableColorEffectNative[compInfo.renderState.selectedLayerID] : this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID]; + compInfo.target.xNative = 0; compInfo.target.xCustom = 0; compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead; @@ -1347,7 +1356,7 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, co size_t i = 0; #ifdef USEMANUALVECTORIZATION - i = this->_CompositeLineDeferred_LoopOp(compInfo, srcColorCustom16, srcIndexCustom); + i = this->_CompositeLineDeferred_LoopOp(compInfo, windowTest, colorEffectEnable, srcColorCustom16, srcIndexCustom); #pragma LOOPVECTORIZE_DISABLE #endif for (; i < compInfo.line.pixelCount; i++, compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) @@ -1375,6 +1384,9 @@ void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo, co template void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) { + const u8 *windowTest = (compInfo.line.widthCustom == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID] : this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID]; + const u8 *colorEffectEnable = (compInfo.line.widthCustom == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? this->_enableColorEffectNative[compInfo.renderState.selectedLayerID] : this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID]; + compInfo.target.xNative = 0; compInfo.target.xCustom = 0; compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead; @@ -1384,7 +1396,7 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo size_t i = 0; #ifdef USEMANUALVECTORIZATION - i = this->_CompositeVRAMLineDeferred_LoopOp(compInfo, vramColorPtr); + i = this->_CompositeVRAMLineDeferred_LoopOp(compInfo, windowTest, colorEffectEnable, vramColorPtr); #pragma LOOPVECTORIZE_DISABLE #endif for (; i < compInfo.line.pixelCount; i++, compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) @@ -1394,7 +1406,7 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo compInfo.target.xCustom -= compInfo.line.widthCustom; } - if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] == 0) ) + if ( WILLPERFORMWINDOWTEST && (windowTest[compInfo.target.xCustom] == 0) ) { continue; } @@ -1406,7 +1418,7 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo continue; } - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (colorEffectEnable[compInfo.target.xCustom] != 0) : true; pixelop.Composite32(compInfo, ((FragmentColor *)vramColorPtr)[i], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); } else @@ -1416,7 +1428,7 @@ void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo continue; } - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true; + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (colorEffectEnable[compInfo.target.xCustom] != 0) : true; pixelop.Composite16(compInfo, ((u16 *)vramColorPtr)[i], enableColorEffect, this->_sprAlphaCustom[compInfo.target.xCustom], this->_sprTypeCustom[compInfo.target.xCustom]); } } @@ -2663,12 +2675,12 @@ void GPUEngineBase::TransitionRenderStatesToDisplayInfo(NDSDisplayInfo &mutableI template bool GPUEngineBase::_IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo) { - if (WIN_NUM == 0 && !compInfo.renderState.WIN0_ENABLED) + if (WIN_NUM == 0 && !compInfo.renderState.windowState.WIN0_ENABLED) { return false; } - if (WIN_NUM == 1 && !compInfo.renderState.WIN1_ENABLED) + if (WIN_NUM == 1 && !compInfo.renderState.windowState.WIN1_ENABLED) { return false; } @@ -2699,10 +2711,9 @@ template void GPUEngineBase::_UpdateWINH(GPUEngineCompositorInfo &compInfo) { //dont even waste any time in here if the window isnt enabled - if (WIN_NUM == 0 && !compInfo.renderState.WIN0_ENABLED) return; - if (WIN_NUM == 1 && !compInfo.renderState.WIN1_ENABLED) return; - - this->_needUpdateWINH[WIN_NUM] = false; + if (WIN_NUM == 0 && !compInfo.renderState.windowState.WIN0_ENABLED) return; + if (WIN_NUM == 1 && !compInfo.renderState.windowState.WIN1_ENABLED) return; + const size_t windowLeft = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0H.Left : this->_IORegisterMap->WIN1H.Left; const size_t windowRight = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0H.Right : this->_IORegisterMap->WIN1H.Right; @@ -2730,12 +2741,42 @@ void GPUEngineBase::_UpdateWINH(GPUEngineCompositorInfo &compInfo) void GPUEngineBase::_PerformWindowTesting(GPUEngineCompositorInfo &compInfo) { - if (this->_needUpdateWINH[0]) this->_UpdateWINH<0>(compInfo); - if (this->_needUpdateWINH[1]) this->_UpdateWINH<1>(compInfo); + compInfo.renderState.windowState.IsWithinVerticalRange_WIN0 = (this->_IsWindowInsideVerticalRange<0>(compInfo)) ? 1 : 0; + compInfo.renderState.windowState.IsWithinVerticalRange_WIN1 = (this->_IsWindowInsideVerticalRange<1>(compInfo)) ? 1 : 0; - const u8 *__restrict win0Ptr = (this->_IsWindowInsideVerticalRange<0>(compInfo)) ? this->_h_win[0] : NULL; - const u8 *__restrict win1Ptr = (this->_IsWindowInsideVerticalRange<1>(compInfo)) ? this->_h_win[1] : NULL; - const u8 *__restrict winObjPtr = (compInfo.renderState.WINOBJ_ENABLED) ? this->_sprWin[compInfo.line.indexNative] : NULL; + // While window testing isn't too expensive to do, it might become a performance issue when it's + // performed on every line when running large framebuffer widths on a system without SIMD + // acceleration. Therefore, we're going to check all the relevant window states and only perform + // window testing when one of those states actually changes. + + if ( (this->_prevWINState.value == compInfo.renderState.windowState.value) && + (this->_prevWINCoord.value == this->_IORegisterMap->WIN_COORD.value) && + (this->_prevWINCtrl.value == this->_IORegisterMap->WIN_CTRL.value) && + (compInfo.renderState.windowState.WINOBJ_ENABLED == 0) ) // Sprite window states continually update. If WINOBJ is enabled, that means we need to continually perform window testing as well. + { + return; + } + + const bool needUpdateWIN0H = (this->_prevWINCoord.WIN0H.value != this->_IORegisterMap->WIN0H.value); + const bool needUpdateWIN1H = (this->_prevWINCoord.WIN1H.value != this->_IORegisterMap->WIN1H.value); + + this->_prevWINState.value = compInfo.renderState.windowState.value; + this->_prevWINCoord.value = this->_IORegisterMap->WIN_COORD.value; + this->_prevWINCtrl.value = this->_IORegisterMap->WIN_CTRL.value; + + if (needUpdateWIN0H) + { + this->_UpdateWINH<0>(compInfo); + } + + if (needUpdateWIN1H) + { + this->_UpdateWINH<1>(compInfo); + } + + const u8 *__restrict win0Ptr = (compInfo.renderState.windowState.IsWithinVerticalRange_WIN0) ? this->_h_win[0] : NULL; + const u8 *__restrict win1Ptr = (compInfo.renderState.windowState.IsWithinVerticalRange_WIN1) ? this->_h_win[1] : NULL; + const u8 *__restrict winObjPtr = (compInfo.renderState.windowState.WINOBJ_ENABLED) ? this->_sprWin[compInfo.line.indexNative] : NULL; for (size_t layerID = GPULayerID_BG0; layerID <= GPULayerID_OBJ; layerID++) { @@ -2939,12 +2980,6 @@ void GPUEngineBase::_HandleDisplayModeNormal(const size_t l) } } -template -void GPUEngineBase::ParseReg_WINnH() -{ - this->_needUpdateWINH[WINNUM] = true; -} - void GPUEngineBase::ParseReg_WININ() { GPUEngineRenderState &renderState = this->_currentRenderState; @@ -3127,9 +3162,6 @@ void GPUEngineBase::AllocateWorkingBuffers(NDSColorFormat requestedColorFormat, this->_enableColorEffectCustom[GPULayerID_BG3] = this->_enableColorEffectCustomMasterPtr + (3 * w * sizeof(u8)); this->_enableColorEffectCustom[GPULayerID_OBJ] = this->_enableColorEffectCustomMasterPtr + (4 * w * sizeof(u8)); - this->_needUpdateWINH[0] = true; - this->_needUpdateWINH[1] = true; - for (size_t line = 0; line < GPU_VRAM_BLOCK_LINES + 1; line++) { this->_currentCompositorInfo[line].line = GPU->GetLineInfoAtIndex(line); @@ -3202,8 +3234,6 @@ void GPUEngineBase::ParseAllRegisters() this->ParseReg_BGnX(); this->ParseReg_BGnY(); - this->ParseReg_WINnH<0>(); - this->ParseReg_WINnH<1>(); this->ParseReg_WININ(); this->ParseReg_WINOUT(); @@ -3582,6 +3612,9 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) this->_TransitionLineNativeToCustom(compInfo); } + const u8 *windowTest = (CurrentRenderer->GetFramebufferWidth() == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? this->_didPassWindowTestNative[GPULayerID_BG0] : this->_didPassWindowTestCustom[GPULayerID_BG0]; + const u8 *colorEffectEnable = (CurrentRenderer->GetFramebufferWidth() == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? this->_enableColorEffectNative[GPULayerID_BG0] : this->_enableColorEffectCustom[GPULayerID_BG0]; + const float customWidthScale = (float)compInfo.line.widthCustom / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; const FragmentColor *__restrict srcLinePtr = framebuffer3D + compInfo.line.blockOffsetCustom; @@ -3600,7 +3633,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) size_t i = 0; #ifdef USEMANUALVECTORIZATION - i = this->_RenderLine_Layer3D_LoopOp(compInfo, srcLinePtr); + i = this->_RenderLine_Layer3D_LoopOp(compInfo, windowTest, colorEffectEnable, srcLinePtr); #pragma LOOPVECTORIZE_DISABLE #endif for (; i < compInfo.line.pixelCount; i++, srcLinePtr++, compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) @@ -3610,12 +3643,12 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) compInfo.target.xCustom -= compInfo.line.widthCustom; } - if ( (srcLinePtr->a == 0) || (WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[GPULayerID_BG0][compInfo.target.xCustom] == 0)) ) + if ( (srcLinePtr->a == 0) || (WILLPERFORMWINDOWTEST && (windowTest[compInfo.target.xCustom] == 0)) ) { continue; } - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[GPULayerID_BG0][compInfo.target.xCustom] != 0) : true; + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (colorEffectEnable[compInfo.target.xCustom] != 0) : true; pixelop.Composite32(compInfo, *srcLinePtr, enableColorEffect, 0, 0); } } @@ -3625,7 +3658,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) { for (compInfo.target.xCustom = 0; compInfo.target.xCustom < compInfo.line.widthCustom; compInfo.target.xCustom++, compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++) { - if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[GPULayerID_BG0][compInfo.target.xCustom] == 0) ) + if ( WILLPERFORMWINDOWTEST && (windowTest[compInfo.target.xCustom] == 0) ) { continue; } @@ -3641,7 +3674,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) continue; } - const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[GPULayerID_BG0][compInfo.target.xCustom] != 0) : true; + const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (colorEffectEnable[compInfo.target.xCustom] != 0) : true; pixelop.Composite32(compInfo, srcLinePtr[srcX], enableColorEffect, 0, 0); } @@ -6533,9 +6566,6 @@ template void GPUEngineBase::ParseReg_BGnVOFS(); template void GPUEngineBase::ParseReg_BGnVOFS(); template void GPUEngineBase::ParseReg_BGnVOFS(); -template void GPUEngineBase::ParseReg_WINnH<0>(); -template void GPUEngineBase::ParseReg_WINnH<1>(); - template void GPUEngineBase::ParseReg_BGnX(); template void GPUEngineBase::ParseReg_BGnY(); template void GPUEngineBase::ParseReg_BGnX(); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 9be4fdb88..f480aca2e 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -407,6 +407,19 @@ typedef union } IOREG_WIN0V; // 0x400x044: Vertical coordinates of Window 0 (Engine A+B) typedef IOREG_WIN0V IOREG_WIN1V; // 0x400x046: Vertical coordinates of Window 1 (Engine A+B) +typedef union +{ + u64 value; + + struct + { + IOREG_WIN0H WIN0H; // 0x0400x040 + IOREG_WIN1H WIN1H; // 0x0400x042 + IOREG_WIN0V WIN0V; // 0x0400x044 + IOREG_WIN1V WIN1V; // 0x0400x046 + }; +} IOREG_WIN_COORD; + typedef union { u8 value; @@ -436,6 +449,19 @@ typedef IOREG_WIN0IN IOREG_WIN1IN; // 0x400x049: Control of inside of Window 1 typedef IOREG_WIN0IN IOREG_WINOUT; // 0x400x04A: Control of outside of all windows typedef IOREG_WIN0IN IOREG_WINOBJ; // 0x400x04B: Control of inside of Window OBJ (lowest priority) +typedef union +{ + u32 value; + + struct + { + IOREG_WIN0IN WIN0IN; // 0x0400x048 + IOREG_WIN1IN WIN1IN; // 0x0400x049 + IOREG_WINOUT WINOUT; // 0x0400x04A + IOREG_WINOBJ WINOBJ; // 0x0400x04B + }; +} IOREG_WIN_CTRL; + typedef union { u32 value; @@ -751,14 +777,31 @@ typedef struct }; }; - IOREG_WIN0H WIN0H; // 0x0400x040 - IOREG_WIN1H WIN1H; // 0x0400x042 - IOREG_WIN0V WIN0V; // 0x0400x044 - IOREG_WIN1V WIN1V; // 0x0400x046 - IOREG_WIN0IN WIN0IN; // 0x0400x048 - IOREG_WIN1IN WIN1IN; // 0x0400x049 - IOREG_WINOUT WINOUT; // 0x0400x04A - IOREG_WINOBJ WINOBJ; // 0x0400x04B + union + { + IOREG_WIN_COORD WIN_COORD; // 0x0400x040 + + struct + { + IOREG_WIN0H WIN0H; // 0x0400x040 + IOREG_WIN1H WIN1H; // 0x0400x042 + IOREG_WIN0V WIN0V; // 0x0400x044 + IOREG_WIN1V WIN1V; // 0x0400x046 + }; + }; + + union + { + IOREG_WIN_CTRL WIN_CTRL; // 0x0400x048 + + struct + { + IOREG_WIN0IN WIN0IN; // 0x0400x048 + IOREG_WIN1IN WIN1IN; // 0x0400x049 + IOREG_WINOUT WINOUT; // 0x0400x04A + IOREG_WINOBJ WINOBJ; // 0x0400x04B + }; + }; IOREG_MOSAIC MOSAIC; // 0x0400x04C @@ -1222,6 +1265,28 @@ typedef struct u32 trunc32[GPU_FRAMEBUFFER_NATIVE_WIDTH]; } MosaicTableEntry; +typedef union +{ + u16 value; + + struct + { + u8 WIN0_ENABLED:1; + u8 WIN1_ENABLED:1; + u8 WINOBJ_ENABLED:1; + u8 IsWithinVerticalRange_WIN0:1; + u8 IsWithinVerticalRange_WIN1:1; + u8 unused1:3; + + u8 BG0_Shown:1; + u8 BG1_Shown:1; + u8 BG2_Shown:1; + u8 BG3_Shown:1; + u8 OBJ_Shown:1; + u8 unused2:3; + }; +} WINState; + typedef struct { GPULayerID layerID; @@ -1293,9 +1358,7 @@ typedef struct u8 WINOUT_enable[6]; u8 WINOBJ_enable[6]; - bool WIN0_ENABLED; - bool WIN1_ENABLED; - bool WINOBJ_ENABLED; + WINState windowState; bool isAnyWindowEnabled; u8 srcEffectEnable[6]; @@ -1434,7 +1497,10 @@ protected: void *_internalRenderLineTargetCustom; u8 *_renderLineLayerIDCustom; - bool _needUpdateWINH[2]; + + WINState _prevWINState; + IOREG_WIN_COORD _prevWINCoord; + IOREG_WIN_CTRL _prevWINCtrl; Task *_asyncClearTask; bool _asyncClearIsRunning; @@ -1467,8 +1533,8 @@ protected: template void _CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr); template void _CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorNative16, const FragmentColor *__restrict srcColorNative32); - template size_t _CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom); - template size_t _CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr); + template size_t _CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom); + template size_t _CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const void *__restrict vramColorPtr); template void _RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG); template void _RenderLine_BGAffine(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter ¶m); @@ -1528,7 +1594,6 @@ public: template void ParseReg_BGnVOFS(); template void ParseReg_BGnX(); template void ParseReg_BGnY(); - template void ParseReg_WINnH(); void ParseReg_WININ(); void ParseReg_WINOUT(); void ParseReg_MOSAIC(); @@ -1635,7 +1700,7 @@ protected: template FragmentColor _RenderLine_DispCapture_BlendFunc(const FragmentColor srcA, const FragmentColor srcB, const u8 blendEVA, const u8 blendEVB); template - size_t _RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr); + size_t _RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const FragmentColor *__restrict srcLinePtr); template void _RenderLine_DispCapture_Blend_Buffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t pixCount); // Do not use restrict pointers, since srcB and dst can be the same diff --git a/desmume/src/GPU_Operations.cpp b/desmume/src/GPU_Operations.cpp index 76759297f..bc515432b 100644 --- a/desmume/src/GPU_Operations.cpp +++ b/desmume/src/GPU_Operations.cpp @@ -1019,14 +1019,14 @@ void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &comp } template -size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) +size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) { // Do nothing. This is a placeholder for a manually vectorized version of this method. return 0; } template -size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const void *__restrict vramColorPtr) { // Do nothing. This is a placeholder for a manually vectorized version of this method. return 0; @@ -1086,7 +1086,7 @@ void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInf } template -size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr) +size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const FragmentColor *__restrict srcLinePtr) { // Do nothing. This is a placeholder for a manually vectorized version of this method. return 0; diff --git a/desmume/src/GPU_Operations_AVX2.cpp b/desmume/src/GPU_Operations_AVX2.cpp index bc20d3e99..820233a57 100644 --- a/desmume/src/GPU_Operations_AVX2.cpp +++ b/desmume/src/GPU_Operations_AVX2.cpp @@ -2583,7 +2583,7 @@ void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &comp } template -size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) +size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) { static const size_t step = sizeof(v256u8); @@ -2610,7 +2610,7 @@ size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &com if (WILLPERFORMWINDOWTEST) { // Do the window test. - passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + passMask8 = _mm256_load_si256((v256u8 *)(windowTestPtr + compInfo.target.xCustom)); } if (LAYERTYPE == GPULayerType_BG) @@ -2656,7 +2656,7 @@ size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &com src[1], src[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, this->_sprAlphaCustom + compInfo.target.xCustom, this->_sprTypeCustom + compInfo.target.xCustom); } @@ -2665,7 +2665,7 @@ size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &com } template -size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const void *__restrict vramColorPtr) { static const size_t step = sizeof(v256u8); @@ -2689,7 +2689,7 @@ size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo if (WILLPERFORMWINDOWTEST) { // Do the window test. - passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + passMask8 = _mm256_load_si256((v256u8 *)(windowTestPtr + compInfo.target.xCustom)); // If none of the pixels within the vector pass, then reject them all at once. passMaskValue = _mm256_movemask_epi8(passMask8); @@ -2739,7 +2739,7 @@ size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo src16[1], src16[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, this->_sprAlphaCustom + compInfo.target.xCustom, this->_sprTypeCustom + compInfo.target.xCustom); break; @@ -2780,7 +2780,7 @@ size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo src32[3], src32[2], src32[1], src32[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, this->_sprAlphaCustom + compInfo.target.xCustom, this->_sprTypeCustom + compInfo.target.xCustom); break; @@ -2905,7 +2905,7 @@ void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInf } template -size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr) +size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const FragmentColor *__restrict srcLinePtr) { static const size_t step = sizeof(v256u32); @@ -2930,7 +2930,7 @@ size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, if (WILLPERFORMWINDOWTEST) { // Do the window test. - passMask8 = _mm256_load_si256((v256u8 *)(this->_didPassWindowTestCustom[GPULayerID_BG0] + compInfo.target.xCustom)); + passMask8 = _mm256_load_si256((v256u8 *)(windowTestPtr + compInfo.target.xCustom)); // If none of the pixels within the vector pass, then reject them all at once. passMaskValue = _mm256_movemask_epi8(passMask8); @@ -2974,7 +2974,7 @@ size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, src[3], src[2], src[1], src[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[GPULayerID_BG0] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, NULL, NULL); } diff --git a/desmume/src/GPU_Operations_SSE2.cpp b/desmume/src/GPU_Operations_SSE2.cpp index 5c68cb727..4a1ea7990 100644 --- a/desmume/src/GPU_Operations_SSE2.cpp +++ b/desmume/src/GPU_Operations_SSE2.cpp @@ -2366,7 +2366,7 @@ void GPUEngineBase::_CompositeNativeLineOBJ_LoopOp(GPUEngineCompositorInfo &comp } template -size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) +size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const u16 *__restrict srcColorCustom16, const u8 *__restrict srcIndexCustom) { static const size_t step = sizeof(v128u8); @@ -2393,7 +2393,7 @@ size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &com if (WILLPERFORMWINDOWTEST) { // Do the window test. - passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + passMask8 = _mm_load_si128((v128u8 *)(windowTestPtr + compInfo.target.xCustom)); } if (LAYERTYPE == GPULayerType_BG) @@ -2439,7 +2439,7 @@ size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &com src[1], src[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, this->_sprAlphaCustom + compInfo.target.xCustom, this->_sprTypeCustom + compInfo.target.xCustom); } @@ -2448,7 +2448,7 @@ size_t GPUEngineBase::_CompositeLineDeferred_LoopOp(GPUEngineCompositorInfo &com } template -size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const void *__restrict vramColorPtr) +size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const void *__restrict vramColorPtr) { static const size_t step = sizeof(v128u8); @@ -2472,7 +2472,7 @@ size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo if (WILLPERFORMWINDOWTEST) { // Do the window test. - passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)); + passMask8 = _mm_load_si128((v128u8 *)(windowTestPtr + compInfo.target.xCustom)); // If none of the pixels within the vector pass, then reject them all at once. passMaskValue = _mm_movemask_epi8(passMask8); @@ -2521,7 +2521,7 @@ size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo src16[1], src16[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, this->_sprAlphaCustom + compInfo.target.xCustom, this->_sprTypeCustom + compInfo.target.xCustom); break; @@ -2560,7 +2560,7 @@ size_t GPUEngineBase::_CompositeVRAMLineDeferred_LoopOp(GPUEngineCompositorInfo src32[3], src32[2], src32[1], src32[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, this->_sprAlphaCustom + compInfo.target.xCustom, this->_sprTypeCustom + compInfo.target.xCustom); break; @@ -2688,7 +2688,7 @@ void GPUEngineBase::_PerformWindowTestingNative(GPUEngineCompositorInfo &compInf } template -size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const FragmentColor *__restrict srcLinePtr) +size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, const u8 *__restrict windowTestPtr, const u8 *__restrict colorEffectEnablePtr, const FragmentColor *__restrict srcLinePtr) { static const size_t step = sizeof(v128u8); @@ -2713,7 +2713,7 @@ size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, if (WILLPERFORMWINDOWTEST) { // Do the window test. - passMask8 = _mm_load_si128((v128u8 *)(this->_didPassWindowTestCustom[GPULayerID_BG0] + compInfo.target.xCustom)); + passMask8 = _mm_load_si128((v128u8 *)(windowTestPtr + compInfo.target.xCustom)); // If none of the pixels within the vector pass, then reject them all at once. passMaskValue = _mm_movemask_epi8(passMask8); @@ -2757,7 +2757,7 @@ size_t GPUEngineA::_RenderLine_Layer3D_LoopOp(GPUEngineCompositorInfo &compInfo, src[3], src[2], src[1], src[0], srcEffectEnableMask, dstBlendEnableMaskLUT, - this->_enableColorEffectCustom[GPULayerID_BG0] + compInfo.target.xCustom, + colorEffectEnablePtr + compInfo.target.xCustom, NULL, NULL); } diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index 9857d4777..24fb1b9d8 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -3337,20 +3337,16 @@ void FASTCALL _MMU_ARM9_write08(u32 adr, u8 val) case REG_DISPA_WIN0H: T1WriteByte(MMU.ARM9_REG, 0x0040, val); - mainEngine->ParseReg_WINnH<0>(); return; case REG_DISPA_WIN0H+1: T1WriteByte(MMU.ARM9_REG, 0x0041, val); - mainEngine->ParseReg_WINnH<0>(); return; case REG_DISPA_WIN1H: T1WriteByte(MMU.ARM9_REG, 0x0042, val); - mainEngine->ParseReg_WINnH<1>(); return; case REG_DISPA_WIN1H+1: T1WriteByte(MMU.ARM9_REG, 0x0043, val); - mainEngine->ParseReg_WINnH<1>(); return; case REG_DISPA_WIN0V: @@ -3506,20 +3502,16 @@ void FASTCALL _MMU_ARM9_write08(u32 adr, u8 val) case REG_DISPB_WIN0H: T1WriteByte(MMU.ARM9_REG, 0x1040, val); - subEngine->ParseReg_WINnH<0>(); return; case REG_DISPB_WIN0H+1: T1WriteByte(MMU.ARM9_REG, 0x1041, val); - subEngine->ParseReg_WINnH<0>(); return; case REG_DISPB_WIN1H: T1WriteByte(MMU.ARM9_REG, 0x1042, val); - subEngine->ParseReg_WINnH<1>(); return; case REG_DISPB_WIN1H+1: T1WriteByte(MMU.ARM9_REG, 0x1043, val); - subEngine->ParseReg_WINnH<1>(); return; case REG_DISPB_WIN0V: @@ -3875,12 +3867,10 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val) case REG_DISPA_WIN0H: T1WriteWord(MMU.ARM9_REG, 0x0040, val); - mainEngine->ParseReg_WINnH<0>(); return; case REG_DISPA_WIN1H: T1WriteWord(MMU.ARM9_REG, 0x0042, val); - mainEngine->ParseReg_WINnH<1>(); return; case REG_DISPA_WIN0V: @@ -4062,12 +4052,10 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val) case REG_DISPB_WIN0H: T1WriteWord(MMU.ARM9_REG, 0x1040, val); - subEngine->ParseReg_WINnH<0>(); return; case REG_DISPB_WIN1H: T1WriteWord(MMU.ARM9_REG, 0x1042, val); - subEngine->ParseReg_WINnH<1>(); return; case REG_DISPB_WIN0V: @@ -4462,8 +4450,6 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val) case REG_DISPA_WIN0H: T1WriteLong(MMU.ARM9_REG, 0x0040, val); - mainEngine->ParseReg_WINnH<0>(); - mainEngine->ParseReg_WINnH<1>(); return; case REG_DISPA_WIN0V: @@ -4577,8 +4563,6 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val) case REG_DISPB_WIN0H: T1WriteLong(MMU.ARM9_REG, 0x1040, val); - subEngine->ParseReg_WINnH<0>(); - subEngine->ParseReg_WINnH<1>(); return; case REG_DISPB_WIN0V: