diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp
index 35c1e3cc1..555bce502 100644
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -1,8445 +1,8417 @@
-/*
-	Copyright (C) 2006 yopyop
-	Copyright (C) 2006-2007 Theo Berkau
-	Copyright (C) 2007 shash
-	Copyright (C) 2008-2017 DeSmuME team
-
-	This file is free software: you can redistribute it and/or modify
-	it under the terms of the GNU General Public License as published by
-	the Free Software Foundation, either version 2 of the License, or
-	(at your option) any later version.
-
-	This file is distributed in the hope that it will be useful,
-	but WITHOUT ANY WARRANTY; without even the implied warranty of
-	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-	GNU General Public License for more details.
-
-	You should have received a copy of the GNU General Public License
-	along with the this software.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "GPU.h"
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <algorithm>
-#include <iostream>
-
-#include "common.h"
-#include "MMU.h"
-#include "FIFO.h"
-#include "debug.h"
-#include "render3D.h"
-#include "registers.h"
-#include "gfx3d.h"
-#include "debug.h"
-#include "NDSSystem.h"
-#include "readwrite.h"
-#include "matrix.h"
-#include "emufile.h"
-
-#ifdef FASTBUILD
-	#undef FORCEINLINE
-	#define FORCEINLINE
-	//compilation speed hack (cuts time exactly in half by cutting out permutations)
-	#define DISABLE_MOSAIC
-	#define DISABLE_COMPOSITOR_FAST_PATHS
-#endif
-
-//instantiate static instance
-u16 GPUEngineBase::_brightnessUpTable555[17][0x8000];
-FragmentColor GPUEngineBase::_brightnessUpTable666[17][0x8000];
-FragmentColor GPUEngineBase::_brightnessUpTable888[17][0x8000];
-u16 GPUEngineBase::_brightnessDownTable555[17][0x8000];
-FragmentColor GPUEngineBase::_brightnessDownTable666[17][0x8000];
-FragmentColor GPUEngineBase::_brightnessDownTable888[17][0x8000];
-u8 GPUEngineBase::_blendTable555[17][17][32][32];
-GPUEngineBase::MosaicLookup GPUEngineBase::_mosaicLookup;
-
-GPUSubsystem *GPU = NULL;
-
-static size_t _gpuLargestDstLineCount = 1;
-static size_t _gpuVRAMBlockOffset = GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH;
-
-static u16 *_gpuDstToSrcIndex = NULL; // Key: Destination pixel index / Value: Source pixel index
-static u8 *_gpuDstToSrcSSSE3_u8_8e = NULL;
-static u8 *_gpuDstToSrcSSSE3_u8_16e = NULL;
-static u8 *_gpuDstToSrcSSSE3_u16_8e = NULL;
-static u8 *_gpuDstToSrcSSSE3_u32_4e = NULL;
-
-static CACHE_ALIGN size_t _gpuDstPitchCount[GPU_FRAMEBUFFER_NATIVE_WIDTH];	// Key: Source pixel index in x-dimension / Value: Number of x-dimension destination pixels for the source pixel
-static CACHE_ALIGN size_t _gpuDstPitchIndex[GPU_FRAMEBUFFER_NATIVE_WIDTH];	// Key: Source pixel index in x-dimension / Value: First destination pixel that maps to the source pixel
-static CACHE_ALIGN size_t _gpuDstLineCount[GPU_FRAMEBUFFER_NATIVE_HEIGHT];	// Key: Source line index / Value: Number of destination lines for the source line
-static CACHE_ALIGN size_t _gpuDstLineIndex[GPU_FRAMEBUFFER_NATIVE_HEIGHT];	// Key: Source line index / Value: First destination line that maps to the source line
-static CACHE_ALIGN size_t _gpuCaptureLineCount[GPU_VRAM_BLOCK_LINES + 1];	// Key: Source line index / Value: Number of destination lines for the source line
-static CACHE_ALIGN size_t _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES + 1];	// Key: Source line index / Value: First destination line that maps to the source line
-
-const CACHE_ALIGN SpriteSize GPUEngineBase::_sprSizeTab[4][4] = {
-     {{8, 8}, {16, 8}, {8, 16}, {8, 8}},
-     {{16, 16}, {32, 8}, {8, 32}, {8, 8}},
-     {{32, 32}, {32, 16}, {16, 32}, {8, 8}},
-     {{64, 64}, {64, 32}, {32, 64}, {8, 8}},
-};
-
-const CACHE_ALIGN BGType GPUEngineBase::_mode2type[8][4] = {
-      {BGType_Text, BGType_Text, BGType_Text, BGType_Text},
-      {BGType_Text, BGType_Text, BGType_Text, BGType_Affine},
-      {BGType_Text, BGType_Text, BGType_Affine, BGType_Affine},
-      {BGType_Text, BGType_Text, BGType_Text, BGType_AffineExt},
-      {BGType_Text, BGType_Text, BGType_Affine, BGType_AffineExt},
-      {BGType_Text, BGType_Text, BGType_AffineExt, BGType_AffineExt},
-      {BGType_Invalid, BGType_Invalid, BGType_Large8bpp, BGType_Invalid},
-      {BGType_Invalid, BGType_Invalid, BGType_Invalid, BGType_Invalid}
-};
-
-//dont ever think of changing these to bits because you could avoid the multiplies in the main tile blitter.
-//it doesnt really help any
-const CACHE_ALIGN BGLayerSize GPUEngineBase::_BGLayerSizeLUT[8][4] = {
-	{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //Invalid
-    {{256,256}, {512,256}, {256,512}, {512,512}}, //text
-    {{128,128}, {256,256}, {512,512}, {1024,1024}}, //affine
-    {{512,1024}, {1024,512}, {0,0}, {0,0}}, //large 8bpp
-	{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //affine ext (to be elaborated with another value)
-	{{128,128}, {256,256}, {512,512}, {1024,1024}}, //affine ext 256x16
-	{{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext 256x1
-	{{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext direct
-};
-
-template <s32 INTEGERSCALEHINT, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
-static FORCEINLINE void CopyLineExpand_C(void *__restrict dst, const void *__restrict src, size_t dstLength)
-{
-	if (INTEGERSCALEHINT == 0)
-	{
-#if defined(MSB_FIRST)
-		if (NEEDENDIANSWAP && (ELEMENTSIZE != 1))
-		{
-			for (size_t i = 0; i < dstLength; i++)
-			{
-				if (ELEMENTSIZE == 2)
-				{
-					((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] );
-				}
-				else if (ELEMENTSIZE == 4)
-				{
-					((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] );
-				}
-			}
-		}
-		else
-#endif
-		{
-			memcpy(dst, src, dstLength * ELEMENTSIZE);
-		}
-	}
-	else if (INTEGERSCALEHINT == 1)
-	{
-#if defined(MSB_FIRST)
-		if (NEEDENDIANSWAP && (ELEMENTSIZE != 1))
-		{
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
-			{
-				if (ELEMENTSIZE == 2)
-				{
-					((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] );
-				}
-				else if (ELEMENTSIZE == 4)
-				{
-					((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] );
-				}
-			}
-		}
-		else
-#endif
-		{
-			memcpy(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE);
-		}
-	}
-	else
-	{
-		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
-		{
-			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
-			{
-				if (ELEMENTSIZE == 1)
-				{
-					( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x];
-				}
-				else if (ELEMENTSIZE == 2)
-				{
-					((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x];
-				}
-				else if (ELEMENTSIZE == 4)
-				{
-					((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x];
-				}
-			}
-		}
-	}
-}
-
-#ifdef ENABLE_SSE2
-template <s32 INTEGERSCALEHINT, size_t ELEMENTSIZE>
-static FORCEINLINE void CopyLineExpand_SSE2(void *__restrict dst, const void *__restrict src, size_t dstLength)
-{
-	if (INTEGERSCALEHINT == 0)
-	{
-		memcpy(dst, src, dstLength * ELEMENTSIZE);
-	}
-	else if (INTEGERSCALEHINT == 1)
-	{
-		MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), _mm_store_si128((__m128i *)dst + (X), _mm_load_si128((__m128i *)src + (X))) );
-	}
-	else if (INTEGERSCALEHINT == 2)
-	{
-		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
-		{
-			if (ELEMENTSIZE == 1)
-			{
-				const __m128i src8  = _mm_load_si128((__m128i *)( (u8 *)src + srcX));
-				const __m128i src8out[2]  = { _mm_unpacklo_epi8(src8, src8), _mm_unpackhi_epi8(src8, src8) };
-				
-				_mm_store_si128((__m128i *)( (u8 *)dst + dstX +  0), src8out[0]);
-				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 16), src8out[1]);
-				
-				srcX += 16;
-				dstX += 32;
-			}
-			else if (ELEMENTSIZE == 2)
-			{
-				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
-				const __m128i src16out[2] = { _mm_unpacklo_epi16(src16, src16), _mm_unpackhi_epi16(src16, src16) };
-				
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  0), src16out[0]);
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  8), src16out[1]);
-				
-				srcX += 8;
-				dstX += 16;
-			}
-			else if (ELEMENTSIZE == 4)
-			{
-				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
-				const __m128i src32out[2] = { _mm_unpacklo_epi32(src32, src32), _mm_unpackhi_epi32(src32, src32) };
-				
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  0), src32out[0]);
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  4), src32out[1]);
-				
-				srcX += 4;
-				dstX += 8;
-			}
-		}
-	}
-	else if ((INTEGERSCALEHINT == 3) && (ELEMENTSIZE != 1))
-	{
-		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
-		{
-			if (ELEMENTSIZE == 2)
-			{
-				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
-				const __m128i src16lo = _mm_shuffle_epi32(src16, 0x44);
-				const __m128i src16hi = _mm_shuffle_epi32(src16, 0xEE);
-				const __m128i src16out[3] = { _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16lo, 0x40), 0xA5), _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16, 0xFE), 0x40), _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16hi, 0xA5), 0xFE) };
-				
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  0), src16out[0]);
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  8), src16out[1]);
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX + 16), src16out[2]);
-				
-				srcX += 8;
-				dstX += 24;
-			}
-			else if (ELEMENTSIZE == 4)
-			{
-				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
-				const __m128i src32out[3] = { _mm_shuffle_epi32(src32, 0x40), _mm_shuffle_epi32(src32, 0xA5), _mm_shuffle_epi32(src32, 0xFE) };
-				
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  0), src32out[0]);
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  4), src32out[1]);
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  8), src32out[2]);
-				
-				srcX += 4;
-				dstX += 12;
-			}
-		}
-	}
-	else if (INTEGERSCALEHINT == 4)
-	{
-		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH;)
-		{
-			if (ELEMENTSIZE == 1)
-			{
-				const __m128i src8  = _mm_load_si128((__m128i *)( (u8 *)src + srcX));
-				const __m128i src8_lo  = _mm_unpacklo_epi8(src8, src8);
-				const __m128i src8_hi  = _mm_unpackhi_epi8(src8, src8);
-				const __m128i src8out[4] = { _mm_unpacklo_epi8(src8_lo, src8_lo), _mm_unpackhi_epi8(src8_lo, src8_lo), _mm_unpacklo_epi8(src8_hi, src8_hi), _mm_unpackhi_epi8(src8_hi, src8_hi) };
-				
-				_mm_store_si128((__m128i *)( (u8 *)dst + dstX +  0), src8out[0]);
-				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 16), src8out[1]);
-				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 32), src8out[2]);
-				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 48), src8out[3]);
-				
-				srcX += 16;
-				dstX += 64;
-			}
-			else if (ELEMENTSIZE == 2)
-			{
-				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
-				const __m128i src16_lo = _mm_unpacklo_epi16(src16, src16);
-				const __m128i src16_hi = _mm_unpackhi_epi16(src16, src16);
-				const __m128i src16out[4] = { _mm_unpacklo_epi16(src16_lo, src16_lo), _mm_unpackhi_epi16(src16_lo, src16_lo), _mm_unpacklo_epi16(src16_hi, src16_hi), _mm_unpackhi_epi16(src16_hi, src16_hi) };
-				
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  0), src16out[0]);
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  8), src16out[1]);
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX + 16), src16out[2]);
-				_mm_store_si128((__m128i *)((u16 *)dst + dstX + 24), src16out[3]);
-				
-				srcX += 8;
-				dstX += 32;
-			}
-			else if (ELEMENTSIZE == 4)
-			{
-				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
-				const __m128i src32_lo = _mm_unpacklo_epi32(src32, src32);
-				const __m128i src32_hi = _mm_unpackhi_epi32(src32, src32);
-				const __m128i src32out[4] = { _mm_unpacklo_epi32(src32_lo, src32_lo), _mm_unpackhi_epi32(src32_lo, src32_lo), _mm_unpacklo_epi32(src32_hi, src32_hi), _mm_unpackhi_epi32(src32_hi, src32_hi) };
-				
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  0), src32out[0]);
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  4), src32out[1]);
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  8), src32out[2]);
-				_mm_store_si128((__m128i *)((u32 *)dst + dstX + 12), src32out[3]);
-				
-				srcX += 4;
-				dstX += 16;
-			}
-		}
-	}
-#ifdef ENABLE_SSSE3
-	else if (INTEGERSCALEHINT >= 0)
-	{
-		const size_t scale = dstLength / GPU_FRAMEBUFFER_NATIVE_WIDTH;
-		
-		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
-		{
-			if (ELEMENTSIZE == 1)
-			{
-				const __m128i src8 = _mm_load_si128((__m128i *)((u8 *)src + srcX));
-				
-				for (size_t s = 0; s < scale; s++)
-				{
-					const __m128i ssse3idx_u8 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u8_16e + (s * 16)));
-					_mm_store_si128( (__m128i *)( (u8 *)dst + dstX + (s * 16)), _mm_shuffle_epi8( src8, ssse3idx_u8 ) );
-				}
-				
-				srcX += 16;
-				dstX += (16 * scale);
-			}
-			else if (ELEMENTSIZE == 2)
-			{
-				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
-				
-				for (size_t s = 0; s < scale; s++)
-				{
-					const __m128i ssse3idx_u16 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u16_8e + (s * 16)));
-					_mm_store_si128( (__m128i *)((u16 *)dst + dstX + (s *  8)), _mm_shuffle_epi8(src16, ssse3idx_u16) );
-				}
-				
-				srcX += 8;
-				dstX += (8 * scale);
-			}
-			else if (ELEMENTSIZE == 4)
-			{
-				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
-				
-				for (size_t s = 0; s < scale; s++)
-				{
-					const __m128i ssse3idx_u32 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u32_4e + (s * 16)));
-					_mm_store_si128( (__m128i *)((u32 *)dst + dstX + (s *  4)), _mm_shuffle_epi8(src32, ssse3idx_u32) );
-				}
-				
-				srcX += 4;
-				dstX += (4 * scale);
-			}
-		}
-	}
-#endif
-	else
-	{
-		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
-		{
-			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
-			{
-				if (ELEMENTSIZE == 1)
-				{
-					( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x];
-				}
-				else if (ELEMENTSIZE == 2)
-				{
-					((u16 *)dst)[_gpuDstPitchIndex[x] + p] = ((u16 *)src)[x];
-				}
-				else if (ELEMENTSIZE == 4)
-				{
-					((u32 *)dst)[_gpuDstPitchIndex[x] + p] = ((u32 *)src)[x];
-				}
-			}
-		}
-	}
-}
-#endif
-
-template <s32 INTEGERSCALEHINT, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
-static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src, size_t dstLength)
-{
-	// Use INTEGERSCALEHINT to provide a hint to CopyLineExpand() for the fastest execution path.
-	// INTEGERSCALEHINT represents the scaling value of the framebuffer width, and is always
-	// assumed to be a positive integer.
-	//
-	// Use cases:
-	// - Passing a value of 0 causes CopyLineExpand() to perform a simple copy, using dstLength
-	//   to copy dstLength elements.
-	// - Passing a value of 1 causes CopyLineExpand() to perform a simple copy, ignoring dstLength
-	//   and always copying GPU_FRAMEBUFFER_NATIVE_WIDTH elements.
-	// - Passing any negative value causes CopyLineExpand() to assume that the framebuffer width
-	//   is NOT scaled by an integer value, and will therefore take the safest (but slowest)
-	//   execution path.
-	// - Passing any positive value greater than 1 causes CopyLineExpand() to expand the line
-	//   using the integer scaling value.
-	
-#ifdef ENABLE_SSE2
-	CopyLineExpand_SSE2<INTEGERSCALEHINT, ELEMENTSIZE>(dst, src, dstLength);
-#else
-	CopyLineExpand_C<INTEGERSCALEHINT, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLength);
-#endif
-}
-
-template <bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
-static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src)
-{
-	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
-	{
-		if (ELEMENTSIZE == 1)
-		{
-			( (u8 *)dst)[i] = ((u8 *)src)[_gpuDstPitchIndex[i]];
-		}
-		else if (ELEMENTSIZE == 2)
-		{
-			((u16 *)dst)[i] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[_gpuDstPitchIndex[i]] ) : ((u16 *)src)[_gpuDstPitchIndex[i]];
-		}
-		else if (ELEMENTSIZE == 4)
-		{
-			((u32 *)dst)[i] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[_gpuDstPitchIndex[i]] ) : ((u32 *)src)[_gpuDstPitchIndex[i]];
-		}
-	}
-}
-
-/*****************************************************************************/
-//			BACKGROUND RENDERING -ROTOSCALE-
-/*****************************************************************************/
-
-FORCEINLINE void rot_tiled_8bit_entry(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
-{
-	const u16 tileindex = *(u8*)MMU_gpu_map(map + ((auxX>>3) + (auxY>>3) * (lg>>3)));
-	const u16 x = auxX & 0x0007;
-	const u16 y = auxY & 0x0007;
-	
-	outIndex = *(u8*)MMU_gpu_map(tile + ((tileindex<<6)+(y<<3)+x));
-	outColor = LE_TO_LOCAL_16(pal[outIndex]);
-}
-
-template<bool EXTPAL>
-FORCEINLINE void rot_tiled_16bit_entry(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
-{
-	TILEENTRY tileentry;
-	tileentry.val = LE_TO_LOCAL_16( *(u16 *)MMU_gpu_map(map + (((auxX>>3) + (auxY>>3) * (lg>>3))<<1)) );
-	
-	const u16 x = ((tileentry.bits.HFlip) ? 7 - (auxX) : (auxX)) & 0x0007;
-	const u16 y = ((tileentry.bits.VFlip) ? 7 - (auxY) : (auxY)) & 0x0007;
-	
-	outIndex = *(u8*)MMU_gpu_map(tile + ((tileentry.bits.TileNum<<6)+(y<<3)+x));
-	outColor = LE_TO_LOCAL_16(pal[(outIndex + (EXTPAL ? (tileentry.bits.Palette<<8) : 0))]);
-}
-
-FORCEINLINE void rot_256_map(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
-{
-	outIndex = *(u8*)MMU_gpu_map(map + ((auxX + auxY * lg)));
-	outColor = LE_TO_LOCAL_16(pal[outIndex]);
-}
-
-FORCEINLINE void rot_BMP_map(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
-{
-	outColor = LE_TO_LOCAL_16( *(u16 *)MMU_gpu_map(map + ((auxX + auxY * lg) << 1)) );
-	outIndex = ((outColor & 0x8000) == 0) ? 0 : 1;
-}
-
-void gpu_savestate(EMUFILE* os)
-{
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	const GPUEngineA *mainEngine = GPU->GetEngineMain();
-	const GPUEngineB *subEngine = GPU->GetEngineSub();
-	
-	//version
-	write32le(1,os);
-	
-	os->fwrite((u8 *)dispInfo.masterCustomBuffer, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16) * 2);
-	
-	write32le(mainEngine->savedBG2X.value, os);
-	write32le(mainEngine->savedBG2Y.value, os);
-	write32le(mainEngine->savedBG3X.value, os);
-	write32le(mainEngine->savedBG3Y.value, os);
-	write32le(subEngine->savedBG2X.value, os);
-	write32le(subEngine->savedBG2Y.value, os);
-	write32le(subEngine->savedBG3X.value, os);
-	write32le(subEngine->savedBG3Y.value, os);
-}
-
-bool gpu_loadstate(EMUFILE* is, int size)
-{
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	GPUEngineA *mainEngine = GPU->GetEngineMain();
-	GPUEngineB *subEngine = GPU->GetEngineSub();
-	
-	//read version
-	u32 version;
-	
-	//sigh.. shouldve used a new version number
-	if (size == GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16) * 2)
-	{
-		version = 0;
-	}
-	else if (size == 0x30024)
-	{
-		read32le(&version,is);
-		version = 1;
-	}
-	else
-	{
-		if(read32le(&version,is) != 1) return false;
-	}
-	
-	if (version > 1) return false;
-	
-	is->fread((u8 *)dispInfo.masterCustomBuffer, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16) * 2);
-	
-	if (version == 1)
-	{
-		read32le((u32 *)&mainEngine->savedBG2X, is);
-		read32le((u32 *)&mainEngine->savedBG2Y, is);
-		read32le((u32 *)&mainEngine->savedBG3X, is);
-		read32le((u32 *)&mainEngine->savedBG3Y, is);
-		read32le((u32 *)&subEngine->savedBG2X, is);
-		read32le((u32 *)&subEngine->savedBG2Y, is);
-		read32le((u32 *)&subEngine->savedBG3X, is);
-		read32le((u32 *)&subEngine->savedBG3Y, is);
-		//removed per nitsuja feedback. anyway, this same thing will happen almost immediately in gpu line=0
-		//mainEngine->refreshAffineStartRegs(-1,-1);
-		//subEngine->refreshAffineStartRegs(-1,-1);
-	}
-	
-	mainEngine->ParseAllRegisters();
-	subEngine->ParseAllRegisters();
-	
-	return !is->fail();
-}
-
-/*****************************************************************************/
-//			INITIALIZATION
-/*****************************************************************************/
-void GPUEngineBase::_InitLUTs()
-{
-	static bool didInit = false;
-	
-	if (didInit)
-	{
-		return;
-	}
-	
-	/*
-	NOTE: gbatek (in the reference above) seems to expect 6bit values 
-	per component, but as desmume works with 5bit per component, 
-	we use 31 as top, instead of 63. Testing it on a few games, 
-	using 63 seems to give severe color wraping, and 31 works
-	nicely, so for now we'll just that, until proven wrong.
-
-	i have seen pics of pokemon ranger getting white with 31, with 63 it is nice.
-	it could be pb of alpha or blending or...
-
-	MightyMax> created a test NDS to check how the brightness values work,
-	and 31 seems to be correct. FactorEx is a override for max brighten/darken
-	See: http://mightymax.org/gfx_test_brightness.nds
-	The Pokemon Problem could be a problem with 8/32 bit writes not recognized yet,
-	i'll add that so you can check back.
-	*/
-	
-	for (u16 i = 0; i <= 16; i++)
-	{
-		for (u16 j = 0x0000; j < 0x8000; j++)
-		{
-			COLOR cur;
-
-			cur.val = j;
-			cur.bits.red = (cur.bits.red + ((31 - cur.bits.red) * i / 16));
-			cur.bits.green = (cur.bits.green + ((31 - cur.bits.green) * i / 16));
-			cur.bits.blue = (cur.bits.blue + ((31 - cur.bits.blue) * i / 16));
-			cur.bits.alpha = 0;
-			GPUEngineBase::_brightnessUpTable555[i][j] = cur.val;
-			GPUEngineBase::_brightnessUpTable666[i][j].color = COLOR555TO666(cur.val);
-			GPUEngineBase::_brightnessUpTable888[i][j].color = COLOR555TO888(cur.val);
-			
-			cur.val = j;
-			cur.bits.red = (cur.bits.red - (cur.bits.red * i / 16));
-			cur.bits.green = (cur.bits.green - (cur.bits.green * i / 16));
-			cur.bits.blue = (cur.bits.blue - (cur.bits.blue * i / 16));
-			cur.bits.alpha = 0;
-			GPUEngineBase::_brightnessDownTable555[i][j] = cur.val;
-			GPUEngineBase::_brightnessDownTable666[i][j].color = COLOR555TO666(cur.val);
-			GPUEngineBase::_brightnessDownTable888[i][j].color = COLOR555TO888(cur.val);
-		}
-	}
-	
-	for(int c0=0;c0<=31;c0++) 
-		for(int c1=0;c1<=31;c1++) 
-			for(int eva=0;eva<=16;eva++)
-				for(int evb=0;evb<=16;evb++)
-				{
-					int blend = ((c0 * eva) + (c1 * evb) ) / 16;
-					int final = std::min<int>(31,blend);
-					GPUEngineBase::_blendTable555[eva][evb][c0][c1] = final;
-				}
-	
-	didInit = true;
-}
-
-GPUEngineBase::GPUEngineBase()
-{
-	_IORegisterMap = NULL;
-	_paletteOBJ = NULL;
-	
-	_BGLayer[GPULayerID_BG0].layerID = GPULayerID_BG0;
-	_BGLayer[GPULayerID_BG1].layerID = GPULayerID_BG1;
-	_BGLayer[GPULayerID_BG2].layerID = GPULayerID_BG2;
-	_BGLayer[GPULayerID_BG3].layerID = GPULayerID_BG3;
-	
-	_BGLayer[GPULayerID_BG0].extPaletteSlot = GPULayerID_BG0;
-	_BGLayer[GPULayerID_BG1].extPaletteSlot = GPULayerID_BG1;
-	_BGLayer[GPULayerID_BG2].extPaletteSlot = GPULayerID_BG2;
-	_BGLayer[GPULayerID_BG3].extPaletteSlot = GPULayerID_BG3;
-	
-	_BGLayer[GPULayerID_BG0].extPalette = NULL;
-	_BGLayer[GPULayerID_BG1].extPalette = NULL;
-	_BGLayer[GPULayerID_BG2].extPalette = NULL;
-	_BGLayer[GPULayerID_BG3].extPalette = NULL;
-	
-	_InitLUTs();
-	_internalRenderLineTargetCustom = NULL;
-	_renderLineLayerIDCustom = NULL;
-	_deferredIndexCustom = NULL;
-	_deferredColorCustom = NULL;
-	
-	_didPassWindowTestCustomMasterPtr = NULL;
-	_didPassWindowTestCustom[GPULayerID_BG0] = NULL;
-	_didPassWindowTestCustom[GPULayerID_BG1] = NULL;
-	_didPassWindowTestCustom[GPULayerID_BG2] = NULL;
-	_didPassWindowTestCustom[GPULayerID_BG3] = NULL;
-	_didPassWindowTestCustom[GPULayerID_OBJ] = NULL;
-	
-	_enableColorEffectCustomMasterPtr = NULL;
-	_enableColorEffectCustom[GPULayerID_BG0] = NULL;
-	_enableColorEffectCustom[GPULayerID_BG1] = NULL;
-	_enableColorEffectCustom[GPULayerID_BG2] = NULL;
-	_enableColorEffectCustom[GPULayerID_BG3] = NULL;
-	_enableColorEffectCustom[GPULayerID_OBJ] = NULL;
-}
-
-GPUEngineBase::~GPUEngineBase()
-{
-	free_aligned(this->_internalRenderLineTargetCustom);
-	this->_internalRenderLineTargetCustom = NULL;
-	free_aligned(this->_renderLineLayerIDCustom);
-	this->_renderLineLayerIDCustom = NULL;
-	free_aligned(this->_deferredIndexCustom);
-	this->_deferredIndexCustom = NULL;
-	free_aligned(this->_deferredColorCustom);
-	this->_deferredColorCustom = NULL;
-	
-	free_aligned(this->_didPassWindowTestCustomMasterPtr);
-	this->_didPassWindowTestCustomMasterPtr = NULL;
-	this->_didPassWindowTestCustom[GPULayerID_BG0] = NULL;
-	this->_didPassWindowTestCustom[GPULayerID_BG1] = NULL;
-	this->_didPassWindowTestCustom[GPULayerID_BG2] = NULL;
-	this->_didPassWindowTestCustom[GPULayerID_BG3] = NULL;
-	this->_didPassWindowTestCustom[GPULayerID_OBJ] = NULL;
-	
-	this->_enableColorEffectCustomMasterPtr = NULL;
-	this->_enableColorEffectCustom[GPULayerID_BG0] = NULL;
-	this->_enableColorEffectCustom[GPULayerID_BG1] = NULL;
-	this->_enableColorEffectCustom[GPULayerID_BG2] = NULL;
-	this->_enableColorEffectCustom[GPULayerID_BG3] = NULL;
-	this->_enableColorEffectCustom[GPULayerID_OBJ] = NULL;
-}
-
-void GPUEngineBase::_Reset_Base()
-{
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	
-	memset(this->_sprColor, 0, sizeof(this->_sprColor));
-	memset(this->_sprAlpha, 0, sizeof(this->_sprAlpha));
-	memset(this->_sprType, OBJMode_Normal, sizeof(this->_sprType));
-	memset(this->_sprPrio, 0x7F, sizeof(this->_sprPrio));
-	memset(this->_sprNum, 0, sizeof(this->_sprNum));
-	
-	memset(this->_didPassWindowTestNative, 1, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
-	memset(this->_enableColorEffectNative, 1, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
-	memset(this->_didPassWindowTestCustomMasterPtr, 1, 10 * dispInfo.customWidth * sizeof(u8));
-	
-	memset(this->_h_win[0], 0, sizeof(this->_h_win[0]));
-	memset(this->_h_win[1], 0, sizeof(this->_h_win[1]));
-	memset(&this->_mosaicColors, 0, sizeof(MosaicColor));
-	memset(this->_itemsForPriority, 0, sizeof(this->_itemsForPriority));
-	
-	memset(this->_internalRenderLineTargetNative, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(FragmentColor));
-	
-	if (this->_internalRenderLineTargetCustom != NULL)
-	{
-		memset(this->_internalRenderLineTargetCustom, 0, dispInfo.customWidth * _gpuLargestDstLineCount * dispInfo.pixelBytes);
-	}
-	if (this->_renderLineLayerIDCustom != NULL)
-	{
-		memset(this->_renderLineLayerIDCustom, 0, dispInfo.customWidth * _gpuLargestDstLineCount * 4 * sizeof(u8));
-	}
-	
-	this->_enableLayer[GPULayerID_BG0] = false;
-	this->_enableLayer[GPULayerID_BG1] = false;
-	this->_enableLayer[GPULayerID_BG2] = false;
-	this->_enableLayer[GPULayerID_BG3] = false;
-	this->_enableLayer[GPULayerID_OBJ] = false;
-	this->_isAnyBGLayerEnabled = false;
-	
-	this->_BGLayer[GPULayerID_BG0].BGnCNT = this->_IORegisterMap->BG0CNT;
-	this->_BGLayer[GPULayerID_BG1].BGnCNT = this->_IORegisterMap->BG1CNT;
-	this->_BGLayer[GPULayerID_BG2].BGnCNT = this->_IORegisterMap->BG2CNT;
-	this->_BGLayer[GPULayerID_BG3].BGnCNT = this->_IORegisterMap->BG3CNT;
-	
-	this->_BGLayer[GPULayerID_BG0].size = GPUEngineBase::_BGLayerSizeLUT[BGType_Affine][1];
-	this->_BGLayer[GPULayerID_BG1].size = GPUEngineBase::_BGLayerSizeLUT[BGType_Affine][1];
-	this->_BGLayer[GPULayerID_BG2].size = GPUEngineBase::_BGLayerSizeLUT[BGType_Affine][1];
-	this->_BGLayer[GPULayerID_BG3].size = GPUEngineBase::_BGLayerSizeLUT[BGType_Affine][1];
-	
-	this->_BGLayer[GPULayerID_BG0].baseType = BGType_Invalid;
-	this->_BGLayer[GPULayerID_BG1].baseType = BGType_Invalid;
-	this->_BGLayer[GPULayerID_BG2].baseType = BGType_Invalid;
-	this->_BGLayer[GPULayerID_BG3].baseType = BGType_Invalid;
-	
-	this->_BGLayer[GPULayerID_BG0].type = BGType_Invalid;
-	this->_BGLayer[GPULayerID_BG1].type = BGType_Invalid;
-	this->_BGLayer[GPULayerID_BG2].type = BGType_Invalid;
-	this->_BGLayer[GPULayerID_BG3].type = BGType_Invalid;
-	
-	this->_BGLayer[GPULayerID_BG0].priority = 0;
-	this->_BGLayer[GPULayerID_BG1].priority = 0;
-	this->_BGLayer[GPULayerID_BG2].priority = 0;
-	this->_BGLayer[GPULayerID_BG3].priority = 0;
-	
-	this->_BGLayer[GPULayerID_BG0].isVisible = false;
-	this->_BGLayer[GPULayerID_BG1].isVisible = false;
-	this->_BGLayer[GPULayerID_BG2].isVisible = false;
-	this->_BGLayer[GPULayerID_BG3].isVisible = false;
-	
-	this->_BGLayer[GPULayerID_BG0].isMosaic = false;
-	this->_BGLayer[GPULayerID_BG1].isMosaic = false;
-	this->_BGLayer[GPULayerID_BG2].isMosaic = false;
-	this->_BGLayer[GPULayerID_BG3].isMosaic = false;
-	
-	this->_BGLayer[GPULayerID_BG0].isDisplayWrapped = false;
-	this->_BGLayer[GPULayerID_BG1].isDisplayWrapped = false;
-	this->_BGLayer[GPULayerID_BG2].isDisplayWrapped = false;
-	this->_BGLayer[GPULayerID_BG3].isDisplayWrapped = false;
-	
-	this->_BGLayer[GPULayerID_BG0].extPaletteSlot = GPULayerID_BG0;
-	this->_BGLayer[GPULayerID_BG1].extPaletteSlot = GPULayerID_BG1;
-	this->_BGLayer[GPULayerID_BG0].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][GPULayerID_BG0];
-	this->_BGLayer[GPULayerID_BG1].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][GPULayerID_BG1];
-	this->_BGLayer[GPULayerID_BG2].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][GPULayerID_BG2];
-	this->_BGLayer[GPULayerID_BG3].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][GPULayerID_BG3];
-	
-	this->_needUpdateWINH[0] = true;
-	this->_needUpdateWINH[1] = true;
-	
-	this->vramBlockOBJAddress = 0;
-	
-	this->nativeLineRenderCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->nativeLineOutputCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	
-	for (size_t l = 0; l < GPU_FRAMEBUFFER_NATIVE_HEIGHT; l++)
-	{
-		this->isLineRenderNative[l] = true;
-		this->isLineOutputNative[l] = true;
-	}
-	
-	GPUEngineRenderState &renderState = this->_currentRenderState;
-	
-	renderState.displayOutputMode = GPUDisplayMode_Off;
-	renderState.selectedLayerID = GPULayerID_BG0;
-	renderState.selectedBGLayer = &this->_BGLayer[GPULayerID_BG0];
-	renderState.backdropColor16 = LE_TO_LOCAL_16(this->_paletteBG[0]) & 0x7FFF;
-	renderState.colorEffect = (ColorEffect)this->_IORegisterMap->BLDCNT.ColorEffect;
-	renderState.blendEVA = 0;
-	renderState.blendEVB = 0;
-	renderState.blendEVY = 0;
-	renderState.masterBrightnessMode = GPUMasterBrightMode_Disable;
-	renderState.masterBrightnessIntensity = 0;
-	renderState.masterBrightnessIsFullIntensity = false;
-	renderState.masterBrightnessIsMaxOrMin = true;
-	renderState.blendTable555 = (TBlendTable *)&GPUEngineBase::_blendTable555[renderState.blendEVA][renderState.blendEVB][0][0];
-	renderState.brightnessUpTable555 = &GPUEngineBase::_brightnessUpTable555[renderState.blendEVY][0];
-	renderState.brightnessUpTable666 = &GPUEngineBase::_brightnessUpTable666[renderState.blendEVY][0];
-	renderState.brightnessUpTable888 = &GPUEngineBase::_brightnessUpTable888[renderState.blendEVY][0];
-	renderState.brightnessDownTable555 = &GPUEngineBase::_brightnessDownTable555[renderState.blendEVY][0];
-	renderState.brightnessDownTable666 = &GPUEngineBase::_brightnessDownTable666[renderState.blendEVY][0];
-	renderState.brightnessDownTable888 = &GPUEngineBase::_brightnessDownTable888[renderState.blendEVY][0];
-	
-	renderState.srcBlendEnable[GPULayerID_BG0] = false;
-	renderState.srcBlendEnable[GPULayerID_BG1] = false;
-	renderState.srcBlendEnable[GPULayerID_BG2] = false;
-	renderState.srcBlendEnable[GPULayerID_BG3] = false;
-	renderState.srcBlendEnable[GPULayerID_OBJ] = false;
-	renderState.srcBlendEnable[GPULayerID_Backdrop] = false;
-	
-	renderState.dstBlendEnable[GPULayerID_BG0] = false;
-	renderState.dstBlendEnable[GPULayerID_BG1] = false;
-	renderState.dstBlendEnable[GPULayerID_BG2] = false;
-	renderState.dstBlendEnable[GPULayerID_BG3] = false;
-	renderState.dstBlendEnable[GPULayerID_OBJ] = false;
-	renderState.dstBlendEnable[GPULayerID_Backdrop] = false;
-	renderState.dstAnyBlendEnable = false;
-	
-#ifdef ENABLE_SSE2
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG1] = _mm_setzero_si128();
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG2] = _mm_setzero_si128();
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG3] = _mm_setzero_si128();
-	renderState.srcBlendEnable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128();
-	renderState.srcBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_setzero_si128();
-#ifdef ENABLE_SSSE3
-	renderState.dstBlendEnable_SSSE3 = _mm_setzero_si128();
-#else
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG1] = _mm_setzero_si128();
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG2] = _mm_setzero_si128();
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG3] = _mm_setzero_si128();
-	renderState.dstBlendEnable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128();
-	renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_setzero_si128();
-#endif
-#endif
-	
-	renderState.WIN0_enable[GPULayerID_BG0] = 0;
-	renderState.WIN0_enable[GPULayerID_BG1] = 0;
-	renderState.WIN0_enable[GPULayerID_BG2] = 0;
-	renderState.WIN0_enable[GPULayerID_BG3] = 0;
-	renderState.WIN0_enable[GPULayerID_OBJ] = 0;
-	renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] = 0;
-	
-	renderState.WIN1_enable[GPULayerID_BG0] = 0;
-	renderState.WIN1_enable[GPULayerID_BG1] = 0;
-	renderState.WIN1_enable[GPULayerID_BG2] = 0;
-	renderState.WIN1_enable[GPULayerID_BG3] = 0;
-	renderState.WIN1_enable[GPULayerID_OBJ] = 0;
-	renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] = 0;
-	
-	renderState.WINOUT_enable[GPULayerID_BG0] = 0;
-	renderState.WINOUT_enable[GPULayerID_BG1] = 0;
-	renderState.WINOUT_enable[GPULayerID_BG2] = 0;
-	renderState.WINOUT_enable[GPULayerID_BG3] = 0;
-	renderState.WINOUT_enable[GPULayerID_OBJ] = 0;
-	renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] = 0;
-	
-	renderState.WINOBJ_enable[GPULayerID_BG0] = 0;
-	renderState.WINOBJ_enable[GPULayerID_BG1] = 0;
-	renderState.WINOBJ_enable[GPULayerID_BG2] = 0;
-	renderState.WINOBJ_enable[GPULayerID_BG3] = 0;
-	renderState.WINOBJ_enable[GPULayerID_OBJ] = 0;
-	renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] = 0;
-	
-#if defined(ENABLE_SSE2)
-	renderState.WIN0_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
-	renderState.WIN0_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128();
-	renderState.WIN0_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128();
-	renderState.WIN0_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128();
-	renderState.WIN0_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128();
-	renderState.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128();
-	
-	renderState.WIN1_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
-	renderState.WIN1_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128();
-	renderState.WIN1_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128();
-	renderState.WIN1_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128();
-	renderState.WIN1_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128();
-	renderState.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128();
-	
-	renderState.WINOUT_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
-	renderState.WINOUT_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128();
-	renderState.WINOUT_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128();
-	renderState.WINOUT_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128();
-	renderState.WINOUT_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128();
-	renderState.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128();
-	
-	renderState.WINOBJ_enable_SSE2[GPULayerID_BG0] = _mm_setzero_si128();
-	renderState.WINOBJ_enable_SSE2[GPULayerID_BG1] = _mm_setzero_si128();
-	renderState.WINOBJ_enable_SSE2[GPULayerID_BG2] = _mm_setzero_si128();
-	renderState.WINOBJ_enable_SSE2[GPULayerID_BG3] = _mm_setzero_si128();
-	renderState.WINOBJ_enable_SSE2[GPULayerID_OBJ] = _mm_setzero_si128();
-	renderState.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_setzero_si128();
-#endif
-	
-	renderState.WIN0_ENABLED = false;
-	renderState.WIN1_ENABLED = false;
-	renderState.WINOBJ_ENABLED = false;
-	renderState.isAnyWindowEnabled = false;
-	
-	renderState.mosaicWidthBG = this->_mosaicLookup.table[0];
-	renderState.mosaicHeightBG = this->_mosaicLookup.table[0];
-	renderState.mosaicWidthOBJ = this->_mosaicLookup.table[0];
-	renderState.mosaicHeightOBJ = this->_mosaicLookup.table[0];
-	renderState.isBGMosaicSet = false;
-	renderState.isOBJMosaicSet = false;
-	
-	renderState.spriteRenderMode = SpriteRenderMode_Sprite1D;
-	renderState.spriteBoundary = 0;
-	renderState.spriteBMPBoundary = 0;
-	
-	this->savedBG2X.value = 0;
-	this->savedBG2Y.value = 0;
-	this->savedBG3X.value = 0;
-	this->savedBG3Y.value = 0;
-	
-	this->renderedWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	this->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->renderedBuffer = this->nativeBuffer;
-	
-	for (size_t line = 0; line < GPU_FRAMEBUFFER_NATIVE_HEIGHT; line++)
-	{
-		this->_currentCompositorInfo[line].renderState = renderState;
-	}
-}
-
-void GPUEngineBase::Reset()
-{
-	this->_Reset_Base();
-}
-
-void GPUEngineBase::_ResortBGLayers()
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	int i, prio;
-	itemsForPriority_t *item;
-	
-	// we don't need to check for windows here...
-	// if we tick boxes, invisible layers become invisible & vice versa
-#define OP ^ !
-	// if we untick boxes, layers become invisible
-	//#define OP &&
-	this->_enableLayer[GPULayerID_BG0] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG0] OP(this->_BGLayer[GPULayerID_BG0].isVisible);
-	this->_enableLayer[GPULayerID_BG1] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG1] OP(this->_BGLayer[GPULayerID_BG1].isVisible);
-	this->_enableLayer[GPULayerID_BG2] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG2] OP(this->_BGLayer[GPULayerID_BG2].isVisible);
-	this->_enableLayer[GPULayerID_BG3] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG3] OP(this->_BGLayer[GPULayerID_BG3].isVisible);
-	this->_enableLayer[GPULayerID_OBJ] = CommonSettings.dispLayers[this->_engineID][GPULayerID_OBJ] OP(DISPCNT.OBJ_Enable);
-	
-	this->_isAnyBGLayerEnabled = this->_enableLayer[GPULayerID_BG0] || this->_enableLayer[GPULayerID_BG1] || this->_enableLayer[GPULayerID_BG2] || this->_enableLayer[GPULayerID_BG3];
-	
-	// KISS ! lower priority first, if same then lower num
-	for (i = 0; i < NB_PRIORITIES; i++)
-	{
-		item = &(this->_itemsForPriority[i]);
-		item->nbBGs = 0;
-		item->nbPixelsX = 0;
-	}
-	
-	for (i = NB_BG; i > 0; )
-	{
-		i--;
-		if (!this->_enableLayer[i]) continue;
-		prio = this->_BGLayer[i].priority;
-		item = &(this->_itemsForPriority[prio]);
-		item->BGs[item->nbBGs]=i;
-		item->nbBGs++;
-	}
-	
-#if 0
-	//debug
-	for (i = 0; i < NB_PRIORITIES; i++)
-	{
-		item = &(this->_itemsForPriority[i]);
-		printf("%d : ", i);
-		for (j=0; j<NB_PRIORITIES; j++)
-		{
-			if (j < item->nbBGs)
-				printf("BG%d ", item->BGs[j]);
-			else
-				printf("... ", item->BGs[j]);
-		}
-	}
-	printf("\n");
-#endif
-}
-
-FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB)
-{
-	u16 ra =  colA        & 0x001F;
-	u16 ga = (colA >>  5) & 0x001F;
-	u16 ba = (colA >> 10) & 0x001F;
-	u16 rb =  colB        & 0x001F;
-	u16 gb = (colB >>  5) & 0x001F;
-	u16 bb = (colB >> 10) & 0x001F;
-	
-	ra = ( (ra * blendEVA) + (rb * blendEVB) ) / 16;
-	ga = ( (ga * blendEVA) + (gb * blendEVB) ) / 16;
-	ba = ( (ba * blendEVA) + (bb * blendEVB) ) / 16;
-	
-	ra = (ra > 31) ? 31 : ra;
-	ga = (ga > 31) ? 31 : ga;
-	ba = (ba > 31) ? 31 : ba;
-	
-	return ra | (ga << 5) | (ba << 10);
-}
-
-template <NDSColorFormat COLORFORMAT>
-FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB)
-{
-	FragmentColor outColor;
-	
-	u16 r16 = ( (colA.r * blendEVA) + (colB.r * blendEVB) ) / 16;
-	u16 g16 = ( (colA.g * blendEVA) + (colB.g * blendEVB) ) / 16;
-	u16 b16 = ( (colA.b * blendEVA) + (colB.b * blendEVB) ) / 16;
-	
-	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
-	{
-		outColor.r = (r16 > 63) ? 63 : r16;
-		outColor.g = (g16 > 63) ? 63 : g16;
-		outColor.b = (b16 > 63) ? 63 : b16;
-	}
-	else if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		outColor.r = (r16 > 255) ? 255 : r16;
-		outColor.g = (g16 > 255) ? 255 : g16;
-		outColor.b = (b16 > 255) ? 255 : b16;
-	}
-	
-	outColor.a = 0;
-	return outColor;
-}
-
-FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable)
-{
-	const u8 r = (*blendTable)[ colA        & 0x1F][ colB        & 0x1F];
-	const u8 g = (*blendTable)[(colA >>  5) & 0x1F][(colB >>  5) & 0x1F];
-	const u8 b = (*blendTable)[(colA >> 10) & 0x1F][(colB >> 10) & 0x1F];
-
-	return r | (g << 5) | (b << 10);
-}
-
-FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend3D(const FragmentColor colA, const u16 colB)
-{
-	const u16 alpha = colA.a + 1;
-	COLOR c2;
-	COLOR cfinal;
-	
-	c2.val = colB;
-	
-	cfinal.bits.red   = ((colA.r * alpha) + ((c2.bits.red   << 1) * (32 - alpha))) >> 6;
-	cfinal.bits.green = ((colA.g * alpha) + ((c2.bits.green << 1) * (32 - alpha))) >> 6;
-	cfinal.bits.blue  = ((colA.b * alpha) + ((c2.bits.blue  << 1) * (32 - alpha))) >> 6;
-	cfinal.bits.alpha = 0;
-	
-	return cfinal.val;
-}
-
-template <NDSColorFormat COLORFORMATB>
-FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend3D(const FragmentColor colA, const FragmentColor colB)
-{
-	FragmentColor blendedColor;
-	const u16 alpha = colA.a + 1;
-	
-	if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
-	{
-		blendedColor.r = ((colA.r * alpha) + (colB.r * (32 - alpha))) >> 5;
-		blendedColor.g = ((colA.g * alpha) + (colB.g * (32 - alpha))) >> 5;
-		blendedColor.b = ((colA.b * alpha) + (colB.b * (32 - alpha))) >> 5;
-	}
-	else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
-	{
-		blendedColor.r = ((colA.r * alpha) + (colB.r * (256 - alpha))) >> 8;
-		blendedColor.g = ((colA.g * alpha) + (colB.g * (256 - alpha))) >> 8;
-		blendedColor.b = ((colA.b * alpha) + (colB.b * (256 - alpha))) >> 8;
-	}
-	
-	blendedColor.a = 0;
-	return blendedColor;
-}
-
-FORCEINLINE u16 GPUEngineBase::_ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY)
-{
-	u16 r =  col        & 0x001F;
-	u16 g = (col >>  5) & 0x001F;
-	u16 b = (col >> 10) & 0x001F;
-	
-	r = (r + ((31 - r) * blendEVY / 16));
-	g = (g + ((31 - g) * blendEVY / 16));
-	b = (b + ((31 - b) * blendEVY / 16));
-	
-	return r | (g << 5) | (b << 10);
-}
-
-template <NDSColorFormat COLORFORMAT>
-FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectIncreaseBrightness(const FragmentColor col, const u16 blendEVY)
-{
-	FragmentColor newColor;
-	newColor.color = 0;
-	
-	u32 r = col.r;
-	u32 g = col.g;
-	u32 b = col.b;
-	
-	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
-	{
-		newColor.r = (r + ((63 - r) * blendEVY / 16));
-		newColor.g = (g + ((63 - g) * blendEVY / 16));
-		newColor.b = (b + ((63 - b) * blendEVY / 16));
-	}
-	else if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		newColor.r = (r + ((255 - r) * blendEVY / 16));
-		newColor.g = (g + ((255 - g) * blendEVY / 16));
-		newColor.b = (b + ((255 - b) * blendEVY / 16));
-	}
-	
-	return newColor;
-}
-
-FORCEINLINE u16 GPUEngineBase::_ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY)
-{
-	u16 r =  col        & 0x001F;
-	u16 g = (col >>  5) & 0x001F;
-	u16 b = (col >> 10) & 0x001F;
-	
-	r = (r - (r * blendEVY / 16));
-	g = (g - (g * blendEVY / 16));
-	b = (b - (b * blendEVY / 16));
-	
-	return r | (g << 5) | (b << 10);
-}
-
-FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectDecreaseBrightness(const FragmentColor col, const u16 blendEVY)
-{
-	FragmentColor newColor;
-	newColor.color = 0;
-	
-	u32 r = col.r;
-	u32 g = col.g;
-	u32 b = col.b;
-	
-	newColor.r = (r - (r * blendEVY / 16));
-	newColor.g = (g - (g * blendEVY / 16));
-	newColor.b = (b - (b * blendEVY / 16));
-	
-	return newColor;
-}
-
-#ifdef ENABLE_SSE2
-
-template <NDSColorFormat COLORFORMAT>
-FORCEINLINE __m128i GPUEngineBase::_ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY)
-{
-	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		__m128i r_vec128 = _mm_and_si128(                col,      _mm_set1_epi16(0x001F) );
-		__m128i g_vec128 = _mm_and_si128( _mm_srli_epi16(col,  5), _mm_set1_epi16(0x001F) );
-		__m128i b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) );
-		
-		r_vec128 = _mm_add_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), r_vec128), blendEVY), 4) );
-		g_vec128 = _mm_add_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), g_vec128), blendEVY), 4) );
-		b_vec128 = _mm_add_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16(31), b_vec128), blendEVY), 4) );
-		
-		return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) );
-	}
-	else
-	{
-		__m128i rgbLo = _mm_unpacklo_epi8(col, _mm_setzero_si128());
-		__m128i rgbHi = _mm_unpackhi_epi8(col, _mm_setzero_si128());
-		
-		rgbLo = _mm_add_epi16( rgbLo, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbLo), blendEVY), 4) );
-		rgbHi = _mm_add_epi16( rgbHi, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(_mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255), rgbHi), blendEVY), 4) );
-		
-		return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) );
-	}
-}
-
-template <NDSColorFormat COLORFORMAT>
-FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY)
-{
-	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		__m128i r_vec128 = _mm_and_si128(                col,      _mm_set1_epi16(0x001F) );
-		__m128i g_vec128 = _mm_and_si128( _mm_srli_epi16(col,  5), _mm_set1_epi16(0x001F) );
-		__m128i b_vec128 = _mm_and_si128( _mm_srli_epi16(col, 10), _mm_set1_epi16(0x001F) );
-		
-		r_vec128 = _mm_sub_epi16( r_vec128, _mm_srli_epi16(_mm_mullo_epi16(r_vec128, blendEVY), 4) );
-		g_vec128 = _mm_sub_epi16( g_vec128, _mm_srli_epi16(_mm_mullo_epi16(g_vec128, blendEVY), 4) );
-		b_vec128 = _mm_sub_epi16( b_vec128, _mm_srli_epi16(_mm_mullo_epi16(b_vec128, blendEVY), 4) );
-		
-		return _mm_or_si128(r_vec128, _mm_or_si128( _mm_slli_epi16(g_vec128, 5), _mm_slli_epi16(b_vec128, 10)) );
-	}
-	else
-	{
-		__m128i rgbLo = _mm_unpacklo_epi8(col, _mm_setzero_si128());
-		__m128i rgbHi = _mm_unpackhi_epi8(col, _mm_setzero_si128());
-		
-		rgbLo = _mm_sub_epi16( rgbLo, _mm_srli_epi16(_mm_mullo_epi16(rgbLo, blendEVY), 4) );
-		rgbHi = _mm_sub_epi16( rgbHi, _mm_srli_epi16(_mm_mullo_epi16(rgbHi, blendEVY), 4) );
-		
-		return _mm_and_si128( _mm_packus_epi16(rgbLo, rgbHi), _mm_set1_epi32(0x00FFFFFF) );
-	}
-}
-
-template <NDSColorFormat COLORFORMAT>
-FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB)
-{
-#ifdef ENABLE_SSSE3
-	__m128i blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8));
-#endif
-	
-	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		__m128i ra;
-		__m128i ga;
-		__m128i ba;
-		__m128i colorBitMask = _mm_set1_epi16(0x001F);
-		
-#ifdef ENABLE_SSSE3
-		ra = _mm_or_si128( _mm_and_si128(               colA,      colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 8), _mm_set1_epi16(0x1F00)) );
-		ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA,  5), colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 3), _mm_set1_epi16(0x1F00)) );
-		ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(colB, 2), _mm_set1_epi16(0x1F00)) );
-		
-		ra = _mm_maddubs_epi16(ra, blendAB);
-		ga = _mm_maddubs_epi16(ga, blendAB);
-		ba = _mm_maddubs_epi16(ba, blendAB);
-#else
-		ra = _mm_and_si128(               colA,      colorBitMask);
-		ga = _mm_and_si128(_mm_srli_epi16(colA,  5), colorBitMask);
-		ba = _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask);
-		
-		__m128i rb = _mm_and_si128(               colB,      colorBitMask);
-		__m128i gb = _mm_and_si128(_mm_srli_epi16(colB,  5), colorBitMask);
-		__m128i bb = _mm_and_si128(_mm_srli_epi16(colB, 10), colorBitMask);
-		
-		ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) );
-		ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
-		ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
-#endif
-		
-		ra = _mm_srli_epi16(ra, 4);
-		ga = _mm_srli_epi16(ga, 4);
-		ba = _mm_srli_epi16(ba, 4);
-		
-		ra = _mm_min_epi16(ra, colorBitMask);
-		ga = _mm_min_epi16(ga, colorBitMask);
-		ba = _mm_min_epi16(ba, colorBitMask);
-		
-		return _mm_or_si128(ra, _mm_or_si128( _mm_slli_epi16(ga, 5), _mm_slli_epi16(ba, 10)) );
-	}
-	else
-	{
-		__m128i outColorLo;
-		__m128i outColorHi;
-		__m128i outColor;
-		
-#ifdef ENABLE_SSSE3
-		outColorLo = _mm_unpacklo_epi8(colA, colB);
-		outColorHi = _mm_unpackhi_epi8(colA, colB);
-		
-		outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
-		outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
-#else
-		__m128i colALo = _mm_unpacklo_epi8(colA, _mm_setzero_si128());
-		__m128i colAHi = _mm_unpackhi_epi8(colA, _mm_setzero_si128());
-		__m128i colBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128());
-		__m128i colBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128());
-		
-		outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendEVA), _mm_mullo_epi16(colBLo, blendEVB) );
-		outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendEVA), _mm_mullo_epi16(colBHi, blendEVB) );
-#endif
-		
-		outColorLo = _mm_srli_epi16(outColorLo, 4);
-		outColorHi = _mm_srli_epi16(outColorHi, 4);
-		outColor = _mm_packus_epi16(outColorLo, outColorHi);
-		
-		// When the color format is 888, the packuswb instruction will naturally clamp
-		// the color component values to 255. However, when the color format is 666, the
-		// color component values must be clamped to 63. In this case, we must call pminub
-		// to do the clamp.
-		if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
-		{
-			outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
-		}
-		
-		outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
-		
-		return outColor;
-	}
-}
-
-template <NDSColorFormat COLORFORMATB>
-FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB)
-{
-	if (COLORFORMATB == NDSColorFormat_BGR555_Rev)
-	{
-		// If the color format of B is 555, then the colA_Hi parameter is required.
-		// The color format of A is assumed to be RGB666.
-		__m128i ra_lo = _mm_and_si128(                colA_Lo,      _mm_set1_epi32(0x000000FF) );
-		__m128i ga_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo,  8), _mm_set1_epi32(0x000000FF) );
-		__m128i ba_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 16), _mm_set1_epi32(0x000000FF) );
-		__m128i aa_lo =                _mm_srli_epi32(colA_Lo, 24);
-		
-		__m128i ra_hi = _mm_and_si128(                colA_Hi,      _mm_set1_epi32(0x000000FF) );
-		__m128i ga_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi,  8), _mm_set1_epi32(0x000000FF) );
-		__m128i ba_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 16), _mm_set1_epi32(0x000000FF) );
-		__m128i aa_hi =                _mm_srli_epi32(colA_Hi, 24);
-		
-		__m128i ra = _mm_packs_epi32(ra_lo, ra_hi);
-		__m128i ga = _mm_packs_epi32(ga_lo, ga_hi);
-		__m128i ba = _mm_packs_epi32(ba_lo, ba_hi);
-		__m128i aa = _mm_packs_epi32(aa_lo, aa_hi);
-		
-#ifdef ENABLE_SSSE3
-		ra = _mm_or_si128( ra, _mm_and_si128(_mm_slli_epi16(colB, 9), _mm_set1_epi16(0x3E00)) );
-		ga = _mm_or_si128( ga, _mm_and_si128(_mm_slli_epi16(colB, 4), _mm_set1_epi16(0x3E00)) );
-		ba = _mm_or_si128( ba, _mm_and_si128(_mm_srli_epi16(colB, 1), _mm_set1_epi16(0x3E00)) );
-		
-		aa = _mm_adds_epu8(aa, _mm_set1_epi16(1));
-		aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8(32), aa), 8) );
-		
-		ra = _mm_maddubs_epi16(ra, aa);
-		ga = _mm_maddubs_epi16(ga, aa);
-		ba = _mm_maddubs_epi16(ba, aa);
-#else
-		aa = _mm_adds_epu16(aa, _mm_set1_epi16(1));
-		__m128i rb = _mm_and_si128( _mm_slli_epi16(colB, 1), _mm_set1_epi16(0x003E) );
-		__m128i gb = _mm_and_si128( _mm_srli_epi16(colB, 4), _mm_set1_epi16(0x003E) );
-		__m128i bb = _mm_and_si128( _mm_srli_epi16(colB, 9), _mm_set1_epi16(0x003E) );
-		__m128i ab = _mm_subs_epu16( _mm_set1_epi16(32), aa );
-		
-		ra = _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) );
-		ga = _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) );
-		ba = _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) );
-#endif
-		
-		ra = _mm_srli_epi16(ra, 6);
-		ga = _mm_srli_epi16(ga, 6);
-		ba = _mm_srli_epi16(ba, 6);
-		
-		return _mm_or_si128( _mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10) );
-	}
-	else
-	{
-		// If the color format of B is 666 or 888, then the colA_Hi parameter is ignored.
-		// The color format of A is assumed to match the color format of B.
-		__m128i rgbALo;
-		__m128i rgbAHi;
-		
-#ifdef ENABLE_SSSE3
-		if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
-		{
-			// Does not work for RGBA8888 color format. The reason is because this
-			// algorithm depends on the pmaddubsw instruction, which multiplies
-			// two unsigned 8-bit integers into an intermediate signed 16-bit
-			// integer. This means that we can overrun the signed 16-bit value
-			// range, which would be limited to [-32767 - 32767]. For example, a
-			// color component of value 255 multiplied by an alpha value of 255
-			// would equal 65025, which is greater than the upper range of a signed
-			// 16-bit value.
-			rgbALo = _mm_unpacklo_epi8(colA_Lo, colB);
-			rgbAHi = _mm_unpackhi_epi8(colA_Lo, colB);
-			
-			__m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x0000001F) );
-			alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) );
-			alpha = _mm_adds_epu8(alpha, _mm_set1_epi8(1));
-			
-			__m128i invAlpha = _mm_subs_epu8(_mm_set1_epi8(32), alpha);
-			__m128i alphaLo = _mm_unpacklo_epi8(alpha, invAlpha);
-			__m128i alphaHi = _mm_unpackhi_epi8(alpha, invAlpha);
-			
-			rgbALo = _mm_maddubs_epi16(rgbALo, alphaLo);
-			rgbAHi = _mm_maddubs_epi16(rgbAHi, alphaHi);
-		}
-		else
-#endif
-		{
-			rgbALo = _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128());
-			rgbAHi = _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128());
-			__m128i rgbBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128());
-			__m128i rgbBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128());
-			
-			__m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x000000FF) );
-			alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) );
-			
-			__m128i alphaLo = _mm_unpacklo_epi8(alpha, _mm_setzero_si128());
-			__m128i alphaHi = _mm_unpackhi_epi8(alpha, _mm_setzero_si128());
-			alphaLo = _mm_add_epi16(alphaLo, _mm_set1_epi16(1));
-			alphaHi = _mm_add_epi16(alphaHi, _mm_set1_epi16(1));
-			
-			if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
-			{
-				rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(32), alphaLo)) );
-				rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(32), alphaHi)) );
-			}
-			else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
-			{
-				rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(256), alphaLo)) );
-				rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(256), alphaHi)) );
-			}
-		}
-		
-		if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
-		{
-			rgbALo = _mm_srli_epi16(rgbALo, 5);
-			rgbAHi = _mm_srli_epi16(rgbAHi, 5);
-		}
-		else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
-		{
-			rgbALo = _mm_srli_epi16(rgbALo, 8);
-			rgbAHi = _mm_srli_epi16(rgbAHi, 8);
-		}
-		
-		return _mm_and_si128( _mm_packus_epi16(rgbALo, rgbAHi), _mm_set1_epi32(0x00FFFFFF) );
-	}
-}
-
-#endif
-
-void GPUEngineBase::ParseReg_MASTER_BRIGHT()
-{
-	const IOREG_MASTER_BRIGHT &MASTER_BRIGHT = this->_IORegisterMap->MASTER_BRIGHT;
-	GPUEngineRenderState &renderState = this->_currentRenderState;
-	
-	renderState.masterBrightnessIntensity = (MASTER_BRIGHT.Intensity >= 16) ? 16 : MASTER_BRIGHT.Intensity;
-	renderState.masterBrightnessMode = (GPUMasterBrightMode)MASTER_BRIGHT.Mode;
-	renderState.masterBrightnessIsFullIntensity = ( (MASTER_BRIGHT.Intensity >= 16) && ((MASTER_BRIGHT.Mode == GPUMasterBrightMode_Up) || (MASTER_BRIGHT.Mode == GPUMasterBrightMode_Down)) );
-	renderState.masterBrightnessIsMaxOrMin = ( (MASTER_BRIGHT.Intensity >= 16) || (MASTER_BRIGHT.Intensity == 0) );
-}
-
-//Sets up LCD control variables for Display Engines A and B for quick reading
-void GPUEngineBase::ParseReg_DISPCNT()
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	GPUEngineRenderState &renderState = this->_currentRenderState;
-	
-	renderState.displayOutputMode = (this->_engineID == GPUEngineID_Main) ? (GPUDisplayMode)DISPCNT.DisplayMode : (GPUDisplayMode)(DISPCNT.DisplayMode & 0x01);
-	
-	renderState.WIN0_ENABLED = (DISPCNT.Win0_Enable != 0);
-	renderState.WIN1_ENABLED = (DISPCNT.Win1_Enable != 0);
-	renderState.WINOBJ_ENABLED = (DISPCNT.WinOBJ_Enable != 0);
-	renderState.isAnyWindowEnabled = (renderState.WIN0_ENABLED || renderState.WIN1_ENABLED || renderState.WINOBJ_ENABLED);
-	
-	if (DISPCNT.OBJ_Tile_mapping)
-	{
-		//1-d sprite mapping boundaries:
-		//32k, 64k, 128k, 256k
-		renderState.spriteBoundary = 5 + DISPCNT.OBJ_Tile_1D_Bound;
-		
-		//do not be deceived: even though a sprBoundary==8 (256KB region) is impossible to fully address
-		//in GPU_SUB, it is still fully legal to address it with that granularity.
-		//so don't do this: //if((gpu->core == GPU_SUB) && (cnt->OBJ_Tile_1D_Bound == 3)) gpu->sprBoundary = 7;
-
-		renderState.spriteRenderMode = SpriteRenderMode_Sprite1D;
-	}
-	else
-	{
-		//2d sprite mapping
-		//boundary : 32k
-		renderState.spriteBoundary = 5;
-		renderState.spriteRenderMode = SpriteRenderMode_Sprite2D;
-	}
-     
-	if (DISPCNT.OBJ_BMP_1D_Bound && (this->_engineID == GPUEngineID_Main))
-		renderState.spriteBMPBoundary = 8;
-	else
-		renderState.spriteBMPBoundary = 7;
-	
-	this->ParseReg_BGnCNT(GPULayerID_BG3);
-	this->ParseReg_BGnCNT(GPULayerID_BG2);
-	this->ParseReg_BGnCNT(GPULayerID_BG1);
-	this->ParseReg_BGnCNT(GPULayerID_BG0);
-}
-
-void GPUEngineBase::ParseReg_BGnCNT(const GPULayerID layerID)
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	const IOREG_BGnCNT &BGnCNT = this->_IORegisterMap->BGnCNT[layerID];
-	this->_BGLayer[layerID].BGnCNT = BGnCNT;
-	
-	switch (layerID)
-	{
-		case GPULayerID_BG0: this->_BGLayer[layerID].isVisible = (DISPCNT.BG0_Enable != 0); break;
-		case GPULayerID_BG1: this->_BGLayer[layerID].isVisible = (DISPCNT.BG1_Enable != 0); break;
-		case GPULayerID_BG2: this->_BGLayer[layerID].isVisible = (DISPCNT.BG2_Enable != 0); break;
-		case GPULayerID_BG3: this->_BGLayer[layerID].isVisible = (DISPCNT.BG3_Enable != 0); break;
-			
-		default:
-			break;
-	}
-	
-	if (this->_engineID == GPUEngineID_Main)
-	{
-		this->_BGLayer[layerID].largeBMPAddress  = MMU_ABG;
-		this->_BGLayer[layerID].BMPAddress       = MMU_ABG + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_16KB);
-		this->_BGLayer[layerID].tileMapAddress   = MMU_ABG + (DISPCNT.ScreenBase_Block * ADDRESS_STEP_64KB) + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_2KB);
-		this->_BGLayer[layerID].tileEntryAddress = MMU_ABG + (DISPCNT.CharacBase_Block * ADDRESS_STEP_64KB) + (BGnCNT.CharacBase_Block * ADDRESS_STEP_16KB);
-	}
-	else
-	{
-		this->_BGLayer[layerID].largeBMPAddress  = MMU_BBG;
-		this->_BGLayer[layerID].BMPAddress       = MMU_BBG + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_16KB);
-		this->_BGLayer[layerID].tileMapAddress   = MMU_BBG + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_2KB);
-		this->_BGLayer[layerID].tileEntryAddress = MMU_BBG + (BGnCNT.CharacBase_Block * ADDRESS_STEP_16KB);
-	}
-	
-	//clarify affine ext modes
-	BGType mode = GPUEngineBase::_mode2type[DISPCNT.BG_Mode][layerID];
-	this->_BGLayer[layerID].baseType = mode;
-	
-	if (mode == BGType_AffineExt)
-	{
-		//see: http://nocash.emubase.de/gbatek.htm#dsvideobgmodescontrol
-		const u8 affineModeSelection = (BGnCNT.PaletteMode << 1) | (BGnCNT.CharacBase_Block & 1);
-		switch (affineModeSelection)
-		{
-			case 0:
-			case 1:
-				mode = BGType_AffineExt_256x16;
-				break;
-			case 2:
-				mode = BGType_AffineExt_256x1;
-				break;
-			case 3:
-				mode = BGType_AffineExt_Direct;
-				break;
-		}
-	}
-	
-	// Extended palette slots can be changed for BG0 and BG1, but BG2 and BG3 remain constant.
-	// Display wrapping can be changed for BG2 and BG3, but BG0 and BG1 cannot wrap.
-	if (layerID == GPULayerID_BG0 || layerID == GPULayerID_BG1)
-	{
-		this->_BGLayer[layerID].extPaletteSlot = (BGnCNT.PaletteSet_Wrap * 2) + layerID;
-	}
-	else
-	{
-		this->_BGLayer[layerID].isDisplayWrapped = (BGnCNT.PaletteSet_Wrap != 0);
-	}
-	
-	this->_BGLayer[layerID].type = mode;
-	this->_BGLayer[layerID].size = GPUEngineBase::_BGLayerSizeLUT[mode][BGnCNT.ScreenSize];
-	this->_BGLayer[layerID].isMosaic = (BGnCNT.Mosaic != 0);
-	this->_BGLayer[layerID].priority = BGnCNT.Priority;
-	this->_BGLayer[layerID].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][this->_BGLayer[layerID].extPaletteSlot];
-	
-	this->_ResortBGLayers();
-}
-
-template <GPULayerID LAYERID>
-void GPUEngineBase::ParseReg_BGnHOFS()
-{
-	const IOREG_BGnHOFS &BGnHOFS = this->_IORegisterMap->BGnOFS[LAYERID].BGnHOFS;
-	this->_BGLayer[LAYERID].BGnHOFS = BGnHOFS;
-	
-#ifdef MSB_FIRST
-	this->_BGLayer[LAYERID].xOffset = LOCAL_TO_LE_16(BGnHOFS.value) & 0x01FF;
-#else
-	this->_BGLayer[LAYERID].xOffset = BGnHOFS.Offset;
-#endif
-}
-
-template <GPULayerID LAYERID>
-void GPUEngineBase::ParseReg_BGnVOFS()
-{
-	const IOREG_BGnVOFS &BGnVOFS = this->_IORegisterMap->BGnOFS[LAYERID].BGnVOFS;
-	this->_BGLayer[LAYERID].BGnVOFS = BGnVOFS;
-	
-#ifdef MSB_FIRST
-	this->_BGLayer[LAYERID].yOffset = LOCAL_TO_LE_16(BGnVOFS.value) & 0x01FF;
-#else
-	this->_BGLayer[LAYERID].yOffset = BGnVOFS.Offset;
-#endif
-}
-
-template <GPULayerID LAYERID>
-void GPUEngineBase::ParseReg_BGnX()
-{
-	if (LAYERID == GPULayerID_BG2)
-	{
-		this->savedBG2X = this->_IORegisterMap->BG2X;
-	}
-	else if (LAYERID == GPULayerID_BG3)
-	{
-		this->savedBG3X = this->_IORegisterMap->BG3X;
-	}
-}
-
-template <GPULayerID LAYERID>
-void GPUEngineBase::ParseReg_BGnY()
-{
-	if (LAYERID == GPULayerID_BG2)
-	{
-		this->savedBG2Y = this->_IORegisterMap->BG2Y;
-	}
-	else if (LAYERID == GPULayerID_BG3)
-	{
-		this->savedBG3Y = this->_IORegisterMap->BG3Y;
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::_RenderLine_Clear(GPUEngineCompositorInfo &compInfo)
-{
-	// Clear the current line with the clear color
-	u16 dstClearColor16 = compInfo.renderState.backdropColor16;
-	
-	if (compInfo.renderState.srcBlendEnable[GPULayerID_Backdrop])
-	{
-		if (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness)
-		{
-			dstClearColor16 = compInfo.renderState.brightnessUpTable555[compInfo.renderState.backdropColor16];
-		}
-		else if (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)
-		{
-			dstClearColor16 = compInfo.renderState.brightnessDownTable555[compInfo.renderState.backdropColor16];
-		}
-	}
-	
-	switch (OUTPUTFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(*compInfo.target.lineColor, dstClearColor16);
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(*compInfo.target.lineColor, COLOR555TO666(dstClearColor16));
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(*compInfo.target.lineColor, COLOR555TO888(dstClearColor16));
-			break;
-	}
-	
-	memset(this->_renderLineLayerIDNative, GPULayerID_Backdrop, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	memset(this->_sprWin, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	
-	// init pixels priorities
-	assert(NB_PRIORITIES == 4);
-	this->_itemsForPriority[0].nbPixelsX = 0;
-	this->_itemsForPriority[1].nbPixelsX = 0;
-	this->_itemsForPriority[2].nbPixelsX = 0;
-	this->_itemsForPriority[3].nbPixelsX = 0;
-}
-
-void GPUEngineBase::UpdateRenderStates(const size_t l)
-{
-	GPUEngineCompositorInfo &compInfo = this->_currentCompositorInfo[l];
-	
-	this->_currentRenderState.backdropColor16 = LE_TO_LOCAL_16(this->_paletteBG[0]) & 0x7FFF;
-	compInfo.renderState = this->_currentRenderState;
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::RenderLine(const size_t l)
-{
-	// By default, do nothing.
-	this->UpdatePropertiesWithoutRender(l);
-}
-
-void GPUEngineBase::UpdatePropertiesWithoutRender(const u16 l)
-{
-	// Update BG2/BG3 parameters for Affine and AffineExt modes
-	if (  this->_enableLayer[GPULayerID_BG2] &&
-		((this->_BGLayer[GPULayerID_BG2].baseType == BGType_Affine) || (this->_BGLayer[GPULayerID_BG2].baseType == BGType_AffineExt)) )
-	{
-		IOREG_BG2Parameter &BG2Param = this->_IORegisterMap->BG2Param;
-		
-		BG2Param.BG2X.value += BG2Param.BG2PB.value;
-		BG2Param.BG2Y.value += BG2Param.BG2PD.value;
-	}
-	
-	if (  this->_enableLayer[GPULayerID_BG3] &&
-		((this->_BGLayer[GPULayerID_BG3].baseType == BGType_Affine) || (this->_BGLayer[GPULayerID_BG3].baseType == BGType_AffineExt)) )
-	{
-		IOREG_BG3Parameter &BG3Param = this->_IORegisterMap->BG3Param;
-		
-		BG3Param.BG3X.value += BG3Param.BG3PB.value;
-		BG3Param.BG3Y.value += BG3Param.BG3PD.value;
-	}
-}
-
-void GPUEngineBase::LastLineProcess()
-{
-	this->RefreshAffineStartRegs();
-}
-
-const GPU_IOREG& GPUEngineBase::GetIORegisterMap() const
-{
-	return *this->_IORegisterMap;
-}
-
-bool GPUEngineBase::IsMasterBrightFullIntensity() const
-{
-	return this->_currentRenderState.masterBrightnessIsFullIntensity;
-}
-
-bool GPUEngineBase::IsMasterBrightMaxOrMin() const
-{
-	return this->_currentRenderState.masterBrightnessIsMaxOrMin;
-}
-
-bool GPUEngineBase::IsMasterBrightFullIntensityAtLineZero() const
-{
-	return this->_currentCompositorInfo[0].renderState.masterBrightnessIsFullIntensity;
-}
-
-void GPUEngineBase::GetMasterBrightnessAtLineZero(GPUMasterBrightMode &outMode, u8 &outIntensity)
-{
-	outMode = this->_currentCompositorInfo[0].renderState.masterBrightnessMode;
-	outIntensity = this->_currentCompositorInfo[0].renderState.masterBrightnessIntensity;
-}
-
-/*****************************************************************************/
-//			ENABLING / DISABLING LAYERS
-/*****************************************************************************/
-
-bool GPUEngineBase::GetEnableState()
-{
-	return CommonSettings.showGpu.screens[this->_engineID];
-}
-
-void GPUEngineBase::SetEnableState(bool theState)
-{
-	CommonSettings.showGpu.screens[this->_engineID] = theState;
-}
-
-bool GPUEngineBase::GetLayerEnableState(const size_t layerIndex)
-{
-	return CommonSettings.dispLayers[this->_engineID][layerIndex];
-}
-
-void GPUEngineBase::SetLayerEnableState(const size_t layerIndex, bool theState)
-{
-	CommonSettings.dispLayers[this->_engineID][layerIndex] = theState;
-	this->_ResortBGLayers();
-}
-
-template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
-void GPUEngineBase::_LineCopy(void *__restrict dstBuffer, const void *__restrict srcBuffer, const size_t l)
-{
-	switch (INTEGERSCALEHINT)
-	{
-		case 0:
-		{
-			const size_t lineWidth = GPU->GetDisplayInfo().customWidth;
-			const size_t lineIndex = _gpuCaptureLineIndex[l];
-			const size_t lineCount = _gpuCaptureLineCount[l];
-			
-			const void *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (lineIndex * lineWidth * ELEMENTSIZE) : (u8 *)srcBuffer;
-			void *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (lineIndex * lineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
-			
-			CopyLineExpand<INTEGERSCALEHINT, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, lineWidth * lineCount);
-			break;
-		}
-			
-		case 1:
-		{
-			const void *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
-			void *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer;
-			
-			CopyLineExpand<INTEGERSCALEHINT, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-			break;
-		}
-			
-		default:
-		{
-			const size_t lineWidth = GPU->GetDisplayInfo().customWidth;
-			const size_t lineCount = _gpuCaptureLineCount[l];
-			const size_t lineIndex = _gpuCaptureLineIndex[l];
-			
-			const void *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
-			u8 *__restrict dstLineHead = (USELINEINDEX) ? (u8 *)dstBuffer + (lineIndex * lineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
-			
-			// TODO: Determine INTEGERSCALEHINT earlier in the pipeline, preferably when the framebuffer is first initialized.
-			//
-			// The implementation below is a stopgap measure for getting the faster code paths to run.
-			// However, this setup is not ideal, since the code size will greatly increase in order to
-			// include all possible code paths, possibly causing cache misses on lesser CPUs.
-			if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2))
-			{
-				CopyLineExpand<2, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
-			}
-			else if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3))
-			{
-				CopyLineExpand<3, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
-			}
-			else if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4))
-			{
-				CopyLineExpand<4, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
-			}
-			else if ((lineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0)
-			{
-				CopyLineExpand<0xFFFF, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, lineWidth);
-			}
-			else
-			{
-				CopyLineExpand<-1, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, lineWidth);
-			}
-			
-			u8 *__restrict dst = (u8 *)dstLineHead + (lineWidth * ELEMENTSIZE);
-			
-			for (size_t line = 1; line < lineCount; line++)
-			{
-				memcpy(dst, dstLineHead, lineWidth * ELEMENTSIZE);
-				dst += (lineWidth * ELEMENTSIZE);
-			}
-			
-			break;
-		}
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::_TransitionLineNativeToCustom(GPUEngineCompositorInfo &compInfo)
-{
-	if (this->isLineRenderNative[compInfo.line.indexNative])
-	{
-		switch (OUTPUTFORMAT)
-		{
-			case NDSColorFormat_BGR555_Rev:
-				this->_LineCopy<0xFFFF, false, false, 2>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, 0);
-				break;
-				
-			case NDSColorFormat_BGR666_Rev:
-			case NDSColorFormat_BGR888_Rev:
-				this->_LineCopy<0xFFFF, false, false, 4>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, 0);
-				break;
-		}
-		
-		this->_LineCopy<0xFFFF, false, false, 1>(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, 0);
-		
-		compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom;
-		compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom;
-		this->isLineRenderNative[compInfo.line.indexNative] = false;
-		this->nativeLineRenderCount--;
-	}
-}
-
-/*****************************************************************************/
-//			PIXEL RENDERING
-/*****************************************************************************/
-template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
-FORCEINLINE void GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, const u16 srcColor16)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	switch (OUTPUTFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			dstColor16 = srcColor16 | 0x8000;
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			dstColor32.color = ColorspaceConvert555To6665Opaque<false>(srcColor16);
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			dstColor32.color = ColorspaceConvert555To8888Opaque<false>(srcColor16);
-			break;
-	}
-	
-	if (!ISDEBUGRENDER)
-	{
-		dstLayerID = compInfo.renderState.selectedLayerID;
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
-FORCEINLINE void GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	switch (OUTPUTFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			dstColor16 = ColorspaceConvert6665To5551<false>(srcColor32);
-			dstColor16 = dstColor16 | 0x8000;
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			dstColor32 = srcColor32;
-			dstColor32.a = 0x1F;
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			dstColor32 = srcColor32;
-			dstColor32.a = 0xFF;
-			break;
-			
-		default:
-			return;
-	}
-	
-	if (!ISDEBUGRENDER)
-	{
-		dstLayerID = compInfo.renderState.selectedLayerID;
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const u16 srcColor16)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	switch (OUTPUTFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF] | 0x8000;
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			dstColor32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF];
-			dstColor32.a = 0x1F;
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			dstColor32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF];
-			dstColor32.a = 0xFF;
-			break;
-	}
-	
-	dstLayerID = compInfo.renderState.selectedLayerID;
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const u16 srcColor16 = ColorspaceConvert6665To5551<false>(srcColor32);
-		dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF];
-		dstColor16 = dstColor16 | 0x8000;
-	}
-	else
-	{
-		dstColor32 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(srcColor32, compInfo.renderState.blendEVY);
-		dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
-	}
-	
-	dstLayerID = compInfo.renderState.selectedLayerID;
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const u16 srcColor16)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	switch (OUTPUTFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF] | 0x8000;
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			dstColor32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF];
-			dstColor32.a = 0x1F;
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			dstColor32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF];
-			dstColor32.a = 0xFF;
-			break;
-	}
-	
-	dstLayerID = compInfo.renderState.selectedLayerID;
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const u16 srcColor16 = ColorspaceConvert6665To5551<false>(srcColor32);
-		dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF];
-		dstColor16 = dstColor16 | 0x8000;
-	}
-	else
-	{
-		dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY);
-		dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
-	}
-	
-	dstLayerID = compInfo.renderState.selectedLayerID;
-}
-
-template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
-FORCEINLINE void GPUEngineBase::_PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha, const bool enableColorEffect)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	TBlendTable *selectedBlendTable = compInfo.renderState.blendTable555;
-	u8 blendEVA = compInfo.renderState.blendEVA;
-	u8 blendEVB = compInfo.renderState.blendEVB;
-	
-	const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID];
-	bool forceBlendEffect = false;
-	
-	if ((LAYERTYPE == GPULayerType_OBJ) && enableColorEffect)
-	{
-		//translucent-capable OBJ are forcing the function to blend when the second target is satisfied
-		const OBJMode objMode = (OBJMode)this->_sprType[compInfo.target.xNative];
-		const bool isObjTranslucentType = (objMode == OBJMode_Transparent) || (objMode == OBJMode_Bitmap);
-		if (isObjTranslucentType && dstEffectEnable)
-		{
-			// OBJ without fine-grained alpha are using EVA/EVB for blending. This is signified by receiving 0xFF in the alpha.
-			// Test cases:
-			// * The spriteblend demo
-			// * Glory of Heracles - fairy on the title screen
-			// * Phoenix Wright: Ace Attorney - character fade-in/fade-out
-			if (spriteAlpha != 0xFF)
-			{
-				blendEVA = spriteAlpha;
-				blendEVB = 16 - spriteAlpha;
-				selectedBlendTable = &GPUEngineBase::_blendTable555[blendEVA][blendEVB];
-			}
-			
-			forceBlendEffect = true;
-		}
-	}
-	
-	ColorEffect selectedEffect = (forceBlendEffect) ? ColorEffect_Blend : ColorEffect_Disable;
-	
-	// If we're not forcing blending, then select the color effect based on the BLDCNT target flags.
-	if (!forceBlendEffect && enableColorEffect && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID])
-	{
-		switch (compInfo.renderState.colorEffect)
-		{
-			// For the Blend effect, both first and second target flags must be checked.
-			case ColorEffect_Blend:
-			{
-				if (dstEffectEnable) selectedEffect = compInfo.renderState.colorEffect;
-				break;
-			}
-				
-			// For the Increase/Decrease Brightness effects, only the first target flag needs to be checked.
-			// Test case: Bomberman Land Touch! dialog boxes will render too dark without this check.
-			case ColorEffect_IncreaseBrightness:
-			case ColorEffect_DecreaseBrightness:
-				selectedEffect = compInfo.renderState.colorEffect;
-				break;
-				
-			default:
-				break;
-		}
-	}
-	
-	// Render the pixel using the selected color effect.
-	switch (selectedEffect)
-	{
-		case ColorEffect_Disable:
-		{
-			switch (OUTPUTFORMAT)
-			{
-				case NDSColorFormat_BGR555_Rev:
-					dstColor16 = srcColor16;
-					dstColor16 |= 0x8000;
-					break;
-					
-				case NDSColorFormat_BGR666_Rev:
-					dstColor32.color = ColorspaceConvert555To6665Opaque<false>(srcColor16);
-					break;
-					
-				case NDSColorFormat_BGR888_Rev:
-					dstColor32.color = ColorspaceConvert555To8888Opaque<false>(srcColor16);
-					break;
-			}
-			break;
-		}
-			
-		case ColorEffect_IncreaseBrightness:
-		{
-			switch (OUTPUTFORMAT)
-			{
-				case NDSColorFormat_BGR555_Rev:
-					dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF];
-					dstColor16 |= 0x8000;
-					break;
-					
-				case NDSColorFormat_BGR666_Rev:
-					dstColor32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF];
-					dstColor32.a = 0x1F;
-					break;
-					
-				case NDSColorFormat_BGR888_Rev:
-					dstColor32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF];
-					dstColor32.a = 0xFF;
-					break;
-			}
-			break;
-		}
-			
-		case ColorEffect_DecreaseBrightness:
-		{
-			switch (OUTPUTFORMAT)
-			{
-				case NDSColorFormat_BGR555_Rev:
-					dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF];
-					dstColor16 |= 0x8000;
-					break;
-					
-				case NDSColorFormat_BGR666_Rev:
-					dstColor32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF];
-					dstColor32.a = 0x1F;
-					break;
-					
-				case NDSColorFormat_BGR888_Rev:
-					dstColor32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF];
-					dstColor32.a = 0xFF;
-					break;
-			}
-			break;
-		}
-			
-		case ColorEffect_Blend:
-		{
-			FragmentColor srcColor32;
-			
-			switch (OUTPUTFORMAT)
-			{
-				case NDSColorFormat_BGR555_Rev:
-					dstColor16 = this->_ColorEffectBlend(srcColor16, dstColor16, selectedBlendTable);
-					dstColor16 |= 0x8000;
-					break;
-					
-				case NDSColorFormat_BGR666_Rev:
-					srcColor32.color = ColorspaceConvert555To6665Opaque<false>(srcColor16);
-					dstColor32 = this->_ColorEffectBlend<OUTPUTFORMAT>(srcColor32, dstColor32, blendEVA, blendEVB);
-					dstColor32.a = 0x1F;
-					break;
-					
-				case NDSColorFormat_BGR888_Rev:
-					srcColor32.color = ColorspaceConvert555To8888Opaque<false>(srcColor16);
-					dstColor32 = this->_ColorEffectBlend<OUTPUTFORMAT>(srcColor32, dstColor32, blendEVA, blendEVB);
-					dstColor32.a = 0xFF;
-					break;
-			}
-			break;
-		}
-	}
-	
-	dstLayerID = compInfo.renderState.selectedLayerID;
-}
-
-template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
-FORCEINLINE void GPUEngineBase::_PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 spriteAlpha, const bool enableColorEffect)
-{
-	u16 &dstColor16 = *compInfo.target.lineColor16;
-	FragmentColor &dstColor32 = *compInfo.target.lineColor32;
-	u8 &dstLayerID = *compInfo.target.lineLayerID;
-	
-	u8 blendEVA = compInfo.renderState.blendEVA;
-	u8 blendEVB = compInfo.renderState.blendEVB;
-	
-	const bool dstEffectEnable = (dstLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[dstLayerID];
-	
-	// 3D rendering has a special override: If the destination pixel is set to blend, then always blend.
-	// Test case: When starting a stage in Super Princess Peach, the screen will be solid black unless
-	// blending is forced here.
-	//
-	// This behavior must take priority over checking for the window color effect enable flag.
-	// Test case: Dialogue boxes in Front Mission will be rendered with blending disabled unless
-	// blend forcing takes priority.
-	bool forceBlendEffect = (LAYERTYPE == GPULayerType_3D) ? dstEffectEnable : false;
-	
-	if ((LAYERTYPE == GPULayerType_OBJ) && enableColorEffect)
-	{
-		//translucent-capable OBJ are forcing the function to blend when the second target is satisfied
-		const OBJMode objMode = (OBJMode)this->_sprType[compInfo.target.xNative];
-		const bool isObjTranslucentType = (objMode == OBJMode_Transparent) || (objMode == OBJMode_Bitmap);
-		if (isObjTranslucentType && dstEffectEnable)
-		{
-			// OBJ without fine-grained alpha are using EVA/EVB for blending. This is signified by receiving 0xFF in the alpha.
-			// Test cases:
-			// * The spriteblend demo
-			// * Glory of Heracles - fairy on the title screen
-			// * Phoenix Wright: Ace Attorney - character fade-in/fade-out
-			if (spriteAlpha != 0xFF)
-			{
-				blendEVA = spriteAlpha;
-				blendEVB = 16 - spriteAlpha;
-			}
-			
-			forceBlendEffect = true;
-		}
-	}
-	
-	ColorEffect selectedEffect = (forceBlendEffect) ? ColorEffect_Blend : ColorEffect_Disable;
-	
-	// If we're not forcing blending, then select the color effect based on the BLDCNT target flags.
-	if (!forceBlendEffect && enableColorEffect && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID])
-	{
-		switch (compInfo.renderState.colorEffect)
-		{
-			// For the Blend effect, both first and second target flags must be checked.
-			case ColorEffect_Blend:
-			{
-				if (dstEffectEnable) selectedEffect = compInfo.renderState.colorEffect;
-				break;
-			}
-				
-			// For the Increase/Decrease Brightness effects, only the first target flag needs to be checked.
-			// Test case: Bomberman Land Touch! dialog boxes will render too dark without this check.
-			case ColorEffect_IncreaseBrightness:
-			case ColorEffect_DecreaseBrightness:
-				selectedEffect = compInfo.renderState.colorEffect;
-				break;
-				
-			default:
-				break;
-		}
-	}
-	
-	// Render the pixel using the selected color effect.
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const u16 srcColor16 = ColorspaceConvert6665To5551<false>(srcColor32);
-		
-		switch (selectedEffect)
-		{
-			case ColorEffect_Disable:
-				dstColor16 = srcColor16;
-				break;
-				
-			case ColorEffect_IncreaseBrightness:
-				dstColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF];
-				break;
-				
-			case ColorEffect_DecreaseBrightness:
-				dstColor16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF];
-				break;
-				
-			case ColorEffect_Blend:
-				dstColor16 = this->_ColorEffectBlend3D(srcColor32, dstColor16);
-				break;
-		}
-		
-		dstColor16 |= 0x8000;
-	}
-	else
-	{
-		switch (selectedEffect)
-		{
-			case ColorEffect_Disable:
-				dstColor32 = srcColor32;
-				break;
-				
-			case ColorEffect_IncreaseBrightness:
-				dstColor32 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(srcColor32, compInfo.renderState.blendEVY);
-				break;
-				
-			case ColorEffect_DecreaseBrightness:
-				dstColor32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY);
-				break;
-				
-			case ColorEffect_Blend:
-				dstColor32 = (LAYERTYPE == GPULayerType_3D) ? this->_ColorEffectBlend3D<OUTPUTFORMAT>(srcColor32, dstColor32) : this->_ColorEffectBlend<OUTPUTFORMAT>(srcColor32, dstColor32, blendEVA, blendEVB);
-				break;
-		}
-		
-		dstColor32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
-	}
-	
-	dstLayerID = compInfo.renderState.selectedLayerID;
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
-FORCEINLINE void GPUEngineBase::_PixelComposite(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha, const bool enableColorEffect)
-{
-	switch (COMPOSITORMODE)
-	{
-		case GPUCompositorMode_Debug:
-			this->_PixelCopy<OUTPUTFORMAT, true>(compInfo, srcColor16);
-			break;
-			
-		case GPUCompositorMode_Copy:
-			this->_PixelCopy<OUTPUTFORMAT, false>(compInfo, srcColor16);
-			break;
-			
-		case GPUCompositorMode_BrightUp:
-			this->_PixelBrightnessUp<OUTPUTFORMAT>(compInfo, srcColor16);
-			break;
-			
-		case GPUCompositorMode_BrightDown:
-			this->_PixelBrightnessDown<OUTPUTFORMAT>(compInfo, srcColor16);
-			break;
-			
-		default:
-			this->_PixelUnknownEffect<OUTPUTFORMAT, LAYERTYPE>(compInfo, srcColor16, spriteAlpha, enableColorEffect);
-			break;
-	}
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
-FORCEINLINE void GPUEngineBase::_PixelComposite(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const u8 spriteAlpha, const bool enableColorEffect)
-{
-	switch (COMPOSITORMODE)
-	{
-		case GPUCompositorMode_Debug:
-			this->_PixelCopy<OUTPUTFORMAT, true>(compInfo, srcColor32);
-			break;
-			
-		case GPUCompositorMode_Copy:
-			this->_PixelCopy<OUTPUTFORMAT, false>(compInfo, srcColor32);
-			break;
-			
-		case GPUCompositorMode_BrightUp:
-			this->_PixelBrightnessUp<OUTPUTFORMAT>(compInfo, srcColor32);
-			break;
-			
-		case GPUCompositorMode_BrightDown:
-			this->_PixelBrightnessDown<OUTPUTFORMAT>(compInfo, srcColor32);
-			break;
-			
-		default:
-			this->_PixelUnknownEffect<OUTPUTFORMAT, LAYERTYPE>(compInfo, srcColor32, spriteAlpha, enableColorEffect);
-			break;
-	}
-}
-
-#ifdef ENABLE_SSE2
-
-template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
-FORCEINLINE void GPUEngineBase::_PixelCopy16_SSE2(GPUEngineCompositorInfo &compInfo,
-												  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-												  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-												  __m128i &dstLayerID)
-{
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const __m128i alphaBits = _mm_set1_epi16(0x8000);
-		dst0 = _mm_or_si128(src0, alphaBits);
-		dst1 = _mm_or_si128(src1, alphaBits);
-	}
-	else
-	{
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		dst0 = _mm_or_si128(src0, alphaBits);
-		dst1 = _mm_or_si128(src1, alphaBits);
-		dst2 = _mm_or_si128(src2, alphaBits);
-		dst3 = _mm_or_si128(src3, alphaBits);
-	}
-	
-	if (!ISDEBUGRENDER)
-	{
-		dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
-FORCEINLINE void GPUEngineBase::_PixelCopyWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
-														  const __m128i &passMask8,
-														  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-														  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-														  __m128i &dstLayerID)
-{
-	const __m128i passMask16[2]	= { _mm_unpacklo_epi8(passMask8, passMask8),
-								    _mm_unpackhi_epi8(passMask8, passMask8) };
-	
-	// Do the masked copy.
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const __m128i alphaBits = _mm_set1_epi16(0x8000);
-		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(src0, alphaBits), passMask16[0]);
-		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(src1, alphaBits), passMask16[1]);
-	}
-	else
-	{
-		const __m128i passMask32[4]	= { _mm_unpacklo_epi16(passMask16[0], passMask16[0]),
-									    _mm_unpackhi_epi16(passMask16[0], passMask16[0]),
-									    _mm_unpacklo_epi16(passMask16[1], passMask16[1]),
-									    _mm_unpackhi_epi16(passMask16[1], passMask16[1]) };
-		
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(src0, alphaBits), passMask32[0]);
-		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(src1, alphaBits), passMask32[1]);
-		dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(src2, alphaBits), passMask32[2]);
-		dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(src3, alphaBits), passMask32[3]);
-	}
-	
-	if (!ISDEBUGRENDER)
-	{
-		const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-		dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8);
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessUp16_SSE2(GPUEngineCompositorInfo &compInfo,
-														  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-														  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-														  __m128i &dstLayerID)
-{
-	const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY);
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const __m128i alphaBits = _mm_set1_epi16(0x8000);
-		dst0 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits);
-		dst1 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits);
-	}
-	else
-	{
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		dst0 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits);
-		dst1 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits);
-		dst2 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src2, evy_vec128), alphaBits);
-		dst3 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src3, evy_vec128), alphaBits);
-	}
-	
-	dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessUpWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
-																  const __m128i &passMask8,
-																  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-																  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-																  __m128i &dstLayerID)
-{
-	const __m128i passMask16[2]	= { _mm_unpacklo_epi8(passMask8, passMask8),
-								    _mm_unpackhi_epi8(passMask8, passMask8) };
-	
-	const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY);
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const __m128i alphaBits = _mm_set1_epi16(0x8000);
-		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits), passMask16[0]);
-		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits), passMask16[1]);
-	}
-	else
-	{
-		const __m128i passMask32[4]	= { _mm_unpacklo_epi16(passMask16[0], passMask16[0]),
-									    _mm_unpackhi_epi16(passMask16[0], passMask16[0]),
-									    _mm_unpacklo_epi16(passMask16[1], passMask16[1]),
-									    _mm_unpackhi_epi16(passMask16[1], passMask16[1]) };
-		
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits), passMask32[0]);
-		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits), passMask32[1]);
-		dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src2, evy_vec128), alphaBits), passMask32[2]);
-		dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src3, evy_vec128), alphaBits), passMask32[3]);
-	}
-	
-	const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-	dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8);
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessDown16_SSE2(GPUEngineCompositorInfo &compInfo,
-															const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-															__m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-															__m128i &dstLayerID)
-{
-	const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY);
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const __m128i alphaBits = _mm_set1_epi16(0x8000);
-		dst0 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits);
-		dst1 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits);
-	}
-	else
-	{
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		dst0 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits);
-		dst1 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits);
-		dst2 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src2, evy_vec128), alphaBits);
-		dst3 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src3, evy_vec128), alphaBits);
-	}
-	
-	dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-FORCEINLINE void GPUEngineBase::_PixelBrightnessDownWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
-																	const __m128i &passMask8,
-																	const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-																	__m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-																	__m128i &dstLayerID)
-{
-	const __m128i passMask16[2]	= { _mm_unpacklo_epi8(passMask8, passMask8),
-								    _mm_unpackhi_epi8(passMask8, passMask8) };
-	
-	const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY);
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		const __m128i alphaBits = _mm_set1_epi16(0x8000);
-		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits), passMask16[0]);
-		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits), passMask16[1]);
-	}
-	else
-	{
-		const __m128i passMask32[4]	= { _mm_unpacklo_epi16(passMask16[0], passMask16[0]),
-									    _mm_unpackhi_epi16(passMask16[0], passMask16[0]),
-									    _mm_unpacklo_epi16(passMask16[1], passMask16[1]),
-									    _mm_unpackhi_epi16(passMask16[1], passMask16[1]) };
-		
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy_vec128), alphaBits), passMask32[0]);
-		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy_vec128), alphaBits), passMask32[1]);
-		dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src2, evy_vec128), alphaBits), passMask32[2]);
-		dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src3, evy_vec128), alphaBits), passMask32[3]);
-	}
-	
-	const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-	dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8);
-}
-
-template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
-FORCEINLINE void GPUEngineBase::_PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
-																   const __m128i &passMask8,
-																   const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-																   const __m128i &spriteAlpha,
-																   const __m128i &srcEffectEnableMask,
-																   const __m128i &enableColorEffectMask,
-																   __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
-																   __m128i &dstLayerID)
-{
-	const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
-	const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8),
-	                                _mm_unpackhi_epi8(passMask8, passMask8) };
-	
-	const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]),
-	                                _mm_unpackhi_epi16(passMask16[0], passMask16[0]),
-	                                _mm_unpacklo_epi16(passMask16[1], passMask16[1]),
-	                                _mm_unpackhi_epi16(passMask16[1], passMask16[1]) };
-	
-	__m128i dstEffectEnableMask;
-	
-#ifdef ENABLE_SSSE3
-	dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID);
-	dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) );
-#else
-	dstEffectEnableMask =                                   _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]);
-	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) );
-	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) );
-	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) );
-	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) );
-	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) );
-#endif
-	
-	dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID_vec128), dstEffectEnableMask );
-	
-	// Select the color effect based on the BLDCNT target flags.
-	const __m128i colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask);
-	const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY);
-	__m128i eva_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVA);
-	__m128i evb_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVB);
-	__m128i forceBlendEffectMask = (LAYERTYPE == GPULayerType_3D) ? dstEffectEnableMask : _mm_setzero_si128();
-	
-	if (LAYERTYPE == GPULayerType_OBJ)
-	{
-		const __m128i objMode_vec128 = _mm_loadu_si128((__m128i *)(this->_sprType + compInfo.target.xNative));
-		const __m128i isObjTranslucentMask = _mm_and_si128( _mm_and_si128(enableColorEffectMask, dstEffectEnableMask), _mm_or_si128(_mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Bitmap))) );
-		forceBlendEffectMask = isObjTranslucentMask;
-		
-		const __m128i spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask);
-		eva_vec128 = _mm_blendv_epi8(eva_vec128, spriteAlpha, spriteAlphaMask);
-		evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask);
-	}
-	
-	__m128i tmpSrc[4];
-	
-	if ( (LAYERTYPE == GPULayerType_3D) && (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) )
-	{
-		// 3D layer blending requires that all src colors are preserved as 32-bit values.
-		// Since dst2 and dst3 are currently unused for RGB555 output, we used these variables
-		// to store the converted 16-bit src colors in a previous step.
-		tmpSrc[0] = dst2;
-		tmpSrc[1] = dst3;
-	}
-	else
-	{
-		tmpSrc[0] = src0;
-		tmpSrc[1] = src1;
-		tmpSrc[2] = src2;
-		tmpSrc[3] = src3;
-	}
-	
-	switch (compInfo.renderState.colorEffect)
-	{
-		case ColorEffect_IncreaseBrightness:
-		{
-			const __m128i brightnessMask8 = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) );
-			const __m128i brightnessMask16[2] = {_mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8)};
-			
-			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-			{
-				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask16[0] );
-				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask16[1] );
-			}
-			else
-			{
-				const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
-				                                      _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
-				                                      _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
-				                                      _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) };
-				
-				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask32[0] );
-				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask32[1] );
-				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[2], evy_vec128), brightnessMask32[2] );
-				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[3], evy_vec128), brightnessMask32[3] );
-			}
-			break;
-		}
-			
-		case ColorEffect_DecreaseBrightness:
-		{
-			const __m128i brightnessMask8 = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) );
-			const __m128i brightnessMask16[2] = {_mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8)};
-			
-			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-			{
-				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask16[0] );
-				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask16[1] );
-			}
-			else
-			{
-				const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
-				                                      _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
-				                                      _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
-				                                      _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) };
-				
-				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask32[0] );
-				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask32[1] );
-				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[2], evy_vec128), brightnessMask32[2] );
-				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[3], evy_vec128), brightnessMask32[3] );
-			}
-			break;
-		}
-			
-		default:
-			break;
-	}
-	
-	// Render the pixel using the selected color effect.
-	const __m128i blendMask8 = _mm_or_si128( forceBlendEffectMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstEffectEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) );
-	const __m128i blendMask16[2] = {_mm_unpacklo_epi8(blendMask8, blendMask8), _mm_unpackhi_epi8(blendMask8, blendMask8)};
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		__m128i blendSrc16[2];
-		
-		if (LAYERTYPE == GPULayerType_3D)
-		{
-			blendSrc16[0] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src0, src1, dst0);
-			blendSrc16[1] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src2, src3, dst1);
-		}
-		else
-		{
-			blendSrc16[0] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[0], dst0, eva_vec128, evb_vec128);
-			blendSrc16[1] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[1], dst1, eva_vec128, evb_vec128);
-		}
-		
-		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]);
-		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]);
-		
-		// Combine the final colors.
-		tmpSrc[0] = _mm_or_si128(tmpSrc[0], _mm_set1_epi16(0x8000));
-		tmpSrc[1] = _mm_or_si128(tmpSrc[1], _mm_set1_epi16(0x8000));
-		
-		dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask16[0]);
-		dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask16[1]);
-	}
-	else
-	{
-		__m128i blendSrc32[4];
-		
-		if (LAYERTYPE == GPULayerType_3D)
-		{
-			blendSrc32[0] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src0, src0, dst0);
-			blendSrc32[1] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src1, src1, dst1);
-			blendSrc32[2] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src2, src2, dst2);
-			blendSrc32[3] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src3, src3, dst3);
-		}
-		else
-		{
-			blendSrc32[0] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[0], dst0, eva_vec128, evb_vec128);
-			blendSrc32[1] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[1], dst1, eva_vec128, evb_vec128);
-			blendSrc32[2] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[2], dst2, eva_vec128, evb_vec128);
-			blendSrc32[3] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[3], dst3, eva_vec128, evb_vec128);
-		}
-		
-		const __m128i blendMask32[4] = { _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]),
-		                                 _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]),
-		                                 _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]),
-		                                 _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) };
-		
-		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
-		
-		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]);
-		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]);
-		tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]);
-		tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]);
-		
-		tmpSrc[0] = _mm_or_si128(tmpSrc[0], alphaBits);
-		tmpSrc[1] = _mm_or_si128(tmpSrc[1], alphaBits);
-		tmpSrc[2] = _mm_or_si128(tmpSrc[2], alphaBits);
-		tmpSrc[3] = _mm_or_si128(tmpSrc[3], alphaBits);
-		
-		dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask32[0]);
-		dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask32[1]);
-		dst2 = _mm_blendv_epi8(dst2, tmpSrc[2], passMask32[2]);
-		dst3 = _mm_blendv_epi8(dst3, tmpSrc[3], passMask32[3]);
-	}
-	
-	dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8);
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
-FORCEINLINE void GPUEngineBase::_PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo,
-													   const bool didAllPixelsPass,
-													   const __m128i &passMask8,
-													   const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
-													   const __m128i &srcEffectEnableMask)
-{
-	const bool is555and3D = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) && (LAYERTYPE == GPULayerType_3D);
-	__m128i dst[4];
-	__m128i dstLayerID_vec128;
-	
-	if (is555and3D)
-	{
-		// 3D layer blending requires that all src colors are preserved as 32-bit values.
-		// Since dst2 and dst3 are currently unused for RGB555 output, we using these variables
-		// to store the converted 16-bit src colors.
-		dst[2] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x003E0000)), 7)),
-		                          _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x003E0000)), 7)) );
-		dst[3] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x003E0000)), 7)),
-		                          _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x003E0000)), 7)) );
-	}
-	
-	if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass)
-	{
-		switch (COMPOSITORMODE)
-		{
-			case GPUCompositorMode_Debug:
-				this->_PixelCopy16_SSE2<OUTPUTFORMAT, true>(compInfo,
-															src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-															dst[3], dst[2], dst[1], dst[0],
-															dstLayerID_vec128);
-				break;
-				
-			case GPUCompositorMode_Copy:
-				this->_PixelCopy16_SSE2<OUTPUTFORMAT, false>(compInfo,
-															 src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-															 dst[3], dst[2], dst[1], dst[0],
-															 dstLayerID_vec128);
-				break;
-				
-			case GPUCompositorMode_BrightUp:
-				this->_PixelBrightnessUp16_SSE2<OUTPUTFORMAT>(compInfo,
-															  src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-															  dst[3], dst[2], dst[1], dst[0],
-															  dstLayerID_vec128);
-				break;
-				
-			case GPUCompositorMode_BrightDown:
-				this->_PixelBrightnessDown16_SSE2<OUTPUTFORMAT>(compInfo,
-																src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-																dst[3], dst[2], dst[1], dst[0],
-																dstLayerID_vec128);
-				break;
-				
-			default:
-				break;
-		}
-	}
-	else
-	{
-		// Read the destination pixels into registers if we're doing a masked pixel write.
-		dst[0] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 0);
-		dst[1] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 1);
-		
-		if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
-		{
-			dst[2] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 2);
-			dst[3] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 3);
-		}
-		
-		dstLayerID_vec128 = _mm_load_si128((__m128i *)compInfo.target.lineLayerID);
-		
-		switch (COMPOSITORMODE)
-		{
-			case GPUCompositorMode_Debug:
-				this->_PixelCopyWithMask16_SSE2<OUTPUTFORMAT, true>(compInfo,
-																	passMask8,
-																	src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-																	dst[3], dst[2], dst[1], dst[0],
-																	dstLayerID_vec128);
-				break;
-				
-			case GPUCompositorMode_Copy:
-				this->_PixelCopyWithMask16_SSE2<OUTPUTFORMAT, false>(compInfo,
-																	 passMask8,
-																	 src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-																	 dst[3], dst[2], dst[1], dst[0],
-																	 dstLayerID_vec128);
-				break;
-				
-			case GPUCompositorMode_BrightUp:
-				this->_PixelBrightnessUpWithMask16_SSE2<OUTPUTFORMAT>(compInfo,
-																	  passMask8,
-																	  src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-																	  dst[3], dst[2], dst[1], dst[0],
-																	  dstLayerID_vec128);
-				break;
-				
-			case GPUCompositorMode_BrightDown:
-				this->_PixelBrightnessDownWithMask16_SSE2<OUTPUTFORMAT>(compInfo,
-																		passMask8,
-																		src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
-																		dst[3], dst[2], dst[1], dst[0],
-																		dstLayerID_vec128);
-				break;
-				
-			default:
-			{
-				const __m128i spriteAlpha = _mm_setzero_si128();
-				const __m128i enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) ) : _mm_set1_epi8(0xFF);
-				
-				this->_PixelUnknownEffectWithMask16_SSE2<OUTPUTFORMAT, LAYERTYPE>(compInfo,
-																				  passMask8,
-																				  src3, src2, src1, src0,
-																				  spriteAlpha,
-																				  srcEffectEnableMask,
-																				  enableColorEffectMask,
-																				  dst[3], dst[2], dst[1], dst[0],
-																				  dstLayerID_vec128);
-				break;
-			}
-		}
-	}
-	
-	_mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]);
-	_mm_store_si128((__m128i *)*compInfo.target.lineColor + 1, dst[1]);
-	
-	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
-	{
-		_mm_store_si128((__m128i *)*compInfo.target.lineColor + 2, dst[2]);
-		_mm_store_si128((__m128i *)*compInfo.target.lineColor + 3, dst[3]);
-	}
-	
-	_mm_store_si128((__m128i *)compInfo.target.lineLayerID, dstLayerID_vec128);
-}
-
-#endif
-
-//this is fantastically inaccurate.
-//we do the early return even though it reduces the resulting accuracy
-//because we need the speed, and because it is inaccurate anyway
-void GPUEngineBase::_MosaicSpriteLinePixel(GPUEngineCompositorInfo &compInfo, const size_t x, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
-{
-	const bool enableMosaic = (this->_oamList[this->_sprNum[x]].Mosaic != 0);
-	if (!enableMosaic)
-		return;
-	
-	const bool opaque = prioTab[x] <= 4;
-
-	GPUEngineBase::MosaicColor::Obj objColor;
-	objColor.color = LE_TO_LOCAL_16(dst[x]);
-	objColor.alpha = dst_alpha[x];
-	objColor.opaque = opaque;
-
-	const size_t y = compInfo.line.indexNative;
-	
-	if (!compInfo.renderState.mosaicWidthOBJ[x].begin || !compInfo.renderState.mosaicHeightOBJ[y].begin)
-	{
-		objColor = this->_mosaicColors.obj[compInfo.renderState.mosaicWidthOBJ[x].trunc];
-	}
-	
-	this->_mosaicColors.obj[x] = objColor;
-	
-	dst[x] = LE_TO_LOCAL_16(objColor.color);
-	dst_alpha[x] = objColor.alpha;
-	if (!objColor.opaque) prioTab[x] = 0x7F;
-}
-
-void GPUEngineBase::_MosaicSpriteLine(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
-{
-	if (!compInfo.renderState.isOBJMosaicSet)
-	{
-		return;
-	}
-	
-	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
-	{
-		this->_MosaicSpriteLinePixel(compInfo, i, dst, dst_alpha, typeTab, prioTab);
-	}
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc, bool WRAP>
-void GPUEngineBase::_RenderPixelIterate_Final(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal)
-{
-	const u16 lineWidth = (COMPOSITORMODE == GPUCompositorMode_Debug) ? compInfo.renderState.selectedBGLayer->size.width : GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	const s16 dx = (s16)LOCAL_TO_LE_16(param.BGnPA.value);
-	const s16 dy = (s16)LOCAL_TO_LE_16(param.BGnPC.value);
-	const s32 wh = compInfo.renderState.selectedBGLayer->size.width;
-	const s32 ht = compInfo.renderState.selectedBGLayer->size.height;
-	const s32 wmask = wh - 1;
-	const s32 hmask = ht - 1;
-	
-	IOREG_BGnX x = param.BGnX;
-	IOREG_BGnY y = param.BGnY;
-	
-#ifdef MSB_FIRST
-	// This only seems to work in the unrotated/unscaled case. I'm not too sure
-	// about how these bits should really be arranged on big-endian, but at
-	// least this arrangement fixes a bunch of games that use affine or extended
-	// layers, just as long as they don't perform any rotation/scaling.
-	// - rogerman, 2016-07-05
-	x.value = ((x.value & 0x00FFFFFF) << 8) | ((x.value & 0xFF000000) >> 24);
-	y.value = ((y.value & 0x00FFFFFF) << 8) | ((y.value & 0xFF000000) >> 24);
-#endif
-	
-	u8 index;
-	u16 srcColor;
-	
-	// as an optimization, specially handle the fairly common case of
-	// "unrotated + unscaled + no boundary checking required"
-	if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0)
-	{
-		s32 auxX = (WRAP) ? (x.Integer & wmask) : x.Integer;
-		const s32 auxY = (WRAP) ? (y.Integer & hmask) : y.Integer;
-		
-		if ( WRAP || ((auxX >= 0) && (auxX + lineWidth <= wh) && (auxY >= 0) && (auxY < ht)) )
-		{
-			for (size_t i = 0; i < lineWidth; i++)
-			{
-				GetPixelFunc(auxX, auxY, wh, map, tile, pal, index, srcColor);
-				
-				if (WILLDEFERCOMPOSITING)
-				{
-					this->_deferredIndexNative[i] = index;
-					this->_deferredColorNative[i] = srcColor;
-				}
-				else
-				{
-					this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, i, srcColor, (index != 0));
-				}
-				
-				auxX++;
-				
-				if (WRAP)
-				{
-					auxX &= wmask;
-				}
-			}
-			
-			return;
-		}
-	}
-	
-	for (size_t i = 0; i < lineWidth; i++, x.value+=dx, y.value+=dy)
-	{
-		const s32 auxX = (WRAP) ? (x.Integer & wmask) : x.Integer;
-		const s32 auxY = (WRAP) ? (y.Integer & hmask) : y.Integer;
-		
-		if (WRAP || ((auxX >= 0) && (auxX < wh) && (auxY >= 0) && (auxY < ht)))
-		{
-			GetPixelFunc(auxX, auxY, wh, map, tile, pal, index, srcColor);
-			
-			if (WILLDEFERCOMPOSITING)
-			{
-				this->_deferredIndexNative[i] = index;
-				this->_deferredColorNative[i] = srcColor;
-			}
-			else
-			{
-				this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, i, srcColor, (index != 0));
-			}
-		}
-	}
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc, bool WRAP>
-void GPUEngineBase::_RenderPixelIterate_ApplyWrap(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal)
-{
-	this->_RenderPixelIterate_Final<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, GetPixelFunc, WRAP>(compInfo, param, map, tile, pal);
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc>
-void GPUEngineBase::_RenderPixelIterate(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal)
-{
-	if (compInfo.renderState.selectedBGLayer->isDisplayWrapped)
-	{
-		this->_RenderPixelIterate_ApplyWrap<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, GetPixelFunc, true>(compInfo, param, map, tile, pal);
-	}
-	else
-	{
-		this->_RenderPixelIterate_ApplyWrap<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, GetPixelFunc, false>(compInfo, param, map, tile, pal);
-	}
-}
-
-TILEENTRY GPUEngineBase::_GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask)
-{
-	TILEENTRY theTileEntry;
-	
-	const u16 tmp = (xOffset & layerWidthMask) >> 3;
-	u32 mapinfo = tileMapAddress + (tmp & 0x1F) * 2;
-	if (tmp > 31) mapinfo += 32*32*2;
-	theTileEntry.val = LOCAL_TO_LE_16( *(u16 *)MMU_gpu_map(mapinfo) );
-	
-	return theTileEntry;
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
-FORCEINLINE void GPUEngineBase::_CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, const bool opaque)
-{
-	bool willRenderColor = opaque;
-	
-	if (MOSAIC)
-	{
-		//due to this early out, we will get incorrect behavior in cases where
-		//we enable mosaic in the middle of a frame. this is deemed unlikely.
-		
-		if (!opaque) srcColor16 = 0xFFFF;
-		else srcColor16 &= 0x7FFF;
-		
-		if (!compInfo.renderState.mosaicWidthBG[srcX].begin || !compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin)
-		{
-			srcColor16 = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][compInfo.renderState.mosaicWidthBG[srcX].trunc];
-		}
-		
-		this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][srcX] = srcColor16;
-		
-		willRenderColor = (srcColor16 != 0xFFFF);
-	}
-	
-	if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
-	{
-		return;
-	}
-	
-	if (!willRenderColor)
-	{
-		return;
-	}
-	
-	compInfo.target.xNative = srcX;
-	compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
-	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHeadNative + srcX;
-	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative + srcX;
-	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative + srcX;
-	
-	const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-	this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, srcColor16, 0, enableColorEffect);
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
-void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo)
-{
-	if (MOSAIC)
-	{
-#ifdef ENABLE_SSE2
-		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=8)
-		{
-			const __m128i index_vec128 = _mm_loadl_epi64((__m128i *)(this->_deferredIndexNative + x));
-			const __m128i col_vec128 = _mm_load_si128((__m128i *)(this->_deferredColorNative + x));
-			
-			const __m128i idxMask = _mm_cmpeq_epi16(_mm_unpacklo_epi8(index_vec128, _mm_setzero_si128()), _mm_setzero_si128());
-			const __m128i tmpColor_vec128 = _mm_blendv_epi8(_mm_and_si128(col_vec128, _mm_set1_epi16(0x7FFF)), _mm_set1_epi16(0xFFFF), idxMask);
-			
-			const __m128i mosaicWidthMask = _mm_cmpeq_epi16( _mm_and_si128(_mm_set1_epi16(0x00FF), _mm_loadu_si128((__m128i *)(compInfo.renderState.mosaicWidthBG + x))), _mm_setzero_si128() );
-			const __m128i mosaicHeightMask = _mm_cmpeq_epi16(_mm_set1_epi16(compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin), _mm_setzero_si128());
-			const __m128i mosaicMask = _mm_or_si128(mosaicWidthMask, mosaicHeightMask);
-			
-			u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID];
-			mosaicColorBG[x+0] = (_mm_extract_epi16(mosaicMask, 0) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+0].trunc] : _mm_extract_epi16(tmpColor_vec128, 0);
-			mosaicColorBG[x+1] = (_mm_extract_epi16(mosaicMask, 1) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+1].trunc] : _mm_extract_epi16(tmpColor_vec128, 1);
-			mosaicColorBG[x+2] = (_mm_extract_epi16(mosaicMask, 2) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+2].trunc] : _mm_extract_epi16(tmpColor_vec128, 2);
-			mosaicColorBG[x+3] = (_mm_extract_epi16(mosaicMask, 3) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+3].trunc] : _mm_extract_epi16(tmpColor_vec128, 3);
-			mosaicColorBG[x+4] = (_mm_extract_epi16(mosaicMask, 4) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+4].trunc] : _mm_extract_epi16(tmpColor_vec128, 4);
-			mosaicColorBG[x+5] = (_mm_extract_epi16(mosaicMask, 5) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+5].trunc] : _mm_extract_epi16(tmpColor_vec128, 5);
-			mosaicColorBG[x+6] = (_mm_extract_epi16(mosaicMask, 6) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+6].trunc] : _mm_extract_epi16(tmpColor_vec128, 6);
-			mosaicColorBG[x+7] = (_mm_extract_epi16(mosaicMask, 7) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+7].trunc] : _mm_extract_epi16(tmpColor_vec128, 7);
-			
-			const __m128i mosaicColor_vec128 = _mm_loadu_si128((__m128i *)(mosaicColorBG + x));
-			const __m128i mosaicColorMask = _mm_cmpeq_epi16(mosaicColor_vec128, _mm_set1_epi16(0xFFFF));
-			_mm_storel_epi64( (__m128i *)(this->_deferredIndexNative + x), _mm_andnot_si128(_mm_packs_epi16(mosaicColorMask, _mm_setzero_si128()), index_vec128) );
-			_mm_store_si128( (__m128i *)(this->_deferredColorNative + x), _mm_blendv_epi8(mosaicColor_vec128, col_vec128, mosaicColorMask) );
-		}
-#else
-		for (size_t x = 0, dstIdx = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
-		{
-			u16 tmpColor = (this->_deferredIndexNative[x] == 0) ? 0xFFFF : this->_deferredColorNative[x] & 0x7FFF;
-			
-			if (!compInfo.renderState.mosaicWidthBG[x].begin || !compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin)
-			{
-				tmpColor = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][compInfo.renderState.mosaicWidthBG[x].trunc];
-			}
-			
-			this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][x] = tmpColor;
-			
-			if (tmpColor == 0xFFFF)
-			{
-				this->_deferredIndexNative[x] = 0;
-			}
-			else
-			{
-				this->_deferredColorNative[x] = tmpColor;
-			}
-		}
-#endif
-	}
-	
-	CopyLineExpand<0xFFFF, false, 2>(this->_deferredColorCustom, this->_deferredColorNative, compInfo.line.widthCustom);
-	CopyLineExpand<0xFFFF, false, 1>(this->_deferredIndexCustom, this->_deferredIndexNative, compInfo.line.widthCustom);
-	
-	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead;
-	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead;
-	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
-	
-#ifdef ENABLE_SSE2
-	const size_t ssePixCount = (compInfo.line.widthCustom - (compInfo.line.widthCustom % 16));
-	const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID];
-#endif
-	
-	for (size_t l = 0; l < compInfo.line.renderCount; l++)
-	{
-		compInfo.target.xNative = 0;
-		compInfo.target.xCustom = 0;
-		
-#ifdef ENABLE_SSE2
-		for (; compInfo.target.xCustom < ssePixCount; compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16)
-		{
-			__m128i passMask8;
-			
-			if (WILLPERFORMWINDOWTEST)
-			{
-				// Do the window test.
-				passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) );
-			}
-			else
-			{
-				passMask8 = _mm_set1_epi8(0xFF);
-			}
-			
-			// Do the index test. Pixels with an index value of 0 are rejected.
-			passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8(_mm_load_si128((__m128i *)(this->_deferredIndexCustom + compInfo.target.xCustom)), _mm_setzero_si128()), passMask8);
-			
-			const int passMaskValue = _mm_movemask_epi8(passMask8);
-			
-			// If none of the pixels within the vector pass, then reject them all at once.
-			if (passMaskValue == 0)
-			{
-				continue;
-			}
-			
-			__m128i src[4];
-			
-			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-			{
-				src[0] = _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 0));
-				src[1] = _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 8));
-			}
-			else
-			{
-				const __m128i src16[2] = { _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 0)),
-										   _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 8)) };
-				
-				if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
-				{
-					ColorspaceConvert555To6665Opaque_SSE2<false>(src16[0], src[0], src[1]);
-					ColorspaceConvert555To6665Opaque_SSE2<false>(src16[1], src[2], src[3]);
-				}
-				else
-				{
-					ColorspaceConvert555To8888Opaque_SSE2<false>(src16[0], src[0], src[1]);
-					ColorspaceConvert555To8888Opaque_SSE2<false>(src16[1], src[2], src[3]);
-				}
-			}
-			
-			// Write out the pixels.
-			const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
-			this->_PixelComposite16_SSE2<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG, WILLPERFORMWINDOWTEST>(compInfo,
-																											   didAllPixelsPass,
-																											   passMask8,
-																											   src[3], src[2], src[1], src[0],
-																											   srcEffectEnableMask);
-		}
-#endif
-		
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-		for (; compInfo.target.xCustom < compInfo.line.widthCustom; compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
-		{
-			if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] == 0) )
-			{
-				continue;
-			}
-			
-			if (this->_deferredIndexCustom[compInfo.target.xCustom] == 0)
-			{
-				continue;
-			}
-			
-			const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-			this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, this->_deferredColorCustom[compInfo.target.xCustom], 0, enableColorEffect);
-		}
-	}
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
-void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo)
-{
-	const void *__restrict vramColorPtr = GPU->GetCustomVRAMAddressUsingMappedAddress<OUTPUTFORMAT>(compInfo.renderState.selectedBGLayer->BMPAddress, compInfo.line.blockOffsetCustom);
-	
-	compInfo.target.xNative = 0;
-	compInfo.target.xCustom = 0;
-	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead;
-	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead;
-	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
-	
-	size_t i = 0;
-	
-#ifdef ENABLE_SSE2
-	const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID];
-	
-	const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % 16));
-	for (; i < ssePixCount; i+=16, compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16)
-	{
-		__m128i src[4];
-		__m128i passMask8;
-		
-		switch (OUTPUTFORMAT)
-		{
-			case NDSColorFormat_BGR555_Rev:
-			{
-				const __m128i src16[2] = { _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 0)), _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 8)) };
-				src[0] = src16[0];
-				src[1] = src16[1];
-				passMask8 = _mm_packus_epi16( _mm_srli_epi16(src16[0], 15), _mm_srli_epi16(src16[1], 15) );
-				passMask8 = _mm_cmpeq_epi8(passMask8, _mm_set1_epi8(1));
-				break;
-			}
-				
-			case NDSColorFormat_BGR666_Rev:
-			{
-				const __m128i src16[2] = { _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 0)), _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 8)) };
-				ColorspaceConvert555To6665Opaque_SSE2<false>(src16[0], src[0], src[1]);
-				ColorspaceConvert555To6665Opaque_SSE2<false>(src16[1], src[2], src[3]);
-				passMask8 = _mm_packus_epi16( _mm_srli_epi16(src16[0], 15), _mm_srli_epi16(src16[1], 15) );
-				passMask8 = _mm_cmpeq_epi8(passMask8, _mm_set1_epi8(1));
-				break;
-			}
-				
-			case NDSColorFormat_BGR888_Rev:
-				src[0] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 0));
-				src[1] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 4));
-				src[2] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 8));
-				src[3] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 12));
-				passMask8 = _mm_packus_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)), _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) );
-				passMask8 = _mm_cmpeq_epi8(passMask8, _mm_setzero_si128());
-				passMask8 = _mm_xor_si128(passMask8, _mm_set1_epi32(0xFFFFFFFF));
-				break;
-		}
-		
-		if (WILLPERFORMWINDOWTEST)
-		{
-			// Do the window test.
-			passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_setzero_si128()), passMask8);
-		}
-		
-		const int passMaskValue = _mm_movemask_epi8(passMask8);
-		
-		// If none of the pixels within the vector pass, then reject them all at once.
-		if (passMaskValue == 0)
-		{
-			continue;
-		}
-		
-		// Write out the pixels.
-		const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
-		this->_PixelComposite16_SSE2<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG, WILLPERFORMWINDOWTEST>(compInfo,
-																										   didAllPixelsPass,
-																										   passMask8,
-																										   src[3], src[2], src[1], src[0],
-																										   srcEffectEnableMask);
-	}
-#endif
-	
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-	for (; i < compInfo.line.pixelCount; i++, compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
-	{
-		if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] == 0) )
-		{
-			continue;
-		}
-		
-		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-		{
-			if ((((u32 *)vramColorPtr)[i] & 0xFF000000) == 0)
-			{
-				continue;
-			}
-			
-			const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-			this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, ((u32 *)vramColorPtr)[i], 0, enableColorEffect);
-		}
-		else
-		{
-			if ((((u16 *)vramColorPtr)[i] & 0x8000) == 0)
-			{
-				continue;
-			}
-			
-			const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-			this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, ((u16 *)vramColorPtr)[i], 0, enableColorEffect);
-		}
-	}
-}
-
-/*****************************************************************************/
-//			BACKGROUND RENDERING -TEXT-
-/*****************************************************************************/
-// render a text background to the combined pixelbuffer
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineBase::_RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG)
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	const u16 lineWidth = (COMPOSITORMODE == GPUCompositorMode_Debug) ? compInfo.renderState.selectedBGLayer->size.width : GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	const u16 lg    = compInfo.renderState.selectedBGLayer->size.width;
-	const u16 ht    = compInfo.renderState.selectedBGLayer->size.height;
-	const u32 tile  = compInfo.renderState.selectedBGLayer->tileEntryAddress;
-	const u16 wmask = lg - 1;
-	const u16 hmask = ht - 1;
-	
-	const size_t pixCountLo = 8 - (XBG & 0x0007);
-	size_t x = 0;
-	size_t xoff = XBG;
-	
-	const u16 tmp = (YBG & hmask) >> 3;
-	u32 map = compInfo.renderState.selectedBGLayer->tileMapAddress + (tmp & 31) * 64;
-	if (tmp > 31)
-		map += ADDRESS_STEP_512B << compInfo.renderState.selectedBGLayer->BGnCNT.ScreenSize;
-	
-	if (compInfo.renderState.selectedBGLayer->BGnCNT.PaletteMode == PaletteMode_16x16) // color: 16 palette entries
-	{
-		const u16 *__restrict pal = this->_paletteBG;
-		const u16 yoff = (YBG & 0x0007) << 2;
-		u8 index;
-		u16 color;
-		
-		for (size_t xfin = pixCountLo; x < lineWidth; xfin = std::min<u16>(x+8, lineWidth))
-		{
-			const TILEENTRY tileEntry = this->_GetTileEntry(map, xoff, wmask);
-			const u16 tilePalette = tileEntry.bits.Palette * 16;
-			u8 *__restrict tileColorIdx = (u8 *)MMU_gpu_map(tile + (tileEntry.bits.TileNum * 0x20) + ((tileEntry.bits.VFlip) ? (7*4)-yoff : yoff));
-			
-			if (tileEntry.bits.HFlip)
-			{
-				tileColorIdx += 3 - ((xoff & 0x0007) >> 1);
-				
-				if (xoff & 1)
-				{
-					if (WILLDEFERCOMPOSITING)
-					{
-						this->_deferredIndexNative[x] = *tileColorIdx & 0x0F;
-						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
-					}
-					else
-					{
-						index = *tileColorIdx & 0x0F;
-						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
-						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-					}
-					
-					x++;
-					xoff++;
-					tileColorIdx--;
-				}
-				
-				for (; x < xfin; tileColorIdx--)
-				{
-					if (WILLDEFERCOMPOSITING)
-					{
-						this->_deferredIndexNative[x] = *tileColorIdx >> 4;
-						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
-					}
-					else
-					{
-						index = *tileColorIdx >> 4;
-						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
-						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-					}
-					
-					x++;
-					xoff++;
-					
-					if (x < xfin)
-					{
-						if (WILLDEFERCOMPOSITING)
-						{
-							this->_deferredIndexNative[x] = *tileColorIdx & 0x0F;
-							this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
-						}
-						else
-						{
-							index = *tileColorIdx & 0x0F;
-							color = LE_TO_LOCAL_16(pal[index + tilePalette]);
-							this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-						}
-						
-						x++;
-						xoff++;
-					}
-				}
-			}
-			else
-			{
-				tileColorIdx += ((xoff & 0x0007) >> 1);
-				
-				if (xoff & 1)
-				{
-					if (WILLDEFERCOMPOSITING)
-					{
-						this->_deferredIndexNative[x] = *tileColorIdx >> 4;
-						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
-					}
-					else
-					{
-						index = *tileColorIdx >> 4;
-						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
-						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-					}
-					
-					x++;
-					xoff++;
-					tileColorIdx++;
-				}
-				
-				for (; x < xfin; tileColorIdx++)
-				{
-					if (WILLDEFERCOMPOSITING)
-					{
-						this->_deferredIndexNative[x] = *tileColorIdx & 0x0F;
-						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
-					}
-					else
-					{
-						index = *tileColorIdx & 0x0F;
-						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
-						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-					}
-					
-					x++;
-					xoff++;
-					
-					if (x < xfin)
-					{
-						if (WILLDEFERCOMPOSITING)
-						{
-							this->_deferredIndexNative[x] = *tileColorIdx >> 4;
-							this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
-						}
-						else
-						{
-							index = *tileColorIdx >> 4;
-							color = LE_TO_LOCAL_16(pal[index + tilePalette]);
-							this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-						}
-						
-						x++;
-						xoff++;
-					}
-				}
-			}
-		}
-	}
-	else //256-color BG
-	{
-		const u16 *__restrict pal = (DISPCNT.ExBGxPalette_Enable) ? *(compInfo.renderState.selectedBGLayer->extPalette) : this->_paletteBG;
-		const u32 extPalMask = -DISPCNT.ExBGxPalette_Enable;
-		const u16 yoff = (YBG & 0x0007) << 3;
-		size_t line_dir;
-		
-		for (size_t xfin = pixCountLo; x < lineWidth; xfin = std::min<u16>(x+8, lineWidth))
-		{
-			const TILEENTRY tileEntry = this->_GetTileEntry(map, xoff, wmask);
-			const u16 *__restrict tilePal = (u16 *)((u8 *)pal + ((tileEntry.bits.Palette<<9) & extPalMask));
-			const u8 *__restrict tileColorIdx = (u8 *)MMU_gpu_map(tile + (tileEntry.bits.TileNum * 0x40) + ((tileEntry.bits.VFlip) ? (7*8)-yoff : yoff));
-			
-			if (tileEntry.bits.HFlip)
-			{
-				tileColorIdx += (7 - (xoff & 0x0007));
-				line_dir = -1;
-			}
-			else
-			{
-				tileColorIdx += (xoff & 0x0007);
-				line_dir = 1;
-			}
-			
-			for (; x < xfin; x++, xoff++, tileColorIdx += line_dir)
-			{
-				if (WILLDEFERCOMPOSITING)
-				{
-					this->_deferredIndexNative[x] = *tileColorIdx;
-					this->_deferredColorNative[x] = LE_TO_LOCAL_16(tilePal[this->_deferredIndexNative[x]]);
-				}
-				else
-				{
-					const u8 index = *tileColorIdx;
-					const u16 color = LE_TO_LOCAL_16(tilePal[index]);
-					this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
-				}
-			}
-		}
-	}
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineBase::_RenderLine_BGAffine(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param)
-{
-	this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_tiled_8bit_entry>(compInfo, param, compInfo.renderState.selectedBGLayer->tileMapAddress, compInfo.renderState.selectedBGLayer->tileEntryAddress, this->_paletteBG);
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineBase::_RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, bool &outUseCustomVRAM)
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	
-	switch (compInfo.renderState.selectedBGLayer->type)
-	{
-		case BGType_AffineExt_256x16: // 16  bit bgmap entries
-		{
-			if (DISPCNT.ExBGxPalette_Enable)
-			{
-				this->_RenderPixelIterate< COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_tiled_16bit_entry<true> >(compInfo, param, compInfo.renderState.selectedBGLayer->tileMapAddress, compInfo.renderState.selectedBGLayer->tileEntryAddress, *(compInfo.renderState.selectedBGLayer->extPalette));
-			}
-			else
-			{
-				this->_RenderPixelIterate< COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_tiled_16bit_entry<false> >(compInfo, param, compInfo.renderState.selectedBGLayer->tileMapAddress, compInfo.renderState.selectedBGLayer->tileEntryAddress, this->_paletteBG);
-			}
-			break;
-		}
-			
-		case BGType_AffineExt_256x1: // 256 colors
-			this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_256_map>(compInfo, param, compInfo.renderState.selectedBGLayer->BMPAddress, 0, this->_paletteBG);
-			break;
-			
-		case BGType_AffineExt_Direct: // direct colors / BMP
-		{
-			outUseCustomVRAM = false;
-			
-			if (!MOSAIC)
-			{
-				const bool isRotationScaled = ( (param.BGnPA.value != 0x100) ||
-				                                (param.BGnPC.value !=     0) ||
-				                                (param.BGnX.value  !=     0) ||
-				                                (param.BGnY.value  != (0x100 * compInfo.line.indexNative)) );
-				if (!isRotationScaled)
-				{
-					const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(compInfo.renderState.selectedBGLayer->BMPAddress) - MMU.ARM9_LCD) / sizeof(u16);
-					
-					if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4))
-					{
-						const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-						const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-						const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
-						
-						GPU->GetEngineMain()->VerifyVRAMLineDidChange(blockID, compInfo.line.indexNative + blockLine);
-						outUseCustomVRAM = !GPU->GetEngineMain()->isLineCaptureNative[blockID][compInfo.line.indexNative + blockLine];
-					}
-				}
-			}
-			
-			if (!outUseCustomVRAM)
-			{
-				this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_BMP_map>(compInfo, param, compInfo.renderState.selectedBGLayer->BMPAddress, 0, this->_paletteBG);
-			}
-			else
-			{
-				if ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)
-				{
-					this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
-				}
-			}
-			break;
-		}
-			
-		case BGType_Large8bpp: // large screen 256 colors
-			this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_256_map>(compInfo, param, compInfo.renderState.selectedBGLayer->largeBMPAddress, 0, this->_paletteBG);
-			break;
-			
-		default:
-			break;
-	}
-}
-
-/*****************************************************************************/
-//			BACKGROUND RENDERING -HELPER FUNCTIONS-
-/*****************************************************************************/
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineBase::_LineText(GPUEngineCompositorInfo &compInfo)
-{
-	if (COMPOSITORMODE == GPUCompositorMode_Debug)
-	{
-		this->_RenderLine_BGText<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, 0, compInfo.line.indexNative);
-	}
-	else
-	{
-		this->_RenderLine_BGText<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, compInfo.renderState.selectedBGLayer->xOffset, compInfo.line.indexNative + compInfo.renderState.selectedBGLayer->yOffset);
-	}
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineBase::_LineRot(GPUEngineCompositorInfo &compInfo)
-{
-	if (COMPOSITORMODE == GPUCompositorMode_Debug)
-	{
-		static const IOREG_BGnParameter debugParams = {256, 0, 0, -77, 0, (s32)compInfo.line.blockOffsetNative};
-		this->_RenderLine_BGAffine<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, debugParams);
-	}
-	else
-	{
-		IOREG_BGnParameter *__restrict bgParams = (compInfo.renderState.selectedLayerID == GPULayerID_BG2) ? (IOREG_BGnParameter *)&this->_IORegisterMap->BG2Param : (IOREG_BGnParameter *)&this->_IORegisterMap->BG3Param;
-		this->_RenderLine_BGAffine<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, *bgParams);
-		
-		bgParams->BGnX.value += bgParams->BGnPB.value;
-		bgParams->BGnY.value += bgParams->BGnPD.value;
-	}
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineBase::_LineExtRot(GPUEngineCompositorInfo &compInfo, bool &outUseCustomVRAM)
-{
-	if (COMPOSITORMODE == GPUCompositorMode_Debug)
-	{
-		static const IOREG_BGnParameter debugParams = {256, 0, 0, -77, 0, (s32)compInfo.line.blockOffsetNative};
-		this->_RenderLine_BGExtended<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, debugParams, outUseCustomVRAM);
-	}
-	else
-	{
-		IOREG_BGnParameter *__restrict bgParams = (compInfo.renderState.selectedLayerID == GPULayerID_BG2) ? (IOREG_BGnParameter *)&this->_IORegisterMap->BG2Param : (IOREG_BGnParameter *)&this->_IORegisterMap->BG3Param;
-		this->_RenderLine_BGExtended<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, *bgParams, outUseCustomVRAM);
-		
-		bgParams->BGnX.value += bgParams->BGnPB.value;
-		bgParams->BGnY.value += bgParams->BGnPD.value;
-	}
-}
-
-/*****************************************************************************/
-//			SPRITE RENDERING -HELPER FUNCTIONS-
-/*****************************************************************************/
-
-/* if i understand it correct, and it fixes some sprite problems in chameleon shot */
-/* we have a 15 bit color, and should use the pal entry bits as alpha ?*/
-/* http://nocash.emubase.de/gbatek.htm#dsvideoobjs */
-template <bool ISDEBUGRENDER>
-void GPUEngineBase::_RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha)
-{
-	const u16 *__restrict bmpBuffer = (u16 *)MMU_gpu_map(srcadr);
-	size_t i = 0;
-	
-#ifdef ENABLE_SSE2
-	if (xdir == 1)
-	{
-		if (ISDEBUGRENDER)
-		{
-			const size_t ssePixCount = lg - (lg % 8);
-			for (; i < ssePixCount; i += 8, x += 8, sprX += 8)
-			{
-				const __m128i color_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x));
-				const __m128i alphaCompare = _mm_cmpeq_epi16( _mm_srli_epi16(color_vec128, 15), _mm_set1_epi16(0x0001) );
-				_mm_storeu_si128( (__m128i *)(dst + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX)), color_vec128, alphaCompare) );
-			}
-		}
-		else
-		{
-			const __m128i prio_vec128 = _mm_set1_epi8(prio);
-			
-			const size_t ssePixCount = lg - (lg % 16);
-			for (; i < ssePixCount; i += 16, x += 16, sprX += 16)
-			{
-				const __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + sprX));
-				const __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x));
-				const __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x + 8));
-				
-				const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128);
-				const __m128i alphaCompare = _mm_cmpeq_epi8( _mm_packs_epi16(_mm_srli_epi16(colorLo_vec128, 15), _mm_srli_epi16(colorHi_vec128, 15)), _mm_set1_epi8(0x01) );
-				
-				const __m128i combinedPackedCompare = _mm_and_si128(prioCompare, alphaCompare);
-				const __m128i combinedLoCompare = _mm_unpacklo_epi8(combinedPackedCompare, combinedPackedCompare);
-				const __m128i combinedHiCompare = _mm_unpackhi_epi8(combinedPackedCompare, combinedPackedCompare);
-				
-				// Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because
-				// maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the sprite
-				// to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs.
-				_mm_storeu_si128( (__m128i *)(dst + sprX + 0),       _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 0)), colorLo_vec128, combinedLoCompare) );
-				_mm_storeu_si128( (__m128i *)(dst + sprX + 8),       _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 8)), colorHi_vec128, combinedHiCompare) );
-				_mm_storeu_si128( (__m128i *)(dst_alpha + sprX),     _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst_alpha + sprX)), _mm_set1_epi8(alpha + 1), combinedPackedCompare) );
-				_mm_storeu_si128( (__m128i *)(typeTab + sprX),       _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(typeTab + sprX)), _mm_set1_epi8(OBJMode_Bitmap), combinedPackedCompare) );
-				_mm_storeu_si128( (__m128i *)(prioTab + sprX),       _mm_blendv_epi8(prioTab_vec128, prio_vec128, combinedPackedCompare) );
-				_mm_storeu_si128( (__m128i *)(this->_sprNum + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(this->_sprNum + sprX)), _mm_set1_epi8(spriteNum), combinedPackedCompare) );
-			}
-		}
-	}
-#endif
-	
-	for (; i < lg; i++, sprX++, x += xdir)
-	{
-		const u16 color = LE_TO_LOCAL_16(bmpBuffer[x]);
-		
-		//a cleared alpha bit suppresses the pixel from processing entirely; it doesnt exist
-		if (ISDEBUGRENDER)
-		{
-			if (color & 0x8000)
-			{
-				dst[sprX] = color;
-			}
-		}
-		else
-		{
-			if ((color & 0x8000) && (prio < prioTab[sprX]))
-			{
-				dst[sprX] = color;
-				dst_alpha[sprX] = alpha+1;
-				typeTab[sprX] = OBJMode_Bitmap;
-				prioTab[sprX] = prio;
-				this->_sprNum[sprX] = spriteNum;
-			}
-		}
-	}
-}
-
-template<bool ISDEBUGRENDER>
-void GPUEngineBase::_RenderSprite256(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha)
-{
-	for (size_t i = 0; i < lg; i++, ++sprX, x += xdir)
-	{
-		const u32 adr = srcadr + (u32)( (x & 0x7) + ((x & 0xFFF8) << 3) );
-		const u8 *__restrict src = (u8 *)MMU_gpu_map(adr);
-		const u8 palette_entry = *src;
-
-		//a zero value suppresses the pixel from processing entirely; it doesnt exist
-		if (ISDEBUGRENDER)
-		{
-			if (palette_entry > 0)
-			{
-				dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
-			}
-		}
-		else
-		{
-			if ((palette_entry > 0) && (prio < prioTab[sprX]))
-			{
-				dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
-				dst_alpha[sprX] = 0xFF;
-				typeTab[sprX] = (alpha ? OBJMode_Transparent : OBJMode_Normal);
-				prioTab[sprX] = prio;
-				this->_sprNum[sprX] = spriteNum;
-			}
-		}
-	}
-}
-
-template<bool ISDEBUGRENDER>
-void GPUEngineBase::_RenderSprite16(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha)
-{
-	for (size_t i = 0; i < lg; i++, ++sprX, x += xdir)
-	{
-		const u16 x1 = x >> 1;
-		const u32 adr = srcadr + (x1 & 0x3) + ((x1 & 0xFFFC) << 3);
-		const u8 *__restrict src = (u8 *)MMU_gpu_map(adr);
-		const u8 palette = *src;
-		const u8 palette_entry = (x & 1) ? palette >> 4 : palette & 0xF;
-		
-		//a zero value suppresses the pixel from processing entirely; it doesnt exist
-		if (ISDEBUGRENDER)
-		{
-			if (palette_entry > 0)
-			{
-				dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
-			}
-		}
-		else
-		{
-			if ((palette_entry > 0) && (prio < prioTab[sprX]))
-			{
-				dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
-				dst_alpha[sprX] = 0xFF;
-				typeTab[sprX] = (alpha ? OBJMode_Transparent : OBJMode_Normal);
-				prioTab[sprX] = prio;
-				this->_sprNum[sprX] = spriteNum;
-			}
-		}
-	}
-}
-
-void GPUEngineBase::_RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir)
-{
-	if (col256)
-	{
-		for (size_t i = 0; i < lg; i++, sprX++, x += xdir)
-		{
-			if (src[(x & 7) + ((x & 0xFFF8) << 3)])
-			{
-				this->_sprWin[sprX] = 1;
-			}
-		}
-	}
-	else
-	{
-		for (size_t i = 0; i < lg; i++, sprX++, x += xdir)
-		{
-			const size_t x1 = x >> 1;
-			const u8 palette = src[(x1 & 0x3) + ((x1 & 0xFFFC) << 3)];
-			const u8 palette_entry = (x & 1) ? palette >> 4 : palette & 0xF;
-			
-			if (palette_entry)
-			{
-				this->_sprWin[sprX] = 1;
-			}
-		}
-	}
-}
-
-// return val means if the sprite is to be drawn or not
-bool GPUEngineBase::_ComputeSpriteVars(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, SpriteSize &sprSize, s32 &sprX, s32 &sprY, s32 &x, s32 &y, s32 &lg, s32 &xdir)
-{
-	x = 0;
-	// get sprite location and size
-	sprX = spriteInfo.X;
-	sprY = spriteInfo.Y;
-	sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape];
-	lg = sprSize.width;
-	
-// FIXME: for rot/scale, a list of entries into the sprite should be maintained,
-// that tells us where the first pixel of a screenline starts in the sprite,
-// and how a step to the right in a screenline translates within the sprite
-
-	//this wasn't really tested by anything. very unlikely to get triggered
-	y = (compInfo.line.indexNative - sprY) & 0xFF;                        /* get the y line within sprite coords */
-	if (y >= sprSize.height)
-		return false;
-
-	if ((sprX == GPU_FRAMEBUFFER_NATIVE_WIDTH) || ((sprX+sprSize.width) <= 0))	/* sprite pixels outside of line */
-		return false;				/* not to be drawn */
-
-	// sprite portion out of the screen (LEFT)
-	if (sprX < 0)
-	{
-		lg += sprX;	
-		x = -(sprX);
-		sprX = 0;
-	}
-	// sprite portion out of the screen (RIGHT)
-	if ((sprX+sprSize.width) >= GPU_FRAMEBUFFER_NATIVE_WIDTH)
-		lg = GPU_FRAMEBUFFER_NATIVE_WIDTH - sprX;
-
-	// switch TOP<-->BOTTOM
-	if (spriteInfo.VFlip)
-		y = sprSize.height - y - 1;
-	
-	// switch LEFT<-->RIGHT
-	if (spriteInfo.HFlip)
-	{
-		x = sprSize.width - x - 1;
-		xdir = -1;
-	}
-	else
-	{
-		xdir = 1;
-	}
-	
-	return true;
-}
-
-/*****************************************************************************/
-//			SPRITE RENDERING
-/*****************************************************************************/
-
-
-//TODO - refactor this so there isnt as much duped code between rotozoomed and non-rotozoomed versions
-
-// Computes the source address of row y of a direct-color (bitmap) sprite, starting from
-// this->_sprMem. The layout is selected by the DISPCNT bits OBJ_BMP_mapping and OBJ_BMP_2D_dim.
-u32 GPUEngineBase::_SpriteAddressBMP(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, const SpriteSize sprSize, const s32 y)
-{
-	const IOREG_DISPCNT &dispCnt = this->_IORegisterMap->DISPCNT;
-	
-	// OBJ_BMP_mapping set: the tile index is scaled by the bitmap boundary shift and rows
-	// follow each other every (width * 2) bytes.
-	// Tested by buffy sacrifice damage blood splatters in corner.
-	if (dispCnt.OBJ_BMP_mapping)
-	{
-		return this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBMPBoundary) + (y * sprSize.width * 2);
-	}
-	
-	// 2D mapping from here on (verified in rotozoomed mode by knights in the nightmare intro).
-	if (!dispCnt.OBJ_BMP_2D_dim)
-	{
-		// 128*512 source bitmap, verified by harry potter and the order of the phoenix conversation portraits
-		return this->_sprMem + (((spriteInfo.TileIndex&0x3F0) * 64 + (spriteInfo.TileIndex&0x0F) * 8 + (y << 7)) << 1);
-	}
-	
-	// 256*256 source bitmap, verified by heroes of mana FMV intro
-	return this->_sprMem + (((spriteInfo.TileIndex&0x3E0) * 64 + (spriteInfo.TileIndex&0x1F) * 8 + (y << 8)) << 1);
-}
-
-// Forwards to the _SpriteRenderPerform instantiation that matches the sprite mapping mode in
-// compInfo.renderState.spriteRenderMode. Any mode other than SpriteRenderMode_Sprite1D is
-// rendered with the Sprite2D variant.
-template <bool ISDEBUGRENDER>
-void GPUEngineBase::_SpriteRender(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
-{
-	const bool use1DMapping = (compInfo.renderState.spriteRenderMode == SpriteRenderMode_Sprite1D);
-	
-	if (!use1DMapping)
-	{
-		this->_SpriteRenderPerform<SpriteRenderMode_Sprite2D, ISDEBUGRENDER>(compInfo, dst, dst_alpha, typeTab, prioTab);
-		return;
-	}
-	
-	this->_SpriteRenderPerform<SpriteRenderMode_Sprite1D, ISDEBUGRENDER>(compInfo, dst, dst_alpha, typeTab, prioTab);
-}
-
-// Renders the sprites of native scanline lineIndex into dst through the debug variant of the
-// sprite renderer. A zero-filled, throw-away compositor info is set up for the call; only the
-// sprite mapping settings are copied from the engine's current render state.
-void GPUEngineBase::SpriteRenderDebug(const u16 lineIndex, u16 *dst)
-{
-	GPUEngineCompositorInfo compInfo;
-	memset(&compInfo, 0, sizeof(compInfo));
-	
-	// Line geometry: a single native-width line, custom values mirror the native ones.
-	compInfo.line.indexNative = lineIndex;
-	compInfo.line.indexCustom = compInfo.line.indexNative;
-	compInfo.line.blockOffsetNative = compInfo.line.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	compInfo.line.blockOffsetCustom = compInfo.line.blockOffsetNative;
-	compInfo.line.widthCustom = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	compInfo.line.pixelCount = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	compInfo.line.renderCount = 1;
-	
-	// Render state: OBJ layer only, no color effect, no master brightness.
-	compInfo.renderState.selectedLayerID = GPULayerID_OBJ;
-	compInfo.renderState.displayOutputMode = GPUDisplayMode_Normal;
-	compInfo.renderState.colorEffect = ColorEffect_Disable;
-	compInfo.renderState.masterBrightnessMode = GPUMasterBrightMode_Disable;
-	compInfo.renderState.masterBrightnessIsMaxOrMin = true;
-	compInfo.renderState.masterBrightnessIsFullIntensity = false;
-	
-	// Sprite mapping settings come from the live render state.
-	compInfo.renderState.spriteBMPBoundary = this->_currentRenderState.spriteBMPBoundary;
-	compInfo.renderState.spriteBoundary = this->_currentRenderState.spriteBoundary;
-	compInfo.renderState.spriteRenderMode = this->_currentRenderState.spriteRenderMode;
-	
-	// Color target: every head pointer refers to the caller's buffer.
-	compInfo.target.lineColorHead = dst;
-	compInfo.target.lineColorHeadNative = dst;
-	compInfo.target.lineColorHeadCustom = dst;
-	compInfo.target.lineColor16 = dst;
-	compInfo.target.lineColor32 = (FragmentColor *)dst;
-	compInfo.target.lineColor = (void **)&compInfo.target.lineColor16;
-	compInfo.target.xNative = 0;
-	compInfo.target.xCustom = 0;
-	
-	// There is no layer ID buffer for a debug render.
-	compInfo.target.lineLayerIDHead = NULL;
-	compInfo.target.lineLayerIDHeadNative = NULL;
-	compInfo.target.lineLayerIDHeadCustom = NULL;
-	compInfo.target.lineLayerID = NULL;
-	
-	this->_SpriteRender<true>(compInfo, dst, NULL, NULL, NULL);
-}
-
-template <SpriteRenderMode MODE, bool ISDEBUGRENDER> // MODE: Sprite1D or Sprite2D tile addressing. ISDEBUGRENDER: in the rotozoom loops below, write dst only and skip the prioTab test.
-void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
-{ // Walks the 128 OAM entries and renders each sprite that crosses scanline compInfo.line.indexNative into the per-column buffers dst / dst_alpha / typeTab / prioTab.
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	size_t cost = 0; // running estimate of sprite rendering time; it is only accumulated, since the overflow early-out below is commented out
-	
-	for (size_t i = 0; i < 128; i++)
-	{
-		OAMAttributes spriteInfo = this->_oamList[i]; // local copy, so the endian conversion below does not modify _oamList
-
-		//for each sprite:
-		if (cost >= 2130)
-		{
-			//out of sprite rendering time
-			//printf("sprite overflow!\n");
-			//return;		
-		}
-
-		//do we incur a cost if a sprite is disabled?? we guess so.
-		cost += 2;
-		
-		// Check if sprite is disabled before everything (the Disable test only applies when RotScale is 0)
-		if (spriteInfo.RotScale == 0 && spriteInfo.Disable != 0)
-			continue;
-		
-		// Must explicitly convert endianness with attributes 1 and 2.
-		spriteInfo.attr[1] = LOCAL_TO_LE_16(spriteInfo.attr[1]);
-		spriteInfo.attr[2] = LOCAL_TO_LE_16(spriteInfo.attr[2]);
-		
-		const OBJMode objMode = (OBJMode)spriteInfo.Mode;
-
-		SpriteSize sprSize;
-		s32 sprX; // screen column of the next pixel to draw
-		s32 sprY;
-		s32 x; // non-rotozoomed only: first sprite column to read, filled in by _ComputeSpriteVars
-		s32 y; // sprite/field row hit by this scanline
-		s32 lg; // number of screen pixels to draw for this sprite
-		s32 xdir; // non-rotozoomed only: +1 or -1, filled in by _ComputeSpriteVars
-		u8 prio = spriteInfo.Priority;
-		u16 *__restrict pal;
-		u8 *__restrict src;
-		u32 srcadr;
-		
-		if (spriteInfo.RotScale != 0)
-		{
-			s32		fieldX, fieldY, auxX, auxY, realX, realY, offset;
-			u8		blockparameter;
-			s16		dx, dmx, dy, dmy;
-			u16		colour;
-
-			// Get sprite positions and size
-			sprX = spriteInfo.X;
-			sprY = spriteInfo.Y;
-			sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape];
-
-			// Copy the sprite size into the field size, which is doubled below in double size mode
-			fieldX = sprSize.width;
-			fieldY = sprSize.height;
-			lg = sprSize.width;
-
-			// If we are using double size mode, double our control vars
-			if (spriteInfo.DoubleSize != 0)
-			{
-				fieldX <<= 1;
-				fieldY <<= 1;
-				lg <<= 1;
-			}
-
-			//check if the sprite is visible y-wise. unfortunately our logic for x and y is different due to our scanline based rendering
-			//tested thoroughly by many large sprites in Super Robot Wars K which wrap around the screen
-			y = (compInfo.line.indexNative - sprY) & 0xFF;
-			if (y >= fieldY)
-				continue;
-
-			//check if sprite is visible x-wise.
-			if ((sprX == GPU_FRAMEBUFFER_NATIVE_WIDTH) || (sprX + fieldX <= 0))
-				continue;
-
-			cost += (sprSize.width * 2) + 10;
-
-			// Index of the first of the four consecutive OAM entries whose attr3 fields hold dx, dmx, dy and dmy for this sprite
-			blockparameter = (spriteInfo.RotScaleIndex + (spriteInfo.HFlip << 3) + (spriteInfo.VFlip << 4)) * 4;
-
-			// Get rotation/scale parameters
-			dx  = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+0].attr3);
-			dmx = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+1].attr3);
-			dy  = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+2].attr3);
-			dmy = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+3].attr3);
-			
-			// Calculate fixed point 8.8 start offsets: begin at (width/2, height/2), step back half the field along both axes, then forward y rows
-			realX = (sprSize.width  << 7) - (fieldX >> 1)*dx - (fieldY >> 1)*dmx + y*dmx;
-			realY = (sprSize.height << 7) - (fieldX >> 1)*dy - (fieldY >> 1)*dmy + y*dmy;
-
-			if (sprX < 0)
-			{
-				// If sprite is not in the window
-				if (sprX + fieldX <= 0)
-					continue;
-
-				// Otherwise, is partially visible
-				lg += sprX;
-				realX -= sprX*dx; // sprX is negative here, so this advances the source position past the clipped columns
-				realY -= sprX*dy;
-				sprX = 0;
-			}
-			else
-			{
-				if (sprX + fieldX > GPU_FRAMEBUFFER_NATIVE_WIDTH)
-					lg = GPU_FRAMEBUFFER_NATIVE_WIDTH - sprX; // clip the run at the right edge of the line
-			}
-
-			// If we are using 1 palette of 256 colours
-			if (spriteInfo.PaletteMode == PaletteMode_1x256)
-			{
-				src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary));
-
-				// If extended palettes are set, use them
-				pal = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
-
-				for (size_t j = 0; j < lg; ++j, ++sprX)
-				{
-					// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
-					auxX = (realX >> 8);
-					auxY = (realY >> 8);
-
-					if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
-					{
-						if (MODE == SpriteRenderMode_Sprite2D) // byte offset of the texel inside 8x8 tiles of 64 bytes each; the two modes differ only in the stride between tile rows
-							offset = (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*8);
-						else
-							offset = (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)*sprSize.width*8) + ((auxY&0x7)*8);
-
-						colour = src[offset];
-						
-						if (ISDEBUGRENDER)
-						{
-							if (colour) // palette index 0 is not drawn
-							{
-								dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
-							}
-						}
-						else
-						{
-							if (colour && (prio < prioTab[sprX])) // draw only over pixels whose stored priority value is numerically higher
-							{
-								dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
-								dst_alpha[sprX] = 0xFF;
-								typeTab[sprX] = objMode;
-								prioTab[sprX] = prio;
-								this->_sprNum[sprX] = i;
-							}
-						}
-					}
-
-					// Add the rotation/scale coefficients, here the rotation/scaling is performed
-					realX += dx;
-					realY += dy;
-				}
-			}
-			// Rotozoomed direct color
-			else if (objMode == OBJMode_Bitmap)
-			{
-				//transparent (i think, don't bother to render?) if alpha is 0
-				if (spriteInfo.PaletteIndex == 0)
-					continue;
-
-				srcadr = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, 0); // address of row 0; per-pixel offsets below are relative to it
-
-				for (size_t j = 0; j < lg; ++j, ++sprX)
-				{
-					// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
-					auxX = realX >> 8;
-					auxY = realY >> 8;
-
-					//this is all very slow, and so much dup code with other rotozoomed modes.
-					//don't bother fixing speed until this whole thing gets reworked
-
-					if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
-					{
-						if (DISPCNT.OBJ_BMP_2D_dim)
-							//tested by knights in the nightmare
-							offset = ((this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, auxY) - srcadr) / 2) + auxX;
-						else //tested by lego indiana jones (somehow?)
-							//tested by buffy sacrifice damage blood splatters in corner
-							offset = auxX + (auxY * sprSize.width);
-
-						const u32 finalAddr = srcadr + (offset << 1); // offset counts 16-bit pixels
-						u16 *mem = (u16 *)MMU_gpu_map(finalAddr);
-						colour = LE_TO_LOCAL_16(*mem);
-						
-						if (ISDEBUGRENDER)
-						{
-							if (colour & 0x8000) // only pixels with bit 15 set are drawn
-							{
-								dst[sprX] = colour;
-							}
-						}
-						else
-						{
-							if ((colour & 0x8000) && (prio < prioTab[sprX]))
-							{
-								dst[sprX] = colour;
-								dst_alpha[sprX] = spriteInfo.PaletteIndex; // bitmap sprites store PaletteIndex as the alpha value
-								typeTab[sprX] = objMode;
-								prioTab[sprX] = prio;
-								this->_sprNum[sprX] = i;
-							}
-						}
-					}
-
-					// Add the rotation/scale coefficients, here the rotation/scaling is performed
-					realX += dx;
-					realY += dy;
-				}
-			}
-			// Rotozoomed 16/16 palette
-			else
-			{
-				if (MODE == SpriteRenderMode_Sprite2D)
-				{
-					src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << 5));
-				}
-				else
-				{
-					src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary));
-				}
-				
-				pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4); // 16 entries per palette
-
-				for (size_t j = 0; j < lg; ++j, ++sprX)
-				{
-					// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
-					auxX = realX >> 8;
-					auxY = realY >> 8;
-
-					if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
-					{
-						if (MODE == SpriteRenderMode_Sprite2D) // two pixels per byte, hence auxX>>1; tiles are 32 bytes each
-							offset = ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*4);
-						else
-							offset = ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)*sprSize.width)*4 + ((auxY&0x7)*4);
-						
-						colour = src[offset];
-
-						// Get the 4-bit value from the 8 bits that were read: odd columns use the high nibble, even columns the low nibble
-						if (auxX&1)	colour >>= 4;
-						else		colour &= 0xF;
-						
-						if (ISDEBUGRENDER)
-						{
-							if (colour)
-							{
-								dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
-							}
-						}
-						else
-						{
-							if (colour && (prio < prioTab[sprX]))
-							{
-								if (objMode == OBJMode_Window)
-								{
-									this->_sprWin[sprX] = 1; // window sprites only mark the column in _sprWin; no color is written
-								}
-								else
-								{
-									dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
-									dst_alpha[sprX] = 0xFF;
-									typeTab[sprX] = objMode;
-									prioTab[sprX] = prio;
-									this->_sprNum[sprX] = i;
-								}
-							}
-						}
-					}
-
-					// Add the rotation/scale coefficients, here the rotation/scaling is performed
-					realX += dx;
-					realY += dy;
-				}
-			}
-		}
-		else //NOT rotozoomed
-		{
-			if (!this->_ComputeSpriteVars(compInfo, spriteInfo, sprSize, sprX, sprY, x, y, lg, xdir))
-				continue;
-
-			cost += sprSize.width;
-
-			if (objMode == OBJMode_Window)
-			{
-				if (MODE == SpriteRenderMode_Sprite2D) // src = start of sprite row y; the row stride depends on the mapping mode and on 8-bit vs 4-bit pixels
-				{
-					if (spriteInfo.PaletteMode == PaletteMode_1x256)
-						src = (u8 *)MMU_gpu_map(this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*8));
-					else
-						src = (u8 *)MMU_gpu_map(this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*4));
-				}
-				else
-				{
-					if (spriteInfo.PaletteMode == PaletteMode_1x256)
-						src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*8) + ((y&0x7)*8));
-					else
-						src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*4) + ((y&0x7)*4));
-				}
-
-				this->_RenderSpriteWin(src, (spriteInfo.PaletteMode == PaletteMode_1x256), lg, sprX, x, xdir);
-			}
-			else if (objMode == OBJMode_Bitmap) //sprite is in BMP format
-			{
-				//transparent (i think, don't bother to render?) if alpha is 0
-				if (spriteInfo.PaletteIndex == 0)
-					continue;
-				
-				srcadr = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, y);
-				this->_RenderSpriteBMP<ISDEBUGRENDER>(compInfo, i, dst, srcadr, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, spriteInfo.PaletteIndex);
-				
-				const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(srcadr) - MMU.ARM9_LCD) / sizeof(u16); // index of the row's first 16-bit pixel, counted from MMU.ARM9_LCD
-				if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4)) // only the first four blocks of GPU_VRAM_BLOCK_LINES lines are considered
-				{
-					const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-					const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-					const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
-					const size_t linePixel = blockPixel % GPU_FRAMEBUFFER_NATIVE_WIDTH;
-					
-					if (!GPU->GetEngineMain()->isLineCaptureNative[blockID][blockLine] && (linePixel == 0)) // the row starts exactly on a line that is flagged as not native
-					{
-						this->vramBlockOBJAddress = srcadr; // remembered for _RenderLine_LayerOBJ, which tests this member against 0
-					}
-				}
-			}
-			else if (spriteInfo.PaletteMode == PaletteMode_1x256) //256 colors
-			{
-				if (MODE == SpriteRenderMode_Sprite2D)
-					srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*8);
-				else
-					srcadr = this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*8) + ((y&0x7)*8);
-				
-				pal = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
-				this->_RenderSprite256<ISDEBUGRENDER>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
-			}
-			else // 16 colors
-			{
-				if (MODE == SpriteRenderMode_Sprite2D)
-				{
-					srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*4);
-				}
-				else
-				{
-					srcadr = this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*4) + ((y&0x7)*4);
-				}
-				
-				pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4);
-				this->_RenderSprite16<ISDEBUGRENDER>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
-			}
-		}
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> // OUTPUTFORMAT: color format forwarded to the clear/layer renderers. WILLPERFORMWINDOWTEST: run _PerformWindowTesting and forward the flag to the layer renderers.
-void GPUEngineBase::_RenderLine_Layers(const size_t l)
-{ // Composites scanline l: sets up the line targets, clears the line, pre-renders sprites, then draws BG and OBJ layers per priority level.
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	itemsForPriority_t *item;
-	
-	GPUEngineCompositorInfo &compInfo = this->_currentCompositorInfo[l];
-	
-	// Optimization: For normal display mode, render straight to the output buffer when that is what we are going to end
-	// up displaying anyway. Otherwise, we need to use the working buffer.
-	compInfo.target.lineColorHeadNative = (compInfo.renderState.displayOutputMode == GPUDisplayMode_Normal) ? (u8 *)this->nativeBuffer + (compInfo.line.blockOffsetNative * dispInfo.pixelBytes) : (u8 *)this->_internalRenderLineTargetNative;
-	compInfo.target.lineColorHeadCustom = (compInfo.renderState.displayOutputMode == GPUDisplayMode_Normal) ? (u8 *)this->customBuffer + (compInfo.line.blockOffsetCustom * dispInfo.pixelBytes) : (u8 *)this->_internalRenderLineTargetCustom;
-	compInfo.target.lineColorHead = compInfo.target.lineColorHeadNative; // rendering starts on the native target
-	
-	compInfo.target.lineLayerIDHeadNative = this->_renderLineLayerIDNative;
-	compInfo.target.lineLayerIDHeadCustom = this->_renderLineLayerIDCustom;
-	compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadNative;
-	
-	compInfo.target.xNative = 0;
-	compInfo.target.xCustom = 0;
-	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative;
-	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative;
-	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
-	
-	this->_RenderLine_Clear<OUTPUTFORMAT>(compInfo);
-	
-	// If the OBJ layer is enabled, reset the tracked bitmap-sprite VRAM address and pre-render the sprite line
-	if (this->_enableLayer[GPULayerID_OBJ])
-	{
-		this->vramBlockOBJAddress = 0;
-		this->_RenderLine_SetupSprites(compInfo);
-	}
-	
-	if (WILLPERFORMWINDOWTEST)
-	{
-		this->_PerformWindowTesting(compInfo);
-	}
-	
-	// paint lower priorities first
-	// then higher priorities on top
-	for (size_t prio = NB_PRIORITIES; prio > 0; ) // runs prio from NB_PRIORITIES-1 down to 0; the decrement is the first statement of the body
-	{
-		prio--;
-		item = &(this->_itemsForPriority[prio]);
-		// render BGs
-		if (this->_isAnyBGLayerEnabled)
-		{
-			for (size_t i = 0; i < item->nbBGs; i++)
-			{
-				const GPULayerID layerID = (GPULayerID)item->BGs[i];
-				
-				if (this->_enableLayer[layerID])
-				{
-					compInfo.renderState.selectedLayerID = layerID;
-					compInfo.renderState.selectedBGLayer = &this->_BGLayer[layerID];
-					
-					if (this->_engineID == GPUEngineID_Main)
-					{
-						if ( (layerID == GPULayerID_BG0) && GPU->GetEngineMain()->WillRender3DLayer() ) // BG0 of the main engine is taken from the 3D layer when WillRender3DLayer() is true
-						{
-#ifndef DISABLE_COMPOSITOR_FAST_PATHS
-							if ( !compInfo.renderState.dstAnyBlendEnable && (  (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
-																			   !compInfo.renderState.srcBlendEnable[GPULayerID_BG0] ||
-																			 (((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) )
-							{
-								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_Copy, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-							}
-							else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_BG0] && (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) )
-							{
-								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_BrightUp, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-							}
-							else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_BG0] && (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness) )
-							{
-								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_BrightDown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-							}
-							else
-#endif
-							{
-								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_Unknown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo); // general path, also the only one compiled when DISABLE_COMPOSITOR_FAST_PATHS is defined
-							}
-							continue; // BG0 was drawn by the 3D path; skip the 2D BG rendering below
-						}
-					}
-										
-#ifndef DISABLE_COMPOSITOR_FAST_PATHS
-					if ( (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
-						 !compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] ||
-						((compInfo.renderState.colorEffect == ColorEffect_Blend) && !compInfo.renderState.dstAnyBlendEnable) ||
-						(((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) )
-					{
-						this->_RenderLine_LayerBG<GPUCompositorMode_Copy, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-					}
-					else if ( !WILLPERFORMWINDOWTEST && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] && (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) )
-					{
-						this->_RenderLine_LayerBG<GPUCompositorMode_BrightUp, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-					}
-					else if ( !WILLPERFORMWINDOWTEST && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] && (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness) )
-					{
-						this->_RenderLine_LayerBG<GPUCompositorMode_BrightDown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-					}
-					else
-#endif
-					{
-						this->_RenderLine_LayerBG<GPUCompositorMode_Unknown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
-					}
-				} //layer enabled
-			}
-		}
-		
-		// render sprite Pixels (only when this priority level has at least one recorded sprite column)
-		if ( this->_enableLayer[GPULayerID_OBJ] && (item->nbPixelsX > 0) )
-		{
-			compInfo.renderState.selectedLayerID = GPULayerID_OBJ;
-			compInfo.renderState.selectedBGLayer = NULL;
-			
-#ifndef DISABLE_COMPOSITOR_FAST_PATHS
-			if ( !compInfo.renderState.dstAnyBlendEnable && (  (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
-															   !compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] ||
-															 (((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) )
-			{
-				this->_RenderLine_LayerOBJ<GPUCompositorMode_Copy, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
-			}
-			else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] && (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) )
-			{
-				this->_RenderLine_LayerOBJ<GPUCompositorMode_BrightUp, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
-			}
-			else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] && (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness) )
-			{
-				this->_RenderLine_LayerOBJ<GPUCompositorMode_BrightDown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
-			}
-			else
-#endif
-			{
-				this->_RenderLine_LayerOBJ<GPUCompositorMode_Unknown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
-			}
-		}
-	}
-}
-
-// Pre-renders the OBJ layer of one scanline into the engine's sprite line buffers
-// (_sprColor, _sprAlpha, _sprType, _sprPrio) and then records, for each priority level,
-// which columns ended up holding a sprite pixel.
-void GPUEngineBase::_RenderLine_SetupSprites(GPUEngineCompositorInfo &compInfo)
-{
-	// Reset the line buffers. The color buffer is filled with the backdrop color and the
-	// priority buffer with 0x7F, a value that fails the "< 4" test in the loop below.
-	memset(this->_sprPrio, 0x7F, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	memset(this->_sprType, OBJMode_Normal, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	memset(this->_sprAlpha, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(this->_sprColor, compInfo.renderState.backdropColor16);
-	
-	//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
-	//how it interacts with this. I wish we knew why we needed this
-	
-	// Draw the sprites, then run the sprite mosaic pass over the same buffers.
-	this->_SpriteRender<false>(compInfo, this->_sprColor, this->_sprAlpha, this->_sprType, this->_sprPrio);
-	this->_MosaicSpriteLine(compInfo, this->_sprColor, this->_sprAlpha, this->_sprType, this->_sprPrio);
-	
-	// Append every column whose priority is below 4 to the pixel list of that priority.
-	for (size_t column = 0; column < GPU_FRAMEBUFFER_NATIVE_WIDTH; column++)
-	{
-		const size_t spritePrio = this->_sprPrio[column];
-		
-		if (spritePrio < 4)
-		{
-			itemsForPriority_t &bucket = this->_itemsForPriority[spritePrio];
-			bucket.PixelsX[bucket.nbPixelsX] = column;
-			bucket.nbPixelsX++;
-		}
-	}
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST> // all three are forwarded to _PixelComposite; WILLPERFORMWINDOWTEST also enables the per-column window checks below
-void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item)
-{ // Composites the sprite columns listed in item->PixelsX (item->nbPixelsX entries) onto the current line target.
-	bool useCustomVRAM = false;
-	
-	if (this->vramBlockOBJAddress != 0) // nonzero only when the sprite renderer recorded a bitmap sprite source address for this line
-	{
-		const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(this->vramBlockOBJAddress) - MMU.ARM9_LCD) / sizeof(u16);
-		
-		if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4))
-		{
-			const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-			const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-			const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
-			
-			GPU->GetEngineMain()->VerifyVRAMLineDidChange(blockID, blockLine);
-			useCustomVRAM = !GPU->GetEngineMain()->isLineCaptureNative[blockID][blockLine]; // read the flag after the verify call above
-		}
-	}
-	
-	if (useCustomVRAM && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested))
-	{
-		this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
-	}
-	
-	if (this->isLineRenderNative[compInfo.line.indexNative]) // native-width target: one output pixel per listed sprite column
-	{
-		if (useCustomVRAM && (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev))
-		{
-			const FragmentColor *__restrict vramColorPtr = (FragmentColor *)GPU->GetCustomVRAMAddressUsingMappedAddress<OUTPUTFORMAT>(this->vramBlockOBJAddress, 0);
-			
-			for (size_t i = 0; i < item->nbPixelsX; i++)
-			{
-				const size_t srcX = item->PixelsX[i];
-				
-				if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
-				{
-					continue; // column failed the window test for the selected layer
-				}
-				
-				compInfo.target.xNative = srcX;
-				compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
-				compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX;
-				compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX;
-				compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX;
-				
-				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, vramColorPtr[srcX], this->_sprAlpha[srcX], enableColorEffect); // color comes from the custom VRAM copy, alpha from the sprite line buffer
-			}
-		}
-		else
-		{
-			for (size_t i = 0; i < item->nbPixelsX; i++)
-			{
-				const size_t srcX = item->PixelsX[i];
-				
-				if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
-				{
-					continue;
-				}
-				
-				compInfo.target.xNative = srcX;
-				compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
-				compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX;
-				compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX;
-				compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX;
-				
-				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect); // color and alpha both come from the sprite line buffers
-			}
-		}
-	}
-	else // custom-size target: each sprite column expands to _gpuDstPitchCount[srcX] output pixels on each of compInfo.line.renderCount lines
-	{
-		void *__restrict dstColorPtr = compInfo.target.lineColorHead;
-		u8 *__restrict dstLayerIDPtr = compInfo.target.lineLayerIDHead;
-		
-		if (useCustomVRAM)
-		{
-			const void *__restrict vramColorPtr = GPU->GetCustomVRAMAddressUsingMappedAddress<OUTPUTFORMAT>(this->vramBlockOBJAddress, 0);
-			
-			for (size_t line = 0; line < compInfo.line.renderCount; line++)
-			{
-				compInfo.target.lineColor16 = (u16 *)dstColorPtr;
-				compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr;
-				compInfo.target.lineLayerID = dstLayerIDPtr;
-				
-				for (size_t i = 0; i < item->nbPixelsX; i++)
-				{
-					const size_t srcX = item->PixelsX[i];
-					
-					if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
-					{
-						continue;
-					}
-					
-					compInfo.target.xNative = srcX;
-					compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
-					
-					for (size_t p = 0; p < _gpuDstPitchCount[srcX]; p++)
-					{
-						const size_t dstX = compInfo.target.xCustom + p;
-						
-						compInfo.target.lineColor16 = (u16 *)dstColorPtr + dstX;
-						compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr + dstX;
-						compInfo.target.lineLayerID = dstLayerIDPtr + dstX;
-						
-						const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-						
-						if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) // the custom VRAM copy is read as FragmentColor for BGR888 and as u16 otherwise
-						{
-							this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, ((FragmentColor *)vramColorPtr)[dstX], this->_sprAlpha[srcX], enableColorEffect);
-						}
-						else
-						{
-							this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, ((u16 *)vramColorPtr)[dstX], this->_sprAlpha[srcX], enableColorEffect);
-						}
-					}
-				}
-				
-				vramColorPtr = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)vramColorPtr + compInfo.line.widthCustom) : (void *)((u16 *)vramColorPtr + compInfo.line.widthCustom); // advance source and destination by one custom-width line
-				dstColorPtr = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? (void *)((u16 *)dstColorPtr + compInfo.line.widthCustom) : (void *)((FragmentColor *)dstColorPtr + compInfo.line.widthCustom);
-				dstLayerIDPtr += compInfo.line.widthCustom;
-			}
-		}
-		else
-		{
-			for (size_t line = 0; line < compInfo.line.renderCount; line++)
-			{
-				compInfo.target.lineColor16 = (u16 *)dstColorPtr;
-				compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr;
-				compInfo.target.lineLayerID = dstLayerIDPtr;
-				
-				for (size_t i = 0; i < item->nbPixelsX; i++)
-				{
-					const size_t srcX = item->PixelsX[i];
-					
-					if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
-					{
-						continue;
-					}
-					
-					compInfo.target.xNative = srcX;
-					compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
-					
-					for (size_t p = 0; p < _gpuDstPitchCount[srcX]; p++)
-					{
-						const size_t dstX = compInfo.target.xCustom + p;
-						
-						compInfo.target.lineColor16 = (u16 *)dstColorPtr + dstX;
-						compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr + dstX;
-						compInfo.target.lineLayerID = dstLayerIDPtr + dstX;
-						
-						const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
-						this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect); // the same native sprite pixel is repeated for every expanded output pixel
-					}
-				}
-				
-				dstColorPtr = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? (void *)((u16 *)dstColorPtr + compInfo.line.widthCustom) : (void *)((FragmentColor *)dstColorPtr + compInfo.line.widthCustom);
-				dstLayerIDPtr += compInfo.line.widthCustom;
-			}
-		}
-	}
-}
-
-// Copies the per-scanline master brightness mode and intensity of this engine into
-// mutableInfo (indexed by this->_targetDisplayID) and derives two summary flags:
-//  - needApplyMasterBrightness: some line uses the Up or Down mode with a nonzero intensity
-//  - masterBrightnessDiffersPerLine: some line's mode or intensity differs from line 0
-void GPUEngineBase::UpdateMasterBrightnessDisplayInfo(NDSDisplayInfo &mutableInfo)
-{
-	const GPUEngineCompositorInfo &firstLineInfo = this->_currentCompositorInfo[0];
-	bool anyLineNeedsBrightness = false;
-	bool differsFromFirstLine = false;
-	
-	for (size_t scanline = 0; scanline < GPU_FRAMEBUFFER_NATIVE_HEIGHT; scanline++)
-	{
-		const GPUEngineCompositorInfo &lineInfo = this->_currentCompositorInfo[scanline];
-		
-		// Publish this line's settings.
-		mutableInfo.masterBrightnessMode[this->_targetDisplayID][scanline] = lineInfo.renderState.masterBrightnessMode;
-		mutableInfo.masterBrightnessIntensity[this->_targetDisplayID][scanline] = lineInfo.renderState.masterBrightnessIntensity;
-		
-		// Each flag is sticky: once it has been raised, later lines are not examined for it.
-		if (!anyLineNeedsBrightness)
-		{
-			const bool modeIsUpOrDown = (lineInfo.renderState.masterBrightnessMode == GPUMasterBrightMode_Up) ||
-			                            (lineInfo.renderState.masterBrightnessMode == GPUMasterBrightMode_Down);
-			anyLineNeedsBrightness = (lineInfo.renderState.masterBrightnessIntensity != 0) && modeIsUpOrDown;
-		}
-		
-		if (!differsFromFirstLine)
-		{
-			differsFromFirstLine = (lineInfo.renderState.masterBrightnessMode != firstLineInfo.renderState.masterBrightnessMode) ||
-			                       (lineInfo.renderState.masterBrightnessIntensity != firstLineInfo.renderState.masterBrightnessIntensity);
-		}
-	}
-	
-	mutableInfo.needApplyMasterBrightness[this->_targetDisplayID] = anyLineNeedsBrightness;
-	mutableInfo.masterBrightnessDiffersPerLine[this->_targetDisplayID] = differsFromFirstLine;
-}
-
-// Applies the master brightness settings recorded in displayInfo to this engine's target
-// display buffer.
-//
-// When every scanline shares the same settings, the whole rendered buffer is processed in
-// a single call using the values of scanline 0, which is faster than going line by line.
-//
-// When the settings differ between scanlines, each native scanline is processed on its own.
-// Mega Man Zero Collection is an example: it sets the intensity to 31 on line 0, 0 on
-// line 16, and back to 31 on line 176 in order to blank the scanlines that its original
-// GBA games do not use.
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::ApplyMasterBrightness(const NDSDisplayInfo &displayInfo)
-{
-	const size_t displayID = this->_targetDisplayID;
-	
-	if (!displayInfo.masterBrightnessDiffersPerLine[displayID])
-	{
-		// Uniform settings: one pass over the entire rendered buffer.
-		this->ApplyMasterBrightness<OUTPUTFORMAT, false>(displayInfo.renderedBuffer[displayID],
-														 displayInfo.renderedWidth[displayID] * displayInfo.renderedHeight[displayID],
-														 (GPUMasterBrightMode)displayInfo.masterBrightnessMode[displayID][0],
-														 displayInfo.masterBrightnessIntensity[displayID][0]);
-		return;
-	}
-	
-	// Varying settings: process the pixel block that belongs to each native scanline.
-	for (size_t l = 0; l < GPU_FRAMEBUFFER_NATIVE_HEIGHT; l++)
-	{
-		const GPUEngineCompositorInfo &lineInfo = this->_currentCompositorInfo[l];
-		void *lineDst;
-		size_t linePixCount;
-		
-		if (!displayInfo.didPerformCustomRender[displayID])
-		{
-			lineDst = (u8 *)displayInfo.nativeBuffer[displayID] + (lineInfo.line.blockOffsetNative * displayInfo.pixelBytes);
-			linePixCount = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-		}
-		else
-		{
-			lineDst = (u8 *)displayInfo.customBuffer[displayID] + (lineInfo.line.blockOffsetCustom * displayInfo.pixelBytes);
-			linePixCount = lineInfo.line.pixelCount;
-		}
-		
-		this->ApplyMasterBrightness<OUTPUTFORMAT, false>(lineDst,
-														 linePixCount,
-														 (GPUMasterBrightMode)displayInfo.masterBrightnessMode[displayID][l],
-														 displayInfo.masterBrightnessIntensity[displayID][l]);
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT, bool ISFULLINTENSITYHINT> // Applies master brightness in place to pixCount pixels starting at dst.
-void GPUEngineBase::ApplyMasterBrightness(void *dst, const size_t pixCount, const GPUMasterBrightMode mode, const u8 intensity) // dst is accessed as u16 for BGR555 output and as FragmentColor for BGR666/BGR888 output.
-{
-	if (!ISFULLINTENSITYHINT && (intensity == 0)) return; // Zero intensity changes nothing; this shortcut is skipped when the full-intensity hint is set.
-	// Any intensity of 16 or more is handled as full intensity; partial-intensity paths use the value clamped to 16.
-	const bool isFullIntensity = (intensity >= 16);
-	const u8 intensityClamped = (isFullIntensity) ? 16 : intensity;
-	// Only the Up and Down modes modify the buffer; Disable and Reserved leave it untouched.
-	switch (mode)
-	{
-		case GPUMasterBrightMode_Disable:
-			break;
-			
-		case GPUMasterBrightMode_Up:
-		{
-			if (!ISFULLINTENSITYHINT && !isFullIntensity) // Partial intensity: brighten every pixel individually.
-			{
-				size_t i = 0; // Shared by the vector loop and the scalar loop, so the scalar loop resumes where the vector loop stopped.
-				
-				switch (OUTPUTFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-					{
-#ifdef ENABLE_SSE2
-						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
-						// Vector loop: 8 pixels per iteration using aligned loads/stores (dst is presumably 16-byte aligned -- confirm with callers).
-						const size_t ssePixCount = pixCount - (pixCount % 8);
-						for (; i < ssePixCount; i += 8)
-						{
-							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
-							dstColor_vec128 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
-							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi16(0x8000)); // Force bit 15 on in every output pixel.
-							_mm_store_si128((__m128i *)((u16 *)dst + i), dstColor_vec128);
-						}
-#endif
-						// Scalar loop: handles the remaining pixels (or all of them without SSE2) through the lookup table.
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-						for (; i < pixCount; i++)
-						{
-							((u16 *)dst)[i] = GPUEngineBase::_brightnessUpTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; // Look up by the low 15 bits, then force bit 15 on.
-						}
-						break;
-					}
-						
-					case NDSColorFormat_BGR666_Rev:
-					case NDSColorFormat_BGR888_Rev:
-					{
-#ifdef ENABLE_SSE2
-						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
-						// Vector loop: 4 FragmentColor pixels per iteration using aligned loads/stores.
-						const size_t ssePixCount = pixCount - (pixCount % 4);
-						for (; i < ssePixCount; i += 4)
-						{
-							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((FragmentColor *)dst + i));
-							dstColor_vec128 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
-							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000)); // OR the top byte with 0x1F (BGR666) or 0xFF (BGR888).
-							_mm_store_si128((__m128i *)((FragmentColor *)dst + i), dstColor_vec128);
-						}
-#endif
-						// Scalar loop: handles the remaining pixels (or all of them without SSE2).
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-						for (; i < pixCount; i++)
-						{
-							((FragmentColor *)dst)[i] = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(((FragmentColor *)dst)[i], intensityClamped);
-							((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF; // Alpha is overwritten with 0x1F (BGR666) or 0xFF (BGR888).
-						}
-						break;
-					}
-						
-					default:
-						break;
-				}
-			}
-			else
-			{
-				// Full intensity (or the full-intensity hint is set): every pixel becomes white, so just fill the buffer.
-				switch (OUTPUTFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-						memset_u16(dst, 0xFFFF, pixCount);
-						break;
-						
-					case NDSColorFormat_BGR666_Rev:
-						memset_u32(dst, 0x1F3F3F3F, pixCount);
-						break;
-						
-					case NDSColorFormat_BGR888_Rev:
-						memset_u32(dst, 0xFFFFFFFF, pixCount);
-						break;
-						
-					default:
-						break;
-				}
-			}
-			break;
-		}
-			
-		case GPUMasterBrightMode_Down:
-		{
-			if (!ISFULLINTENSITYHINT && !isFullIntensity) // Partial intensity: darken every pixel individually.
-			{
-				size_t i = 0; // Shared by the vector loop and the scalar loop, so the scalar loop resumes where the vector loop stopped.
-				
-				switch (OUTPUTFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-					{
-#ifdef ENABLE_SSE2
-						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
-						// Vector loop: 8 pixels per iteration using aligned loads/stores (dst is presumably 16-byte aligned -- confirm with callers).
-						const size_t ssePixCount = pixCount - (pixCount % 8);
-						for (; i < ssePixCount; i += 8)
-						{
-							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
-							dstColor_vec128 = this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
-							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi16(0x8000)); // Force bit 15 on in every output pixel.
-							_mm_store_si128((__m128i *)((u16 *)dst + i), dstColor_vec128);
-						}
-#endif
-						// Scalar loop: handles the remaining pixels (or all of them without SSE2) through the lookup table.
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-						for (; i < pixCount; i++)
-						{
-							((u16 *)dst)[i] = GPUEngineBase::_brightnessDownTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; // Look up by the low 15 bits, then force bit 15 on.
-						}
-						break;
-					}
-						
-					case NDSColorFormat_BGR666_Rev:
-					case NDSColorFormat_BGR888_Rev:
-					{
-#ifdef ENABLE_SSE2
-						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
-						// Vector loop: 4 FragmentColor pixels per iteration using aligned loads/stores.
-						const size_t ssePixCount = pixCount - (pixCount % 4);
-						for (; i < ssePixCount; i += 4)
-						{
-							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((FragmentColor *)dst + i));
-							dstColor_vec128 = this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
-							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000)); // OR the top byte with 0x1F (BGR666) or 0xFF (BGR888).
-							_mm_store_si128((__m128i *)((FragmentColor *)dst + i), dstColor_vec128);
-						}
-#endif
-						// Scalar loop: handles the remaining pixels (or all of them without SSE2).
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-						for (; i < pixCount; i++)
-						{
-							((FragmentColor *)dst)[i] = this->_ColorEffectDecreaseBrightness(((FragmentColor *)dst)[i], intensityClamped); // NOTE(review): unlike the Up path, no OUTPUTFORMAT template argument is passed here -- confirm that is intended.
-							((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF; // Alpha is overwritten with 0x1F (BGR666) or 0xFF (BGR888).
-						}
-						break;
-					}
-						
-					default:
-						break;
-				}
-			}
-			else
-			{
-				// Full intensity (or the full-intensity hint is set): every pixel becomes black, so just fill the buffer.
-				switch (OUTPUTFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-						memset_u16(dst, 0x8000, pixCount);
-						break;
-						
-					case NDSColorFormat_BGR666_Rev:
-						memset_u32(dst, 0x1F000000, pixCount);
-						break;
-						
-					case NDSColorFormat_BGR888_Rev:
-						memset_u32(dst, 0xFF000000, pixCount);
-						break;
-						
-					default:
-						break;
-				}
-			}
-			break;
-		}
-			
-		case GPUMasterBrightMode_Reserved:
-			break;
-	}
-}
-
-// Reports whether window WIN_NUM (0 or 1) covers the native scanline described by compInfo.
-// Returns false when that window is disabled. When the window's top register is greater
-// than its bottom register, the covered range wraps around: only the lines strictly
-// between bottom and top are excluded. Otherwise the covered range is [top, bottom).
-template <size_t WIN_NUM>
-bool GPUEngineBase::_IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo)
-{
-	const u16 top    = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0V.Top : this->_IORegisterMap->WIN1V.Top;
-	const u16 bottom = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0V.Bottom : this->_IORegisterMap->WIN1V.Bottom;
-	
-	if ((WIN_NUM == 0) && !compInfo.renderState.WIN0_ENABLED)
-	{
-		return false;
-	}
-	
-	if ((WIN_NUM == 1) && !compInfo.renderState.WIN1_ENABLED)
-	{
-		return false;
-	}
-	
-	if (top > bottom)
-	{
-		// Wrapped range: the line is covered unless it lies strictly between bottom and top.
-		return (compInfo.line.indexNative >= top) || (compInfo.line.indexNative <= bottom);
-	}
-	
-	// Regular range: top is inclusive, bottom is exclusive.
-	return (compInfo.line.indexNative >= top) && (compInfo.line.indexNative < bottom);
-}
-
-// Rebuilds the per-pixel horizontal coverage mask of window WIN_NUM (0 or 1) from the
-// window's Left/Right registers, and clears the window's pending-update flag. Each mask
-// entry is 1 where the pixel column is inside the window and 0 where it is outside.
-//
-// The mask encodes this per-column test for column x:
-//   Left >  Right : outside when (x < Left) && (x > Right)
-//   Left <= Right : outside when (x < Left) || (x >= Right)
-//
-// Nothing is done, and the pending-update flag is left alone, if the window is disabled.
-template <size_t WIN_NUM>
-void GPUEngineBase::_UpdateWINH(GPUEngineCompositorInfo &compInfo)
-{
-	if ((WIN_NUM == 0) && !compInfo.renderState.WIN0_ENABLED)
-	{
-		return;
-	}
-	
-	if ((WIN_NUM == 1) && !compInfo.renderState.WIN1_ENABLED)
-	{
-		return;
-	}
-	
-	this->_needUpdateWINH[WIN_NUM] = false;
-	
-	const size_t left  = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0H.Left  : this->_IORegisterMap->WIN1H.Left;
-	const size_t right = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0H.Right : this->_IORegisterMap->WIN1H.Right;
-	const bool isWrapped = (left > right);
-	
-	// Start from "everything inside" for a wrapped window, "everything outside" otherwise.
-	memset(this->_h_win[WIN_NUM], (isWrapped) ? 1 : 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
-	
-	if (isWrapped)
-	{
-		// Carve out the columns strictly between Right and Left.
-		memset(this->_h_win[WIN_NUM] + right + 1, 0, (left - (right + 1)) * sizeof(u8));
-	}
-	else
-	{
-		// Mark the columns in [Left, Right).
-		memset(this->_h_win[WIN_NUM] + left, 1, (right - left) * sizeof(u8));
-	}
-}
-
-void GPUEngineBase::_PerformWindowTesting(GPUEngineCompositorInfo &compInfo) // Computes, for each enabled layer, which pixels of the current scanline pass the window test and which may use color effects.
-{
-	if (this->_needUpdateWINH[0]) this->_UpdateWINH<0>(compInfo); // Refresh the horizontal window masks only if they were flagged as needing an update.
-	if (this->_needUpdateWINH[1]) this->_UpdateWINH<1>(compInfo);
-	// Results are written per layer for BG0 through OBJ; layers that are not enabled are skipped.
-	for (size_t layerID = GPULayerID_BG0; layerID <= GPULayerID_OBJ; layerID++)
-	{
-		if (!this->_enableLayer[layerID])
-		{
-			continue;
-		}
-		
-#ifdef ENABLE_SSE2
-		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=16) // SSE2 path: 16 pixels (one byte each) per iteration.
-		{
-			__m128i win_vec128;
-			// Accumulated results for these 16 pixels; each byte ends up all-ones or zero.
-			__m128i didPassWindowTest = _mm_setzero_si128();
-			__m128i enableColorEffect = _mm_setzero_si128();
-			// Per-window masks of the pixels each window has claimed; a lower-priority window only claims pixels not already claimed.
-			__m128i win0HandledMask = _mm_setzero_si128();
-			__m128i win1HandledMask = _mm_setzero_si128();
-			__m128i winOBJHandledMask = _mm_setzero_si128();
-			__m128i winOUTHandledMask = _mm_setzero_si128();
-			
-			// Window 0 has the highest priority, so always check this first.
-			if (compInfo.renderState.WIN0_ENABLED && this->_IsWindowInsideVerticalRange<0>(compInfo))
-			{
-				win_vec128 = _mm_load_si128((__m128i *)(this->_h_win[0] + i));
-				win0HandledMask = _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1));
-				
-				didPassWindowTest = _mm_and_si128(win0HandledMask, compInfo.renderState.WIN0_enable_SSE2[layerID]);
-				enableColorEffect = _mm_and_si128(win0HandledMask, compInfo.renderState.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]);
-			}
-			
-			// Window 1 has medium priority, and is checked after Window 0.
-			if (compInfo.renderState.WIN1_ENABLED && this->_IsWindowInsideVerticalRange<1>(compInfo))
-			{
-				win_vec128 = _mm_load_si128((__m128i *)(this->_h_win[1] + i));
-				win1HandledMask = _mm_andnot_si128(win0HandledMask, _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1))); // Inside Window 1 and not already claimed by Window 0.
-				
-				didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(win1HandledMask, compInfo.renderState.WIN1_enable_SSE2[layerID]) );
-				enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(win1HandledMask, compInfo.renderState.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) );
-			}
-			
-			// Window OBJ has low priority, and is checked after both Window 0 and Window 1.
-			if (compInfo.renderState.WINOBJ_ENABLED)
-			{
-				win_vec128 = _mm_load_si128((__m128i *)(this->_sprWin + i));
-				winOBJHandledMask = _mm_andnot_si128( _mm_or_si128(win0HandledMask, win1HandledMask), _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)) ); // Inside the OBJ window and not claimed by Window 0 or Window 1.
-				
-				didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOBJHandledMask, compInfo.renderState.WINOBJ_enable_SSE2[layerID]) );
-				enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOBJHandledMask, compInfo.renderState.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) );
-			}
-			
-			// If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags.
-			// This has the lowest priority, and is always checked last.
-			winOUTHandledMask = _mm_xor_si128( _mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi32(0xFFFFFFFF) ); // Bitwise NOT of the union of all claimed pixels.
-			didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOUTHandledMask, compInfo.renderState.WINOUT_enable_SSE2[layerID]) );
-			enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOUTHandledMask, compInfo.renderState.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) );
-			// Reduce each all-ones byte to 1 before storing, so the stored per-pixel flags are 0 or 1.
-			_mm_store_si128((__m128i *)(this->_didPassWindowTestNative[layerID] + i), _mm_and_si128(didPassWindowTest, _mm_set1_epi8(0x01)));
-			_mm_store_si128((__m128i *)(this->_enableColorEffectNative[layerID] + i), _mm_and_si128(enableColorEffect, _mm_set1_epi8(0x01)));
-		}
-#else
-		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) // Scalar path: one pixel per iteration; the first window containing the pixel decides both flags.
-		{
-			// Window 0 has the highest priority, so always check this first.
-			if (compInfo.renderState.WIN0_ENABLED && this->_IsWindowInsideVerticalRange<0>(compInfo))
-			{
-				if (this->_h_win[0][i] != 0)
-				{
-					this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WIN0_enable[layerID];
-					this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG];
-					continue;
-				}
-			}
-			
-			// Window 1 has medium priority, and is checked after Window 0.
-			if (compInfo.renderState.WIN1_ENABLED && this->_IsWindowInsideVerticalRange<1>(compInfo))
-			{
-				if (this->_h_win[1][i] != 0)
-				{
-					this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WIN1_enable[layerID];
-					this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG];
-					continue;
-				}
-			}
-			
-			// Window OBJ has low priority, and is checked after both Window 0 and Window 1.
-			if (compInfo.renderState.WINOBJ_ENABLED)
-			{
-				if (this->_sprWin[i] != 0)
-				{
-					this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WINOBJ_enable[layerID];
-					this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG];
-					continue;
-				}
-			}
-			
-			// If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags.
-			// This has the lowest priority, and is always checked last.
-			this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WINOUT_enable[layerID];
-			this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG];
-		}
-#endif
-		if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 1)) // Copy the native-width results into the custom-width buffers, choosing a CopyLineExpand instantiation by the custom line width.
-		{
-			CopyLineExpand<1, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH);
-			CopyLineExpand<1, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH);
-		}
-		else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2))
-		{
-			CopyLineExpand<2, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
-			CopyLineExpand<2, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
-		}
-		else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3))
-		{
-			CopyLineExpand<3, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
-			CopyLineExpand<3, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
-		}
-		else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4))
-		{
-			CopyLineExpand<4, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
-			CopyLineExpand<4, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
-		}
-		else if ((compInfo.line.widthCustom % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0) // Any other whole multiple of the native width; presumably 0xFFFF selects a generic integer-scale path -- confirm against CopyLineExpand.
-		{
-			CopyLineExpand<0xFFFF, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], compInfo.line.widthCustom);
-			CopyLineExpand<0xFFFF, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], compInfo.line.widthCustom);
-		}
-		else // Width is not a whole multiple of the native width; presumably -1 selects an arbitrary-scale path -- confirm against CopyLineExpand.
-		{
-			CopyLineExpand<-1, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], compInfo.line.widthCustom);
-			CopyLineExpand<-1, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], compInfo.line.widthCustom);
-		}
-	}
-}
-
-// Renders the currently selected BG layer for one scanline by dispatching on the layer's
-// base type, then, when compositing was not done pixel-by-pixel during rendering,
-// composites the whole line in a second step.
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo)
-{
-	// Only the extended-rotation renderer receives this flag; it stays false for the other types.
-	bool useCustomVRAM = false;
-	
-	if (WILLDEFERCOMPOSITING)
-	{
-		// A deferred line is not guaranteed to write every pixel. Clearing the deferred
-		// index buffer first lets unwritten pixels fail properly in _CompositeLineDeferred()
-		// instead of picking up stale indices left behind by a previously rendered layer.
-		//
-		// Test case: The score screen in Sonic Rush will be taken over by BG2, filling
-		// the screen with blue, unless this initialization is done each time.
-		memset(this->_deferredIndexNative, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
-	}
-	
-	switch (compInfo.renderState.selectedBGLayer->baseType)
-	{
-		case BGType_Text:
-			this->_LineText<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo);
-			break;
-			
-		case BGType_Affine:
-			this->_LineRot<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo);
-			break;
-			
-		// Both of these types go through the extended-rotation renderer.
-		case BGType_AffineExt:
-		case BGType_Large8bpp:
-			this->_LineExtRot<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, useCustomVRAM);
-			break;
-			
-		case BGType_Invalid:
-			PROGINFO("Attempting to render an invalid BG type\n");
-			break;
-			
-		default:
-			break;
-	}
-	
-	// The debug compositor never performs the deferred compositing step.
-	if (COMPOSITORMODE == GPUCompositorMode_Debug)
-	{
-		return;
-	}
-	
-	// At the native size, each pixel is composited immediately while rendering. At a custom
-	// size, pixel gathering and pixel compositing are separate steps, so the entire line
-	// must be composited now.
-	const bool needsLineComposite = WILLDEFERCOMPOSITING ||
-	                                !this->isLineRenderNative[compInfo.line.indexNative] ||
-	                                (useCustomVRAM && (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) && !GPU->GetDisplayInfo().isCustomSizeRequested);
-	if (!needsLineComposite)
-	{
-		return;
-	}
-	
-	if (useCustomVRAM)
-	{
-		this->_CompositeVRAMLineDeferred<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo);
-	}
-	else
-	{
-		this->_CompositeLineDeferred<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo);
-	}
-}
-
-// Selects the deferred-compositing template variant of _RenderLine_LayerBG_Final for the
-// current scanline: lines that are not rendered at the native size defer compositing,
-// native lines do not.
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
-FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo)
-{
-	if (!this->isLineRenderNative[compInfo.line.indexNative])
-	{
-		this->_RenderLine_LayerBG_Final<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, true>(compInfo);
-		return;
-	}
-	
-	this->_RenderLine_LayerBG_Final<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, false>(compInfo);
-}
-
-// Selects the mosaic template variant for the currently selected BG layer. The mosaic
-// variant is used only when the layer has mosaic turned on and a BG mosaic size is set.
-// When DISABLE_MOSAIC is defined, the non-mosaic variant is always used.
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
-FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo)
-{
-#ifndef DISABLE_MOSAIC
-	const bool useMosaic = compInfo.renderState.selectedBGLayer->isMosaic && compInfo.renderState.isBGMosaicSet;
-	if (useMosaic)
-	{
-		this->_RenderLine_LayerBG_ApplyMosaic<COMPOSITORMODE, OUTPUTFORMAT, true, WILLPERFORMWINDOWTEST>(compInfo);
-		return;
-	}
-#endif
-	
-	this->_RenderLine_LayerBG_ApplyMosaic<COMPOSITORMODE, OUTPUTFORMAT, false, WILLPERFORMWINDOWTEST>(compInfo);
-}
-
-// Renders every line of BG layer layerID with the debug compositor (BGR555 output, no
-// mosaic, no window test, no deferred compositing). For each line, the color target is
-// pointed at dstColorBuffer plus (line index * layer width), so dstColorBuffer is
-// presumably expected to hold layer width * layer height u16 pixels -- confirm with callers.
-void GPUEngineBase::RenderLayerBG(const GPULayerID layerID, u16 *dstColorBuffer)
-{
-	// Start from an all-zero compositor state and fill in only what this path needs.
-	GPUEngineCompositorInfo compInfo;
-	memset(&compInfo, 0, sizeof(compInfo));
-	
-	// Render state: color effects and master brightness are disabled. Sprite settings are
-	// taken from the engine's current render state.
-	compInfo.renderState.displayOutputMode = GPUDisplayMode_Normal;
-	compInfo.renderState.selectedLayerID = layerID;
-	compInfo.renderState.selectedBGLayer = &this->_BGLayer[layerID];
-	compInfo.renderState.colorEffect = ColorEffect_Disable;
-	compInfo.renderState.masterBrightnessMode = GPUMasterBrightMode_Disable;
-	compInfo.renderState.masterBrightnessIsFullIntensity = false;
-	compInfo.renderState.masterBrightnessIsMaxOrMin = true;
-	compInfo.renderState.spriteRenderMode = this->_currentRenderState.spriteRenderMode;
-	compInfo.renderState.spriteBoundary = this->_currentRenderState.spriteBoundary;
-	compInfo.renderState.spriteBMPBoundary = this->_currentRenderState.spriteBMPBoundary;
-	
-	// The output "line" is as wide as the layer itself.
-	const size_t bgWidth = compInfo.renderState.selectedBGLayer->size.width;
-	const size_t bgHeight = compInfo.renderState.selectedBGLayer->size.height;
-	compInfo.line.widthCustom = bgWidth;
-	compInfo.line.renderCount = 1;
-	
-	// No layer ID buffers are used on this path.
-	compInfo.target.lineLayerIDHead = NULL;
-	compInfo.target.lineLayerIDHeadNative = NULL;
-	compInfo.target.lineLayerIDHeadCustom = NULL;
-	
-	compInfo.target.xNative = 0;
-	compInfo.target.xCustom = compInfo.target.xNative;
-	compInfo.target.lineColor = (void **)&compInfo.target.lineColor16;
-	// lineColorHeadNative has not been assigned yet, so these two start out zeroed.
-	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative;
-	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative;
-	compInfo.target.lineLayerID = NULL;
-	
-	for (size_t y = 0; y < bgHeight; y++)
-	{
-		// Native and custom line coordinates are identical here.
-		compInfo.line.indexNative = y;
-		compInfo.line.indexCustom = compInfo.line.indexNative;
-		compInfo.line.pixelCount = bgWidth;
-		compInfo.line.blockOffsetNative = compInfo.line.indexNative * bgWidth;
-		compInfo.line.blockOffsetCustom = compInfo.line.blockOffsetNative;
-		
-		// Point all color heads at the start of this line in the caller's buffer.
-		compInfo.target.lineColorHead = (u16 *)dstColorBuffer + compInfo.line.blockOffsetNative;
-		compInfo.target.lineColorHeadNative = compInfo.target.lineColorHead;
-		compInfo.target.lineColorHeadCustom = compInfo.target.lineColorHeadNative;
-		
-		this->_RenderLine_LayerBG_Final<GPUCompositorMode_Debug, NDSColorFormat_BGR555_Rev, false, false, false>(compInfo);
-	}
-}
-
-// Handles the "display off" mode for native scanline l by clearing that line of the
-// native buffer to white in the requested output format. Native rendering only.
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::_HandleDisplayModeOff(const size_t l)
-{
-	const size_t lineStart = l * GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u16 *)this->nativeBuffer + lineStart, 0xFFFF);
-	}
-	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
-	{
-		memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + lineStart, 0x1F3F3F3F);
-	}
-	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + lineStart, 0xFFFFFFFF);
-	}
-}
-
-// Handles the normal display mode for native scanline l. If the line was not rendered at
-// the native size, its output is marked as non-native and the count of native output
-// lines is decremented. Lines rendered natively need no further work here.
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::_HandleDisplayModeNormal(const size_t l)
-{
-	if (this->isLineRenderNative[l])
-	{
-		return;
-	}
-	
-	this->isLineOutputNative[l] = false;
-	this->nativeLineOutputCount--;
-}
-
-// Flags the horizontal mask of window WINNUM as needing an update. The mask itself is
-// rebuilt later, when _PerformWindowTesting() sees the flag and calls _UpdateWINH().
-template <size_t WINNUM>
-void GPUEngineBase::ParseReg_WINnH()
-{
-	_needUpdateWINH[WINNUM] = true;
-}
-
-// Caches the WIN0IN and WIN1IN register bits in the current render state: one flag per
-// layer (BG0-BG3, OBJ) plus the color effect flag for each window. With SSE2 enabled, each
-// flag is also stored as a vector whose bytes are all 0xFF (flag set) or 0x00 (flag clear).
-void GPUEngineBase::ParseReg_WININ()
-{
-	GPUEngineRenderState &rs = this->_currentRenderState;
-	
-	// Window 0 flags.
-	rs.WIN0_enable[GPULayerID_BG0] = this->_IORegisterMap->WIN0IN.BG0_Enable;
-	rs.WIN0_enable[GPULayerID_BG1] = this->_IORegisterMap->WIN0IN.BG1_Enable;
-	rs.WIN0_enable[GPULayerID_BG2] = this->_IORegisterMap->WIN0IN.BG2_Enable;
-	rs.WIN0_enable[GPULayerID_BG3] = this->_IORegisterMap->WIN0IN.BG3_Enable;
-	rs.WIN0_enable[GPULayerID_OBJ] = this->_IORegisterMap->WIN0IN.OBJ_Enable;
-	rs.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WIN0IN.Effect_Enable;
-	
-	// Window 1 flags.
-	rs.WIN1_enable[GPULayerID_BG0] = this->_IORegisterMap->WIN1IN.BG0_Enable;
-	rs.WIN1_enable[GPULayerID_BG1] = this->_IORegisterMap->WIN1IN.BG1_Enable;
-	rs.WIN1_enable[GPULayerID_BG2] = this->_IORegisterMap->WIN1IN.BG2_Enable;
-	rs.WIN1_enable[GPULayerID_BG3] = this->_IORegisterMap->WIN1IN.BG3_Enable;
-	rs.WIN1_enable[GPULayerID_OBJ] = this->_IORegisterMap->WIN1IN.OBJ_Enable;
-	rs.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WIN1IN.Effect_Enable;
-	
-#if defined(ENABLE_SSE2)
-	// Byte masks built from the scalar flags stored above.
-	rs.WIN0_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((rs.WIN0_enable[GPULayerID_BG0] != 0) ? 0xFF : 0x00);
-	rs.WIN0_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((rs.WIN0_enable[GPULayerID_BG1] != 0) ? 0xFF : 0x00);
-	rs.WIN0_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((rs.WIN0_enable[GPULayerID_BG2] != 0) ? 0xFF : 0x00);
-	rs.WIN0_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((rs.WIN0_enable[GPULayerID_BG3] != 0) ? 0xFF : 0x00);
-	rs.WIN0_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((rs.WIN0_enable[GPULayerID_OBJ] != 0) ? 0xFF : 0x00);
-	rs.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((rs.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] != 0) ? 0xFF : 0x00);
-	
-	rs.WIN1_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((rs.WIN1_enable[GPULayerID_BG0] != 0) ? 0xFF : 0x00);
-	rs.WIN1_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((rs.WIN1_enable[GPULayerID_BG1] != 0) ? 0xFF : 0x00);
-	rs.WIN1_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((rs.WIN1_enable[GPULayerID_BG2] != 0) ? 0xFF : 0x00);
-	rs.WIN1_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((rs.WIN1_enable[GPULayerID_BG3] != 0) ? 0xFF : 0x00);
-	rs.WIN1_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((rs.WIN1_enable[GPULayerID_OBJ] != 0) ? 0xFF : 0x00);
-	rs.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((rs.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] != 0) ? 0xFF : 0x00);
-#endif
-}
-
-// Caches the WINOUT and WINOBJ register bits in the current render state: one flag per
-// layer (BG0-BG3, OBJ) plus the color effect flag for each of the two. With SSE2 enabled,
-// each flag is also stored as a vector whose bytes are all 0xFF (set) or 0x00 (clear).
-void GPUEngineBase::ParseReg_WINOUT()
-{
-	GPUEngineRenderState &rs = this->_currentRenderState;
-	
-	// Flags for pixels outside every window.
-	rs.WINOUT_enable[GPULayerID_BG0] = this->_IORegisterMap->WINOUT.BG0_Enable;
-	rs.WINOUT_enable[GPULayerID_BG1] = this->_IORegisterMap->WINOUT.BG1_Enable;
-	rs.WINOUT_enable[GPULayerID_BG2] = this->_IORegisterMap->WINOUT.BG2_Enable;
-	rs.WINOUT_enable[GPULayerID_BG3] = this->_IORegisterMap->WINOUT.BG3_Enable;
-	rs.WINOUT_enable[GPULayerID_OBJ] = this->_IORegisterMap->WINOUT.OBJ_Enable;
-	rs.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WINOUT.Effect_Enable;
-	
-	// Flags for pixels inside the OBJ window.
-	rs.WINOBJ_enable[GPULayerID_BG0] = this->_IORegisterMap->WINOBJ.BG0_Enable;
-	rs.WINOBJ_enable[GPULayerID_BG1] = this->_IORegisterMap->WINOBJ.BG1_Enable;
-	rs.WINOBJ_enable[GPULayerID_BG2] = this->_IORegisterMap->WINOBJ.BG2_Enable;
-	rs.WINOBJ_enable[GPULayerID_BG3] = this->_IORegisterMap->WINOBJ.BG3_Enable;
-	rs.WINOBJ_enable[GPULayerID_OBJ] = this->_IORegisterMap->WINOBJ.OBJ_Enable;
-	rs.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] = this->_IORegisterMap->WINOBJ.Effect_Enable;
-	
-#if defined(ENABLE_SSE2)
-	// Byte masks built from the scalar flags stored above.
-	rs.WINOUT_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((rs.WINOUT_enable[GPULayerID_BG0] != 0) ? 0xFF : 0x00);
-	rs.WINOUT_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((rs.WINOUT_enable[GPULayerID_BG1] != 0) ? 0xFF : 0x00);
-	rs.WINOUT_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((rs.WINOUT_enable[GPULayerID_BG2] != 0) ? 0xFF : 0x00);
-	rs.WINOUT_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((rs.WINOUT_enable[GPULayerID_BG3] != 0) ? 0xFF : 0x00);
-	rs.WINOUT_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((rs.WINOUT_enable[GPULayerID_OBJ] != 0) ? 0xFF : 0x00);
-	rs.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((rs.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] != 0) ? 0xFF : 0x00);
-	
-	rs.WINOBJ_enable_SSE2[GPULayerID_BG0] = _mm_set1_epi8((rs.WINOBJ_enable[GPULayerID_BG0] != 0) ? 0xFF : 0x00);
-	rs.WINOBJ_enable_SSE2[GPULayerID_BG1] = _mm_set1_epi8((rs.WINOBJ_enable[GPULayerID_BG1] != 0) ? 0xFF : 0x00);
-	rs.WINOBJ_enable_SSE2[GPULayerID_BG2] = _mm_set1_epi8((rs.WINOBJ_enable[GPULayerID_BG2] != 0) ? 0xFF : 0x00);
-	rs.WINOBJ_enable_SSE2[GPULayerID_BG3] = _mm_set1_epi8((rs.WINOBJ_enable[GPULayerID_BG3] != 0) ? 0xFF : 0x00);
-	rs.WINOBJ_enable_SSE2[GPULayerID_OBJ] = _mm_set1_epi8((rs.WINOBJ_enable[GPULayerID_OBJ] != 0) ? 0xFF : 0x00);
-	rs.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = _mm_set1_epi8((rs.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] != 0) ? 0xFF : 0x00);
-#endif
-}
-
-// Caches the MOSAIC register in the current render state: the lookup-table entry for each
-// of the four mosaic size fields (BG/OBJ, horizontal/vertical), plus one flag each for BG
-// and OBJ that is set when either of that group's two size fields is non-zero.
-void GPUEngineBase::ParseReg_MOSAIC()
-{
-	GPUEngineRenderState &rs = this->_currentRenderState;
-	
-	const size_t bgSizeH  = this->_IORegisterMap->MOSAIC.BG_MosaicH;
-	const size_t bgSizeV  = this->_IORegisterMap->MOSAIC.BG_MosaicV;
-	const size_t objSizeH = this->_IORegisterMap->MOSAIC.OBJ_MosaicH;
-	const size_t objSizeV = this->_IORegisterMap->MOSAIC.OBJ_MosaicV;
-	
-	rs.mosaicWidthBG   = this->_mosaicLookup.table[bgSizeH];
-	rs.mosaicHeightBG  = this->_mosaicLookup.table[bgSizeV];
-	rs.mosaicWidthOBJ  = this->_mosaicLookup.table[objSizeH];
-	rs.mosaicHeightOBJ = this->_mosaicLookup.table[objSizeV];
-	
-	rs.isBGMosaicSet  = (bgSizeH != 0) || (bgSizeV != 0);
-	rs.isOBJMosaicSet = (objSizeH != 0) || (objSizeV != 0);
-}
-
-void GPUEngineBase::ParseReg_BLDCNT()
-{
-	const IOREG_BLDCNT &BLDCNT = this->_IORegisterMap->BLDCNT;
-	GPUEngineRenderState &renderState = this->_currentRenderState;
-	
-	renderState.colorEffect = (ColorEffect)BLDCNT.ColorEffect;
-	
-	renderState.srcBlendEnable[GPULayerID_BG0] = (BLDCNT.BG0_Target1 != 0);
-	renderState.srcBlendEnable[GPULayerID_BG1] = (BLDCNT.BG1_Target1 != 0);
-	renderState.srcBlendEnable[GPULayerID_BG2] = (BLDCNT.BG2_Target1 != 0);
-	renderState.srcBlendEnable[GPULayerID_BG3] = (BLDCNT.BG3_Target1 != 0);
-	renderState.srcBlendEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target1 != 0);
-	renderState.srcBlendEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target1 != 0);
-	
-	renderState.dstBlendEnable[GPULayerID_BG0] = (BLDCNT.BG0_Target2 != 0);
-	renderState.dstBlendEnable[GPULayerID_BG1] = (BLDCNT.BG1_Target2 != 0);
-	renderState.dstBlendEnable[GPULayerID_BG2] = (BLDCNT.BG2_Target2 != 0);
-	renderState.dstBlendEnable[GPULayerID_BG3] = (BLDCNT.BG3_Target2 != 0);
-	renderState.dstBlendEnable[GPULayerID_OBJ] = (BLDCNT.OBJ_Target2 != 0);
-	renderState.dstBlendEnable[GPULayerID_Backdrop] = (BLDCNT.Backdrop_Target2 != 0);
-	
-	renderState.dstAnyBlendEnable = renderState.dstBlendEnable[GPULayerID_BG0] ||
-	                                renderState.dstBlendEnable[GPULayerID_BG1] ||
-	                                renderState.dstBlendEnable[GPULayerID_BG2] ||
-	                                renderState.dstBlendEnable[GPULayerID_BG3] ||
-	                                renderState.dstBlendEnable[GPULayerID_OBJ] ||
-	                                renderState.dstBlendEnable[GPULayerID_Backdrop];
-	
-#ifdef ENABLE_SSE2
-	const __m128i one_vec128 = _mm_set1_epi8(1);
-	
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG0] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG0_Target1), one_vec128);
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG1] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG1_Target1), one_vec128);
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG2] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG2_Target1), one_vec128);
-	renderState.srcBlendEnable_SSE2[GPULayerID_BG3] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG3_Target1), one_vec128);
-	renderState.srcBlendEnable_SSE2[GPULayerID_OBJ] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.OBJ_Target1), one_vec128);
-	renderState.srcBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.Backdrop_Target1), one_vec128);
-	
-#ifdef ENABLE_SSSE3
-	renderState.dstBlendEnable_SSSE3 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-													BLDCNT.Backdrop_Target2,
-													BLDCNT.OBJ_Target2,
-													BLDCNT.BG3_Target2,
-													BLDCNT.BG2_Target2,
-													BLDCNT.BG1_Target2,
-													BLDCNT.BG0_Target2);
-#else
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG0] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG0_Target2), one_vec128);
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG1] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG1_Target2), one_vec128);
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG2] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG2_Target2), one_vec128);
-	renderState.dstBlendEnable_SSE2[GPULayerID_BG3] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.BG3_Target2), one_vec128);
-	renderState.dstBlendEnable_SSE2[GPULayerID_OBJ] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.OBJ_Target2), one_vec128);
-	renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_cmpeq_epi8(_mm_set1_epi8(BLDCNT.Backdrop_Target2), one_vec128);
-#endif
-	
-#endif // ENABLE_SSE2
-}
-
-void GPUEngineBase::ParseReg_BLDALPHA()
-{
-	const IOREG_BLDALPHA &BLDALPHA = this->_IORegisterMap->BLDALPHA;
-	GPUEngineRenderState &renderState = this->_currentRenderState;
-	
-	renderState.blendEVA = (BLDALPHA.EVA >= 16) ? 16 : BLDALPHA.EVA;
-	renderState.blendEVB = (BLDALPHA.EVB >= 16) ? 16 : BLDALPHA.EVB;
-	renderState.blendTable555 = (TBlendTable *)&GPUEngineBase::_blendTable555[renderState.blendEVA][renderState.blendEVB][0][0];
-}
-
-void GPUEngineBase::ParseReg_BLDY()
-{
-	const IOREG_BLDY &BLDY = this->_IORegisterMap->BLDY;
-	GPUEngineRenderState &renderState = this->_currentRenderState;
-	
-	renderState.blendEVY = (BLDY.EVY >= 16) ? 16 : BLDY.EVY;
-	renderState.brightnessUpTable555 = &GPUEngineBase::_brightnessUpTable555[renderState.blendEVY][0];
-	renderState.brightnessUpTable666 = &GPUEngineBase::_brightnessUpTable666[renderState.blendEVY][0];
-	renderState.brightnessUpTable888 = &GPUEngineBase::_brightnessUpTable888[renderState.blendEVY][0];
-	renderState.brightnessDownTable555 = &GPUEngineBase::_brightnessDownTable555[renderState.blendEVY][0];
-	renderState.brightnessDownTable666 = &GPUEngineBase::_brightnessDownTable666[renderState.blendEVY][0];
-	renderState.brightnessDownTable888 = &GPUEngineBase::_brightnessDownTable888[renderState.blendEVY][0];
-}
-
-const BGLayerInfo& GPUEngineBase::GetBGLayerInfoByID(const GPULayerID layerID)
-{
-	return this->_BGLayer[layerID];
-}
-
-NDSDisplayID GPUEngineBase::GetDisplayByID() const
-{
-	return this->_targetDisplayID;
-}
-
-void GPUEngineBase::SetDisplayByID(const NDSDisplayID theDisplayID)
-{
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	this->_targetDisplayID = theDisplayID;
-	
-	const size_t nativeFramebufferSize = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * dispInfo.pixelBytes;
-	const size_t customFramebufferSize = dispInfo.customWidth * dispInfo.customHeight * dispInfo.pixelBytes;
-	
-	this->nativeBuffer = (theDisplayID == NDSDisplayID_Main) ? dispInfo.masterNativeBuffer : (u8 *)dispInfo.masterNativeBuffer + nativeFramebufferSize;
-	this->customBuffer = (theDisplayID == NDSDisplayID_Main) ? dispInfo.masterCustomBuffer : (u8 *)dispInfo.masterCustomBuffer + customFramebufferSize;
-}
-
-GPUEngineID GPUEngineBase::GetEngineID() const
-{
-	return this->_engineID;
-}
-
-void GPUEngineBase::SetCustomFramebufferSize(size_t w, size_t h)
-{
-	void *oldWorkingLineColor = this->_internalRenderLineTargetCustom;
-	u8 *oldWorkingLineLayerID = this->_renderLineLayerIDCustom;
-	u8 *oldDeferredIndexCustom = this->_deferredIndexCustom;
-	u16 *oldDeferredColorCustom = this->_deferredColorCustom;
-	u8 *oldDidPassWindowTestCustomMasterPtr = this->_didPassWindowTestCustomMasterPtr;
-	
-	void *newWorkingLineColor = malloc_alignedCacheLine(w * _gpuLargestDstLineCount * GPU->GetDisplayInfo().pixelBytes);
-	u8 *newWorkingLineLayerID = (u8 *)malloc_alignedCacheLine(w * _gpuLargestDstLineCount * 4 * sizeof(u8)); // yes indeed, this is oversized. map debug tools try to write to it
-	u8 *newDeferredIndexCustom = (u8 *)malloc_alignedCacheLine(w * sizeof(u8));
-	u16 *newDeferredColorCustom = (u16 *)malloc_alignedCacheLine(w * sizeof(u16));
-	u8 *newDidPassWindowTestCustomMasterPtr = (u8 *)malloc_alignedCacheLine(w * 10 * sizeof(u8));
-	
-	this->_internalRenderLineTargetCustom = newWorkingLineColor;
-	this->_renderLineLayerIDCustom = newWorkingLineLayerID;
-	this->_deferredIndexCustom = newDeferredIndexCustom;
-	this->_deferredColorCustom = newDeferredColorCustom;
-	
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	const size_t nativeFramebufferSize = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * dispInfo.pixelBytes;
-	const size_t customFramebufferSize = w * h * dispInfo.pixelBytes;
-	
-	this->nativeBuffer   = (this->_targetDisplayID == NDSDisplayID_Main) ? dispInfo.masterNativeBuffer : (u8 *)dispInfo.masterNativeBuffer + nativeFramebufferSize;
-	this->customBuffer   = (this->_targetDisplayID == NDSDisplayID_Main) ? dispInfo.masterCustomBuffer : (u8 *)dispInfo.masterCustomBuffer + customFramebufferSize;
-	this->renderedBuffer = this->nativeBuffer;
-	this->renderedWidth  = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	this->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	
-	this->_didPassWindowTestCustomMasterPtr = newDidPassWindowTestCustomMasterPtr;
-	this->_didPassWindowTestCustom[GPULayerID_BG0] = this->_didPassWindowTestCustomMasterPtr + (0 * w * sizeof(u8));
-	this->_didPassWindowTestCustom[GPULayerID_BG1] = this->_didPassWindowTestCustomMasterPtr + (1 * w * sizeof(u8));
-	this->_didPassWindowTestCustom[GPULayerID_BG2] = this->_didPassWindowTestCustomMasterPtr + (2 * w * sizeof(u8));
-	this->_didPassWindowTestCustom[GPULayerID_BG3] = this->_didPassWindowTestCustomMasterPtr + (3 * w * sizeof(u8));
-	this->_didPassWindowTestCustom[GPULayerID_OBJ] = this->_didPassWindowTestCustomMasterPtr + (4 * w * sizeof(u8));
-	
-	this->_enableColorEffectCustomMasterPtr = newDidPassWindowTestCustomMasterPtr + (w * 5 * sizeof(u8));
-	this->_enableColorEffectCustom[GPULayerID_BG0] = this->_enableColorEffectCustomMasterPtr + (0 * w * sizeof(u8));
-	this->_enableColorEffectCustom[GPULayerID_BG1] = this->_enableColorEffectCustomMasterPtr + (1 * w * sizeof(u8));
-	this->_enableColorEffectCustom[GPULayerID_BG2] = this->_enableColorEffectCustomMasterPtr + (2 * w * sizeof(u8));
-	this->_enableColorEffectCustom[GPULayerID_BG3] = this->_enableColorEffectCustomMasterPtr + (3 * w * sizeof(u8));
-	this->_enableColorEffectCustom[GPULayerID_OBJ] = this->_enableColorEffectCustomMasterPtr + (4 * w * sizeof(u8));
-	
-	this->_needUpdateWINH[0] = true;
-	this->_needUpdateWINH[1] = true;
-	
-	for (size_t line = 0; line < GPU_FRAMEBUFFER_NATIVE_HEIGHT; line++)
-	{
-		GPUEngineLineInfo &lineInfo = this->_currentCompositorInfo[line].line;
-		
-		lineInfo.indexNative = line;
-		lineInfo.indexCustom = _gpuDstLineIndex[lineInfo.indexNative];
-		lineInfo.widthCustom = GPU->GetDisplayInfo().customWidth;
-		lineInfo.renderCount = _gpuDstLineCount[lineInfo.indexNative];
-		lineInfo.pixelCount = lineInfo.widthCustom * lineInfo.renderCount;
-		lineInfo.blockOffsetNative = lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH;
-		lineInfo.blockOffsetCustom = lineInfo.indexCustom * lineInfo.widthCustom;
-		
-		this->_currentCompositorInfo[line].target.lineColor = (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (void **)&this->_currentCompositorInfo[line].target.lineColor16 : (void **)&this->_currentCompositorInfo[line].target.lineColor32;
-	}
-	
-	free_aligned(oldWorkingLineColor);
-	free_aligned(oldWorkingLineLayerID);
-	free_aligned(oldDeferredIndexCustom);
-	free_aligned(oldDeferredColorCustom);
-	free_aligned(oldDidPassWindowTestCustomMasterPtr);
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineBase::ResolveCustomRendering()
-{
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	
-	if (this->nativeLineOutputCount == GPU_FRAMEBUFFER_NATIVE_HEIGHT)
-	{
-		return;
-	}
-	else if (this->nativeLineOutputCount == 0)
-	{
-		this->renderedWidth = dispInfo.customWidth;
-		this->renderedHeight = dispInfo.customHeight;
-		this->renderedBuffer = this->customBuffer;
-		return;
-	}
-	
-	// Resolve any remaining native lines to the custom buffer
-	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
-	{
-		for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
-		{
-			if (this->isLineOutputNative[y])
-			{
-				this->_LineCopy<0xFFFF, true, false, 2>(this->customBuffer, this->nativeBuffer, y);
-				this->isLineOutputNative[y] = false;
-			}
-		}
-	}
-	else
-	{
-		for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
-		{
-			if (this->isLineOutputNative[y])
-			{
-				this->_LineCopy<0xFFFF, true, false, 4>(this->customBuffer, this->nativeBuffer, y);
-				this->isLineOutputNative[y] = false;
-			}
-		}
-	}
-	
-	this->nativeLineOutputCount = 0;
-	this->renderedWidth = dispInfo.customWidth;
-	this->renderedHeight = dispInfo.customHeight;
-	this->renderedBuffer = this->customBuffer;
-}
-
-void GPUEngineBase::ResolveToCustomFramebuffer(NDSDisplayInfo &mutableInfo)
-{
-	if (mutableInfo.didPerformCustomRender[this->_targetDisplayID])
-	{
-		return;
-	}
-	
-	if (mutableInfo.isCustomSizeRequested)
-	{
-		if (mutableInfo.pixelBytes == 2)
-		{
-			for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
-			{
-				this->_LineCopy<0xFFFF, true, false, 2>(mutableInfo.customBuffer[this->_targetDisplayID], mutableInfo.nativeBuffer[this->_targetDisplayID], y);
-			}
-		}
-		else if (mutableInfo.pixelBytes == 4)
-		{
-			for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
-			{
-				this->_LineCopy<0xFFFF, true, false, 4>(mutableInfo.customBuffer[this->_targetDisplayID], mutableInfo.nativeBuffer[this->_targetDisplayID], y);
-			}
-		}
-	}
-	else
-	{
-		memcpy(mutableInfo.customBuffer[this->_targetDisplayID], mutableInfo.nativeBuffer[this->_targetDisplayID], GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * mutableInfo.pixelBytes);
-	}
-	
-	mutableInfo.didPerformCustomRender[this->_targetDisplayID] = true;
-}
-
-void GPUEngineBase::RefreshAffineStartRegs()
-{
-	//this is speculative. the idea is as follows:
-	//whenever the user updates the affine start position regs, it goes into the active regs immediately
-	//(this is handled on the set event from MMU)
-	//maybe it shouldnt take effect until the next hblank or something..
-	//this is a based on a combination of:
-	//heroes of mana intro FMV
-	//SPP level 3-8 rotoscale room
-	//NSMB raster fx backdrops
-	//bubble bobble revolution classic mode
-	//NOTE:
-	//I am REALLY unsatisfied with this logic now. But it seems to be working.
-	
-	this->_IORegisterMap->BG2X = this->savedBG2X;
-	this->_IORegisterMap->BG2Y = this->savedBG2Y;
-	this->_IORegisterMap->BG3X = this->savedBG3X;
-	this->_IORegisterMap->BG3Y = this->savedBG3Y;
-}
-
-// normally should have same addresses
-void GPUEngineBase::REG_DISPx_pack_test()
-{
-	const GPU_IOREG *r = this->_IORegisterMap;
-	
-	printf("%08lx %02x\n", (uintptr_t)r, (u32)((uintptr_t)(&r->DISPCNT) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISPSTAT) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->VCOUNT) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->BGnCNT[GPULayerID_BG0]) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->BGnOFS[GPULayerID_BG0]) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->BG2Param) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->BG3Param) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISP3DCNT) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISPCAPCNT) - (uintptr_t)r) );
-	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISP_MMEM_FIFO) - (uintptr_t)r) );
-}
-
-void GPUEngineBase::ParseAllRegisters()
-{
-	this->ParseReg_DISPCNT();
-	// No need to call ParseReg_BGnCNT(), since it is already called by ParseReg_DISPCNT().
-	
-	this->ParseReg_BGnHOFS<GPULayerID_BG0>();
-	this->ParseReg_BGnHOFS<GPULayerID_BG1>();
-	this->ParseReg_BGnHOFS<GPULayerID_BG2>();
-	this->ParseReg_BGnHOFS<GPULayerID_BG3>();
-	this->ParseReg_BGnVOFS<GPULayerID_BG0>();
-	this->ParseReg_BGnVOFS<GPULayerID_BG1>();
-	this->ParseReg_BGnVOFS<GPULayerID_BG2>();
-	this->ParseReg_BGnVOFS<GPULayerID_BG3>();
-	
-	this->ParseReg_BGnX<GPULayerID_BG2>();
-	this->ParseReg_BGnY<GPULayerID_BG2>();
-	this->ParseReg_BGnX<GPULayerID_BG3>();
-	this->ParseReg_BGnY<GPULayerID_BG3>();
-	
-	this->ParseReg_WINnH<0>();
-	this->ParseReg_WINnH<1>();
-	this->ParseReg_WININ();
-	this->ParseReg_WINOUT();
-	
-	this->ParseReg_MOSAIC();
-	this->ParseReg_BLDCNT();
-	this->ParseReg_BLDALPHA();
-	this->ParseReg_BLDY();
-	this->ParseReg_MASTER_BRIGHT();
-}
-
-GPUEngineA::GPUEngineA()
-{
-	_engineID = GPUEngineID_Main;
-	_targetDisplayID = NDSDisplayID_Main;
-	_IORegisterMap = (GPU_IOREG *)MMU.ARM9_REG;
-	_paletteBG = (u16 *)MMU.ARM9_VMEM;
-	_paletteOBJ = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_512B);
-	_oamList = (OAMAttributes *)(MMU.ARM9_OAM);
-	_sprMem = MMU_AOBJ;
-	
-	_VRAMNativeBlockPtr[0] = (u16 *)MMU.ARM9_LCD;
-	_VRAMNativeBlockPtr[1] = _VRAMNativeBlockPtr[0] + (1 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	_VRAMNativeBlockPtr[2] = _VRAMNativeBlockPtr[0] + (2 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	_VRAMNativeBlockPtr[3] = _VRAMNativeBlockPtr[0] + (3 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	
-	memset(this->_VRAMNativeBlockCaptureCopy, 0, GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
-	_VRAMNativeBlockCaptureCopyPtr[0] = this->_VRAMNativeBlockCaptureCopy;
-	_VRAMNativeBlockCaptureCopyPtr[1] = _VRAMNativeBlockCaptureCopyPtr[0] + (1 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	_VRAMNativeBlockCaptureCopyPtr[2] = _VRAMNativeBlockCaptureCopyPtr[0] + (2 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	_VRAMNativeBlockCaptureCopyPtr[3] = _VRAMNativeBlockCaptureCopyPtr[0] + (3 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	
-	nativeLineCaptureCount[0] = GPU_VRAM_BLOCK_LINES;
-	nativeLineCaptureCount[1] = GPU_VRAM_BLOCK_LINES;
-	nativeLineCaptureCount[2] = GPU_VRAM_BLOCK_LINES;
-	nativeLineCaptureCount[3] = GPU_VRAM_BLOCK_LINES;
-	
-	for (size_t l = 0; l < GPU_VRAM_BLOCK_LINES; l++)
-	{
-		isLineCaptureNative[0][l] = true;
-		isLineCaptureNative[1][l] = true;
-		isLineCaptureNative[2][l] = true;
-		isLineCaptureNative[3][l] = true;
-	}
-	
-	_3DFramebufferMain = (FragmentColor *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(FragmentColor));
-	_3DFramebuffer16 = (u16 *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16));
-	_captureWorkingA16 = (u16 *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16));
-	_captureWorkingB16 = (u16 *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16));
-	_captureWorkingA32 = (FragmentColor *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(FragmentColor));
-	_captureWorkingB32 = (FragmentColor *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(FragmentColor));
-	gfx3d_Update3DFramebuffers(_3DFramebufferMain, _3DFramebuffer16);
-}
-
-GPUEngineA::~GPUEngineA()
-{
-	free_aligned(this->_3DFramebufferMain);
-	free_aligned(this->_3DFramebuffer16);
-	free_aligned(this->_captureWorkingA16);
-	free_aligned(this->_captureWorkingB16);
-	free_aligned(this->_captureWorkingA32);
-	free_aligned(this->_captureWorkingB32);
-	gfx3d_Update3DFramebuffers(NULL, NULL);
-}
-
-GPUEngineA* GPUEngineA::Allocate()
-{
-	return new(malloc_aligned64(sizeof(GPUEngineA))) GPUEngineA();
-}
-
-void GPUEngineA::FinalizeAndDeallocate()
-{
-	this->~GPUEngineA();
-	free_aligned(this);
-}
-
-void GPUEngineA::Reset()
-{
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	this->_Reset_Base();
-	
-	memset(&this->_dispCapCnt, 0, sizeof(DISPCAPCNT_parsed));
-	this->_displayCaptureEnable = false;
-	
-	this->_BGLayer[GPULayerID_BG0].BMPAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG1].BMPAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG2].BMPAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG3].BMPAddress = MMU_ABG;
-	
-	this->_BGLayer[GPULayerID_BG0].largeBMPAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG1].largeBMPAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG2].largeBMPAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG3].largeBMPAddress = MMU_ABG;
-	
-	this->_BGLayer[GPULayerID_BG0].tileMapAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG1].tileMapAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG2].tileMapAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG3].tileMapAddress = MMU_ABG;
-	
-	this->_BGLayer[GPULayerID_BG0].tileEntryAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG1].tileEntryAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG2].tileEntryAddress = MMU_ABG;
-	this->_BGLayer[GPULayerID_BG3].tileEntryAddress = MMU_ABG;
-	
-	memset(this->_VRAMNativeBlockCaptureCopy, 0, GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
-	
-	this->ResetCaptureLineStates();
-	this->SetDisplayByID(NDSDisplayID_Main);
-	
-	memset(this->_3DFramebufferMain, 0, dispInfo.customWidth * dispInfo.customHeight * sizeof(FragmentColor));
-	memset(this->_3DFramebuffer16, 0, dispInfo.customWidth * dispInfo.customHeight * sizeof(u16));
-	memset(this->_captureWorkingA16, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(u16));
-	memset(this->_captureWorkingB16, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(u16));
-	memset(this->_captureWorkingA32, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(FragmentColor));
-	memset(this->_captureWorkingB32, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(FragmentColor));
-}
-
-void GPUEngineA::ResetCaptureLineStates()
-{
-	this->nativeLineCaptureCount[0] = GPU_VRAM_BLOCK_LINES;
-	this->nativeLineCaptureCount[1] = GPU_VRAM_BLOCK_LINES;
-	this->nativeLineCaptureCount[2] = GPU_VRAM_BLOCK_LINES;
-	this->nativeLineCaptureCount[3] = GPU_VRAM_BLOCK_LINES;
-	
-	for (size_t l = 0; l < GPU_VRAM_BLOCK_LINES; l++)
-	{
-		this->isLineCaptureNative[0][l] = true;
-		this->isLineCaptureNative[1][l] = true;
-		this->isLineCaptureNative[2][l] = true;
-		this->isLineCaptureNative[3][l] = true;
-	}
-}
-
-void GPUEngineA::ParseReg_DISPCAPCNT()
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
-	
-	this->_dispCapCnt.EVA = (DISPCAPCNT.EVA >= 16) ? 16 : DISPCAPCNT.EVA;
-	this->_dispCapCnt.EVB = (DISPCAPCNT.EVB >= 16) ? 16 : DISPCAPCNT.EVB;
-	this->_dispCapCnt.readOffset = (DISPCNT.DisplayMode == GPUDisplayMode_VRAM) ? 0 : DISPCAPCNT.VRAMReadOffset;
-	
-	switch (DISPCAPCNT.CaptureSize)
-	{
-		case DisplayCaptureSize_128x128:
-			this->_dispCapCnt.capy = 128;
-			break;
-			
-		case DisplayCaptureSize_256x64:
-			this->_dispCapCnt.capy = 64;
-			break;
-			
-		case DisplayCaptureSize_256x128:
-			this->_dispCapCnt.capy = 128;
-			break;
-			
-		case DisplayCaptureSize_256x192:
-			this->_dispCapCnt.capy = 192;
-			break;
-			
-		default:
-			break;
-	}
-	
-	/*INFO("Capture 0x%X:\n EVA=%i, EVB=%i, wBlock=%i, wOffset=%i, capX=%i, capY=%i\n rBlock=%i, rOffset=%i, srcCap=%i, dst=0x%X, src=0x%X\n srcA=%i, srcB=%i\n\n",
-	 val, this->_dispCapCnt.EVA, this->_dispCapCnt.EVB, this->_dispCapCnt.writeBlock, this->_dispCapCnt.writeOffset,
-	 this->_dispCapCnt.capy, this->_dispCapCnt.readBlock, this->_dispCapCnt.readOffset,
-	 this->_dispCapCnt.capSrc, this->_dispCapCnt.dst - MMU.ARM9_LCD, this->_dispCapCnt.src - MMU.ARM9_LCD,
-	 this->_dispCapCnt.srcA, this->_dispCapCnt.srcB);*/
-}
-
-FragmentColor* GPUEngineA::Get3DFramebufferMain() const
-{
-	return this->_3DFramebufferMain;
-}
-
-u16* GPUEngineA::Get3DFramebuffer16() const
-{
-	return this->_3DFramebuffer16;
-}
-
-void* GPUEngineA::GetCustomVRAMBlockPtr(const size_t blockID)
-{
-	return this->_VRAMCustomBlockPtr[blockID];
-}
-
-void GPUEngineA::SetCustomFramebufferSize(size_t w, size_t h)
-{
-	this->GPUEngineBase::SetCustomFramebufferSize(w, h);
-	
-	FragmentColor *old3DFramebufferMain = this->_3DFramebufferMain;
-	u16 *old3DFramebuffer16 = this->_3DFramebuffer16;
-	u16 *oldCaptureWorkingA16 = this->_captureWorkingA16;
-	u16 *oldCaptureWorkingB16 = this->_captureWorkingB16;
-	FragmentColor *oldCaptureWorkingA32 = this->_captureWorkingA32;
-	FragmentColor *oldCaptureWorkingB32 = this->_captureWorkingB32;
-	
-	FragmentColor *new3DFramebufferMain = (FragmentColor *)malloc_alignedCacheLine(w * h * sizeof(FragmentColor));
-	u16 *new3DFramebuffer16 = (u16 *)malloc_alignedCacheLine(w * h * sizeof(u16));
-	u16 *newCaptureWorkingA16 = (u16 *)malloc_alignedCacheLine(w * _gpuLargestDstLineCount * sizeof(u16));
-	u16 *newCaptureWorkingB16 = (u16 *)malloc_alignedCacheLine(w * _gpuLargestDstLineCount * sizeof(u16));
-	FragmentColor *newCaptureWorkingA32 = (FragmentColor *)malloc_alignedCacheLine(w * _gpuLargestDstLineCount * sizeof(FragmentColor));
-	FragmentColor *newCaptureWorkingB32 = (FragmentColor *)malloc_alignedCacheLine(w * _gpuLargestDstLineCount * sizeof(FragmentColor));
-	
-	this->_3DFramebufferMain = new3DFramebufferMain;
-	this->_3DFramebuffer16 = new3DFramebuffer16;
-	this->_captureWorkingA16 = newCaptureWorkingA16;
-	this->_captureWorkingB16 = newCaptureWorkingB16;
-	this->_captureWorkingA32 = newCaptureWorkingA32;
-	this->_captureWorkingB32 = newCaptureWorkingB32;
-	gfx3d_Update3DFramebuffers(this->_3DFramebufferMain, this->_3DFramebuffer16);
-	
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	
-	if (dispInfo.colorFormat == NDSColorFormat_BGR888_Rev)
-	{
-		this->_VRAMCustomBlockPtr[0] = (FragmentColor *)GPU->GetCustomVRAMBuffer();
-		this->_VRAMCustomBlockPtr[1] = (FragmentColor *)this->_VRAMCustomBlockPtr[0] + (1 * _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w);
-		this->_VRAMCustomBlockPtr[2] = (FragmentColor *)this->_VRAMCustomBlockPtr[0] + (2 * _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w);
-		this->_VRAMCustomBlockPtr[3] = (FragmentColor *)this->_VRAMCustomBlockPtr[0] + (3 * _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w);
-	}
-	else
-	{
-		this->_VRAMCustomBlockPtr[0] = (u16 *)GPU->GetCustomVRAMBuffer();
-		this->_VRAMCustomBlockPtr[1] = (u16 *)this->_VRAMCustomBlockPtr[0] + (1 * _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w);
-		this->_VRAMCustomBlockPtr[2] = (u16 *)this->_VRAMCustomBlockPtr[0] + (2 * _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w);
-		this->_VRAMCustomBlockPtr[3] = (u16 *)this->_VRAMCustomBlockPtr[0] + (3 * _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w);
-	}
-	
-	free_aligned(old3DFramebufferMain);
-	free_aligned(old3DFramebuffer16);
-	free_aligned(oldCaptureWorkingA16);
-	free_aligned(oldCaptureWorkingB16);
-	free_aligned(oldCaptureWorkingA32);
-	free_aligned(oldCaptureWorkingB32);
-}
-
-bool GPUEngineA::WillRender3DLayer()
-{
-	return ( this->_enableLayer[GPULayerID_BG0] && (this->_IORegisterMap->DISPCNT.BG0_3D != 0) );
-}
-
-bool GPUEngineA::WillCapture3DLayerDirect(const size_t l)
-{
-	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
-	return ( this->WillDisplayCapture(l) && (DISPCAPCNT.SrcA != 0) && (DISPCAPCNT.CaptureSrc != 1) );
-}
-
-bool GPUEngineA::WillDisplayCapture(const size_t l)
-{
-	//we must block captures when the capture dest is not mapped to LCDC.
-	//mario kart does this (maybe due to a programming bug, but maybe emulation timing error) when spamming confirm key during course intro and through black transition
-	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
-	return this->_displayCaptureEnable && (vramConfiguration.banks[DISPCAPCNT.VRAMWriteBlock].purpose == VramConfiguration::LCDC) && (l < this->_dispCapCnt.capy);
-}
-
-void GPUEngineA::SetDisplayCaptureEnable()
-{
-	this->_displayCaptureEnable = (this->_IORegisterMap->DISPCAPCNT.CaptureEnable != 0);
-}
-
-void GPUEngineA::ResetDisplayCaptureEnable()
-{
-	IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
-	if (this->_displayCaptureEnable)
-	{
-		DISPCAPCNT.CaptureEnable = 0;
-		this->_displayCaptureEnable = false;
-	}
-}
-
-bool GPUEngineA::VerifyVRAMLineDidChange(const size_t blockID, const size_t l)
-{
-	// This method must be called for ALL instances where captured lines in VRAM may be read back.
-	//
-	// If a line is captured at a custom size, we need to ensure that the line hasn't been changed between
-	// capture time and read time. If the captured line has changed, then we need to fallback to using the
-	// native captured line instead.
-	
-	if (this->isLineCaptureNative[blockID][l])
-	{
-		return false;
-	}
-	
-	u16 *__restrict capturedNativeLine = this->_VRAMNativeBlockCaptureCopyPtr[blockID] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	const u16 *__restrict currentNativeLine = this->_VRAMNativeBlockPtr[blockID] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	
-	const bool didVRAMLineChange = (memcmp(currentNativeLine, capturedNativeLine, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) != 0);
-	if (didVRAMLineChange)
-	{
-		this->_LineCopy<1, true, false, 2>(this->_VRAMNativeBlockCaptureCopyPtr[blockID], this->_VRAMNativeBlockPtr[blockID], l);
-		this->isLineCaptureNative[blockID][l] = true;
-		this->nativeLineCaptureCount[blockID]++;
-	}
-	
-	return didVRAMLineChange;
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineA::RenderLine(const size_t l)
-{
-	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
-	const bool isDisplayCaptureNeeded = this->WillDisplayCapture(l);
-	const GPUEngineRenderState &renderState = this->_currentCompositorInfo[l].renderState;
-	
-	// Render the line
-	if ( (renderState.displayOutputMode == GPUDisplayMode_Normal) || isDisplayCaptureNeeded )
-	{
-		if (renderState.isAnyWindowEnabled)
-		{
-			this->_RenderLine_Layers<OUTPUTFORMAT, true>(l);
-		}
-		else
-		{
-			this->_RenderLine_Layers<OUTPUTFORMAT, false>(l);
-		}
-	}
-	
-	// Fill the display output
-	switch (renderState.displayOutputMode)
-	{
-		case GPUDisplayMode_Off: // Display Off(Display white)
-			this->_HandleDisplayModeOff<OUTPUTFORMAT>(l);
-			break;
-			
-		case GPUDisplayMode_Normal: // Display BG and OBJ layers
-			this->_HandleDisplayModeNormal<OUTPUTFORMAT>(l);
-			break;
-			
-		case GPUDisplayMode_VRAM: // Display vram framebuffer
-			this->_HandleDisplayModeVRAM<OUTPUTFORMAT>(l);
-			break;
-			
-		case GPUDisplayMode_MainMemory: // Display memory FIFO
-			this->_HandleDisplayModeMainMemory<OUTPUTFORMAT>(l);
-			break;
-	}
-	
-	//capture after displaying so that we can safely display vram before overwriting it here
-	
-	//BUG!!! if someone is capturing and displaying both from the fifo, then it will have been
-	//consumed above by the display before we get here
-	//(is that even legal? i think so)
-	if (isDisplayCaptureNeeded)
-	{
-		if (DISPCAPCNT.CaptureSize == DisplayCaptureSize_128x128)
-		{
-			this->_RenderLine_DisplayCapture<OUTPUTFORMAT, GPU_FRAMEBUFFER_NATIVE_WIDTH/2>(l);
-		}
-		else
-		{
-			this->_RenderLine_DisplayCapture<OUTPUTFORMAT, GPU_FRAMEBUFFER_NATIVE_WIDTH>(l);
-		}
-	}
-}
-
-template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
-void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
-{
-	const FragmentColor *__restrict framebuffer3D = CurrentRenderer->GetFramebuffer();
-	if (framebuffer3D == NULL)
-	{
-		return;
-	}
-	
-	if (!CurrentRenderer->IsFramebufferNativeSize())
-	{
-		this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
-	}
-	
-	const float customWidthScale = (float)compInfo.line.widthCustom / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	const FragmentColor *__restrict srcLinePtr = framebuffer3D + compInfo.line.blockOffsetCustom;
-	
-	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead;
-	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead;
-	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
-	
-	// Horizontally offset the 3D layer by this amount.
-	// Test case: Blowing up large objects in Nanostray 2 will cause the main screen to shake horizontally.
-	const u16 hofs = (u16)( ((float)compInfo.renderState.selectedBGLayer->xOffset * customWidthScale) + 0.5f );
-	
-	if (hofs == 0)
-	{
-#ifdef ENABLE_SSE2
-		const size_t ssePixCount = (compInfo.line.widthCustom - (compInfo.line.widthCustom % 16));
-		const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID];
-#endif
-		
-		for (size_t line = 0; line < compInfo.line.renderCount; line++)
-		{
-			compInfo.target.xNative = 0;
-			compInfo.target.xCustom = 0;
-			
-#ifdef ENABLE_SSE2
-			for (; compInfo.target.xCustom < ssePixCount; srcLinePtr+=16, compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16)
-			{
-				const __m128i src[4]	= { _mm_load_si128((__m128i *)srcLinePtr + 0),
-										    _mm_load_si128((__m128i *)srcLinePtr + 1),
-										    _mm_load_si128((__m128i *)srcLinePtr + 2),
-										    _mm_load_si128((__m128i *)srcLinePtr + 3) };
-				
-				// Determine which pixels pass by doing the alpha test and the window test.
-				const __m128i srcAlpha = _mm_packs_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)),
-														  _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) );
-				__m128i passMask8;
-				
-				if (WILLPERFORMWINDOWTEST)
-				{
-					// Do the window test.
-					passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) );
-				}
-				else
-				{
-					passMask8 = _mm_set1_epi8(0xFF);
-				}
-				
-				// Do the alpha test. Pixels with an alpha value of 0 are rejected.
-				passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8(srcAlpha, _mm_setzero_si128()), passMask8);
-				
-				const int passMaskValue = _mm_movemask_epi8(passMask8);
-				
-				// If none of the pixels within the vector pass, then reject them all at once.
-				if (passMaskValue == 0)
-				{
-					continue;
-				}
-				
-				// Write out the pixels.
-				const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
-				this->_PixelComposite16_SSE2<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D, WILLPERFORMWINDOWTEST>(compInfo,
-																												   didAllPixelsPass,
-																												   passMask8,
-																												   src[3], src[2], src[1], src[0],
-																												   srcEffectEnableMask);
-			}
-#endif
-			
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-			for (; compInfo.target.xCustom < compInfo.line.widthCustom; srcLinePtr++, compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
-			{
-				if ( (srcLinePtr->a == 0) || (WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] == 0)) )
-				{
-					continue;
-				}
-				
-				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true;
-				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D>(compInfo, *srcLinePtr, 0, enableColorEffect);
-			}
-		}
-	}
-	else
-	{
-		for (size_t line = 0; line < compInfo.line.renderCount; line++)
-		{
-			for (compInfo.target.xNative = 0, compInfo.target.xCustom = 0; compInfo.target.xCustom < compInfo.line.widthCustom; compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
-			{
-				if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] == 0) )
-				{
-					continue;
-				}
-				
-				size_t srcX = compInfo.target.xCustom + hofs;
-				if (srcX >= compInfo.line.widthCustom * 2)
-				{
-					srcX -= compInfo.line.widthCustom * 2;
-				}
-				
-				if ( (srcX >= compInfo.line.widthCustom) || (srcLinePtr[srcX].a == 0) )
-				{
-					continue;
-				}
-				
-				compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom];
-				
-				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true;
-				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D>(compInfo, srcLinePtr[srcX], 0, enableColorEffect);
-			}
-			
-			srcLinePtr += compInfo.line.widthCustom;
-		}
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT, size_t CAPTURELENGTH>
-void GPUEngineA::_RenderLine_DisplayCapture(const u16 l)
-{
-	assert( (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH/2) || (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) );
-	
-	GPUEngineCompositorInfo &compInfo = this->_currentCompositorInfo[l];
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
-	
-	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-	const bool is3DFramebufferNativeSize = CurrentRenderer->IsFramebufferNativeSize();
-	const u8 vramWriteBlock = DISPCAPCNT.VRAMWriteBlock;
-	const u8 vramReadBlock = DISPCNT.VRAM_Block;
-	const size_t writeLineIndexWithOffset = (DISPCAPCNT.VRAMWriteOffset * 64) + l;
-	const size_t readLineIndexWithOffset = (this->_dispCapCnt.readOffset * 64) + l;
-	bool newCaptureLineNativeState = true;
-	
-	//128-wide captures should write linearly into memory, with no gaps
-	//this is tested by hotel dusk
-	size_t dstNativeOffset = (DISPCAPCNT.VRAMWriteOffset * 64 * GPU_FRAMEBUFFER_NATIVE_WIDTH) + (l * CAPTURELENGTH);
-	
-	//Read/Write block wrap to 00000h when exceeding 1FFFFh (128k)
-	//this has not been tested yet (I thought I needed it for hotel dusk, but it was fixed by the above)
-	dstNativeOffset &= 0x0000FFFF;
-	
-	const u16 *vramNative16 = (u16 *)MMU.blank_memory;
-	const u16 *vramCustom16 = (u16 *)GPU->GetCustomVRAMBlankBuffer();
-	const u32 *vramCustom32 = (u32 *)GPU->GetCustomVRAMBlankBuffer();
-	u16 *dstNative16 = this->_VRAMNativeBlockPtr[vramWriteBlock] + dstNativeOffset;
-	bool readNativeVRAM = true;
-	bool captureLineNativeState32 = newCaptureLineNativeState;
-	
-	// Convert 18-bit and 24-bit framebuffers to 15-bit for native screen capture.
-	if ( (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.CaptureSrc != 1) )
-	{
-		switch (OUTPUTFORMAT)
-		{
-			case NDSColorFormat_BGR555_Rev:
-				break;
-				
-			case NDSColorFormat_BGR666_Rev:
-				ColorspaceConvertBuffer6665To5551<false, false>((u32 *)compInfo.target.lineColorHead, this->_captureWorkingA16, compInfo.line.pixelCount);
-				break;
-				
-			case NDSColorFormat_BGR888_Rev:
-				ColorspaceConvertBuffer8888To5551<false, false>((u32 *)compInfo.target.lineColorHead, this->_captureWorkingA16, compInfo.line.pixelCount);
-				break;
-		}
-	}
-	
-	// Convert VRAM for native VRAM capture.
-	if ( (DISPCAPCNT.SrcB == 0) && (DISPCAPCNT.CaptureSrc != 0) && (vramConfiguration.banks[vramReadBlock].purpose == VramConfiguration::LCDC) )
-	{
-		size_t vramNativeOffset = readLineIndexWithOffset * GPU_FRAMEBUFFER_NATIVE_WIDTH;
-		vramNativeOffset &= 0x0000FFFF;
-		vramNative16 = this->_VRAMNativeBlockPtr[vramReadBlock] + vramNativeOffset;
-		
-		this->VerifyVRAMLineDidChange(vramReadBlock, readLineIndexWithOffset);
-		
-		if (!this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset])
-		{
-			size_t vramCustomOffset = ((this->_dispCapCnt.readOffset * _gpuCaptureLineIndex[64]) + _gpuCaptureLineIndex[l]) * dispInfo.customWidth;
-			while (vramCustomOffset >= _gpuVRAMBlockOffset)
-			{
-				vramCustomOffset -= _gpuVRAMBlockOffset;
-			}
-			
-			switch (OUTPUTFORMAT)
-			{
-				case NDSColorFormat_BGR555_Rev:
-				case NDSColorFormat_BGR666_Rev:
-					vramCustom16 = (u16 *)this->_VRAMCustomBlockPtr[vramReadBlock] + vramCustomOffset;
-					break;
-					
-				case NDSColorFormat_BGR888_Rev:
-					vramCustom32 = (u32 *)this->_VRAMCustomBlockPtr[vramReadBlock] + vramCustomOffset;
-					break;
-			}
-			
-			readNativeVRAM = false;
-		}
-	}
-	
-	static CACHE_ALIGN u16 fifoLine16[GPU_FRAMEBUFFER_NATIVE_WIDTH];
-	const u16 *srcA16 = (DISPCAPCNT.SrcA == 0) ? ((OUTPUTFORMAT != NDSColorFormat_BGR555_Rev) ? this->_captureWorkingA16 : (u16 *)compInfo.target.lineColorHead) : this->_3DFramebuffer16 + compInfo.line.blockOffsetCustom;
-	const u16 *srcB16 = (DISPCAPCNT.SrcB == 0) ? vramNative16 : fifoLine16;
-	
-	switch (DISPCAPCNT.CaptureSrc)
-	{
-		case 0: // Capture source is SourceA
-		{
-			//INFO("Capture source is SourceA\n");
-			switch (DISPCAPCNT.SrcA)
-			{
-				case 0: // Capture screen (BG + OBJ + 3D)
-				{
-					//INFO("Capture screen (BG + OBJ + 3D)\n");
-					if (this->isLineRenderNative[l])
-					{
-						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
-					}
-					else
-					{
-						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, false, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
-					}
-					
-					newCaptureLineNativeState = this->isLineRenderNative[l];
-					break;
-				}
-					
-				case 1: // Capture 3D
-				{
-					//INFO("Capture 3D\n");
-					if (is3DFramebufferNativeSize)
-					{
-						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
-					}
-					else
-					{
-						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, false, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
-					}
-					
-					newCaptureLineNativeState = is3DFramebufferNativeSize;
-					break;
-				}
-			}
-			break;
-		}
-			
-		case 1: // Capture source is SourceB
-		{
-			//INFO("Capture source is SourceB\n");
-			switch (DISPCAPCNT.SrcB)
-			{
-				case 0: // Capture VRAM
-				{
-					this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, true>(srcB16, dstNative16, CAPTURELENGTH, 1);
-					newCaptureLineNativeState = this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset];
-					break;
-				}
-					
-				case 1: // Capture dispfifo (not yet tested)
-				{
-					this->_RenderLine_DispCapture_FIFOToBuffer(fifoLine16);
-					this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, true>(srcB16, dstNative16, CAPTURELENGTH, 1);
-					newCaptureLineNativeState = true;
-					break;
-				}
-			}
-			break;
-		}
-			
-		default: // Capture source is SourceA+B blended
-		{
-			//INFO("Capture source is SourceA+B blended\n");
-			if (DISPCAPCNT.SrcB != 0)
-			{
-				// fifo - tested by splinter cell chaos theory thermal view
-				this->_RenderLine_DispCapture_FIFOToBuffer(fifoLine16);
-			}
-			
-			if (DISPCAPCNT.SrcA == 0)
-			{
-				if (this->isLineRenderNative[l])
-				{
-					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, true, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
-				}
-				else
-				{
-					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, false, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
-				}
-				
-				newCaptureLineNativeState = this->isLineRenderNative[l] && ((DISPCAPCNT.SrcB != 0) || this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset]);
-			}
-			else
-			{
-				if (is3DFramebufferNativeSize)
-				{
-					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, true, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
-					newCaptureLineNativeState = (DISPCAPCNT.SrcB != 0) || this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset];
-				}
-				else
-				{
-					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, false, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
-					newCaptureLineNativeState = false;
-				}
-			}
-			break;
-		}
-	}
-	
-#ifdef ENABLE_SSE2
-	MACRODO_N( CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_stream_si128((__m128i *)(this->_VRAMNativeBlockCaptureCopyPtr[vramWriteBlock] + dstNativeOffset) + (X), _mm_load_si128((__m128i *)dstNative16 + (X))) );
-#else
-	memcpy(this->_VRAMNativeBlockCaptureCopyPtr[vramWriteBlock] + dstNativeOffset, dstNative16, CAPTURELENGTH * sizeof(u16));
-#endif
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		captureLineNativeState32 = newCaptureLineNativeState;
-		newCaptureLineNativeState = false;
-	}
-	
-	if (this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] && !newCaptureLineNativeState)
-	{
-		this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] = false;
-		this->nativeLineCaptureCount[vramWriteBlock]--;
-	}
-	else if (!this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] && newCaptureLineNativeState)
-	{
-		this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] = true;
-		this->nativeLineCaptureCount[vramWriteBlock]++;
-	}
-	
-	if (!this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset])
-	{
-		const size_t captureLengthExt = (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? dispInfo.customWidth : dispInfo.customWidth / 2;
-		const size_t captureLineCount = _gpuCaptureLineCount[l];
-		
-		size_t dstCustomOffset = (DISPCAPCNT.VRAMWriteOffset * _gpuCaptureLineIndex[64] * dispInfo.customWidth) + (_gpuCaptureLineIndex[l] * captureLengthExt);
-		while (dstCustomOffset >= _gpuVRAMBlockOffset)
-		{
-			dstCustomOffset -= _gpuVRAMBlockOffset;
-		}
-		
-		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-		{
-			static CACHE_ALIGN FragmentColor fifoLine32[GPU_FRAMEBUFFER_NATIVE_WIDTH];
-			FragmentColor *dstCustom32 = (FragmentColor *)this->_VRAMCustomBlockPtr[vramWriteBlock] + dstCustomOffset;
-			bool isLineCaptureNative32 = ( (vramWriteBlock == vramReadBlock) && (writeLineIndexWithOffset == readLineIndexWithOffset) ) ? captureLineNativeState32 : this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset];
-			
-			if ( (DISPCAPCNT.SrcB == 1) && (DISPCAPCNT.CaptureSrc != 0) )
-			{
-				ColorspaceConvertBuffer555To8888Opaque<false, false>(fifoLine16, (u32 *)fifoLine32, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-			}
-			
-			if ( (DISPCAPCNT.SrcB == 0) && (DISPCAPCNT.CaptureSrc != 0) && (vramConfiguration.banks[vramReadBlock].purpose == VramConfiguration::LCDC) )
-			{
-				if (readNativeVRAM)
-				{
-					ColorspaceConvertBuffer555To8888Opaque<false, false>(vramNative16, (u32 *)vramCustom32, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				}
-			}
-			
-			const u32 *srcA32 = (DISPCAPCNT.SrcA == 0) ? (u32 *)compInfo.target.lineColorHead : (u32 *)CurrentRenderer->GetFramebuffer() + compInfo.line.blockOffsetCustom;
-			const u32 *srcB32 = (DISPCAPCNT.SrcB == 0) ? vramCustom32 : (u32 *)fifoLine32;
-			
-			switch (DISPCAPCNT.CaptureSrc)
-			{
-				case 0: // Capture source is SourceA
-				{
-					switch (DISPCAPCNT.SrcA)
-					{
-						case 0: // Capture screen (BG + OBJ + 3D)
-						{
-							if (this->isLineRenderNative[l])
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, true, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
-							}
-							else
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, false, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
-							}
-							break;
-						}
-							
-						case 1: // Capture 3D
-						{
-							if (is3DFramebufferNativeSize)
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 1, CAPTURELENGTH, true, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
-							}
-							else
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 1, CAPTURELENGTH, false, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
-							}
-							break;
-						}
-					}
-					break;
-				}
-					
-				case 1: // Capture source is SourceB
-				{
-					switch (DISPCAPCNT.SrcB)
-					{
-						case 0: // Capture VRAM
-						{
-							if (isLineCaptureNative32)
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, true, false>(srcB32, dstCustom32, captureLengthExt, captureLineCount);
-							}
-							else
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, false, false>(srcB32, dstCustom32, captureLengthExt, captureLineCount);
-							}
-							break;
-						}
-							
-						case 1: // Capture dispfifo (not yet tested)
-						{
-							this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 1, CAPTURELENGTH, true, false>(srcB32, dstCustom32, captureLengthExt, captureLineCount);
-							break;
-						}
-					}
-					break;
-				}
-					
-				default: // Capture source is SourceA+B blended
-				{
-					u32 *srcCustomA32 = (u32 *)srcA32;
-					u32 *srcCustomB32 = (u32 *)srcB32;
-					
-					if ( (DISPCAPCNT.SrcB == 1) || isLineCaptureNative32 )
-					{
-						srcCustomB32 = (u32 *)this->_captureWorkingB32;
-						this->_LineCopy<0xFFFF, false, false, 4>(srcCustomB32, srcB32, 0);
-					}
-					
-					if (DISPCAPCNT.SrcA == 0)
-					{
-						if (this->isLineRenderNative[l])
-						{
-							srcCustomA32 = (u32 *)this->_captureWorkingA32;
-							this->_LineCopy<0xFFFF, false, false, 4>(srcCustomA32, srcA32, 0);
-						}
-					}
-					else
-					{
-						if (is3DFramebufferNativeSize)
-						{
-							srcCustomA32 = (u32 *)this->_captureWorkingA32;
-							this->_LineCopy<0xFFFF, false, false, 4>(srcCustomA32, srcA32, 0);
-						}
-					}
-					
-					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR888_Rev, CAPTURELENGTH, false, false, false>(srcCustomA32, srcCustomB32, dstCustom32, captureLengthExt, captureLineCount);
-					break;
-				}
-			}
-		}
-		else
-		{
-			if (!this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] && (DISPCAPCNT.SrcB == 0))
-			{
-				srcB16 = vramCustom16;
-			}
-			
-			u16 *dstCustom16 = (u16 *)this->_VRAMCustomBlockPtr[vramWriteBlock] + dstCustomOffset;
-			
-			switch (DISPCAPCNT.CaptureSrc)
-			{
-				case 0: // Capture source is SourceA
-				{
-					switch (DISPCAPCNT.SrcA)
-					{
-						case 0: // Capture screen (BG + OBJ + 3D)
-						{
-							if (this->isLineRenderNative[l])
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
-							}
-							else
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, false, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
-							}
-							break;
-						}
-							
-						case 1: // Capture 3D
-						{
-							if (is3DFramebufferNativeSize)
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
-							}
-							else
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, false, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
-							}
-							break;
-						}
-					}
-					break;
-				}
-					
-				case 1: // Capture source is SourceB
-				{
-					switch (DISPCAPCNT.SrcB)
-					{
-						case 0: // Capture VRAM
-						{
-							if (this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset])
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, false>(srcB16, dstCustom16, captureLengthExt, captureLineCount);
-							}
-							else
-							{
-								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, false, false>(srcB16, dstCustom16, captureLengthExt, captureLineCount);
-							}
-							break;
-						}
-							
-						case 1: // Capture dispfifo (not yet tested)
-							this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, false>(srcB16, dstCustom16, captureLengthExt, captureLineCount);
-							break;
-					}
-					break;
-				}
-					
-				default: // Capture source is SourceA+B blended
-				{
-					u16 *srcCustomA16 = (u16 *)srcA16;
-					u16 *srcCustomB16 = (u16 *)srcB16;
-					
-					if ( (DISPCAPCNT.SrcB == 1) || this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] )
-					{
-						srcCustomB16 = this->_captureWorkingB16;
-						this->_LineCopy<0xFFFF, false, false, 2>(srcCustomB16, srcB16, 0);
-					}
-					
-					if (DISPCAPCNT.SrcA == 0)
-					{
-						if (this->isLineRenderNative[l])
-						{
-							srcCustomA16 = this->_captureWorkingA16;
-							this->_LineCopy<0xFFFF, false, false, 2>(srcCustomA16, srcA16, 0);
-						}
-					}
-					else
-					{
-						if (is3DFramebufferNativeSize)
-						{
-							srcCustomA16 = this->_captureWorkingA16;
-							this->_LineCopy<0xFFFF, false, false, 2>(srcCustomA16, srcA16, 0);
-						}
-					}
-					
-					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, false, false, false>(srcCustomA16, srcCustomB16, dstCustom16, captureLengthExt, captureLineCount);
-					break;
-				}
-			}
-		}
-	}
-}
-
-void GPUEngineA::_RenderLine_DispCapture_FIFOToBuffer(u16 *fifoLineBuffer)
-{
-#ifdef ENABLE_SSE2
-	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
-	{
-		const __m128i fifoColor = _mm_setr_epi32(DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv());
-		_mm_store_si128((__m128i *)fifoLineBuffer + i, fifoColor);
-	}
-#else
-	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
-	{
-		((u32 *)fifoLineBuffer)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv() );
-	}
-#endif
-}
-
-template<NDSColorFormat COLORFORMAT, int SOURCESWITCH, size_t CAPTURELENGTH, bool CAPTUREFROMNATIVESRC, bool CAPTURETONATIVEDST>
-void GPUEngineA::_RenderLine_DispCapture_Copy(const void *src, void *dst, const size_t captureLengthExt, const size_t captureLineCount)
-{
-	const u16 alphaBit16 = (SOURCESWITCH == 0) ? 0x8000 : 0x0000;
-	const u32 alphaBit32 = (SOURCESWITCH == 0) ? ((COLORFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF000000 : 0x1F000000) : 0x00000000;
-	
-#ifdef ENABLE_SSE2
-	const __m128i alpha_vec128 = (COLORFORMAT == NDSColorFormat_BGR555_Rev) ? _mm_set1_epi16(alphaBit16) : _mm_set1_epi32(alphaBit32);
-#endif
-	
-	if (CAPTURETONATIVEDST)
-	{
-		if (CAPTUREFROMNATIVESRC)
-		{
-#ifdef ENABLE_SSE2
-			switch (COLORFORMAT)
-			{
-				case NDSColorFormat_BGR555_Rev:
-					MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_store_si128((__m128i *)dst + (X), _mm_or_si128( _mm_load_si128( (__m128i *)src + (X)), alpha_vec128 ) ));
-					break;
-					
-				case NDSColorFormat_BGR666_Rev:
-				case NDSColorFormat_BGR888_Rev:
-					MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u32)), _mm_store_si128((__m128i *)dst + (X), _mm_or_si128( _mm_load_si128( (__m128i *)src + (X)), alpha_vec128 ) ));
-					break;
-			}
-#else
-			for (size_t i = 0; i < CAPTURELENGTH; i++)
-			{
-				switch (COLORFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-						((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
-						break;
-						
-					case NDSColorFormat_BGR666_Rev:
-					case NDSColorFormat_BGR888_Rev:
-						((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
-						break;
-				}
-			}
-#endif
-		}
-		else
-		{
-			for (size_t i = 0; i < CAPTURELENGTH; i++)
-			{
-				switch (COLORFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-						((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[_gpuDstPitchIndex[i]] | alphaBit16);
-						break;
-						
-					case NDSColorFormat_BGR666_Rev:
-					case NDSColorFormat_BGR888_Rev:
-						((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[_gpuDstPitchIndex[i]] | alphaBit32);
-						break;
-				}
-			}
-		}
-	}
-	else
-	{
-		const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
-		
-		if (CAPTUREFROMNATIVESRC)
-		{
-			for (size_t i = 0; i < CAPTURELENGTH; i++)
-			{
-				for (size_t p = 0; p < _gpuDstPitchCount[i]; p++)
-				{
-					switch (COLORFORMAT)
-					{
-						case NDSColorFormat_BGR555_Rev:
-							((u16 *)dst)[_gpuDstPitchIndex[i] + p] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
-							break;
-							
-						case NDSColorFormat_BGR666_Rev:
-						case NDSColorFormat_BGR888_Rev:
-							((u32 *)dst)[_gpuDstPitchIndex[i] + p] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
-							break;
-					}
-				}
-			}
-			
-			for (size_t line = 1; line < captureLineCount; line++)
-			{
-				switch (COLORFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-						memcpy((u16 *)dst + (line * dispInfo.customWidth), dst, captureLengthExt * sizeof(u16));
-						break;
-						
-					case NDSColorFormat_BGR666_Rev:
-					case NDSColorFormat_BGR888_Rev:
-						memcpy((u32 *)dst + (line * dispInfo.customWidth), dst, captureLengthExt * sizeof(u32));
-						break;
-				}
-			}
-		}
-		else
-		{
-			if (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH)
-			{
-				const size_t pixCountExt = captureLengthExt * captureLineCount;
-				size_t i = 0;
-				
-#ifdef ENABLE_SSE2
-				switch (COLORFORMAT)
-				{
-					case NDSColorFormat_BGR555_Rev:
-					{
-						const size_t ssePixCount = pixCountExt - (pixCountExt % 8);
-						for (; i < ssePixCount; i += 8)
-						{
-							_mm_store_si128((__m128i *)((u16 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u16 *)src + i)), alpha_vec128 ) );
-						}
-						break;
-					}
-						
-					case NDSColorFormat_BGR666_Rev:
-					case NDSColorFormat_BGR888_Rev:
-					{
-						const size_t ssePixCount = pixCountExt - (pixCountExt % 4);
-						for (; i < ssePixCount; i += 4)
-						{
-							_mm_store_si128((__m128i *)((u32 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u32 *)src + i)), alpha_vec128 ) );
-						}
-						break;
-					}
-				}
-#endif
-				
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-				for (; i < pixCountExt; i++)
-				{
-					switch (COLORFORMAT)
-					{
-						case NDSColorFormat_BGR555_Rev:
-							((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
-							break;
-							
-						case NDSColorFormat_BGR666_Rev:
-						case NDSColorFormat_BGR888_Rev:
-							((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
-							break;
-					}
-				}
-			}
-			else
-			{
-				for (size_t line = 0; line < captureLineCount; line++)
-				{
-					size_t i = 0;
-					
-					switch (COLORFORMAT)
-					{
-						case NDSColorFormat_BGR555_Rev:
-						{
-#ifdef ENABLE_SSE2
-							const size_t ssePixCount = captureLengthExt - (captureLengthExt % 8);
-							for (; i < ssePixCount; i += 8)
-							{
-								_mm_store_si128((__m128i *)((u16 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u16 *)src + i)), alpha_vec128 ) );
-							}
-#endif
-							
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-							for (; i < captureLengthExt; i++)
-							{
-								((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
-							}
-							
-							src = (u16 *)src + dispInfo.customWidth;
-							dst = (u16 *)dst + dispInfo.customWidth;
-							break;
-						}
-							
-						case NDSColorFormat_BGR666_Rev:
-						case NDSColorFormat_BGR888_Rev:
-						{
-#ifdef ENABLE_SSE2
-							const size_t ssePixCount = captureLengthExt - (captureLengthExt % 4);
-							for (; i < ssePixCount; i += 4)
-							{
-								_mm_store_si128((__m128i *)((u32 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u32 *)src + i)), alpha_vec128 ) );
-							}
-#endif
-							
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-							for (; i < captureLengthExt; i++)
-							{
-								((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
-							}
-							
-							src = (u32 *)src + dispInfo.customWidth;
-							dst = (u32 *)dst + dispInfo.customWidth;
-							break;
-						}
-					}
-				}
-			}
-		}
-	}
-}
-
-u16 GPUEngineA::_RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB, const u8 blendEVA, const u8 blendEVB)
-{
-	u16 a = 0;
-	u16 r = 0;
-	u16 g = 0;
-	u16 b = 0;
-	u16 a_alpha = srcA & 0x8000;
-	u16 b_alpha = srcB & 0x8000;
-	
-	if (a_alpha)
-	{
-		a = 0x8000;
-		r =  ((srcA        & 0x001F) * blendEVA);
-		g = (((srcA >>  5) & 0x001F) * blendEVA);
-		b = (((srcA >> 10) & 0x001F) * blendEVA);
-	}
-	
-	if (b_alpha)
-	{
-		a = 0x8000;
-		r +=  ((srcB        & 0x001F) * blendEVB);
-		g += (((srcB >>  5) & 0x001F) * blendEVB);
-		b += (((srcB >> 10) & 0x001F) * blendEVB);
-	}
-	
-	r >>= 4;
-	g >>= 4;
-	b >>= 4;
-	
-	//freedom wings sky will overflow while doing some fsaa/motionblur effect without this
-	r = (r > 31) ? 31 : r;
-	g = (g > 31) ? 31 : g;
-	b = (b > 31) ? 31 : b;
-	
-	return LOCAL_TO_LE_16(a | (b << 10) | (g << 5) | r);
-}
-
-template<NDSColorFormat COLORFORMAT>
-FragmentColor GPUEngineA::_RenderLine_DispCapture_BlendFunc(const FragmentColor srcA, const FragmentColor srcB, const u8 blendEVA, const u8 blendEVB)
-{
-	FragmentColor outColor;
-	outColor.color = 0;
-	
-	u16 r = 0;
-	u16 g = 0;
-	u16 b = 0;
-	
-	if (srcA.a > 0)
-	{
-		outColor.a  = (COLORFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
-		r  = srcA.r * blendEVA;
-		g  = srcA.g * blendEVA;
-		b  = srcA.b * blendEVA;
-	}
-	
-	if (srcB.a > 0)
-	{
-		outColor.a  = (COLORFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
-		r += srcB.r * blendEVB;
-		g += srcB.g * blendEVB;
-		b += srcB.b * blendEVB;
-	}
-	
-	r >>= 4;
-	g >>= 4;
-	b >>= 4;
-	
-	//freedom wings sky will overflow while doing some fsaa/motionblur effect without this
-	if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		outColor.r = (r > 255) ? 255 : r;
-		outColor.g = (g > 255) ? 255 : g;
-		outColor.b = (b > 255) ? 255 : b;
-	}
-	else
-	{
-		outColor.r = (r > 63) ? 63 : r;
-		outColor.g = (g > 63) ? 63 : g;
-		outColor.b = (b > 63) ? 63 : b;
-	}
-	
-	return outColor;
-}
-
-#ifdef ENABLE_SSE2
-template <NDSColorFormat COLORFORMAT>
-__m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB)
-{
-#ifdef ENABLE_SSSE3
-	__m128i blendAB = _mm_or_si128( blendEVA, _mm_slli_epi16(blendEVB, 8) );
-#endif
-	
-	switch (COLORFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-		{
-			__m128i srcA_alpha = _mm_and_si128(srcA, _mm_set1_epi16(0x8000));
-			__m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi16(0x8000));
-			__m128i srcA_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcA_alpha, _mm_setzero_si128()), srcA );
-			__m128i srcB_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcB_alpha, _mm_setzero_si128()), srcB );
-			__m128i colorBitMask = _mm_set1_epi16(0x001F);
-			
-			__m128i ra;
-			__m128i ga;
-			__m128i ba;
-			
-#ifdef ENABLE_SSSE3
-			ra = _mm_or_si128( _mm_and_si128(               srcA_masked,      colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 8), _mm_set1_epi16(0x1F00)) );
-			ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked,  5), colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 3), _mm_set1_epi16(0x1F00)) );
-			ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(srcB_masked, 2), _mm_set1_epi16(0x1F00)) );
-			
-			ra = _mm_maddubs_epi16(ra, blendAB);
-			ga = _mm_maddubs_epi16(ga, blendAB);
-			ba = _mm_maddubs_epi16(ba, blendAB);
-#else
-			ra = _mm_and_si128(               srcA_masked,      colorBitMask);
-			ga = _mm_and_si128(_mm_srli_epi16(srcA_masked,  5), colorBitMask);
-			ba = _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask);
-			
-			__m128i rb = _mm_and_si128(               srcB_masked,      colorBitMask);
-			__m128i gb = _mm_and_si128(_mm_srli_epi16(srcB_masked,  5), colorBitMask);
-			__m128i bb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 10), colorBitMask);
-			
-			ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) );
-			ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
-			ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
-#endif
-			
-			ra = _mm_srli_epi16(ra, 4);
-			ga = _mm_srli_epi16(ga, 4);
-			ba = _mm_srli_epi16(ba, 4);
-			
-			ra = _mm_min_epi16(ra, colorBitMask);
-			ga = _mm_min_epi16(ga, colorBitMask);
-			ba = _mm_min_epi16(ba, colorBitMask);
-			
-			return _mm_or_si128( _mm_or_si128(_mm_or_si128(ra, _mm_slli_epi16(ga,  5)), _mm_slli_epi16(ba, 10)), _mm_or_si128(srcA_alpha, srcB_alpha) );
-		}
-			
-		case NDSColorFormat_BGR666_Rev:
-		case NDSColorFormat_BGR888_Rev:
-		{
-			// Get color masks based on if the alpha value is 0. Colors with an alpha value
-			// equal to 0 are rejected.
-			__m128i srcA_alpha = _mm_and_si128(srcA, _mm_set1_epi32(0xFF000000));
-			__m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi32(0xFF000000));
-			__m128i srcA_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcA_alpha, _mm_setzero_si128()), srcA);
-			__m128i srcB_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcB_alpha, _mm_setzero_si128()), srcB);
-			
-			__m128i outColorLo;
-			__m128i outColorHi;
-			__m128i outColor;
-			
-			// Temporarily convert the color component values from 8-bit to 16-bit, and then
-			// do the blend calculation.
-#ifdef ENABLE_SSSE3
-			outColorLo = _mm_unpacklo_epi8(srcA_masked, srcB_masked);
-			outColorHi = _mm_unpackhi_epi8(srcA_masked, srcB_masked);
-			
-			outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
-			outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
-#else
-			__m128i srcA_maskedLo = _mm_unpacklo_epi8(srcA_masked, _mm_setzero_si128());
-			__m128i srcA_maskedHi = _mm_unpackhi_epi8(srcA_masked, _mm_setzero_si128());
-			__m128i srcB_maskedLo = _mm_unpacklo_epi8(srcB_masked, _mm_setzero_si128());
-			__m128i srcB_maskedHi = _mm_unpackhi_epi8(srcB_masked, _mm_setzero_si128());
-			
-			outColorLo = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedLo, blendEVA), _mm_mullo_epi16(srcB_maskedLo, blendEVB) );
-			outColorHi = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedHi, blendEVA), _mm_mullo_epi16(srcB_maskedHi, blendEVB) );
-#endif
-			
-			outColorLo = _mm_srli_epi16(outColorLo, 4);
-			outColorHi = _mm_srli_epi16(outColorHi, 4);
-			
-			// Convert the color components back from 16-bit to 8-bit using a saturated pack.
-			outColor = _mm_packus_epi16(outColorLo, outColorHi);
-			
-			// When the color format is 8888, the packuswb instruction will naturally clamp
-			// the color component values to 255. However, when the color format is 6665, the
-			// color component values must be clamped to 63. In this case, we must call pminub
-			// to do the clamp.
-			if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
-			{
-				outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
-			}
-			
-			// Add the alpha components back in.
-			outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
-			outColor = _mm_or_si128(outColor, srcA_alpha);
-			outColor = _mm_or_si128(outColor, srcB_alpha);
-			
-			return outColor;
-		}
-	}
-}
-#endif
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length, size_t l)
-{
-#ifdef ENABLE_SSE2
-	const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA);
-	const __m128i blendEVB_vec128 = _mm_set1_epi16(blendEVB);
-#endif
-	
-	size_t i = 0;
-	
-	if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		const FragmentColor *srcA_32 = (const FragmentColor *)srcA;
-		const FragmentColor *srcB_32 = (const FragmentColor *)srcB;
-		FragmentColor *dst32 = (FragmentColor *)dst;
-		
-#ifdef ENABLE_SSE2
-		const size_t ssePixCount = length - (length % 4);
-		for (; i < ssePixCount; i+=4)
-		{
-			const __m128i srcA_vec128 = _mm_load_si128((__m128i *)(srcA_32 + i));
-			const __m128i srcB_vec128 = _mm_load_si128((__m128i *)(srcB_32 + i));
-			
-			_mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2<OUTPUTFORMAT>(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
-		}
-#endif
-		
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-		for (; i < length; i++)
-		{
-			const FragmentColor colorA = srcA_32[i];
-			const FragmentColor colorB = srcB_32[i];
-			
-			dst32[i] = this->_RenderLine_DispCapture_BlendFunc<OUTPUTFORMAT>(colorA, colorB, blendEVA, blendEVB);
-		}
-	}
-	else
-	{
-		const u16 *srcA_16 = (const u16 *)srcA;
-		const u16 *srcB_16 = (const u16 *)srcB;
-		u16 *dst16 = (u16 *)dst;
-		
-#ifdef ENABLE_SSE2
-		const size_t ssePixCount = length - (length % 8);
-		for (; i < ssePixCount; i+=8)
-		{
-			const __m128i srcA_vec128 = _mm_load_si128((__m128i *)(srcA_16 + i));
-			const __m128i srcB_vec128 = _mm_load_si128((__m128i *)(srcB_16 + i));
-			
-			_mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2<NDSColorFormat_BGR555_Rev>(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
-		}
-#endif
-		
-#ifdef ENABLE_SSE2
-#pragma LOOPVECTORIZE_DISABLE
-#endif
-		for (; i < length; i++)
-		{
-			const u16 colorA = srcA_16[i];
-			const u16 colorB = srcB_16[i];
-			
-			dst16[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB);
-		}
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT, size_t CAPTURELENGTH, bool CAPTUREFROMNATIVESRCA, bool CAPTUREFROMNATIVESRCB, bool CAPTURETONATIVEDST>
-void GPUEngineA::_RenderLine_DispCapture_Blend(const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt, const size_t l)
-{
-	const u8 blendEVA = this->_dispCapCnt.EVA;
-	const u8 blendEVB = this->_dispCapCnt.EVB;
-	
-	if (CAPTURETONATIVEDST)
-	{
-#ifdef ENABLE_SSE2
-		const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA);
-		const __m128i blendEVB_vec128 = _mm_set1_epi16(blendEVB);
-		
-		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-		{
-			const u32 *srcA_32 = (const u32 *)srcA;
-			const u32 *srcB_32 = (const u32 *)srcB;
-			FragmentColor *dst32 = (FragmentColor *)dst;
-			
-			for (size_t i = 0; i < CAPTURELENGTH; i+=4)
-			{
-				__m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_32 + i)) : _mm_set_epi32(srcA_32[_gpuDstPitchIndex[i+3]],
-																														 srcA_32[_gpuDstPitchIndex[i+2]],
-																														 srcA_32[_gpuDstPitchIndex[i+1]],
-																														 srcA_32[_gpuDstPitchIndex[i+0]]);
-				
-				__m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_32 + i)) : _mm_set_epi32(srcB_32[_gpuDstPitchIndex[i+3]],
-																														 srcB_32[_gpuDstPitchIndex[i+2]],
-																														 srcB_32[_gpuDstPitchIndex[i+1]],
-																														 srcB_32[_gpuDstPitchIndex[i+0]]);
-				
-				_mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2<OUTPUTFORMAT>(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
-			}
-		}
-		else
-		{
-			const u16 *srcA_16 = (const u16 *)srcA;
-			const u16 *srcB_16 = (const u16 *)srcB;
-			u16 *dst16 = (u16 *)dst;
-			
-			for (size_t i = 0; i < CAPTURELENGTH; i+=8)
-			{
-				__m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_16 + i)) : _mm_set_epi16(srcA_16[_gpuDstPitchIndex[i+7]],
-																														 srcA_16[_gpuDstPitchIndex[i+6]],
-																														 srcA_16[_gpuDstPitchIndex[i+5]],
-																														 srcA_16[_gpuDstPitchIndex[i+4]],
-																														 srcA_16[_gpuDstPitchIndex[i+3]],
-																														 srcA_16[_gpuDstPitchIndex[i+2]],
-																														 srcA_16[_gpuDstPitchIndex[i+1]],
-																														 srcA_16[_gpuDstPitchIndex[i+0]]);
-				
-				__m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_16 + i)) : _mm_set_epi16(srcB_16[_gpuDstPitchIndex[i+7]],
-																														 srcB_16[_gpuDstPitchIndex[i+6]],
-																														 srcB_16[_gpuDstPitchIndex[i+5]],
-																														 srcB_16[_gpuDstPitchIndex[i+4]],
-																														 srcB_16[_gpuDstPitchIndex[i+3]],
-																														 srcB_16[_gpuDstPitchIndex[i+2]],
-																														 srcB_16[_gpuDstPitchIndex[i+1]],
-																														 srcB_16[_gpuDstPitchIndex[i+0]]);
-				
-				_mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2<NDSColorFormat_BGR555_Rev>(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
-			}
-		}
-#else
-		for (size_t i = 0; i < CAPTURELENGTH; i++)
-		{
-			if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-			{
-				const FragmentColor colorA = (CAPTUREFROMNATIVESRCA) ? ((const FragmentColor *)srcA)[i] : ((const FragmentColor *)srcA)[_gpuDstPitchIndex[i]];
-				const FragmentColor colorB = (CAPTUREFROMNATIVESRCB) ? ((const FragmentColor *)srcB)[i] : ((const FragmentColor *)srcB)[_gpuDstPitchIndex[i]];
-				
-				((FragmentColor *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc<OUTPUTFORMAT>(colorA, colorB, blendEVA, blendEVB);
-			}
-			else
-			{
-				const u16 colorA = (CAPTUREFROMNATIVESRCA) ? ((u16 *)srcA)[i] : ((u16 *)srcA)[_gpuDstPitchIndex[i]];
-				const u16 colorB = (CAPTUREFROMNATIVESRCB) ? ((u16 *)srcB)[i] : ((u16 *)srcB)[_gpuDstPitchIndex[i]];
-				
-				((u16 *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB);
-			}
-		}
-#endif
-	}
-	else
-	{
-		const size_t lineWidth = GPU->GetDisplayInfo().customWidth;
-		const size_t captureLineCount = _gpuCaptureLineCount[l];
-		
-		if (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH)
-		{
-			this->_RenderLine_DispCapture_BlendToCustomDstBuffer<OUTPUTFORMAT>(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt * captureLineCount, l);
-		}
-		else
-		{
-			for (size_t line = 0; line < captureLineCount; line++)
-			{
-				this->_RenderLine_DispCapture_BlendToCustomDstBuffer<OUTPUTFORMAT>(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt, l);
-				srcA = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcA + lineWidth) : (void *)((u16 *)srcA + lineWidth);
-				srcB = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcB + lineWidth) : (void *)((u16 *)srcB + lineWidth);
-				dst = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)dst + lineWidth) : (void *)((u16 *)dst + lineWidth);
-			}
-		}
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
-{
-	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
-	this->VerifyVRAMLineDidChange(DISPCNT.VRAM_Block, l);
-	
-	if (this->isLineCaptureNative[DISPCNT.VRAM_Block][l])
-	{
-		switch (OUTPUTFORMAT)
-		{
-			case NDSColorFormat_BGR555_Rev:
-				this->_LineCopy<1, true, true, 2>(this->nativeBuffer, this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block], l);
-				break;
-				
-			case NDSColorFormat_BGR666_Rev:
-			{
-				const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				u32 *dst = (u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				ColorspaceConvertBuffer555To6665Opaque<false, false>(src, dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				break;
-			}
-				
-			case NDSColorFormat_BGR888_Rev:
-			{
-				const u16 *src = this->_VRAMNativeBlockPtr[DISPCNT.VRAM_Block] + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				u32 *dst = (u32 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				ColorspaceConvertBuffer555To8888Opaque<false, false>(src, dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-				break;
-			}
-		}
-	}
-	else
-	{
-		const size_t customWidth = GPU->GetDisplayInfo().customWidth;
-		const size_t customPixCount = customWidth * _gpuDstLineCount[l];
-		
-		switch (OUTPUTFORMAT)
-		{
-			case NDSColorFormat_BGR555_Rev:
-				this->_LineCopy<0, true, true, 2>(this->customBuffer, this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block], l);
-				break;
-				
-			case NDSColorFormat_BGR666_Rev:
-			{
-				const u16 *src = (u16 *)this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block] + (_gpuDstLineIndex[l] * customWidth);
-				u32 *dst = (u32 *)this->customBuffer + (_gpuDstLineIndex[l] * customWidth);
-				ColorspaceConvertBuffer555To6665Opaque<false, false>(src, dst, customPixCount);
-				break;
-			}
-				
-			case NDSColorFormat_BGR888_Rev:
-			{
-				if (GPU->GetDisplayInfo().isCustomSizeRequested)
-				{
-					this->_LineCopy<0, true, true, 4>(this->customBuffer, this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block], l);
-				}
-				else
-				{
-					this->_LineCopy<1, true, true, 4>(this->nativeBuffer, this->_VRAMCustomBlockPtr[DISPCNT.VRAM_Block], l);
-				}
-				break;
-			}
-		}
-		
-		if ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)
-		{
-			this->isLineOutputNative[l] = false;
-			this->nativeLineOutputCount--;
-		}
-	}
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineA::_HandleDisplayModeMainMemory(const size_t l)
-{
-	// Native rendering only.
-	//
-	//this has not been tested since the dma timing for dispfifo was changed around the time of
-	//newemuloop. it may not work.
-	
-	u32 *dstColorLine = (u32 *)((u16 *)this->nativeBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH));
-	
-	switch (OUTPUTFORMAT)
-	{
-		case NDSColorFormat_BGR555_Rev:
-		{
-			u32 *dst = dstColorLine;
-			
-#ifdef ENABLE_SSE2
-			const __m128i alphaBit = _mm_set1_epi16(0x8000);
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
-			{
-				const __m128i fifoColor = _mm_setr_epi32(DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv());
-				_mm_store_si128((__m128i *)dst + i, _mm_or_si128(fifoColor, alphaBit));
-			}
-#else
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
-			{
-				dst[i] = DISP_FIFOrecv() | 0x80008000;
-			}
-#endif
-			break;
-		}
-			
-		case NDSColorFormat_BGR666_Rev:
-		{
-			FragmentColor *dst = (FragmentColor *)dstColorLine;
-			
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
-			{
-				u32 src = DISP_FIFOrecv();
-				dst[i+0].color = COLOR555TO6665_OPAQUE((src >>  0) & 0x7FFF);
-				dst[i+1].color = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF);
-			}
-			break;
-		}
-			
-		case NDSColorFormat_BGR888_Rev:
-		{
-			FragmentColor *dst = (FragmentColor *)dstColorLine;
-			
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
-			{
-				u32 src = DISP_FIFOrecv();
-				dst[i+0].color = COLOR555TO8888_OPAQUE((src >>  0) & 0x7FFF);
-				dst[i+1].color = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF);
-			}
-			break;
-		}
-	}
-}
-
-template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
-void GPUEngineA::_LineLarge8bpp(GPUEngineCompositorInfo &compInfo)
-{
-	u16 XBG = compInfo.renderState.selectedBGLayer->xOffset;
-	u16 YBG = compInfo.line.indexNative + compInfo.renderState.selectedBGLayer->yOffset;
-	u16 lg = compInfo.renderState.selectedBGLayer->size.width;
-	u16 ht = compInfo.renderState.selectedBGLayer->size.height;
-	u16 wmask = (lg-1);
-	u16 hmask = (ht-1);
-	YBG &= hmask;
-	
-	//TODO - handle wrapping / out of bounds correctly from rot_scale_op?
-	
-	u32 tmp_map = compInfo.renderState.selectedBGLayer->largeBMPAddress + lg * YBG;
-	u8 *__restrict map = (u8 *)MMU_gpu_map(tmp_map);
-	
-	for (size_t x = 0; x < lg; ++x, ++XBG)
-	{
-		XBG &= wmask;
-		
-		if (WILLDEFERCOMPOSITING)
-		{
-			this->_deferredIndexNative[x] = map[XBG];
-			this->_deferredColorNative[x] = LE_TO_LOCAL_16(this->_paletteBG[this->_deferredIndexNative[x]]);
-		}
-		else
-		{
-			const u8 index = map[XBG];
-			const u16 color = LE_TO_LOCAL_16(this->_paletteBG[index]);
-			this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (color != 0));
-		}
-	}
-}
-
-void GPUEngineA::LastLineProcess()
-{
-	this->GPUEngineBase::LastLineProcess();
-	DISP_FIFOreset();
-}
-
-GPUEngineB::GPUEngineB()
-{
-	_engineID = GPUEngineID_Sub;
-	_targetDisplayID = NDSDisplayID_Touch;
-	_IORegisterMap = (GPU_IOREG *)(&MMU.ARM9_REG[REG_DISPB]);
-	_paletteBG = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_1KB);
-	_paletteOBJ = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_1KB + ADDRESS_STEP_512B);
-	_oamList = (OAMAttributes *)(MMU.ARM9_OAM + ADDRESS_STEP_1KB);
-	_sprMem = MMU_BOBJ;
-}
-
-GPUEngineB::~GPUEngineB()
-{
-}
-
-GPUEngineB* GPUEngineB::Allocate()
-{
-	return new(malloc_aligned64(sizeof(GPUEngineB))) GPUEngineB();
-}
-
-void GPUEngineB::FinalizeAndDeallocate()
-{
-	this->~GPUEngineB();
-	free_aligned(this);
-}
-
-void GPUEngineB::Reset()
-{
-	this->_Reset_Base();
-	
-	this->_BGLayer[GPULayerID_BG0].BMPAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG1].BMPAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG2].BMPAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG3].BMPAddress = MMU_BBG;
-	
-	this->_BGLayer[GPULayerID_BG0].largeBMPAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG1].largeBMPAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG2].largeBMPAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG3].largeBMPAddress = MMU_BBG;
-	
-	this->_BGLayer[GPULayerID_BG0].tileMapAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG1].tileMapAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG2].tileMapAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG3].tileMapAddress = MMU_BBG;
-	
-	this->_BGLayer[GPULayerID_BG0].tileEntryAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG1].tileEntryAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG2].tileEntryAddress = MMU_BBG;
-	this->_BGLayer[GPULayerID_BG3].tileEntryAddress = MMU_BBG;
-	
-	this->SetDisplayByID(NDSDisplayID_Touch);
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUEngineB::RenderLine(const size_t l)
-{
-	const GPUEngineRenderState &renderState = this->_currentCompositorInfo[l].renderState;
-	
-	switch (renderState.displayOutputMode)
-	{
-		case GPUDisplayMode_Off: // Display Off(Display white)
-			this->_HandleDisplayModeOff<OUTPUTFORMAT>(l);
-			break;
-		
-		case GPUDisplayMode_Normal: // Display BG and OBJ layers
-		{
-			if (renderState.isAnyWindowEnabled)
-			{
-				this->_RenderLine_Layers<OUTPUTFORMAT, true>(l);
-			}
-			else
-			{
-				this->_RenderLine_Layers<OUTPUTFORMAT, false>(l);
-			}
-			
-			this->_HandleDisplayModeNormal<OUTPUTFORMAT>(l);
-			break;
-		}
-			
-		default:
-			break;
-	}
-}
-
-GPUSubsystem::GPUSubsystem()
-{
-	ColorspaceHandlerInit();
-	
-	_defaultEventHandler = new GPUEventHandlerDefault;
-	_event = _defaultEventHandler;
-	
-	gfx3d_init();
-	
-	_engineMain = GPUEngineA::Allocate();
-	_engineSub = GPUEngineB::Allocate();
-	
-	_display[NDSDisplayID_Main] = new NDSDisplay(NDSDisplayID_Main);
-	_display[NDSDisplayID_Main]->SetEngine(_engineMain);
-	_display[NDSDisplayID_Touch] = new NDSDisplay(NDSDisplayID_Touch);
-	_display[NDSDisplayID_Touch]->SetEngine(_engineSub);
-	
-	_videoFrameCount = 0;
-	_render3DFrameCount = 0;
-	_frameNeedsFinish = false;
-	_willFrameSkip = false;
-	_willPostprocessDisplays = true;
-	_willAutoResolveToCustomBuffer = true;
-	
-	//TODO OSD
-	//OSDCLASS *previousOSD = osd;
-	//osd = new OSDCLASS(-1);
-	//delete previousOSD;
-	
-	_displayInfo.colorFormat = NDSColorFormat_BGR555_Rev;
-	_displayInfo.pixelBytes = sizeof(u16);
-	_displayInfo.isCustomSizeRequested = false;
-	_displayInfo.customWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	_displayInfo.customHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	
-	_customVRAM = NULL;
-	_customVRAMBlank = NULL;
-	_displayInfo.framebufferSize = ((GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT) + (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT)) * 2 * _displayInfo.pixelBytes;
-	_masterFramebuffer = malloc_alignedPage(_displayInfo.framebufferSize * 2);
-	
-	_displayInfo.bufferIndex = 0;
-	_displayInfo.masterFramebufferHead = _masterFramebuffer;
-	_displayInfo.masterNativeBuffer = _masterFramebuffer;
-	_displayInfo.nativeBuffer[NDSDisplayID_Main] = _displayInfo.masterNativeBuffer;
-	_displayInfo.nativeBuffer[NDSDisplayID_Touch] = (u8 *)_displayInfo.masterNativeBuffer + (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * _displayInfo.pixelBytes);
-	
-	_displayInfo.masterCustomBuffer = (u8 *)_masterFramebuffer + (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * 2 * _displayInfo.pixelBytes);
-	_displayInfo.customBuffer[NDSDisplayID_Main] = _displayInfo.masterCustomBuffer;
-	_displayInfo.customBuffer[NDSDisplayID_Touch] = (u8 *)_displayInfo.masterCustomBuffer + (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * _displayInfo.pixelBytes);
-	
-	_displayInfo.didPerformCustomRender[NDSDisplayID_Main] = false;
-	_displayInfo.didPerformCustomRender[NDSDisplayID_Touch] = false;
-	_displayInfo.renderedWidth[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	_displayInfo.renderedWidth[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	_displayInfo.renderedHeight[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	_displayInfo.renderedHeight[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	_displayInfo.renderedBuffer[NDSDisplayID_Main] = _displayInfo.nativeBuffer[NDSDisplayID_Main];
-	_displayInfo.renderedBuffer[NDSDisplayID_Touch] = _displayInfo.nativeBuffer[NDSDisplayID_Touch];
-	
-	ClearWithColor(0x8000);
-}
-
-GPUSubsystem::~GPUSubsystem()
-{
-	//TODO OSD
-	//delete osd;
-	//osd = NULL;
-	
-	free_aligned(this->_masterFramebuffer);
-	free_aligned(this->_customVRAM);
-	
-	free_aligned(_gpuDstToSrcIndex);
-	_gpuDstToSrcIndex = NULL;
-	
-	free_aligned(_gpuDstToSrcSSSE3_u8_8e);
-	_gpuDstToSrcSSSE3_u8_8e = NULL;
-	free_aligned(_gpuDstToSrcSSSE3_u8_16e);
-	_gpuDstToSrcSSSE3_u8_16e = NULL;
-	free_aligned(_gpuDstToSrcSSSE3_u16_8e);
-	_gpuDstToSrcSSSE3_u16_8e = NULL;
-	free_aligned(_gpuDstToSrcSSSE3_u32_4e);
-	_gpuDstToSrcSSSE3_u32_4e = NULL;
-	
-	delete _display[NDSDisplayID_Main];
-	delete _display[NDSDisplayID_Touch];
-	_engineMain->FinalizeAndDeallocate();
-	_engineSub->FinalizeAndDeallocate();
-	
-	gfx3d_deinit();
-	
-	delete _defaultEventHandler;
-}
-
-void GPUSubsystem::_UpdateFPSRender3D()
-{
-	this->_videoFrameCount++;
-	if (this->_videoFrameCount == 60)
-	{
-		this->_render3DFrameCount = gfx3d.render3DFrameCount;
-		gfx3d.render3DFrameCount = 0;
-		this->_videoFrameCount = 0;
-	}
-}
-
-void GPUSubsystem::SetEventHandler(GPUEventHandler *eventHandler)
-{
-	this->_event = eventHandler;
-}
-
-GPUEventHandler* GPUSubsystem::GetEventHandler()
-{
-	return this->_event;
-}
-
-void GPUSubsystem::Reset()
-{
-	if (this->_customVRAM == NULL)
-	{
-		this->SetCustomFramebufferSize(this->_displayInfo.customWidth, this->_displayInfo.customHeight);
-	}
-	
-	this->_willFrameSkip = false;
-	this->_videoFrameCount = 0;
-	this->_render3DFrameCount = 0;
-	
-	this->ClearWithColor(0xFFFF);
-	
-	this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main]  = false;
-	this->_displayInfo.nativeBuffer[NDSDisplayID_Main]    = this->_displayInfo.masterNativeBuffer;
-	this->_displayInfo.customBuffer[NDSDisplayID_Main]    = this->_displayInfo.masterCustomBuffer;
-	this->_displayInfo.renderedWidth[NDSDisplayID_Main]   = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	this->_displayInfo.renderedHeight[NDSDisplayID_Main]  = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->_displayInfo.renderedBuffer[NDSDisplayID_Main]  = this->_displayInfo.nativeBuffer[NDSDisplayID_Main];
-	
-	this->_displayInfo.didPerformCustomRender[NDSDisplayID_Touch] = false;
-	this->_displayInfo.nativeBuffer[NDSDisplayID_Touch]   = (u8 *)this->_displayInfo.masterNativeBuffer + (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * this->_displayInfo.pixelBytes);
-	this->_displayInfo.customBuffer[NDSDisplayID_Touch]   = (u8 *)this->_displayInfo.masterCustomBuffer + (this->_displayInfo.customWidth * this->_displayInfo.customHeight * this->_displayInfo.pixelBytes);
-	this->_displayInfo.renderedWidth[NDSDisplayID_Touch]  = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->_displayInfo.renderedBuffer[NDSDisplayID_Touch] = this->_displayInfo.nativeBuffer[NDSDisplayID_Touch];
-	
-	this->_displayInfo.engineID[NDSDisplayID_Main] = GPUEngineID_Main;
-	this->_displayInfo.engineID[NDSDisplayID_Touch] = GPUEngineID_Sub;
-	
-	this->_display[NDSDisplayID_Main]->SetEngineByID(GPUEngineID_Main);
-	this->_display[NDSDisplayID_Touch]->SetEngineByID(GPUEngineID_Sub);
-	
-	gfx3d_reset();
-	this->_engineMain->Reset();
-	this->_engineSub->Reset();
-	
-	DISP_FIFOreset();
-
-	//historically, we reset the OSD here. maybe because we would want a clean drawing surface? anyway this is not the right point to be doing OSD work
-	//osd->clear();
-}
-
-void GPUSubsystem::ForceRender3DFinishAndFlush(bool willFlush)
-{
-	CurrentRenderer->RenderFinish();
-	CurrentRenderer->RenderFlush(willFlush, willFlush);
-}
-
-void GPUSubsystem::ForceFrameStop()
-{
-	if (CurrentRenderer->GetRenderNeedsFinish())
-	{
-		this->ForceRender3DFinishAndFlush(true);
-		CurrentRenderer->SetRenderNeedsFinish(false);
-		this->_event->DidRender3DEnd();
-	}
-	
-	if (this->_frameNeedsFinish)
-	{
-		this->_frameNeedsFinish = false;
-		this->_event->DidFrameEnd(false, this->_displayInfo);
-	}
-}
-
-bool GPUSubsystem::GetWillFrameSkip() const
-{
-	return this->_willFrameSkip;
-}
-
-void GPUSubsystem::SetWillFrameSkip(const bool willFrameSkip)
-{
-	this->_willFrameSkip = willFrameSkip;
-}
-
-void GPUSubsystem::SetDisplayCaptureEnable()
-{
-	this->_engineMain->SetDisplayCaptureEnable();
-}
-
-void GPUSubsystem::ResetDisplayCaptureEnable()
-{
-	this->_engineMain->ResetDisplayCaptureEnable();
-}
-
-void GPUSubsystem::UpdateRenderProperties()
-{
-	this->_engineMain->nativeLineRenderCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->_engineMain->nativeLineOutputCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->_engineSub->nativeLineRenderCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	this->_engineSub->nativeLineOutputCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	for (size_t l = 0; l < GPU_FRAMEBUFFER_NATIVE_HEIGHT; l++)
-	{
-		this->_engineMain->isLineRenderNative[l] = true;
-		this->_engineMain->isLineOutputNative[l] = true;
-		this->_engineSub->isLineRenderNative[l] = true;
-		this->_engineSub->isLineOutputNative[l] = true;
-	}
-	
-	this->_displayInfo.bufferIndex = (this->_displayInfo.bufferIndex + 1) & 0x01;
-	
-	const size_t nativeFramebufferSize = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * this->_displayInfo.pixelBytes;
-	const size_t customFramebufferSize = this->_displayInfo.customWidth * this->_displayInfo.customHeight * this->_displayInfo.pixelBytes;
-	
-	this->_displayInfo.masterNativeBuffer = (u8 *)this->_masterFramebuffer + (this->_displayInfo.bufferIndex * this->_displayInfo.framebufferSize);
-	this->_displayInfo.masterCustomBuffer = (u8 *)this->_masterFramebuffer + (nativeFramebufferSize * 2) + (this->_displayInfo.bufferIndex * this->_displayInfo.framebufferSize);
-	
-	this->_engineMain->nativeBuffer = (this->_engineMain->GetDisplayByID() == NDSDisplayID_Main) ? this->_displayInfo.masterNativeBuffer : (u8 *)this->_displayInfo.masterNativeBuffer + nativeFramebufferSize;
-	this->_engineMain->customBuffer = (this->_engineMain->GetDisplayByID() == NDSDisplayID_Main) ? this->_displayInfo.masterCustomBuffer : (u8 *)this->_displayInfo.masterCustomBuffer + customFramebufferSize;
-	this->_engineMain->renderedBuffer = this->_engineMain->nativeBuffer;
-	this->_engineMain->renderedWidth  = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	this->_engineMain->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	
-	this->_engineSub->nativeBuffer  = (this->_engineSub->GetDisplayByID()  == NDSDisplayID_Main) ? this->_displayInfo.masterNativeBuffer : (u8 *)this->_displayInfo.masterNativeBuffer + nativeFramebufferSize;
-	this->_engineSub->customBuffer  = (this->_engineSub->GetDisplayByID()  == NDSDisplayID_Main) ? this->_displayInfo.masterCustomBuffer : (u8 *)this->_displayInfo.masterCustomBuffer + customFramebufferSize;
-	this->_engineSub->renderedBuffer  = this->_engineSub->nativeBuffer;
-	this->_engineSub->renderedWidth   = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	this->_engineSub->renderedHeight  = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	
-	GPUEngineBase *mainEngine  = this->_display[NDSDisplayID_Main]->GetEngine();
-	this->_displayInfo.nativeBuffer[NDSDisplayID_Main]    = mainEngine->nativeBuffer;
-	this->_displayInfo.customBuffer[NDSDisplayID_Main]    = mainEngine->customBuffer;
-	this->_displayInfo.renderedBuffer[NDSDisplayID_Main]  = mainEngine->renderedBuffer;
-	this->_displayInfo.renderedWidth[NDSDisplayID_Main]   = mainEngine->renderedWidth;
-	this->_displayInfo.renderedHeight[NDSDisplayID_Main]  = mainEngine->renderedHeight;
-	
-	GPUEngineBase *touchEngine = this->_display[NDSDisplayID_Touch]->GetEngine();
-	this->_displayInfo.nativeBuffer[NDSDisplayID_Touch]   = touchEngine->nativeBuffer;
-	this->_displayInfo.customBuffer[NDSDisplayID_Touch]   = touchEngine->customBuffer;
-	this->_displayInfo.renderedBuffer[NDSDisplayID_Touch] = touchEngine->renderedBuffer;
-	this->_displayInfo.renderedWidth[NDSDisplayID_Touch]  = touchEngine->renderedWidth;
-	this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = touchEngine->renderedHeight;
-	
-	this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main] = false;
-	this->_displayInfo.didPerformCustomRender[NDSDisplayID_Touch] = false;
-	
-	if (!this->_displayInfo.isCustomSizeRequested && (this->_displayInfo.colorFormat != NDSColorFormat_BGR888_Rev))
-	{
-		return;
-	}
-	
-	// Iterate through VRAM banks A-D and determine if they will be used for this frame.
-	for (size_t i = 0; i < 4; i++)
-	{
-		if (this->_engineMain->nativeLineCaptureCount[i] == GPU_VRAM_BLOCK_LINES)
-		{
-			continue;
-		}
-		
-		switch (vramConfiguration.banks[i].purpose)
-		{
-			case VramConfiguration::ABG:
-			case VramConfiguration::BBG:
-			case VramConfiguration::LCDC:
-			case VramConfiguration::AOBJ:
-			case VramConfiguration::BOBJ:
-				break;
-				
-			default:
-			{
-				this->_engineMain->nativeLineCaptureCount[i] = GPU_VRAM_BLOCK_LINES;
-				for (size_t l = 0; l < GPU_VRAM_BLOCK_LINES; l++)
-				{
-					this->_engineMain->isLineCaptureNative[i][l] = true;
-				}
-				break;
-			}
-		}
-	}
-}
-
-const NDSDisplayInfo& GPUSubsystem::GetDisplayInfo()
-{
-	return this->_displayInfo;
-}
-
-u32 GPUSubsystem::GetFPSRender3D() const
-{
-	return this->_render3DFrameCount;
-}
-
-GPUEngineA* GPUSubsystem::GetEngineMain()
-{
-	return this->_engineMain;
-}
-
-GPUEngineB* GPUSubsystem::GetEngineSub()
-{
-	return this->_engineSub;
-}
-
-NDSDisplay* GPUSubsystem::GetDisplayMain()
-{
-	return this->_display[NDSDisplayID_Main];
-}
-
-NDSDisplay* GPUSubsystem::GetDisplayTouch()
-{
-	return this->_display[NDSDisplayID_Touch];
-}
-
-size_t GPUSubsystem::GetCustomFramebufferWidth() const
-{
-	return this->_displayInfo.customWidth;
-}
-
-size_t GPUSubsystem::GetCustomFramebufferHeight() const
-{
-	return this->_displayInfo.customHeight;
-}
-
-void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h)
-{
-	if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT)
-	{
-		return;
-	}
-	
-	const float customWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	const float customHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	const float newGpuLargestDstLineCount = (size_t)ceilf(customHeightScale);
-	
-	u16 *oldGpuDstToSrcIndexPtr = _gpuDstToSrcIndex;
-	u8 *oldGpuDstToSrcSSSE3_u8_8e = _gpuDstToSrcSSSE3_u8_8e;
-	u8 *oldGpuDstToSrcSSSE3_u8_16e = _gpuDstToSrcSSSE3_u8_16e;
-	u8 *oldGpuDstToSrcSSSE3_u16_8e = _gpuDstToSrcSSSE3_u16_8e;
-	u8 *oldGpuDstToSrcSSSE3_u32_4e = _gpuDstToSrcSSSE3_u32_4e;
-	
-	for (size_t srcX = 0, currentPitchCount = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; srcX++)
-	{
-		const size_t pitch = (size_t)ceilf((srcX+1) * customWidthScale) - currentPitchCount;
-		_gpuDstPitchCount[srcX] = pitch;
-		_gpuDstPitchIndex[srcX] = currentPitchCount;
-		currentPitchCount += pitch;
-	}
-	
-	for (size_t srcY = 0, currentLineCount = 0; srcY < GPU_FRAMEBUFFER_NATIVE_HEIGHT; srcY++)
-	{
-		const size_t lineCount = (size_t)ceilf((srcY+1) * customHeightScale) - currentLineCount;
-		_gpuDstLineCount[srcY] = lineCount;
-		_gpuDstLineIndex[srcY] = currentLineCount;
-		currentLineCount += lineCount;
-	}
-	
-	for (size_t srcY = 0, currentLineCount = 0; srcY < GPU_VRAM_BLOCK_LINES + 1; srcY++)
-	{
-		const size_t lineCount = (size_t)ceilf((srcY+1) * customHeightScale) - currentLineCount;
-		_gpuCaptureLineCount[srcY] = lineCount;
-		_gpuCaptureLineIndex[srcY] = currentLineCount;
-		currentLineCount += lineCount;
-	}
-	
-	u16 *newGpuDstToSrcIndex = (u16 *)malloc_alignedCacheLine(w * h * sizeof(u16));
-	u16 *newGpuDstToSrcPtr = newGpuDstToSrcIndex;
-	for (size_t y = 0, dstIdx = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
-	{
-		if (_gpuDstLineCount[y] < 1)
-		{
-			continue;
-		}
-		
-		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
-		{
-			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
-			{
-				newGpuDstToSrcIndex[dstIdx++] = (y * GPU_FRAMEBUFFER_NATIVE_WIDTH) + x;
-			}
-		}
-		
-		for (size_t l = 1; l < _gpuDstLineCount[y]; l++)
-		{
-			memcpy(newGpuDstToSrcPtr + (w * l), newGpuDstToSrcPtr, w * sizeof(u16));
-		}
-		
-		newGpuDstToSrcPtr += (w * _gpuDstLineCount[y]);
-		dstIdx += (w * (_gpuDstLineCount[y] - 1));
-	}
-	
-	u8 *newGpuDstToSrcSSSE3_u8_8e = (u8 *)malloc_alignedCacheLine(w * sizeof(u8));
-	u8 *newGpuDstToSrcSSSE3_u8_16e = (u8 *)malloc_alignedCacheLine(w * sizeof(u8));
-	u8 *newGpuDstToSrcSSSE3_u16_8e = (u8 *)malloc_alignedCacheLine(w * sizeof(u16));
-	u8 *newGpuDstToSrcSSSE3_u32_4e = (u8 *)malloc_alignedCacheLine(w * sizeof(u32));
-	
-	for (size_t i = 0; i < w; i++)
-	{
-		const u8 value_u8_4 = newGpuDstToSrcIndex[i] & 0x03;
-		const u8 value_u8_8 = newGpuDstToSrcIndex[i] & 0x07;
-		const u8 value_u8_16 = newGpuDstToSrcIndex[i] & 0x0F;
-		const u8 value_u16 = (value_u8_8 << 1);
-		const u8 value_u32 = (value_u8_4 << 2);
-		
-		newGpuDstToSrcSSSE3_u8_8e[i] = value_u8_8;
-		newGpuDstToSrcSSSE3_u8_16e[i] = value_u8_16;
-		
-		newGpuDstToSrcSSSE3_u16_8e[(i << 1) + 0] = value_u16 + 0;
-		newGpuDstToSrcSSSE3_u16_8e[(i << 1) + 1] = value_u16 + 1;
-		
-		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 0] = value_u32 + 0;
-		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 1] = value_u32 + 1;
-		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 2] = value_u32 + 2;
-		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 3] = value_u32 + 3;
-	}
-	
-	_gpuLargestDstLineCount = newGpuLargestDstLineCount;
-	_gpuVRAMBlockOffset = _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w;
-	_gpuDstToSrcIndex = newGpuDstToSrcIndex;
-	_gpuDstToSrcSSSE3_u8_8e = newGpuDstToSrcSSSE3_u8_8e;
-	_gpuDstToSrcSSSE3_u8_16e = newGpuDstToSrcSSSE3_u8_16e;
-	_gpuDstToSrcSSSE3_u16_8e = newGpuDstToSrcSSSE3_u16_8e;
-	_gpuDstToSrcSSSE3_u32_4e = newGpuDstToSrcSSSE3_u32_4e;
-	
-	this->_displayInfo.isCustomSizeRequested = ( (w != GPU_FRAMEBUFFER_NATIVE_WIDTH) || (h != GPU_FRAMEBUFFER_NATIVE_HEIGHT) );
-	this->_displayInfo.customWidth = w;
-	this->_displayInfo.customHeight = h;
-	
-	if (!this->_displayInfo.isCustomSizeRequested)
-	{
-		this->_engineMain->ResetCaptureLineStates();
-	}
-	
-	if (this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main])
-	{
-		this->_displayInfo.renderedWidth[NDSDisplayID_Main] = this->_displayInfo.customWidth;
-		this->_displayInfo.renderedHeight[NDSDisplayID_Main] = this->_displayInfo.customHeight;
-	}
-	else
-	{
-		this->_displayInfo.renderedWidth[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-		this->_displayInfo.renderedHeight[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	}
-	
-	if (this->_displayInfo.didPerformCustomRender[NDSDisplayID_Touch])
-	{
-		this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = this->_displayInfo.customWidth;
-		this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = this->_displayInfo.customHeight;
-	}
-	else
-	{
-		this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
-		this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
-	}
-	
-	this->_AllocateFramebuffers(this->_displayInfo.colorFormat, w, h);
-	
-	free_aligned(oldGpuDstToSrcIndexPtr);
-	free_aligned(oldGpuDstToSrcSSSE3_u8_8e);
-	free_aligned(oldGpuDstToSrcSSSE3_u8_16e);
-	free_aligned(oldGpuDstToSrcSSSE3_u16_8e);
-	free_aligned(oldGpuDstToSrcSSSE3_u32_4e);
-}
-
-void GPUSubsystem::SetColorFormat(const NDSColorFormat outputFormat)
-{
-	//check for no-op
-	if(this->_displayInfo.colorFormat == outputFormat)
-		return;
-
-	this->_displayInfo.colorFormat = outputFormat;
-	this->_displayInfo.pixelBytes = (outputFormat == NDSColorFormat_BGR555_Rev) ? sizeof(u16) : sizeof(FragmentColor);
-	
-	if (!this->_displayInfo.isCustomSizeRequested)
-	{
-		this->_engineMain->ResetCaptureLineStates();
-	}
-	
-	this->_AllocateFramebuffers(this->_displayInfo.colorFormat, this->_displayInfo.customWidth, this->_displayInfo.customHeight);
-}
-
-NDSColorFormat GPUSubsystem::GetColorFormat() const
-{
-	return this->_displayInfo.colorFormat;
-}
-
-void GPUSubsystem::_AllocateFramebuffers(NDSColorFormat outputFormat, size_t w, size_t h)
-{
-	void *oldMasterFramebuffer = this->_masterFramebuffer;
-	void *oldCustomVRAM = this->_customVRAM;
-	
-	const size_t pixelBytes = (outputFormat == NDSColorFormat_BGR555_Rev) ? sizeof(u16) : sizeof(FragmentColor);
-	const size_t newCustomVRAMBlockSize = _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w;
-	const size_t newCustomVRAMBlankSize = _gpuLargestDstLineCount * GPU_VRAM_BLANK_REGION_LINES * w;
-	const size_t nativeFramebufferSize = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * pixelBytes;
-	const size_t customFramebufferSize = w * h * pixelBytes;
-	
-	void *newCustomVRAM = NULL;
-	
-	this->_displayInfo.framebufferSize = (nativeFramebufferSize * 2) + (customFramebufferSize * 2);
-	this->_masterFramebuffer = malloc_alignedPage(this->_displayInfo.framebufferSize * 2);
-	this->_displayInfo.masterFramebufferHead = this->_masterFramebuffer;
-	this->_displayInfo.masterNativeBuffer = (u8 *)this->_masterFramebuffer + (this->_displayInfo.bufferIndex * this->_displayInfo.framebufferSize);
-	this->_displayInfo.masterCustomBuffer = (u8 *)this->_masterFramebuffer + (nativeFramebufferSize * 2) + (this->_displayInfo.bufferIndex * this->_displayInfo.framebufferSize);
-	
-	GPUEngineBase *mainEngine  = this->_display[NDSDisplayID_Main]->GetEngine();
-	this->_displayInfo.nativeBuffer[NDSDisplayID_Main]    = mainEngine->nativeBuffer;
-	this->_displayInfo.customBuffer[NDSDisplayID_Main]    = mainEngine->customBuffer;
-	this->_displayInfo.renderedBuffer[NDSDisplayID_Main]  = mainEngine->renderedBuffer;
-	this->_displayInfo.renderedWidth[NDSDisplayID_Main]   = mainEngine->renderedWidth;
-	this->_displayInfo.renderedHeight[NDSDisplayID_Main]  = mainEngine->renderedHeight;
-	
-	GPUEngineBase *touchEngine = this->_display[NDSDisplayID_Touch]->GetEngine();
-	this->_displayInfo.nativeBuffer[NDSDisplayID_Touch]   = touchEngine->nativeBuffer;
-	this->_displayInfo.customBuffer[NDSDisplayID_Touch]   = touchEngine->customBuffer;
-	this->_displayInfo.renderedBuffer[NDSDisplayID_Touch] = touchEngine->renderedBuffer;
-	this->_displayInfo.renderedWidth[NDSDisplayID_Touch]  = touchEngine->renderedWidth;
-	this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = touchEngine->renderedHeight;
-	
-	switch (outputFormat)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			newCustomVRAM = (void *)malloc_alignedCacheLine(((newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize) * sizeof(u16));
-			memset(newCustomVRAM, 0, ((newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize) * sizeof(u16));
-			memset_u16(this->_masterFramebuffer, 0x8000, (this->_displayInfo.framebufferSize * 2) / sizeof(u16));
-			this->_customVRAM = newCustomVRAM;
-			this->_customVRAMBlank = (u16 *)newCustomVRAM + (newCustomVRAMBlockSize * 4);
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			newCustomVRAM = (void *)malloc_alignedCacheLine(((newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize) * sizeof(u16));
-			memset(newCustomVRAM, 0, ((newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize) * sizeof(u16));
-			memset_u32(this->_masterFramebuffer, 0x1F000000, (this->_displayInfo.framebufferSize * 2) / sizeof(FragmentColor));
-			this->_customVRAM = newCustomVRAM;
-			this->_customVRAMBlank = (u16 *)newCustomVRAM + (newCustomVRAMBlockSize * 4);
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			newCustomVRAM = (void *)malloc_alignedCacheLine(((newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize) * sizeof(FragmentColor));
-			memset(newCustomVRAM, 0, ((newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize) * sizeof(FragmentColor));
-			memset_u32(this->_masterFramebuffer, 0xFF000000, (this->_displayInfo.framebufferSize * 2) / sizeof(FragmentColor));
-			this->_customVRAM = newCustomVRAM;
-			this->_customVRAMBlank = (FragmentColor *)newCustomVRAM + (newCustomVRAMBlockSize * 4);
-			break;
-			
-		default:
-			break;
-	}
-	
-	this->_engineMain->SetCustomFramebufferSize(w, h);
-	this->_engineSub->SetCustomFramebufferSize(w, h);
-	
-	BaseRenderer->SetFramebufferSize(w, h); // Since BaseRenderer is persistent, we need to update this manually.
-	if (CurrentRenderer != BaseRenderer)
-	{
-		CurrentRenderer->RequestColorFormat(outputFormat);
-		CurrentRenderer->SetFramebufferSize(w, h);
-	}
-	
-	free_aligned(oldMasterFramebuffer);
-	free_aligned(oldCustomVRAM);
-}
-
-void* GPUSubsystem::GetCustomVRAMBuffer()
-{
-	return this->_customVRAM;
-}
-
-void* GPUSubsystem::GetCustomVRAMBlankBuffer()
-{
-	return this->_customVRAMBlank;
-}
-
-template <NDSColorFormat COLORFORMAT>
-void* GPUSubsystem::GetCustomVRAMAddressUsingMappedAddress(const u32 mappedAddr, const size_t offset)
-{
-	const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(mappedAddr) - MMU.ARM9_LCD) / sizeof(u16);
-	if (vramPixel >= (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4))
-	{
-		return this->_customVRAMBlank;
-	}
-	
-	const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-	const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
-	const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	const size_t linePixel = blockPixel % GPU_FRAMEBUFFER_NATIVE_WIDTH;
-	
-	return (COLORFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)this->GetEngineMain()->GetCustomVRAMBlockPtr(blockID) + (_gpuCaptureLineIndex[blockLine] * this->_displayInfo.customWidth) + _gpuDstPitchIndex[linePixel] + offset) : (void *)((u16 *)this->GetEngineMain()->GetCustomVRAMBlockPtr(blockID) + (_gpuCaptureLineIndex[blockLine] * this->_displayInfo.customWidth) + _gpuDstPitchIndex[linePixel] + offset);
-}
-
-bool GPUSubsystem::GetWillPostprocessDisplays() const
-{
-	return this->_willPostprocessDisplays;
-}
-
-void GPUSubsystem::SetWillPostprocessDisplays(const bool willPostprocess)
-{
-	this->_willPostprocessDisplays = willPostprocess;
-}
-
-void GPUSubsystem::PostprocessDisplay(const NDSDisplayID displayID, NDSDisplayInfo &mutableInfo)
-{
-	if (mutableInfo.isDisplayEnabled[displayID])
-	{
-		if (mutableInfo.colorFormat == NDSColorFormat_BGR666_Rev)
-		{
-			if (mutableInfo.needConvertColorFormat[displayID])
-			{
-				ColorspaceConvertBuffer6665To8888<false, false>((u32 *)mutableInfo.renderedBuffer[displayID], (u32 *)mutableInfo.renderedBuffer[displayID], mutableInfo.renderedWidth[displayID] * mutableInfo.renderedHeight[displayID]);
-			}
-			
-			if (mutableInfo.needApplyMasterBrightness[displayID])
-			{
-				this->_display[displayID]->GetEngine()->ApplyMasterBrightness<NDSColorFormat_BGR888_Rev>(mutableInfo);
-			}
-		}
-		else
-		{
-			if (mutableInfo.needApplyMasterBrightness[displayID])
-			{
-				switch (mutableInfo.colorFormat)
-				{
-					case NDSColorFormat_BGR555_Rev:
-						this->_display[displayID]->GetEngine()->ApplyMasterBrightness<NDSColorFormat_BGR555_Rev>(mutableInfo);
-						break;
-						
-					case NDSColorFormat_BGR666_Rev:
-						this->_display[displayID]->GetEngine()->ApplyMasterBrightness<NDSColorFormat_BGR666_Rev>(mutableInfo);
-						break;
-						
-					case NDSColorFormat_BGR888_Rev:
-						this->_display[displayID]->GetEngine()->ApplyMasterBrightness<NDSColorFormat_BGR888_Rev>(mutableInfo);
-						break;
-						
-					default:
-						break;
-				}
-			}
-		}
-	}
-	else
-	{
-		if (mutableInfo.colorFormat == NDSColorFormat_BGR555_Rev)
-		{
-			memset(mutableInfo.renderedBuffer[displayID], 0, mutableInfo.renderedWidth[displayID] * mutableInfo.renderedHeight[displayID] * sizeof(u16));
-		}
-		else
-		{
-			memset(mutableInfo.renderedBuffer[displayID], 0, mutableInfo.renderedWidth[displayID] * mutableInfo.renderedHeight[displayID] * sizeof(u32));
-		}
-	}
-	
-	mutableInfo.needConvertColorFormat[displayID] = false;
-	mutableInfo.needApplyMasterBrightness[displayID] = false;
-}
-
-void GPUSubsystem::ResolveDisplayToCustomFramebuffer(const NDSDisplayID displayID, NDSDisplayInfo &mutableInfo)
-{
-	this->_display[displayID]->GetEngine()->ResolveToCustomFramebuffer(mutableInfo);
-}
-
-bool GPUSubsystem::GetWillAutoResolveToCustomBuffer() const
-{
-	return this->_willAutoResolveToCustomBuffer;
-}
-
-void GPUSubsystem::SetWillAutoResolveToCustomBuffer(const bool willAutoResolve)
-{
-	this->_willAutoResolveToCustomBuffer = willAutoResolve;
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void GPUSubsystem::RenderLine(const size_t l)
-{
-	if (!this->_frameNeedsFinish)
-	{
-		u8 targetBufferIndex = this->_displayInfo.bufferIndex;
-		
-		if ( (l == 0) && !this->_willFrameSkip )
-		{
-			targetBufferIndex = (targetBufferIndex + 1) & 0x01;
-		}
-		
-		this->_event->DidFrameBegin(this->_willFrameSkip, targetBufferIndex, l);
-		this->_frameNeedsFinish = true;
-	}
-	
-	this->_engineMain->UpdateRenderStates(l);
-	this->_engineSub->UpdateRenderStates(l);
-	
-	const bool isDisplayCaptureNeeded = this->_engineMain->WillDisplayCapture(l);
-	const bool isFramebufferRenderNeeded[2]	= { CommonSettings.showGpu.main, CommonSettings.showGpu.sub };
-	
-	if (l == 0)
-	{
-		if (!this->_willFrameSkip)
-		{
-			this->UpdateRenderProperties();
-		}
-	}
-	
-	if ( (isFramebufferRenderNeeded[GPUEngineID_Main] || isDisplayCaptureNeeded) && !this->_willFrameSkip )
-	{
-		// GPUEngineA:WillRender3DLayer() and GPUEngineA:WillCapture3DLayerDirect() both rely on register
-		// states that might change on a per-line basis. Therefore, we need to check these states on a
-		// per-line basis as well. While most games will set up these states by line 0 and keep these
-		// states constant all the way through line 191, this may not always be the case.
-		//
-		// Test case: If a conversation occurs in Advance Wars: Dual Strike where the conversation
-		// originates from the top of the screen, the BG0 layer will only be enabled at line 46. This
-		// means that we need to check the states at that particular time to ensure that the 3D renderer
-		// finishes before we read the 3D framebuffer. Otherwise, the map will render incorrectly.
-		
-		const bool need3DCaptureFramebuffer = this->_engineMain->WillCapture3DLayerDirect(l);
-		const bool need3DDisplayFramebuffer = this->_engineMain->WillRender3DLayer() || ((OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) && need3DCaptureFramebuffer);
-		
-		if (need3DCaptureFramebuffer || need3DDisplayFramebuffer)
-		{
-			if (CurrentRenderer->GetRenderNeedsFinish())
-			{
-				CurrentRenderer->RenderFinish();
-				CurrentRenderer->SetRenderNeedsFinish(false);
-				this->_event->DidRender3DEnd();
-			}
-			
-			CurrentRenderer->RenderFlush(need3DDisplayFramebuffer && CurrentRenderer->GetRenderNeedsFlushMain(),
-			                             need3DCaptureFramebuffer && CurrentRenderer->GetRenderNeedsFlush16());
-		}
-		
-		this->_engineMain->RenderLine<OUTPUTFORMAT>(l);
-	}
-	else
-	{
-		this->_engineMain->UpdatePropertiesWithoutRender(l);
-	}
-	
-	if (isFramebufferRenderNeeded[GPUEngineID_Sub] && !this->_willFrameSkip)
-	{
-		this->_engineSub->RenderLine<OUTPUTFORMAT>(l);
-	}
-	else
-	{
-		this->_engineSub->UpdatePropertiesWithoutRender(l);
-	}
-	
-	if (l == 191)
-	{
-		this->_engineMain->LastLineProcess();
-		this->_engineSub->LastLineProcess();
-		
-		this->_UpdateFPSRender3D();
-		
-		if (!this->_willFrameSkip)
-		{
-			if (this->_displayInfo.isCustomSizeRequested)
-			{
-				this->_engineMain->ResolveCustomRendering<OUTPUTFORMAT>();
-				this->_engineSub->ResolveCustomRendering<OUTPUTFORMAT>();
-			}
-			
-			this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main] = (this->_display[NDSDisplayID_Main]->GetEngine()->nativeLineOutputCount < GPU_FRAMEBUFFER_NATIVE_HEIGHT);
-			this->_displayInfo.renderedBuffer[NDSDisplayID_Main] = this->_display[NDSDisplayID_Main]->GetEngine()->renderedBuffer;
-			this->_displayInfo.renderedWidth[NDSDisplayID_Main] = this->_display[NDSDisplayID_Main]->GetEngine()->renderedWidth;
-			this->_displayInfo.renderedHeight[NDSDisplayID_Main] = this->_display[NDSDisplayID_Main]->GetEngine()->renderedHeight;
-			
-			this->_displayInfo.didPerformCustomRender[NDSDisplayID_Touch] = (this->_display[NDSDisplayID_Touch]->GetEngine()->nativeLineOutputCount < GPU_FRAMEBUFFER_NATIVE_HEIGHT);
-			this->_displayInfo.renderedBuffer[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngine()->renderedBuffer;
-			this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngine()->renderedWidth;
-			this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngine()->renderedHeight;
-			
-			this->_displayInfo.engineID[NDSDisplayID_Main]  = this->_display[NDSDisplayID_Main]->GetEngineID();
-			this->_displayInfo.engineID[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngineID();
-			
-			this->_displayInfo.isDisplayEnabled[NDSDisplayID_Main]  = CommonSettings.showGpu.screens[this->_displayInfo.engineID[NDSDisplayID_Main]];
-			this->_displayInfo.isDisplayEnabled[NDSDisplayID_Touch] = CommonSettings.showGpu.screens[this->_displayInfo.engineID[NDSDisplayID_Touch]];
-			
-			this->_displayInfo.needConvertColorFormat[NDSDisplayID_Main]  = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev);
-			this->_displayInfo.needConvertColorFormat[NDSDisplayID_Touch] = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev);
-			
-			this->_engineMain->UpdateMasterBrightnessDisplayInfo(this->_displayInfo);
-			this->_engineSub->UpdateMasterBrightnessDisplayInfo(this->_displayInfo);
-			
-			if (this->_willPostprocessDisplays)
-			{
-				this->PostprocessDisplay(NDSDisplayID_Main,  this->_displayInfo);
-				this->PostprocessDisplay(NDSDisplayID_Touch, this->_displayInfo);
-			}
-			
-			if (this->_willAutoResolveToCustomBuffer)
-			{
-				this->ResolveDisplayToCustomFramebuffer(NDSDisplayID_Main,  this->_displayInfo);
-				this->ResolveDisplayToCustomFramebuffer(NDSDisplayID_Touch, this->_displayInfo);
-			}
-		}
-		
-		if (this->_frameNeedsFinish)
-		{
-			this->_frameNeedsFinish = false;
-			this->_event->DidFrameEnd(this->_willFrameSkip, this->_displayInfo);
-		}
-	}
-}
-
-void GPUSubsystem::ClearWithColor(const u16 colorBGRA5551)
-{
-	u16 color16 = colorBGRA5551;
-	FragmentColor color32;
-	
-	switch (this->_displayInfo.colorFormat)
-	{
-		case NDSColorFormat_BGR555_Rev:
-			color16 = colorBGRA5551 | 0x8000;
-			break;
-			
-		case NDSColorFormat_BGR666_Rev:
-			color32.color = COLOR555TO6665_OPAQUE(colorBGRA5551 & 0x7FFF);
-			break;
-			
-		case NDSColorFormat_BGR888_Rev:
-			color32.color = COLOR555TO8888_OPAQUE(colorBGRA5551 & 0x7FFF);
-			break;
-			
-		default:
-			break;
-	}
-	
-	switch (this->_displayInfo.pixelBytes)
-	{
-		case 2:
-			memset_u16(this->_masterFramebuffer, color16, (this->_displayInfo.framebufferSize * 2) / this->_displayInfo.pixelBytes);
-			break;
-			
-		case 4:
-			memset_u32(this->_masterFramebuffer, color32.color, (this->_displayInfo.framebufferSize * 2) / this->_displayInfo.pixelBytes);
-			break;
-			
-		default:
-			break;
-	}
-}
-
-GPUClientFetchObject::GPUClientFetchObject()
-{
-	memset(&_fetchDisplayInfo[0], 0, sizeof(NDSDisplayInfo));
-	memset(&_fetchDisplayInfo[1], 0, sizeof(NDSDisplayInfo));
-	_clientData = NULL;
-	_lastFetchIndex = 0;
-}
-
-void GPUClientFetchObject::Init()
-{
-	// Do nothing. This is implementation dependent.
-}
-
-void GPUClientFetchObject::SetFetchBuffers(const NDSDisplayInfo &currentDisplayInfo)
-{
-	// Do nothing. This is implementation dependent.
-}
-
-void GPUClientFetchObject::FetchFromBufferIndex(const u8 index)
-{
-	if (this->_fetchDisplayInfo[index].isDisplayEnabled[NDSDisplayID_Main])
-	{
-		if (!this->_fetchDisplayInfo[index].didPerformCustomRender[NDSDisplayID_Main])
-		{
-			this->_FetchNativeDisplayByID(NDSDisplayID_Main, index);
-		}
-		else
-		{
-			this->_FetchCustomDisplayByID(NDSDisplayID_Main, index);
-		}
-	}
-	
-	if (this->_fetchDisplayInfo[index].isDisplayEnabled[NDSDisplayID_Touch])
-	{
-		if (!this->_fetchDisplayInfo[index].didPerformCustomRender[NDSDisplayID_Touch])
-		{
-			this->_FetchNativeDisplayByID(NDSDisplayID_Touch, index);
-		}
-		else
-		{
-			this->_FetchCustomDisplayByID(NDSDisplayID_Touch, index);
-		}
-	}
-	
-	this->SetLastFetchIndex(index);
-}
-
-void GPUClientFetchObject::_FetchNativeDisplayByID(const NDSDisplayID displayID, const u8 bufferIndex)
-{
-	// Do nothing. This is implementation dependent.
-}
-
-void GPUClientFetchObject::_FetchCustomDisplayByID(const NDSDisplayID displayID, const u8 bufferIndex)
-{
-	// Do nothing. This is implementation dependent.
-}
-
-const NDSDisplayInfo& GPUClientFetchObject::GetFetchDisplayInfoForBufferIndex(const u8 bufferIndex) const
-{
-	return this->_fetchDisplayInfo[bufferIndex];
-}
-
-void GPUClientFetchObject::SetFetchDisplayInfo(const NDSDisplayInfo &displayInfo)
-{
-	this->_fetchDisplayInfo[displayInfo.bufferIndex] = displayInfo;
-}
-
-u8 GPUClientFetchObject::GetLastFetchIndex() const
-{
-	return this->_lastFetchIndex;
-}
-
-void GPUClientFetchObject::SetLastFetchIndex(const u8 fetchIndex)
-{
-	this->_lastFetchIndex = fetchIndex;
-}
-
-void* GPUClientFetchObject::GetClientData() const
-{
-	return this->_clientData;
-}
-
-void GPUClientFetchObject::SetClientData(void *clientData)
-{
-	this->_clientData = clientData;
-}
-
-NDSDisplay::NDSDisplay()
-{
-	_ID = NDSDisplayID_Main;
-	_gpu = NULL;
-}
-
-NDSDisplay::NDSDisplay(const NDSDisplayID displayID)
-{
-	_ID = displayID;
-	_gpu = NULL;
-}
-
-NDSDisplay::NDSDisplay(const NDSDisplayID displayID, GPUEngineBase *theEngine)
-{
-	_ID = displayID;
-	_gpu = theEngine;
-}
-
-GPUEngineBase* NDSDisplay::GetEngine()
-{
-	return this->_gpu;
-}
-
-void NDSDisplay::SetEngine(GPUEngineBase *theEngine)
-{
-	this->_gpu = theEngine;
-}
-
-GPUEngineID NDSDisplay::GetEngineID()
-{
-	return this->_gpu->GetEngineID();
-}
-
-void NDSDisplay::SetEngineByID(const GPUEngineID theID)
-{
-	this->_gpu = (theID == GPUEngineID_Main) ? (GPUEngineBase *)GPU->GetEngineMain() : (GPUEngineBase *)GPU->GetEngineSub();
-	this->_gpu->SetDisplayByID(this->_ID);
-}
-
-template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG0>();
-template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG1>();
-template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG2>();
-template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG3>();
-
-template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG0>();
-template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG1>();
-template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG2>();
-template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG3>();
-
-template void GPUEngineBase::ParseReg_WINnH<0>();
-template void GPUEngineBase::ParseReg_WINnH<1>();
-
-template void GPUEngineBase::ParseReg_BGnX<GPULayerID_BG2>();
-template void GPUEngineBase::ParseReg_BGnY<GPULayerID_BG2>();
-template void GPUEngineBase::ParseReg_BGnX<GPULayerID_BG3>();
-template void GPUEngineBase::ParseReg_BGnY<GPULayerID_BG3>();
-
-template void GPUSubsystem::RenderLine<NDSColorFormat_BGR555_Rev>(const size_t l);
-template void GPUSubsystem::RenderLine<NDSColorFormat_BGR666_Rev>(const size_t l);
-template void GPUSubsystem::RenderLine<NDSColorFormat_BGR888_Rev>(const size_t l);
+/*
+	Copyright (C) 2006 yopyop
+	Copyright (C) 2006-2007 Theo Berkau
+	Copyright (C) 2007 shash
+	Copyright (C) 2008-2017 DeSmuME team
+
+	This file is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with the this software.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "GPU.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+#include <iostream>
+
+#include "common.h"
+#include "MMU.h"
+#include "FIFO.h"
+#include "debug.h"
+#include "render3D.h"
+#include "registers.h"
+#include "gfx3d.h"
+#include "debug.h"
+#include "NDSSystem.h"
+#include "readwrite.h"
+#include "matrix.h"
+#include "emufile.h"
+
+#ifdef FASTBUILD
+	#undef FORCEINLINE
+	#define FORCEINLINE
+	//compilation speed hack (cuts time exactly in half by cutting out permutations)
+	#define DISABLE_MOSAIC
+	#define DISABLE_COMPOSITOR_FAST_PATHS
+#endif
+
+// Out-of-class definitions (storage) for GPUEngineBase's static lookup tables.
+u16 GPUEngineBase::_brightnessUpTable555[17][0x8000]; // 17 x 0x8000 u16 entries; presumably [brightness step][15-bit color] -- confirm where the table is filled
+FragmentColor GPUEngineBase::_brightnessUpTable666[17][0x8000]; // same dimensions, FragmentColor entries
+FragmentColor GPUEngineBase::_brightnessUpTable888[17][0x8000]; // same dimensions, FragmentColor entries
+u16 GPUEngineBase::_brightnessDownTable555[17][0x8000]; // "down" counterpart of _brightnessUpTable555
+FragmentColor GPUEngineBase::_brightnessDownTable666[17][0x8000]; // "down" counterpart of _brightnessUpTable666
+FragmentColor GPUEngineBase::_brightnessDownTable888[17][0x8000]; // "down" counterpart of _brightnessUpTable888
+u8 GPUEngineBase::_blendTable555[17][17][32][32]; // 17 x 17 x 32 x 32 u8 entries; presumably [coeff A][coeff B][component A][component B] -- confirm
+GPUEngineBase::MosaicLookup GPUEngineBase::_mosaicLookup; // single MosaicLookup instance shared by all engines (static member)
+
+GPUSubsystem *GPU = NULL; // global subsystem pointer; NULL until something outside this section assigns it
+
+static size_t _gpuLargestDstLineCount = 1; // starts at 1; by its name, the largest per-source-line destination line count -- confirm where it is updated
+static size_t _gpuVRAMBlockOffset = GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH; // initial value: VRAM block line count times the native width
+
+static u16 *_gpuDstToSrcIndex = NULL; // Key: Destination pixel index / Value: Source pixel index
+static u8 *_gpuDstToSrcSSSE3_u8_8e = NULL; // The four SSSE3 tables start out unallocated (NULL);
+static u8 *_gpuDstToSrcSSSE3_u8_16e = NULL; // presumably per-destination-pixel shuffle indices for
+static u8 *_gpuDstToSrcSSSE3_u16_8e = NULL; // u8/u16/u32 elements, as the suffixes suggest --
+static u8 *_gpuDstToSrcSSSE3_u32_4e = NULL; // confirm against the code that allocates and fills them.
+
+static CACHE_ALIGN size_t _gpuDstPitchCount[GPU_FRAMEBUFFER_NATIVE_WIDTH];	// Key: Source pixel index in x-dimension / Value: Number of x-dimension destination pixels for the source pixel
+static CACHE_ALIGN size_t _gpuDstPitchIndex[GPU_FRAMEBUFFER_NATIVE_WIDTH];	// Key: Source pixel index in x-dimension / Value: First destination pixel that maps to the source pixel
+static CACHE_ALIGN size_t _gpuDstLineCount[GPU_FRAMEBUFFER_NATIVE_HEIGHT];	// Key: Source line index / Value: Number of destination lines for the source line
+static CACHE_ALIGN size_t _gpuDstLineIndex[GPU_FRAMEBUFFER_NATIVE_HEIGHT];	// Key: Source line index / Value: First destination line that maps to the source line
+static CACHE_ALIGN size_t _gpuCaptureLineCount[GPU_VRAM_BLOCK_LINES + 1];	// Key: Source line index / Value: Number of destination lines for the source line
+static CACHE_ALIGN size_t _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES + 1];	// Key: Source line index / Value: First destination line that maps to the source line
+
+const CACHE_ALIGN SpriteSize GPUEngineBase::_sprSizeTab[4][4] = { // sprite dimension pairs; presumably indexed [size][shape] per the OBJ attribute encoding -- confirm against the table's users
+     {{8, 8}, {16, 8}, {8, 16}, {8, 8}},
+     {{16, 16}, {32, 8}, {8, 32}, {8, 8}},
+     {{32, 32}, {32, 16}, {16, 32}, {8, 8}},
+     {{64, 64}, {64, 32}, {32, 64}, {8, 8}}, // the last column is {8, 8} in every row
+};
+
+const CACHE_ALIGN BGType GPUEngineBase::_mode2type[8][4] = { // 8 rows x 4 columns of BGType; presumably [BG mode 0-7][layer BG0-BG3] -- confirm against the table's users
+      {BGType_Text, BGType_Text, BGType_Text, BGType_Text},
+      {BGType_Text, BGType_Text, BGType_Text, BGType_Affine},
+      {BGType_Text, BGType_Text, BGType_Affine, BGType_Affine},
+      {BGType_Text, BGType_Text, BGType_Text, BGType_AffineExt},
+      {BGType_Text, BGType_Text, BGType_Affine, BGType_AffineExt},
+      {BGType_Text, BGType_Text, BGType_AffineExt, BGType_AffineExt},
+      {BGType_Invalid, BGType_Invalid, BGType_Large8bpp, BGType_Invalid}, // only the third column is valid in this row
+      {BGType_Invalid, BGType_Invalid, BGType_Invalid, BGType_Invalid} // every column is invalid in the last row
+};
+
+//Don't be tempted to change these sizes to bit counts on the theory that it would avoid the multiplies in the main tile blitter.
+//It doesn't really help.
+const CACHE_ALIGN BGLayerSize GPUEngineBase::_BGLayerSizeLUT[8][4] = {
+	{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //Invalid
+    {{256,256}, {512,256}, {256,512}, {512,512}}, //text
+    {{128,128}, {256,256}, {512,512}, {1024,1024}}, //affine
+    {{512,1024}, {1024,512}, {0,0}, {0,0}}, //large 8bpp
+	{{0, 0}, {0, 0}, {0, 0}, {0, 0}}, //affine ext (to be elaborated with another value)
+	{{128,128}, {256,256}, {512,512}, {1024,1024}}, //affine ext 256x16
+	{{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext 256x1
+	{{128,128}, {256,256}, {512,256}, {512,512}}, //affine ext direct
+};
+
+template <s32 INTEGERSCALEHINT, bool NEEDENDIANSWAP, size_t ELEMENTSIZE> // Copies one line of ELEMENTSIZE-byte elements from src to dst; INTEGERSCALEHINT picks the strategy at compile time.
+static FORCEINLINE void CopyLineExpand_C(void *__restrict dst, const void *__restrict src, size_t dstLength) // dstLength is counted in elements and is only read when INTEGERSCALEHINT == 0
+{
+	if (INTEGERSCALEHINT == 0) // hint 0: straight copy of dstLength elements, no expansion
+	{
+#if defined(MSB_FIRST)
+		if (NEEDENDIANSWAP && (ELEMENTSIZE != 1)) // one-byte elements are never swapped
+		{
+			for (size_t i = 0; i < dstLength; i++)
+			{
+				if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] );
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] );
+				}
+			}
+		}
+		else
+#endif
+		{
+			memcpy(dst, src, dstLength * ELEMENTSIZE); // this is the only path compiled when MSB_FIRST is not defined
+		}
+	}
+	else if (INTEGERSCALEHINT == 1) // hint 1: copy exactly GPU_FRAMEBUFFER_NATIVE_WIDTH elements; dstLength is ignored
+	{
+#if defined(MSB_FIRST)
+		if (NEEDENDIANSWAP && (ELEMENTSIZE != 1))
+		{
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++)
+			{
+				if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[i] = LE_TO_LOCAL_16( ((u16 *)src)[i] );
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[i] = LE_TO_LOCAL_32( ((u32 *)src)[i] );
+				}
+			}
+		}
+		else
+#endif
+		{
+			memcpy(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE);
+		}
+	}
+	else // any other hint: expand each source element through the pitch tables; dstLength is ignored
+	{
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+		{
+			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) // src[x] is written to _gpuDstPitchCount[x] consecutive slots starting at _gpuDstPitchIndex[x]
+			{
+				if (ELEMENTSIZE == 1)
+				{
+					( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x];
+				}
+				else if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16( ((u16 *)src)[x] ) : ((u16 *)src)[x]; // unlike the branches above, this swap is not guarded by MSB_FIRST; LE_TO_LOCAL_16 is presumably a no-op on little-endian builds -- confirm
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[_gpuDstPitchIndex[x] + p] = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32( ((u32 *)src)[x] ) : ((u32 *)src)[x]; // same note as the 16-bit case
+				}
+			}
+		}
+	}
+}
+
+#ifdef ENABLE_SSE2
+template <s32 INTEGERSCALEHINT, size_t ELEMENTSIZE>
+static FORCEINLINE void CopyLineExpand_SSE2(void *__restrict dst, const void *__restrict src, size_t dstLength)
+{
+	if (INTEGERSCALEHINT == 0)
+	{
+		memcpy(dst, src, dstLength * ELEMENTSIZE);
+	}
+	else if (INTEGERSCALEHINT == 1)
+	{
+		MACRODO_N( GPU_FRAMEBUFFER_NATIVE_WIDTH / (sizeof(__m128i) / ELEMENTSIZE), _mm_store_si128((__m128i *)dst + (X), _mm_load_si128((__m128i *)src + (X))) );
+	}
+	else if (INTEGERSCALEHINT == 2)
+	{
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
+		{
+			if (ELEMENTSIZE == 1)
+			{
+				const __m128i src8  = _mm_load_si128((__m128i *)( (u8 *)src + srcX));
+				const __m128i src8out[2]  = { _mm_unpacklo_epi8(src8, src8), _mm_unpackhi_epi8(src8, src8) };
+				
+				_mm_store_si128((__m128i *)( (u8 *)dst + dstX +  0), src8out[0]);
+				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 16), src8out[1]);
+				
+				srcX += 16;
+				dstX += 32;
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
+				const __m128i src16out[2] = { _mm_unpacklo_epi16(src16, src16), _mm_unpackhi_epi16(src16, src16) };
+				
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  0), src16out[0]);
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  8), src16out[1]);
+				
+				srcX += 8;
+				dstX += 16;
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
+				const __m128i src32out[2] = { _mm_unpacklo_epi32(src32, src32), _mm_unpackhi_epi32(src32, src32) };
+				
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  0), src32out[0]);
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  4), src32out[1]);
+				
+				srcX += 4;
+				dstX += 8;
+			}
+		}
+	}
+	else if ((INTEGERSCALEHINT == 3) && (ELEMENTSIZE != 1))
+	{
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
+		{
+			if (ELEMENTSIZE == 2)
+			{
+				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
+				const __m128i src16lo = _mm_shuffle_epi32(src16, 0x44);
+				const __m128i src16hi = _mm_shuffle_epi32(src16, 0xEE);
+				const __m128i src16out[3] = { _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16lo, 0x40), 0xA5), _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16, 0xFE), 0x40), _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16hi, 0xA5), 0xFE) };
+				
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  0), src16out[0]);
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  8), src16out[1]);
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX + 16), src16out[2]);
+				
+				srcX += 8;
+				dstX += 24;
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
+				const __m128i src32out[3] = { _mm_shuffle_epi32(src32, 0x40), _mm_shuffle_epi32(src32, 0xA5), _mm_shuffle_epi32(src32, 0xFE) };
+				
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  0), src32out[0]);
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  4), src32out[1]);
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  8), src32out[2]);
+				
+				srcX += 4;
+				dstX += 12;
+			}
+		}
+	}
+	else if (INTEGERSCALEHINT == 4)
+	{
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH;)
+		{
+			if (ELEMENTSIZE == 1)
+			{
+				const __m128i src8  = _mm_load_si128((__m128i *)( (u8 *)src + srcX));
+				const __m128i src8_lo  = _mm_unpacklo_epi8(src8, src8);
+				const __m128i src8_hi  = _mm_unpackhi_epi8(src8, src8);
+				const __m128i src8out[4] = { _mm_unpacklo_epi8(src8_lo, src8_lo), _mm_unpackhi_epi8(src8_lo, src8_lo), _mm_unpacklo_epi8(src8_hi, src8_hi), _mm_unpackhi_epi8(src8_hi, src8_hi) };
+				
+				_mm_store_si128((__m128i *)( (u8 *)dst + dstX +  0), src8out[0]);
+				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 16), src8out[1]);
+				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 32), src8out[2]);
+				_mm_store_si128((__m128i *)( (u8 *)dst + dstX + 48), src8out[3]);
+				
+				srcX += 16;
+				dstX += 64;
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
+				const __m128i src16_lo = _mm_unpacklo_epi16(src16, src16);
+				const __m128i src16_hi = _mm_unpackhi_epi16(src16, src16);
+				const __m128i src16out[4] = { _mm_unpacklo_epi16(src16_lo, src16_lo), _mm_unpackhi_epi16(src16_lo, src16_lo), _mm_unpacklo_epi16(src16_hi, src16_hi), _mm_unpackhi_epi16(src16_hi, src16_hi) };
+				
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  0), src16out[0]);
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX +  8), src16out[1]);
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX + 16), src16out[2]);
+				_mm_store_si128((__m128i *)((u16 *)dst + dstX + 24), src16out[3]);
+				
+				srcX += 8;
+				dstX += 32;
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
+				const __m128i src32_lo = _mm_unpacklo_epi32(src32, src32);
+				const __m128i src32_hi = _mm_unpackhi_epi32(src32, src32);
+				const __m128i src32out[4] = { _mm_unpacklo_epi32(src32_lo, src32_lo), _mm_unpackhi_epi32(src32_lo, src32_lo), _mm_unpacklo_epi32(src32_hi, src32_hi), _mm_unpackhi_epi32(src32_hi, src32_hi) };
+				
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  0), src32out[0]);
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  4), src32out[1]);
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX +  8), src32out[2]);
+				_mm_store_si128((__m128i *)((u32 *)dst + dstX + 12), src32out[3]);
+				
+				srcX += 4;
+				dstX += 16;
+			}
+		}
+	}
+#ifdef ENABLE_SSSE3
+	else if (INTEGERSCALEHINT >= 0)
+	{
+		const size_t scale = dstLength / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		
+		for (size_t srcX = 0, dstX = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; )
+		{
+			if (ELEMENTSIZE == 1)
+			{
+				const __m128i src8 = _mm_load_si128((__m128i *)((u8 *)src + srcX));
+				
+				for (size_t s = 0; s < scale; s++)
+				{
+					const __m128i ssse3idx_u8 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u8_16e + (s * 16)));
+					_mm_store_si128( (__m128i *)( (u8 *)dst + dstX + (s * 16)), _mm_shuffle_epi8( src8, ssse3idx_u8 ) );
+				}
+				
+				srcX += 16;
+				dstX += (16 * scale);
+			}
+			else if (ELEMENTSIZE == 2)
+			{
+				const __m128i src16 = _mm_load_si128((__m128i *)((u16 *)src + srcX));
+				
+				for (size_t s = 0; s < scale; s++)
+				{
+					const __m128i ssse3idx_u16 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u16_8e + (s * 16)));
+					_mm_store_si128( (__m128i *)((u16 *)dst + dstX + (s *  8)), _mm_shuffle_epi8(src16, ssse3idx_u16) );
+				}
+				
+				srcX += 8;
+				dstX += (8 * scale);
+			}
+			else if (ELEMENTSIZE == 4)
+			{
+				const __m128i src32 = _mm_load_si128((__m128i *)((u32 *)src + srcX));
+				
+				for (size_t s = 0; s < scale; s++)
+				{
+					const __m128i ssse3idx_u32 = _mm_load_si128((__m128i *)(_gpuDstToSrcSSSE3_u32_4e + (s * 16)));
+					_mm_store_si128( (__m128i *)((u32 *)dst + dstX + (s *  4)), _mm_shuffle_epi8(src32, ssse3idx_u32) );
+				}
+				
+				srcX += 4;
+				dstX += (4 * scale);
+			}
+		}
+	}
+#endif
+	else
+	{
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+		{
+			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
+			{
+				if (ELEMENTSIZE == 1)
+				{
+					( (u8 *)dst)[_gpuDstPitchIndex[x] + p] = ((u8 *)src)[x];
+				}
+				else if (ELEMENTSIZE == 2)
+				{
+					((u16 *)dst)[_gpuDstPitchIndex[x] + p] = ((u16 *)src)[x];
+				}
+				else if (ELEMENTSIZE == 4)
+				{
+					((u32 *)dst)[_gpuDstPitchIndex[x] + p] = ((u32 *)src)[x];
+				}
+			}
+		}
+	}
+}
+#endif
+
+template <s32 INTEGERSCALEHINT, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
+static FORCEINLINE void CopyLineExpand(void *__restrict dst, const void *__restrict src, size_t dstLength)
+{
+	// Expands one line from src into dst by forwarding to the SSE2 implementation when
+	// ENABLE_SSE2 is defined, and to the plain C implementation otherwise.
+	//
+	// INTEGERSCALEHINT is a compile-time hint that lets CopyLineExpand() pick the fastest
+	// execution path. It describes how the framebuffer width is scaled.
+	//
+	// Use cases:
+	// - Passing a value of 0 causes CopyLineExpand() to perform a simple copy, using dstLength
+	//   to copy dstLength elements.
+	// - Passing a value of 1 causes CopyLineExpand() to perform a simple copy, ignoring dstLength
+	//   and always copying GPU_FRAMEBUFFER_NATIVE_WIDTH elements.
+	// - Passing any negative value causes CopyLineExpand() to assume that the framebuffer width
+	//   is NOT scaled by an integer value, and will therefore take the safest (but slowest)
+	//   execution path.
+	// - Passing any positive value greater than 1 causes CopyLineExpand() to expand the line
+	//   using the integer scaling value.
+	//
+	// ELEMENTSIZE is the size of one element in bytes.
+	// NOTE: NEEDENDIANSWAP is only forwarded to the C implementation; the SSE2
+	// implementation is instantiated without it.
+	
+#ifdef ENABLE_SSE2
+	CopyLineExpand_SSE2<INTEGERSCALEHINT, ELEMENTSIZE>(dst, src, dstLength);
+#else
+	CopyLineExpand_C<INTEGERSCALEHINT, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, dstLength);
+#endif
+}
+
+template <bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
+static FORCEINLINE void CopyLineReduce(void *__restrict dst, const void *__restrict src)
+{
+	// Writes GPU_FRAMEBUFFER_NATIVE_WIDTH elements to dst. Output element x is read from
+	// src at index _gpuDstPitchIndex[x]. ELEMENTSIZE is the element width in bytes
+	// (1, 2 or 4); any other value writes nothing. When NEEDENDIANSWAP is set, 16-bit and
+	// 32-bit elements are passed through LE_TO_LOCAL_16 / LE_TO_LOCAL_32.
+	for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+	{
+		const size_t srcIndex = _gpuDstPitchIndex[x];
+		
+		if (ELEMENTSIZE == 1)
+		{
+			((u8 *)dst)[x] = ((u8 *)src)[srcIndex];
+		}
+		else if (ELEMENTSIZE == 2)
+		{
+			if (NEEDENDIANSWAP)
+			{
+				((u16 *)dst)[x] = LE_TO_LOCAL_16( ((u16 *)src)[srcIndex] );
+			}
+			else
+			{
+				((u16 *)dst)[x] = ((u16 *)src)[srcIndex];
+			}
+		}
+		else if (ELEMENTSIZE == 4)
+		{
+			if (NEEDENDIANSWAP)
+			{
+				((u32 *)dst)[x] = LE_TO_LOCAL_32( ((u32 *)src)[srcIndex] );
+			}
+			else
+			{
+				((u32 *)dst)[x] = ((u32 *)src)[srcIndex];
+			}
+		}
+	}
+}
+
+/*****************************************************************************/
+//			BACKGROUND RENDERING -ROTOSCALE-
+/*****************************************************************************/
+
+FORCEINLINE void rot_tiled_8bit_entry(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
+{
+	// The map holds one byte per 8x8 tile, with (lg>>3) tiles per row.
+	const u32 mapAddress = map + ((auxX>>3) + (auxY>>3) * (lg>>3));
+	const u16 tileNumber = *(u8*)MMU_gpu_map(mapAddress);
+	
+	// Position of the requested pixel inside its 8x8 tile.
+	const u16 pixelX = auxX & 0x0007;
+	const u16 pixelY = auxY & 0x0007;
+	
+	// Each tile occupies 64 bytes (8 rows of 8 one-byte palette indices).
+	const u32 pixelAddress = tile + ((tileNumber<<6)+(pixelY<<3)+pixelX);
+	
+	outIndex = *(u8*)MMU_gpu_map(pixelAddress);
+	outColor = LE_TO_LOCAL_16(pal[outIndex]);
+}
+
+template<bool EXTPAL>
+FORCEINLINE void rot_tiled_16bit_entry(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
+{
+	// The map holds one 16-bit entry per 8x8 tile, with (lg>>3) tiles per row.
+	const u32 entryAddress = map + (((auxX>>3) + (auxY>>3) * (lg>>3))<<1);
+	
+	TILEENTRY entry;
+	entry.val = LE_TO_LOCAL_16( *(u16 *)MMU_gpu_map(entryAddress) );
+	
+	// Apply the entry's flip bits, then keep only the position inside the 8x8 tile.
+	const s32 flippedX = (entry.bits.HFlip) ? (7 - auxX) : auxX;
+	const s32 flippedY = (entry.bits.VFlip) ? (7 - auxY) : auxY;
+	const u16 pixelX = flippedX & 0x0007;
+	const u16 pixelY = flippedY & 0x0007;
+	
+	outIndex = *(u8*)MMU_gpu_map(tile + ((entry.bits.TileNum<<6)+(pixelY<<3)+pixelX));
+	
+	// With EXTPAL, the entry's palette number selects a block of 256 colors in pal.
+	const int paletteOffset = EXTPAL ? (entry.bits.Palette<<8) : 0;
+	outColor = LE_TO_LOCAL_16(pal[(outIndex + paletteOffset)]);
+}
+
+FORCEINLINE void rot_256_map(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
+{
+	// One byte per pixel, lg pixels per row. The tile parameter is not used here.
+	const u32 pixelAddress = map + ((auxX + auxY * lg));
+	
+	outIndex = *(u8*)MMU_gpu_map(pixelAddress);
+	outColor = LE_TO_LOCAL_16(pal[outIndex]);
+}
+
+FORCEINLINE void rot_BMP_map(const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *__restrict pal, u8 &outIndex, u16 &outColor)
+{
+	// Two bytes per pixel, lg pixels per row. The tile and pal parameters are not used here.
+	const u32 pixelAddress = map + ((auxX + auxY * lg) << 1);
+	
+	outColor = LE_TO_LOCAL_16( *(u16 *)MMU_gpu_map(pixelAddress) );
+	
+	// outIndex reports whether bit 15 of the color is set (1) or clear (0).
+	if ((outColor & 0x8000) == 0)
+	{
+		outIndex = 0;
+	}
+	else
+	{
+		outIndex = 1;
+	}
+}
+
+void gpu_savestate(EMUFILE* os)
+{
+	// Stream layout written here, in order:
+	//   1. u32 version number (1)
+	//   2. framebufferBytes bytes taken from dispInfo.masterCustomBuffer
+	//   3. the saved BG2X/BG2Y/BG3X/BG3Y values of the main engine, then of the sub engine
+	const size_t framebufferBytes = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16) * 2;
+	const u32 stateVersion = 1;
+	
+	const NDSDisplayInfo &displayInfo = GPU->GetDisplayInfo();
+	const GPUEngineA *engineMain = GPU->GetEngineMain();
+	const GPUEngineB *engineSub = GPU->GetEngineSub();
+	
+	write32le(stateVersion, os);
+	
+	os->fwrite((u8 *)displayInfo.masterCustomBuffer, framebufferBytes);
+	
+	// Main engine affine start registers.
+	write32le(engineMain->savedBG2X.value, os);
+	write32le(engineMain->savedBG2Y.value, os);
+	write32le(engineMain->savedBG3X.value, os);
+	write32le(engineMain->savedBG3Y.value, os);
+	
+	// Sub engine affine start registers.
+	write32le(engineSub->savedBG2X.value, os);
+	write32le(engineSub->savedBG2Y.value, os);
+	write32le(engineSub->savedBG3X.value, os);
+	write32le(engineSub->savedBG3Y.value, os);
+}
+
+bool gpu_loadstate(EMUFILE* is, int size)
+{
+	// Reads back the data written by gpu_savestate(). Returns false when the version
+	// cannot be read or is newer than 1; otherwise returns whether the stream is still
+	// in a non-failed state after all reads.
+	const size_t framebufferBytes = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16) * 2;
+	
+	const NDSDisplayInfo &displayInfo = GPU->GetDisplayInfo();
+	GPUEngineA *engineMain = GPU->GetEngineMain();
+	GPUEngineB *engineSub = GPU->GetEngineSub();
+	
+	u32 version;
+	
+	// The chunk size is used to tell the formats apart, since a new version number
+	// was not introduced when the format changed.
+	if (size == framebufferBytes)
+	{
+		// Framebuffer only, with no version field in the stream.
+		version = 0;
+	}
+	else if (size == 0x30024)
+	{
+		// The version field is consumed, but the chunk is treated as version 1
+		// regardless of the value that was read.
+		read32le(&version,is);
+		version = 1;
+	}
+	else if (read32le(&version,is) != 1)
+	{
+		return false;
+	}
+	
+	if (version > 1)
+	{
+		return false;
+	}
+	
+	is->fread((u8 *)displayInfo.masterCustomBuffer, framebufferBytes);
+	
+	if (version == 1)
+	{
+		// Main engine affine start registers.
+		read32le((u32 *)&engineMain->savedBG2X, is);
+		read32le((u32 *)&engineMain->savedBG2Y, is);
+		read32le((u32 *)&engineMain->savedBG3X, is);
+		read32le((u32 *)&engineMain->savedBG3Y, is);
+		
+		// Sub engine affine start registers.
+		read32le((u32 *)&engineSub->savedBG2X, is);
+		read32le((u32 *)&engineSub->savedBG2Y, is);
+		read32le((u32 *)&engineSub->savedBG3X, is);
+		read32le((u32 *)&engineSub->savedBG3Y, is);
+		
+		//removed per nitsuja feedback. anyway, this same thing will happen almost immediately in gpu line=0
+		//engineMain->refreshAffineStartRegs(-1,-1);
+		//engineSub->refreshAffineStartRegs(-1,-1);
+	}
+	
+	engineMain->ParseAllRegisters();
+	engineSub->ParseAllRegisters();
+	
+	return !is->fail();
+}
+
+/*****************************************************************************/
+//			INITIALIZATION
+/*****************************************************************************/
+void GPUEngineBase::_InitLUTs()
+{
+	// Fills the static brightness and blending lookup tables. The tables are shared by
+	// all engines, so the work is only done on the first call.
+	static bool didInit = false;
+	
+	if (didInit)
+	{
+		return;
+	}
+	
+	/*
+	NOTE: gbatek (in the reference above) seems to expect 6-bit values
+	per component, but as desmume works with 5 bits per component,
+	we use 31 as the top value instead of 63. Testing on a few games,
+	using 63 seems to give severe color wrapping, while 31 works
+	nicely, so for now we'll just use that until proven wrong.
+
+	I have seen pics of pokemon ranger getting white with 31; with 63 it is nice.
+	It could be a problem with alpha or blending or...
+
+	MightyMax> created a test NDS to check how the brightness values work,
+	and 31 seems to be correct. FactorEx is an override for max brighten/darken.
+	See: http://mightymax.org/gfx_test_brightness.nds
+	The Pokemon problem could be a problem with 8/32 bit writes not recognized yet,
+	I'll add that so you can check back.
+	*/
+	
+	// Brightness tables: one row per intensity (0..16), one entry per 15-bit color.
+	for (u16 intensity = 0; intensity <= 16; intensity++)
+	{
+		for (u16 color = 0x0000; color < 0x8000; color++)
+		{
+			// Brighten: move each component toward 31 by intensity/16 of the distance.
+			COLOR brightened;
+			brightened.val = color;
+			brightened.bits.red   += ((31 - brightened.bits.red)   * intensity / 16);
+			brightened.bits.green += ((31 - brightened.bits.green) * intensity / 16);
+			brightened.bits.blue  += ((31 - brightened.bits.blue)  * intensity / 16);
+			brightened.bits.alpha = 0;
+			
+			GPUEngineBase::_brightnessUpTable555[intensity][color] = brightened.val;
+			GPUEngineBase::_brightnessUpTable666[intensity][color].color = COLOR555TO666(brightened.val);
+			GPUEngineBase::_brightnessUpTable888[intensity][color].color = COLOR555TO888(brightened.val);
+			
+			// Darken: move each component toward 0 by intensity/16 of its value.
+			COLOR darkened;
+			darkened.val = color;
+			darkened.bits.red   -= (darkened.bits.red   * intensity / 16);
+			darkened.bits.green -= (darkened.bits.green * intensity / 16);
+			darkened.bits.blue  -= (darkened.bits.blue  * intensity / 16);
+			darkened.bits.alpha = 0;
+			
+			GPUEngineBase::_brightnessDownTable555[intensity][color] = darkened.val;
+			GPUEngineBase::_brightnessDownTable666[intensity][color].color = COLOR555TO666(darkened.val);
+			GPUEngineBase::_brightnessDownTable888[intensity][color].color = COLOR555TO888(darkened.val);
+		}
+	}
+	
+	// Blend table: (c0*eva + c1*evb) / 16, clamped to 31, for every pair of 5-bit
+	// components and every pair of coefficients in 0..16.
+	for (int eva = 0; eva <= 16; eva++)
+	{
+		for (int evb = 0; evb <= 16; evb++)
+		{
+			for (int c0 = 0; c0 <= 31; c0++)
+			{
+				for (int c1 = 0; c1 <= 31; c1++)
+				{
+					const int blended = ((c0 * eva) + (c1 * evb)) / 16;
+					GPUEngineBase::_blendTable555[eva][evb][c0][c1] = std::min<int>(31, blended);
+				}
+			}
+		}
+	}
+	
+	didInit = true;
+}
+
+GPUEngineBase::GPUEngineBase()
+{
+	// Layers that own an entry in the per-layer custom window/effect pointer tables.
+	static const size_t customBufferLayers[] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3, GPULayerID_OBJ };
+	static const size_t customBufferLayerCount = sizeof(customBufferLayers) / sizeof(customBufferLayers[0]);
+	
+	this->_IORegisterMap = NULL;
+	this->_paletteOBJ = NULL;
+	
+	// Each BG layer starts out knowing its own ID, using the extended palette slot
+	// of the same number, and with no extended palette attached.
+	this->_BGLayer[GPULayerID_BG0].layerID = GPULayerID_BG0;
+	this->_BGLayer[GPULayerID_BG0].extPaletteSlot = GPULayerID_BG0;
+	this->_BGLayer[GPULayerID_BG0].extPalette = NULL;
+	
+	this->_BGLayer[GPULayerID_BG1].layerID = GPULayerID_BG1;
+	this->_BGLayer[GPULayerID_BG1].extPaletteSlot = GPULayerID_BG1;
+	this->_BGLayer[GPULayerID_BG1].extPalette = NULL;
+	
+	this->_BGLayer[GPULayerID_BG2].layerID = GPULayerID_BG2;
+	this->_BGLayer[GPULayerID_BG2].extPaletteSlot = GPULayerID_BG2;
+	this->_BGLayer[GPULayerID_BG2].extPalette = NULL;
+	
+	this->_BGLayer[GPULayerID_BG3].layerID = GPULayerID_BG3;
+	this->_BGLayer[GPULayerID_BG3].extPaletteSlot = GPULayerID_BG3;
+	this->_BGLayer[GPULayerID_BG3].extPalette = NULL;
+	
+	// Build the shared lookup tables (only does work on the first call).
+	_InitLUTs();
+	
+	// No custom-size buffers are attached at construction time.
+	this->_internalRenderLineTargetCustom = NULL;
+	this->_renderLineLayerIDCustom = NULL;
+	this->_deferredIndexCustom = NULL;
+	this->_deferredColorCustom = NULL;
+	
+	this->_didPassWindowTestCustomMasterPtr = NULL;
+	this->_enableColorEffectCustomMasterPtr = NULL;
+	
+	for (size_t i = 0; i < customBufferLayerCount; i++)
+	{
+		const size_t layer = customBufferLayers[i];
+		this->_didPassWindowTestCustom[layer] = NULL;
+		this->_enableColorEffectCustom[layer] = NULL;
+	}
+}
+
+GPUEngineBase::~GPUEngineBase()
+{
+	// Layers that own an entry in the per-layer custom window/effect pointer tables.
+	static const size_t customBufferLayers[] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3, GPULayerID_OBJ };
+	static const size_t customBufferLayerCount = sizeof(customBufferLayers) / sizeof(customBufferLayers[0]);
+	
+	// Release the custom-size buffers, clearing each pointer once it has been freed.
+	free_aligned(this->_internalRenderLineTargetCustom);
+	this->_internalRenderLineTargetCustom = NULL;
+	
+	free_aligned(this->_renderLineLayerIDCustom);
+	this->_renderLineLayerIDCustom = NULL;
+	
+	free_aligned(this->_deferredIndexCustom);
+	this->_deferredIndexCustom = NULL;
+	
+	free_aligned(this->_deferredColorCustom);
+	this->_deferredColorCustom = NULL;
+	
+	free_aligned(this->_didPassWindowTestCustomMasterPtr);
+	this->_didPassWindowTestCustomMasterPtr = NULL;
+	
+	// NOTE(review): _enableColorEffectCustomMasterPtr is only cleared, never freed.
+	// Presumably it points into the _didPassWindowTestCustomMasterPtr allocation
+	// released above -- confirm against the code that allocates these buffers.
+	this->_enableColorEffectCustomMasterPtr = NULL;
+	
+	for (size_t i = 0; i < customBufferLayerCount; i++)
+	{
+		const size_t layer = customBufferLayers[i];
+		this->_didPassWindowTestCustom[layer] = NULL;
+		this->_enableColorEffectCustom[layer] = NULL;
+	}
+}
+
+void GPUEngineBase::_Reset_Base()
+{
+	// Restores the engine to its power-on rendering state: clears the sprite and window
+	// scratch buffers, resets every BG layer descriptor, rebuilds a default render state,
+	// and copies that render state into the compositor info of every native line.
+	//
+	// Custom-size buffers are NULL until they have been allocated, so each one is only
+	// cleared when it is actually present.
+	
+	// Index lists used to reset the per-layer and per-window tables below.
+	static const size_t bgLayers[] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3 };
+	static const size_t bgLayerCount = sizeof(bgLayers) / sizeof(bgLayers[0]);
+	
+	static const size_t drawableLayers[] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3, GPULayerID_OBJ };
+	static const size_t drawableLayerCount = sizeof(drawableLayers) / sizeof(drawableLayers[0]);
+	
+	static const size_t blendLayers[] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3, GPULayerID_OBJ, GPULayerID_Backdrop };
+	static const size_t blendLayerCount = sizeof(blendLayers) / sizeof(blendLayers[0]);
+	
+	static const size_t windowFlags[] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3, GPULayerID_OBJ, WINDOWCONTROL_EFFECTFLAG };
+	static const size_t windowFlagCount = sizeof(windowFlags) / sizeof(windowFlags[0]);
+	
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	
+	// Sprite line buffers.
+	memset(this->_sprColor, 0, sizeof(this->_sprColor));
+	memset(this->_sprAlpha, 0, sizeof(this->_sprAlpha));
+	memset(this->_sprType, OBJMode_Normal, sizeof(this->_sprType));
+	memset(this->_sprPrio, 0x7F, sizeof(this->_sprPrio));
+	memset(this->_sprNum, 0, sizeof(this->_sprNum));
+	
+	// Window test / color effect flags default to 1 for every pixel.
+	memset(this->_didPassWindowTestNative, 1, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
+	memset(this->_enableColorEffectNative, 1, 5 * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
+	
+	// The constructor leaves this pointer NULL, and passing NULL to memset() is
+	// undefined behavior, so guard it the same way as the other custom buffers.
+	if (this->_didPassWindowTestCustomMasterPtr != NULL)
+	{
+		memset(this->_didPassWindowTestCustomMasterPtr, 1, 10 * dispInfo.customWidth * sizeof(u8));
+	}
+	
+	memset(this->_h_win[0], 0, sizeof(this->_h_win[0]));
+	memset(this->_h_win[1], 0, sizeof(this->_h_win[1]));
+	memset(&this->_mosaicColors, 0, sizeof(MosaicColor));
+	memset(this->_itemsForPriority, 0, sizeof(this->_itemsForPriority));
+	
+	memset(this->_internalRenderLineTargetNative, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(FragmentColor));
+	
+	if (this->_internalRenderLineTargetCustom != NULL)
+	{
+		memset(this->_internalRenderLineTargetCustom, 0, dispInfo.customWidth * _gpuLargestDstLineCount * dispInfo.pixelBytes);
+	}
+	if (this->_renderLineLayerIDCustom != NULL)
+	{
+		memset(this->_renderLineLayerIDCustom, 0, dispInfo.customWidth * _gpuLargestDstLineCount * 4 * sizeof(u8));
+	}
+	
+	// All layers start out disabled.
+	for (size_t i = 0; i < drawableLayerCount; i++)
+	{
+		this->_enableLayer[drawableLayers[i]] = false;
+	}
+	this->_isAnyBGLayerEnabled = false;
+	
+	// Each BG layer mirrors its own control register.
+	this->_BGLayer[GPULayerID_BG0].BGnCNT = this->_IORegisterMap->BG0CNT;
+	this->_BGLayer[GPULayerID_BG1].BGnCNT = this->_IORegisterMap->BG1CNT;
+	this->_BGLayer[GPULayerID_BG2].BGnCNT = this->_IORegisterMap->BG2CNT;
+	this->_BGLayer[GPULayerID_BG3].BGnCNT = this->_IORegisterMap->BG3CNT;
+	
+	// Remaining BG layer fields take the same default on every layer.
+	for (size_t i = 0; i < bgLayerCount; i++)
+	{
+		const size_t layer = bgLayers[i];
+		
+		this->_BGLayer[layer].size = GPUEngineBase::_BGLayerSizeLUT[BGType_Affine][1];
+		this->_BGLayer[layer].baseType = BGType_Invalid;
+		this->_BGLayer[layer].type = BGType_Invalid;
+		this->_BGLayer[layer].priority = 0;
+		this->_BGLayer[layer].isVisible = false;
+		this->_BGLayer[layer].isMosaic = false;
+		this->_BGLayer[layer].isDisplayWrapped = false;
+		this->_BGLayer[layer].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][layer];
+	}
+	
+	// Only the BG0 and BG1 extended palette slots are reset here.
+	this->_BGLayer[GPULayerID_BG0].extPaletteSlot = GPULayerID_BG0;
+	this->_BGLayer[GPULayerID_BG1].extPaletteSlot = GPULayerID_BG1;
+	
+	this->_needUpdateWINH[0] = true;
+	this->_needUpdateWINH[1] = true;
+	
+	this->vramBlockOBJAddress = 0;
+	
+	// Every line is considered native-size until something renders it otherwise.
+	this->nativeLineRenderCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	this->nativeLineOutputCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	for (size_t l = 0; l < GPU_FRAMEBUFFER_NATIVE_HEIGHT; l++)
+	{
+		this->isLineRenderNative[l] = true;
+		this->isLineOutputNative[l] = true;
+	}
+	
+	GPUEngineRenderState &renderState = this->_currentRenderState;
+	
+	renderState.displayOutputMode = GPUDisplayMode_Off;
+	renderState.selectedLayerID = GPULayerID_BG0;
+	renderState.selectedBGLayer = &this->_BGLayer[GPULayerID_BG0];
+	renderState.backdropColor16 = LE_TO_LOCAL_16(this->_paletteBG[0]) & 0x7FFF;
+	renderState.colorEffect = (ColorEffect)this->_IORegisterMap->BLDCNT.ColorEffect;
+	renderState.blendEVA = 0;
+	renderState.blendEVB = 0;
+	renderState.blendEVY = 0;
+	renderState.masterBrightnessMode = GPUMasterBrightMode_Disable;
+	renderState.masterBrightnessIntensity = 0;
+	renderState.masterBrightnessIsFullIntensity = false;
+	renderState.masterBrightnessIsMaxOrMin = true;
+	
+	// The table pointers are selected by the coefficients that were just zeroed above.
+	renderState.blendTable555 = (TBlendTable *)&GPUEngineBase::_blendTable555[renderState.blendEVA][renderState.blendEVB][0][0];
+	renderState.brightnessUpTable555 = &GPUEngineBase::_brightnessUpTable555[renderState.blendEVY][0];
+	renderState.brightnessUpTable666 = &GPUEngineBase::_brightnessUpTable666[renderState.blendEVY][0];
+	renderState.brightnessUpTable888 = &GPUEngineBase::_brightnessUpTable888[renderState.blendEVY][0];
+	renderState.brightnessDownTable555 = &GPUEngineBase::_brightnessDownTable555[renderState.blendEVY][0];
+	renderState.brightnessDownTable666 = &GPUEngineBase::_brightnessDownTable666[renderState.blendEVY][0];
+	renderState.brightnessDownTable888 = &GPUEngineBase::_brightnessDownTable888[renderState.blendEVY][0];
+	
+	// Blending is disabled for every source and destination layer.
+	for (size_t i = 0; i < blendLayerCount; i++)
+	{
+		const size_t layer = blendLayers[i];
+		
+		renderState.srcBlendEnable[layer] = false;
+		renderState.dstBlendEnable[layer] = false;
+#ifdef ENABLE_SSE2
+		renderState.srcBlendEnable_SSE2[layer] = _mm_setzero_si128();
+#ifndef ENABLE_SSSE3
+		renderState.dstBlendEnable_SSE2[layer] = _mm_setzero_si128();
+#endif
+#endif
+	}
+	renderState.dstAnyBlendEnable = false;
+	
+#if defined(ENABLE_SSE2) && defined(ENABLE_SSSE3)
+	renderState.dstBlendEnable_SSSE3 = _mm_setzero_si128();
+#endif
+	
+	// Every window control flag is cleared.
+	for (size_t i = 0; i < windowFlagCount; i++)
+	{
+		const size_t flag = windowFlags[i];
+		
+		renderState.WIN0_enable[flag] = 0;
+		renderState.WIN1_enable[flag] = 0;
+		renderState.WINOUT_enable[flag] = 0;
+		renderState.WINOBJ_enable[flag] = 0;
+#if defined(ENABLE_SSE2)
+		renderState.WIN0_enable_SSE2[flag] = _mm_setzero_si128();
+		renderState.WIN1_enable_SSE2[flag] = _mm_setzero_si128();
+		renderState.WINOUT_enable_SSE2[flag] = _mm_setzero_si128();
+		renderState.WINOBJ_enable_SSE2[flag] = _mm_setzero_si128();
+#endif
+	}
+	
+	renderState.WIN0_ENABLED = false;
+	renderState.WIN1_ENABLED = false;
+	renderState.WINOBJ_ENABLED = false;
+	renderState.isAnyWindowEnabled = false;
+	
+	renderState.mosaicWidthBG = this->_mosaicLookup.table[0];
+	renderState.mosaicHeightBG = this->_mosaicLookup.table[0];
+	renderState.mosaicWidthOBJ = this->_mosaicLookup.table[0];
+	renderState.mosaicHeightOBJ = this->_mosaicLookup.table[0];
+	renderState.isBGMosaicSet = false;
+	renderState.isOBJMosaicSet = false;
+	
+	renderState.spriteRenderMode = SpriteRenderMode_Sprite1D;
+	renderState.spriteBoundary = 0;
+	renderState.spriteBMPBoundary = 0;
+	
+	this->savedBG2X.value = 0;
+	this->savedBG2Y.value = 0;
+	this->savedBG3X.value = 0;
+	this->savedBG3Y.value = 0;
+	
+	// Output goes to the native buffer at native dimensions.
+	this->renderedWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	this->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	this->renderedBuffer = this->nativeBuffer;
+	
+	// Every line starts from the same freshly reset render state.
+	for (size_t line = 0; line < GPU_FRAMEBUFFER_NATIVE_HEIGHT; line++)
+	{
+		this->_currentCompositorInfo[line].renderState = renderState;
+	}
+}
+
+// Resets this engine by running the shared base reset routine.
+void GPUEngineBase::Reset()
+{
+	this->_Reset_Base();
+}
+
+// Recomputes which layers are enabled and rebuilds the per-priority lists of BG layers
+// stored in _itemsForPriority.
+void GPUEngineBase::_ResortBGLayers()
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	int i, prio;
+	itemsForPriority_t *item;
+	
+	// we don't need to check for windows here...
+	// With OP defined as `^ !`, the expression `setting OP(flag)` expands to
+	// `setting ^ !(flag)`: a true setting yields the flag's own state, while a false
+	// setting yields the inverted state (assuming the settings hold bool values).
+	// NOTE: OP is not #undef'd after use, so it stays defined for the rest of the file.
+#define OP ^ !
+	// if we untick boxes, layers become invisible
+	//#define OP &&
+	this->_enableLayer[GPULayerID_BG0] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG0] OP(this->_BGLayer[GPULayerID_BG0].isVisible);
+	this->_enableLayer[GPULayerID_BG1] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG1] OP(this->_BGLayer[GPULayerID_BG1].isVisible);
+	this->_enableLayer[GPULayerID_BG2] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG2] OP(this->_BGLayer[GPULayerID_BG2].isVisible);
+	this->_enableLayer[GPULayerID_BG3] = CommonSettings.dispLayers[this->_engineID][GPULayerID_BG3] OP(this->_BGLayer[GPULayerID_BG3].isVisible);
+	this->_enableLayer[GPULayerID_OBJ] = CommonSettings.dispLayers[this->_engineID][GPULayerID_OBJ] OP(DISPCNT.OBJ_Enable);
+	
+	this->_isAnyBGLayerEnabled = this->_enableLayer[GPULayerID_BG0] || this->_enableLayer[GPULayerID_BG1] || this->_enableLayer[GPULayerID_BG2] || this->_enableLayer[GPULayerID_BG3];
+	
+	// KISS ! lower priority first, if same then lower num
+	// Start by emptying every priority bucket.
+	for (i = 0; i < NB_PRIORITIES; i++)
+	{
+		item = &(this->_itemsForPriority[i]);
+		item->nbBGs = 0;
+		item->nbPixelsX = 0;
+	}
+	
+	// Walk the BG layers from the highest index down to 0, appending each enabled
+	// layer to the bucket for its priority. Within a bucket, higher-numbered layers
+	// therefore come first.
+	for (i = NB_BG; i > 0; )
+	{
+		i--;
+		if (!this->_enableLayer[i]) continue;
+		prio = this->_BGLayer[i].priority;
+		item = &(this->_itemsForPriority[prio]);
+		item->BGs[item->nbBGs]=i;
+		item->nbBGs++;
+	}
+	
+	// Disabled debug dump of the buckets. It uses a variable `j` that is not declared
+	// in this function, so it would need fixing before being re-enabled.
+#if 0
+	//debug
+	for (i = 0; i < NB_PRIORITIES; i++)
+	{
+		item = &(this->_itemsForPriority[i]);
+		printf("%d : ", i);
+		for (j=0; j<NB_PRIORITIES; j++)
+		{
+			if (j < item->nbBGs)
+				printf("BG%d ", item->BGs[j]);
+			else
+				printf("... ", item->BGs[j]);
+		}
+	}
+	printf("\n");
+#endif
+}
+
+FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const u16 blendEVA, const u16 blendEVB)
+{
+	// Blends two 15-bit colors component by component:
+	//   out = min(31, (A * blendEVA + B * blendEVB) / 16)
+	// Bit 15 of the result is always clear.
+	const u16 redA   =  colA        & 0x001F;
+	const u16 greenA = (colA >>  5) & 0x001F;
+	const u16 blueA  = (colA >> 10) & 0x001F;
+	
+	const u16 redB   =  colB        & 0x001F;
+	const u16 greenB = (colB >>  5) & 0x001F;
+	const u16 blueB  = (colB >> 10) & 0x001F;
+	
+	u16 red   = ( (redA   * blendEVA) + (redB   * blendEVB) ) / 16;
+	u16 green = ( (greenA * blendEVA) + (greenB * blendEVB) ) / 16;
+	u16 blue  = ( (blueA  * blendEVA) + (blueB  * blendEVB) ) / 16;
+	
+	// Clamp each component to the 5-bit maximum.
+	if (red > 31)
+	{
+		red = 31;
+	}
+	if (green > 31)
+	{
+		green = 31;
+	}
+	if (blue > 31)
+	{
+		blue = 31;
+	}
+	
+	return red | (green << 5) | (blue << 10);
+}
+
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend(const FragmentColor colA, const FragmentColor colB, const u16 blendEVA, const u16 blendEVB)
+{
+	// Blends two FragmentColor values component by component:
+	//   out = min(limit, (A * blendEVA + B * blendEVB) / 16)
+	// where limit is 63 for BGR666 and 255 for BGR888. The r, g and b components are
+	// only written for those two formats. The alpha component is always set to 0.
+	const u16 blendedR = ( (colA.r * blendEVA) + (colB.r * blendEVB) ) / 16;
+	const u16 blendedG = ( (colA.g * blendEVA) + (colB.g * blendEVB) ) / 16;
+	const u16 blendedB = ( (colA.b * blendEVA) + (colB.b * blendEVB) ) / 16;
+	
+	FragmentColor result;
+	
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		result.r = std::min<u16>(blendedR, 63);
+		result.g = std::min<u16>(blendedG, 63);
+		result.b = std::min<u16>(blendedB, 63);
+	}
+	else if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		result.r = std::min<u16>(blendedR, 255);
+		result.g = std::min<u16>(blendedG, 255);
+		result.b = std::min<u16>(blendedB, 255);
+	}
+	
+	result.a = 0;
+	return result;
+}
+
+FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend(const u16 colA, const u16 colB, const TBlendTable *blendTable)
+{
+	// Blends two 15-bit colors by looking up each pair of 5-bit components in
+	// blendTable, indexed as [component of colA][component of colB].
+	const TBlendTable &table = *blendTable;
+	
+	const u8 red   = table[ colA        & 0x1F][ colB        & 0x1F];
+	const u8 green = table[(colA >>  5) & 0x1F][(colB >>  5) & 0x1F];
+	const u8 blue  = table[(colA >> 10) & 0x1F][(colB >> 10) & 0x1F];
+	
+	return red | (green << 5) | (blue << 10);
+}
+
+FORCEINLINE u16 GPUEngineBase::_ColorEffectBlend3D(const FragmentColor colA, const u16 colB)
+{
+	// Blends colA over the 15-bit color colB using colA's alpha. colA is weighted by
+	// (alpha + 1) and colB by the remainder out of 32. colB's 5-bit components are
+	// doubled before weighting, and the sum is shifted right by 6 to produce the
+	// 5-bit output components. The alpha bit of the result is 0.
+	const u16 weightA = colA.a + 1;
+	const int weightB = 32 - weightA;
+	
+	COLOR under;
+	under.val = colB;
+	
+	COLOR blended;
+	blended.bits.red   = ((colA.r * weightA) + ((under.bits.red   << 1) * weightB)) >> 6;
+	blended.bits.green = ((colA.g * weightA) + ((under.bits.green << 1) * weightB)) >> 6;
+	blended.bits.blue  = ((colA.b * weightA) + ((under.bits.blue  << 1) * weightB)) >> 6;
+	blended.bits.alpha = 0;
+	
+	return blended.val;
+}
+
+template <NDSColorFormat COLORFORMATB>
+FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectBlend3D(const FragmentColor colA, const FragmentColor colB)
+{
+	// Blends colA over colB using colA's alpha. colA is weighted by (alpha + 1) and
+	// colB by the remainder out of 32 (BGR666) or 256 (BGR888). The r, g and b
+	// components are only written for those two formats. The alpha component of the
+	// result is always set to 0.
+	const u16 weightA = colA.a + 1;
+	FragmentColor result;
+	
+	if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
+	{
+		const int weightB = 32 - weightA;
+		
+		result.r = ((colA.r * weightA) + (colB.r * weightB)) >> 5;
+		result.g = ((colA.g * weightA) + (colB.g * weightB)) >> 5;
+		result.b = ((colA.b * weightA) + (colB.b * weightB)) >> 5;
+	}
+	else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
+	{
+		const int weightB = 256 - weightA;
+		
+		result.r = ((colA.r * weightA) + (colB.r * weightB)) >> 8;
+		result.g = ((colA.g * weightA) + (colB.g * weightB)) >> 8;
+		result.b = ((colA.b * weightA) + (colB.b * weightB)) >> 8;
+	}
+	
+	result.a = 0;
+	return result;
+}
+
+FORCEINLINE u16 GPUEngineBase::_ColorEffectIncreaseBrightness(const u16 col, const u16 blendEVY)
+{
+	// Moves each 5-bit component of a 15-bit color toward 31 by blendEVY/16 of the
+	// remaining distance.
+	u16 red   =  col        & 0x001F;
+	u16 green = (col >>  5) & 0x001F;
+	u16 blue  = (col >> 10) & 0x001F;
+	
+	red   += ((31 - red)   * blendEVY / 16);
+	green += ((31 - green) * blendEVY / 16);
+	blue  += ((31 - blue)  * blendEVY / 16);
+	
+	return red | (green << 5) | (blue << 10);
+}
+
+// Brightens a 32-bit color. Every component c becomes c + ((max - c) * blendEVY / 16),
+// where max is 63 for BGR666 and 255 for BGR888. The returned alpha is always 0, and for
+// any other COLORFORMAT the whole returned color is 0.
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectIncreaseBrightness(const FragmentColor col, const u16 blendEVY)
+{
+	FragmentColor brightened;
+	brightened.color = 0;
+	
+	if ( (COLORFORMAT == NDSColorFormat_BGR666_Rev) || (COLORFORMAT == NDSColorFormat_BGR888_Rev) )
+	{
+		const u32 componentMax = (COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255;
+		const u32 srcR = col.r;
+		const u32 srcG = col.g;
+		const u32 srcB = col.b;
+		
+		brightened.r = srcR + (((componentMax - srcR) * blendEVY) / 16);
+		brightened.g = srcG + (((componentMax - srcG) * blendEVY) / 16);
+		brightened.b = srcB + (((componentMax - srcB) * blendEVY) / 16);
+	}
+	
+	return brightened;
+}
+
+// Darkens a BGR555 color. Every 5-bit component c becomes c - (c * blendEVY / 16),
+// using integer division, and the components are repacked in the same bit positions.
+FORCEINLINE u16 GPUEngineBase::_ColorEffectDecreaseBrightness(const u16 col, const u16 blendEVY)
+{
+	const u16 srcR =  col        & 0x001F;
+	const u16 srcG = (col >>  5) & 0x001F;
+	const u16 srcB = (col >> 10) & 0x001F;
+	
+	const u16 outR = srcR - ((srcR * blendEVY) / 16);
+	const u16 outG = srcG - ((srcG * blendEVY) / 16);
+	const u16 outB = srcB - ((srcB * blendEVY) / 16);
+	
+	return outR | (outG << 5) | (outB << 10);
+}
+
+// Darkens a 32-bit color. Every component c becomes c - (c * blendEVY / 16), using
+// integer division. The returned alpha is always 0.
+FORCEINLINE FragmentColor GPUEngineBase::_ColorEffectDecreaseBrightness(const FragmentColor col, const u16 blendEVY)
+{
+	const u32 srcR = col.r;
+	const u32 srcG = col.g;
+	const u32 srcB = col.b;
+	
+	FragmentColor darkened;
+	darkened.color = 0;
+	darkened.r = srcR - ((srcR * blendEVY) / 16);
+	darkened.g = srcG - ((srcG * blendEVY) / 16);
+	darkened.b = srcB - ((srcB * blendEVY) / 16);
+	
+	return darkened;
+}
+
+#ifdef ENABLE_SSE2
+
+// Vectorized brightness increase. Every color component c becomes
+// c + (((max - c) * blendEVY) >> 4), evaluated in 16-bit lanes.
+//
+// For BGR555, col is treated as 16-bit pixels and max is 31. For the other formats, col
+// is treated as 8-bit components of 32-bit pixels, max is 63 (BGR666) or 255 (otherwise),
+// and the top byte of every 32-bit pixel is cleared in the result.
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE __m128i GPUEngineBase::_ColorEffectIncreaseBrightness(const __m128i &col, const __m128i &blendEVY)
+{
+	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const __m128i componentMask = _mm_set1_epi16(0x001F);
+		const __m128i componentMax = _mm_set1_epi16(31);
+		
+		const __m128i srcR = _mm_and_si128(col, componentMask);
+		const __m128i srcG = _mm_and_si128(_mm_srli_epi16(col, 5), componentMask);
+		const __m128i srcB = _mm_and_si128(_mm_srli_epi16(col, 10), componentMask);
+		
+		const __m128i deltaR = _mm_srli_epi16( _mm_mullo_epi16(_mm_sub_epi16(componentMax, srcR), blendEVY), 4 );
+		const __m128i deltaG = _mm_srli_epi16( _mm_mullo_epi16(_mm_sub_epi16(componentMax, srcG), blendEVY), 4 );
+		const __m128i deltaB = _mm_srli_epi16( _mm_mullo_epi16(_mm_sub_epi16(componentMax, srcB), blendEVY), 4 );
+		
+		const __m128i outR = _mm_add_epi16(srcR, deltaR);
+		const __m128i outG = _mm_add_epi16(srcG, deltaG);
+		const __m128i outB = _mm_add_epi16(srcB, deltaB);
+		
+		return _mm_or_si128( outR, _mm_or_si128(_mm_slli_epi16(outG, 5), _mm_slli_epi16(outB, 10)) );
+	}
+	
+	// 32-bit pixels: widen each byte to a 16-bit lane, adjust, then pack back down.
+	const __m128i componentMax = _mm_set1_epi16((COLORFORMAT == NDSColorFormat_BGR666_Rev) ? 63 : 255);
+	const __m128i zero = _mm_setzero_si128();
+	
+	const __m128i srcLo = _mm_unpacklo_epi8(col, zero);
+	const __m128i srcHi = _mm_unpackhi_epi8(col, zero);
+	
+	const __m128i outLo = _mm_add_epi16( srcLo, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(componentMax, srcLo), blendEVY), 4) );
+	const __m128i outHi = _mm_add_epi16( srcHi, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(componentMax, srcHi), blendEVY), 4) );
+	
+	return _mm_and_si128( _mm_packus_epi16(outLo, outHi), _mm_set1_epi32(0x00FFFFFF) );
+}
+
+// Vectorized brightness decrease. Every color component c becomes
+// c - ((c * blendEVY) >> 4), evaluated in 16-bit lanes.
+//
+// For BGR555, col is treated as 16-bit pixels. For the other formats, col is treated as
+// 8-bit components of 32-bit pixels, and the top byte of every 32-bit pixel is cleared
+// in the result.
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE __m128i GPUEngineBase::_ColorEffectDecreaseBrightness(const __m128i &col, const __m128i &blendEVY)
+{
+	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		const __m128i componentMask = _mm_set1_epi16(0x001F);
+		
+		const __m128i srcR = _mm_and_si128(col, componentMask);
+		const __m128i srcG = _mm_and_si128(_mm_srli_epi16(col, 5), componentMask);
+		const __m128i srcB = _mm_and_si128(_mm_srli_epi16(col, 10), componentMask);
+		
+		const __m128i outR = _mm_sub_epi16( srcR, _mm_srli_epi16(_mm_mullo_epi16(srcR, blendEVY), 4) );
+		const __m128i outG = _mm_sub_epi16( srcG, _mm_srli_epi16(_mm_mullo_epi16(srcG, blendEVY), 4) );
+		const __m128i outB = _mm_sub_epi16( srcB, _mm_srli_epi16(_mm_mullo_epi16(srcB, blendEVY), 4) );
+		
+		return _mm_or_si128( outR, _mm_or_si128(_mm_slli_epi16(outG, 5), _mm_slli_epi16(outB, 10)) );
+	}
+	
+	// 32-bit pixels: widen each byte to a 16-bit lane, adjust, then pack back down.
+	const __m128i zero = _mm_setzero_si128();
+	
+	const __m128i srcLo = _mm_unpacklo_epi8(col, zero);
+	const __m128i srcHi = _mm_unpackhi_epi8(col, zero);
+	
+	const __m128i outLo = _mm_sub_epi16( srcLo, _mm_srli_epi16(_mm_mullo_epi16(srcLo, blendEVY), 4) );
+	const __m128i outHi = _mm_sub_epi16( srcHi, _mm_srli_epi16(_mm_mullo_epi16(srcHi, blendEVY), 4) );
+	
+	return _mm_and_si128( _mm_packus_epi16(outLo, outHi), _mm_set1_epi32(0x00FFFFFF) );
+}
+// Vectorized alpha blend: each color component is ((A * blendEVA) + (B * blendEVB)) >> 4, clamped to 31 (BGR555), 63 (BGR666) or 255 (otherwise). BGR555 input is handled as 16-bit pixels; other formats as 8-bit components of 32-bit pixels, with the top byte of each 32-bit pixel cleared in the result.
+template <NDSColorFormat COLORFORMAT>
+FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend(const __m128i &colA, const __m128i &colB, const __m128i &blendEVA, const __m128i &blendEVB)
+{
+#ifdef ENABLE_SSSE3
+	__m128i blendAB = _mm_or_si128(blendEVA, _mm_slli_epi16(blendEVB, 8)); // EVA in the low byte, EVB in the high byte of each 16-bit lane (presumably both fit in a byte -- confirm against callers).
+#endif
+	
+	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		__m128i ra;
+		__m128i ga;
+		__m128i ba;
+		__m128i colorBitMask = _mm_set1_epi16(0x001F);
+		// SSSE3 path: A's component goes in the low byte and B's component in the high byte of each lane, matching the layout of blendAB.
+#ifdef ENABLE_SSSE3
+		ra = _mm_or_si128( _mm_and_si128(               colA,      colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 8), _mm_set1_epi16(0x1F00)) );
+		ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA,  5), colorBitMask), _mm_and_si128(_mm_slli_epi16(colB, 3), _mm_set1_epi16(0x1F00)) );
+		ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(colB, 2), _mm_set1_epi16(0x1F00)) );
+		
+		ra = _mm_maddubs_epi16(ra, blendAB); // One multiply-add per lane: (A * EVA) + (B * EVB).
+		ga = _mm_maddubs_epi16(ga, blendAB);
+		ba = _mm_maddubs_epi16(ba, blendAB);
+#else
+		ra = _mm_and_si128(               colA,      colorBitMask);
+		ga = _mm_and_si128(_mm_srli_epi16(colA,  5), colorBitMask);
+		ba = _mm_and_si128(_mm_srli_epi16(colA, 10), colorBitMask);
+		
+		__m128i rb = _mm_and_si128(               colB,      colorBitMask);
+		__m128i gb = _mm_and_si128(_mm_srli_epi16(colB,  5), colorBitMask);
+		__m128i bb = _mm_and_si128(_mm_srli_epi16(colB, 10), colorBitMask);
+		// SSE2-only path: two 16-bit multiplies and an add per component.
+		ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) );
+		ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
+		ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
+#endif
+		
+		ra = _mm_srli_epi16(ra, 4);
+		ga = _mm_srli_epi16(ga, 4);
+		ba = _mm_srli_epi16(ba, 4);
+		// Clamp every component to the 5-bit maximum (31) before repacking.
+		ra = _mm_min_epi16(ra, colorBitMask);
+		ga = _mm_min_epi16(ga, colorBitMask);
+		ba = _mm_min_epi16(ba, colorBitMask);
+		
+		return _mm_or_si128(ra, _mm_or_si128( _mm_slli_epi16(ga, 5), _mm_slli_epi16(ba, 10)) );
+	}
+	else
+	{
+		__m128i outColorLo;
+		__m128i outColorHi;
+		__m128i outColor;
+		
+#ifdef ENABLE_SSSE3
+		outColorLo = _mm_unpacklo_epi8(colA, colB); // Interleave A and B bytes so each 16-bit lane holds an (A, B) component pair.
+		outColorHi = _mm_unpackhi_epi8(colA, colB);
+		
+		outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
+		outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
+#else
+		__m128i colALo = _mm_unpacklo_epi8(colA, _mm_setzero_si128());
+		__m128i colAHi = _mm_unpackhi_epi8(colA, _mm_setzero_si128());
+		__m128i colBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128());
+		__m128i colBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128());
+		
+		outColorLo = _mm_add_epi16( _mm_mullo_epi16(colALo, blendEVA), _mm_mullo_epi16(colBLo, blendEVB) );
+		outColorHi = _mm_add_epi16( _mm_mullo_epi16(colAHi, blendEVA), _mm_mullo_epi16(colBHi, blendEVB) );
+#endif
+		
+		outColorLo = _mm_srli_epi16(outColorLo, 4);
+		outColorHi = _mm_srli_epi16(outColorHi, 4);
+		outColor = _mm_packus_epi16(outColorLo, outColorHi);
+		
+		// When the color format is 888, the packuswb instruction will naturally clamp
+		// the color component values to 255. However, when the color format is 666, the
+		// color component values must be clamped to 63. In this case, we must call pminub
+		// to do the clamp.
+		if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
+		}
+		// Clear the top byte of every 32-bit pixel in the returned vector.
+		outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
+		
+		return outColor;
+	}
+}
+// Vectorized blend of 32-bit source pixels (A, alpha in the top byte) over destination pixels (B). A is weighted by (alpha + 1) and B by the remainder of 32 (BGR555/BGR666) or 256 (BGR888). Returns 16-bit pixels when B is BGR555, otherwise 32-bit pixels with the top byte cleared.
+template <NDSColorFormat COLORFORMATB>
+FORCEINLINE __m128i GPUEngineBase::_ColorEffectBlend3D(const __m128i &colA_Lo, const __m128i &colA_Hi, const __m128i &colB)
+{
+	if (COLORFORMATB == NDSColorFormat_BGR555_Rev)
+	{
+		// If the color format of B is 555, then the colA_Hi parameter is required.
+		// The color format of A is assumed to be RGB666.
+		__m128i ra_lo = _mm_and_si128(                colA_Lo,      _mm_set1_epi32(0x000000FF) );
+		__m128i ga_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo,  8), _mm_set1_epi32(0x000000FF) );
+		__m128i ba_lo = _mm_and_si128( _mm_srli_epi32(colA_Lo, 16), _mm_set1_epi32(0x000000FF) );
+		__m128i aa_lo =                _mm_srli_epi32(colA_Lo, 24);
+		
+		__m128i ra_hi = _mm_and_si128(                colA_Hi,      _mm_set1_epi32(0x000000FF) );
+		__m128i ga_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi,  8), _mm_set1_epi32(0x000000FF) );
+		__m128i ba_hi = _mm_and_si128( _mm_srli_epi32(colA_Hi, 16), _mm_set1_epi32(0x000000FF) );
+		__m128i aa_hi =                _mm_srli_epi32(colA_Hi, 24);
+		// Narrow the per-channel 32-bit lanes of both halves of A into one vector of eight 16-bit lanes per channel.
+		__m128i ra = _mm_packs_epi32(ra_lo, ra_hi);
+		__m128i ga = _mm_packs_epi32(ga_lo, ga_hi);
+		__m128i ba = _mm_packs_epi32(ba_lo, ba_hi);
+		__m128i aa = _mm_packs_epi32(aa_lo, aa_hi);
+		
+#ifdef ENABLE_SSSE3
+		ra = _mm_or_si128( ra, _mm_and_si128(_mm_slli_epi16(colB, 9), _mm_set1_epi16(0x3E00)) ); // High byte = B's 5-bit component doubled; low byte = A's component.
+		ga = _mm_or_si128( ga, _mm_and_si128(_mm_slli_epi16(colB, 4), _mm_set1_epi16(0x3E00)) );
+		ba = _mm_or_si128( ba, _mm_and_si128(_mm_srli_epi16(colB, 1), _mm_set1_epi16(0x3E00)) );
+		
+		aa = _mm_adds_epu8(aa, _mm_set1_epi16(1));
+		aa = _mm_or_si128( aa, _mm_slli_epi16(_mm_subs_epu16(_mm_set1_epi8(32), aa), 8) ); // Low byte = alpha + 1, high byte = 32 - (alpha + 1); assumes alpha + 1 <= 32 -- confirm against callers.
+		
+		ra = _mm_maddubs_epi16(ra, aa);
+		ga = _mm_maddubs_epi16(ga, aa);
+		ba = _mm_maddubs_epi16(ba, aa);
+#else
+		aa = _mm_adds_epu16(aa, _mm_set1_epi16(1));
+		__m128i rb = _mm_and_si128( _mm_slli_epi16(colB, 1), _mm_set1_epi16(0x003E) );
+		__m128i gb = _mm_and_si128( _mm_srli_epi16(colB, 4), _mm_set1_epi16(0x003E) );
+		__m128i bb = _mm_and_si128( _mm_srli_epi16(colB, 9), _mm_set1_epi16(0x003E) );
+		__m128i ab = _mm_subs_epu16( _mm_set1_epi16(32), aa );
+		// SSE2-only path: weight A by (alpha + 1) and the doubled B component by 32 - (alpha + 1).
+		ra = _mm_add_epi16( _mm_mullo_epi16(ra, aa), _mm_mullo_epi16(rb, ab) );
+		ga = _mm_add_epi16( _mm_mullo_epi16(ga, aa), _mm_mullo_epi16(gb, ab) );
+		ba = _mm_add_epi16( _mm_mullo_epi16(ba, aa), _mm_mullo_epi16(bb, ab) );
+#endif
+		// Shift right by 6, then repack the three channels as 16-bit pixels.
+		ra = _mm_srli_epi16(ra, 6);
+		ga = _mm_srli_epi16(ga, 6);
+		ba = _mm_srli_epi16(ba, 6);
+		
+		return _mm_or_si128( _mm_or_si128(ra, _mm_slli_epi16(ga, 5)), _mm_slli_epi16(ba, 10) );
+	}
+	else
+	{
+		// If the color format of B is 666 or 888, then the colA_Hi parameter is ignored.
+		// The color format of A is assumed to match the color format of B.
+		__m128i rgbALo;
+		__m128i rgbAHi;
+		
+#ifdef ENABLE_SSSE3
+		if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
+		{
+			// Does not work for RGBA8888 color format. The reason is because this
+			// algorithm depends on the pmaddubsw instruction, which multiplies
+			// unsigned 8-bit integers by signed 8-bit integers into an intermediate
+			// signed 16-bit integer. This means that we can overrun the signed 16-bit
+			// value range, which is limited to [-32768, 32767]. For example, a
+			// color component of value 255 multiplied by an alpha value of 255
+			// would equal 65025, which is greater than the upper range of a signed
+			// 16-bit value.
+			rgbALo = _mm_unpacklo_epi8(colA_Lo, colB);
+			rgbAHi = _mm_unpackhi_epi8(colA_Lo, colB);
+			// Replicate the 5-bit alpha into the three color byte positions of each pixel, then add 1.
+			__m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x0000001F) );
+			alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) );
+			alpha = _mm_adds_epu8(alpha, _mm_set1_epi8(1));
+			// Interleave the A and B weights so they line up with the interleaved (A, B) component bytes.
+			__m128i invAlpha = _mm_subs_epu8(_mm_set1_epi8(32), alpha);
+			__m128i alphaLo = _mm_unpacklo_epi8(alpha, invAlpha);
+			__m128i alphaHi = _mm_unpackhi_epi8(alpha, invAlpha);
+			
+			rgbALo = _mm_maddubs_epi16(rgbALo, alphaLo);
+			rgbAHi = _mm_maddubs_epi16(rgbAHi, alphaHi);
+		}
+		else
+#endif
+		{
+			rgbALo = _mm_unpacklo_epi8(colA_Lo, _mm_setzero_si128());
+			rgbAHi = _mm_unpackhi_epi8(colA_Lo, _mm_setzero_si128());
+			__m128i rgbBLo = _mm_unpacklo_epi8(colB, _mm_setzero_si128());
+			__m128i rgbBHi = _mm_unpackhi_epi8(colB, _mm_setzero_si128());
+			// Replicate the 8-bit alpha into the three color byte positions of each pixel.
+			__m128i alpha = _mm_and_si128( _mm_srli_epi32(colA_Lo, 24), _mm_set1_epi32(0x000000FF) );
+			alpha = _mm_or_si128( alpha, _mm_or_si128(_mm_slli_epi32(alpha, 8), _mm_slli_epi32(alpha, 16)) );
+			// Widen the alpha bytes to 16-bit lanes and add 1 to get the weight for A.
+			__m128i alphaLo = _mm_unpacklo_epi8(alpha, _mm_setzero_si128());
+			__m128i alphaHi = _mm_unpackhi_epi8(alpha, _mm_setzero_si128());
+			alphaLo = _mm_add_epi16(alphaLo, _mm_set1_epi16(1));
+			alphaHi = _mm_add_epi16(alphaHi, _mm_set1_epi16(1));
+			
+			if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
+			{
+				rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(32), alphaLo)) );
+				rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(32), alphaHi)) );
+			}
+			else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
+			{
+				rgbALo = _mm_add_epi16( _mm_mullo_epi16(rgbALo, alphaLo), _mm_mullo_epi16(rgbBLo, _mm_sub_epi16(_mm_set1_epi16(256), alphaLo)) );
+				rgbAHi = _mm_add_epi16( _mm_mullo_epi16(rgbAHi, alphaHi), _mm_mullo_epi16(rgbBHi, _mm_sub_epi16(_mm_set1_epi16(256), alphaHi)) );
+			}
+		}
+		// Divide by the weight total: 32 (shift by 5) for 666, 256 (shift by 8) for 888.
+		if (COLORFORMATB == NDSColorFormat_BGR666_Rev)
+		{
+			rgbALo = _mm_srli_epi16(rgbALo, 5);
+			rgbAHi = _mm_srli_epi16(rgbAHi, 5);
+		}
+		else if (COLORFORMATB == NDSColorFormat_BGR888_Rev)
+		{
+			rgbALo = _mm_srli_epi16(rgbALo, 8);
+			rgbAHi = _mm_srli_epi16(rgbAHi, 8);
+		}
+		// Pack back to bytes and clear the top byte of every 32-bit pixel.
+		return _mm_and_si128( _mm_packus_epi16(rgbALo, rgbAHi), _mm_set1_epi32(0x00FFFFFF) );
+	}
+}
+
+#endif
+
+// Decodes the MASTER_BRIGHT register into the current render state: the intensity
+// (capped at 16), the brightness mode, and two convenience flags derived from them.
+void GPUEngineBase::ParseReg_MASTER_BRIGHT()
+{
+	const IOREG_MASTER_BRIGHT &MASTER_BRIGHT = this->_IORegisterMap->MASTER_BRIGHT;
+	GPUEngineRenderState &renderState = this->_currentRenderState;
+	
+	const bool isIntensityAtMax = (MASTER_BRIGHT.Intensity >= 16);
+	const bool isIntensityAtZero = (MASTER_BRIGHT.Intensity == 0);
+	const bool isModeUpOrDown = ( (MASTER_BRIGHT.Mode == GPUMasterBrightMode_Up) || (MASTER_BRIGHT.Mode == GPUMasterBrightMode_Down) );
+	
+	renderState.masterBrightnessMode = (GPUMasterBrightMode)MASTER_BRIGHT.Mode;
+	renderState.masterBrightnessIntensity = (isIntensityAtMax) ? 16 : MASTER_BRIGHT.Intensity;
+	renderState.masterBrightnessIsFullIntensity = (isIntensityAtMax && isModeUpOrDown);
+	renderState.masterBrightnessIsMaxOrMin = (isIntensityAtMax || isIntensityAtZero);
+}
+
+// Decodes DISPCNT into the current render state (display output mode, window enable
+// flags, sprite mapping mode and boundaries) for quick reading, then re-parses the
+// BGnCNT register of every BG layer.
+void GPUEngineBase::ParseReg_DISPCNT()
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	GPUEngineRenderState &renderState = this->_currentRenderState;
+	const bool isMainEngine = (this->_engineID == GPUEngineID_Main);
+	
+	// For any engine other than the main one, only bit 0 of the display mode is kept.
+	if (isMainEngine)
+	{
+		renderState.displayOutputMode = (GPUDisplayMode)DISPCNT.DisplayMode;
+	}
+	else
+	{
+		renderState.displayOutputMode = (GPUDisplayMode)(DISPCNT.DisplayMode & 0x01);
+	}
+	
+	const bool isWin0Enabled = (DISPCNT.Win0_Enable != 0);
+	const bool isWin1Enabled = (DISPCNT.Win1_Enable != 0);
+	const bool isWinOBJEnabled = (DISPCNT.WinOBJ_Enable != 0);
+	
+	renderState.WIN0_ENABLED = isWin0Enabled;
+	renderState.WIN1_ENABLED = isWin1Enabled;
+	renderState.WINOBJ_ENABLED = isWinOBJEnabled;
+	renderState.isAnyWindowEnabled = (isWin0Enabled || isWin1Enabled || isWinOBJEnabled);
+	
+	// 1D sprite mapping boundaries are 32k, 64k, 128k and 256k, selected by OBJ_Tile_1D_Bound.
+	// 2D sprite mapping always uses the 32k boundary.
+	//
+	// Do not be deceived: even though a spriteBoundary of 8 (256KB region) is impossible to
+	// fully address on the sub engine, it is still fully legal to address it with that
+	// granularity. So do NOT clamp the boundary to 7 for the sub engine.
+	const bool is1DSpriteMapping = (DISPCNT.OBJ_Tile_mapping != 0);
+	renderState.spriteBoundary = (is1DSpriteMapping) ? (5 + DISPCNT.OBJ_Tile_1D_Bound) : 5;
+	renderState.spriteRenderMode = (is1DSpriteMapping) ? SpriteRenderMode_Sprite1D : SpriteRenderMode_Sprite2D;
+	
+	// The larger bitmap sprite boundary is only selectable on the main engine.
+	renderState.spriteBMPBoundary = (DISPCNT.OBJ_BMP_1D_Bound && isMainEngine) ? 8 : 7;
+	
+	// Re-parse the BG control registers, going from BG3 down to BG0.
+	static const GPULayerID layerParseOrder[4] = { GPULayerID_BG3, GPULayerID_BG2, GPULayerID_BG1, GPULayerID_BG0 };
+	for (size_t i = 0; i < 4; i++)
+	{
+		this->ParseReg_BGnCNT(layerParseOrder[i]);
+	}
+}
+
+// Rebuilds the cached properties of one BG layer from its BGnCNT register and from
+// DISPCNT (visibility, VRAM addresses, layer type, size, mosaic, priority and extended
+// palette), then re-sorts the BG layers.
+void GPUEngineBase::ParseReg_BGnCNT(const GPULayerID layerID)
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	const IOREG_BGnCNT &BGnCNT = this->_IORegisterMap->BGnCNT[layerID];
+	
+	// Keep a copy of the raw register next to the decoded values.
+	this->_BGLayer[layerID].BGnCNT = BGnCNT;
+	
+	// Visibility follows the matching enable bit in DISPCNT. Any other layer ID leaves
+	// the flag untouched.
+	if (layerID == GPULayerID_BG0)
+	{
+		this->_BGLayer[layerID].isVisible = (DISPCNT.BG0_Enable != 0);
+	}
+	else if (layerID == GPULayerID_BG1)
+	{
+		this->_BGLayer[layerID].isVisible = (DISPCNT.BG1_Enable != 0);
+	}
+	else if (layerID == GPULayerID_BG2)
+	{
+		this->_BGLayer[layerID].isVisible = (DISPCNT.BG2_Enable != 0);
+	}
+	else if (layerID == GPULayerID_BG3)
+	{
+		this->_BGLayer[layerID].isVisible = (DISPCNT.BG3_Enable != 0);
+	}
+	
+	// Only the main engine adds the DISPCNT screen/character base offsets on top of the
+	// per-layer BGnCNT offsets.
+	if (this->_engineID != GPUEngineID_Main)
+	{
+		this->_BGLayer[layerID].largeBMPAddress  = MMU_BBG;
+		this->_BGLayer[layerID].BMPAddress       = MMU_BBG + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_16KB);
+		this->_BGLayer[layerID].tileMapAddress   = MMU_BBG + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_2KB);
+		this->_BGLayer[layerID].tileEntryAddress = MMU_BBG + (BGnCNT.CharacBase_Block * ADDRESS_STEP_16KB);
+	}
+	else
+	{
+		this->_BGLayer[layerID].largeBMPAddress  = MMU_ABG;
+		this->_BGLayer[layerID].BMPAddress       = MMU_ABG + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_16KB);
+		this->_BGLayer[layerID].tileMapAddress   = MMU_ABG + (DISPCNT.ScreenBase_Block * ADDRESS_STEP_64KB) + (BGnCNT.ScreenBase_Block * ADDRESS_STEP_2KB);
+		this->_BGLayer[layerID].tileEntryAddress = MMU_ABG + (DISPCNT.CharacBase_Block * ADDRESS_STEP_64KB) + (BGnCNT.CharacBase_Block * ADDRESS_STEP_16KB);
+	}
+	
+	// The base type comes straight from the BG mode table. The extended affine type is
+	// then refined into one of its concrete variants.
+	// see: http://nocash.emubase.de/gbatek.htm#dsvideobgmodescontrol
+	const BGType baseType = GPUEngineBase::_mode2type[DISPCNT.BG_Mode][layerID];
+	this->_BGLayer[layerID].baseType = baseType;
+	
+	BGType resolvedType = baseType;
+	if (baseType == BGType_AffineExt)
+	{
+		const u8 affineModeSelection = (BGnCNT.PaletteMode << 1) | (BGnCNT.CharacBase_Block & 1);
+		
+		if (affineModeSelection <= 1)
+		{
+			resolvedType = BGType_AffineExt_256x16;
+		}
+		else if (affineModeSelection == 2)
+		{
+			resolvedType = BGType_AffineExt_256x1;
+		}
+		else if (affineModeSelection == 3)
+		{
+			resolvedType = BGType_AffineExt_Direct;
+		}
+	}
+	
+	// The PaletteSet_Wrap bit has two meanings: for BG0 and BG1 it selects the extended
+	// palette slot (BG2 and BG3 keep theirs), while for BG2 and BG3 it enables display
+	// wrapping (BG0 and BG1 cannot wrap).
+	const bool hasSelectablePaletteSlot = (layerID == GPULayerID_BG0 || layerID == GPULayerID_BG1);
+	if (hasSelectablePaletteSlot)
+	{
+		this->_BGLayer[layerID].extPaletteSlot = (BGnCNT.PaletteSet_Wrap * 2) + layerID;
+	}
+	else
+	{
+		this->_BGLayer[layerID].isDisplayWrapped = (BGnCNT.PaletteSet_Wrap != 0);
+	}
+	
+	this->_BGLayer[layerID].type = resolvedType;
+	this->_BGLayer[layerID].size = GPUEngineBase::_BGLayerSizeLUT[resolvedType][BGnCNT.ScreenSize];
+	this->_BGLayer[layerID].isMosaic = (BGnCNT.Mosaic != 0);
+	this->_BGLayer[layerID].priority = BGnCNT.Priority;
+	this->_BGLayer[layerID].extPalette = (u16 **)&MMU.ExtPal[this->_engineID][this->_BGLayer[layerID].extPaletteSlot];
+	
+	this->_ResortBGLayers();
+}
+
+// Caches the horizontal scroll register of the given BG layer, storing both the raw
+// register and its offset value. On MSB_FIRST builds the offset is taken from the
+// converted raw value masked to 9 bits instead of from the bitfield.
+template <GPULayerID LAYERID>
+void GPUEngineBase::ParseReg_BGnHOFS()
+{
+	const IOREG_BGnHOFS &BGnHOFS = this->_IORegisterMap->BGnOFS[LAYERID].BGnHOFS;
+	
+#ifdef MSB_FIRST
+	const u16 horizontalOffset = LOCAL_TO_LE_16(BGnHOFS.value) & 0x01FF;
+#else
+	const u16 horizontalOffset = BGnHOFS.Offset;
+#endif
+	
+	this->_BGLayer[LAYERID].BGnHOFS = BGnHOFS;
+	this->_BGLayer[LAYERID].xOffset = horizontalOffset;
+}
+
+// Caches the vertical scroll register of the given BG layer, storing both the raw
+// register and its offset value. On MSB_FIRST builds the offset is taken from the
+// converted raw value masked to 9 bits instead of from the bitfield.
+template <GPULayerID LAYERID>
+void GPUEngineBase::ParseReg_BGnVOFS()
+{
+	const IOREG_BGnVOFS &BGnVOFS = this->_IORegisterMap->BGnOFS[LAYERID].BGnVOFS;
+	
+#ifdef MSB_FIRST
+	const u16 verticalOffset = LOCAL_TO_LE_16(BGnVOFS.value) & 0x01FF;
+#else
+	const u16 verticalOffset = BGnVOFS.Offset;
+#endif
+	
+	this->_BGLayer[LAYERID].BGnVOFS = BGnVOFS;
+	this->_BGLayer[LAYERID].yOffset = verticalOffset;
+}
+
+// Saves the current BGnX register value for BG2 or BG3. Other layers are ignored.
+template <GPULayerID LAYERID>
+void GPUEngineBase::ParseReg_BGnX()
+{
+	switch (LAYERID)
+	{
+		case GPULayerID_BG2:
+			this->savedBG2X = this->_IORegisterMap->BG2X;
+			break;
+			
+		case GPULayerID_BG3:
+			this->savedBG3X = this->_IORegisterMap->BG3X;
+			break;
+			
+		default:
+			break;
+	}
+}
+
+// Saves the current BGnY register value for BG2 or BG3. Other layers are ignored.
+template <GPULayerID LAYERID>
+void GPUEngineBase::ParseReg_BGnY()
+{
+	switch (LAYERID)
+	{
+		case GPULayerID_BG2:
+			this->savedBG2Y = this->_IORegisterMap->BG2Y;
+			break;
+			
+		case GPULayerID_BG3:
+			this->savedBG3Y = this->_IORegisterMap->BG3Y;
+			break;
+			
+		default:
+			break;
+	}
+}
+
+// Resets one native-width line before layers are drawn onto it: fills the line's color
+// buffer with the backdrop color (after applying a brightness color effect, if the
+// backdrop is a blend source), marks every pixel as belonging to the backdrop layer,
+// clears the sprite window buffer and empties the per-priority pixel lists.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::_RenderLine_Clear(GPUEngineCompositorInfo &compInfo)
+{
+	const u16 backdropColor16 = compInfo.renderState.backdropColor16;
+	u16 clearColor16 = backdropColor16;
+	
+	if (compInfo.renderState.srcBlendEnable[GPULayerID_Backdrop])
+	{
+		switch (compInfo.renderState.colorEffect)
+		{
+			case ColorEffect_IncreaseBrightness:
+				clearColor16 = compInfo.renderState.brightnessUpTable555[backdropColor16];
+				break;
+				
+			case ColorEffect_DecreaseBrightness:
+				clearColor16 = compInfo.renderState.brightnessDownTable555[backdropColor16];
+				break;
+				
+			default:
+				break;
+		}
+	}
+	
+	// Fill the line with the clear color, converted to the output color format.
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(*compInfo.target.lineColor, clearColor16);
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(*compInfo.target.lineColor, COLOR555TO666(clearColor16));
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(*compInfo.target.lineColor, COLOR555TO888(clearColor16));
+	}
+	
+	memset(this->_renderLineLayerIDNative, GPULayerID_Backdrop, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	memset(this->_sprWin, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	
+	// Reset the pixel count of each of the four priority levels.
+	assert(NB_PRIORITIES == 4);
+	for (size_t priority = 0; priority < 4; priority++)
+	{
+		this->_itemsForPriority[priority].nbPixelsX = 0;
+	}
+}
+
+// Refreshes the backdrop color from BG palette entry 0 (masked to 15 bits), then copies
+// the current render state into the compositor info of line l.
+void GPUEngineBase::UpdateRenderStates(const size_t l)
+{
+	const u16 backdropColor = LE_TO_LOCAL_16(this->_paletteBG[0]) & 0x7FFF;
+	this->_currentRenderState.backdropColor16 = backdropColor;
+	
+	this->_currentCompositorInfo[l].renderState = this->_currentRenderState;
+}
+
+// Base implementation of line rendering. Nothing is drawn here; only the properties that
+// must advance on every line are updated.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::RenderLine(const size_t l)
+{
+	this->UpdatePropertiesWithoutRender(l);
+}
+
+// Advances the BG2/BG3 reference point registers by one step without rendering anything:
+// for each of the two layers that is enabled and whose base type is Affine or AffineExt,
+// PB is added to X and PD is added to Y. The line number argument is not used.
+void GPUEngineBase::UpdatePropertiesWithoutRender(const u16 l)
+{
+	const BGType bg2BaseType = this->_BGLayer[GPULayerID_BG2].baseType;
+	const bool isBG2Affine = ( (bg2BaseType == BGType_Affine) || (bg2BaseType == BGType_AffineExt) );
+	
+	if (this->_enableLayer[GPULayerID_BG2] && isBG2Affine)
+	{
+		IOREG_BG2Parameter &param = this->_IORegisterMap->BG2Param;
+		param.BG2X.value += param.BG2PB.value;
+		param.BG2Y.value += param.BG2PD.value;
+	}
+	
+	const BGType bg3BaseType = this->_BGLayer[GPULayerID_BG3].baseType;
+	const bool isBG3Affine = ( (bg3BaseType == BGType_Affine) || (bg3BaseType == BGType_AffineExt) );
+	
+	if (this->_enableLayer[GPULayerID_BG3] && isBG3Affine)
+	{
+		IOREG_BG3Parameter &param = this->_IORegisterMap->BG3Param;
+		param.BG3X.value += param.BG3PB.value;
+		param.BG3Y.value += param.BG3PD.value;
+	}
+}
+
+// Last-line processing step. Currently this only refreshes the affine start registers.
+void GPUEngineBase::LastLineProcess()
+{
+	this->RefreshAffineStartRegs();
+}
+
+// Returns a read-only reference to this engine's I/O register map.
+const GPU_IOREG& GPUEngineBase::GetIORegisterMap() const
+{
+	const GPU_IOREG &ioRegisterMap = *this->_IORegisterMap;
+	return ioRegisterMap;
+}
+
+// Reports the full-intensity flag of the current render state, as last computed by
+// ParseReg_MASTER_BRIGHT().
+bool GPUEngineBase::IsMasterBrightFullIntensity() const
+{
+	const GPUEngineRenderState &renderState = this->_currentRenderState;
+	return renderState.masterBrightnessIsFullIntensity;
+}
+
+// Reports the max-or-min intensity flag of the current render state, as last computed by
+// ParseReg_MASTER_BRIGHT().
+bool GPUEngineBase::IsMasterBrightMaxOrMin() const
+{
+	const GPUEngineRenderState &renderState = this->_currentRenderState;
+	return renderState.masterBrightnessIsMaxOrMin;
+}
+
+// Reports the full-intensity flag stored in the render state of line 0.
+bool GPUEngineBase::IsMasterBrightFullIntensityAtLineZero() const
+{
+	const GPUEngineRenderState &lineZeroState = this->_currentCompositorInfo[0].renderState;
+	return lineZeroState.masterBrightnessIsFullIntensity;
+}
+
+// Writes the master brightness mode and intensity stored in the render state of line 0
+// into outMode and outIntensity.
+void GPUEngineBase::GetMasterBrightnessAtLineZero(GPUMasterBrightMode &outMode, u8 &outIntensity)
+{
+	const GPUEngineRenderState &lineZeroState = this->_currentCompositorInfo[0].renderState;
+	
+	outMode = lineZeroState.masterBrightnessMode;
+	outIntensity = lineZeroState.masterBrightnessIntensity;
+}
+
+/*****************************************************************************/
+//			ENABLING / DISABLING LAYERS
+/*****************************************************************************/
+
+// Returns the show-screen setting stored in CommonSettings for this engine.
+bool GPUEngineBase::GetEnableState()
+{
+	const bool isScreenShown = CommonSettings.showGpu.screens[this->_engineID];
+	return isScreenShown;
+}
+
+// Stores the show-screen setting for this engine in CommonSettings.
+void GPUEngineBase::SetEnableState(bool theState)
+{
+	const size_t engineIndex = this->_engineID;
+	CommonSettings.showGpu.screens[engineIndex] = theState;
+}
+
+// Returns the display setting stored in CommonSettings for one layer of this engine.
+bool GPUEngineBase::GetLayerEnableState(const size_t layerIndex)
+{
+	const bool isLayerShown = CommonSettings.dispLayers[this->_engineID][layerIndex];
+	return isLayerShown;
+}
+
+// Stores the display setting for one layer of this engine in CommonSettings, then
+// re-sorts the BG layers so the change takes effect.
+void GPUEngineBase::SetLayerEnableState(const size_t layerIndex, bool theState)
+{
+	const size_t engineIndex = this->_engineID;
+	CommonSettings.dispLayers[engineIndex][layerIndex] = theState;
+	
+	this->_ResortBGLayers();
+}
+// Copies line data of ELEMENTSIZE-byte elements from srcBuffer to dstBuffer through CopyLineExpand, choosing the strategy from INTEGERSCALEHINT. When USELINEINDEX is true, the buffer pointers are first advanced to the line selected by l; otherwise both buffers are used as given.
+template <s32 INTEGERSCALEHINT, bool USELINEINDEX, bool NEEDENDIANSWAP, size_t ELEMENTSIZE>
+void GPUEngineBase::_LineCopy(void *__restrict dstBuffer, const void *__restrict srcBuffer, const size_t l)
+{
+	switch (INTEGERSCALEHINT)
+	{
+		case 0: // Source and destination both use the custom width: copy the whole group of custom lines mapped to l in one call.
+		{
+			const size_t lineWidth = GPU->GetDisplayInfo().customWidth;
+			const size_t lineIndex = _gpuCaptureLineIndex[l];
+			const size_t lineCount = _gpuCaptureLineCount[l];
+			
+			const void *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (lineIndex * lineWidth * ELEMENTSIZE) : (u8 *)srcBuffer;
+			void *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (lineIndex * lineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
+			
+			CopyLineExpand<INTEGERSCALEHINT, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, lineWidth * lineCount);
+			break;
+		}
+			
+		case 1: // Source and destination both use the native width: copy exactly one native line.
+		{
+			const void *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
+			void *__restrict dst = (USELINEINDEX) ? (u8 *)dstBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)dstBuffer;
+			
+			CopyLineExpand<INTEGERSCALEHINT, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+			break;
+		}
+			
+		default: // Native-width source, custom-width destination: expand one line, then duplicate it for the remaining lines of the group.
+		{
+			const size_t lineWidth = GPU->GetDisplayInfo().customWidth;
+			const size_t lineCount = _gpuCaptureLineCount[l];
+			const size_t lineIndex = _gpuCaptureLineIndex[l];
+			// The source is indexed by the native line l, while the destination is indexed by the custom line index mapped to l.
+			const void *__restrict src = (USELINEINDEX) ? (u8 *)srcBuffer + (l * GPU_FRAMEBUFFER_NATIVE_WIDTH * ELEMENTSIZE) : (u8 *)srcBuffer;
+			u8 *__restrict dstLineHead = (USELINEINDEX) ? (u8 *)dstBuffer + (lineIndex * lineWidth * ELEMENTSIZE) : (u8 *)dstBuffer;
+			
+			// TODO: Determine INTEGERSCALEHINT earlier in the pipeline, preferably when the framebuffer is first initialized.
+			//
+			// The implementation below is a stopgap measure for getting the faster code paths to run.
+			// However, this setup is not ideal, since the code size will greatly increase in order to
+			// include all possible code paths, possibly causing cache misses on lesser CPUs.
+			if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2))
+			{
+				CopyLineExpand<2, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
+			}
+			else if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3))
+			{
+				CopyLineExpand<3, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
+			}
+			else if (lineWidth == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4))
+			{
+				CopyLineExpand<4, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
+			}
+			else if ((lineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0) // Any other whole multiple of the native width.
+			{
+				CopyLineExpand<0xFFFF, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, lineWidth);
+			}
+			else // Width is not a whole multiple of the native width.
+			{
+				CopyLineExpand<-1, NEEDENDIANSWAP, ELEMENTSIZE>(dstLineHead, src, lineWidth);
+			}
+			// The first destination line now holds the expanded pixels; start writing copies at the line after it.
+			u8 *__restrict dst = (u8 *)dstLineHead + (lineWidth * ELEMENTSIZE);
+			
+			for (size_t line = 1; line < lineCount; line++)
+			{
+				memcpy(dst, dstLineHead, lineWidth * ELEMENTSIZE);
+				dst += (lineWidth * ELEMENTSIZE);
+			}
+			
+			break;
+		}
+	}
+}
+
+// Switches a line from native-size rendering to custom-size rendering. If the line is
+// still flagged as native, its color data and layer ID data are copied from the native
+// buffers into the custom buffers, the compositor targets are repointed to the custom
+// buffers, and the native line bookkeeping is updated. Does nothing if the line has
+// already been transitioned.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::_TransitionLineNativeToCustom(GPUEngineCompositorInfo &compInfo)
+{
+	const size_t nativeLineIndex = compInfo.line.indexNative;
+	
+	if (!this->isLineRenderNative[nativeLineIndex])
+	{
+		return;
+	}
+	
+	// Color data: 2 bytes per pixel for BGR555, 4 bytes per pixel for BGR666 and BGR888.
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		this->_LineCopy<0xFFFF, false, false, 2>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, 0);
+	}
+	else if ( (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) || (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) )
+	{
+		this->_LineCopy<0xFFFF, false, false, 4>(compInfo.target.lineColorHeadCustom, compInfo.target.lineColorHeadNative, 0);
+	}
+	
+	// Layer ID data: 1 byte per pixel.
+	this->_LineCopy<0xFFFF, false, false, 1>(compInfo.target.lineLayerIDHeadCustom, compInfo.target.lineLayerIDHeadNative, 0);
+	
+	compInfo.target.lineColorHead = compInfo.target.lineColorHeadCustom;
+	compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadCustom;
+	
+	this->isLineRenderNative[nativeLineIndex] = false;
+	this->nativeLineRenderCount--;
+}
+
+/*****************************************************************************/
+//			PIXEL RENDERING
+/*****************************************************************************/
+// Writes a 16-bit source color to the current target pixel as an opaque color in the
+// output format. Unless this is a debug render, the target pixel's layer ID is also set
+// to the currently selected layer.
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, const u16 srcColor16)
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		*compInfo.target.lineColor16 = srcColor16 | 0x8000;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		compInfo.target.lineColor32->color = ColorspaceConvert555To6665Opaque<false>(srcColor16);
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		compInfo.target.lineColor32->color = ColorspaceConvert555To8888Opaque<false>(srcColor16);
+	}
+	
+	if (ISDEBUGRENDER)
+	{
+		return;
+	}
+	
+	*compInfo.target.lineLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Writes a 32-bit source color to the current target pixel as an opaque color in the
+// output format. For BGR555 output the source is converted from 6665 to 5551 first.
+// Unless this is a debug render, the target pixel's layer ID is also set to the currently
+// selected layer. For an unrecognized output format, nothing is written at all.
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void GPUEngineBase::_PixelCopy(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32)
+{
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		*compInfo.target.lineColor16 = ColorspaceConvert6665To5551<false>(srcColor32) | 0x8000;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		FragmentColor opaqueColor = srcColor32;
+		opaqueColor.a = 0x1F;
+		*compInfo.target.lineColor32 = opaqueColor;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		FragmentColor opaqueColor = srcColor32;
+		opaqueColor.a = 0xFF;
+		*compInfo.target.lineColor32 = opaqueColor;
+	}
+	else
+	{
+		return;
+	}
+	
+	if (ISDEBUGRENDER)
+	{
+		return;
+	}
+	
+	*compInfo.target.lineLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Writes a brightened version of a 16-bit source color to the current target pixel. The
+// brightened color is looked up in the render state's brightness-up table matching the
+// output format (indexed by the low 15 bits of the source color) and is stored as opaque.
+// The target pixel's layer ID is set to the currently selected layer.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const u16 srcColor16)
+{
+	const u16 tableIndex = srcColor16 & 0x7FFF;
+	
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		*compInfo.target.lineColor16 = compInfo.renderState.brightnessUpTable555[tableIndex] | 0x8000;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		FragmentColor brightened = compInfo.renderState.brightnessUpTable666[tableIndex];
+		brightened.a = 0x1F;
+		*compInfo.target.lineColor32 = brightened;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		FragmentColor brightened = compInfo.renderState.brightnessUpTable888[tableIndex];
+		brightened.a = 0xFF;
+		*compInfo.target.lineColor32 = brightened;
+	}
+	
+	*compInfo.target.lineLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Writes a brightened version of a 32-bit source color to the current target pixel.
+//
+// For BGR555 output, the source is converted from 6665 to 5551 and brightened through the
+// 555 brightness-up table. For the other output formats, the brightening is computed from
+// the render state's blendEVY. The result is stored as opaque, and the target pixel's
+// layer ID is set to the currently selected layer.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessUp(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32)
+{
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		FragmentColor brightened = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(srcColor32, compInfo.renderState.blendEVY);
+		brightened.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
+		*compInfo.target.lineColor32 = brightened;
+	}
+	else
+	{
+		const u16 srcColor16 = ColorspaceConvert6665To5551<false>(srcColor32);
+		*compInfo.target.lineColor16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF] | 0x8000;
+	}
+	
+	*compInfo.target.lineLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Writes the brightness-down version of a 16-bit source color to the current target pixel.
+//
+// The result comes from the render state's brightnessDownTable for the requested OUTPUTFORMAT,
+// indexed by the low 15 bits of srcColor16. The alpha is then forced: bit 15 (0x8000) for
+// BGR555 output, 0x1F for BGR666 output and 0xFF for BGR888 output. For any other output
+// format no color is written.
+//
+// The target pixel's layer ID is always set to the selected layer ID.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const u16 srcColor16)
+{
+	// Bit 15 of the source color is not part of the table index.
+	const u16 tableIndex = srcColor16 & 0x7FFF;
+	
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		*compInfo.target.lineColor16 = compInfo.renderState.brightnessDownTable555[tableIndex] | 0x8000;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		FragmentColor &pixel32 = *compInfo.target.lineColor32;
+		pixel32 = compInfo.renderState.brightnessDownTable666[tableIndex];
+		pixel32.a = 0x1F;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		FragmentColor &pixel32 = *compInfo.target.lineColor32;
+		pixel32 = compInfo.renderState.brightnessDownTable888[tableIndex];
+		pixel32.a = 0xFF;
+	}
+	
+	*compInfo.target.lineLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Writes the brightness-down version of a 32-bit source color to the current target pixel.
+//
+// - BGR555 output: srcColor32 is converted with ColorspaceConvert6665To5551<false>(), the low
+//   15 bits of the converted color index brightnessDownTable555, and the alpha bit (0x8000) is set.
+// - Any other output: the color is computed by _ColorEffectDecreaseBrightness() from srcColor32
+//   and the render state's blendEVY; alpha is then forced to 0xFF for BGR888 and to 0x1F otherwise.
+//
+// The target pixel's layer ID is always set to the selected layer ID.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessDown(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32)
+{
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		FragmentColor &pixel32 = *compInfo.target.lineColor32;
+		pixel32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY);
+		
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			pixel32.a = 0xFF;
+		}
+		else
+		{
+			pixel32.a = 0x1F;
+		}
+	}
+	else
+	{
+		// 16-bit output goes through the lookup table instead of the arithmetic path.
+		const u16 converted16 = ColorspaceConvert6665To5551<false>(srcColor32);
+		u16 &pixel16 = *compInfo.target.lineColor16;
+		pixel16 = compInfo.renderState.brightnessDownTable555[converted16 & 0x7FFF];
+		pixel16 |= 0x8000;
+	}
+	
+	*compInfo.target.lineLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Composites one 16-bit source color onto the current target pixel, choosing the color effect
+// for this pixel from the render state.
+//
+// Parameters:
+//   compInfo          - supplies the target pixel pointers (target.*) and the blend state (renderState.*).
+//   srcColor16        - source color; the brightness table lookups use only its low 15 bits.
+//   spriteAlpha       - alpha of an OBJ pixel; 0xFF means "use the render state's EVA/EVB instead".
+//   enableColorEffect - when false, the source color is written without any effect.
+//
+// Effect selection, in priority order:
+//   1. OBJ layers only: if enableColorEffect is set, the sprite mode at this pixel is
+//      OBJMode_Transparent or OBJMode_Bitmap, and the destination pixel is a valid blend
+//      target, blending is forced.
+//   2. If enableColorEffect is set and the selected layer is flagged in srcBlendEnable, the
+//      render state's colorEffect is used. Blend additionally needs a valid destination blend
+//      target; the two brightness effects do not.
+//   3. Otherwise the source color is copied.
+//
+// The target pixel's layer ID is always set to the selected layer ID afterwards.
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void GPUEngineBase::_PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha, const bool enableColorEffect)
+{
+	u16 &pixel16 = *compInfo.target.lineColor16;
+	FragmentColor &pixel32 = *compInfo.target.lineColor32;
+	u8 &pixelLayerID = *compInfo.target.lineLayerID;
+	
+	// The pixel already in the target is a valid blend destination only if it came from a
+	// different layer and that layer is flagged in dstBlendEnable.
+	const bool isDstBlendTarget = (pixelLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[pixelLayerID];
+	
+	TBlendTable *blendTable = compInfo.renderState.blendTable555;
+	u8 eva = compInfo.renderState.blendEVA;
+	u8 evb = compInfo.renderState.blendEVB;
+	ColorEffect effect = ColorEffect_Disable;
+	
+	if ( (LAYERTYPE == GPULayerType_OBJ) && enableColorEffect && isDstBlendTarget )
+	{
+		// Translucent-capable OBJ force a blend whenever the second target is satisfied.
+		const OBJMode objMode = (OBJMode)this->_sprType[compInfo.target.xNative];
+		
+		if ( (objMode == OBJMode_Transparent) || (objMode == OBJMode_Bitmap) )
+		{
+			effect = ColorEffect_Blend;
+			
+			// OBJ without fine-grained alpha blend with EVA/EVB; they are recognized by an
+			// alpha of 0xFF. Any other alpha value replaces EVA/EVB for this pixel.
+			// Test cases:
+			// * The spriteblend demo
+			// * Glory of Heracles - fairy on the title screen
+			// * Phoenix Wright: Ace Attorney - character fade-in/fade-out
+			if (spriteAlpha != 0xFF)
+			{
+				eva = spriteAlpha;
+				evb = 16 - spriteAlpha;
+				blendTable = &GPUEngineBase::_blendTable555[eva][evb];
+			}
+		}
+	}
+	
+	// Without a forced blend, fall back to the effect selected by the BLDCNT target flags.
+	if ( (effect != ColorEffect_Blend) && enableColorEffect && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] )
+	{
+		const ColorEffect requested = compInfo.renderState.colorEffect;
+		
+		// The brightness effects only need the first target flag.
+		// Test case: Bomberman Land Touch! dialog boxes will render too dark without this check.
+		const bool wantsBrightness = (requested == ColorEffect_IncreaseBrightness) || (requested == ColorEffect_DecreaseBrightness);
+		
+		// The blend effect needs both the first and the second target flags.
+		const bool wantsBlend = (requested == ColorEffect_Blend) && isDstBlendTarget;
+		
+		if (wantsBrightness || wantsBlend)
+		{
+			effect = requested;
+		}
+	}
+	
+	// Render the pixel in the requested output format using the selected effect.
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		switch (effect)
+		{
+			case ColorEffect_Disable:
+				pixel16 = srcColor16;
+				break;
+				
+			case ColorEffect_IncreaseBrightness:
+				pixel16 = compInfo.renderState.brightnessUpTable555[srcColor16 & 0x7FFF];
+				break;
+				
+			case ColorEffect_DecreaseBrightness:
+				pixel16 = compInfo.renderState.brightnessDownTable555[srcColor16 & 0x7FFF];
+				break;
+				
+			case ColorEffect_Blend:
+				pixel16 = this->_ColorEffectBlend(srcColor16, pixel16, blendTable);
+				break;
+		}
+		
+		// 'effect' can only hold one of the four values above, so the alpha bit is set for every pixel.
+		pixel16 |= 0x8000;
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		switch (effect)
+		{
+			case ColorEffect_Disable:
+				pixel32.color = ColorspaceConvert555To6665Opaque<false>(srcColor16);
+				break;
+				
+			case ColorEffect_IncreaseBrightness:
+				pixel32 = compInfo.renderState.brightnessUpTable666[srcColor16 & 0x7FFF];
+				pixel32.a = 0x1F;
+				break;
+				
+			case ColorEffect_DecreaseBrightness:
+				pixel32 = compInfo.renderState.brightnessDownTable666[srcColor16 & 0x7FFF];
+				pixel32.a = 0x1F;
+				break;
+				
+			case ColorEffect_Blend:
+			{
+				FragmentColor widenedSrc;
+				widenedSrc.color = ColorspaceConvert555To6665Opaque<false>(srcColor16);
+				pixel32 = this->_ColorEffectBlend<OUTPUTFORMAT>(widenedSrc, pixel32, eva, evb);
+				pixel32.a = 0x1F;
+				break;
+			}
+		}
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		switch (effect)
+		{
+			case ColorEffect_Disable:
+				pixel32.color = ColorspaceConvert555To8888Opaque<false>(srcColor16);
+				break;
+				
+			case ColorEffect_IncreaseBrightness:
+				pixel32 = compInfo.renderState.brightnessUpTable888[srcColor16 & 0x7FFF];
+				pixel32.a = 0xFF;
+				break;
+				
+			case ColorEffect_DecreaseBrightness:
+				pixel32 = compInfo.renderState.brightnessDownTable888[srcColor16 & 0x7FFF];
+				pixel32.a = 0xFF;
+				break;
+				
+			case ColorEffect_Blend:
+			{
+				FragmentColor widenedSrc;
+				widenedSrc.color = ColorspaceConvert555To8888Opaque<false>(srcColor16);
+				pixel32 = this->_ColorEffectBlend<OUTPUTFORMAT>(widenedSrc, pixel32, eva, evb);
+				pixel32.a = 0xFF;
+				break;
+			}
+		}
+	}
+	
+	pixelLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Composites one 32-bit source color onto the current target pixel, choosing the color effect
+// for this pixel from the render state.
+//
+// Parameters:
+//   compInfo          - supplies the target pixel pointers (target.*) and the blend state (renderState.*).
+//   srcColor32        - source color.
+//   spriteAlpha       - alpha of an OBJ pixel; 0xFF means "use the render state's EVA/EVB instead".
+//   enableColorEffect - gates the OBJ blend forcing and the BLDCNT-selected effect. It does NOT
+//                       gate the 3D blend override described below.
+//
+// Effect selection, in priority order:
+//   1. 3D layers only: blending is forced whenever the destination pixel is a valid blend target.
+//   2. OBJ layers only: if enableColorEffect is set, the sprite mode at this pixel is
+//      OBJMode_Transparent or OBJMode_Bitmap, and the destination pixel is a valid blend
+//      target, blending is forced.
+//   3. If enableColorEffect is set and the selected layer is flagged in srcBlendEnable, the
+//      render state's colorEffect is used. Blend additionally needs a valid destination blend
+//      target; the two brightness effects do not.
+//   4. Otherwise the source color is copied.
+//
+// The target pixel's layer ID is always set to the selected layer ID afterwards.
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void GPUEngineBase::_PixelUnknownEffect(GPUEngineCompositorInfo &compInfo, const FragmentColor srcColor32, const u8 spriteAlpha, const bool enableColorEffect)
+{
+	u16 &pixel16 = *compInfo.target.lineColor16;
+	FragmentColor &pixel32 = *compInfo.target.lineColor32;
+	u8 &pixelLayerID = *compInfo.target.lineLayerID;
+	
+	// The pixel already in the target is a valid blend destination only if it came from a
+	// different layer and that layer is flagged in dstBlendEnable.
+	const bool isDstBlendTarget = (pixelLayerID != compInfo.renderState.selectedLayerID) && compInfo.renderState.dstBlendEnable[pixelLayerID];
+	
+	u8 eva = compInfo.renderState.blendEVA;
+	u8 evb = compInfo.renderState.blendEVB;
+	ColorEffect effect = ColorEffect_Disable;
+	
+	// 3D rendering has a special override: if the destination pixel is set to blend, then
+	// always blend.
+	// Test case: When starting a stage in Super Princess Peach, the screen will be solid black
+	// unless blending is forced here.
+	//
+	// This override must win over the window color effect enable flag.
+	// Test case: Dialogue boxes in Front Mission will be rendered with blending disabled unless
+	// blend forcing takes priority.
+	if ( (LAYERTYPE == GPULayerType_3D) && isDstBlendTarget )
+	{
+		effect = ColorEffect_Blend;
+	}
+	
+	if ( (LAYERTYPE == GPULayerType_OBJ) && enableColorEffect && isDstBlendTarget )
+	{
+		// Translucent-capable OBJ force a blend whenever the second target is satisfied.
+		const OBJMode objMode = (OBJMode)this->_sprType[compInfo.target.xNative];
+		
+		if ( (objMode == OBJMode_Transparent) || (objMode == OBJMode_Bitmap) )
+		{
+			effect = ColorEffect_Blend;
+			
+			// OBJ without fine-grained alpha blend with EVA/EVB; they are recognized by an
+			// alpha of 0xFF. Any other alpha value replaces EVA/EVB for this pixel.
+			// Test cases:
+			// * The spriteblend demo
+			// * Glory of Heracles - fairy on the title screen
+			// * Phoenix Wright: Ace Attorney - character fade-in/fade-out
+			if (spriteAlpha != 0xFF)
+			{
+				eva = spriteAlpha;
+				evb = 16 - spriteAlpha;
+			}
+		}
+	}
+	
+	// Without a forced blend, fall back to the effect selected by the BLDCNT target flags.
+	if ( (effect != ColorEffect_Blend) && enableColorEffect && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] )
+	{
+		const ColorEffect requested = compInfo.renderState.colorEffect;
+		
+		// The brightness effects only need the first target flag.
+		// Test case: Bomberman Land Touch! dialog boxes will render too dark without this check.
+		const bool wantsBrightness = (requested == ColorEffect_IncreaseBrightness) || (requested == ColorEffect_DecreaseBrightness);
+		
+		// The blend effect needs both the first and the second target flags.
+		const bool wantsBlend = (requested == ColorEffect_Blend) && isDstBlendTarget;
+		
+		if (wantsBrightness || wantsBlend)
+		{
+			effect = requested;
+		}
+	}
+	
+	// Render the pixel using the selected color effect.
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		if (effect == ColorEffect_Disable)
+		{
+			pixel32 = srcColor32;
+		}
+		else if (effect == ColorEffect_IncreaseBrightness)
+		{
+			pixel32 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(srcColor32, compInfo.renderState.blendEVY);
+		}
+		else if (effect == ColorEffect_DecreaseBrightness)
+		{
+			pixel32 = this->_ColorEffectDecreaseBrightness(srcColor32, compInfo.renderState.blendEVY);
+		}
+		else if (effect == ColorEffect_Blend)
+		{
+			// 3D pixels use the dedicated 3D blend, which is not given EVA/EVB; every other
+			// layer type blends with the EVA/EVB chosen above.
+			if (LAYERTYPE == GPULayerType_3D)
+			{
+				pixel32 = this->_ColorEffectBlend3D<OUTPUTFORMAT>(srcColor32, pixel32);
+			}
+			else
+			{
+				pixel32 = this->_ColorEffectBlend<OUTPUTFORMAT>(srcColor32, pixel32, eva, evb);
+			}
+		}
+		
+		pixel32.a = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF : 0x1F;
+	}
+	else
+	{
+		const u16 converted16 = ColorspaceConvert6665To5551<false>(srcColor32);
+		
+		if (effect == ColorEffect_Disable)
+		{
+			pixel16 = converted16;
+		}
+		else if (effect == ColorEffect_IncreaseBrightness)
+		{
+			pixel16 = compInfo.renderState.brightnessUpTable555[converted16 & 0x7FFF];
+		}
+		else if (effect == ColorEffect_DecreaseBrightness)
+		{
+			pixel16 = compInfo.renderState.brightnessDownTable555[converted16 & 0x7FFF];
+		}
+		else if (effect == ColorEffect_Blend)
+		{
+			// NOTE(review): the 16-bit path always uses _ColorEffectBlend3D(), whatever
+			// LAYERTYPE is, so the EVA/EVB values chosen above are unused here. Presumably
+			// only the 3D layer reaches this path with 16-bit output - confirm against callers.
+			pixel16 = this->_ColorEffectBlend3D(srcColor32, pixel16);
+		}
+		
+		pixel16 |= 0x8000;
+	}
+	
+	pixelLayerID = compInfo.renderState.selectedLayerID;
+}
+
+// Composites one 16-bit source color onto the current target pixel by dispatching to the pixel
+// routine that matches COMPOSITORMODE:
+//   GPUCompositorMode_Debug      -> _PixelCopy() with ISDEBUGRENDER = true
+//   GPUCompositorMode_Copy       -> _PixelCopy() with ISDEBUGRENDER = false
+//   GPUCompositorMode_BrightUp   -> _PixelBrightnessUp()
+//   GPUCompositorMode_BrightDown -> _PixelBrightnessDown()
+//   any other mode               -> _PixelUnknownEffect()
+//
+// spriteAlpha and enableColorEffect are only forwarded to _PixelUnknownEffect(); the other
+// routines ignore them.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void GPUEngineBase::_PixelComposite(GPUEngineCompositorInfo &compInfo, const u16 srcColor16, const u8 spriteAlpha, const bool enableColorEffect)
+{
+	if (COMPOSITORMODE == GPUCompositorMode_Debug)
+	{
+		this->_PixelCopy<OUTPUTFORMAT, true>(compInfo, srcColor16);
+	}
+	else if (COMPOSITORMODE == GPUCompositorMode_Copy)
+	{
+		this->_PixelCopy<OUTPUTFORMAT, false>(compInfo, srcColor16);
+	}
+	else if (COMPOSITORMODE == GPUCompositorMode_BrightUp)
+	{
+		this->_PixelBrightnessUp<OUTPUTFORMAT>(compInfo, srcColor16);
+	}
+	else if (COMPOSITORMODE == GPUCompositorMode_BrightDown)
+	{
+		this->_PixelBrightnessDown<OUTPUTFORMAT>(compInfo, srcColor16);
+	}
+	else
+	{
+		this->_PixelUnknownEffect<OUTPUTFORMAT, LAYERTYPE>(compInfo, srcColor16, spriteAlpha, enableColorEffect);
+	}
+}
+
+// Composites one 32-bit source color onto the current target pixel by dispatching to the pixel
+// routine that matches COMPOSITORMODE:
+//   GPUCompositorMode_Debug      -> _PixelCopy() with ISDEBUGRENDER = true
+//   GPUCompositorMode_Copy       -> _PixelCopy() with ISDEBUGRENDER = false
+//   GPUCompositorMode_BrightUp   -> _PixelBrightnessUp()
+//   GPUCompositorMode_BrightDown -> _PixelBrightnessDown()
+//   any other mode               -> _PixelUnknownEffect()
+//
+// spriteAlpha and enableColorEffect are only forwarded to _PixelUnknownEffect(); the other
+// routines ignore them.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void GPUEngineBase::_PixelComposite(GPUEngineCompositorInfo &compInfo, FragmentColor srcColor32, const u8 spriteAlpha, const bool enableColorEffect)
+{
+	if (COMPOSITORMODE == GPUCompositorMode_Debug)
+	{
+		this->_PixelCopy<OUTPUTFORMAT, true>(compInfo, srcColor32);
+	}
+	else if (COMPOSITORMODE == GPUCompositorMode_Copy)
+	{
+		this->_PixelCopy<OUTPUTFORMAT, false>(compInfo, srcColor32);
+	}
+	else if (COMPOSITORMODE == GPUCompositorMode_BrightUp)
+	{
+		this->_PixelBrightnessUp<OUTPUTFORMAT>(compInfo, srcColor32);
+	}
+	else if (COMPOSITORMODE == GPUCompositorMode_BrightDown)
+	{
+		this->_PixelBrightnessDown<OUTPUTFORMAT>(compInfo, srcColor32);
+	}
+	else
+	{
+		this->_PixelUnknownEffect<OUTPUTFORMAT, LAYERTYPE>(compInfo, srcColor32, spriteAlpha, enableColorEffect);
+	}
+}
+
+#ifdef ENABLE_SSE2
+
+// Vectorized copy of 16 pixels: every source vector is written to its destination vector with
+// the alpha bits set, and no color effect is applied.
+//
+// - BGR555 output: only src0/src1 -> dst0/dst1 are processed (8 pixels each, as 16-bit lanes
+//   with 0x8000 OR'ed in); dst2 and dst3 are left untouched.
+// - Any other output: src0..src3 -> dst0..dst3 are processed (4 pixels each, as 32-bit lanes)
+//   with 0x1F000000 OR'ed in for BGR666 and 0xFF000000 otherwise.
+//
+// Unless ISDEBUGRENDER is true, all 16 layer ID bytes in dstLayerID are set to the selected
+// layer ID.
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void GPUEngineBase::_PixelCopy16_SSE2(GPUEngineCompositorInfo &compInfo,
+												  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+												  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+												  __m128i &dstLayerID)
+{
+	// Alpha pattern matching the lane width of the output format.
+	const __m128i alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? _mm_set1_epi16(0x8000) : _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+	
+	dst0 = _mm_or_si128(src0, alphaBits);
+	dst1 = _mm_or_si128(src1, alphaBits);
+	
+	// 32-bit formats need four vectors to hold 16 pixels; 16-bit output only needs the first two.
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		dst2 = _mm_or_si128(src2, alphaBits);
+		dst3 = _mm_or_si128(src3, alphaBits);
+	}
+	
+	if (!ISDEBUGRENDER)
+	{
+		dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+	}
+}
+
+// Vectorized masked copy of 16 pixels: where passMask8 selects a pixel, the source color with
+// its alpha bits set replaces the destination color; elsewhere the destination is kept.
+//
+// passMask8 holds one byte per pixel. It is widened to 16-bit lanes (BGR555 output, using
+// dst0/dst1 only) or to 32-bit lanes (any other output, using dst0..dst3) so that it lines up
+// with the color lanes. Unless ISDEBUGRENDER is true, the layer IDs of the selected pixels are
+// replaced with the selected layer ID, using passMask8 directly.
+//
+// NOTE(review): _mm_blendv_epi8 is an SSE4.1 intrinsic, yet this code is only guarded by
+// ENABLE_SSE2 - presumably a fallback definition exists elsewhere in the project; confirm.
+template <NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER>
+FORCEINLINE void GPUEngineBase::_PixelCopyWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
+														  const __m128i &passMask8,
+														  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+														  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+														  __m128i &dstLayerID)
+{
+	// Duplicate each mask byte into a 16-bit lane: pixels 0-7, then pixels 8-15.
+	const __m128i maskLo16 = _mm_unpacklo_epi8(passMask8, passMask8);
+	const __m128i maskHi16 = _mm_unpackhi_epi8(passMask8, passMask8);
+	
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		// Duplicate again into 32-bit lanes: four pixels per vector.
+		const __m128i mask32_0 = _mm_unpacklo_epi16(maskLo16, maskLo16);
+		const __m128i mask32_1 = _mm_unpackhi_epi16(maskLo16, maskLo16);
+		const __m128i mask32_2 = _mm_unpacklo_epi16(maskHi16, maskHi16);
+		const __m128i mask32_3 = _mm_unpackhi_epi16(maskHi16, maskHi16);
+		const __m128i alpha32 = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		
+		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(src0, alpha32), mask32_0);
+		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(src1, alpha32), mask32_1);
+		dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(src2, alpha32), mask32_2);
+		dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(src3, alpha32), mask32_3);
+	}
+	else
+	{
+		const __m128i alpha16 = _mm_set1_epi16(0x8000);
+		
+		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(src0, alpha16), maskLo16);
+		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(src1, alpha16), maskHi16);
+	}
+	
+	if (!ISDEBUGRENDER)
+	{
+		dstLayerID = _mm_blendv_epi8(dstLayerID, _mm_set1_epi8(compInfo.renderState.selectedLayerID), passMask8);
+	}
+}
+
+// Vectorized brightness-up of 16 pixels: each source vector is run through
+// _ColorEffectIncreaseBrightness<OUTPUTFORMAT>() with the render state's blendEVY, the alpha
+// bits are OR'ed in, and the result is written to the matching destination vector.
+//
+// - BGR555 output: only src0/src1 -> dst0/dst1 are processed (alpha bits 0x8000 per 16-bit
+//   lane); dst2 and dst3 are left untouched.
+// - Any other output: src0..src3 -> dst0..dst3 are processed, with alpha bits 0x1F000000 for
+//   BGR666 and 0xFF000000 otherwise (per 32-bit lane).
+//
+// All 16 layer ID bytes in dstLayerID are set to the selected layer ID.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessUp16_SSE2(GPUEngineCompositorInfo &compInfo,
+														  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+														  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+														  __m128i &dstLayerID)
+{
+	const __m128i evy = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	
+	// Alpha pattern matching the lane width of the output format.
+	const __m128i alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? _mm_set1_epi16(0x8000) : _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+	
+	dst0 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy), alphaBits);
+	dst1 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy), alphaBits);
+	
+	// 32-bit formats need four vectors to hold 16 pixels; 16-bit output only needs the first two.
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		dst2 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src2, evy), alphaBits);
+		dst3 = _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src3, evy), alphaBits);
+	}
+	
+	dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+}
+
+// Vectorized masked brightness-up of 16 pixels: where passMask8 selects a pixel, the source
+// color is run through _ColorEffectIncreaseBrightness<OUTPUTFORMAT>() with the render state's
+// blendEVY, gets its alpha bits set, and replaces the destination color; elsewhere the
+// destination is kept.
+//
+// passMask8 holds one byte per pixel. It is widened to 16-bit lanes (BGR555 output, using
+// dst0/dst1 only) or to 32-bit lanes (any other output, using dst0..dst3) so that it lines up
+// with the color lanes. The layer IDs of the selected pixels are replaced with the selected
+// layer ID, using passMask8 directly.
+//
+// NOTE(review): _mm_blendv_epi8 is an SSE4.1 intrinsic, yet this code is only guarded by
+// ENABLE_SSE2 - presumably a fallback definition exists elsewhere in the project; confirm.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessUpWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
+																  const __m128i &passMask8,
+																  const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+																  __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+																  __m128i &dstLayerID)
+{
+	const __m128i evy = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	
+	// Duplicate each mask byte into a 16-bit lane: pixels 0-7, then pixels 8-15.
+	const __m128i maskLo16 = _mm_unpacklo_epi8(passMask8, passMask8);
+	const __m128i maskHi16 = _mm_unpackhi_epi8(passMask8, passMask8);
+	
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		// Duplicate again into 32-bit lanes: four pixels per vector.
+		const __m128i mask32_0 = _mm_unpacklo_epi16(maskLo16, maskLo16);
+		const __m128i mask32_1 = _mm_unpackhi_epi16(maskLo16, maskLo16);
+		const __m128i mask32_2 = _mm_unpacklo_epi16(maskHi16, maskHi16);
+		const __m128i mask32_3 = _mm_unpackhi_epi16(maskHi16, maskHi16);
+		const __m128i alpha32 = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		
+		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy), alpha32), mask32_0);
+		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy), alpha32), mask32_1);
+		dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src2, evy), alpha32), mask32_2);
+		dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src3, evy), alpha32), mask32_3);
+	}
+	else
+	{
+		const __m128i alpha16 = _mm_set1_epi16(0x8000);
+		
+		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src0, evy), alpha16), maskLo16);
+		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(src1, evy), alpha16), maskHi16);
+	}
+	
+	dstLayerID = _mm_blendv_epi8(dstLayerID, _mm_set1_epi8(compInfo.renderState.selectedLayerID), passMask8);
+}
+
+// Vectorized brightness-down of 16 pixels: each source vector is run through
+// _ColorEffectDecreaseBrightness<OUTPUTFORMAT>() with the render state's blendEVY, the alpha
+// bits are OR'ed in, and the result is written to the matching destination vector.
+//
+// - BGR555 output: only src0/src1 -> dst0/dst1 are processed (alpha bits 0x8000 per 16-bit
+//   lane); dst2 and dst3 are left untouched.
+// - Any other output: src0..src3 -> dst0..dst3 are processed, with alpha bits 0x1F000000 for
+//   BGR666 and 0xFF000000 otherwise (per 32-bit lane).
+//
+// All 16 layer ID bytes in dstLayerID are set to the selected layer ID.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessDown16_SSE2(GPUEngineCompositorInfo &compInfo,
+															const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+															__m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+															__m128i &dstLayerID)
+{
+	const __m128i evy = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	
+	// Alpha pattern matching the lane width of the output format.
+	const __m128i alphaBits = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? _mm_set1_epi16(0x8000) : _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+	
+	dst0 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy), alphaBits);
+	dst1 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy), alphaBits);
+	
+	// 32-bit formats need four vectors to hold 16 pixels; 16-bit output only needs the first two.
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		dst2 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src2, evy), alphaBits);
+		dst3 = _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src3, evy), alphaBits);
+	}
+	
+	dstLayerID = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+}
+
+// Vectorized masked brightness-down of 16 pixels: where passMask8 selects a pixel, the source
+// color is run through _ColorEffectDecreaseBrightness<OUTPUTFORMAT>() with the render state's
+// blendEVY, gets its alpha bits set, and replaces the destination color; elsewhere the
+// destination is kept.
+//
+// passMask8 holds one byte per pixel. It is widened to 16-bit lanes (BGR555 output, using
+// dst0/dst1 only) or to 32-bit lanes (any other output, using dst0..dst3) so that it lines up
+// with the color lanes. The layer IDs of the selected pixels are replaced with the selected
+// layer ID, using passMask8 directly.
+//
+// NOTE(review): _mm_blendv_epi8 is an SSE4.1 intrinsic, yet this code is only guarded by
+// ENABLE_SSE2 - presumably a fallback definition exists elsewhere in the project; confirm.
+template <NDSColorFormat OUTPUTFORMAT>
+FORCEINLINE void GPUEngineBase::_PixelBrightnessDownWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
+																	const __m128i &passMask8,
+																	const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+																	__m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+																	__m128i &dstLayerID)
+{
+	const __m128i evy = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	
+	// Duplicate each mask byte into a 16-bit lane: pixels 0-7, then pixels 8-15.
+	const __m128i maskLo16 = _mm_unpacklo_epi8(passMask8, passMask8);
+	const __m128i maskHi16 = _mm_unpackhi_epi8(passMask8, passMask8);
+	
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		// Duplicate again into 32-bit lanes: four pixels per vector.
+		const __m128i mask32_0 = _mm_unpacklo_epi16(maskLo16, maskLo16);
+		const __m128i mask32_1 = _mm_unpackhi_epi16(maskLo16, maskLo16);
+		const __m128i mask32_2 = _mm_unpacklo_epi16(maskHi16, maskHi16);
+		const __m128i mask32_3 = _mm_unpackhi_epi16(maskHi16, maskHi16);
+		const __m128i alpha32 = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		
+		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy), alpha32), mask32_0);
+		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy), alpha32), mask32_1);
+		dst2 = _mm_blendv_epi8(dst2, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src2, evy), alpha32), mask32_2);
+		dst3 = _mm_blendv_epi8(dst3, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src3, evy), alpha32), mask32_3);
+	}
+	else
+	{
+		const __m128i alpha16 = _mm_set1_epi16(0x8000);
+		
+		dst0 = _mm_blendv_epi8(dst0, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src0, evy), alpha16), maskLo16);
+		dst1 = _mm_blendv_epi8(dst1, _mm_or_si128(this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(src1, evy), alpha16), maskHi16);
+	}
+	
+	dstLayerID = _mm_blendv_epi8(dstLayerID, _mm_set1_epi8(compInfo.renderState.selectedLayerID), passMask8);
+}
+
+template <NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
+FORCEINLINE void GPUEngineBase::_PixelUnknownEffectWithMask16_SSE2(GPUEngineCompositorInfo &compInfo,
+																   const __m128i &passMask8,
+																   const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+																   const __m128i &spriteAlpha,
+																   const __m128i &srcEffectEnableMask,
+																   const __m128i &enableColorEffectMask,
+																   __m128i &dst3, __m128i &dst2, __m128i &dst1, __m128i &dst0,
+																   __m128i &dstLayerID)
+{
+	const __m128i srcLayerID_vec128 = _mm_set1_epi8(compInfo.renderState.selectedLayerID);
+	const __m128i passMask16[2] = { _mm_unpacklo_epi8(passMask8, passMask8),
+	                                _mm_unpackhi_epi8(passMask8, passMask8) };
+	
+	const __m128i passMask32[4] = { _mm_unpacklo_epi16(passMask16[0], passMask16[0]),
+	                                _mm_unpackhi_epi16(passMask16[0], passMask16[0]),
+	                                _mm_unpacklo_epi16(passMask16[1], passMask16[1]),
+	                                _mm_unpackhi_epi16(passMask16[1], passMask16[1]) };
+	
+	__m128i dstEffectEnableMask;
+	
+#ifdef ENABLE_SSSE3
+	dstEffectEnableMask = _mm_shuffle_epi8(compInfo.renderState.dstBlendEnable_SSSE3, dstLayerID);
+	dstEffectEnableMask = _mm_xor_si128( _mm_cmpeq_epi8(dstEffectEnableMask, _mm_setzero_si128()), _mm_set1_epi32(0xFFFFFFFF) );
+#else
+	dstEffectEnableMask =                                   _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG0)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG0]);
+	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG1)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG1]) );
+	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG2)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG2]) );
+	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_BG3)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_BG3]) );
+	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_OBJ)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_OBJ]) );
+	dstEffectEnableMask = _mm_or_si128(dstEffectEnableMask, _mm_and_si128(_mm_cmpeq_epi8(dstLayerID, _mm_set1_epi8(GPULayerID_Backdrop)), compInfo.renderState.dstBlendEnable_SSE2[GPULayerID_Backdrop]) );
+#endif
+	
+	dstEffectEnableMask = _mm_andnot_si128( _mm_cmpeq_epi8(dstLayerID, srcLayerID_vec128), dstEffectEnableMask );
+	
+	// Select the color effect based on the BLDCNT target flags.
+	const __m128i colorEffect_vec128 = _mm_blendv_epi8(_mm_set1_epi8(ColorEffect_Disable), _mm_set1_epi8(compInfo.renderState.colorEffect), enableColorEffectMask);
+	const __m128i evy_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVY);
+	__m128i eva_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVA);
+	__m128i evb_vec128 = _mm_set1_epi16(compInfo.renderState.blendEVB);
+	__m128i forceBlendEffectMask = (LAYERTYPE == GPULayerType_3D) ? dstEffectEnableMask : _mm_setzero_si128();
+	
+	if (LAYERTYPE == GPULayerType_OBJ)
+	{
+		const __m128i objMode_vec128 = _mm_loadu_si128((__m128i *)(this->_sprType + compInfo.target.xNative));
+		const __m128i isObjTranslucentMask = _mm_and_si128( _mm_and_si128(enableColorEffectMask, dstEffectEnableMask), _mm_or_si128(_mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Transparent)), _mm_cmpeq_epi8(objMode_vec128, _mm_set1_epi8(OBJMode_Bitmap))) );
+		forceBlendEffectMask = isObjTranslucentMask;
+		
+		const __m128i spriteAlphaMask = _mm_andnot_si128(_mm_cmpeq_epi8(spriteAlpha, _mm_set1_epi8(0xFF)), isObjTranslucentMask);
+		eva_vec128 = _mm_blendv_epi8(eva_vec128, spriteAlpha, spriteAlphaMask);
+		evb_vec128 = _mm_blendv_epi8(evb_vec128, _mm_sub_epi8(_mm_set1_epi8(16), spriteAlpha), spriteAlphaMask);
+	}
+	
+	__m128i tmpSrc[4];
+	
+	if ( (LAYERTYPE == GPULayerType_3D) && (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) )
+	{
+		// 3D layer blending requires that all src colors are preserved as 32-bit values.
+		// Since dst2 and dst3 are currently unused for RGB555 output, we used these variables
+		// to store the converted 16-bit src colors in a previous step.
+		tmpSrc[0] = dst2;
+		tmpSrc[1] = dst3;
+	}
+	else
+	{
+		tmpSrc[0] = src0;
+		tmpSrc[1] = src1;
+		tmpSrc[2] = src2;
+		tmpSrc[3] = src3;
+	}
+	
+	switch (compInfo.renderState.colorEffect)
+	{
+		case ColorEffect_IncreaseBrightness:
+		{
+			const __m128i brightnessMask8 = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_IncreaseBrightness))) );
+			const __m128i brightnessMask16[2] = {_mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8)};
+			
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask16[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask16[1] );
+			}
+			else
+			{
+				const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
+				                                      _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
+				                                      _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
+				                                      _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) };
+				
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask32[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask32[1] );
+				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[2], evy_vec128), brightnessMask32[2] );
+				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(tmpSrc[3], evy_vec128), brightnessMask32[3] );
+			}
+			break;
+		}
+			
+		case ColorEffect_DecreaseBrightness:
+		{
+			const __m128i brightnessMask8 = _mm_andnot_si128( forceBlendEffectMask, _mm_and_si128(srcEffectEnableMask, _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_DecreaseBrightness))) );
+			const __m128i brightnessMask16[2] = {_mm_unpacklo_epi8(brightnessMask8, brightnessMask8), _mm_unpackhi_epi8(brightnessMask8, brightnessMask8)};
+			
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask16[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask16[1] );
+			}
+			else
+			{
+				const __m128i brightnessMask32[4] = { _mm_unpacklo_epi16(brightnessMask16[0], brightnessMask16[0]),
+				                                      _mm_unpackhi_epi16(brightnessMask16[0], brightnessMask16[0]),
+				                                      _mm_unpacklo_epi16(brightnessMask16[1], brightnessMask16[1]),
+				                                      _mm_unpackhi_epi16(brightnessMask16[1], brightnessMask16[1]) };
+				
+				tmpSrc[0] = _mm_blendv_epi8( tmpSrc[0], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[0], evy_vec128), brightnessMask32[0] );
+				tmpSrc[1] = _mm_blendv_epi8( tmpSrc[1], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[1], evy_vec128), brightnessMask32[1] );
+				tmpSrc[2] = _mm_blendv_epi8( tmpSrc[2], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[2], evy_vec128), brightnessMask32[2] );
+				tmpSrc[3] = _mm_blendv_epi8( tmpSrc[3], this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(tmpSrc[3], evy_vec128), brightnessMask32[3] );
+			}
+			break;
+		}
+			
+		default:
+			break;
+	}
+	
+	// Render the pixel using the selected color effect.
+	const __m128i blendMask8 = _mm_or_si128( forceBlendEffectMask, _mm_and_si128(_mm_and_si128(srcEffectEnableMask, dstEffectEnableMask), _mm_cmpeq_epi8(colorEffect_vec128, _mm_set1_epi8(ColorEffect_Blend))) );
+	const __m128i blendMask16[2] = {_mm_unpacklo_epi8(blendMask8, blendMask8), _mm_unpackhi_epi8(blendMask8, blendMask8)};
+	
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		__m128i blendSrc16[2];
+		
+		if (LAYERTYPE == GPULayerType_3D)
+		{
+			blendSrc16[0] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src0, src1, dst0);
+			blendSrc16[1] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src2, src3, dst1);
+		}
+		else
+		{
+			blendSrc16[0] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[0], dst0, eva_vec128, evb_vec128);
+			blendSrc16[1] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[1], dst1, eva_vec128, evb_vec128);
+		}
+		
+		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc16[0], blendMask16[0]);
+		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc16[1], blendMask16[1]);
+		
+		// Combine the final colors.
+		tmpSrc[0] = _mm_or_si128(tmpSrc[0], _mm_set1_epi16(0x8000));
+		tmpSrc[1] = _mm_or_si128(tmpSrc[1], _mm_set1_epi16(0x8000));
+		
+		dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask16[0]);
+		dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask16[1]);
+	}
+	else
+	{
+		__m128i blendSrc32[4];
+		
+		if (LAYERTYPE == GPULayerType_3D)
+		{
+			blendSrc32[0] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src0, src0, dst0);
+			blendSrc32[1] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src1, src1, dst1);
+			blendSrc32[2] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src2, src2, dst2);
+			blendSrc32[3] = this->_ColorEffectBlend3D<OUTPUTFORMAT>(src3, src3, dst3);
+		}
+		else
+		{
+			blendSrc32[0] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[0], dst0, eva_vec128, evb_vec128);
+			blendSrc32[1] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[1], dst1, eva_vec128, evb_vec128);
+			blendSrc32[2] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[2], dst2, eva_vec128, evb_vec128);
+			blendSrc32[3] = this->_ColorEffectBlend<OUTPUTFORMAT>(tmpSrc[3], dst3, eva_vec128, evb_vec128);
+		}
+		
+		const __m128i blendMask32[4] = { _mm_unpacklo_epi16(blendMask16[0], blendMask16[0]),
+		                                 _mm_unpackhi_epi16(blendMask16[0], blendMask16[0]),
+		                                 _mm_unpacklo_epi16(blendMask16[1], blendMask16[1]),
+		                                 _mm_unpackhi_epi16(blendMask16[1], blendMask16[1]) };
+		
+		const __m128i alphaBits = _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000);
+		
+		tmpSrc[0] = _mm_blendv_epi8(tmpSrc[0], blendSrc32[0], blendMask32[0]);
+		tmpSrc[1] = _mm_blendv_epi8(tmpSrc[1], blendSrc32[1], blendMask32[1]);
+		tmpSrc[2] = _mm_blendv_epi8(tmpSrc[2], blendSrc32[2], blendMask32[2]);
+		tmpSrc[3] = _mm_blendv_epi8(tmpSrc[3], blendSrc32[3], blendMask32[3]);
+		
+		tmpSrc[0] = _mm_or_si128(tmpSrc[0], alphaBits);
+		tmpSrc[1] = _mm_or_si128(tmpSrc[1], alphaBits);
+		tmpSrc[2] = _mm_or_si128(tmpSrc[2], alphaBits);
+		tmpSrc[3] = _mm_or_si128(tmpSrc[3], alphaBits);
+		
+		dst0 = _mm_blendv_epi8(dst0, tmpSrc[0], passMask32[0]);
+		dst1 = _mm_blendv_epi8(dst1, tmpSrc[1], passMask32[1]);
+		dst2 = _mm_blendv_epi8(dst2, tmpSrc[2], passMask32[2]);
+		dst3 = _mm_blendv_epi8(dst3, tmpSrc[3], passMask32[3]);
+	}
+	
+	dstLayerID = _mm_blendv_epi8(dstLayerID, srcLayerID_vec128, passMask8);
+}
+
+// Composites one group of 16 source pixels into the target line at the position currently
+// selected by compInfo.target.lineColor / compInfo.target.lineLayerID.
+//
+// Parameters:
+//   compInfo            - compositor state; supplies the destination color and layer ID pointers.
+//   didAllPixelsPass    - true when every one of the 16 pixels passed the caller's tests. Together
+//                         with a known COMPOSITORMODE this selects the unmasked routines, which
+//                         are called without first reading the destination.
+//   passMask8           - per-pixel 8-bit pass mask, used by the masked routines.
+//   src3..src0          - source pixel vectors, passed highest-numbered first.
+//   srcEffectEnableMask - forwarded only to the unknown-effect routine.
+//
+// The results held in dst[] and dstLayerID_vec128 are stored back to the target with aligned
+// stores: two color vectors for RGB555 output, four otherwise, plus one layer ID vector.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
+FORCEINLINE void GPUEngineBase::_PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo,
+													   const bool didAllPixelsPass,
+													   const __m128i &passMask8,
+													   const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0,
+													   const __m128i &srcEffectEnableMask)
+{
+	const bool is555and3D = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) && (LAYERTYPE == GPULayerType_3D);
+	__m128i dst[4];
+	// NOTE(review): in the unmasked branch below this is not loaded before the call; the callee
+	// is presumably expected to fill it in, since it is stored unconditionally at the end.
+	__m128i dstLayerID_vec128;
+	
+	if (is555and3D)
+	{
+		// 3D layer blending requires that all src colors are preserved as 32-bit values.
+		// Since dst2 and dst3 are currently unused for RGB555 output, we are using these variables
+		// to store the converted 16-bit src colors.
+		//
+		// Each 32-bit source pixel has bits 1-5 of its three low bytes extracted and moved to
+		// bit positions 0, 5 and 10, then two source vectors are packed into one vector of
+		// eight 16-bit colors.
+		dst[2] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src0, _mm_set1_epi32(0x003E0000)), 7)),
+		                          _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src1, _mm_set1_epi32(0x003E0000)), 7)) );
+		dst[3] = _mm_packs_epi32( _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x00003E00)), 4)), _mm_srli_epi32(_mm_and_si128(src2, _mm_set1_epi32(0x003E0000)), 7)),
+		                          _mm_or_si128(_mm_or_si128(_mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x0000003E)), 1), _mm_srli_epi32(_mm_and_si128(src3, _mm_set1_epi32(0x003E0000)), 7)) );
+	}
+	
+	// In every fixed-mode call below, the RGB555 + 3D case substitutes the converted 16-bit
+	// colors (dst[2], dst[3]) for src0/src1. Note that the same dst[2]/dst[3] objects are also
+	// passed as the dst2/dst3 output references in those calls.
+	if ((COMPOSITORMODE != GPUCompositorMode_Unknown) && didAllPixelsPass)
+	{
+		// Every pixel passed and the effect is fixed at compile time: use the unmasked routines.
+		switch (COMPOSITORMODE)
+		{
+			case GPUCompositorMode_Debug:
+				this->_PixelCopy16_SSE2<OUTPUTFORMAT, true>(compInfo,
+															src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+															dst[3], dst[2], dst[1], dst[0],
+															dstLayerID_vec128);
+				break;
+				
+			case GPUCompositorMode_Copy:
+				this->_PixelCopy16_SSE2<OUTPUTFORMAT, false>(compInfo,
+															 src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+															 dst[3], dst[2], dst[1], dst[0],
+															 dstLayerID_vec128);
+				break;
+				
+			case GPUCompositorMode_BrightUp:
+				this->_PixelBrightnessUp16_SSE2<OUTPUTFORMAT>(compInfo,
+															  src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+															  dst[3], dst[2], dst[1], dst[0],
+															  dstLayerID_vec128);
+				break;
+				
+			case GPUCompositorMode_BrightDown:
+				this->_PixelBrightnessDown16_SSE2<OUTPUTFORMAT>(compInfo,
+																src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+																dst[3], dst[2], dst[1], dst[0],
+																dstLayerID_vec128);
+				break;
+				
+			default:
+				break;
+		}
+	}
+	else
+	{
+		// Read the destination pixels into registers if we're doing a masked pixel write.
+		dst[0] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 0);
+		dst[1] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 1);
+		
+		// Only the non-RGB555 formats load (and later store) the third and fourth color vectors.
+		// For RGB555 + 3D, dst[2]/dst[3] keep the converted source colors computed above.
+		if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+		{
+			dst[2] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 2);
+			dst[3] = _mm_load_si128((__m128i *)*compInfo.target.lineColor + 3);
+		}
+		
+		dstLayerID_vec128 = _mm_load_si128((__m128i *)compInfo.target.lineLayerID);
+		
+		switch (COMPOSITORMODE)
+		{
+			case GPUCompositorMode_Debug:
+				this->_PixelCopyWithMask16_SSE2<OUTPUTFORMAT, true>(compInfo,
+																	passMask8,
+																	src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+																	dst[3], dst[2], dst[1], dst[0],
+																	dstLayerID_vec128);
+				break;
+				
+			case GPUCompositorMode_Copy:
+				this->_PixelCopyWithMask16_SSE2<OUTPUTFORMAT, false>(compInfo,
+																	 passMask8,
+																	 src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+																	 dst[3], dst[2], dst[1], dst[0],
+																	 dstLayerID_vec128);
+				break;
+				
+			case GPUCompositorMode_BrightUp:
+				this->_PixelBrightnessUpWithMask16_SSE2<OUTPUTFORMAT>(compInfo,
+																	  passMask8,
+																	  src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+																	  dst[3], dst[2], dst[1], dst[0],
+																	  dstLayerID_vec128);
+				break;
+				
+			case GPUCompositorMode_BrightDown:
+				this->_PixelBrightnessDownWithMask16_SSE2<OUTPUTFORMAT>(compInfo,
+																		passMask8,
+																		src3, src2, (!is555and3D) ? src1 : dst[3], (!is555and3D) ? src0 : dst[2],
+																		dst[3], dst[2], dst[1], dst[0],
+																		dstLayerID_vec128);
+				break;
+				
+			default:
+			{
+				// Effect not known at compile time. This path receives the original src vectors
+				// (not the 16-bit conversions) and a zeroed sprite alpha vector.
+				const __m128i spriteAlpha = _mm_setzero_si128();
+				// When no window test is performed, the color effect is enabled for every pixel.
+				const __m128i enableColorEffectMask = (WILLPERFORMWINDOWTEST) ? _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) ) : _mm_set1_epi8(0xFF);
+				
+				this->_PixelUnknownEffectWithMask16_SSE2<OUTPUTFORMAT, LAYERTYPE>(compInfo,
+																				  passMask8,
+																				  src3, src2, src1, src0,
+																				  spriteAlpha,
+																				  srcEffectEnableMask,
+																				  enableColorEffectMask,
+																				  dst[3], dst[2], dst[1], dst[0],
+																				  dstLayerID_vec128);
+				break;
+			}
+		}
+	}
+	
+	// Write the composited colors and layer IDs back to the target line.
+	_mm_store_si128((__m128i *)*compInfo.target.lineColor + 0, dst[0]);
+	_mm_store_si128((__m128i *)*compInfo.target.lineColor + 1, dst[1]);
+	
+	if (OUTPUTFORMAT != NDSColorFormat_BGR555_Rev)
+	{
+		_mm_store_si128((__m128i *)*compInfo.target.lineColor + 2, dst[2]);
+		_mm_store_si128((__m128i *)*compInfo.target.lineColor + 3, dst[3]);
+	}
+	
+	_mm_store_si128((__m128i *)compInfo.target.lineLayerID, dstLayerID_vec128);
+}
+
+#endif
+
+// Applies the OBJ mosaic effect to the single sprite-line pixel at column x.
+//
+// This is fantastically inaccurate. We do the early return even though it reduces the
+// resulting accuracy, because we need the speed and because it is inaccurate anyway.
+//
+// Parameters:
+//   compInfo  - compositor state; supplies the native line index and the OBJ mosaic tables.
+//   x         - column of the pixel to process.
+//   dst       - sprite line colors, stored little-endian; dst[x] is read and rewritten.
+//   dst_alpha - sprite line alpha values; dst_alpha[x] is read and rewritten.
+//   typeTab   - not used by this function.
+//   prioTab   - sprite line priorities; prioTab[x] is set to 0x7F when the resulting
+//               mosaic color is not opaque.
+void GPUEngineBase::_MosaicSpriteLinePixel(GPUEngineCompositorInfo &compInfo, const size_t x, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
+{
+	// Pixels whose owning sprite does not have mosaic enabled are left untouched.
+	const bool enableMosaic = (this->_oamList[this->_sprNum[x]].Mosaic != 0);
+	if (!enableMosaic)
+		return;
+	
+	// A pixel counts as opaque when its priority value is 4 or lower.
+	const bool opaque = prioTab[x] <= 4;
+
+	GPUEngineBase::MosaicColor::Obj objColor;
+	objColor.color = LE_TO_LOCAL_16(dst[x]);
+	objColor.alpha = dst_alpha[x];
+	objColor.opaque = opaque;
+
+	const size_t y = compInfo.line.indexNative;
+	
+	// Unless both the column and the line are flagged as the beginning of a mosaic cell,
+	// reuse the cached color stored at the column's truncated index.
+	if (!compInfo.renderState.mosaicWidthOBJ[x].begin || !compInfo.renderState.mosaicHeightOBJ[y].begin)
+	{
+		objColor = this->_mosaicColors.obj[compInfo.renderState.mosaicWidthOBJ[x].trunc];
+	}
+	
+	// Cache the resulting color for later pixels and lines.
+	this->_mosaicColors.obj[x] = objColor;
+	
+	// Convert from host order back to the buffer's little-endian order for the store.
+	dst[x] = LOCAL_TO_LE_16(objColor.color);
+	dst_alpha[x] = objColor.alpha;
+	if (!objColor.opaque) prioTab[x] = 0x7F;
+}
+
+// Runs the per-pixel OBJ mosaic pass across one native-width sprite line.
+// Nothing is done unless compInfo.renderState.isOBJMosaicSet is set.
+void GPUEngineBase::_MosaicSpriteLine(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
+{
+	if (compInfo.renderState.isOBJMosaicSet)
+	{
+		size_t pixelX = 0;
+		
+		while (pixelX < GPU_FRAMEBUFFER_NATIVE_WIDTH)
+		{
+			this->_MosaicSpriteLinePixel(compInfo, pixelX, dst, dst_alpha, typeTab, prioTab);
+			pixelX++;
+		}
+	}
+}
+
+// Walks one line of a background layer, stepping the source position by (BGnPA, BGnPC) per
+// output pixel starting from (BGnX, BGnY), and looks each pixel up through GetPixelFunc.
+//
+// Every pixel that is looked up is either stored in the deferred native buffers
+// (WILLDEFERCOMPOSITING) or composited immediately. With WRAP, source coordinates are masked
+// with (size - 1); without WRAP, out-of-range coordinates are skipped.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc, bool WRAP>
+void GPUEngineBase::_RenderPixelIterate_Final(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal)
+{
+	const s32 layerWidth = compInfo.renderState.selectedBGLayer->size.width;
+	const s32 layerHeight = compInfo.renderState.selectedBGLayer->size.height;
+	const s32 widthMask = layerWidth - 1;
+	const s32 heightMask = layerHeight - 1;
+	
+	// Debug mode renders the layer's full width; every other mode renders one native line.
+	const u16 lineWidth = (COMPOSITORMODE == GPUCompositorMode_Debug) ? compInfo.renderState.selectedBGLayer->size.width : GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	
+	// Per-pixel increments of the source X and Y coordinates.
+	const s16 dx = (s16)LOCAL_TO_LE_16(param.BGnPA.value);
+	const s16 dy = (s16)LOCAL_TO_LE_16(param.BGnPC.value);
+	
+	IOREG_BGnX x = param.BGnX;
+	IOREG_BGnY y = param.BGnY;
+	
+#ifdef MSB_FIRST
+	// This only seems to work in the unrotated/unscaled case. I'm not too sure
+	// about how these bits should really be arranged on big-endian, but at
+	// least this arrangement fixes a bunch of games that use affine or extended
+	// layers, just as long as they don't perform any rotation/scaling.
+	// - rogerman, 2016-07-05
+	x.value = ((x.value & 0x00FFFFFF) << 8) | ((x.value & 0xFF000000) >> 24);
+	y.value = ((y.value & 0x00FFFFFF) << 8) | ((y.value & 0xFF000000) >> 24);
+#endif
+	
+	u8 index;
+	u16 srcColor;
+	
+	// Fast path for the fairly common "unrotated + unscaled" case. It is only taken when no
+	// per-pixel boundary check is needed: either the layer wraps, or the whole line fits
+	// inside the layer.
+	const bool isUnrotatedUnscaled = (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH) && (dy == 0);
+	
+	if (isUnrotatedUnscaled)
+	{
+		const s32 rowY = (WRAP) ? (y.Integer & heightMask) : y.Integer;
+		s32 colX = (WRAP) ? (x.Integer & widthMask) : x.Integer;
+		
+		const bool isWholeLineInBounds = WRAP || ((colX >= 0) && (colX + lineWidth <= layerWidth) && (rowY >= 0) && (rowY < layerHeight));
+		
+		if (isWholeLineInBounds)
+		{
+			for (size_t i = 0; i < lineWidth; i++)
+			{
+				GetPixelFunc(colX, rowY, layerWidth, map, tile, pal, index, srcColor);
+				
+				if (!WILLDEFERCOMPOSITING)
+				{
+					this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, i, srcColor, (index != 0));
+				}
+				else
+				{
+					this->_deferredIndexNative[i] = index;
+					this->_deferredColorNative[i] = srcColor;
+				}
+				
+				// Step one source pixel to the right, wrapping around if requested.
+				colX++;
+				
+				if (WRAP)
+				{
+					colX &= widthMask;
+				}
+			}
+			
+			return;
+		}
+	}
+	
+	// General path: advance the source position by (dx, dy) for every output pixel and
+	// bounds-check each lookup when the layer does not wrap.
+	for (size_t i = 0; i < lineWidth; i++, x.value+=dx, y.value+=dy)
+	{
+		const s32 srcX = (WRAP) ? (x.Integer & widthMask) : x.Integer;
+		const s32 srcY = (WRAP) ? (y.Integer & heightMask) : y.Integer;
+		
+		// Out-of-range pixels are skipped entirely; nothing is written for them.
+		if ( !WRAP && ((srcX < 0) || (srcX >= layerWidth) || (srcY < 0) || (srcY >= layerHeight)) )
+		{
+			continue;
+		}
+		
+		GetPixelFunc(srcX, srcY, layerWidth, map, tile, pal, index, srcColor);
+		
+		if (!WILLDEFERCOMPOSITING)
+		{
+			this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, i, srcColor, (index != 0));
+		}
+		else
+		{
+			this->_deferredIndexNative[i] = index;
+			this->_deferredColorNative[i] = srcColor;
+		}
+	}
+}
+
+// Forwarding layer: hands the call to _RenderPixelIterate_Final with the same template
+// arguments (including WRAP) and the same function arguments.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc, bool WRAP>
+void GPUEngineBase::_RenderPixelIterate_ApplyWrap(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal)
+{
+	this->_RenderPixelIterate_Final<COMPOSITORMODE,
+	                                OUTPUTFORMAT,
+	                                MOSAIC,
+	                                WILLPERFORMWINDOWTEST,
+	                                WILLDEFERCOMPOSITING,
+	                                GetPixelFunc,
+	                                WRAP>(compInfo, param, map, tile, pal);
+}
+
+// Turns the selected layer's runtime isDisplayWrapped flag into the compile-time WRAP
+// template argument and dispatches to the matching _RenderPixelIterate_ApplyWrap instance.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING, PixelLookupFunc GetPixelFunc>
+void GPUEngineBase::_RenderPixelIterate(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, const u32 map, const u32 tile, const u16 *__restrict pal)
+{
+	if (!compInfo.renderState.selectedBGLayer->isDisplayWrapped)
+	{
+		this->_RenderPixelIterate_ApplyWrap<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, GetPixelFunc, false>(compInfo, param, map, tile, pal);
+		return;
+	}
+	
+	this->_RenderPixelIterate_ApplyWrap<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, GetPixelFunc, true>(compInfo, param, map, tile, pal);
+}
+
+// Fetches the 16-bit tile map entry covering horizontal pixel offset xOffset.
+//
+// Parameters:
+//   tileMapAddress - address of the start of the tile map row being read.
+//   xOffset        - horizontal pixel offset; it is masked with layerWidthMask first.
+//   layerWidthMask - mask applied to xOffset before it is reduced to a tile column.
+//
+// Returns the entry with its raw value converted from little-endian to host byte order.
+TILEENTRY GPUEngineBase::_GetTileEntry(const u32 tileMapAddress, const u16 xOffset, const u16 layerWidthMask)
+{
+	TILEENTRY theTileEntry;
+	
+	// Tile column: the masked pixel offset divided by 8.
+	const u16 tmp = (xOffset & layerWidthMask) >> 3;
+	
+	// Each entry is 2 bytes, and the low 5 bits of the column select the entry within a
+	// 32-entry row.
+	u32 mapinfo = tileMapAddress + (tmp & 0x1F) * 2;
+	
+	// Columns past 31 are read from 32*32 entries (32*32*2 bytes) further on.
+	if (tmp > 31) mapinfo += 32*32*2;
+	
+	// This is a read from little-endian data into a host value, so the LE -> local
+	// conversion is the one that applies.
+	theTileEntry.val = LE_TO_LOCAL_16( *(u16 *)MMU_gpu_map(mapinfo) );
+	
+	return theTileEntry;
+}
+
+// Composites a single background pixel at native column srcX straight into the native
+// target line.
+//
+// With MOSAIC, the color is first run through the BG mosaic cache; the cache is updated even
+// if the pixel is subsequently rejected. The pixel is then dropped if it fails the window test
+// (when WILLPERFORMWINDOWTEST) or if it is transparent.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
+FORCEINLINE void GPUEngineBase::_CompositePixelImmediate(GPUEngineCompositorInfo &compInfo, const size_t srcX, u16 srcColor16, const bool opaque)
+{
+	bool isDrawable = opaque;
+	
+	if (MOSAIC)
+	{
+		//due to this early out, we will get incorrect behavior in cases where
+		//we enable mosaic in the middle of a frame. this is deemed unlikely.
+		
+		u16 *mosaicLine = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID];
+		
+		// 0xFFFF is the marker for a transparent pixel; opaque colors have bit 15 cleared.
+		srcColor16 = (opaque) ? (srcColor16 & 0x7FFF) : 0xFFFF;
+		
+		// Only a pixel that begins a mosaic cell both horizontally and vertically keeps its
+		// own color; every other pixel takes the cached color at the truncated column.
+		const bool isCellOrigin = compInfo.renderState.mosaicWidthBG[srcX].begin && compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin;
+		if (!isCellOrigin)
+		{
+			srcColor16 = mosaicLine[compInfo.renderState.mosaicWidthBG[srcX].trunc];
+		}
+		
+		mosaicLine[srcX] = srcColor16;
+		isDrawable = (srcColor16 != 0xFFFF);
+	}
+	
+	const bool isWindowRejected = WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0);
+	if (isWindowRejected || !isDrawable)
+	{
+		return;
+	}
+	
+	// Point the compositor target at this pixel.
+	compInfo.target.xNative = srcX;
+	compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
+	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHeadNative + srcX;
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative + srcX;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative + srcX;
+	
+	// Without a window test, the color effect is always enabled.
+	bool enableColorEffect = true;
+	if (WILLPERFORMWINDOWTEST)
+	{
+		enableColorEffect = (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0);
+	}
+	
+	this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, srcColor16, 0, enableColorEffect);
+}
+
+// Composites the pixels previously collected in the deferred native buffers
+// (_deferredIndexNative / _deferredColorNative) into the custom-sized target line(s).
+//
+// Steps:
+//   1. With MOSAIC, apply the BG mosaic effect in place to the deferred native buffers.
+//   2. Expand the native deferred buffers into the custom-width deferred buffers.
+//   3. For each of compInfo.line.renderCount output lines, composite every pixel whose
+//      deferred index is non-zero and which passes the window test (when
+//      WILLPERFORMWINDOWTEST). SSE2 builds handle 16 pixels at a time, followed by a
+//      scalar loop for the remainder.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
+void GPUEngineBase::_CompositeLineDeferred(GPUEngineCompositorInfo &compInfo)
+{
+	if (MOSAIC)
+	{
+#ifdef ENABLE_SSE2
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x+=8)
+		{
+			const __m128i index_vec128 = _mm_loadl_epi64((__m128i *)(this->_deferredIndexNative + x));
+			const __m128i col_vec128 = _mm_load_si128((__m128i *)(this->_deferredColorNative + x));
+			
+			// Pixels with index 0 become the transparent marker 0xFFFF; all others keep their
+			// color with bit 15 cleared.
+			const __m128i idxMask = _mm_cmpeq_epi16(_mm_unpacklo_epi8(index_vec128, _mm_setzero_si128()), _mm_setzero_si128());
+			const __m128i tmpColor_vec128 = _mm_blendv_epi8(_mm_and_si128(col_vec128, _mm_set1_epi16(0x7FFF)), _mm_set1_epi16(0xFFFF), idxMask);
+			
+			// A lane is set when the pixel must take the cached mosaic color. The width test looks
+			// at the low byte of each 16-bit mosaicWidthBG entry (presumably its `begin` field,
+			// which is what the scalar path tests - confirm against the struct layout).
+			const __m128i mosaicWidthMask = _mm_cmpeq_epi16( _mm_and_si128(_mm_set1_epi16(0x00FF), _mm_loadu_si128((__m128i *)(compInfo.renderState.mosaicWidthBG + x))), _mm_setzero_si128() );
+			const __m128i mosaicHeightMask = _mm_cmpeq_epi16(_mm_set1_epi16(compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin), _mm_setzero_si128());
+			const __m128i mosaicMask = _mm_or_si128(mosaicWidthMask, mosaicHeightMask);
+			
+			// The cache is updated one element at a time and in order: each lookup goes through
+			// the `trunc` index, which may refer to an element written earlier in this same pass.
+			u16 *mosaicColorBG = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID];
+			mosaicColorBG[x+0] = (_mm_extract_epi16(mosaicMask, 0) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+0].trunc] : _mm_extract_epi16(tmpColor_vec128, 0);
+			mosaicColorBG[x+1] = (_mm_extract_epi16(mosaicMask, 1) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+1].trunc] : _mm_extract_epi16(tmpColor_vec128, 1);
+			mosaicColorBG[x+2] = (_mm_extract_epi16(mosaicMask, 2) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+2].trunc] : _mm_extract_epi16(tmpColor_vec128, 2);
+			mosaicColorBG[x+3] = (_mm_extract_epi16(mosaicMask, 3) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+3].trunc] : _mm_extract_epi16(tmpColor_vec128, 3);
+			mosaicColorBG[x+4] = (_mm_extract_epi16(mosaicMask, 4) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+4].trunc] : _mm_extract_epi16(tmpColor_vec128, 4);
+			mosaicColorBG[x+5] = (_mm_extract_epi16(mosaicMask, 5) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+5].trunc] : _mm_extract_epi16(tmpColor_vec128, 5);
+			mosaicColorBG[x+6] = (_mm_extract_epi16(mosaicMask, 6) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+6].trunc] : _mm_extract_epi16(tmpColor_vec128, 6);
+			mosaicColorBG[x+7] = (_mm_extract_epi16(mosaicMask, 7) != 0) ? mosaicColorBG[compInfo.renderState.mosaicWidthBG[x+7].trunc] : _mm_extract_epi16(tmpColor_vec128, 7);
+			
+			// Write the result back: transparent mosaic pixels get their index cleared and keep
+			// their original color; all others keep their index and take the mosaic color.
+			const __m128i mosaicColor_vec128 = _mm_loadu_si128((__m128i *)(mosaicColorBG + x));
+			const __m128i mosaicColorMask = _mm_cmpeq_epi16(mosaicColor_vec128, _mm_set1_epi16(0xFFFF));
+			_mm_storel_epi64( (__m128i *)(this->_deferredIndexNative + x), _mm_andnot_si128(_mm_packs_epi16(mosaicColorMask, _mm_setzero_si128()), index_vec128) );
+			_mm_store_si128( (__m128i *)(this->_deferredColorNative + x), _mm_blendv_epi8(mosaicColor_vec128, col_vec128, mosaicColorMask) );
+		}
+#else
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+		{
+			// 0xFFFF marks a transparent pixel; opaque colors have bit 15 cleared.
+			u16 tmpColor = (this->_deferredIndexNative[x] == 0) ? 0xFFFF : this->_deferredColorNative[x] & 0x7FFF;
+			
+			// Pixels that do not begin a mosaic cell take the cached color instead.
+			if (!compInfo.renderState.mosaicWidthBG[x].begin || !compInfo.renderState.mosaicHeightBG[compInfo.line.indexNative].begin)
+			{
+				tmpColor = this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][compInfo.renderState.mosaicWidthBG[x].trunc];
+			}
+			
+			this->_mosaicColors.bg[compInfo.renderState.selectedLayerID][x] = tmpColor;
+			
+			if (tmpColor == 0xFFFF)
+			{
+				this->_deferredIndexNative[x] = 0;
+			}
+			else
+			{
+				this->_deferredColorNative[x] = tmpColor;
+			}
+		}
+#endif
+	}
+	
+	// Expand the native-width deferred buffers into the custom-width ones.
+	CopyLineExpand<0xFFFF, false, 2>(this->_deferredColorCustom, this->_deferredColorNative, compInfo.line.widthCustom);
+	CopyLineExpand<0xFFFF, false, 1>(this->_deferredIndexCustom, this->_deferredIndexNative, compInfo.line.widthCustom);
+	
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead;
+	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
+	
+#ifdef ENABLE_SSE2
+	// Number of pixels per line that can be processed in whole groups of 16.
+	const size_t ssePixCount = (compInfo.line.widthCustom - (compInfo.line.widthCustom % 16));
+	const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID];
+#endif
+	
+	// The same expanded source line is composited into each of the renderCount output lines.
+	// The target pointers keep advancing across lines; only the x positions are reset.
+	for (size_t l = 0; l < compInfo.line.renderCount; l++)
+	{
+		compInfo.target.xNative = 0;
+		compInfo.target.xCustom = 0;
+		
+#ifdef ENABLE_SSE2
+		for (; compInfo.target.xCustom < ssePixCount; compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16)
+		{
+			__m128i passMask8;
+			
+			if (WILLPERFORMWINDOWTEST)
+			{
+				// Do the window test.
+				passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) );
+			}
+			else
+			{
+				passMask8 = _mm_set1_epi8(0xFF);
+			}
+			
+			// Do the index test. Pixels with an index value of 0 are rejected.
+			passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8(_mm_load_si128((__m128i *)(this->_deferredIndexCustom + compInfo.target.xCustom)), _mm_setzero_si128()), passMask8);
+			
+			const int passMaskValue = _mm_movemask_epi8(passMask8);
+			
+			// If none of the pixels within the vector pass, then reject them all at once.
+			if (passMaskValue == 0)
+			{
+				continue;
+			}
+			
+			__m128i src[4];
+			
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				// RGB555 output uses the 16-bit colors directly; only src[0] and src[1] are set.
+				src[0] = _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 0));
+				src[1] = _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 8));
+			}
+			else
+			{
+				// The other output formats convert the 16 colors into four 32-bit vectors.
+				const __m128i src16[2] = { _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 0)),
+										   _mm_load_si128((__m128i *)(this->_deferredColorCustom + compInfo.target.xCustom + 8)) };
+				
+				if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+				{
+					ColorspaceConvert555To6665Opaque_SSE2<false>(src16[0], src[0], src[1]);
+					ColorspaceConvert555To6665Opaque_SSE2<false>(src16[1], src[2], src[3]);
+				}
+				else
+				{
+					ColorspaceConvert555To8888Opaque_SSE2<false>(src16[0], src[0], src[1]);
+					ColorspaceConvert555To8888Opaque_SSE2<false>(src16[1], src[2], src[3]);
+				}
+			}
+			
+			// Write out the pixels.
+			const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
+			this->_PixelComposite16_SSE2<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG, WILLPERFORMWINDOWTEST>(compInfo,
+																											   didAllPixelsPass,
+																											   passMask8,
+																											   src[3], src[2], src[1], src[0],
+																											   srcEffectEnableMask);
+		}
+#endif
+		
+		// Scalar loop: handles the whole line in non-SSE2 builds, and the pixels left over after
+		// the last full group of 16 in SSE2 builds.
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+		for (; compInfo.target.xCustom < compInfo.line.widthCustom; compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
+		{
+			// This loop consults the native-width window test table through xNative.
+			if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] == 0) )
+			{
+				continue;
+			}
+			
+			// Pixels with an index value of 0 are rejected.
+			if (this->_deferredIndexCustom[compInfo.target.xCustom] == 0)
+			{
+				continue;
+			}
+			
+			const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+			this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, this->_deferredColorCustom[compInfo.target.xCustom], 0, enableColorEffect);
+		}
+	}
+}
+
+// Composites compInfo.line.pixelCount pixels read directly from the custom VRAM block that
+// corresponds to the selected BG layer's BMPAddress.
+//
+// A source pixel is treated as drawable when bit 15 of its 16-bit color is set (RGB555 and
+// RGB666 output, which both read 16-bit source colors) or when the top byte of its 32-bit
+// color is non-zero (RGB888 output). With WILLPERFORMWINDOWTEST the pixel must also pass the
+// window test. SSE2 builds process 16 pixels at a time, followed by a scalar loop for the
+// remainder.
+//
+// NOTE(review): unlike _CompositeLineDeferred, xCustom is never reset per output line here; it
+// counts up to pixelCount. If pixelCount can exceed one custom line, the xCustom-indexed
+// window-test and color-effect loads would run past a single line's worth of data - confirm
+// the sizes of those buffers.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
+void GPUEngineBase::_CompositeVRAMLineDeferred(GPUEngineCompositorInfo &compInfo)
+{
+	const void *__restrict vramColorPtr = GPU->GetCustomVRAMAddressUsingMappedAddress<OUTPUTFORMAT>(compInfo.renderState.selectedBGLayer->BMPAddress, compInfo.line.blockOffsetCustom);
+	
+	compInfo.target.xNative = 0;
+	compInfo.target.xCustom = 0;
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead;
+	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
+	
+	// Index of the current pixel within the VRAM source; shared by the SSE2 and scalar loops.
+	size_t i = 0;
+	
+#ifdef ENABLE_SSE2
+	const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID];
+	
+	// Number of pixels that can be processed in whole groups of 16.
+	const size_t ssePixCount = (compInfo.line.pixelCount - (compInfo.line.pixelCount % 16));
+	for (; i < ssePixCount; i+=16, compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16)
+	{
+		__m128i src[4];
+		__m128i passMask8;
+		
+		// Load the source colors and build the per-pixel opacity mask.
+		// There is no default case: src and passMask8 are only assigned for the three formats
+		// listed here.
+		switch (OUTPUTFORMAT)
+		{
+			case NDSColorFormat_BGR555_Rev:
+			{
+				// 16-bit colors are used as-is; bit 15 of each color decides whether it passes.
+				const __m128i src16[2] = { _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 0)), _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 8)) };
+				src[0] = src16[0];
+				src[1] = src16[1];
+				passMask8 = _mm_packus_epi16( _mm_srli_epi16(src16[0], 15), _mm_srli_epi16(src16[1], 15) );
+				passMask8 = _mm_cmpeq_epi8(passMask8, _mm_set1_epi8(1));
+				break;
+			}
+				
+			case NDSColorFormat_BGR666_Rev:
+			{
+				// 16-bit colors are converted to four 32-bit vectors; the pass test still uses
+				// bit 15 of the original 16-bit colors.
+				const __m128i src16[2] = { _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 0)), _mm_load_si128((__m128i *)((u16 *)vramColorPtr + i + 8)) };
+				ColorspaceConvert555To6665Opaque_SSE2<false>(src16[0], src[0], src[1]);
+				ColorspaceConvert555To6665Opaque_SSE2<false>(src16[1], src[2], src[3]);
+				passMask8 = _mm_packus_epi16( _mm_srli_epi16(src16[0], 15), _mm_srli_epi16(src16[1], 15) );
+				passMask8 = _mm_cmpeq_epi8(passMask8, _mm_set1_epi8(1));
+				break;
+			}
+				
+			case NDSColorFormat_BGR888_Rev:
+				// 32-bit colors are used as-is; a pixel passes when its top byte is non-zero.
+				src[0] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 0));
+				src[1] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 4));
+				src[2] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 8));
+				src[3] = _mm_load_si128((__m128i *)((FragmentColor *)vramColorPtr + i + 12));
+				passMask8 = _mm_packus_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)), _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) );
+				passMask8 = _mm_cmpeq_epi8(passMask8, _mm_setzero_si128());
+				passMask8 = _mm_xor_si128(passMask8, _mm_set1_epi32(0xFFFFFFFF));
+				break;
+		}
+		
+		if (WILLPERFORMWINDOWTEST)
+		{
+			// Do the window test.
+			passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_setzero_si128()), passMask8);
+		}
+		
+		const int passMaskValue = _mm_movemask_epi8(passMask8);
+		
+		// If none of the pixels within the vector pass, then reject them all at once.
+		if (passMaskValue == 0)
+		{
+			continue;
+		}
+		
+		// Write out the pixels.
+		const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
+		this->_PixelComposite16_SSE2<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG, WILLPERFORMWINDOWTEST>(compInfo,
+																										   didAllPixelsPass,
+																										   passMask8,
+																										   src[3], src[2], src[1], src[0],
+																										   srcEffectEnableMask);
+	}
+#endif
+	
+	// Scalar loop: handles every pixel in non-SSE2 builds, and the pixels left over after the
+	// last full group of 16 in SSE2 builds.
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+	for (; i < compInfo.line.pixelCount; i++, compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
+	{
+		// This loop consults the native-width window test table through xNative.
+		if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] == 0) )
+		{
+			continue;
+		}
+		
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			// 32-bit source: skip pixels whose top byte is zero.
+			if ((((u32 *)vramColorPtr)[i] & 0xFF000000) == 0)
+			{
+				continue;
+			}
+			
+			const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+			this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, ((u32 *)vramColorPtr)[i], 0, enableColorEffect);
+		}
+		else
+		{
+			// 16-bit source: skip pixels whose bit 15 is clear.
+			if ((((u16 *)vramColorPtr)[i] & 0x8000) == 0)
+			{
+				continue;
+			}
+			
+			const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+			this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_BG>(compInfo, ((u16 *)vramColorPtr)[i], 0, enableColorEffect);
+		}
+	}
+}
+
+/*****************************************************************************/
+//			BACKGROUND RENDERING -TEXT-
+/*****************************************************************************/
+// Renders one scanline of a tiled "text" BG layer starting at BG-space coordinates (XBG, YBG). Each pixel is either composited immediately or, when WILLDEFERCOMPOSITING is set, only recorded in _deferredIndexNative/_deferredColorNative.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineBase::_RenderLine_BGText(GPUEngineCompositorInfo &compInfo, const u16 XBG, const u16 YBG)
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	const u16 lineWidth = (COMPOSITORMODE == GPUCompositorMode_Debug) ? compInfo.renderState.selectedBGLayer->size.width : GPU_FRAMEBUFFER_NATIVE_WIDTH; // debug renders span the layer's full width
+	const u16 lg    = compInfo.renderState.selectedBGLayer->size.width;
+	const u16 ht    = compInfo.renderState.selectedBGLayer->size.height;
+	const u32 tile  = compInfo.renderState.selectedBGLayer->tileEntryAddress;
+	const u16 wmask = lg - 1; // wrap masks; NOTE(review): assumes the layer dimensions are powers of two - confirm
+	const u16 hmask = ht - 1;
+	// pixCountLo is the number of pixels left in the first, possibly partially scrolled-in, 8-pixel tile.
+	const size_t pixCountLo = 8 - (XBG & 0x0007);
+	size_t x = 0;
+	size_t xoff = XBG;
+	// Locate the tile map row for this scanline: tmp is the tile row index, and consecutive rows (within the first 32) are 64 bytes apart.
+	const u16 tmp = (YBG & hmask) >> 3;
+	u32 map = compInfo.renderState.selectedBGLayer->tileMapAddress + (tmp & 31) * 64;
+	if (tmp > 31) // rows past the first 32 are offset further into the map, scaled by the ScreenSize setting
+		map += ADDRESS_STEP_512B << compInfo.renderState.selectedBGLayer->BGnCNT.ScreenSize;
+	
+	if (compInfo.renderState.selectedBGLayer->BGnCNT.PaletteMode == PaletteMode_16x16) // color: 16 palette entries
+	{
+		const u16 *__restrict pal = this->_paletteBG;
+		const u16 yoff = (YBG & 0x0007) << 2; // byte offset of the row inside the tile (4 bytes per row: two pixels per byte)
+		u8 index;
+		u16 color;
+		// xfin marks the end of the current tile's span: the first (partial) tile, then 8 pixels at a time, clamped to lineWidth.
+		for (size_t xfin = pixCountLo; x < lineWidth; xfin = std::min<u16>(x+8, lineWidth))
+		{
+			const TILEENTRY tileEntry = this->_GetTileEntry(map, xoff, wmask);
+			const u16 tilePalette = tileEntry.bits.Palette * 16;
+			u8 *__restrict tileColorIdx = (u8 *)MMU_gpu_map(tile + (tileEntry.bits.TileNum * 0x20) + ((tileEntry.bits.VFlip) ? (7*4)-yoff : yoff));
+			// VFlip selects the mirrored row above. With HFlip, bytes are walked backward and the nibble order within each byte is swapped.
+			if (tileEntry.bits.HFlip)
+			{
+				tileColorIdx += 3 - ((xoff & 0x0007) >> 1);
+				// An odd starting xoff begins in the middle of a byte, so that single pixel is emitted first. (index != 0) is the last argument to _CompositePixelImmediate - presumably an opacity flag, confirm.
+				if (xoff & 1)
+				{
+					if (WILLDEFERCOMPOSITING)
+					{
+						this->_deferredIndexNative[x] = *tileColorIdx & 0x0F;
+						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
+					}
+					else
+					{
+						index = *tileColorIdx & 0x0F;
+						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
+						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+					}
+					
+					x++;
+					xoff++;
+					tileColorIdx--;
+				}
+				// Remaining pixels: two per byte (high nibble first when mirrored); the second is skipped if the span ends mid-byte.
+				for (; x < xfin; tileColorIdx--)
+				{
+					if (WILLDEFERCOMPOSITING)
+					{
+						this->_deferredIndexNative[x] = *tileColorIdx >> 4;
+						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
+					}
+					else
+					{
+						index = *tileColorIdx >> 4;
+						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
+						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+					}
+					
+					x++;
+					xoff++;
+					
+					if (x < xfin)
+					{
+						if (WILLDEFERCOMPOSITING)
+						{
+							this->_deferredIndexNative[x] = *tileColorIdx & 0x0F;
+							this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
+						}
+						else
+						{
+							index = *tileColorIdx & 0x0F;
+							color = LE_TO_LOCAL_16(pal[index + tilePalette]);
+							this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+						}
+						
+						x++;
+						xoff++;
+					}
+				}
+			}
+			else
+			{
+				tileColorIdx += ((xoff & 0x0007) >> 1);
+				// Unmirrored order: the low nibble of each byte is the even pixel and the high nibble is the odd pixel.
+				if (xoff & 1)
+				{
+					if (WILLDEFERCOMPOSITING)
+					{
+						this->_deferredIndexNative[x] = *tileColorIdx >> 4;
+						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
+					}
+					else
+					{
+						index = *tileColorIdx >> 4;
+						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
+						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+					}
+					
+					x++;
+					xoff++;
+					tileColorIdx++;
+				}
+				
+				for (; x < xfin; tileColorIdx++)
+				{
+					if (WILLDEFERCOMPOSITING)
+					{
+						this->_deferredIndexNative[x] = *tileColorIdx & 0x0F;
+						this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
+					}
+					else
+					{
+						index = *tileColorIdx & 0x0F;
+						color = LE_TO_LOCAL_16(pal[index + tilePalette]);
+						this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+					}
+					
+					x++;
+					xoff++;
+					
+					if (x < xfin)
+					{
+						if (WILLDEFERCOMPOSITING)
+						{
+							this->_deferredIndexNative[x] = *tileColorIdx >> 4;
+							this->_deferredColorNative[x] = LE_TO_LOCAL_16(pal[this->_deferredIndexNative[x] + tilePalette]);
+						}
+						else
+						{
+							index = *tileColorIdx >> 4;
+							color = LE_TO_LOCAL_16(pal[index + tilePalette]);
+							this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+						}
+						
+						x++;
+						xoff++;
+					}
+				}
+			}
+		}
+	}
+	else //256-color BG
+	{
+		const u16 *__restrict pal = (DISPCNT.ExBGxPalette_Enable) ? *(compInfo.renderState.selectedBGLayer->extPalette) : this->_paletteBG;
+		const u32 extPalMask = -DISPCNT.ExBGxPalette_Enable; // zero when extended palettes are disabled, which masks out the per-tile palette offset below
+		const u16 yoff = (YBG & 0x0007) << 3; // byte offset of the row inside the tile (8 bytes per row: one pixel per byte)
+		size_t line_dir; // per-pixel step; -1 is stored as an unsigned value and relies on wraparound in the pointer addition
+		
+		for (size_t xfin = pixCountLo; x < lineWidth; xfin = std::min<u16>(x+8, lineWidth))
+		{
+			const TILEENTRY tileEntry = this->_GetTileEntry(map, xoff, wmask);
+			const u16 *__restrict tilePal = (u16 *)((u8 *)pal + ((tileEntry.bits.Palette<<9) & extPalMask));
+			const u8 *__restrict tileColorIdx = (u8 *)MMU_gpu_map(tile + (tileEntry.bits.TileNum * 0x40) + ((tileEntry.bits.VFlip) ? (7*8)-yoff : yoff));
+			// HFlip starts at the mirrored column of the tile row and steps backward.
+			if (tileEntry.bits.HFlip)
+			{
+				tileColorIdx += (7 - (xoff & 0x0007));
+				line_dir = -1;
+			}
+			else
+			{
+				tileColorIdx += (xoff & 0x0007);
+				line_dir = 1;
+			}
+			
+			for (; x < xfin; x++, xoff++, tileColorIdx += line_dir)
+			{
+				if (WILLDEFERCOMPOSITING)
+				{
+					this->_deferredIndexNative[x] = *tileColorIdx;
+					this->_deferredColorNative[x] = LE_TO_LOCAL_16(tilePal[this->_deferredIndexNative[x]]);
+				}
+				else
+				{
+					const u8 index = *tileColorIdx;
+					const u16 color = LE_TO_LOCAL_16(tilePal[index]);
+					this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, color, (index != 0));
+				}
+			}
+		}
+	}
+}
+
+// Renders one scanline of an affine BG layer by handing the selected layer's tile map
+// address, tile data address and the standard BG palette to _RenderPixelIterate, using
+// the rot_tiled_8bit_entry pixel-fetch routine.
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineBase::_RenderLine_BGAffine(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param)
+{
+	const u32 mapAddress = compInfo.renderState.selectedBGLayer->tileMapAddress;
+	const u32 tileAddress = compInfo.renderState.selectedBGLayer->tileEntryAddress;
+	
+	this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_tiled_8bit_entry>(compInfo, param, mapAddress, tileAddress, this->_paletteBG);
+}
+
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineBase::_RenderLine_BGExtended(GPUEngineCompositorInfo &compInfo, const IOREG_BGnParameter &param, bool &outUseCustomVRAM)
+{ // Renders one scanline of an extended affine BG layer by dispatching on the layer type; outUseCustomVRAM is only written in the direct-color case.
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	// Each case forwards to _RenderPixelIterate with the pixel-fetch routine and palette matching the layer's storage format.
+	switch (compInfo.renderState.selectedBGLayer->type)
+	{
+		case BGType_AffineExt_256x16: // 16  bit bgmap entries
+		{
+			if (DISPCNT.ExBGxPalette_Enable)
+			{
+				this->_RenderPixelIterate< COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_tiled_16bit_entry<true> >(compInfo, param, compInfo.renderState.selectedBGLayer->tileMapAddress, compInfo.renderState.selectedBGLayer->tileEntryAddress, *(compInfo.renderState.selectedBGLayer->extPalette));
+			}
+			else
+			{
+				this->_RenderPixelIterate< COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_tiled_16bit_entry<false> >(compInfo, param, compInfo.renderState.selectedBGLayer->tileMapAddress, compInfo.renderState.selectedBGLayer->tileEntryAddress, this->_paletteBG);
+			}
+			break;
+		}
+			
+		case BGType_AffineExt_256x1: // 256 colors
+			this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_256_map>(compInfo, param, compInfo.renderState.selectedBGLayer->BMPAddress, 0, this->_paletteBG);
+			break;
+			
+		case BGType_AffineExt_Direct: // direct colors / BMP
+		{
+			outUseCustomVRAM = false;
+			// Without mosaic and with identity affine parameters for this line, the per-pixel render may be skipped (presumably because the line already exists as a non-native capture - confirm).
+			if (!MOSAIC)
+			{
+				const bool isRotationScaled = ( (param.BGnPA.value != 0x100) ||
+				                                (param.BGnPC.value !=     0) ||
+				                                (param.BGnX.value  !=     0) ||
+				                                (param.BGnY.value  != (0x100 * compInfo.line.indexNative)) );
+				if (!isRotationScaled)
+				{
+					const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(compInfo.renderState.selectedBGLayer->BMPAddress) - MMU.ARM9_LCD) / sizeof(u16);
+					// vramPixel is the bitmap's offset from MMU.ARM9_LCD in 16-bit pixels; only the first four VRAM blocks are eligible.
+					if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4))
+					{
+						const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
+						const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
+						const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+						
+						GPU->GetEngineMain()->VerifyVRAMLineDidChange(blockID, compInfo.line.indexNative + blockLine);
+						outUseCustomVRAM = !GPU->GetEngineMain()->isLineCaptureNative[blockID][compInfo.line.indexNative + blockLine];
+					}
+				}
+			}
+			
+			if (!outUseCustomVRAM)
+			{
+				this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_BMP_map>(compInfo, param, compInfo.renderState.selectedBGLayer->BMPAddress, 0, this->_paletteBG);
+			}
+			else
+			{
+				if ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested) // no per-pixel rendering happens on this path; the line is only transitioned when this condition holds
+				{
+					this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
+				}
+			}
+			break;
+		}
+			
+		case BGType_Large8bpp: // large screen 256 colors
+			this->_RenderPixelIterate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING, rot_256_map>(compInfo, param, compInfo.renderState.selectedBGLayer->largeBMPAddress, 0, this->_paletteBG);
+			break;
+			
+		default:
+			break;
+	}
+}
+
+/*****************************************************************************/
+//			BACKGROUND RENDERING -HELPER FUNCTIONS-
+/*****************************************************************************/
+
+// Renders the current scanline of a text BG layer.
+// Debug renders ignore the layer's scroll offsets and start at BG coordinate (0, line);
+// normal renders start at (xOffset, line + yOffset).
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineBase::_LineText(GPUEngineCompositorInfo &compInfo)
+{
+	u16 startX;
+	u16 startY;
+	
+	if (COMPOSITORMODE != GPUCompositorMode_Debug)
+	{
+		startX = compInfo.renderState.selectedBGLayer->xOffset;
+		startY = compInfo.line.indexNative + compInfo.renderState.selectedBGLayer->yOffset;
+	}
+	else
+	{
+		startX = 0;
+		startY = compInfo.line.indexNative;
+	}
+	
+	this->_RenderLine_BGText<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, startX, startY);
+}
+
+// Renders the current scanline of an affine (rotation/scaling) BG layer.
+//
+// Debug renders use a fixed parameter set whose last field is derived from the current
+// line (compInfo.line.blockOffsetNative). Normal renders use the BG2 or BG3 parameter
+// registers, chosen by the selected layer ID, and afterwards add BGnPB to BGnX and BGnPD
+// to BGnY in the register map so that the next scanline starts from the advanced
+// reference point.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineBase::_LineRot(GPUEngineCompositorInfo &compInfo)
+{
+	if (COMPOSITORMODE == GPUCompositorMode_Debug)
+	{
+		// This must NOT be 'static'. A function-local static is initialized only the first
+		// time control passes through its declaration, which would freeze the line-dependent
+		// field at the value of the first scanline ever rendered and make every later debug
+		// scanline sample the same source row.
+		const IOREG_BGnParameter debugParams = {256, 0, 0, -77, 0, (s32)compInfo.line.blockOffsetNative};
+		this->_RenderLine_BGAffine<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, debugParams);
+	}
+	else
+	{
+		IOREG_BGnParameter *__restrict bgParams = (compInfo.renderState.selectedLayerID == GPULayerID_BG2) ? (IOREG_BGnParameter *)&this->_IORegisterMap->BG2Param : (IOREG_BGnParameter *)&this->_IORegisterMap->BG3Param;
+		this->_RenderLine_BGAffine<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, *bgParams);
+		
+		// Step the reference point by one scanline.
+		bgParams->BGnX.value += bgParams->BGnPB.value;
+		bgParams->BGnY.value += bgParams->BGnPD.value;
+	}
+}
+
+// Renders the current scanline of an extended affine BG layer.
+//
+// Debug renders use a fixed parameter set whose last field is derived from the current
+// line (compInfo.line.blockOffsetNative). Normal renders use the BG2 or BG3 parameter
+// registers, chosen by the selected layer ID, and afterwards add BGnPB to BGnX and BGnPD
+// to BGnY in the register map so that the next scanline starts from the advanced
+// reference point. outUseCustomVRAM is passed straight through to _RenderLine_BGExtended.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineBase::_LineExtRot(GPUEngineCompositorInfo &compInfo, bool &outUseCustomVRAM)
+{
+	if (COMPOSITORMODE == GPUCompositorMode_Debug)
+	{
+		// This must NOT be 'static'. A function-local static is initialized only the first
+		// time control passes through its declaration, which would freeze the line-dependent
+		// field at the value of the first scanline ever rendered and make every later debug
+		// scanline sample the same source row.
+		const IOREG_BGnParameter debugParams = {256, 0, 0, -77, 0, (s32)compInfo.line.blockOffsetNative};
+		this->_RenderLine_BGExtended<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, debugParams, outUseCustomVRAM);
+	}
+	else
+	{
+		IOREG_BGnParameter *__restrict bgParams = (compInfo.renderState.selectedLayerID == GPULayerID_BG2) ? (IOREG_BGnParameter *)&this->_IORegisterMap->BG2Param : (IOREG_BGnParameter *)&this->_IORegisterMap->BG3Param;
+		this->_RenderLine_BGExtended<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, *bgParams, outUseCustomVRAM);
+		
+		// Step the reference point by one scanline.
+		bgParams->BGnX.value += bgParams->BGnPB.value;
+		bgParams->BGnY.value += bgParams->BGnPD.value;
+	}
+}
+
+/*****************************************************************************/
+//			SPRITE RENDERING -HELPER FUNCTIONS-
+/*****************************************************************************/
+
+/* Draws lg pixels of one scanline of a direct-color (bitmap) sprite into dst; bit 15 of each source color decides whether the pixel exists at all. */
+/* NOTE (original author, unconfirmed): the sprite's palette-entry bits are treated as its alpha (stored as alpha+1); this reportedly fixed some sprite problems in Chameleon Shot. */
+/* http://nocash.emubase.de/gbatek.htm#dsvideoobjs */
+template <bool ISDEBUGRENDER>
+void GPUEngineBase::_RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha)
+{
+	const u16 *__restrict bmpBuffer = (u16 *)MMU_gpu_map(srcadr);
+	size_t i = 0;
+	// Vectorized fast path, taken only when xdir == 1; the scalar loop at the bottom finishes any remaining pixels and handles every other xdir.
+#ifdef ENABLE_SSE2 // NOTE(review): _mm_blendv_epi8 is an SSE4.1 intrinsic; presumably the project supplies a fallback for SSE2-only builds - confirm
+	if (xdir == 1)
+	{
+		if (ISDEBUGRENDER)
+		{
+			const size_t ssePixCount = lg - (lg % 8); // debug path: 8 pixels per iteration, only dst is written
+			for (; i < ssePixCount; i += 8, x += 8, sprX += 8)
+			{
+				const __m128i color_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x));
+				const __m128i alphaCompare = _mm_cmpeq_epi16( _mm_srli_epi16(color_vec128, 15), _mm_set1_epi16(0x0001) );
+				_mm_storeu_si128( (__m128i *)(dst + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX)), color_vec128, alphaCompare) );
+			}
+		}
+		else
+		{
+			const __m128i prio_vec128 = _mm_set1_epi8(prio);
+			
+			const size_t ssePixCount = lg - (lg % 16); // normal path: 16 pixels per iteration
+			for (; i < ssePixCount; i += 16, x += 16, sprX += 16)
+			{
+				const __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + sprX));
+				const __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x));
+				const __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x + 8));
+				// A pixel is written only if prio is lower than the value already recorded in prioTab and bit 15 of the color is set.
+				const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128);
+				const __m128i alphaCompare = _mm_cmpeq_epi8( _mm_packs_epi16(_mm_srli_epi16(colorLo_vec128, 15), _mm_srli_epi16(colorHi_vec128, 15)), _mm_set1_epi8(0x01) );
+				// The packed 8-bit mask is widened to 16 bits per pixel so that it can also select the 16-bit colors.
+				const __m128i combinedPackedCompare = _mm_and_si128(prioCompare, alphaCompare);
+				const __m128i combinedLoCompare = _mm_unpacklo_epi8(combinedPackedCompare, combinedPackedCompare);
+				const __m128i combinedHiCompare = _mm_unpackhi_epi8(combinedPackedCompare, combinedPackedCompare);
+				
+				// Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because
+				// maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the sprite
+				// to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs.
+				_mm_storeu_si128( (__m128i *)(dst + sprX + 0),       _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 0)), colorLo_vec128, combinedLoCompare) );
+				_mm_storeu_si128( (__m128i *)(dst + sprX + 8),       _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 8)), colorHi_vec128, combinedHiCompare) );
+				_mm_storeu_si128( (__m128i *)(dst_alpha + sprX),     _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst_alpha + sprX)), _mm_set1_epi8(alpha + 1), combinedPackedCompare) );
+				_mm_storeu_si128( (__m128i *)(typeTab + sprX),       _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(typeTab + sprX)), _mm_set1_epi8(OBJMode_Bitmap), combinedPackedCompare) );
+				_mm_storeu_si128( (__m128i *)(prioTab + sprX),       _mm_blendv_epi8(prioTab_vec128, prio_vec128, combinedPackedCompare) );
+				_mm_storeu_si128( (__m128i *)(this->_sprNum + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(this->_sprNum + sprX)), _mm_set1_epi8(spriteNum), combinedPackedCompare) );
+			}
+		}
+	}
+#endif
+	
+	for (; i < lg; i++, sprX++, x += xdir)
+	{
+		const u16 color = LE_TO_LOCAL_16(bmpBuffer[x]);
+		
+		//a cleared alpha bit suppresses the pixel from processing entirely; it doesnt exist
+		if (ISDEBUGRENDER)
+		{
+			if (color & 0x8000)
+			{
+				dst[sprX] = color; // debug renders never touch dst_alpha, typeTab, prioTab or _sprNum
+			}
+		}
+		else
+		{
+			if ((color & 0x8000) && (prio < prioTab[sprX]))
+			{
+				dst[sprX] = color;
+				dst_alpha[sprX] = alpha+1;
+				typeTab[sprX] = OBJMode_Bitmap;
+				prioTab[sprX] = prio;
+				this->_sprNum[sprX] = spriteNum;
+			}
+		}
+	}
+}
+
+// Draws lg pixels of one scanline of a non-rotozoomed sprite stored at 8 bits per pixel.
+// Source pixel x is read from srcadr; destination pixels start at sprX, and x advances by
+// xdir for every pixel drawn. A palette index of zero means the pixel does not exist.
+//   - Debug render: only dst is written.
+//   - Window sprite (ISWINDOW): only _sprWin is flagged.
+//   - Otherwise: the pixel is written when prio is lower than the recorded prioTab value.
+template<bool ISDEBUGRENDER, bool ISWINDOW>
+void GPUEngineBase::_RenderSprite256(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha)
+{
+	for (size_t pixelsLeft = lg; pixelsLeft > 0; pixelsLeft--, sprX++, x += xdir)
+	{
+		// The low 3 bits of x pick the byte inside an 8-pixel group; each further group of
+		// 8 pixels lies 64 bytes later.
+		const u32 texelAddr = srcadr + (u32)( (x & 0x7) + ((x & 0xFFF8) << 3) );
+		const u8 colorIndex = *(u8 *)MMU_gpu_map(texelAddr);
+		
+		if (colorIndex == 0)
+		{
+			continue;
+		}
+		
+		if (ISDEBUGRENDER)
+		{
+			dst[sprX] = LE_TO_LOCAL_16(pal[colorIndex]);
+		}
+		else if (ISWINDOW)
+		{
+			this->_sprWin[sprX] = 1;
+		}
+		else if (prio < prioTab[sprX])
+		{
+			dst[sprX] = LE_TO_LOCAL_16(pal[colorIndex]);
+			dst_alpha[sprX] = 0xFF;
+			typeTab[sprX] = (alpha ? OBJMode_Transparent : OBJMode_Normal);
+			prioTab[sprX] = prio;
+			this->_sprNum[sprX] = spriteNum;
+		}
+	}
+}
+
+// Draws lg pixels of one scanline of a non-rotozoomed sprite stored at 4 bits per pixel.
+// Source pixel x is read from srcadr; destination pixels start at sprX, and x advances by
+// xdir for every pixel drawn. A palette index of zero means the pixel does not exist.
+//   - Debug render: only dst is written.
+//   - Window sprite (ISWINDOW): only _sprWin is flagged.
+//   - Otherwise: the pixel is written when prio is lower than the recorded prioTab value.
+template<bool ISDEBUGRENDER, bool ISWINDOW>
+void GPUEngineBase::_RenderSprite16(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha)
+{
+	for (size_t pixelsLeft = lg; pixelsLeft > 0; pixelsLeft--, sprX++, x += xdir)
+	{
+		// Two pixels share a byte: the low 2 bits of the byte index pick the byte inside an
+		// 8-pixel group, and each further group of 8 pixels lies 32 bytes later.
+		const u16 byteIndex = x >> 1;
+		const u32 texelAddr = srcadr + (byteIndex & 0x3) + ((byteIndex & 0xFFFC) << 3);
+		const u8 packedPair = *(u8 *)MMU_gpu_map(texelAddr);
+		
+		// Odd pixels live in the high nibble, even pixels in the low nibble.
+		const u8 colorIndex = (x & 1) ? packedPair >> 4 : packedPair & 0xF;
+		
+		if (colorIndex == 0)
+		{
+			continue;
+		}
+		
+		if (ISDEBUGRENDER)
+		{
+			dst[sprX] = LE_TO_LOCAL_16(pal[colorIndex]);
+		}
+		else if (ISWINDOW)
+		{
+			this->_sprWin[sprX] = 1;
+		}
+		else if (prio < prioTab[sprX])
+		{
+			dst[sprX] = LE_TO_LOCAL_16(pal[colorIndex]);
+			dst_alpha[sprX] = 0xFF;
+			typeTab[sprX] = (alpha ? OBJMode_Transparent : OBJMode_Normal);
+			prioTab[sprX] = prio;
+			this->_sprNum[sprX] = spriteNum;
+		}
+	}
+}
+
+// Computes the per-scanline drawing parameters of a non-rotozoomed sprite.
+//   sprSize    - sprite dimensions looked up from its Size/Shape attributes
+//   sprX, sprY - sprite position; sprX is clamped to 0 when the sprite hangs off the left edge
+//   x, y       - first source column and source row to read (flips already applied)
+//   lg         - number of pixels to draw on this line
+//   xdir       - source step per pixel: -1 when horizontally flipped, 1 otherwise
+// Returns true when the sprite is to be drawn on the current line. When false is
+// returned, xdir has not been written.
+bool GPUEngineBase::_ComputeSpriteVars(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, SpriteSize &sprSize, s32 &sprX, s32 &sprY, s32 &x, s32 &y, s32 &lg, s32 &xdir)
+{
+	// Fetch the sprite's location and size.
+	x = 0;
+	sprX = spriteInfo.X;
+	sprY = spriteInfo.Y;
+	sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape];
+	lg = sprSize.width;
+	
+	// FIXME: for rot/scale, a list of entries into the sprite should be maintained,
+	// that tells us where the first pixel of a screenline starts in the sprite,
+	// and how a step to the right in a screenline translates within the sprite
+	
+	// Row of the sprite that falls on this scanline.
+	// (Original note: this wasn't really tested by anything. very unlikely to get triggered)
+	y = (compInfo.line.indexNative - sprY) & 0xFF;
+	
+	// Reject sprites that do not touch this line, or whose pixels lie outside of it.
+	if ( (y >= sprSize.height) ||
+	     (sprX == GPU_FRAMEBUFFER_NATIVE_WIDTH) ||
+	     ((sprX+sprSize.width) <= 0) )
+	{
+		return false;
+	}
+	
+	// Part of the sprite is off the LEFT edge: skip the hidden columns.
+	if (sprX < 0)
+	{
+		x = -(sprX);
+		lg += sprX;
+		sprX = 0;
+	}
+	
+	// Part of the sprite is off the RIGHT edge: stop at the last screen column.
+	if ((sprX+sprSize.width) >= GPU_FRAMEBUFFER_NATIVE_WIDTH)
+	{
+		lg = GPU_FRAMEBUFFER_NATIVE_WIDTH - sprX;
+	}
+	
+	// Vertical flip swaps TOP<-->BOTTOM.
+	if (spriteInfo.VFlip)
+	{
+		y = sprSize.height - y - 1;
+	}
+	
+	// Horizontal flip swaps LEFT<-->RIGHT and reverses the read direction.
+	xdir = (spriteInfo.HFlip) ? -1 : 1;
+	if (xdir < 0)
+	{
+		x = sprSize.width - x - 1;
+	}
+	
+	return true;
+}
+
+/*****************************************************************************/
+//			SPRITE RENDERING
+/*****************************************************************************/
+
+
+//TODO - refactor this so there isnt as much duped code between rotozoomed and non-rotozoomed versions
+
+// Returns the address, relative to the engine's sprite memory base (_sprMem), of row y of
+// a bitmap sprite. The layout is chosen by DISPCNT.OBJ_BMP_mapping and, for the 2D layouts,
+// by DISPCNT.OBJ_BMP_2D_dim.
+u32 GPUEngineBase::_SpriteAddressBMP(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, const SpriteSize sprSize, const s32 y)
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	
+	if (DISPCNT.OBJ_BMP_mapping)
+	{
+		// Rows are packed back to back, sprSize.width * 2 bytes each.
+		// Tested by buffy sacrifice damage blood splatters in corner.
+		return this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBMPBoundary) + (y * sprSize.width * 2);
+	}
+	
+	// 2D mapping from here on.
+	// Verified in rotozoomed mode by knights in the nightmare intro.
+	if (DISPCNT.OBJ_BMP_2D_dim)
+	{
+		// 256*256, verified by heroes of mana FMV intro.
+		return this->_sprMem + (((spriteInfo.TileIndex&0x3E0) * 64 + (spriteInfo.TileIndex&0x1F) * 8 + (y << 8)) << 1);
+	}
+	
+	// 128*512, verified by harry potter and the order of the phoenix conversation portraits.
+	return this->_sprMem + (((spriteInfo.TileIndex&0x3F0) * 64 + (spriteInfo.TileIndex&0x0F) * 8 + (y << 7)) << 1);
+}
+
+// Forwards to the _SpriteRenderPerform instantiation matching the current sprite render
+// mode: Sprite1D when that mode is selected, Sprite2D for anything else.
+template <bool ISDEBUGRENDER>
+void GPUEngineBase::_SpriteRender(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
+{
+	if (compInfo.renderState.spriteRenderMode != SpriteRenderMode_Sprite1D)
+	{
+		this->_SpriteRenderPerform<SpriteRenderMode_Sprite2D, ISDEBUGRENDER>(compInfo, dst, dst_alpha, typeTab, prioTab);
+		return;
+	}
+	
+	this->_SpriteRenderPerform<SpriteRenderMode_Sprite1D, ISDEBUGRENDER>(compInfo, dst, dst_alpha, typeTab, prioTab);
+}
+
+// Renders the sprites of scanline lineIndex into dst for debugging purposes.
+// A zero-initialized compositor info is filled in for a single native-width line with
+// color effects and master brightness disabled, while the sprite mapping settings are
+// copied from the engine's current render state. Only the color buffer is supplied; the
+// alpha, type and priority tables are passed as NULL.
+void GPUEngineBase::SpriteRenderDebug(const u16 lineIndex, u16 *dst)
+{
+	GPUEngineCompositorInfo compInfo;
+	memset(&compInfo, 0, sizeof(compInfo));
+	
+	// Line geometry: one native line, with custom values mirroring the native ones.
+	compInfo.line.indexNative = lineIndex;
+	compInfo.line.indexCustom = compInfo.line.indexNative;
+	compInfo.line.blockOffsetNative = compInfo.line.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	compInfo.line.blockOffsetCustom = compInfo.line.blockOffsetNative;
+	compInfo.line.widthCustom = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	compInfo.line.pixelCount = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	compInfo.line.renderCount = 1;
+	
+	// Render state: OBJ layer, no color effects, no master brightness.
+	compInfo.renderState.selectedLayerID = GPULayerID_OBJ;
+	compInfo.renderState.displayOutputMode = GPUDisplayMode_Normal;
+	compInfo.renderState.colorEffect = ColorEffect_Disable;
+	compInfo.renderState.masterBrightnessMode = GPUMasterBrightMode_Disable;
+	compInfo.renderState.masterBrightnessIsFullIntensity = false;
+	compInfo.renderState.masterBrightnessIsMaxOrMin = true;
+	
+	// The sprite mapping settings come from the live render state.
+	compInfo.renderState.spriteRenderMode = this->_currentRenderState.spriteRenderMode;
+	compInfo.renderState.spriteBoundary = this->_currentRenderState.spriteBoundary;
+	compInfo.renderState.spriteBMPBoundary = this->_currentRenderState.spriteBMPBoundary;
+	
+	// Target: the caller's buffer is the only color buffer; no layer ID buffers exist.
+	compInfo.target.lineColorHead = dst;
+	compInfo.target.lineColorHeadNative = compInfo.target.lineColorHead;
+	compInfo.target.lineColorHeadCustom = compInfo.target.lineColorHeadNative;
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative;
+	compInfo.target.lineColor = (void **)&compInfo.target.lineColor16;
+	
+	compInfo.target.lineLayerIDHead = NULL;
+	compInfo.target.lineLayerIDHeadNative = NULL;
+	compInfo.target.lineLayerIDHeadCustom = NULL;
+	compInfo.target.lineLayerID = NULL;
+	
+	compInfo.target.xNative = 0;
+	compInfo.target.xCustom = 0;
+	
+	this->_SpriteRender<true>(compInfo, dst, NULL, NULL, NULL);
+}
+
+template <SpriteRenderMode MODE, bool ISDEBUGRENDER>
+void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	size_t cost = 0;
+	
+	for (size_t i = 0; i < 128; i++)
+	{
+		OAMAttributes spriteInfo = this->_oamList[i];
+
+		//for each sprite:
+		if (cost >= 2130)
+		{
+			//out of sprite rendering time
+			//printf("sprite overflow!\n");
+			//return;		
+		}
+
+		//do we incur a cost if a sprite is disabled?? we guess so.
+		cost += 2;
+		
+		// Check if sprite is disabled before everything
+		if (spriteInfo.RotScale == 0 && spriteInfo.Disable != 0)
+			continue;
+		
+		// Must explicitly convert endianness with attributes 1 and 2.
+		spriteInfo.attr[1] = LOCAL_TO_LE_16(spriteInfo.attr[1]);
+		spriteInfo.attr[2] = LOCAL_TO_LE_16(spriteInfo.attr[2]);
+		
+		const OBJMode objMode = (OBJMode)spriteInfo.Mode;
+
+		SpriteSize sprSize;
+		s32 sprX;
+		s32 sprY;
+		s32 x;
+		s32 y;
+		s32 lg;
+		s32 xdir;
+		u8 prio = spriteInfo.Priority;
+		u16 *__restrict pal;
+		u8 *__restrict src;
+		u32 srcadr;
+		
+		if (spriteInfo.RotScale != 0)
+		{
+			s32		fieldX, fieldY, auxX, auxY, realX, realY, offset;
+			u8		blockparameter;
+			s16		dx, dmx, dy, dmy;
+			u16		colour;
+
+			// Get sprite positions and size
+			sprX = spriteInfo.X;
+			sprY = spriteInfo.Y;
+			sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape];
+
+			// Copy sprite size, to check change it if needed
+			fieldX = sprSize.width;
+			fieldY = sprSize.height;
+			lg = sprSize.width;
+
+			// If we are using double size mode, double our control vars
+			if (spriteInfo.DoubleSize != 0)
+			{
+				fieldX <<= 1;
+				fieldY <<= 1;
+				lg <<= 1;
+			}
+
+			//check if the sprite is visible y-wise. unfortunately our logic for x and y is different due to our scanline based rendering
+			//tested thoroughly by many large sprites in Super Robot Wars K which wrap around the screen
+			y = (compInfo.line.indexNative - sprY) & 0xFF;
+			if (y >= fieldY)
+				continue;
+
+			//check if sprite is visible x-wise.
+			if ((sprX == GPU_FRAMEBUFFER_NATIVE_WIDTH) || (sprX + fieldX <= 0))
+				continue;
+
+			cost += (sprSize.width * 2) + 10;
+
+			// Get which four parameter block is assigned to this sprite
+			blockparameter = (spriteInfo.RotScaleIndex + (spriteInfo.HFlip << 3) + (spriteInfo.VFlip << 4)) * 4;
+
+			// Get rotation/scale parameters
+			dx  = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+0].attr3);
+			dmx = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+1].attr3);
+			dy  = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+2].attr3);
+			dmy = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+3].attr3);
+			
+			// Calculate fixed point 8.8 start offsets
+			realX = (sprSize.width  << 7) - (fieldX >> 1)*dx - (fieldY >> 1)*dmx + y*dmx;
+			realY = (sprSize.height << 7) - (fieldX >> 1)*dy - (fieldY >> 1)*dmy + y*dmy;
+
+			if (sprX < 0)
+			{
+				// If sprite is not in the window
+				if (sprX + fieldX <= 0)
+					continue;
+
+				// Otherwise, is partially visible
+				lg += sprX;
+				realX -= sprX*dx;
+				realY -= sprX*dy;
+				sprX = 0;
+			}
+			else
+			{
+				if (sprX + fieldX > GPU_FRAMEBUFFER_NATIVE_WIDTH)
+					lg = GPU_FRAMEBUFFER_NATIVE_WIDTH - sprX;
+			}
+
+			// If we are using 1 palette of 256 colours
+			if (spriteInfo.PaletteMode == PaletteMode_1x256)
+			{
+				src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary));
+
+				// If extended palettes are set, use them
+				pal = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
+
+				for (size_t j = 0; j < lg; ++j, ++sprX)
+				{
+					// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
+					auxX = (realX >> 8);
+					auxY = (realY >> 8);
+
+					if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
+					{
+						if (MODE == SpriteRenderMode_Sprite2D)
+							offset = (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*8);
+						else
+							offset = (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)*sprSize.width*8) + ((auxY&0x7)*8);
+
+						colour = src[offset];
+						
+						if (ISDEBUGRENDER)
+						{
+							if (colour)
+							{
+								dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
+							}
+						}
+						else
+						{
+							if (colour && (prio < prioTab[sprX]))
+							{
+								dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
+								dst_alpha[sprX] = 0xFF;
+								typeTab[sprX] = objMode;
+								prioTab[sprX] = prio;
+								this->_sprNum[sprX] = i;
+							}
+						}
+					}
+
+					// Add the rotation/scale coefficients, here the rotation/scaling is performed
+					realX += dx;
+					realY += dy;
+				}
+			}
+			// Rotozoomed direct color
+			else if (objMode == OBJMode_Bitmap)
+			{
+				//transparent (i think, dont bother to render?) if alpha is 0
+				if (spriteInfo.PaletteIndex == 0)
+					continue;
+
+				srcadr = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, 0);
+
+				for (size_t j = 0; j < lg; ++j, ++sprX)
+				{
+					// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
+					auxX = realX >> 8;
+					auxY = realY >> 8;
+
+					//this is all very slow, and so much dup code with other rotozoomed modes.
+					//dont bother fixing speed until this whole thing gets reworked
+
+					if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
+					{
+						if (DISPCNT.OBJ_BMP_2D_dim)
+							//tested by knights in the nightmare
+							offset = ((this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, auxY) - srcadr) / 2) + auxX;
+						else //tested by lego indiana jones (somehow?)
+							//tested by buffy sacrifice damage blood splatters in corner
+							offset = auxX + (auxY * sprSize.width);
+
+						const u32 finalAddr = srcadr + (offset << 1);
+						u16 *mem = (u16 *)MMU_gpu_map(finalAddr);
+						colour = LE_TO_LOCAL_16(*mem);
+						
+						if (ISDEBUGRENDER)
+						{
+							if (colour & 0x8000)
+							{
+								dst[sprX] = colour;
+							}
+						}
+						else
+						{
+							if ((colour & 0x8000) && (prio < prioTab[sprX]))
+							{
+								dst[sprX] = colour;
+								dst_alpha[sprX] = spriteInfo.PaletteIndex;
+								typeTab[sprX] = objMode;
+								prioTab[sprX] = prio;
+								this->_sprNum[sprX] = i;
+							}
+						}
+					}
+
+					// Add the rotation/scale coefficients, here the rotation/scaling is performed
+					realX += dx;
+					realY += dy;
+				}
+			}
+			// Rotozoomed 16/16 palette
+			else
+			{
+				if (MODE == SpriteRenderMode_Sprite2D)
+				{
+					src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << 5));
+				}
+				else
+				{
+					src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary));
+				}
+				
+				pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4);
+
+				for (size_t j = 0; j < lg; ++j, ++sprX)
+				{
+					// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
+					auxX = realX >> 8;
+					auxY = realY >> 8;
+
+					if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
+					{
+						if (MODE == SpriteRenderMode_Sprite2D)
+							offset = ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*4);
+						else
+							offset = ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)*sprSize.width)*4 + ((auxY&0x7)*4);
+						
+						colour = src[offset];
+
+						// Extract the 4-bit palette index from the byte just read: high nibble for odd X, low nibble for even X
+						if (auxX&1)	colour >>= 4;
+						else		colour &= 0xF;
+						
+						if (ISDEBUGRENDER)
+						{
+							if (colour)
+							{
+								dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
+							}
+						}
+						else
+						{
+							if (colour && (prio < prioTab[sprX]))
+							{
+								if (objMode == OBJMode_Window)
+								{
+									this->_sprWin[sprX] = 1;
+								}
+								else
+								{
+									dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
+									dst_alpha[sprX] = 0xFF;
+									typeTab[sprX] = objMode;
+									prioTab[sprX] = prio;
+									this->_sprNum[sprX] = i;
+								}
+							}
+						}
+					}
+
+					// Add the rotation/scale coefficients; this is where the rotation/scaling is performed
+					realX += dx;
+					realY += dy;
+				}
+			}
+		}
+		else //NOT rotozoomed
+		{
+			if (!this->_ComputeSpriteVars(compInfo, spriteInfo, sprSize, sprX, sprY, x, y, lg, xdir))
+				continue;
+
+			cost += sprSize.width;
+
+			if (objMode == OBJMode_Bitmap) //sprite is in BMP format
+			{
+				//transparent (i think, dont bother to render?) if alpha is 0
+				if (spriteInfo.PaletteIndex == 0)
+					continue;
+				
+				srcadr = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, y);
+
+				this->_RenderSpriteBMP<ISDEBUGRENDER>(compInfo, i, dst, srcadr, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, spriteInfo.PaletteIndex);
+
+				const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(srcadr) - MMU.ARM9_LCD) / sizeof(u16);
+				if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4))
+				{
+					const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
+					const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
+					const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+					const size_t linePixel = blockPixel % GPU_FRAMEBUFFER_NATIVE_WIDTH;
+					
+					if (!GPU->GetEngineMain()->isLineCaptureNative[blockID][blockLine] && (linePixel == 0))
+					{
+						this->vramBlockOBJAddress = srcadr;
+					}
+				}
+			}
+			else if (spriteInfo.PaletteMode == PaletteMode_1x256) //256 colors; handles OBJ windows too
+			{
+				if (MODE == SpriteRenderMode_Sprite2D)
+					srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*8);
+				else
+					srcadr = this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*8) + ((y&0x7)*8);
+
+				pal = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
+
+				if (objMode == OBJMode_Window)
+					this->_RenderSprite256<ISDEBUGRENDER,true>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
+				else
+					this->_RenderSprite256<ISDEBUGRENDER,false>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
+			}
+			else // 16 colors; handles OBJ windows too
+			{
+				if (MODE == SpriteRenderMode_Sprite2D)
+				{
+					srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*4);
+				}
+				else
+				{
+					srcadr = this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*4) + ((y&0x7)*4);
+				}
+				
+				pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4);
+
+				if (objMode == OBJMode_Window)
+					this->_RenderSprite16<ISDEBUGRENDER, true>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
+				else 
+					this->_RenderSprite16<ISDEBUGRENDER, false>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
+			}
+		}
+	}
+}
+// Renders native scanline l: sets up the line's color/layer-ID targets, clears the line, pre-renders sprites, optionally runs window testing, then composites BG/3D and OBJ layers from the last priority slot down to slot 0.
+template <NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
+void GPUEngineBase::_RenderLine_Layers(const size_t l)
+{
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	itemsForPriority_t *item;
+	
+	GPUEngineCompositorInfo &compInfo = this->_currentCompositorInfo[l];
+	
+	// Optimization: For normal display mode, render straight to the output buffer when that is what we are going to end
+	// up displaying anyway. Otherwise, we need to use the working buffer.
+	compInfo.target.lineColorHeadNative = (compInfo.renderState.displayOutputMode == GPUDisplayMode_Normal) ? (u8 *)this->nativeBuffer + (compInfo.line.blockOffsetNative * dispInfo.pixelBytes) : (u8 *)this->_internalRenderLineTargetNative;
+	compInfo.target.lineColorHeadCustom = (compInfo.renderState.displayOutputMode == GPUDisplayMode_Normal) ? (u8 *)this->customBuffer + (compInfo.line.blockOffsetCustom * dispInfo.pixelBytes) : (u8 *)this->_internalRenderLineTargetCustom;
+	compInfo.target.lineColorHead = compInfo.target.lineColorHeadNative; // the active color target starts out as the native-size one
+	
+	compInfo.target.lineLayerIDHeadNative = this->_renderLineLayerIDNative;
+	compInfo.target.lineLayerIDHeadCustom = this->_renderLineLayerIDCustom;
+	compInfo.target.lineLayerIDHead = compInfo.target.lineLayerIDHeadNative; // likewise for the layer-ID target
+	
+	compInfo.target.xNative = 0;
+	compInfo.target.xCustom = 0;
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative;
+	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
+	
+	this->_RenderLine_Clear<OUTPUTFORMAT>(compInfo);
+	
+	// Pre-render this line's sprites into the sprite line buffers and sort the visible pixels into per-priority lists.
+	if (this->_enableLayer[GPULayerID_OBJ])
+	{
+		this->vramBlockOBJAddress = 0; // cleared for each line; sprite rendering may set it again for a bitmap sprite
+		this->_RenderLine_SetupSprites(compInfo);
+	}
+	
+	if (WILLPERFORMWINDOWTEST)
+	{
+		this->_PerformWindowTesting(compInfo);
+	}
+	
+	// Paint the lower priorities first,
+	// then the higher priorities on top of them.
+	for (size_t prio = NB_PRIORITIES; prio > 0; ) // prio is decremented inside the body, so it runs from NB_PRIORITIES-1 down to 0
+	{
+		prio--;
+		item = &(this->_itemsForPriority[prio]);
+		// Render the BG layers assigned to this priority slot.
+		if (this->_isAnyBGLayerEnabled)
+		{
+			for (size_t i = 0; i < item->nbBGs; i++)
+			{
+				const GPULayerID layerID = (GPULayerID)item->BGs[i];
+				
+				if (this->_enableLayer[layerID])
+				{
+					compInfo.renderState.selectedLayerID = layerID;
+					compInfo.renderState.selectedBGLayer = &this->_BGLayer[layerID];
+					
+					if (this->_engineID == GPUEngineID_Main)
+					{
+						if ( (layerID == GPULayerID_BG0) && GPU->GetEngineMain()->WillRender3DLayer() ) // on the main engine, BG0 is drawn through the 3D layer path instead of the 2D BG path
+						{
+#ifndef DISABLE_COMPOSITOR_FAST_PATHS
+							if ( !compInfo.renderState.dstAnyBlendEnable && (  (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
+																			   !compInfo.renderState.srcBlendEnable[GPULayerID_BG0] ||
+																			 (((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) )
+							{
+								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_Copy, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo); // fast path: the render state rules out any color effect for this layer
+							}
+							else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_BG0] && (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) )
+							{
+								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_BrightUp, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
+							}
+							else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_BG0] && (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness) )
+							{
+								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_BrightDown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
+							}
+							else
+#endif
+							{
+								GPU->GetEngineMain()->RenderLine_Layer3D<GPUCompositorMode_Unknown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo); // fallback when no fast path matches (or fast paths are compiled out)
+							}
+							continue; // the 3D path handled this entry; skip the 2D BG path below
+						}
+					}
+										
+#ifndef DISABLE_COMPOSITOR_FAST_PATHS
+					if ( (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
+						 !compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] ||
+						((compInfo.renderState.colorEffect == ColorEffect_Blend) && !compInfo.renderState.dstAnyBlendEnable) ||
+						(((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) )
+					{
+						this->_RenderLine_LayerBG<GPUCompositorMode_Copy, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
+					}
+					else if ( !WILLPERFORMWINDOWTEST && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] && (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) )
+					{
+						this->_RenderLine_LayerBG<GPUCompositorMode_BrightUp, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
+					}
+					else if ( !WILLPERFORMWINDOWTEST && compInfo.renderState.srcBlendEnable[compInfo.renderState.selectedLayerID] && (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness) )
+					{
+						this->_RenderLine_LayerBG<GPUCompositorMode_BrightDown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo);
+					}
+					else
+#endif
+					{
+						this->_RenderLine_LayerBG<GPUCompositorMode_Unknown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo); // fallback when no fast path matches (or fast paths are compiled out)
+					}
+				} //layer enabled
+			}
+		}
+		
+		// Render the sprite pixels that were filed under this priority slot.
+		if ( this->_enableLayer[GPULayerID_OBJ] && (item->nbPixelsX > 0) )
+		{
+			compInfo.renderState.selectedLayerID = GPULayerID_OBJ;
+			compInfo.renderState.selectedBGLayer = NULL;
+			
+#ifndef DISABLE_COMPOSITOR_FAST_PATHS
+			if ( !compInfo.renderState.dstAnyBlendEnable && (  (compInfo.renderState.colorEffect == ColorEffect_Disable) ||
+															   !compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] ||
+															 (((compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) || (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness)) && (compInfo.renderState.blendEVY == 0)) ) )
+			{
+				this->_RenderLine_LayerOBJ<GPUCompositorMode_Copy, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
+			}
+			else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] && (compInfo.renderState.colorEffect == ColorEffect_IncreaseBrightness) )
+			{
+				this->_RenderLine_LayerOBJ<GPUCompositorMode_BrightUp, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
+			}
+			else if ( !WILLPERFORMWINDOWTEST && !compInfo.renderState.dstAnyBlendEnable && compInfo.renderState.srcBlendEnable[GPULayerID_OBJ] && (compInfo.renderState.colorEffect == ColorEffect_DecreaseBrightness) )
+			{
+				this->_RenderLine_LayerOBJ<GPUCompositorMode_BrightDown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item);
+			}
+			else
+#endif
+			{
+				this->_RenderLine_LayerOBJ<GPUCompositorMode_Unknown, OUTPUTFORMAT, WILLPERFORMWINDOWTEST>(compInfo, item); // fallback when no fast path matches (or fast paths are compiled out)
+			}
+		}
+	}
+}
+
+// Prepares the sprite line buffers for one scanline: resets them, renders the sprites into
+// them, runs the sprite mosaic pass, and then files every sprite pixel under its priority.
+void GPUEngineBase::_RenderLine_SetupSprites(GPUEngineCompositorInfo &compInfo)
+{
+	// Reset the per-pixel sprite state: priority 0x7F, type OBJMode_Normal, alpha 0, and the
+	// backdrop color as the color. NOTE(review): it is not known why the backdrop color is
+	// needed here, nor how it interacts with window color effects on the backdrop.
+	memset(this->_sprPrio, 0x7F, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	memset(this->_sprType, OBJMode_Normal, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	memset(this->_sprAlpha, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(this->_sprColor, compInfo.renderState.backdropColor16);
+	
+	// Render the sprites, then run the sprite mosaic pass over the same buffers.
+	this->_SpriteRender<false>(compInfo, this->_sprColor, this->_sprAlpha, this->_sprType, this->_sprPrio);
+	this->_MosaicSpriteLine(compInfo, this->_sprColor, this->_sprAlpha, this->_sprType, this->_sprPrio);
+	
+	// Append each pixel's X position to the list kept for its priority; priorities of 4 and up are skipped.
+	for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+	{
+		const size_t pixelPrio = this->_sprPrio[x];
+		if (pixelPrio < 4)
+		{
+			itemsForPriority_t &bucket = this->_itemsForPriority[pixelPrio];
+			bucket.PixelsX[bucket.nbPixelsX++] = x;
+		}
+	}
+}
+// Composites the sprite pixels listed in item (one priority slot) onto the current line, either one-to-one on the native-width target or expanded across the custom-size target; colors come from _sprColor or, for some bitmap sprites, from custom VRAM.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
+void GPUEngineBase::_RenderLine_LayerOBJ(GPUEngineCompositorInfo &compInfo, itemsForPriority_t *__restrict item)
+{
+	bool useCustomVRAM = false; // set below when the recorded bitmap-sprite VRAM line is not flagged as a native capture
+	
+	if (this->vramBlockOBJAddress != 0)
+	{
+		const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(this->vramBlockOBJAddress) - MMU.ARM9_LCD) / sizeof(u16);
+		
+		if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4)) // only offsets that fall inside the first four VRAM blocks are considered
+		{
+			const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
+			const size_t blockPixel = vramPixel % (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
+			const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+			
+			GPU->GetEngineMain()->VerifyVRAMLineDidChange(blockID, blockLine);
+			useCustomVRAM = !GPU->GetEngineMain()->isLineCaptureNative[blockID][blockLine];
+		}
+	}
+	
+	if (useCustomVRAM && ((OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || GPU->GetDisplayInfo().isCustomSizeRequested)) // not done for BGR888 output when no custom size is requested
+	{
+		this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
+	}
+	
+	if (this->isLineRenderNative[compInfo.line.indexNative]) // native-width target: one output pixel per sprite pixel
+	{
+		if (useCustomVRAM && (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)) // colors are read from custom VRAM instead of _sprColor
+		{
+			const FragmentColor *__restrict vramColorPtr = (FragmentColor *)GPU->GetCustomVRAMAddressUsingMappedAddress<OUTPUTFORMAT>(this->vramBlockOBJAddress, 0);
+			
+			for (size_t i = 0; i < item->nbPixelsX; i++)
+			{
+				const size_t srcX = item->PixelsX[i];
+				
+				if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
+				{
+					continue;
+				}
+				
+				compInfo.target.xNative = srcX;
+				compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
+				compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX;
+				compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX;
+				compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX;
+				
+				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, vramColorPtr[srcX], this->_sprAlpha[srcX], enableColorEffect);
+			}
+		}
+		else
+		{
+			for (size_t i = 0; i < item->nbPixelsX; i++)
+			{
+				const size_t srcX = item->PixelsX[i];
+				
+				if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
+				{
+					continue;
+				}
+				
+				compInfo.target.xNative = srcX;
+				compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
+				compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead + srcX;
+				compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead + srcX;
+				compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead + srcX;
+				
+				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect);
+			}
+		}
+	}
+	else // custom-size target: each sprite pixel is expanded horizontally and drawn on compInfo.line.renderCount lines
+	{
+		void *__restrict dstColorPtr = compInfo.target.lineColorHead;
+		u8 *__restrict dstLayerIDPtr = compInfo.target.lineLayerIDHead;
+		
+		if (useCustomVRAM)
+		{
+			const void *__restrict vramColorPtr = GPU->GetCustomVRAMAddressUsingMappedAddress<OUTPUTFORMAT>(this->vramBlockOBJAddress, 0);
+			
+			for (size_t line = 0; line < compInfo.line.renderCount; line++)
+			{
+				compInfo.target.lineColor16 = (u16 *)dstColorPtr;
+				compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr;
+				compInfo.target.lineLayerID = dstLayerIDPtr;
+				
+				for (size_t i = 0; i < item->nbPixelsX; i++)
+				{
+					const size_t srcX = item->PixelsX[i];
+					
+					if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
+					{
+						continue;
+					}
+					
+					compInfo.target.xNative = srcX;
+					compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
+					
+					for (size_t p = 0; p < _gpuDstPitchCount[srcX]; p++) // native pixel srcX covers _gpuDstPitchCount[srcX] custom pixels starting at xCustom
+					{
+						const size_t dstX = compInfo.target.xCustom + p;
+						
+						compInfo.target.lineColor16 = (u16 *)dstColorPtr + dstX;
+						compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr + dstX;
+						compInfo.target.lineLayerID = dstLayerIDPtr + dstX;
+						
+						const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+						
+						if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+						{
+							this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, ((FragmentColor *)vramColorPtr)[dstX], this->_sprAlpha[srcX], enableColorEffect);
+						}
+						else
+						{
+							this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, ((u16 *)vramColorPtr)[dstX], this->_sprAlpha[srcX], enableColorEffect);
+						}
+					}
+				}
+				
+				vramColorPtr = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)vramColorPtr + compInfo.line.widthCustom) : (void *)((u16 *)vramColorPtr + compInfo.line.widthCustom); // step source and both destinations forward by one custom-width line
+				dstColorPtr = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? (void *)((u16 *)dstColorPtr + compInfo.line.widthCustom) : (void *)((FragmentColor *)dstColorPtr + compInfo.line.widthCustom);
+				dstLayerIDPtr += compInfo.line.widthCustom;
+			}
+		}
+		else
+		{
+			for (size_t line = 0; line < compInfo.line.renderCount; line++)
+			{
+				compInfo.target.lineColor16 = (u16 *)dstColorPtr;
+				compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr;
+				compInfo.target.lineLayerID = dstLayerIDPtr;
+				
+				for (size_t i = 0; i < item->nbPixelsX; i++)
+				{
+					const size_t srcX = item->PixelsX[i];
+					
+					if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestNative[compInfo.renderState.selectedLayerID][srcX] == 0) )
+					{
+						continue;
+					}
+					
+					compInfo.target.xNative = srcX;
+					compInfo.target.xCustom = _gpuDstPitchIndex[srcX];
+					
+					for (size_t p = 0; p < _gpuDstPitchCount[srcX]; p++) // the same native sprite color is written to every custom pixel in the span
+					{
+						const size_t dstX = compInfo.target.xCustom + p;
+						
+						compInfo.target.lineColor16 = (u16 *)dstColorPtr + dstX;
+						compInfo.target.lineColor32 = (FragmentColor *)dstColorPtr + dstX;
+						compInfo.target.lineLayerID = dstLayerIDPtr + dstX;
+						
+						const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectNative[compInfo.renderState.selectedLayerID][compInfo.target.xNative] != 0) : true;
+						this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_OBJ>(compInfo, this->_sprColor[srcX], this->_sprAlpha[srcX], enableColorEffect);
+					}
+				}
+				
+				dstColorPtr = (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) ? (void *)((u16 *)dstColorPtr + compInfo.line.widthCustom) : (void *)((FragmentColor *)dstColorPtr + compInfo.line.widthCustom); // step both destinations forward by one custom-width line
+				dstLayerIDPtr += compInfo.line.widthCustom;
+			}
+		}
+	}
+}
+
+// Publishes every scanline's master brightness mode and intensity into mutableInfo for this
+// engine's target display, plus two flags that summarize the whole frame.
+void GPUEngineBase::UpdateMasterBrightnessDisplayInfo(NDSDisplayInfo &mutableInfo)
+{
+	const GPUEngineCompositorInfo &firstLineInfo = this->_currentCompositorInfo[0];
+	bool anyLineHasEffect = false;
+	bool settingsVaryByLine = false;
+	
+	for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+	{
+		const GPUEngineCompositorInfo &lineInfo = this->_currentCompositorInfo[y];
+		mutableInfo.masterBrightnessMode[this->_targetDisplayID][y] = lineInfo.renderState.masterBrightnessMode;
+		mutableInfo.masterBrightnessIntensity[this->_targetDisplayID][y] = lineInfo.renderState.masterBrightnessIntensity;
+		
+		// A line only needs brightness applied in Up or Down mode with a nonzero intensity.
+		const bool isUpOrDown = (lineInfo.renderState.masterBrightnessMode == GPUMasterBrightMode_Up) || (lineInfo.renderState.masterBrightnessMode == GPUMasterBrightMode_Down);
+		if (isUpOrDown && (lineInfo.renderState.masterBrightnessIntensity != 0))
+		{
+			anyLineHasEffect = true;
+		}
+		
+		// Line 0 is the reference; any line whose settings differ from it sets the per-line flag.
+		if ( (lineInfo.renderState.masterBrightnessMode != firstLineInfo.renderState.masterBrightnessMode) || (lineInfo.renderState.masterBrightnessIntensity != firstLineInfo.renderState.masterBrightnessIntensity) )
+		{
+			settingsVaryByLine = true;
+		}
+	}
+	
+	mutableInfo.masterBrightnessDiffersPerLine[this->_targetDisplayID] = settingsVaryByLine;
+	mutableInfo.needApplyMasterBrightness[this->_targetDisplayID] = anyLineHasEffect;
+}
+
+// Applies this engine's master brightness to the frame described by displayInfo.
+//
+// Most games keep the same master brightness settings on all 192 lines, so the whole rendered
+// buffer is normally processed in one pass with line 0's settings, which is faster than going
+// line by line. Some games change the settings mid-frame, though. Mega Man Zero Collection
+// sets the intensity to 31 on line 0, 0 on line 16, and 31 again on line 176 to blank the
+// scanlines its original GBA picture does not use, so such frames are processed per scanline.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::ApplyMasterBrightness(const NDSDisplayInfo &displayInfo)
+{
+	const size_t displayID = this->_targetDisplayID;
+	
+	if (!displayInfo.masterBrightnessDiffersPerLine[displayID])
+	{
+		const size_t framePixCount = displayInfo.renderedWidth[displayID] * displayInfo.renderedHeight[displayID];
+		this->ApplyMasterBrightness<OUTPUTFORMAT, false>(displayInfo.renderedBuffer[displayID],
+		                                                 framePixCount,
+		                                                 (GPUMasterBrightMode)displayInfo.masterBrightnessMode[displayID][0],
+		                                                 displayInfo.masterBrightnessIntensity[displayID][0]);
+		return;
+	}
+	
+	// Per-scanline path: work on the custom buffer if a custom-size render happened, else on the native buffer.
+	const bool didCustomRender = displayInfo.didPerformCustomRender[displayID];
+	
+	for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+	{
+		const GPUEngineCompositorInfo &lineInfo = this->_currentCompositorInfo[y];
+		void *lineDst = (didCustomRender) ? ((u8 *)displayInfo.customBuffer[displayID] + (lineInfo.line.blockOffsetCustom * displayInfo.pixelBytes))
+		                                  : ((u8 *)displayInfo.nativeBuffer[displayID] + (lineInfo.line.blockOffsetNative * displayInfo.pixelBytes));
+		const size_t linePixCount = (didCustomRender) ? lineInfo.line.pixelCount : GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		
+		this->ApplyMasterBrightness<OUTPUTFORMAT, false>(lineDst, linePixCount, (GPUMasterBrightMode)displayInfo.masterBrightnessMode[displayID][y], displayInfo.masterBrightnessIntensity[displayID][y]);
+	}
+}
+// Applies master brightness in place to pixCount pixels at dst: Up/Down adjust each pixel by intensity, intensity >= 16 (or ISFULLINTENSITYHINT) fills with solid white/black, and Disable/Reserved (or intensity 0 without the hint) change nothing.
+template <NDSColorFormat OUTPUTFORMAT, bool ISFULLINTENSITYHINT>
+void GPUEngineBase::ApplyMasterBrightness(void *dst, const size_t pixCount, const GPUMasterBrightMode mode, const u8 intensity)
+{
+	if (!ISFULLINTENSITYHINT && (intensity == 0)) return; // zero intensity changes nothing
+	
+	const bool isFullIntensity = (intensity >= 16); // 16 and above are all treated as full intensity
+	const u8 intensityClamped = (isFullIntensity) ? 16 : intensity;
+	
+	switch (mode)
+	{
+		case GPUMasterBrightMode_Disable:
+			break;
+			
+		case GPUMasterBrightMode_Up:
+		{
+			if (!ISFULLINTENSITYHINT && !isFullIntensity) // partial intensity: adjust every pixel individually
+			{
+				size_t i = 0;
+				
+				switch (OUTPUTFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+					{
+#ifdef ENABLE_SSE2
+						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
+						
+						const size_t ssePixCount = pixCount - (pixCount % 8); // the vector loop handles groups of 8 pixels; the scalar loop below finishes the rest
+						for (; i < ssePixCount; i += 8)
+						{
+							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); // aligned load/store: dst must be 16-byte aligned
+							dstColor_vec128 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
+							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi16(0x8000)); // set bit 15 on every pixel
+							_mm_store_si128((__m128i *)((u16 *)dst + i), dstColor_vec128);
+						}
+#endif
+						
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+						for (; i < pixCount; i++)
+						{
+							((u16 *)dst)[i] = GPUEngineBase::_brightnessUpTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; // table lookup on the low 15 bits, then set bit 15
+						}
+						break;
+					}
+						
+					case NDSColorFormat_BGR666_Rev:
+					case NDSColorFormat_BGR888_Rev:
+					{
+#ifdef ENABLE_SSE2
+						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
+						
+						const size_t ssePixCount = pixCount - (pixCount % 4); // the vector loop handles groups of 4 pixels; the scalar loop below finishes the rest
+						for (; i < ssePixCount; i += 4)
+						{
+							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((FragmentColor *)dst + i)); // aligned load/store: dst must be 16-byte aligned
+							dstColor_vec128 = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
+							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000)); // OR the top byte with 0x1F (BGR666) or 0xFF (BGR888)
+							_mm_store_si128((__m128i *)((FragmentColor *)dst + i), dstColor_vec128);
+						}
+#endif
+						
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+						for (; i < pixCount; i++)
+						{
+							((FragmentColor *)dst)[i] = this->_ColorEffectIncreaseBrightness<OUTPUTFORMAT>(((FragmentColor *)dst)[i], intensityClamped);
+							((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF;
+						}
+						break;
+					}
+						
+					default:
+						break;
+				}
+			}
+			else
+			{
+				// all white (optimization)
+				switch (OUTPUTFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+						memset_u16(dst, 0xFFFF, pixCount);
+						break;
+						
+					case NDSColorFormat_BGR666_Rev:
+						memset_u32(dst, 0x1F3F3F3F, pixCount);
+						break;
+						
+					case NDSColorFormat_BGR888_Rev:
+						memset_u32(dst, 0xFFFFFFFF, pixCount);
+						break;
+						
+					default:
+						break;
+				}
+			}
+			break;
+		}
+			
+		case GPUMasterBrightMode_Down:
+		{
+			if (!ISFULLINTENSITYHINT && !isFullIntensity) // partial intensity: adjust every pixel individually
+			{
+				size_t i = 0;
+				
+				switch (OUTPUTFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+					{
+#ifdef ENABLE_SSE2
+						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
+						
+						const size_t ssePixCount = pixCount - (pixCount % 8); // the vector loop handles groups of 8 pixels; the scalar loop below finishes the rest
+						for (; i < ssePixCount; i += 8)
+						{
+							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); // aligned load/store: dst must be 16-byte aligned
+							dstColor_vec128 = this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
+							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi16(0x8000)); // set bit 15 on every pixel
+							_mm_store_si128((__m128i *)((u16 *)dst + i), dstColor_vec128);
+						}
+#endif
+						
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+						for (; i < pixCount; i++)
+						{
+							((u16 *)dst)[i] = GPUEngineBase::_brightnessDownTable555[intensityClamped][ ((u16 *)dst)[i] & 0x7FFF ] | 0x8000; // table lookup on the low 15 bits, then set bit 15
+						}
+						break;
+					}
+						
+					case NDSColorFormat_BGR666_Rev:
+					case NDSColorFormat_BGR888_Rev:
+					{
+#ifdef ENABLE_SSE2
+						const __m128i intensity_vec128 = _mm_set1_epi16(intensityClamped);
+						
+						const size_t ssePixCount = pixCount - (pixCount % 4); // the vector loop handles groups of 4 pixels; the scalar loop below finishes the rest
+						for (; i < ssePixCount; i += 4)
+						{
+							__m128i dstColor_vec128 = _mm_load_si128((__m128i *)((FragmentColor *)dst + i)); // aligned load/store: dst must be 16-byte aligned
+							dstColor_vec128 = this->_ColorEffectDecreaseBrightness<OUTPUTFORMAT>(dstColor_vec128, intensity_vec128);
+							dstColor_vec128 = _mm_or_si128(dstColor_vec128, _mm_set1_epi32((OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F000000 : 0xFF000000)); // OR the top byte with 0x1F (BGR666) or 0xFF (BGR888)
+							_mm_store_si128((__m128i *)((FragmentColor *)dst + i), dstColor_vec128);
+						}
+#endif
+						
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+						for (; i < pixCount; i++)
+						{
+							((FragmentColor *)dst)[i] = this->_ColorEffectDecreaseBrightness(((FragmentColor *)dst)[i], intensityClamped);
+							((FragmentColor *)dst)[i].a = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) ? 0x1F : 0xFF;
+						}
+						break;
+					}
+						
+					default:
+						break;
+				}
+			}
+			else
+			{
+				// all black (optimization)
+				switch (OUTPUTFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+						memset_u16(dst, 0x8000, pixCount);
+						break;
+						
+					case NDSColorFormat_BGR666_Rev:
+						memset_u32(dst, 0x1F000000, pixCount);
+						break;
+						
+					case NDSColorFormat_BGR888_Rev:
+						memset_u32(dst, 0xFF000000, pixCount);
+						break;
+						
+					default:
+						break;
+				}
+			}
+			break;
+		}
+			
+		case GPUMasterBrightMode_Reserved: // handled like Disable: the pixels are left unchanged
+			break;
+	}
+}
+
+// Tells whether the scanline in compInfo falls inside the vertical range of window WIN_NUM.
+// Always false while that window is disabled.
+template <size_t WIN_NUM>
+bool GPUEngineBase::_IsWindowInsideVerticalRange(GPUEngineCompositorInfo &compInfo)
+{
+	const u16 top    = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0V.Top : this->_IORegisterMap->WIN1V.Top;
+	const u16 bottom = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0V.Bottom : this->_IORegisterMap->WIN1V.Bottom;
+	
+	const bool isDisabled = ((WIN_NUM == 0) && !compInfo.renderState.WIN0_ENABLED) ||
+	                        ((WIN_NUM == 1) && !compInfo.renderState.WIN1_ENABLED);
+	if (isDisabled)
+	{
+		return false;
+	}
+	
+	if (top > bottom)
+	{
+		// Top lies below bottom: only the lines strictly between bottom and top are outside.
+		return (compInfo.line.indexNative >= top) || (compInfo.line.indexNative <= bottom);
+	}
+	
+	// Ordinary range: top is inclusive, bottom is exclusive.
+	return (compInfo.line.indexNative >= top) && (compInfo.line.indexNative < bottom);
+}
+
+// Rebuilds the horizontal coverage mask of window WIN_NUM: one byte per native pixel, 1 where the
+// pixel is inside the window's left/right range and 0 elsewhere. Nothing happens (and the
+// pending-update flag stays as it was) while the window is disabled.
+template <size_t WIN_NUM>
+void GPUEngineBase::_UpdateWINH(GPUEngineCompositorInfo &compInfo)
+{
+	const bool isDisabled = ((WIN_NUM == 0) && !compInfo.renderState.WIN0_ENABLED) ||
+	                        ((WIN_NUM == 1) && !compInfo.renderState.WIN1_ENABLED);
+	if (isDisabled)
+	{
+		return;
+	}
+	
+	this->_needUpdateWINH[WIN_NUM] = false;
+	
+	const size_t left  = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0H.Left  : this->_IORegisterMap->WIN1H.Left;
+	const size_t right = (WIN_NUM == 0) ? this->_IORegisterMap->WIN0H.Right : this->_IORegisterMap->WIN1H.Right;
+	const bool wrapsAround = (left > right);
+	
+	// Fill the whole line with the default state first, then overwrite the exceptional span.
+	memset(this->_h_win[WIN_NUM], (wrapsAround) ? 1 : 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
+	if (wrapsAround)
+	{
+		// Inside everywhere except strictly between right and left: clear [right + 1, left).
+		memset(this->_h_win[WIN_NUM] + right + 1, 0, (left - (right + 1)) * sizeof(u8));
+	}
+	else
+	{
+		// Inside only from left (inclusive) up to right (exclusive): set [left, right).
+		memset(this->_h_win[WIN_NUM] + left, 1, (right - left) * sizeof(u8));
+	}
+}
+
+void GPUEngineBase::_PerformWindowTesting(GPUEngineCompositorInfo &compInfo)
+{ /* Fills, for every enabled layer from BG0 through OBJ, two per-pixel flag lines
+     at native width -- "did the pixel pass the window test" and "is the color
+     effect enabled for the pixel" -- and then expands both lines into the
+     custom-width buffers. */
+	if (this->_needUpdateWINH[0]) this->_UpdateWINH<0>(compInfo); // rebuild the window 0 horizontal table only when it is flagged dirty
+	if (this->_needUpdateWINH[1]) this->_UpdateWINH<1>(compInfo); // same for the window 1 horizontal table
+	
+	for (size_t layerID = GPULayerID_BG0; layerID <= GPULayerID_OBJ; layerID++)
+	{
+		if (!this->_enableLayer[layerID]) // disabled layers are skipped, so their flag lines are left as they were
+		{
+			continue;
+		}
+		
+#ifdef ENABLE_SSE2
+		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=16) // 16 one-byte flags are handled per 128-bit vector
+		{
+			__m128i win_vec128;
+			
+			__m128i didPassWindowTest = _mm_setzero_si128(); // accumulated result lanes, OR-ed together window by window
+			__m128i enableColorEffect = _mm_setzero_si128();
+			
+			__m128i win0HandledMask = _mm_setzero_si128(); // each mask ends up 0xFF in the byte lanes claimed by that window, 0x00 elsewhere
+			__m128i win1HandledMask = _mm_setzero_si128();
+			__m128i winOBJHandledMask = _mm_setzero_si128();
+			__m128i winOUTHandledMask = _mm_setzero_si128();
+			
+			// Window 0 has the highest priority, so always check this first.
+			if (compInfo.renderState.WIN0_ENABLED && this->_IsWindowInsideVerticalRange<0>(compInfo))
+			{
+				win_vec128 = _mm_load_si128((__m128i *)(this->_h_win[0] + i)); // aligned load: the address must be 16-byte aligned
+				win0HandledMask = _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)); // NOTE: this path matches table bytes equal to 1, while the scalar path below tests != 0
+				
+				didPassWindowTest = _mm_and_si128(win0HandledMask, compInfo.renderState.WIN0_enable_SSE2[layerID]);
+				enableColorEffect = _mm_and_si128(win0HandledMask, compInfo.renderState.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]);
+			}
+			
+			// Window 1 has medium priority, and is checked after Window 0.
+			if (compInfo.renderState.WIN1_ENABLED && this->_IsWindowInsideVerticalRange<1>(compInfo))
+			{
+				win_vec128 = _mm_load_si128((__m128i *)(this->_h_win[1] + i));
+				win1HandledMask = _mm_andnot_si128(win0HandledMask, _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1))); // drop the lanes window 0 already claimed
+				
+				didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(win1HandledMask, compInfo.renderState.WIN1_enable_SSE2[layerID]) );
+				enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(win1HandledMask, compInfo.renderState.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) );
+			}
+			
+			// Window OBJ has low priority, and is checked after both Window 0 and Window 1.
+			if (compInfo.renderState.WINOBJ_ENABLED)
+			{
+				win_vec128 = _mm_load_si128((__m128i *)(this->_sprWin + i));
+				winOBJHandledMask = _mm_andnot_si128( _mm_or_si128(win0HandledMask, win1HandledMask), _mm_cmpeq_epi8(win_vec128, _mm_set1_epi8(1)) ); // drop the lanes claimed by window 0 or window 1
+				
+				didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOBJHandledMask, compInfo.renderState.WINOBJ_enable_SSE2[layerID]) );
+				enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOBJHandledMask, compInfo.renderState.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) );
+			}
+			
+			// If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags.
+			// This has the lowest priority, and is always checked last.
+			winOUTHandledMask = _mm_xor_si128( _mm_or_si128(win0HandledMask, _mm_or_si128(win1HandledMask, winOBJHandledMask)), _mm_set1_epi32(0xFFFFFFFF) ); // XOR with all ones inverts the union of the three masks
+			didPassWindowTest = _mm_or_si128( didPassWindowTest, _mm_and_si128(winOUTHandledMask, compInfo.renderState.WINOUT_enable_SSE2[layerID]) );
+			enableColorEffect = _mm_or_si128( enableColorEffect, _mm_and_si128(winOUTHandledMask, compInfo.renderState.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG]) );
+			
+			_mm_store_si128((__m128i *)(this->_didPassWindowTestNative[layerID] + i), _mm_and_si128(didPassWindowTest, _mm_set1_epi8(0x01))); // keep only bit 0 of every lane, so a fully set lane is stored as 1; aligned store
+			_mm_store_si128((__m128i *)(this->_enableColorEffectNative[layerID] + i), _mm_and_si128(enableColorEffect, _mm_set1_epi8(0x01)));
+		}
+#else
+		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) // scalar path: one pixel at a time, the first matching window wins
+		{
+			// Window 0 has the highest priority, so always check this first.
+			if (compInfo.renderState.WIN0_ENABLED && this->_IsWindowInsideVerticalRange<0>(compInfo))
+			{
+				if (this->_h_win[0][i] != 0)
+				{
+					this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WIN0_enable[layerID];
+					this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WIN0_enable[WINDOWCONTROL_EFFECTFLAG];
+					continue; // pixel is settled; lower-priority windows are not consulted
+				}
+			}
+			
+			// Window 1 has medium priority, and is checked after Window 0.
+			if (compInfo.renderState.WIN1_ENABLED && this->_IsWindowInsideVerticalRange<1>(compInfo))
+			{
+				if (this->_h_win[1][i] != 0)
+				{
+					this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WIN1_enable[layerID];
+					this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WIN1_enable[WINDOWCONTROL_EFFECTFLAG];
+					continue;
+				}
+			}
+			
+			// Window OBJ has low priority, and is checked after both Window 0 and Window 1.
+			if (compInfo.renderState.WINOBJ_ENABLED)
+			{
+				if (this->_sprWin[i] != 0)
+				{
+					this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WINOBJ_enable[layerID];
+					this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG];
+					continue;
+				}
+			}
+			
+			// If the pixel isn't inside any windows, then the pixel is outside, and therefore uses the WINOUT flags.
+			// This has the lowest priority, and is always checked last.
+			this->_didPassWindowTestNative[layerID][i] = compInfo.renderState.WINOUT_enable[layerID];
+			this->_enableColorEffectNative[layerID][i] = compInfo.renderState.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG];
+		}
+#endif
+		if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 1)) // copy both native flag lines to the custom-width lines, using a dedicated CopyLineExpand instantiation for the 1x-4x widths
+		{
+			CopyLineExpand<1, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH);
+			CopyLineExpand<1, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH);
+		}
+		else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 2))
+		{
+			CopyLineExpand<2, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
+			CopyLineExpand<2, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 2);
+		}
+		else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 3))
+		{
+			CopyLineExpand<3, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
+			CopyLineExpand<3, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 3);
+		}
+		else if (compInfo.line.widthCustom == (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4))
+		{
+			CopyLineExpand<4, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
+			CopyLineExpand<4, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], GPU_FRAMEBUFFER_NATIVE_WIDTH * 4);
+		}
+		else if ((compInfo.line.widthCustom % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0) // any other exact multiple of the native width; 0xFFFF presumably selects a generic integer-scale path -- TODO confirm against CopyLineExpand
+		{
+			CopyLineExpand<0xFFFF, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], compInfo.line.widthCustom);
+			CopyLineExpand<0xFFFF, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], compInfo.line.widthCustom);
+		}
+		else // width is not a multiple of the native width; -1 presumably selects the arbitrary-scale path -- TODO confirm against CopyLineExpand
+		{
+			CopyLineExpand<-1, false, 1>(this->_didPassWindowTestCustom[layerID], this->_didPassWindowTestNative[layerID], compInfo.line.widthCustom);
+			CopyLineExpand<-1, false, 1>(this->_enableColorEffectCustom[layerID], this->_enableColorEffectNative[layerID], compInfo.line.widthCustom);
+		}
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_Final(GPUEngineCompositorInfo &compInfo)
+{
+	// Renders one line of the currently selected BG layer by dispatching on the
+	// layer's base type and, unless every pixel was already composited while it
+	// was being rendered, composites the gathered line afterwards.
+	
+	// Handed to _LineExtRot(); its value afterwards selects which deferred
+	// compositor is used below.
+	bool useCustomVRAM = false;
+	
+	if (WILLDEFERCOMPOSITING)
+	{
+		// A layer is not guaranteed to write every pixel of the line. Clearing the
+		// deferred index buffer lets the unwritten pixels fail properly inside
+		// _CompositeLineDeferred(); otherwise indices left behind by a previously
+		// rendered layer could be picked up by mistake.
+		//
+		// Test case: without this reset, BG2 takes over the score screen in
+		// Sonic Rush and fills it with blue.
+		memset(this->_deferredIndexNative, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u8));
+	}
+	
+	switch (compInfo.renderState.selectedBGLayer->baseType)
+	{
+		case BGType_Text:
+			this->_LineText<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo);
+			break;
+			
+		case BGType_Affine:
+			this->_LineRot<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo);
+			break;
+			
+		// Both of these layer types go through the same renderer.
+		case BGType_AffineExt:
+		case BGType_Large8bpp:
+			this->_LineExtRot<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, WILLDEFERCOMPOSITING>(compInfo, useCustomVRAM);
+			break;
+			
+		case BGType_Invalid:
+			PROGINFO("Attempting to render an invalid BG type\n");
+			break;
+			
+		default:
+			break;
+	}
+	
+	// The debug compositor never runs the line compositing step.
+	if (COMPOSITORMODE == GPUCompositorMode_Debug)
+	{
+		return;
+	}
+	
+	// At the native size each pixel is composited immediately. At a custom size,
+	// gathering and compositing are separate steps, so the whole line has to be
+	// composited now. The operands stay in their original short-circuit order.
+	const bool mustCompositeLine = WILLDEFERCOMPOSITING ||
+	                               !this->isLineRenderNative[compInfo.line.indexNative] ||
+	                               (useCustomVRAM && (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) && !GPU->GetDisplayInfo().isCustomSizeRequested);
+	if (!mustCompositeLine)
+	{
+		return;
+	}
+	
+	if (!useCustomVRAM)
+	{
+		this->_CompositeLineDeferred<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo);
+	}
+	else
+	{
+		this->_CompositeVRAMLineDeferred<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo);
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST>
+FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG_ApplyMosaic(GPUEngineCompositorInfo &compInfo)
+{
+	// Turns the run-time "is this line rendered natively" state into the
+	// compile-time WILLDEFERCOMPOSITING argument of _RenderLine_LayerBG_Final():
+	// lines that are not rendered natively defer their compositing.
+	const bool deferCompositing = !this->isLineRenderNative[compInfo.line.indexNative];
+	
+	if (deferCompositing)
+	{
+		this->_RenderLine_LayerBG_Final<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, true>(compInfo);
+	}
+	else
+	{
+		this->_RenderLine_LayerBG_Final<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST, false>(compInfo);
+	}
+}
+
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
+FORCEINLINE void GPUEngineBase::_RenderLine_LayerBG(GPUEngineCompositorInfo &compInfo)
+{
+	// Picks the MOSAIC template argument for the rest of the BG line pipeline.
+	// The mosaic instantiation is used only when the selected layer has mosaic
+	// turned on and a BG mosaic size is set. Builds that define DISABLE_MOSAIC
+	// always take the non-mosaic instantiation.
+#ifndef DISABLE_MOSAIC
+	const bool useMosaic = compInfo.renderState.selectedBGLayer->isMosaic && compInfo.renderState.isBGMosaicSet;
+	if (useMosaic)
+	{
+		this->_RenderLine_LayerBG_ApplyMosaic<COMPOSITORMODE, OUTPUTFORMAT, true, WILLPERFORMWINDOWTEST>(compInfo);
+		return;
+	}
+#endif
+	
+	this->_RenderLine_LayerBG_ApplyMosaic<COMPOSITORMODE, OUTPUTFORMAT, false, WILLPERFORMWINDOWTEST>(compInfo);
+}
+
+void GPUEngineBase::RenderLayerBG(const GPULayerID layerID, u16 *dstColorBuffer)
+{
+	// Renders the whole BG layer `layerID`, at the layer's own width and height,
+	// into dstColorBuffer using the debug compositor with 15-bit color output.
+	// Mosaic, window testing and deferred compositing are all switched off.
+	// dstColorBuffer is written at offsets up to (layer width * layer height).
+	GPUEngineCompositorInfo compInfo;
+	memset(&compInfo, 0, sizeof(compInfo));
+	
+	// Render state: no color effect and no master brightness; the sprite
+	// settings are taken over from the engine's current render state.
+	compInfo.renderState.selectedLayerID = layerID;
+	compInfo.renderState.selectedBGLayer = &this->_BGLayer[layerID];
+	compInfo.renderState.displayOutputMode = GPUDisplayMode_Normal;
+	compInfo.renderState.colorEffect = ColorEffect_Disable;
+	compInfo.renderState.masterBrightnessMode = GPUMasterBrightMode_Disable;
+	compInfo.renderState.masterBrightnessIsFullIntensity = false;
+	compInfo.renderState.masterBrightnessIsMaxOrMin = true;
+	compInfo.renderState.spriteRenderMode = this->_currentRenderState.spriteRenderMode;
+	compInfo.renderState.spriteBoundary = this->_currentRenderState.spriteBoundary;
+	compInfo.renderState.spriteBMPBoundary = this->_currentRenderState.spriteBMPBoundary;
+	
+	// The "custom" line geometry is simply the layer's own size.
+	const size_t bgWidth = compInfo.renderState.selectedBGLayer->size.width;
+	const size_t bgHeight = compInfo.renderState.selectedBGLayer->size.height;
+	compInfo.line.widthCustom = bgWidth;
+	compInfo.line.renderCount = 1;
+	
+	// No layer ID buffers are used here.
+	compInfo.target.lineLayerIDHead = NULL;
+	compInfo.target.lineLayerIDHeadNative = NULL;
+	compInfo.target.lineLayerIDHeadCustom = NULL;
+	compInfo.target.lineLayerID = NULL;
+	
+	compInfo.target.xNative = 0;
+	compInfo.target.xCustom = compInfo.target.xNative;
+	compInfo.target.lineColor = (void **)&compInfo.target.lineColor16;
+	
+	// lineColorHeadNative is still zero-filled from the memset above, so both
+	// of these start out from that zeroed value.
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHeadNative;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHeadNative;
+	
+	for (size_t y = 0; y < bgHeight; y++)
+	{
+		const size_t rowOffset = y * bgWidth;
+		
+		compInfo.line.indexNative = y;
+		compInfo.line.indexCustom = compInfo.line.indexNative;
+		compInfo.line.pixelCount = bgWidth;
+		compInfo.line.blockOffsetNative = rowOffset;
+		compInfo.line.blockOffsetCustom = compInfo.line.blockOffsetNative;
+		
+		// All three line heads point at this row of the destination buffer.
+		compInfo.target.lineColorHead = dstColorBuffer + compInfo.line.blockOffsetNative;
+		compInfo.target.lineColorHeadNative = compInfo.target.lineColorHead;
+		compInfo.target.lineColorHeadCustom = compInfo.target.lineColorHeadNative;
+		
+		this->_RenderLine_LayerBG_Final<GPUCompositorMode_Debug, NDSColorFormat_BGR555_Rev, false, false, false>(compInfo);
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::_HandleDisplayModeOff(const size_t l)
+{
+	// Display mode "off": native line `l` is cleared to white. Only the native
+	// buffer is touched. Output formats other than the three handled here leave
+	// the line as it is.
+	const size_t lineOffset = l * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u16 *)this->nativeBuffer + lineOffset, 0xFFFF);
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + lineOffset, 0x1F3F3F3F);
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		memset_u32_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>((u32 *)this->nativeBuffer + lineOffset, 0xFFFFFFFF);
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::_HandleDisplayModeNormal(const size_t l)
+{
+	// Normal display mode: a line that was rendered natively needs no further
+	// bookkeeping.
+	if (this->isLineRenderNative[l])
+	{
+		return;
+	}
+	
+	// The line was not rendered natively, so it is no longer output natively;
+	// the count of natively output lines drops by one.
+	this->isLineOutputNative[l] = false;
+	this->nativeLineOutputCount--;
+}
+
+template <size_t WINNUM>
+void GPUEngineBase::ParseReg_WINnH()
+{
+	// Only flags window WINNUM's horizontal table as stale. The table itself is
+	// rebuilt later, by _PerformWindowTesting(), when the flag is found set.
+	_needUpdateWINH[WINNUM] = true;
+}
+
+void GPUEngineBase::ParseReg_WININ()
+{
+	// Caches the WIN0IN and WIN1IN register bits in the current render state:
+	// one enable flag per layer plus the color effect flag, for each window.
+	const GPU_IOREG *ioReg = this->_IORegisterMap;
+	GPUEngineRenderState &state = this->_currentRenderState;
+	
+	// Window 0
+	state.WIN0_enable[GPULayerID_BG0] = ioReg->WIN0IN.BG0_Enable;
+	state.WIN0_enable[GPULayerID_BG1] = ioReg->WIN0IN.BG1_Enable;
+	state.WIN0_enable[GPULayerID_BG2] = ioReg->WIN0IN.BG2_Enable;
+	state.WIN0_enable[GPULayerID_BG3] = ioReg->WIN0IN.BG3_Enable;
+	state.WIN0_enable[GPULayerID_OBJ] = ioReg->WIN0IN.OBJ_Enable;
+	state.WIN0_enable[WINDOWCONTROL_EFFECTFLAG] = ioReg->WIN0IN.Effect_Enable;
+	
+	// Window 1
+	state.WIN1_enable[GPULayerID_BG0] = ioReg->WIN1IN.BG0_Enable;
+	state.WIN1_enable[GPULayerID_BG1] = ioReg->WIN1IN.BG1_Enable;
+	state.WIN1_enable[GPULayerID_BG2] = ioReg->WIN1IN.BG2_Enable;
+	state.WIN1_enable[GPULayerID_BG3] = ioReg->WIN1IN.BG3_Enable;
+	state.WIN1_enable[GPULayerID_OBJ] = ioReg->WIN1IN.OBJ_Enable;
+	state.WIN1_enable[WINDOWCONTROL_EFFECTFLAG] = ioReg->WIN1IN.Effect_Enable;
+	
+#if defined(ENABLE_SSE2)
+	// Vector copies of the same flags: every byte lane is 0xFF when the bit is
+	// set and 0x00 when it is clear.
+	const __m128i lanesSet = _mm_set1_epi8(0xFF);
+	const __m128i lanesClear = _mm_set1_epi8(0x00);
+	
+	state.WIN0_enable_SSE2[GPULayerID_BG0] = (ioReg->WIN0IN.BG0_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN0_enable_SSE2[GPULayerID_BG1] = (ioReg->WIN0IN.BG1_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN0_enable_SSE2[GPULayerID_BG2] = (ioReg->WIN0IN.BG2_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN0_enable_SSE2[GPULayerID_BG3] = (ioReg->WIN0IN.BG3_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN0_enable_SSE2[GPULayerID_OBJ] = (ioReg->WIN0IN.OBJ_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN0_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = (ioReg->WIN0IN.Effect_Enable != 0) ? lanesSet : lanesClear;
+	
+	state.WIN1_enable_SSE2[GPULayerID_BG0] = (ioReg->WIN1IN.BG0_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN1_enable_SSE2[GPULayerID_BG1] = (ioReg->WIN1IN.BG1_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN1_enable_SSE2[GPULayerID_BG2] = (ioReg->WIN1IN.BG2_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN1_enable_SSE2[GPULayerID_BG3] = (ioReg->WIN1IN.BG3_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN1_enable_SSE2[GPULayerID_OBJ] = (ioReg->WIN1IN.OBJ_Enable != 0) ? lanesSet : lanesClear;
+	state.WIN1_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = (ioReg->WIN1IN.Effect_Enable != 0) ? lanesSet : lanesClear;
+#endif
+}
+
+void GPUEngineBase::ParseReg_WINOUT()
+{
+	// Caches the WINOUT and WINOBJ register bits in the current render state:
+	// one enable flag per layer plus the color effect flag, for the outside
+	// region and for the OBJ window.
+	const GPU_IOREG *ioReg = this->_IORegisterMap;
+	GPUEngineRenderState &state = this->_currentRenderState;
+	
+	// Outside of all windows
+	state.WINOUT_enable[GPULayerID_BG0] = ioReg->WINOUT.BG0_Enable;
+	state.WINOUT_enable[GPULayerID_BG1] = ioReg->WINOUT.BG1_Enable;
+	state.WINOUT_enable[GPULayerID_BG2] = ioReg->WINOUT.BG2_Enable;
+	state.WINOUT_enable[GPULayerID_BG3] = ioReg->WINOUT.BG3_Enable;
+	state.WINOUT_enable[GPULayerID_OBJ] = ioReg->WINOUT.OBJ_Enable;
+	state.WINOUT_enable[WINDOWCONTROL_EFFECTFLAG] = ioReg->WINOUT.Effect_Enable;
+	
+	// OBJ window
+	state.WINOBJ_enable[GPULayerID_BG0] = ioReg->WINOBJ.BG0_Enable;
+	state.WINOBJ_enable[GPULayerID_BG1] = ioReg->WINOBJ.BG1_Enable;
+	state.WINOBJ_enable[GPULayerID_BG2] = ioReg->WINOBJ.BG2_Enable;
+	state.WINOBJ_enable[GPULayerID_BG3] = ioReg->WINOBJ.BG3_Enable;
+	state.WINOBJ_enable[GPULayerID_OBJ] = ioReg->WINOBJ.OBJ_Enable;
+	state.WINOBJ_enable[WINDOWCONTROL_EFFECTFLAG] = ioReg->WINOBJ.Effect_Enable;
+	
+#if defined(ENABLE_SSE2)
+	// Vector copies of the same flags: every byte lane is 0xFF when the bit is
+	// set and 0x00 when it is clear.
+	const __m128i lanesSet = _mm_set1_epi8(0xFF);
+	const __m128i lanesClear = _mm_set1_epi8(0x00);
+	
+	state.WINOUT_enable_SSE2[GPULayerID_BG0] = (ioReg->WINOUT.BG0_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOUT_enable_SSE2[GPULayerID_BG1] = (ioReg->WINOUT.BG1_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOUT_enable_SSE2[GPULayerID_BG2] = (ioReg->WINOUT.BG2_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOUT_enable_SSE2[GPULayerID_BG3] = (ioReg->WINOUT.BG3_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOUT_enable_SSE2[GPULayerID_OBJ] = (ioReg->WINOUT.OBJ_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOUT_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = (ioReg->WINOUT.Effect_Enable != 0) ? lanesSet : lanesClear;
+	
+	state.WINOBJ_enable_SSE2[GPULayerID_BG0] = (ioReg->WINOBJ.BG0_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOBJ_enable_SSE2[GPULayerID_BG1] = (ioReg->WINOBJ.BG1_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOBJ_enable_SSE2[GPULayerID_BG2] = (ioReg->WINOBJ.BG2_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOBJ_enable_SSE2[GPULayerID_BG3] = (ioReg->WINOBJ.BG3_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOBJ_enable_SSE2[GPULayerID_OBJ] = (ioReg->WINOBJ.OBJ_Enable != 0) ? lanesSet : lanesClear;
+	state.WINOBJ_enable_SSE2[WINDOWCONTROL_EFFECTFLAG] = (ioReg->WINOBJ.Effect_Enable != 0) ? lanesSet : lanesClear;
+#endif
+}
+
+void GPUEngineBase::ParseReg_MOSAIC()
+{
+	// Caches the MOSAIC register in the current render state: the four size
+	// fields are looked up in the mosaic table, and for BG and OBJ a flag
+	// records whether either of their size fields is nonzero.
+	const GPU_IOREG *ioReg = this->_IORegisterMap;
+	GPUEngineRenderState &state = this->_currentRenderState;
+	
+	state.isBGMosaicSet = !((ioReg->MOSAIC.BG_MosaicH == 0) && (ioReg->MOSAIC.BG_MosaicV == 0));
+	state.isOBJMosaicSet = !((ioReg->MOSAIC.OBJ_MosaicH == 0) && (ioReg->MOSAIC.OBJ_MosaicV == 0));
+	
+	state.mosaicWidthBG = this->_mosaicLookup.table[ioReg->MOSAIC.BG_MosaicH];
+	state.mosaicHeightBG = this->_mosaicLookup.table[ioReg->MOSAIC.BG_MosaicV];
+	state.mosaicWidthOBJ = this->_mosaicLookup.table[ioReg->MOSAIC.OBJ_MosaicH];
+	state.mosaicHeightOBJ = this->_mosaicLookup.table[ioReg->MOSAIC.OBJ_MosaicV];
+}
+
+void GPUEngineBase::ParseReg_BLDCNT()
+{
+	// Caches the BLDCNT register in the current render state: the selected color
+	// effect, the per-layer first-target (source) and second-target
+	// (destination) blend flags, and a summary flag telling whether any layer is
+	// a second target.
+	const IOREG_BLDCNT &bldcnt = this->_IORegisterMap->BLDCNT;
+	GPUEngineRenderState &state = this->_currentRenderState;
+	
+	state.colorEffect = static_cast<ColorEffect>(bldcnt.ColorEffect);
+	
+	// First-target (source) flags
+	state.srcBlendEnable[GPULayerID_BG0] = (bldcnt.BG0_Target1 != 0);
+	state.srcBlendEnable[GPULayerID_BG1] = (bldcnt.BG1_Target1 != 0);
+	state.srcBlendEnable[GPULayerID_BG2] = (bldcnt.BG2_Target1 != 0);
+	state.srcBlendEnable[GPULayerID_BG3] = (bldcnt.BG3_Target1 != 0);
+	state.srcBlendEnable[GPULayerID_OBJ] = (bldcnt.OBJ_Target1 != 0);
+	state.srcBlendEnable[GPULayerID_Backdrop] = (bldcnt.Backdrop_Target1 != 0);
+	
+	// Second-target (destination) flags
+	const bool dstBG0 = (bldcnt.BG0_Target2 != 0);
+	const bool dstBG1 = (bldcnt.BG1_Target2 != 0);
+	const bool dstBG2 = (bldcnt.BG2_Target2 != 0);
+	const bool dstBG3 = (bldcnt.BG3_Target2 != 0);
+	const bool dstOBJ = (bldcnt.OBJ_Target2 != 0);
+	const bool dstBackdrop = (bldcnt.Backdrop_Target2 != 0);
+	
+	state.dstBlendEnable[GPULayerID_BG0] = dstBG0;
+	state.dstBlendEnable[GPULayerID_BG1] = dstBG1;
+	state.dstBlendEnable[GPULayerID_BG2] = dstBG2;
+	state.dstBlendEnable[GPULayerID_BG3] = dstBG3;
+	state.dstBlendEnable[GPULayerID_OBJ] = dstOBJ;
+	state.dstBlendEnable[GPULayerID_Backdrop] = dstBackdrop;
+	
+	state.dstAnyBlendEnable = dstBG0 || dstBG1 || dstBG2 || dstBG3 || dstOBJ || dstBackdrop;
+	
+#ifdef ENABLE_SSE2
+	// Comparing the broadcast bit against 1 gives 0xFF in every byte lane when
+	// the bit equals 1 and 0x00 otherwise.
+	const __m128i allOnes8 = _mm_set1_epi8(1);
+	
+	state.srcBlendEnable_SSE2[GPULayerID_BG0] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG0_Target1), allOnes8);
+	state.srcBlendEnable_SSE2[GPULayerID_BG1] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG1_Target1), allOnes8);
+	state.srcBlendEnable_SSE2[GPULayerID_BG2] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG2_Target1), allOnes8);
+	state.srcBlendEnable_SSE2[GPULayerID_BG3] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG3_Target1), allOnes8);
+	state.srcBlendEnable_SSE2[GPULayerID_OBJ] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.OBJ_Target1), allOnes8);
+	state.srcBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.Backdrop_Target1), allOnes8);
+	
+#ifdef ENABLE_SSSE3
+	// One vector holding the six second-target bits in its lowest six byte
+	// lanes, BG0 in lane 0 up to the backdrop in lane 5 (_mm_set_epi8 lists the
+	// highest lane first).
+	state.dstBlendEnable_SSSE3 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	                                          bldcnt.Backdrop_Target2,
+	                                          bldcnt.OBJ_Target2,
+	                                          bldcnt.BG3_Target2,
+	                                          bldcnt.BG2_Target2,
+	                                          bldcnt.BG1_Target2,
+	                                          bldcnt.BG0_Target2);
+#else
+	state.dstBlendEnable_SSE2[GPULayerID_BG0] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG0_Target2), allOnes8);
+	state.dstBlendEnable_SSE2[GPULayerID_BG1] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG1_Target2), allOnes8);
+	state.dstBlendEnable_SSE2[GPULayerID_BG2] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG2_Target2), allOnes8);
+	state.dstBlendEnable_SSE2[GPULayerID_BG3] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.BG3_Target2), allOnes8);
+	state.dstBlendEnable_SSE2[GPULayerID_OBJ] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.OBJ_Target2), allOnes8);
+	state.dstBlendEnable_SSE2[GPULayerID_Backdrop] = _mm_cmpeq_epi8(_mm_set1_epi8(bldcnt.Backdrop_Target2), allOnes8);
+#endif
+	
+#endif // ENABLE_SSE2
+}
+
+void GPUEngineBase::ParseReg_BLDALPHA()
+{
+	// Caches the BLDALPHA register in the current render state. Both blend
+	// coefficients are capped at 16, and the 15-bit blend table is pointed at
+	// the slice belonging to that coefficient pair.
+	const IOREG_BLDALPHA &bldalpha = this->_IORegisterMap->BLDALPHA;
+	GPUEngineRenderState &state = this->_currentRenderState;
+	
+	state.blendEVA = (bldalpha.EVA < 16) ? bldalpha.EVA : 16;
+	state.blendEVB = (bldalpha.EVB < 16) ? bldalpha.EVB : 16;
+	
+	state.blendTable555 = (TBlendTable *)&GPUEngineBase::_blendTable555[state.blendEVA][state.blendEVB][0][0];
+}
+
+void GPUEngineBase::ParseReg_BLDY()
+{
+	// Caches the BLDY register in the current render state. The brightness
+	// coefficient is capped at 16, and each of the six brightness tables is
+	// pointed at the row belonging to that coefficient.
+	const IOREG_BLDY &bldy = this->_IORegisterMap->BLDY;
+	GPUEngineRenderState &state = this->_currentRenderState;
+	
+	state.blendEVY = (bldy.EVY < 16) ? bldy.EVY : 16;
+	
+	// Each table row decays to a pointer to its first entry.
+	state.brightnessUpTable555 = GPUEngineBase::_brightnessUpTable555[state.blendEVY];
+	state.brightnessUpTable666 = GPUEngineBase::_brightnessUpTable666[state.blendEVY];
+	state.brightnessUpTable888 = GPUEngineBase::_brightnessUpTable888[state.blendEVY];
+	
+	state.brightnessDownTable555 = GPUEngineBase::_brightnessDownTable555[state.blendEVY];
+	state.brightnessDownTable666 = GPUEngineBase::_brightnessDownTable666[state.blendEVY];
+	state.brightnessDownTable888 = GPUEngineBase::_brightnessDownTable888[state.blendEVY];
+}
+
+const BGLayerInfo& GPUEngineBase::GetBGLayerInfoByID(const GPULayerID layerID)
+{
+	// Read-only access to this engine's info record for the given layer.
+	// layerID is used directly as the array index, without a range check.
+	const BGLayerInfo &layerInfo = this->_BGLayer[layerID];
+	return layerInfo;
+}
+
+NDSDisplayID GPUEngineBase::GetDisplayByID() const
+{
+	// The display this engine currently targets.
+	const NDSDisplayID targetDisplay = this->_targetDisplayID;
+	return targetDisplay;
+}
+
+void GPUEngineBase::SetDisplayByID(const NDSDisplayID theDisplayID)
+{
+	// Retargets this engine to the given display and repoints its native and
+	// custom buffers into the master buffers. The main display uses the start of
+	// each master buffer; any other display uses the region one framebuffer
+	// further in.
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	this->_targetDisplayID = theDisplayID;
+	
+	const size_t nativeBytes = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * dispInfo.pixelBytes;
+	const size_t customBytes = dispInfo.customWidth * dispInfo.customHeight * dispInfo.pixelBytes;
+	
+	if (theDisplayID == NDSDisplayID_Main)
+	{
+		this->nativeBuffer = dispInfo.masterNativeBuffer;
+		this->customBuffer = dispInfo.masterCustomBuffer;
+	}
+	else
+	{
+		this->nativeBuffer = (u8 *)dispInfo.masterNativeBuffer + nativeBytes;
+		this->customBuffer = (u8 *)dispInfo.masterCustomBuffer + customBytes;
+	}
+}
+
+GPUEngineID GPUEngineBase::GetEngineID() const
+{
+	// The ID of this engine.
+	const GPUEngineID engineID = this->_engineID;
+	return engineID;
+}
+
+void GPUEngineBase::SetCustomFramebufferSize(size_t w, size_t h)
+{
+	// Reallocates every working buffer whose size depends on the custom line
+	// width, repoints the engine's framebuffer pointers, resets the rendered
+	// output to the native buffer, and recomputes the per-line geometry for all
+	// native lines. The old buffers are released at the very end, once every
+	// member already refers to new storage.
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	
+	// Keep hold of the outgoing allocations so they can be freed at the end.
+	void *const staleLineColor = this->_internalRenderLineTargetCustom;
+	u8 *const staleLineLayerID = this->_renderLineLayerIDCustom;
+	u8 *const staleDeferredIndex = this->_deferredIndexCustom;
+	u16 *const staleDeferredColor = this->_deferredColorCustom;
+	u8 *const staleWindowFlags = this->_didPassWindowTestCustomMasterPtr;
+	
+	// Fresh allocations, made in the same order as before.
+	void *const freshLineColor = malloc_alignedCacheLine(w * _gpuLargestDstLineCount * dispInfo.pixelBytes);
+	u8 *const freshLineLayerID = (u8 *)malloc_alignedCacheLine(w * _gpuLargestDstLineCount * 4 * sizeof(u8)); // deliberately oversized: the map debug tools try to write to it
+	u8 *const freshDeferredIndex = (u8 *)malloc_alignedCacheLine(w * sizeof(u8));
+	u16 *const freshDeferredColor = (u16 *)malloc_alignedCacheLine(w * sizeof(u16));
+	u8 *const freshWindowFlags = (u8 *)malloc_alignedCacheLine(w * 10 * sizeof(u8)); // ten lines of w bytes: five window test lines, then five color effect lines
+	
+	this->_internalRenderLineTargetCustom = freshLineColor;
+	this->_renderLineLayerIDCustom = freshLineLayerID;
+	this->_deferredIndexCustom = freshDeferredIndex;
+	this->_deferredColorCustom = freshDeferredColor;
+	
+	// Framebuffer pointers: the main display uses the start of each master
+	// buffer, any other display the region one framebuffer further in.
+	const size_t nativeBytes = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * dispInfo.pixelBytes;
+	const size_t customBytes = w * h * dispInfo.pixelBytes;
+	
+	if (this->_targetDisplayID == NDSDisplayID_Main)
+	{
+		this->nativeBuffer = dispInfo.masterNativeBuffer;
+		this->customBuffer = dispInfo.masterCustomBuffer;
+	}
+	else
+	{
+		this->nativeBuffer = (u8 *)dispInfo.masterNativeBuffer + nativeBytes;
+		this->customBuffer = (u8 *)dispInfo.masterCustomBuffer + customBytes;
+	}
+	
+	// The rendered output starts out as the native buffer at the native size.
+	this->renderedBuffer = this->nativeBuffer;
+	this->renderedWidth  = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	this->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	// Carve the flag allocation into per-layer lines of w bytes each.
+	const size_t flagLineBytes = w * sizeof(u8);
+	
+	this->_didPassWindowTestCustomMasterPtr = freshWindowFlags;
+	this->_didPassWindowTestCustom[GPULayerID_BG0] = this->_didPassWindowTestCustomMasterPtr + (0 * flagLineBytes);
+	this->_didPassWindowTestCustom[GPULayerID_BG1] = this->_didPassWindowTestCustomMasterPtr + (1 * flagLineBytes);
+	this->_didPassWindowTestCustom[GPULayerID_BG2] = this->_didPassWindowTestCustomMasterPtr + (2 * flagLineBytes);
+	this->_didPassWindowTestCustom[GPULayerID_BG3] = this->_didPassWindowTestCustomMasterPtr + (3 * flagLineBytes);
+	this->_didPassWindowTestCustom[GPULayerID_OBJ] = this->_didPassWindowTestCustomMasterPtr + (4 * flagLineBytes);
+	
+	this->_enableColorEffectCustomMasterPtr = freshWindowFlags + (w * 5 * sizeof(u8));
+	this->_enableColorEffectCustom[GPULayerID_BG0] = this->_enableColorEffectCustomMasterPtr + (0 * flagLineBytes);
+	this->_enableColorEffectCustom[GPULayerID_BG1] = this->_enableColorEffectCustomMasterPtr + (1 * flagLineBytes);
+	this->_enableColorEffectCustom[GPULayerID_BG2] = this->_enableColorEffectCustomMasterPtr + (2 * flagLineBytes);
+	this->_enableColorEffectCustom[GPULayerID_BG3] = this->_enableColorEffectCustomMasterPtr + (3 * flagLineBytes);
+	this->_enableColorEffectCustom[GPULayerID_OBJ] = this->_enableColorEffectCustomMasterPtr + (4 * flagLineBytes);
+	
+	// Mark both horizontal window tables as needing a rebuild.
+	this->_needUpdateWINH[0] = true;
+	this->_needUpdateWINH[1] = true;
+	
+	// Recompute the geometry of every native line and select which color
+	// pointer each line's target uses.
+	for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+	{
+		GPUEngineLineInfo &lineInfo = this->_currentCompositorInfo[y].line;
+		
+		lineInfo.indexNative = y;
+		lineInfo.indexCustom = _gpuDstLineIndex[lineInfo.indexNative];
+		lineInfo.widthCustom = dispInfo.customWidth;
+		lineInfo.renderCount = _gpuDstLineCount[lineInfo.indexNative];
+		lineInfo.pixelCount = lineInfo.widthCustom * lineInfo.renderCount;
+		lineInfo.blockOffsetNative = lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		lineInfo.blockOffsetCustom = lineInfo.indexCustom * lineInfo.widthCustom;
+		
+		if (dispInfo.colorFormat == NDSColorFormat_BGR555_Rev)
+		{
+			this->_currentCompositorInfo[y].target.lineColor = (void **)&this->_currentCompositorInfo[y].target.lineColor16;
+		}
+		else
+		{
+			this->_currentCompositorInfo[y].target.lineColor = (void **)&this->_currentCompositorInfo[y].target.lineColor32;
+		}
+	}
+	
+	free_aligned(staleLineColor);
+	free_aligned(staleLineLayerID);
+	free_aligned(staleDeferredIndex);
+	free_aligned(staleDeferredColor);
+	free_aligned(staleWindowFlags);
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineBase::ResolveCustomRendering()
+{
+	// Makes the custom buffer the rendered output whenever at least one line was
+	// not output natively. Lines still flagged as natively output are first
+	// copied from the native buffer into the custom buffer.
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	
+	// Every line is native: the rendered output stays as it is.
+	if (this->nativeLineOutputCount == GPU_FRAMEBUFFER_NATIVE_HEIGHT)
+	{
+		return;
+	}
+	
+	if (this->nativeLineOutputCount != 0)
+	{
+		// Resolve the remaining native lines into the custom buffer. The last
+		// _LineCopy argument is 2 for 15-bit output and 4 for the other formats.
+		for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+		{
+			if (!this->isLineOutputNative[y])
+			{
+				continue;
+			}
+			
+			if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+			{
+				this->_LineCopy<0xFFFF, true, false, 2>(this->customBuffer, this->nativeBuffer, y);
+			}
+			else
+			{
+				this->_LineCopy<0xFFFF, true, false, 4>(this->customBuffer, this->nativeBuffer, y);
+			}
+			
+			this->isLineOutputNative[y] = false;
+		}
+		
+		this->nativeLineOutputCount = 0;
+	}
+	
+	this->renderedWidth = dispInfo.customWidth;
+	this->renderedHeight = dispInfo.customHeight;
+	this->renderedBuffer = this->customBuffer;
+}
+
+void GPUEngineBase::ResolveToCustomFramebuffer(NDSDisplayInfo &mutableInfo)
+{
+	// Copies this engine's target display from its native buffer into its custom
+	// buffer inside mutableInfo, unless a custom render was already performed
+	// for that display, and then records the custom render as done.
+	if (mutableInfo.didPerformCustomRender[this->_targetDisplayID])
+	{
+		return;
+	}
+	
+	if (!mutableInfo.isCustomSizeRequested)
+	{
+		// No custom size requested: a plain copy of one native framebuffer.
+		memcpy(mutableInfo.customBuffer[this->_targetDisplayID], mutableInfo.nativeBuffer[this->_targetDisplayID], GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * mutableInfo.pixelBytes);
+	}
+	else
+	{
+		// Custom size requested: copy line by line through _LineCopy, picking the
+		// instantiation that matches the pixel size. Other pixel sizes copy
+		// nothing.
+		switch (mutableInfo.pixelBytes)
+		{
+			case 2:
+				for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+				{
+					this->_LineCopy<0xFFFF, true, false, 2>(mutableInfo.customBuffer[this->_targetDisplayID], mutableInfo.nativeBuffer[this->_targetDisplayID], y);
+				}
+				break;
+				
+			case 4:
+				for (size_t y = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+				{
+					this->_LineCopy<0xFFFF, true, false, 4>(mutableInfo.customBuffer[this->_targetDisplayID], mutableInfo.nativeBuffer[this->_targetDisplayID], y);
+				}
+				break;
+				
+			default:
+				break;
+		}
+	}
+	
+	mutableInfo.didPerformCustomRender[this->_targetDisplayID] = true;
+}
+
+// Reloads the affine start position registers (BG2X/BG2Y/BG3X/BG3Y) in the I/O
+// register map from the values saved in this engine.
+void GPUEngineBase::RefreshAffineStartRegs()
+{
+	// This behavior is speculative. The idea:
+	// whenever the user updates the affine start position regs, the new value goes
+	// into the active regs immediately (handled on the set event from the MMU).
+	// Maybe it should not take effect until the next hblank or something similar.
+	// The current logic is based on a combination of:
+	//   - Heroes of Mana intro FMV
+	//   - SPP level 3-8 rotoscale room
+	//   - NSMB raster fx backdrops
+	//   - Bubble Bobble Revolution classic mode
+	// NOTE:
+	// The original author is REALLY unsatisfied with this logic, but it seems to be working.
+	
+	GPU_IOREG &ioRegs = *this->_IORegisterMap;
+	
+	ioRegs.BG2X = this->savedBG2X;
+	ioRegs.BG2Y = this->savedBG2Y;
+	ioRegs.BG3X = this->savedBG3X;
+	ioRegs.BG3Y = this->savedBG3Y;
+}
+
+// Debug helper that prints the address of this engine's I/O register map followed
+// by the byte offset of selected registers inside GPU_IOREG, so the structure
+// packing can be checked by eye.
+// (Original note: normally should have same addresses.)
+void GPUEngineBase::REG_DISPx_pack_test()
+{
+	const GPU_IOREG *r = this->_IORegisterMap;
+	
+	// uintptr_t is not 'unsigned long' on every platform (for example, it is wider
+	// than long on 64-bit Windows), so pairing it with "%lx" is a format/argument
+	// mismatch. Widen explicitly to unsigned long long and print with "%llx".
+	printf("%08llx %02x\n", (unsigned long long)(uintptr_t)r, (u32)((uintptr_t)(&r->DISPCNT) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISPSTAT) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->VCOUNT) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->BGnCNT[GPULayerID_BG0]) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->BGnOFS[GPULayerID_BG0]) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->BG2Param) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->BG3Param) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISP3DCNT) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISPCAPCNT) - (uintptr_t)r) );
+	printf("\t%02x\n", (u32)((uintptr_t)(&r->DISP_MMEM_FIFO) - (uintptr_t)r) );
+}
+
+// Re-parses every register group handled by the base engine by calling each
+// ParseReg_* routine once, in a fixed order.
+void GPUEngineBase::ParseAllRegisters()
+{
+	// Display control. ParseReg_BGnCNT() is not called separately because
+	// ParseReg_DISPCNT() already calls it.
+	ParseReg_DISPCNT();
+	
+	// Background scroll offsets: horizontal first, then vertical, for BG0..BG3.
+	ParseReg_BGnHOFS<GPULayerID_BG0>();
+	ParseReg_BGnHOFS<GPULayerID_BG1>();
+	ParseReg_BGnHOFS<GPULayerID_BG2>();
+	ParseReg_BGnHOFS<GPULayerID_BG3>();
+	ParseReg_BGnVOFS<GPULayerID_BG0>();
+	ParseReg_BGnVOFS<GPULayerID_BG1>();
+	ParseReg_BGnVOFS<GPULayerID_BG2>();
+	ParseReg_BGnVOFS<GPULayerID_BG3>();
+	
+	// X/Y registers of BG2 and BG3.
+	ParseReg_BGnX<GPULayerID_BG2>();
+	ParseReg_BGnY<GPULayerID_BG2>();
+	ParseReg_BGnX<GPULayerID_BG3>();
+	ParseReg_BGnY<GPULayerID_BG3>();
+	
+	// Window registers.
+	ParseReg_WINnH<0>();
+	ParseReg_WINnH<1>();
+	ParseReg_WININ();
+	ParseReg_WINOUT();
+	
+	// Mosaic, blending and master brightness.
+	ParseReg_MOSAIC();
+	ParseReg_BLDCNT();
+	ParseReg_BLDALPHA();
+	ParseReg_BLDY();
+	ParseReg_MASTER_BRIGHT();
+}
+
+// Constructs the main GPU engine: binds it to the ARM9 register/palette/OAM
+// memory, sets up the per-block VRAM pointers, marks every capture line as
+// native, and allocates the native-sized 3D and capture working buffers.
+GPUEngineA::GPUEngineA()
+{
+	_engineID = GPUEngineID_Main;
+	_targetDisplayID = NDSDisplayID_Main;
+	_IORegisterMap = (GPU_IOREG *)MMU.ARM9_REG;
+	_paletteBG = (u16 *)MMU.ARM9_VMEM;
+	_paletteOBJ = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_512B);
+	_oamList = (OAMAttributes *)(MMU.ARM9_OAM);
+	_sprMem = MMU_AOBJ;
+	
+	// Four consecutive blocks of native VRAM, addressed as u16 pixels.
+	_VRAMNativeBlockPtr[0] = (u16 *)MMU.ARM9_LCD;
+	_VRAMNativeBlockPtr[1] = _VRAMNativeBlockPtr[0] + (1 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	_VRAMNativeBlockPtr[2] = _VRAMNativeBlockPtr[0] + (2 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	_VRAMNativeBlockPtr[3] = _VRAMNativeBlockPtr[0] + (3 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	
+	// The capture copy is addressed through u16 pointers that are spaced
+	// GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH elements apart for 4 blocks,
+	// so the byte count must include sizeof(u16). Without it only the first half
+	// of the region was cleared.
+	memset(this->_VRAMNativeBlockCaptureCopy, 0, GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * sizeof(u16));
+	_VRAMNativeBlockCaptureCopyPtr[0] = this->_VRAMNativeBlockCaptureCopy;
+	_VRAMNativeBlockCaptureCopyPtr[1] = _VRAMNativeBlockCaptureCopyPtr[0] + (1 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	_VRAMNativeBlockCaptureCopyPtr[2] = _VRAMNativeBlockCaptureCopyPtr[0] + (2 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	_VRAMNativeBlockCaptureCopyPtr[3] = _VRAMNativeBlockCaptureCopyPtr[0] + (3 * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH);
+	
+	// Every line of every block starts out flagged as native.
+	nativeLineCaptureCount[0] = GPU_VRAM_BLOCK_LINES;
+	nativeLineCaptureCount[1] = GPU_VRAM_BLOCK_LINES;
+	nativeLineCaptureCount[2] = GPU_VRAM_BLOCK_LINES;
+	nativeLineCaptureCount[3] = GPU_VRAM_BLOCK_LINES;
+	
+	for (size_t l = 0; l < GPU_VRAM_BLOCK_LINES; l++)
+	{
+		isLineCaptureNative[0][l] = true;
+		isLineCaptureNative[1][l] = true;
+		isLineCaptureNative[2][l] = true;
+		isLineCaptureNative[3][l] = true;
+	}
+	
+	// Buffers start at native size; SetCustomFramebufferSize() replaces them.
+	_3DFramebufferMain = (FragmentColor *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(FragmentColor));
+	_3DFramebuffer16 = (u16 *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16));
+	_captureWorkingA16 = (u16 *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16));
+	_captureWorkingB16 = (u16 *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16));
+	_captureWorkingA32 = (FragmentColor *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(FragmentColor));
+	_captureWorkingB32 = (FragmentColor *)malloc_alignedCacheLine(GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(FragmentColor));
+	
+	// Hand the freshly allocated 3D buffers to gfx3d.
+	gfx3d_Update3DFramebuffers(_3DFramebufferMain, _3DFramebuffer16);
+}
+
+// Releases the 3D and capture working buffers owned by this engine, then passes
+// NULL pointers to gfx3d_Update3DFramebuffers().
+GPUEngineA::~GPUEngineA()
+{
+	// 3D framebuffers
+	free_aligned(_3DFramebufferMain);
+	free_aligned(_3DFramebuffer16);
+	
+	// Display capture working lines
+	free_aligned(_captureWorkingA16);
+	free_aligned(_captureWorkingB16);
+	free_aligned(_captureWorkingA32);
+	free_aligned(_captureWorkingB32);
+	
+	gfx3d_Update3DFramebuffers(NULL, NULL);
+}
+
+// Creates a GPUEngineA inside storage obtained from malloc_aligned64().
+// The result of the allocation is not checked before the object is constructed.
+GPUEngineA* GPUEngineA::Allocate()
+{
+	void *storage = malloc_aligned64(sizeof(GPUEngineA));
+	return new(storage) GPUEngineA();
+}
+
+// Runs the destructor explicitly and then releases the object's own storage
+// with free_aligned(). The object must not be used after this call.
+void GPUEngineA::FinalizeAndDeallocate()
+{
+	GPUEngineA *const self = this;
+	
+	self->~GPUEngineA();
+	free_aligned(self);
+}
+
+// Resets the main engine: runs the base reset, clears the parsed display capture
+// state, points all background layer addresses at MMU_ABG, clears the native
+// capture copy, resets the capture line flags, selects the main display, and
+// zeroes the 3D and capture working buffers at the current custom size.
+void GPUEngineA::Reset()
+{
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	this->_Reset_Base();
+	
+	memset(&this->_dispCapCnt, 0, sizeof(DISPCAPCNT_parsed));
+	this->_displayCaptureEnable = false;
+	
+	this->_BGLayer[GPULayerID_BG0].BMPAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG1].BMPAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG2].BMPAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG3].BMPAddress = MMU_ABG;
+	
+	this->_BGLayer[GPULayerID_BG0].largeBMPAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG1].largeBMPAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG2].largeBMPAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG3].largeBMPAddress = MMU_ABG;
+	
+	this->_BGLayer[GPULayerID_BG0].tileMapAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG1].tileMapAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG2].tileMapAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG3].tileMapAddress = MMU_ABG;
+	
+	this->_BGLayer[GPULayerID_BG0].tileEntryAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG1].tileEntryAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG2].tileEntryAddress = MMU_ABG;
+	this->_BGLayer[GPULayerID_BG3].tileEntryAddress = MMU_ABG;
+	
+	// The capture copy is read through u16 pointers (see VerifyVRAMLineDidChange)
+	// and covers 4 blocks of GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH
+	// pixels, so the byte count must include sizeof(u16). Without it only the
+	// first half of the region was cleared.
+	memset(this->_VRAMNativeBlockCaptureCopy, 0, GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH * 4 * sizeof(u16));
+	
+	this->ResetCaptureLineStates();
+	this->SetDisplayByID(NDSDisplayID_Main);
+	
+	// These sizes mirror the allocations made in SetCustomFramebufferSize().
+	memset(this->_3DFramebufferMain, 0, dispInfo.customWidth * dispInfo.customHeight * sizeof(FragmentColor));
+	memset(this->_3DFramebuffer16, 0, dispInfo.customWidth * dispInfo.customHeight * sizeof(u16));
+	memset(this->_captureWorkingA16, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(u16));
+	memset(this->_captureWorkingB16, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(u16));
+	memset(this->_captureWorkingA32, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(FragmentColor));
+	memset(this->_captureWorkingB32, 0, dispInfo.customWidth * _gpuLargestDstLineCount * sizeof(FragmentColor));
+}
+
+// Marks every line of all four VRAM blocks as native and sets each block's
+// native line counter to the full block height.
+void GPUEngineA::ResetCaptureLineStates()
+{
+	for (size_t blockID = 0; blockID < 4; blockID++)
+	{
+		this->nativeLineCaptureCount[blockID] = GPU_VRAM_BLOCK_LINES;
+		
+		for (size_t lineIndex = 0; lineIndex < GPU_VRAM_BLOCK_LINES; lineIndex++)
+		{
+			this->isLineCaptureNative[blockID][lineIndex] = true;
+		}
+	}
+}
+
+// Refreshes the parsed display capture state (_dispCapCnt) from the DISPCAPCNT
+// and DISPCNT registers: the EVA/EVB factors clamped to 16, the VRAM read
+// offset, and the number of lines to capture (capy).
+void GPUEngineA::ParseReg_DISPCAPCNT()
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
+	
+	// Blend factors saturate at 16.
+	this->_dispCapCnt.EVA = (DISPCAPCNT.EVA < 16) ? DISPCAPCNT.EVA : 16;
+	this->_dispCapCnt.EVB = (DISPCAPCNT.EVB < 16) ? DISPCAPCNT.EVB : 16;
+	
+	// The read offset is forced to 0 while the display mode is VRAM.
+	if (DISPCNT.DisplayMode == GPUDisplayMode_VRAM)
+	{
+		this->_dispCapCnt.readOffset = 0;
+	}
+	else
+	{
+		this->_dispCapCnt.readOffset = DISPCAPCNT.VRAMReadOffset;
+	}
+	
+	// Capture height follows the selected capture size. Any other value leaves capy untouched.
+	switch (DISPCAPCNT.CaptureSize)
+	{
+		case DisplayCaptureSize_256x64:
+			this->_dispCapCnt.capy = 64;
+			break;
+			
+		case DisplayCaptureSize_128x128:
+		case DisplayCaptureSize_256x128:
+			this->_dispCapCnt.capy = 128;
+			break;
+			
+		case DisplayCaptureSize_256x192:
+			this->_dispCapCnt.capy = 192;
+			break;
+			
+		default:
+			break;
+	}
+	
+	/*INFO("Capture 0x%X:\n EVA=%i, EVB=%i, wBlock=%i, wOffset=%i, capX=%i, capY=%i\n rBlock=%i, rOffset=%i, srcCap=%i, dst=0x%X, src=0x%X\n srcA=%i, srcB=%i\n\n",
+	 val, this->_dispCapCnt.EVA, this->_dispCapCnt.EVB, this->_dispCapCnt.writeBlock, this->_dispCapCnt.writeOffset,
+	 this->_dispCapCnt.capy, this->_dispCapCnt.readBlock, this->_dispCapCnt.readOffset,
+	 this->_dispCapCnt.capSrc, this->_dispCapCnt.dst - MMU.ARM9_LCD, this->_dispCapCnt.src - MMU.ARM9_LCD,
+	 this->_dispCapCnt.srcA, this->_dispCapCnt.srcB);*/
+}
+
+// Returns the FragmentColor 3D framebuffer currently owned by this engine.
+// The pointer is replaced whenever SetCustomFramebufferSize() is called.
+FragmentColor* GPUEngineA::Get3DFramebufferMain() const
+{
+	return _3DFramebufferMain;
+}
+
+// Returns the 16-bit 3D framebuffer currently owned by this engine.
+// The pointer is replaced whenever SetCustomFramebufferSize() is called.
+u16* GPUEngineA::Get3DFramebuffer16() const
+{
+	return _3DFramebuffer16;
+}
+
+// Returns the start of the custom-sized VRAM block with the given index.
+// blockID is used as an array index without a range check.
+void* GPUEngineA::GetCustomVRAMBlockPtr(const size_t blockID)
+{
+	return _VRAMCustomBlockPtr[blockID];
+}
+
+// Resizes the engine's buffers for a custom framebuffer of w x h pixels.
+//
+// After the base class has been resized, new 3D framebuffers (w * h pixels) and
+// capture working buffers (w * _gpuLargestDstLineCount pixels) are allocated and
+// handed to gfx3d, the four custom VRAM block pointers are recomputed from the
+// custom VRAM buffer, and only then are the previous buffers released.
+void GPUEngineA::SetCustomFramebufferSize(size_t w, size_t h)
+{
+	this->GPUEngineBase::SetCustomFramebufferSize(w, h);
+	
+	// Keep the outgoing buffers alive until the replacements are fully installed.
+	FragmentColor *const stale3DMain = this->_3DFramebufferMain;
+	u16 *const stale3D16 = this->_3DFramebuffer16;
+	u16 *const staleWorkingA16 = this->_captureWorkingA16;
+	u16 *const staleWorkingB16 = this->_captureWorkingB16;
+	FragmentColor *const staleWorkingA32 = this->_captureWorkingA32;
+	FragmentColor *const staleWorkingB32 = this->_captureWorkingB32;
+	
+	const size_t framePixels = w * h;
+	const size_t workingPixels = w * _gpuLargestDstLineCount;
+	
+	this->_3DFramebufferMain = (FragmentColor *)malloc_alignedCacheLine(framePixels * sizeof(FragmentColor));
+	this->_3DFramebuffer16 = (u16 *)malloc_alignedCacheLine(framePixels * sizeof(u16));
+	this->_captureWorkingA16 = (u16 *)malloc_alignedCacheLine(workingPixels * sizeof(u16));
+	this->_captureWorkingB16 = (u16 *)malloc_alignedCacheLine(workingPixels * sizeof(u16));
+	this->_captureWorkingA32 = (FragmentColor *)malloc_alignedCacheLine(workingPixels * sizeof(FragmentColor));
+	this->_captureWorkingB32 = (FragmentColor *)malloc_alignedCacheLine(workingPixels * sizeof(FragmentColor));
+	gfx3d_Update3DFramebuffers(this->_3DFramebufferMain, this->_3DFramebuffer16);
+	
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	
+	// Distance between consecutive custom VRAM blocks, in pixels.
+	const size_t blockStride = _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w;
+	
+	// The pixel type of the custom VRAM buffer depends on the output color format.
+	if (dispInfo.colorFormat != NDSColorFormat_BGR888_Rev)
+	{
+		u16 *const vramBase16 = (u16 *)GPU->GetCustomVRAMBuffer();
+		for (size_t blockID = 0; blockID < 4; blockID++)
+		{
+			this->_VRAMCustomBlockPtr[blockID] = vramBase16 + (blockID * blockStride);
+		}
+	}
+	else
+	{
+		FragmentColor *const vramBase32 = (FragmentColor *)GPU->GetCustomVRAMBuffer();
+		for (size_t blockID = 0; blockID < 4; blockID++)
+		{
+			this->_VRAMCustomBlockPtr[blockID] = vramBase32 + (blockID * blockStride);
+		}
+	}
+	
+	// Nothing refers to the old buffers any more.
+	free_aligned(stale3DMain);
+	free_aligned(stale3D16);
+	free_aligned(staleWorkingA16);
+	free_aligned(staleWorkingB16);
+	free_aligned(staleWorkingA32);
+	free_aligned(staleWorkingB32);
+}
+
+// Returns true when layer BG0 is enabled and DISPCNT.BG0_3D is non-zero.
+bool GPUEngineA::WillRender3DLayer()
+{
+	if (!this->_enableLayer[GPULayerID_BG0])
+	{
+		return false;
+	}
+	
+	return (this->_IORegisterMap->DISPCNT.BG0_3D != 0);
+}
+
+// Returns true when line l will be display-captured with a non-zero SrcA
+// (case 1 is labelled "Capture 3D" in the capture code) and the capture source
+// is not 1 (source B only).
+bool GPUEngineA::WillCapture3DLayerDirect(const size_t l)
+{
+	if (!this->WillDisplayCapture(l))
+	{
+		return false;
+	}
+	
+	const IOREG_DISPCAPCNT &captureReg = this->_IORegisterMap->DISPCAPCNT;
+	if (captureReg.SrcA == 0)
+	{
+		return false;
+	}
+	
+	return (captureReg.CaptureSrc != 1);
+}
+
+// Returns true when display capture is enabled, the capture destination bank is
+// mapped to LCDC, and line l lies inside the capture height.
+bool GPUEngineA::WillDisplayCapture(const size_t l)
+{
+	if (!this->_displayCaptureEnable)
+	{
+		return false;
+	}
+	
+	// Captures must be blocked when the capture destination is not mapped to LCDC.
+	// Mario Kart does this (maybe due to a programming bug, but maybe an emulation
+	// timing error) when spamming the confirm key during the course intro and
+	// through the black transition.
+	const IOREG_DISPCAPCNT &captureReg = this->_IORegisterMap->DISPCAPCNT;
+	if (vramConfiguration.banks[captureReg.VRAMWriteBlock].purpose != VramConfiguration::LCDC)
+	{
+		return false;
+	}
+	
+	return (l < this->_dispCapCnt.capy);
+}
+
+// Latches the CaptureEnable bit of DISPCAPCNT into the engine's capture flag.
+void GPUEngineA::SetDisplayCaptureEnable()
+{
+	const IOREG_DISPCAPCNT &captureReg = this->_IORegisterMap->DISPCAPCNT;
+	this->_displayCaptureEnable = (captureReg.CaptureEnable != 0);
+}
+
+// If a capture is currently latched, clears both the CaptureEnable bit in
+// DISPCAPCNT and the engine's capture flag. Does nothing otherwise.
+void GPUEngineA::ResetDisplayCaptureEnable()
+{
+	if (!this->_displayCaptureEnable)
+	{
+		return;
+	}
+	
+	this->_IORegisterMap->DISPCAPCNT.CaptureEnable = 0;
+	this->_displayCaptureEnable = false;
+}
+
+// Checks whether native VRAM line l of block blockID differs from the copy taken
+// at capture time, and falls back to the native line if it does.
+//
+// This method must be called for ALL instances where captured lines in VRAM may
+// be read back. A line captured at a custom size is only valid while the native
+// VRAM line is unchanged since the capture; once it has changed, the native line
+// has to be used instead.
+//
+// Returns false if the line is already flagged native or is unchanged. Returns
+// true if a change was detected, in which case the capture copy is refreshed via
+// _LineCopy, the line is flagged native and the block's native counter is bumped.
+bool GPUEngineA::VerifyVRAMLineDidChange(const size_t blockID, const size_t l)
+{
+	// Lines that are already native have no custom capture to invalidate.
+	if (this->isLineCaptureNative[blockID][l])
+	{
+		return false;
+	}
+	
+	const size_t lineOffset = l * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	const u16 *__restrict snapshotLine = this->_VRAMNativeBlockCaptureCopyPtr[blockID] + lineOffset;
+	const u16 *__restrict liveLine = this->_VRAMNativeBlockPtr[blockID] + lineOffset;
+	
+	if (memcmp(liveLine, snapshotLine, GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) == 0)
+	{
+		return false;
+	}
+	
+	// The line was modified after capture, so revert it to native.
+	this->_LineCopy<1, true, false, 2>(this->_VRAMNativeBlockCaptureCopyPtr[blockID], this->_VRAMNativeBlockPtr[blockID], l);
+	this->isLineCaptureNative[blockID][l] = true;
+	this->nativeLineCaptureCount[blockID]++;
+	
+	return true;
+}
+
+// Renders scanline l for the main engine in three steps:
+//   1. composite the layers, if the display mode is Normal or the line is captured;
+//   2. fill the display output according to the current display mode;
+//   3. perform the display capture for this line, if one is needed.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineA::RenderLine(const size_t l)
+{
+	const GPUEngineRenderState &lineState = this->_currentCompositorInfo[l].renderState;
+	const bool captureThisLine = this->WillDisplayCapture(l);
+	const bool needsLayerRender = captureThisLine || (lineState.displayOutputMode == GPUDisplayMode_Normal);
+	
+	// Step 1: layer compositing, with or without the window test.
+	if (needsLayerRender)
+	{
+		if (!lineState.isAnyWindowEnabled)
+		{
+			this->_RenderLine_Layers<OUTPUTFORMAT, false>(l);
+		}
+		else
+		{
+			this->_RenderLine_Layers<OUTPUTFORMAT, true>(l);
+		}
+	}
+	
+	// Step 2: display output.
+	switch (lineState.displayOutputMode)
+	{
+		case GPUDisplayMode_Off: // Display off (shows white)
+			this->_HandleDisplayModeOff<OUTPUTFORMAT>(l);
+			break;
+			
+		case GPUDisplayMode_Normal: // BG and OBJ layers
+			this->_HandleDisplayModeNormal<OUTPUTFORMAT>(l);
+			break;
+			
+		case GPUDisplayMode_VRAM: // VRAM framebuffer
+			this->_HandleDisplayModeVRAM<OUTPUTFORMAT>(l);
+			break;
+			
+		case GPUDisplayMode_MainMemory: // Main memory FIFO
+			this->_HandleDisplayModeMainMemory<OUTPUTFORMAT>(l);
+			break;
+	}
+	
+	// Step 3: capture. This happens after displaying so that VRAM can be safely
+	// displayed before the capture overwrites it.
+	//
+	// BUG!!! If someone is capturing and displaying both from the FIFO, then it
+	// will already have been consumed by the display step above.
+	// (Is that even legal? Probably.)
+	if (!captureThisLine)
+	{
+		return;
+	}
+	
+	if (this->_IORegisterMap->DISPCAPCNT.CaptureSize != DisplayCaptureSize_128x128)
+	{
+		this->_RenderLine_DisplayCapture<OUTPUTFORMAT, GPU_FRAMEBUFFER_NATIVE_WIDTH>(l);
+	}
+	else
+	{
+		this->_RenderLine_DisplayCapture<OUTPUTFORMAT, GPU_FRAMEBUFFER_NATIVE_WIDTH/2>(l);
+	}
+}
+
+// Composites the 3D renderer's framebuffer onto the target line(s) described by compInfo.
+//
+// Template parameters:
+//   COMPOSITORMODE        - forwarded to the pixel compositing routines.
+//   OUTPUTFORMAT          - color format of the composited output.
+//   WILLPERFORMWINDOWTEST - when true, pixels must also pass the per-pixel window test,
+//                           and the color effect enable is taken from the window tables.
+//
+// Does nothing if the current renderer has no framebuffer. If the 3D framebuffer is not
+// native size, the line is first transitioned from native to custom. Pixels whose alpha
+// is 0 are never composited.
+template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool WILLPERFORMWINDOWTEST>
+void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo)
+{
+	const FragmentColor *__restrict framebuffer3D = CurrentRenderer->GetFramebuffer();
+	if (framebuffer3D == NULL)
+	{
+		return;
+	}
+	
+	if (!CurrentRenderer->IsFramebufferNativeSize())
+	{
+		this->_TransitionLineNativeToCustom<OUTPUTFORMAT>(compInfo);
+	}
+	
+	const float customWidthScale = (float)compInfo.line.widthCustom / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	const FragmentColor *__restrict srcLinePtr = framebuffer3D + compInfo.line.blockOffsetCustom;
+	
+	// Rewind the compositor's write cursors to the head of the target line.
+	compInfo.target.lineColor16 = (u16 *)compInfo.target.lineColorHead;
+	compInfo.target.lineColor32 = (FragmentColor *)compInfo.target.lineColorHead;
+	compInfo.target.lineLayerID = compInfo.target.lineLayerIDHead;
+	
+	// Horizontally offset the 3D layer by this amount.
+	// Test case: Blowing up large objects in Nanostray 2 will cause the main screen to shake horizontally.
+	// The layer's xOffset is scaled to the custom width and rounded to the nearest pixel.
+	const u16 hofs = (u16)( ((float)compInfo.renderState.selectedBGLayer->xOffset * customWidthScale) + 0.5f );
+	
+	if (hofs == 0)
+	{
+		// No horizontal offset: source and destination pixels line up one-to-one, so the
+		// source pointer advances together with the destination cursors.
+#ifdef ENABLE_SSE2
+		// Largest multiple of 16 pixels that fits in the line; the rest is handled by the scalar loop.
+		const size_t ssePixCount = (compInfo.line.widthCustom - (compInfo.line.widthCustom % 16));
+		const __m128i srcEffectEnableMask = compInfo.renderState.srcBlendEnable_SSE2[compInfo.renderState.selectedLayerID];
+#endif
+		
+		for (size_t line = 0; line < compInfo.line.renderCount; line++)
+		{
+			compInfo.target.xNative = 0;
+			compInfo.target.xCustom = 0;
+			
+#ifdef ENABLE_SSE2
+			// Vector path: 16 pixels (four 128-bit loads of 32-bit pixels) per iteration.
+			for (; compInfo.target.xCustom < ssePixCount; srcLinePtr+=16, compInfo.target.xCustom+=16, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16+=16, compInfo.target.lineColor32+=16, compInfo.target.lineLayerID+=16)
+			{
+				const __m128i src[4]	= { _mm_load_si128((__m128i *)srcLinePtr + 0),
+										    _mm_load_si128((__m128i *)srcLinePtr + 1),
+										    _mm_load_si128((__m128i *)srcLinePtr + 2),
+										    _mm_load_si128((__m128i *)srcLinePtr + 3) };
+				
+				// Determine which pixels pass by doing the alpha test and the window test.
+				// The top byte of each 32-bit pixel is extracted and packed down to one byte per pixel.
+				const __m128i srcAlpha = _mm_packs_epi16( _mm_packs_epi32(_mm_srli_epi32(src[0], 24), _mm_srli_epi32(src[1], 24)),
+														  _mm_packs_epi32(_mm_srli_epi32(src[2], 24), _mm_srli_epi32(src[3], 24)) );
+				__m128i passMask8;
+				
+				if (WILLPERFORMWINDOWTEST)
+				{
+					// Do the window test.
+					passMask8 = _mm_cmpeq_epi8( _mm_load_si128((__m128i *)(this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID] + compInfo.target.xCustom)), _mm_set1_epi8(1) );
+				}
+				else
+				{
+					passMask8 = _mm_set1_epi8(0xFF);
+				}
+				
+				// Do the alpha test. Pixels with an alpha value of 0 are rejected.
+				passMask8 = _mm_andnot_si128(_mm_cmpeq_epi8(srcAlpha, _mm_setzero_si128()), passMask8);
+				
+				const int passMaskValue = _mm_movemask_epi8(passMask8);
+				
+				// If none of the pixels within the vector pass, then reject them all at once.
+				if (passMaskValue == 0)
+				{
+					continue;
+				}
+				
+				// Write out the pixels.
+				const bool didAllPixelsPass = (passMaskValue == 0xFFFF);
+				this->_PixelComposite16_SSE2<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D, WILLPERFORMWINDOWTEST>(compInfo,
+																												   didAllPixelsPass,
+																												   passMask8,
+																												   src[3], src[2], src[1], src[0],
+																												   srcEffectEnableMask);
+			}
+#endif
+			
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+			// Scalar path: the whole line without SSE2, or the pixels left over after the vector loop.
+			for (; compInfo.target.xCustom < compInfo.line.widthCustom; srcLinePtr++, compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
+			{
+				// Skip pixels with an alpha of 0 or, when window testing, pixels that failed the window test.
+				if ( (srcLinePtr->a == 0) || (WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] == 0)) )
+				{
+					continue;
+				}
+				
+				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true;
+				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D>(compInfo, *srcLinePtr, 0, enableColorEffect);
+			}
+		}
+	}
+	else
+	{
+		// Non-zero horizontal offset: each destination pixel reads from srcX = xCustom + hofs.
+		// Here the source pointer stays at the start of the line and only advances once per line.
+		for (size_t line = 0; line < compInfo.line.renderCount; line++)
+		{
+			for (compInfo.target.xNative = 0, compInfo.target.xCustom = 0; compInfo.target.xCustom < compInfo.line.widthCustom; compInfo.target.xCustom++, compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom], compInfo.target.lineColor16++, compInfo.target.lineColor32++, compInfo.target.lineLayerID++)
+			{
+				if ( WILLPERFORMWINDOWTEST && (this->_didPassWindowTestCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] == 0) )
+				{
+					continue;
+				}
+				
+				// The source coordinate wraps at twice the line width...
+				size_t srcX = compInfo.target.xCustom + hofs;
+				if (srcX >= compInfo.line.widthCustom * 2)
+				{
+					srcX -= compInfo.line.widthCustom * 2;
+				}
+				
+				// ...and coordinates past the line width have no source pixel, so they are skipped
+				// along with pixels whose alpha is 0.
+				if ( (srcX >= compInfo.line.widthCustom) || (srcLinePtr[srcX].a == 0) )
+				{
+					continue;
+				}
+				
+				compInfo.target.xNative = _gpuDstToSrcIndex[compInfo.target.xCustom];
+				
+				const bool enableColorEffect = (WILLPERFORMWINDOWTEST) ? (this->_enableColorEffectCustom[compInfo.renderState.selectedLayerID][compInfo.target.xCustom] != 0) : true;
+				this->_PixelComposite<COMPOSITORMODE, OUTPUTFORMAT, GPULayerType_3D>(compInfo, srcLinePtr[srcX], 0, enableColorEffect);
+			}
+			
+			srcLinePtr += compInfo.line.widthCustom;
+		}
+	}
+}
+
+template <NDSColorFormat OUTPUTFORMAT, size_t CAPTURELENGTH>
+void GPUEngineA::_RenderLine_DisplayCapture(const u16 l)
+{
+	assert( (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH/2) || (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) );
+	
+	GPUEngineCompositorInfo &compInfo = this->_currentCompositorInfo[l];
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
+	
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	const bool is3DFramebufferNativeSize = CurrentRenderer->IsFramebufferNativeSize();
+	const u8 vramWriteBlock = DISPCAPCNT.VRAMWriteBlock;
+	const u8 vramReadBlock = DISPCNT.VRAM_Block;
+	const size_t writeLineIndexWithOffset = (DISPCAPCNT.VRAMWriteOffset * 64) + l;
+	const size_t readLineIndexWithOffset = (this->_dispCapCnt.readOffset * 64) + l;
+	bool newCaptureLineNativeState = true;
+	
+	//128-wide captures should write linearly into memory, with no gaps
+	//this is tested by hotel dusk
+	size_t dstNativeOffset = (DISPCAPCNT.VRAMWriteOffset * 64 * GPU_FRAMEBUFFER_NATIVE_WIDTH) + (l * CAPTURELENGTH);
+	
+	//Read/Write block wrap to 00000h when exceeding 1FFFFh (128k)
+	//this has not been tested yet (I thought I needed it for hotel dusk, but it was fixed by the above)
+	dstNativeOffset &= 0x0000FFFF;
+	
+	const u16 *vramNative16 = (u16 *)MMU.blank_memory;
+	const u16 *vramCustom16 = (u16 *)GPU->GetCustomVRAMBlankBuffer();
+	const u32 *vramCustom32 = (u32 *)GPU->GetCustomVRAMBlankBuffer();
+	u16 *dstNative16 = this->_VRAMNativeBlockPtr[vramWriteBlock] + dstNativeOffset;
+	bool readNativeVRAM = true;
+	bool captureLineNativeState32 = newCaptureLineNativeState;
+	
+	// Convert 18-bit and 24-bit framebuffers to 15-bit for native screen capture.
+	if ( (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.CaptureSrc != 1) )
+	{
+		switch (OUTPUTFORMAT)
+		{
+			case NDSColorFormat_BGR555_Rev:
+				break;
+				
+			case NDSColorFormat_BGR666_Rev:
+				ColorspaceConvertBuffer6665To5551<false, false>((u32 *)compInfo.target.lineColorHead, this->_captureWorkingA16, compInfo.line.pixelCount);
+				break;
+				
+			case NDSColorFormat_BGR888_Rev:
+				ColorspaceConvertBuffer8888To5551<false, false>((u32 *)compInfo.target.lineColorHead, this->_captureWorkingA16, compInfo.line.pixelCount);
+				break;
+		}
+	}
+	
+	// Convert VRAM for native VRAM capture.
+	if ( (DISPCAPCNT.SrcB == 0) && (DISPCAPCNT.CaptureSrc != 0) && (vramConfiguration.banks[vramReadBlock].purpose == VramConfiguration::LCDC) )
+	{
+		size_t vramNativeOffset = readLineIndexWithOffset * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		vramNativeOffset &= 0x0000FFFF;
+		vramNative16 = this->_VRAMNativeBlockPtr[vramReadBlock] + vramNativeOffset;
+		
+		this->VerifyVRAMLineDidChange(vramReadBlock, readLineIndexWithOffset);
+		
+		if (!this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset])
+		{
+			size_t vramCustomOffset = ((this->_dispCapCnt.readOffset * _gpuCaptureLineIndex[64]) + _gpuCaptureLineIndex[l]) * dispInfo.customWidth;
+			while (vramCustomOffset >= _gpuVRAMBlockOffset)
+			{
+				vramCustomOffset -= _gpuVRAMBlockOffset;
+			}
+			
+			switch (OUTPUTFORMAT)
+			{
+				case NDSColorFormat_BGR555_Rev:
+				case NDSColorFormat_BGR666_Rev:
+					vramCustom16 = (u16 *)this->_VRAMCustomBlockPtr[vramReadBlock] + vramCustomOffset;
+					break;
+					
+				case NDSColorFormat_BGR888_Rev:
+					vramCustom32 = (u32 *)this->_VRAMCustomBlockPtr[vramReadBlock] + vramCustomOffset;
+					break;
+			}
+			
+			readNativeVRAM = false;
+		}
+	}
+	
+	static CACHE_ALIGN u16 fifoLine16[GPU_FRAMEBUFFER_NATIVE_WIDTH];
+	const u16 *srcA16 = (DISPCAPCNT.SrcA == 0) ? ((OUTPUTFORMAT != NDSColorFormat_BGR555_Rev) ? this->_captureWorkingA16 : (u16 *)compInfo.target.lineColorHead) : this->_3DFramebuffer16 + compInfo.line.blockOffsetCustom;
+	const u16 *srcB16 = (DISPCAPCNT.SrcB == 0) ? vramNative16 : fifoLine16;
+	
+	switch (DISPCAPCNT.CaptureSrc)
+	{
+		case 0: // Capture source is SourceA
+		{
+			//INFO("Capture source is SourceA\n");
+			switch (DISPCAPCNT.SrcA)
+			{
+				case 0: // Capture screen (BG + OBJ + 3D)
+				{
+					//INFO("Capture screen (BG + OBJ + 3D)\n");
+					if (this->isLineRenderNative[l])
+					{
+						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
+					}
+					else
+					{
+						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, false, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
+					}
+					
+					newCaptureLineNativeState = this->isLineRenderNative[l];
+					break;
+				}
+					
+				case 1: // Capture 3D
+				{
+					//INFO("Capture 3D\n");
+					if (is3DFramebufferNativeSize)
+					{
+						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
+					}
+					else
+					{
+						this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, false, true>(srcA16, dstNative16, CAPTURELENGTH, 1);
+					}
+					
+					newCaptureLineNativeState = is3DFramebufferNativeSize;
+					break;
+				}
+			}
+			break;
+		}
+			
+		case 1: // Capture source is SourceB
+		{
+			//INFO("Capture source is SourceB\n");
+			switch (DISPCAPCNT.SrcB)
+			{
+				case 0: // Capture VRAM
+				{
+					this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, true>(srcB16, dstNative16, CAPTURELENGTH, 1);
+					newCaptureLineNativeState = this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset];
+					break;
+				}
+					
+				case 1: // Capture dispfifo (not yet tested)
+				{
+					this->_RenderLine_DispCapture_FIFOToBuffer(fifoLine16);
+					this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, true>(srcB16, dstNative16, CAPTURELENGTH, 1);
+					newCaptureLineNativeState = true;
+					break;
+				}
+			}
+			break;
+		}
+			
+		default: // Capture source is SourceA+B blended
+		{
+			//INFO("Capture source is SourceA+B blended\n");
+			if (DISPCAPCNT.SrcB != 0)
+			{
+				// fifo - tested by splinter cell chaos theory thermal view
+				this->_RenderLine_DispCapture_FIFOToBuffer(fifoLine16);
+			}
+			
+			if (DISPCAPCNT.SrcA == 0)
+			{
+				if (this->isLineRenderNative[l])
+				{
+					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, true, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
+				}
+				else
+				{
+					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, false, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
+				}
+				
+				newCaptureLineNativeState = this->isLineRenderNative[l] && ((DISPCAPCNT.SrcB != 0) || this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset]);
+			}
+			else
+			{
+				if (is3DFramebufferNativeSize)
+				{
+					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, true, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
+					newCaptureLineNativeState = (DISPCAPCNT.SrcB != 0) || this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset];
+				}
+				else
+				{
+					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, false, true, true>(srcA16, srcB16, dstNative16, CAPTURELENGTH, 1);
+					newCaptureLineNativeState = false;
+				}
+			}
+			break;
+		}
+	}
+	
+#ifdef ENABLE_SSE2
+	MACRODO_N( CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_stream_si128((__m128i *)(this->_VRAMNativeBlockCaptureCopyPtr[vramWriteBlock] + dstNativeOffset) + (X), _mm_load_si128((__m128i *)dstNative16 + (X))) );
+#else
+	memcpy(this->_VRAMNativeBlockCaptureCopyPtr[vramWriteBlock] + dstNativeOffset, dstNative16, CAPTURELENGTH * sizeof(u16));
+#endif
+	
+	if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		captureLineNativeState32 = newCaptureLineNativeState;
+		newCaptureLineNativeState = false;
+	}
+	
+	if (this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] && !newCaptureLineNativeState)
+	{
+		this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] = false;
+		this->nativeLineCaptureCount[vramWriteBlock]--;
+	}
+	else if (!this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] && newCaptureLineNativeState)
+	{
+		this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset] = true;
+		this->nativeLineCaptureCount[vramWriteBlock]++;
+	}
+	
+	if (!this->isLineCaptureNative[vramWriteBlock][writeLineIndexWithOffset])
+	{
+		const size_t captureLengthExt = (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? dispInfo.customWidth : dispInfo.customWidth / 2;
+		const size_t captureLineCount = _gpuCaptureLineCount[l];
+		
+		size_t dstCustomOffset = (DISPCAPCNT.VRAMWriteOffset * _gpuCaptureLineIndex[64] * dispInfo.customWidth) + (_gpuCaptureLineIndex[l] * captureLengthExt);
+		while (dstCustomOffset >= _gpuVRAMBlockOffset)
+		{
+			dstCustomOffset -= _gpuVRAMBlockOffset;
+		}
+		
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			static CACHE_ALIGN FragmentColor fifoLine32[GPU_FRAMEBUFFER_NATIVE_WIDTH];
+			FragmentColor *dstCustom32 = (FragmentColor *)this->_VRAMCustomBlockPtr[vramWriteBlock] + dstCustomOffset;
+			bool isLineCaptureNative32 = ( (vramWriteBlock == vramReadBlock) && (writeLineIndexWithOffset == readLineIndexWithOffset) ) ? captureLineNativeState32 : this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset];
+			
+			if ( (DISPCAPCNT.SrcB == 1) && (DISPCAPCNT.CaptureSrc != 0) )
+			{
+				ColorspaceConvertBuffer555To8888Opaque<false, false>(fifoLine16, (u32 *)fifoLine32, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+			}
+			
+			if ( (DISPCAPCNT.SrcB == 0) && (DISPCAPCNT.CaptureSrc != 0) && (vramConfiguration.banks[vramReadBlock].purpose == VramConfiguration::LCDC) )
+			{
+				if (readNativeVRAM)
+				{
+					ColorspaceConvertBuffer555To8888Opaque<false, false>(vramNative16, (u32 *)vramCustom32, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+				}
+			}
+			
+			const u32 *srcA32 = (DISPCAPCNT.SrcA == 0) ? (u32 *)compInfo.target.lineColorHead : (u32 *)CurrentRenderer->GetFramebuffer() + compInfo.line.blockOffsetCustom;
+			const u32 *srcB32 = (DISPCAPCNT.SrcB == 0) ? vramCustom32 : (u32 *)fifoLine32;
+			
+			switch (DISPCAPCNT.CaptureSrc)
+			{
+				case 0: // Capture source is SourceA
+				{
+					switch (DISPCAPCNT.SrcA)
+					{
+						case 0: // Capture screen (BG + OBJ + 3D)
+						{
+							if (this->isLineRenderNative[l])
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, true, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
+							}
+							else
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, false, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
+							}
+							break;
+						}
+							
+						case 1: // Capture 3D
+						{
+							if (is3DFramebufferNativeSize)
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 1, CAPTURELENGTH, true, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
+							}
+							else
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 1, CAPTURELENGTH, false, false>(srcA32, dstCustom32, captureLengthExt, captureLineCount);
+							}
+							break;
+						}
+					}
+					break;
+				}
+					
+				case 1: // Capture source is SourceB
+				{
+					switch (DISPCAPCNT.SrcB)
+					{
+						case 0: // Capture VRAM
+						{
+							if (isLineCaptureNative32)
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, true, false>(srcB32, dstCustom32, captureLengthExt, captureLineCount);
+							}
+							else
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 0, CAPTURELENGTH, false, false>(srcB32, dstCustom32, captureLengthExt, captureLineCount);
+							}
+							break;
+						}
+							
+						case 1: // Capture dispfifo (not yet tested)
+						{
+							this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR888_Rev, 1, CAPTURELENGTH, true, false>(srcB32, dstCustom32, captureLengthExt, captureLineCount);
+							break;
+						}
+					}
+					break;
+				}
+					
+				default: // Capture source is SourceA+B blended
+				{
+					u32 *srcCustomA32 = (u32 *)srcA32;
+					u32 *srcCustomB32 = (u32 *)srcB32;
+					
+					if ( (DISPCAPCNT.SrcB == 1) || isLineCaptureNative32 )
+					{
+						srcCustomB32 = (u32 *)this->_captureWorkingB32;
+						this->_LineCopy<0xFFFF, false, false, 4>(srcCustomB32, srcB32, 0);
+					}
+					
+					if (DISPCAPCNT.SrcA == 0)
+					{
+						if (this->isLineRenderNative[l])
+						{
+							srcCustomA32 = (u32 *)this->_captureWorkingA32;
+							this->_LineCopy<0xFFFF, false, false, 4>(srcCustomA32, srcA32, 0);
+						}
+					}
+					else
+					{
+						if (is3DFramebufferNativeSize)
+						{
+							srcCustomA32 = (u32 *)this->_captureWorkingA32;
+							this->_LineCopy<0xFFFF, false, false, 4>(srcCustomA32, srcA32, 0);
+						}
+					}
+					
+					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR888_Rev, CAPTURELENGTH, false, false, false>(srcCustomA32, srcCustomB32, dstCustom32, captureLengthExt, captureLineCount);
+					break;
+				}
+			}
+		}
+		else
+		{
+			if (!this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] && (DISPCAPCNT.SrcB == 0))
+			{
+				srcB16 = vramCustom16;
+			}
+			
+			u16 *dstCustom16 = (u16 *)this->_VRAMCustomBlockPtr[vramWriteBlock] + dstCustomOffset;
+			
+			switch (DISPCAPCNT.CaptureSrc)
+			{
+				case 0: // Capture source is SourceA
+				{
+					switch (DISPCAPCNT.SrcA)
+					{
+						case 0: // Capture screen (BG + OBJ + 3D)
+						{
+							if (this->isLineRenderNative[l])
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
+							}
+							else
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, false, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
+							}
+							break;
+						}
+							
+						case 1: // Capture 3D
+						{
+							if (is3DFramebufferNativeSize)
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
+							}
+							else
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, false, false>(srcA16, dstCustom16, captureLengthExt, captureLineCount);
+							}
+							break;
+						}
+					}
+					break;
+				}
+					
+				case 1: // Capture source is SourceB
+				{
+					switch (DISPCAPCNT.SrcB)
+					{
+						case 0: // Capture VRAM
+						{
+							if (this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset])
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, true, false>(srcB16, dstCustom16, captureLengthExt, captureLineCount);
+							}
+							else
+							{
+								this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 0, CAPTURELENGTH, false, false>(srcB16, dstCustom16, captureLengthExt, captureLineCount);
+							}
+							break;
+						}
+							
+						case 1: // Capture dispfifo (not yet tested)
+							this->_RenderLine_DispCapture_Copy<NDSColorFormat_BGR555_Rev, 1, CAPTURELENGTH, true, false>(srcB16, dstCustom16, captureLengthExt, captureLineCount);
+							break;
+					}
+					break;
+				}
+					
+				default: // Capture source is SourceA+B blended
+				{
+					u16 *srcCustomA16 = (u16 *)srcA16;
+					u16 *srcCustomB16 = (u16 *)srcB16;
+					
+					if ( (DISPCAPCNT.SrcB == 1) || this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] )
+					{
+						srcCustomB16 = this->_captureWorkingB16;
+						this->_LineCopy<0xFFFF, false, false, 2>(srcCustomB16, srcB16, 0);
+					}
+					
+					if (DISPCAPCNT.SrcA == 0)
+					{
+						if (this->isLineRenderNative[l])
+						{
+							srcCustomA16 = this->_captureWorkingA16;
+							this->_LineCopy<0xFFFF, false, false, 2>(srcCustomA16, srcA16, 0);
+						}
+					}
+					else
+					{
+						if (is3DFramebufferNativeSize)
+						{
+							srcCustomA16 = this->_captureWorkingA16;
+							this->_LineCopy<0xFFFF, false, false, 2>(srcCustomA16, srcA16, 0);
+						}
+					}
+					
+					this->_RenderLine_DispCapture_Blend<NDSColorFormat_BGR555_Rev, CAPTURELENGTH, false, false, false>(srcCustomA16, srcCustomB16, dstCustom16, captureLengthExt, captureLineCount);
+					break;
+				}
+			}
+		}
+	}
+}
+
+// Drains one native scanline worth of pixels from the display FIFO into fifoLineBuffer.
+//
+// Each DISP_FIFOrecv() call yields one 32-bit word that is stored as two 16-bit entries,
+// so GPU_FRAMEBUFFER_NATIVE_WIDTH / 2 words are consumed per call of this method.
+//
+// fifoLineBuffer: destination for GPU_FRAMEBUFFER_NATIVE_WIDTH u16 entries. The SSE2 path
+//                 writes with aligned stores, so the buffer must be 16-byte aligned there.
+void GPUEngineA::_RenderLine_DispCapture_FIFOToBuffer(u16 *fifoLineBuffer)
+{
+#ifdef ENABLE_SSE2
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
+	{
+		// DISP_FIFOrecv() consumes a FIFO entry on every call, and C++ leaves the evaluation
+		// order of function arguments unspecified. Read the four words as separate statements
+		// so that they are guaranteed to land in the vector in FIFO order.
+		const u32 fifoWord0 = DISP_FIFOrecv();
+		const u32 fifoWord1 = DISP_FIFOrecv();
+		const u32 fifoWord2 = DISP_FIFOrecv();
+		const u32 fifoWord3 = DISP_FIFOrecv();
+		
+		const __m128i fifoColor = _mm_setr_epi32(fifoWord0, fifoWord1, fifoWord2, fifoWord3);
+		_mm_store_si128((__m128i *)fifoLineBuffer + i, fifoColor);
+	}
+#else
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
+	{
+		((u32 *)fifoLineBuffer)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv() );
+	}
+#endif
+}
+
+// Copies one captured line from src to dst, OR-ing an alpha bit into every pixel.
+//
+// Template parameters:
+//   COLORFORMAT          - BGR555 pixels are handled as u16; BGR666/BGR888 pixels as u32.
+//   SOURCESWITCH         - when 0, the alpha bit (0x8000 / 0x1F000000 / 0xFF000000) is set on
+//                          every copied pixel; for any other value nothing is OR-ed in.
+//   CAPTURELENGTH        - number of native pixels per captured line.
+//   CAPTUREFROMNATIVESRC - true when src is laid out at native width.
+//   CAPTURETONATIVEDST   - true when dst is laid out at native width.
+//
+// Parameters:
+//   src, dst         - source and destination pixel buffers. The SSE2 paths use aligned
+//                      loads/stores, so both must be 16-byte aligned in those builds.
+//   captureLengthExt - pixels per line at custom size (only read for custom-sized dst).
+//   captureLineCount - number of custom-sized lines to produce (only read for custom-sized dst).
+template<NDSColorFormat COLORFORMAT, int SOURCESWITCH, size_t CAPTURELENGTH, bool CAPTUREFROMNATIVESRC, bool CAPTURETONATIVEDST>
+void GPUEngineA::_RenderLine_DispCapture_Copy(const void *src, void *dst, const size_t captureLengthExt, const size_t captureLineCount)
+{
+	const u16 alphaBit16 = (SOURCESWITCH == 0) ? 0x8000 : 0x0000;
+	const u32 alphaBit32 = (SOURCESWITCH == 0) ? ((COLORFORMAT == NDSColorFormat_BGR888_Rev) ? 0xFF000000 : 0x1F000000) : 0x00000000;
+	
+#ifdef ENABLE_SSE2
+	const __m128i alpha_vec128 = (COLORFORMAT == NDSColorFormat_BGR555_Rev) ? _mm_set1_epi16(alphaBit16) : _mm_set1_epi32(alphaBit32);
+#endif
+	
+	if (CAPTURETONATIVEDST)
+	{
+		if (CAPTUREFROMNATIVESRC)
+		{
+			// Native -> native: a straight 1:1 copy of CAPTURELENGTH pixels.
+#ifdef ENABLE_SSE2
+			switch (COLORFORMAT)
+			{
+				case NDSColorFormat_BGR555_Rev:
+					MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_store_si128((__m128i *)dst + (X), _mm_or_si128( _mm_load_si128( (__m128i *)src + (X)), alpha_vec128 ) ));
+					break;
+					
+				case NDSColorFormat_BGR666_Rev:
+				case NDSColorFormat_BGR888_Rev:
+					MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u32)), _mm_store_si128((__m128i *)dst + (X), _mm_or_si128( _mm_load_si128( (__m128i *)src + (X)), alpha_vec128 ) ));
+					break;
+			}
+#else
+			for (size_t i = 0; i < CAPTURELENGTH; i++)
+			{
+				switch (COLORFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+						((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
+						break;
+						
+					case NDSColorFormat_BGR666_Rev:
+					case NDSColorFormat_BGR888_Rev:
+						((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
+						break;
+				}
+			}
+#endif
+		}
+		else
+		{
+			// Custom -> native: for each native pixel i, read the single source pixel found
+			// at index _gpuDstPitchIndex[i].
+			for (size_t i = 0; i < CAPTURELENGTH; i++)
+			{
+				switch (COLORFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+						((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[_gpuDstPitchIndex[i]] | alphaBit16);
+						break;
+						
+					case NDSColorFormat_BGR666_Rev:
+					case NDSColorFormat_BGR888_Rev:
+						((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[_gpuDstPitchIndex[i]] | alphaBit32);
+						break;
+				}
+			}
+		}
+	}
+	else
+	{
+		const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+		
+		if (CAPTUREFROMNATIVESRC)
+		{
+			// Native -> custom: write each source pixel _gpuDstPitchCount[i] times, starting
+			// at destination index _gpuDstPitchIndex[i], to build the first output line.
+			for (size_t i = 0; i < CAPTURELENGTH; i++)
+			{
+				for (size_t p = 0; p < _gpuDstPitchCount[i]; p++)
+				{
+					switch (COLORFORMAT)
+					{
+						case NDSColorFormat_BGR555_Rev:
+							((u16 *)dst)[_gpuDstPitchIndex[i] + p] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
+							break;
+							
+						case NDSColorFormat_BGR666_Rev:
+						case NDSColorFormat_BGR888_Rev:
+							((u32 *)dst)[_gpuDstPitchIndex[i] + p] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
+							break;
+					}
+				}
+			}
+			
+			// The remaining output lines are copies of the first one, spaced
+			// dispInfo.customWidth pixels apart.
+			for (size_t line = 1; line < captureLineCount; line++)
+			{
+				switch (COLORFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+						memcpy((u16 *)dst + (line * dispInfo.customWidth), dst, captureLengthExt * sizeof(u16));
+						break;
+						
+					case NDSColorFormat_BGR666_Rev:
+					case NDSColorFormat_BGR888_Rev:
+						memcpy((u32 *)dst + (line * dispInfo.customWidth), dst, captureLengthExt * sizeof(u32));
+						break;
+				}
+			}
+		}
+		else
+		{
+			if (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH)
+			{
+				// Custom -> custom at full width: all lines are processed as a single run of
+				// captureLengthExt * captureLineCount pixels.
+				const size_t pixCountExt = captureLengthExt * captureLineCount;
+				size_t i = 0;
+				
+#ifdef ENABLE_SSE2
+				switch (COLORFORMAT)
+				{
+					case NDSColorFormat_BGR555_Rev:
+					{
+						const size_t ssePixCount = pixCountExt - (pixCountExt % 8);
+						for (; i < ssePixCount; i += 8)
+						{
+							_mm_store_si128((__m128i *)((u16 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u16 *)src + i)), alpha_vec128 ) );
+						}
+						break;
+					}
+						
+					case NDSColorFormat_BGR666_Rev:
+					case NDSColorFormat_BGR888_Rev:
+					{
+						const size_t ssePixCount = pixCountExt - (pixCountExt % 4);
+						for (; i < ssePixCount; i += 4)
+						{
+							_mm_store_si128((__m128i *)((u32 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u32 *)src + i)), alpha_vec128 ) );
+						}
+						break;
+					}
+				}
+#endif
+				
+				// Scalar loop: handles the pixels left over after the vector loop, or every
+				// pixel when SSE2 is not enabled.
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+				for (; i < pixCountExt; i++)
+				{
+					switch (COLORFORMAT)
+					{
+						case NDSColorFormat_BGR555_Rev:
+							((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
+							break;
+							
+						case NDSColorFormat_BGR666_Rev:
+						case NDSColorFormat_BGR888_Rev:
+							((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
+							break;
+					}
+				}
+			}
+			else
+			{
+				// Custom -> custom at any other capture length: only captureLengthExt pixels of
+				// each line are copied, then src and dst both advance by a full
+				// dispInfo.customWidth line.
+				for (size_t line = 0; line < captureLineCount; line++)
+				{
+					size_t i = 0;
+					
+					switch (COLORFORMAT)
+					{
+						case NDSColorFormat_BGR555_Rev:
+						{
+#ifdef ENABLE_SSE2
+							const size_t ssePixCount = captureLengthExt - (captureLengthExt % 8);
+							for (; i < ssePixCount; i += 8)
+							{
+								_mm_store_si128((__m128i *)((u16 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u16 *)src + i)), alpha_vec128 ) );
+							}
+#endif
+							
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+							for (; i < captureLengthExt; i++)
+							{
+								((u16 *)dst)[i] = LE_TO_LOCAL_16(((u16 *)src)[i] | alphaBit16);
+							}
+							
+							src = (u16 *)src + dispInfo.customWidth;
+							dst = (u16 *)dst + dispInfo.customWidth;
+							break;
+						}
+							
+						case NDSColorFormat_BGR666_Rev:
+						case NDSColorFormat_BGR888_Rev:
+						{
+#ifdef ENABLE_SSE2
+							const size_t ssePixCount = captureLengthExt - (captureLengthExt % 4);
+							for (; i < ssePixCount; i += 4)
+							{
+								_mm_store_si128((__m128i *)((u32 *)dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)((u32 *)src + i)), alpha_vec128 ) );
+							}
+#endif
+							
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+							for (; i < captureLengthExt; i++)
+							{
+								((u32 *)dst)[i] = LE_TO_LOCAL_32(((u32 *)src)[i] | alphaBit32);
+							}
+							
+							src = (u32 *)src + dispInfo.customWidth;
+							dst = (u32 *)dst + dispInfo.customWidth;
+							break;
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+// Blends two 16-bit colors (5 bits per channel, bit 15 = alpha) for display capture.
+//
+// A source only contributes when its alpha bit is set. Each contributing channel is scaled
+// by its blend factor (blendEVA for srcA, blendEVB for srcB), the two products are summed,
+// divided by 16 and clamped to 31. The result carries the alpha bit if either source did.
+u16 GPUEngineA::_RenderLine_DispCapture_BlendFunc(const u16 srcA, const u16 srcB, const u8 blendEVA, const u8 blendEVB)
+{
+	const u16 sourceColor[2] = { srcA, srcB };
+	const u8 sourceWeight[2] = { blendEVA, blendEVB };
+	
+	// Weighted sums for the channels at bit offsets 0, 5 and 10.
+	u16 channelSum[3] = { 0, 0, 0 };
+	u16 outAlpha = 0;
+	
+	for (size_t s = 0; s < 2; s++)
+	{
+		if ((sourceColor[s] & 0x8000) == 0)
+		{
+			continue;
+		}
+		
+		outAlpha = 0x8000;
+		
+		for (size_t c = 0; c < 3; c++)
+		{
+			channelSum[c] += ((sourceColor[s] >> (c * 5)) & 0x001F) * sourceWeight[s];
+		}
+	}
+	
+	u16 packedColor = outAlpha;
+	
+	for (size_t c = 0; c < 3; c++)
+	{
+		u16 channel = channelSum[c] >> 4;
+		
+		//freedom wings sky will overflow while doing some fsaa/motionblur effect without this
+		if (channel > 31)
+		{
+			channel = 31;
+		}
+		
+		packedColor |= (channel << (c * 5));
+	}
+	
+	return LOCAL_TO_LE_16(packedColor);
+}
+
+// Blends two FragmentColor values for display capture.
+//
+// A source only contributes when its alpha is non-zero. Contributing channels are scaled by
+// blendEVA (srcA) or blendEVB (srcB), summed, divided by 16 and clamped to the channel
+// maximum of the format (255 for BGR888, otherwise 63). The output alpha is the format's
+// full value (0xFF for BGR888, otherwise 0x1F) if either source contributed, else 0.
+template<NDSColorFormat COLORFORMAT>
+FragmentColor GPUEngineA::_RenderLine_DispCapture_BlendFunc(const FragmentColor srcA, const FragmentColor srcB, const u8 blendEVA, const u8 blendEVB)
+{
+	const bool is888 = (COLORFORMAT == NDSColorFormat_BGR888_Rev);
+	const bool useSrcA = (srcA.a > 0);
+	const bool useSrcB = (srcB.a > 0);
+	
+	u16 sumR = 0;
+	u16 sumG = 0;
+	u16 sumB = 0;
+	
+	if (useSrcA)
+	{
+		sumR = srcA.r * blendEVA;
+		sumG = srcA.g * blendEVA;
+		sumB = srcA.b * blendEVA;
+	}
+	
+	if (useSrcB)
+	{
+		sumR += srcB.r * blendEVB;
+		sumG += srcB.g * blendEVB;
+		sumB += srcB.b * blendEVB;
+	}
+	
+	sumR >>= 4;
+	sumG >>= 4;
+	sumB >>= 4;
+	
+	FragmentColor blended;
+	blended.color = 0;
+	
+	if (useSrcA || useSrcB)
+	{
+		blended.a = (is888) ? 0xFF : 0x1F;
+	}
+	
+	//freedom wings sky will overflow while doing some fsaa/motionblur effect without this
+	if (is888)
+	{
+		blended.r = (sumR > 255) ? 255 : sumR;
+		blended.g = (sumG > 255) ? 255 : sumG;
+		blended.b = (sumB > 255) ? 255 : sumB;
+	}
+	else
+	{
+		blended.r = (sumR > 63) ? 63 : sumR;
+		blended.g = (sumG > 63) ? 63 : sumG;
+		blended.b = (sumB > 63) ? 63 : sumB;
+	}
+	
+	return blended;
+}
+
+#ifdef ENABLE_SSE2
+// Vector version of the display capture blend: blends the pixels packed in srcA and srcB
+// (eight 16-bit pixels for BGR555, four 32-bit pixels for BGR666/BGR888).
+//
+// blendEVA / blendEVB hold the blend factor for srcA / srcB in every 16-bit lane. As in the
+// scalar functions, a pixel whose alpha is zero contributes nothing, the weighted sums are
+// shifted right by 4 and the channels are clamped. The returned alpha is the bitwise OR of
+// both sources' alpha bits.
+template <NDSColorFormat COLORFORMAT>
+__m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB)
+{
+#ifdef ENABLE_SSSE3
+	// Pack both factors into each 16-bit lane (EVA in the low byte, EVB in the high byte) so
+	// that a single _mm_maddubs_epi16 computes A*EVA + B*EVB for byte-interleaved A/B values.
+	__m128i blendAB = _mm_or_si128( blendEVA, _mm_slli_epi16(blendEVB, 8) );
+#endif
+	
+	switch (COLORFORMAT)
+	{
+		case NDSColorFormat_BGR555_Rev:
+		{
+			// Zero out every pixel whose alpha bit (0x8000) is clear.
+			__m128i srcA_alpha = _mm_and_si128(srcA, _mm_set1_epi16(0x8000));
+			__m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi16(0x8000));
+			__m128i srcA_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcA_alpha, _mm_setzero_si128()), srcA );
+			__m128i srcB_masked = _mm_andnot_si128( _mm_cmpeq_epi16(srcB_alpha, _mm_setzero_si128()), srcB );
+			__m128i colorBitMask = _mm_set1_epi16(0x001F);
+			
+			__m128i ra;
+			__m128i ga;
+			__m128i ba;
+			
+#ifdef ENABLE_SSSE3
+			// Each 16-bit lane gets the A channel in its low byte and the B channel in its
+			// high byte, matching the EVA/EVB layout of blendAB.
+			ra = _mm_or_si128( _mm_and_si128(               srcA_masked,      colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 8), _mm_set1_epi16(0x1F00)) );
+			ga = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked,  5), colorBitMask), _mm_and_si128(_mm_slli_epi16(srcB_masked, 3), _mm_set1_epi16(0x1F00)) );
+			ba = _mm_or_si128( _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask), _mm_and_si128(_mm_srli_epi16(srcB_masked, 2), _mm_set1_epi16(0x1F00)) );
+			
+			ra = _mm_maddubs_epi16(ra, blendAB);
+			ga = _mm_maddubs_epi16(ga, blendAB);
+			ba = _mm_maddubs_epi16(ba, blendAB);
+#else
+			ra = _mm_and_si128(               srcA_masked,      colorBitMask);
+			ga = _mm_and_si128(_mm_srli_epi16(srcA_masked,  5), colorBitMask);
+			ba = _mm_and_si128(_mm_srli_epi16(srcA_masked, 10), colorBitMask);
+			
+			__m128i rb = _mm_and_si128(               srcB_masked,      colorBitMask);
+			__m128i gb = _mm_and_si128(_mm_srli_epi16(srcB_masked,  5), colorBitMask);
+			__m128i bb = _mm_and_si128(_mm_srli_epi16(srcB_masked, 10), colorBitMask);
+			
+			ra = _mm_add_epi16( _mm_mullo_epi16(ra, blendEVA), _mm_mullo_epi16(rb, blendEVB) );
+			ga = _mm_add_epi16( _mm_mullo_epi16(ga, blendEVA), _mm_mullo_epi16(gb, blendEVB) );
+			ba = _mm_add_epi16( _mm_mullo_epi16(ba, blendEVA), _mm_mullo_epi16(bb, blendEVB) );
+#endif
+			
+			// Divide the weighted sums by 16.
+			ra = _mm_srli_epi16(ra, 4);
+			ga = _mm_srli_epi16(ga, 4);
+			ba = _mm_srli_epi16(ba, 4);
+			
+			// Clamp every channel to the 5-bit maximum of 31.
+			ra = _mm_min_epi16(ra, colorBitMask);
+			ga = _mm_min_epi16(ga, colorBitMask);
+			ba = _mm_min_epi16(ba, colorBitMask);
+			
+			// Repack the channels and OR in the alpha bit of either source.
+			return _mm_or_si128( _mm_or_si128(_mm_or_si128(ra, _mm_slli_epi16(ga,  5)), _mm_slli_epi16(ba, 10)), _mm_or_si128(srcA_alpha, srcB_alpha) );
+		}
+			
+		case NDSColorFormat_BGR666_Rev:
+		case NDSColorFormat_BGR888_Rev:
+		{
+			// Get color masks based on if the alpha value is 0. Colors with an alpha value
+			// equal to 0 are rejected.
+			__m128i srcA_alpha = _mm_and_si128(srcA, _mm_set1_epi32(0xFF000000));
+			__m128i srcB_alpha = _mm_and_si128(srcB, _mm_set1_epi32(0xFF000000));
+			__m128i srcA_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcA_alpha, _mm_setzero_si128()), srcA);
+			__m128i srcB_masked = _mm_andnot_si128(_mm_cmpeq_epi32(srcB_alpha, _mm_setzero_si128()), srcB);
+			
+			__m128i outColorLo;
+			__m128i outColorHi;
+			__m128i outColor;
+			
+			// Temporarily convert the color component values from 8-bit to 16-bit, and then
+			// do the blend calculation.
+#ifdef ENABLE_SSSE3
+			outColorLo = _mm_unpacklo_epi8(srcA_masked, srcB_masked);
+			outColorHi = _mm_unpackhi_epi8(srcA_masked, srcB_masked);
+			
+			outColorLo = _mm_maddubs_epi16(outColorLo, blendAB);
+			outColorHi = _mm_maddubs_epi16(outColorHi, blendAB);
+#else
+			__m128i srcA_maskedLo = _mm_unpacklo_epi8(srcA_masked, _mm_setzero_si128());
+			__m128i srcA_maskedHi = _mm_unpackhi_epi8(srcA_masked, _mm_setzero_si128());
+			__m128i srcB_maskedLo = _mm_unpacklo_epi8(srcB_masked, _mm_setzero_si128());
+			__m128i srcB_maskedHi = _mm_unpackhi_epi8(srcB_masked, _mm_setzero_si128());
+			
+			outColorLo = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedLo, blendEVA), _mm_mullo_epi16(srcB_maskedLo, blendEVB) );
+			outColorHi = _mm_add_epi16( _mm_mullo_epi16(srcA_maskedHi, blendEVA), _mm_mullo_epi16(srcB_maskedHi, blendEVB) );
+#endif
+			
+			outColorLo = _mm_srli_epi16(outColorLo, 4);
+			outColorHi = _mm_srli_epi16(outColorHi, 4);
+			
+			// Convert the color components back from 16-bit to 8-bit using a saturated pack.
+			outColor = _mm_packus_epi16(outColorLo, outColorHi);
+			
+			// When the color format is 8888, the packuswb instruction will naturally clamp
+			// the color component values to 255. However, when the color format is 6665, the
+			// color component values must be clamped to 63. In this case, we must call pminub
+			// to do the clamp.
+			if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+			{
+				outColor = _mm_min_epu8(outColor, _mm_set1_epi8(63));
+			}
+			
+			// Add the alpha components back in.
+			// The blended alpha byte is discarded first; the result alpha is the OR of the
+			// two source alpha bytes.
+			outColor = _mm_and_si128(outColor, _mm_set1_epi32(0x00FFFFFF));
+			outColor = _mm_or_si128(outColor, srcA_alpha);
+			outColor = _mm_or_si128(outColor, srcB_alpha);
+			
+			return outColor;
+		}
+	}
+	// NOTE(review): the switch has no default case, so a COLORFORMAT value other than the
+	// three handled above would reach the end of the function without returning a value.
+}
+#endif
+
+// Blends `length` consecutive pixels of srcA and srcB into dst.
+//
+// Pixels are treated as FragmentColor when OUTPUTFORMAT is BGR888, and as u16 for every
+// other format. With SSE2 enabled, whole groups of pixels go through the vector blend
+// (aligned loads/stores, so the buffers must be 16-byte aligned) and the leftover pixels
+// go through the scalar blend. The parameter l is not used by this method.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length, size_t l)
+{
+#ifdef ENABLE_SSE2
+	const __m128i evaVec = _mm_set1_epi16(blendEVA);
+	const __m128i evbVec = _mm_set1_epi16(blendEVB);
+#endif
+	
+	size_t px = 0;
+	
+	if (OUTPUTFORMAT != NDSColorFormat_BGR888_Rev)
+	{
+		// 16-bit pixels, eight per vector.
+		const u16 *inA16 = (const u16 *)srcA;
+		const u16 *inB16 = (const u16 *)srcB;
+		u16 *out16 = (u16 *)dst;
+		
+#ifdef ENABLE_SSE2
+		const size_t vecEnd16 = length & ~(size_t)7;
+		while (px < vecEnd16)
+		{
+			const __m128i vecA = _mm_load_si128((__m128i *)(inA16 + px));
+			const __m128i vecB = _mm_load_si128((__m128i *)(inB16 + px));
+			const __m128i vecOut = this->_RenderLine_DispCapture_BlendFunc_SSE2<NDSColorFormat_BGR555_Rev>(vecA, vecB, evaVec, evbVec);
+			
+			_mm_store_si128((__m128i *)(out16 + px), vecOut);
+			px += 8;
+		}
+#endif
+		
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+		for (; px < length; px++)
+		{
+			out16[px] = this->_RenderLine_DispCapture_BlendFunc(inA16[px], inB16[px], blendEVA, blendEVB);
+		}
+		
+		return;
+	}
+	
+	// 32-bit pixels, four per vector.
+	const FragmentColor *inA32 = (const FragmentColor *)srcA;
+	const FragmentColor *inB32 = (const FragmentColor *)srcB;
+	FragmentColor *out32 = (FragmentColor *)dst;
+	
+#ifdef ENABLE_SSE2
+	const size_t vecEnd32 = length & ~(size_t)3;
+	while (px < vecEnd32)
+	{
+		const __m128i vecA = _mm_load_si128((__m128i *)(inA32 + px));
+		const __m128i vecB = _mm_load_si128((__m128i *)(inB32 + px));
+		const __m128i vecOut = this->_RenderLine_DispCapture_BlendFunc_SSE2<OUTPUTFORMAT>(vecA, vecB, evaVec, evbVec);
+		
+		_mm_store_si128((__m128i *)(out32 + px), vecOut);
+		px += 4;
+	}
+#endif
+	
+#ifdef ENABLE_SSE2
+#pragma LOOPVECTORIZE_DISABLE
+#endif
+	for (; px < length; px++)
+	{
+		out32[px] = this->_RenderLine_DispCapture_BlendFunc<OUTPUTFORMAT>(inA32[px], inB32[px], blendEVA, blendEVB);
+	}
+}
+
+// Blends one captured line of srcA with srcB, using the EVA/EVB factors of this->_dispCapCnt,
+// and writes the result to dst.
+//
+// Template parameters:
+//   OUTPUTFORMAT          - BGR888 selects 32-bit pixels; every other format uses 16-bit pixels.
+//   CAPTURELENGTH         - number of native pixels per captured line.
+//   CAPTUREFROMNATIVESRCA - true when srcA is laid out at native width.
+//   CAPTUREFROMNATIVESRCB - true when srcB is laid out at native width.
+//   CAPTURETONATIVEDST    - true when dst is laid out at native width.
+//
+// When writing to a native destination, a source that is not native is sampled at
+// _gpuDstPitchIndex[i] for each native pixel i. When writing to a custom-sized destination,
+// both sources are read linearly, i.e. they are expected to already be at custom size.
+//
+// NOTE(review): l is used below as an index into _gpuCaptureLineCount, but the call sites
+// visible in this file that target a custom-sized destination pass their captureLineCount
+// value in this position -- confirm which meaning is intended.
+template <NDSColorFormat OUTPUTFORMAT, size_t CAPTURELENGTH, bool CAPTUREFROMNATIVESRCA, bool CAPTUREFROMNATIVESRCB, bool CAPTURETONATIVEDST>
+void GPUEngineA::_RenderLine_DispCapture_Blend(const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt, const size_t l)
+{
+	const u8 blendEVA = this->_dispCapCnt.EVA;
+	const u8 blendEVB = this->_dispCapCnt.EVB;
+	
+	if (CAPTURETONATIVEDST)
+	{
+#ifdef ENABLE_SSE2
+		const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA);
+		const __m128i blendEVB_vec128 = _mm_set1_epi16(blendEVB);
+		
+		if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			const u32 *srcA_32 = (const u32 *)srcA;
+			const u32 *srcB_32 = (const u32 *)srcB;
+			FragmentColor *dst32 = (FragmentColor *)dst;
+			
+			// Four 32-bit pixels per iteration. A native source is loaded directly; any other
+			// source is gathered pixel by pixel through _gpuDstPitchIndex.
+			for (size_t i = 0; i < CAPTURELENGTH; i+=4)
+			{
+				__m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_32 + i)) : _mm_set_epi32(srcA_32[_gpuDstPitchIndex[i+3]],
+																														 srcA_32[_gpuDstPitchIndex[i+2]],
+																														 srcA_32[_gpuDstPitchIndex[i+1]],
+																														 srcA_32[_gpuDstPitchIndex[i+0]]);
+				
+				__m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_32 + i)) : _mm_set_epi32(srcB_32[_gpuDstPitchIndex[i+3]],
+																														 srcB_32[_gpuDstPitchIndex[i+2]],
+																														 srcB_32[_gpuDstPitchIndex[i+1]],
+																														 srcB_32[_gpuDstPitchIndex[i+0]]);
+				
+				_mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2<OUTPUTFORMAT>(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
+			}
+		}
+		else
+		{
+			const u16 *srcA_16 = (const u16 *)srcA;
+			const u16 *srcB_16 = (const u16 *)srcB;
+			u16 *dst16 = (u16 *)dst;
+			
+			// Eight 16-bit pixels per iteration, loaded or gathered as in the 32-bit case.
+			for (size_t i = 0; i < CAPTURELENGTH; i+=8)
+			{
+				__m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_16 + i)) : _mm_set_epi16(srcA_16[_gpuDstPitchIndex[i+7]],
+																														 srcA_16[_gpuDstPitchIndex[i+6]],
+																														 srcA_16[_gpuDstPitchIndex[i+5]],
+																														 srcA_16[_gpuDstPitchIndex[i+4]],
+																														 srcA_16[_gpuDstPitchIndex[i+3]],
+																														 srcA_16[_gpuDstPitchIndex[i+2]],
+																														 srcA_16[_gpuDstPitchIndex[i+1]],
+																														 srcA_16[_gpuDstPitchIndex[i+0]]);
+				
+				__m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_16 + i)) : _mm_set_epi16(srcB_16[_gpuDstPitchIndex[i+7]],
+																														 srcB_16[_gpuDstPitchIndex[i+6]],
+																														 srcB_16[_gpuDstPitchIndex[i+5]],
+																														 srcB_16[_gpuDstPitchIndex[i+4]],
+																														 srcB_16[_gpuDstPitchIndex[i+3]],
+																														 srcB_16[_gpuDstPitchIndex[i+2]],
+																														 srcB_16[_gpuDstPitchIndex[i+1]],
+																														 srcB_16[_gpuDstPitchIndex[i+0]]);
+				
+				_mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2<NDSColorFormat_BGR555_Rev>(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) );
+			}
+		}
+#else
+		// Scalar path: one pixel per iteration, with the same native / indexed source selection.
+		for (size_t i = 0; i < CAPTURELENGTH; i++)
+		{
+			if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+			{
+				const FragmentColor colorA = (CAPTUREFROMNATIVESRCA) ? ((const FragmentColor *)srcA)[i] : ((const FragmentColor *)srcA)[_gpuDstPitchIndex[i]];
+				const FragmentColor colorB = (CAPTUREFROMNATIVESRCB) ? ((const FragmentColor *)srcB)[i] : ((const FragmentColor *)srcB)[_gpuDstPitchIndex[i]];
+				
+				((FragmentColor *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc<OUTPUTFORMAT>(colorA, colorB, blendEVA, blendEVB);
+			}
+			else
+			{
+				const u16 colorA = (CAPTUREFROMNATIVESRCA) ? ((u16 *)srcA)[i] : ((u16 *)srcA)[_gpuDstPitchIndex[i]];
+				const u16 colorB = (CAPTUREFROMNATIVESRCB) ? ((u16 *)srcB)[i] : ((u16 *)srcB)[_gpuDstPitchIndex[i]];
+				
+				((u16 *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB);
+			}
+		}
+#endif
+	}
+	else
+	{
+		const size_t lineWidth = GPU->GetDisplayInfo().customWidth;
+		const size_t captureLineCount = _gpuCaptureLineCount[l];
+		
+		if (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH)
+		{
+			// Full-width capture: all lines are blended as one run of
+			// captureLengthExt * captureLineCount pixels.
+			this->_RenderLine_DispCapture_BlendToCustomDstBuffer<OUTPUTFORMAT>(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt * captureLineCount, l);
+		}
+		else
+		{
+			// Any other capture length: blend captureLengthExt pixels per line, then advance
+			// all three pointers by one full custom-width line.
+			for (size_t line = 0; line < captureLineCount; line++)
+			{
+				this->_RenderLine_DispCapture_BlendToCustomDstBuffer<OUTPUTFORMAT>(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt, l);
+				srcA = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcA + lineWidth) : (void *)((u16 *)srcA + lineWidth);
+				srcB = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcB + lineWidth) : (void *)((u16 *)srcB + lineWidth);
+				dst = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)dst + lineWidth) : (void *)((u16 *)dst + lineWidth);
+			}
+		}
+	}
+}
+
+// Outputs line l of the VRAM block selected by DISPCNT.VRAM_Block.
+//
+// If the block's line is flagged as native in isLineCaptureNative, the native VRAM data is
+// copied (BGR555) or converted (BGR666 / BGR888) into nativeBuffer. Otherwise the custom
+// VRAM data is used, and the line is marked as non-native output unless the format is
+// BGR888 and no custom size was requested.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineA::_HandleDisplayModeVRAM(const size_t l)
+{
+	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
+	const size_t blockID = DISPCNT.VRAM_Block;
+	this->VerifyVRAMLineDidChange(blockID, l);
+	
+	if (this->isLineCaptureNative[blockID][l])
+	{
+		const size_t nativeLineOffset = l * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		
+		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+		{
+			this->_LineCopy<1, true, true, 2>(this->nativeBuffer, this->_VRAMNativeBlockPtr[blockID], l);
+		}
+		else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+		{
+			const u16 *vramLine = this->_VRAMNativeBlockPtr[blockID] + nativeLineOffset;
+			u32 *outLine = (u32 *)this->nativeBuffer + nativeLineOffset;
+			ColorspaceConvertBuffer555To6665Opaque<false, false>(vramLine, outLine, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+		}
+		else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			const u16 *vramLine = this->_VRAMNativeBlockPtr[blockID] + nativeLineOffset;
+			u32 *outLine = (u32 *)this->nativeBuffer + nativeLineOffset;
+			ColorspaceConvertBuffer555To8888Opaque<false, false>(vramLine, outLine, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+		}
+		
+		return;
+	}
+	
+	// The line was captured at custom size, so read it from the custom VRAM block.
+	const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo();
+	const bool leavesNativeOutput = (OUTPUTFORMAT != NDSColorFormat_BGR888_Rev) || dispInfo.isCustomSizeRequested;
+	
+	if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		this->_LineCopy<0, true, true, 2>(this->customBuffer, this->_VRAMCustomBlockPtr[blockID], l);
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		const size_t customLineOffset = _gpuDstLineIndex[l] * dispInfo.customWidth;
+		const size_t customLinePixCount = dispInfo.customWidth * _gpuDstLineCount[l];
+		
+		const u16 *vramLine = (u16 *)this->_VRAMCustomBlockPtr[blockID] + customLineOffset;
+		u32 *outLine = (u32 *)this->customBuffer + customLineOffset;
+		ColorspaceConvertBuffer555To6665Opaque<false, false>(vramLine, outLine, customLinePixCount);
+	}
+	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		if (dispInfo.isCustomSizeRequested)
+		{
+			this->_LineCopy<0, true, true, 4>(this->customBuffer, this->_VRAMCustomBlockPtr[blockID], l);
+		}
+		else
+		{
+			this->_LineCopy<1, true, true, 4>(this->nativeBuffer, this->_VRAMCustomBlockPtr[blockID], l);
+		}
+	}
+	
+	if (leavesNativeOutput)
+	{
+		this->isLineOutputNative[l] = false;
+		this->nativeLineOutputCount--;
+	}
+}
+
+// Outputs native line l using pixels read from the main memory display FIFO.
+//
+// Every DISP_FIFOrecv() call supplies one 32-bit word holding two 15-bit colors (low half
+// first). For BGR555 output the words are stored as-is with both alpha bits set; for BGR666
+// and BGR888 output each color is converted to an opaque 32-bit pixel.
+//
+// l: index of the native line to write within nativeBuffer.
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineA::_HandleDisplayModeMainMemory(const size_t l)
+{
+	// Native rendering only.
+	//
+	//this has not been tested since the dma timing for dispfifo was changed around the time of
+	//newemuloop. it may not work.
+	
+	// Offset of line l in pixels. It must be applied to a pointer of the output pixel type
+	// (u16 for BGR555, FragmentColor for BGR666/BGR888); applying it to a u16 pointer for
+	// the 32-bit formats would place the line at half of its correct byte offset.
+	const size_t lineOffset = l * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	
+	switch (OUTPUTFORMAT)
+	{
+		case NDSColorFormat_BGR555_Rev:
+		{
+			u32 *dst = (u32 *)((u16 *)this->nativeBuffer + lineOffset);
+			
+#ifdef ENABLE_SSE2
+			const __m128i alphaBit = _mm_set1_epi16(0x8000);
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
+			{
+				// DISP_FIFOrecv() consumes a FIFO entry on every call, and C++ leaves the
+				// evaluation order of function arguments unspecified. Read the four words as
+				// separate statements so that they land in the vector in FIFO order.
+				const u32 fifoWord0 = DISP_FIFOrecv();
+				const u32 fifoWord1 = DISP_FIFOrecv();
+				const u32 fifoWord2 = DISP_FIFOrecv();
+				const u32 fifoWord3 = DISP_FIFOrecv();
+				
+				const __m128i fifoColor = _mm_setr_epi32(fifoWord0, fifoWord1, fifoWord2, fifoWord3);
+				_mm_store_si128((__m128i *)dst + i, _mm_or_si128(fifoColor, alphaBit));
+			}
+#else
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
+			{
+				dst[i] = DISP_FIFOrecv() | 0x80008000;
+			}
+#endif
+			break;
+		}
+			
+		case NDSColorFormat_BGR666_Rev:
+		{
+			FragmentColor *dst = (FragmentColor *)this->nativeBuffer + lineOffset;
+			
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
+			{
+				u32 src = DISP_FIFOrecv();
+				dst[i+0].color = COLOR555TO6665_OPAQUE((src >>  0) & 0x7FFF);
+				dst[i+1].color = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF);
+			}
+			break;
+		}
+			
+		case NDSColorFormat_BGR888_Rev:
+		{
+			FragmentColor *dst = (FragmentColor *)this->nativeBuffer + lineOffset;
+			
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
+			{
+				u32 src = DISP_FIFOrecv();
+				dst[i+0].color = COLOR555TO8888_OPAQUE((src >>  0) & 0x7FFF);
+				dst[i+1].color = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF);
+			}
+			break;
+		}
+	}
+}
+
+template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>
+void GPUEngineA::_LineLarge8bpp(GPUEngineCompositorInfo &compInfo)
+{
+	// Renders one line of the selected BG layer from an 8-bit paletted bitmap located at the
+	// layer's largeBMPAddress. Each source byte is an index into this->_paletteBG. Depending on
+	// WILLDEFERCOMPOSITING, the pixel is either stored into the deferred index/color arrays or
+	// composited immediately.
+	const u16 layerWidth = compInfo.renderState.selectedBGLayer->size.width;
+	const u16 layerHeight = compInfo.renderState.selectedBGLayer->size.height;
+	const u16 xWrapMask = (layerWidth-1);
+	const u16 yWrapMask = (layerHeight-1);
+	
+	// Source row: current native line plus the layer's Y offset, wrapped to the layer height.
+	u16 srcY = compInfo.line.indexNative + compInfo.renderState.selectedBGLayer->yOffset;
+	srcY &= yWrapMask;
+	
+	//TODO - handle wrapping / out of bounds correctly from rot_scale_op?
+	
+	const u32 rowAddress = compInfo.renderState.selectedBGLayer->largeBMPAddress + layerWidth * srcY;
+	u8 *__restrict srcRow = (u8 *)MMU_gpu_map(rowAddress);
+	
+	// NOTE(review): the loop runs over the full layer width, not the native line width --
+	// confirm that the destination arrays are large enough for that.
+	u16 srcX = compInfo.renderState.selectedBGLayer->xOffset;
+	for (size_t x = 0; x < layerWidth; ++x, ++srcX)
+	{
+		srcX &= xWrapMask;
+		
+		const u8 palIndex = srcRow[srcX];
+		const u16 palColor = LE_TO_LOCAL_16(this->_paletteBG[palIndex]);
+		
+		if (WILLDEFERCOMPOSITING)
+		{
+			this->_deferredIndexNative[x] = palIndex;
+			this->_deferredColorNative[x] = palColor;
+		}
+		else
+		{
+			// The last argument is the opacity flag: a color value of 0 is treated as not opaque.
+			this->_CompositePixelImmediate<COMPOSITORMODE, OUTPUTFORMAT, MOSAIC, WILLPERFORMWINDOWTEST>(compInfo, x, palColor, (palColor != 0));
+		}
+	}
+}
+
+void GPUEngineA::LastLineProcess()
+{
+	// Run the shared base-class processing first, then reset the display FIFO.
+	GPUEngineBase::LastLineProcess();
+	
+	DISP_FIFOreset();
+}
+
+GPUEngineB::GPUEngineB()
+{
+	// Identity of this engine and the display it targets.
+	this->_engineID = GPUEngineID_Sub;
+	this->_targetDisplayID = NDSDisplayID_Touch;
+	
+	// I/O registers for this engine start at REG_DISPB.
+	this->_IORegisterMap = (GPU_IOREG *)(&MMU.ARM9_REG[REG_DISPB]);
+	
+	// Palette and OAM pointers are offset 1KB into the ARM9 palette/OAM memory; the OBJ palette
+	// follows the BG palette by another 512 bytes.
+	this->_paletteBG = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_1KB);
+	this->_paletteOBJ = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_1KB + ADDRESS_STEP_512B);
+	this->_oamList = (OAMAttributes *)(MMU.ARM9_OAM + ADDRESS_STEP_1KB);
+	
+	this->_sprMem = MMU_BOBJ;
+}
+
+GPUEngineB::~GPUEngineB()
+{
+	// Nothing to release here.
+}
+
+GPUEngineB* GPUEngineB::Allocate()
+{
+	// Obtain storage from malloc_aligned64() and construct the engine in place.
+	// Objects created this way must be destroyed with FinalizeAndDeallocate().
+	void *engineStorage = malloc_aligned64(sizeof(GPUEngineB));
+	return new(engineStorage) GPUEngineB();
+}
+
+void GPUEngineB::FinalizeAndDeallocate()
+{
+	// Counterpart of Allocate(): run the destructor explicitly, then return the storage
+	// through free_aligned(). The object must not be used after this call.
+	GPUEngineB *const self = this;
+	
+	self->~GPUEngineB();
+	free_aligned(self);
+}
+
+void GPUEngineB::Reset()
+{
+	this->_Reset_Base();
+	
+	// Point every address of all four BG layers at MMU_BBG.
+	const size_t bgLayerIDs[4] = { GPULayerID_BG0, GPULayerID_BG1, GPULayerID_BG2, GPULayerID_BG3 };
+	
+	for (size_t i = 0; i < 4; i++)
+	{
+		const size_t layerID = bgLayerIDs[i];
+		
+		this->_BGLayer[layerID].BMPAddress = MMU_BBG;
+		this->_BGLayer[layerID].largeBMPAddress = MMU_BBG;
+		this->_BGLayer[layerID].tileMapAddress = MMU_BBG;
+		this->_BGLayer[layerID].tileEntryAddress = MMU_BBG;
+	}
+	
+	// After a reset this engine outputs to the touch display.
+	this->SetDisplayByID(NDSDisplayID_Touch);
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUEngineB::RenderLine(const size_t l)
+{
+	// Renders native line l according to the display output mode recorded in the per-line
+	// render state. Modes other than Off and Normal are ignored by this engine.
+	const GPUEngineRenderState &lineState = this->_currentCompositorInfo[l].renderState;
+	
+	if (lineState.displayOutputMode == GPUDisplayMode_Off)
+	{
+		// Display Off(Display white)
+		this->_HandleDisplayModeOff<OUTPUTFORMAT>(l);
+	}
+	else if (lineState.displayOutputMode == GPUDisplayMode_Normal)
+	{
+		// Display BG and OBJ layers. The window test is compiled in only when a window is enabled.
+		if (lineState.isAnyWindowEnabled)
+		{
+			this->_RenderLine_Layers<OUTPUTFORMAT, true>(l);
+		}
+		else
+		{
+			this->_RenderLine_Layers<OUTPUTFORMAT, false>(l);
+		}
+		
+		this->_HandleDisplayModeNormal<OUTPUTFORMAT>(l);
+	}
+}
+
+GPUSubsystem::GPUSubsystem()
+{
+	ColorspaceHandlerInit();
+	
+	// Until a client installs its own handler, events go to a default handler owned by this object.
+	this->_defaultEventHandler = new GPUEventHandlerDefault;
+	this->_event = this->_defaultEventHandler;
+	
+	gfx3d_init();
+	
+	// Create both 2D engines and attach one to each display.
+	this->_engineMain = GPUEngineA::Allocate();
+	this->_engineSub = GPUEngineB::Allocate();
+	
+	this->_display[NDSDisplayID_Main] = new NDSDisplay(NDSDisplayID_Main);
+	this->_display[NDSDisplayID_Main]->SetEngine(this->_engineMain);
+	this->_display[NDSDisplayID_Touch] = new NDSDisplay(NDSDisplayID_Touch);
+	this->_display[NDSDisplayID_Touch]->SetEngine(this->_engineSub);
+	
+	this->_videoFrameCount = 0;
+	this->_render3DFrameCount = 0;
+	this->_frameNeedsFinish = false;
+	this->_willFrameSkip = false;
+	this->_willPostprocessDisplays = true;
+	this->_willAutoResolveToCustomBuffer = true;
+	
+	//TODO OSD
+	//OSDCLASS *previousOSD = osd;
+	//osd = new OSDCLASS(-1);
+	//delete previousOSD;
+	
+	// Start out with 16-bit color and a custom size equal to the native size.
+	NDSDisplayInfo &info = this->_displayInfo;
+	info.colorFormat = NDSColorFormat_BGR555_Rev;
+	info.pixelBytes = sizeof(u16);
+	info.isCustomSizeRequested = false;
+	info.customWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	info.customHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	this->_customVRAM = NULL;
+	this->_customVRAMBlank = NULL;
+	
+	// One framebuffer page holds a native and a custom buffer for each of the two displays.
+	// Two such pages are allocated and selected through bufferIndex.
+	const size_t nativePixCount = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	const size_t nativeDisplayBytes = nativePixCount * info.pixelBytes;
+	
+	info.framebufferSize = nativeDisplayBytes * 4;
+	this->_masterFramebuffer = malloc_alignedPage(info.framebufferSize * 2);
+	
+	info.bufferIndex = 0;
+	info.masterFramebufferHead = this->_masterFramebuffer;
+	
+	// Page layout: [native main][native touch][custom main][custom touch]
+	info.masterNativeBuffer = this->_masterFramebuffer;
+	info.nativeBuffer[NDSDisplayID_Main] = info.masterNativeBuffer;
+	info.nativeBuffer[NDSDisplayID_Touch] = (u8 *)info.masterNativeBuffer + nativeDisplayBytes;
+	
+	info.masterCustomBuffer = (u8 *)this->_masterFramebuffer + (nativeDisplayBytes * 2);
+	info.customBuffer[NDSDisplayID_Main] = info.masterCustomBuffer;
+	info.customBuffer[NDSDisplayID_Touch] = (u8 *)info.masterCustomBuffer + nativeDisplayBytes;
+	
+	// Nothing has been rendered at custom size yet, so both displays report their native buffers.
+	info.didPerformCustomRender[NDSDisplayID_Main] = false;
+	info.renderedWidth[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	info.renderedHeight[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	info.renderedBuffer[NDSDisplayID_Main] = info.nativeBuffer[NDSDisplayID_Main];
+	
+	info.didPerformCustomRender[NDSDisplayID_Touch] = false;
+	info.renderedWidth[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	info.renderedHeight[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	info.renderedBuffer[NDSDisplayID_Touch] = info.nativeBuffer[NDSDisplayID_Touch];
+	
+	this->ClearWithColor(0x8000);
+}
+
+GPUSubsystem::~GPUSubsystem()
+{
+	//TODO OSD
+	//delete osd;
+	//osd = NULL;
+	
+	// Framebuffer and custom VRAM storage owned by this object.
+	free_aligned(this->_masterFramebuffer);
+	free_aligned(this->_customVRAM);
+	
+	// Destination-to-source lookup tables. The pointers are cleared after being released.
+	free_aligned(_gpuDstToSrcIndex);
+	free_aligned(_gpuDstToSrcSSSE3_u8_8e);
+	free_aligned(_gpuDstToSrcSSSE3_u8_16e);
+	free_aligned(_gpuDstToSrcSSSE3_u16_8e);
+	free_aligned(_gpuDstToSrcSSSE3_u32_4e);
+	
+	_gpuDstToSrcIndex = NULL;
+	_gpuDstToSrcSSSE3_u8_8e = NULL;
+	_gpuDstToSrcSSSE3_u8_16e = NULL;
+	_gpuDstToSrcSSSE3_u16_8e = NULL;
+	_gpuDstToSrcSSSE3_u32_4e = NULL;
+	
+	// Displays first, then the engines they were attached to.
+	delete this->_display[NDSDisplayID_Main];
+	delete this->_display[NDSDisplayID_Touch];
+	this->_engineMain->FinalizeAndDeallocate();
+	this->_engineSub->FinalizeAndDeallocate();
+	
+	gfx3d_deinit();
+	
+	delete this->_defaultEventHandler;
+}
+
+void GPUSubsystem::_UpdateFPSRender3D()
+{
+	// Counts video frames; every 60th call latches the 3D frame counter kept in gfx3d
+	// and restarts both counters.
+	this->_videoFrameCount++;
+	
+	if (this->_videoFrameCount != 60)
+	{
+		return;
+	}
+	
+	this->_render3DFrameCount = gfx3d.render3DFrameCount;
+	gfx3d.render3DFrameCount = 0;
+	this->_videoFrameCount = 0;
+}
+
+void GPUSubsystem::SetEventHandler(GPUEventHandler *eventHandler)
+{
+	// Replaces the handler that receives GPU events. The pointer is stored as given.
+	_event = eventHandler;
+}
+
+GPUEventHandler* GPUSubsystem::GetEventHandler()
+{
+	// Returns the handler currently receiving GPU events.
+	return _event;
+}
+
+void GPUSubsystem::Reset()
+{
+	NDSDisplayInfo &info = this->_displayInfo;
+	
+	// The custom VRAM is created on the first reset, using the currently configured custom size.
+	if (this->_customVRAM == NULL)
+	{
+		this->SetCustomFramebufferSize(info.customWidth, info.customHeight);
+	}
+	
+	this->_willFrameSkip = false;
+	this->_videoFrameCount = 0;
+	this->_render3DFrameCount = 0;
+	
+	this->ClearWithColor(0xFFFF);
+	
+	// Byte offsets of the touch display's buffers inside the native and custom regions.
+	const size_t nativeDisplayBytes = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * info.pixelBytes;
+	const size_t customDisplayBytes = info.customWidth * info.customHeight * info.pixelBytes;
+	
+	// Main display: first buffer of each region, reported at native size.
+	info.didPerformCustomRender[NDSDisplayID_Main] = false;
+	info.nativeBuffer[NDSDisplayID_Main] = info.masterNativeBuffer;
+	info.customBuffer[NDSDisplayID_Main] = info.masterCustomBuffer;
+	info.renderedWidth[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	info.renderedHeight[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	info.renderedBuffer[NDSDisplayID_Main] = info.nativeBuffer[NDSDisplayID_Main];
+	
+	// Touch display: second buffer of each region, reported at native size.
+	info.didPerformCustomRender[NDSDisplayID_Touch] = false;
+	info.nativeBuffer[NDSDisplayID_Touch] = (u8 *)info.masterNativeBuffer + nativeDisplayBytes;
+	info.customBuffer[NDSDisplayID_Touch] = (u8 *)info.masterCustomBuffer + customDisplayBytes;
+	info.renderedWidth[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	info.renderedHeight[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	info.renderedBuffer[NDSDisplayID_Touch] = info.nativeBuffer[NDSDisplayID_Touch];
+	
+	// Default routing: main engine on the main display, sub engine on the touch display.
+	info.engineID[NDSDisplayID_Main] = GPUEngineID_Main;
+	info.engineID[NDSDisplayID_Touch] = GPUEngineID_Sub;
+	
+	this->_display[NDSDisplayID_Main]->SetEngineByID(GPUEngineID_Main);
+	this->_display[NDSDisplayID_Touch]->SetEngineByID(GPUEngineID_Sub);
+	
+	gfx3d_reset();
+	this->_engineMain->Reset();
+	this->_engineSub->Reset();
+	
+	DISP_FIFOreset();
+
+	//historically, we reset the OSD here. maybe because we would want a clean drawing surface? anyway this is not the right point to be doing OSD work
+	//osd->clear();
+}
+
+void GPUSubsystem::ForceRender3DFinishAndFlush(bool willFlush)
+{
+	// Finish the current renderer's work, then flush it. willFlush is forwarded as both
+	// arguments of RenderFlush().
+	CurrentRenderer->RenderFinish();
+	
+	const bool flushBoth = willFlush;
+	CurrentRenderer->RenderFlush(flushBoth, flushBoth);
+}
+
+void GPUSubsystem::ForceFrameStop()
+{
+	// Complete any pending 3D rendering and notify the event handler.
+	const bool is3DRenderPending = CurrentRenderer->GetRenderNeedsFinish();
+	if (is3DRenderPending)
+	{
+		this->ForceRender3DFinishAndFlush(true);
+		CurrentRenderer->SetRenderNeedsFinish(false);
+		this->_event->DidRender3DEnd();
+	}
+	
+	// Close a frame that was begun but not yet ended. The flag is cleared before the
+	// handler is notified.
+	if (!this->_frameNeedsFinish)
+	{
+		return;
+	}
+	
+	this->_frameNeedsFinish = false;
+	this->_event->DidFrameEnd(false, this->_displayInfo);
+}
+
+bool GPUSubsystem::GetWillFrameSkip() const
+{
+	// Current value of the frame-skip flag.
+	return _willFrameSkip;
+}
+
+void GPUSubsystem::SetWillFrameSkip(const bool willFrameSkip)
+{
+	// Stores the frame-skip flag.
+	_willFrameSkip = willFrameSkip;
+}
+
+void GPUSubsystem::SetDisplayCaptureEnable()
+{
+	// Forwarded to the main engine.
+	_engineMain->SetDisplayCaptureEnable();
+}
+
+void GPUSubsystem::ResetDisplayCaptureEnable()
+{
+	// Forwarded to the main engine.
+	_engineMain->ResetDisplayCaptureEnable();
+}
+
+void GPUSubsystem::UpdateRenderProperties()
+{
+	GPUEngineA *const engineA = this->_engineMain;
+	GPUEngineB *const engineB = this->_engineSub;
+	NDSDisplayInfo &info = this->_displayInfo;
+	
+	// Mark every line of both engines as native for rendering and output.
+	engineA->nativeLineRenderCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	engineA->nativeLineOutputCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	engineB->nativeLineRenderCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	engineB->nativeLineOutputCount = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	for (size_t line = 0; line < GPU_FRAMEBUFFER_NATIVE_HEIGHT; line++)
+	{
+		engineA->isLineRenderNative[line] = true;
+		engineA->isLineOutputNative[line] = true;
+		engineB->isLineRenderNative[line] = true;
+		engineB->isLineOutputNative[line] = true;
+	}
+	
+	// Switch to the other framebuffer page and recompute the master buffer pointers for it.
+	info.bufferIndex = (info.bufferIndex + 1) & 0x01;
+	
+	const size_t nativeFramebufferSize = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * info.pixelBytes;
+	const size_t customFramebufferSize = info.customWidth * info.customHeight * info.pixelBytes;
+	
+	u8 *const pageStart = (u8 *)this->_masterFramebuffer + (info.bufferIndex * info.framebufferSize);
+	info.masterNativeBuffer = pageStart;
+	info.masterCustomBuffer = pageStart + (nativeFramebufferSize * 2);
+	
+	// Each engine gets the first or second buffer of each region depending on which display
+	// it currently targets, and starts the frame reporting its native buffer.
+	if (engineA->GetDisplayByID() == NDSDisplayID_Main)
+	{
+		engineA->nativeBuffer = info.masterNativeBuffer;
+		engineA->customBuffer = info.masterCustomBuffer;
+	}
+	else
+	{
+		engineA->nativeBuffer = (u8 *)info.masterNativeBuffer + nativeFramebufferSize;
+		engineA->customBuffer = (u8 *)info.masterCustomBuffer + customFramebufferSize;
+	}
+	engineA->renderedBuffer = engineA->nativeBuffer;
+	engineA->renderedWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	engineA->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	if (engineB->GetDisplayByID() == NDSDisplayID_Main)
+	{
+		engineB->nativeBuffer = info.masterNativeBuffer;
+		engineB->customBuffer = info.masterCustomBuffer;
+	}
+	else
+	{
+		engineB->nativeBuffer = (u8 *)info.masterNativeBuffer + nativeFramebufferSize;
+		engineB->customBuffer = (u8 *)info.masterCustomBuffer + customFramebufferSize;
+	}
+	engineB->renderedBuffer = engineB->nativeBuffer;
+	engineB->renderedWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	engineB->renderedHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	// Mirror the state of whichever engine is attached to each display into the display info.
+	const GPUEngineBase *engineOnMain = this->_display[NDSDisplayID_Main]->GetEngine();
+	info.nativeBuffer[NDSDisplayID_Main] = engineOnMain->nativeBuffer;
+	info.customBuffer[NDSDisplayID_Main] = engineOnMain->customBuffer;
+	info.renderedBuffer[NDSDisplayID_Main] = engineOnMain->renderedBuffer;
+	info.renderedWidth[NDSDisplayID_Main] = engineOnMain->renderedWidth;
+	info.renderedHeight[NDSDisplayID_Main] = engineOnMain->renderedHeight;
+	
+	const GPUEngineBase *engineOnTouch = this->_display[NDSDisplayID_Touch]->GetEngine();
+	info.nativeBuffer[NDSDisplayID_Touch] = engineOnTouch->nativeBuffer;
+	info.customBuffer[NDSDisplayID_Touch] = engineOnTouch->customBuffer;
+	info.renderedBuffer[NDSDisplayID_Touch] = engineOnTouch->renderedBuffer;
+	info.renderedWidth[NDSDisplayID_Touch] = engineOnTouch->renderedWidth;
+	info.renderedHeight[NDSDisplayID_Touch] = engineOnTouch->renderedHeight;
+	
+	info.didPerformCustomRender[NDSDisplayID_Main] = false;
+	info.didPerformCustomRender[NDSDisplayID_Touch] = false;
+	
+	// The capture line bookkeeping below is only needed for a custom size or 32-bit 888 output.
+	const bool needsCaptureLineUpdate = info.isCustomSizeRequested || (info.colorFormat == NDSColorFormat_BGR888_Rev);
+	if (!needsCaptureLineUpdate)
+	{
+		return;
+	}
+	
+	// Iterate through VRAM banks A-D and determine if they will be used for this frame.
+	for (size_t bank = 0; bank < 4; bank++)
+	{
+		if (engineA->nativeLineCaptureCount[bank] == GPU_VRAM_BLOCK_LINES)
+		{
+			continue;
+		}
+		
+		// Banks with one of these purposes are left untouched.
+		switch (vramConfiguration.banks[bank].purpose)
+		{
+			case VramConfiguration::ABG:
+			case VramConfiguration::BBG:
+			case VramConfiguration::LCDC:
+			case VramConfiguration::AOBJ:
+			case VramConfiguration::BOBJ:
+				continue;
+				
+			default:
+				break;
+		}
+		
+		// Any other purpose: flag every capture line of the bank as native again.
+		engineA->nativeLineCaptureCount[bank] = GPU_VRAM_BLOCK_LINES;
+		for (size_t line = 0; line < GPU_VRAM_BLOCK_LINES; line++)
+		{
+			engineA->isLineCaptureNative[bank][line] = true;
+		}
+	}
+}
+
+const NDSDisplayInfo& GPUSubsystem::GetDisplayInfo()
+{
+	// Read-only view of the display info maintained by this object.
+	return _displayInfo;
+}
+
+u32 GPUSubsystem::GetFPSRender3D() const
+{
+	// Most recently latched 3D frame count (see _UpdateFPSRender3D()).
+	return _render3DFrameCount;
+}
+
+GPUEngineA* GPUSubsystem::GetEngineMain()
+{
+	// The main engine; it stays owned by this object.
+	return _engineMain;
+}
+
+GPUEngineB* GPUSubsystem::GetEngineSub()
+{
+	// The sub engine; it stays owned by this object.
+	return _engineSub;
+}
+
+NDSDisplay* GPUSubsystem::GetDisplayMain()
+{
+	// Display object registered under NDSDisplayID_Main.
+	NDSDisplay *mainDisplay = this->_display[NDSDisplayID_Main];
+	return mainDisplay;
+}
+
+NDSDisplay* GPUSubsystem::GetDisplayTouch()
+{
+	// Display object registered under NDSDisplayID_Touch.
+	NDSDisplay *touchDisplay = this->_display[NDSDisplayID_Touch];
+	return touchDisplay;
+}
+
+size_t GPUSubsystem::GetCustomFramebufferWidth() const
+{
+	// Width, in pixels, of the custom framebuffer as recorded in the display info.
+	return _displayInfo.customWidth;
+}
+
+size_t GPUSubsystem::GetCustomFramebufferHeight() const
+{
+	// Height, in pixels, of the custom framebuffer as recorded in the display info.
+	return _displayInfo.customHeight;
+}
+
+void GPUSubsystem::SetCustomFramebufferSize(size_t w, size_t h)
+{
+	// Changes the custom framebuffer size to w x h pixels.
+	//
+	// Rebuilds the native-to-custom scaling tables (per-column pitch, per-line counts and the
+	// destination-to-source index maps), updates the display info and reallocates the
+	// framebuffers. Requests smaller than the native size in either dimension are ignored.
+	if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT)
+	{
+		return;
+	}
+	
+	const float customWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	const float customHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	
+	// Upper bound on how many custom lines a single native line can expand to. This is an
+	// integer count, so it is kept in an integer type.
+	const size_t newGpuLargestDstLineCount = (size_t)ceilf(customHeightScale);
+	
+	// Remember the current tables; they are released only after the new ones are installed.
+	u16 *oldGpuDstToSrcIndexPtr = _gpuDstToSrcIndex;
+	u8 *oldGpuDstToSrcSSSE3_u8_8e = _gpuDstToSrcSSSE3_u8_8e;
+	u8 *oldGpuDstToSrcSSSE3_u8_16e = _gpuDstToSrcSSSE3_u8_16e;
+	u8 *oldGpuDstToSrcSSSE3_u16_8e = _gpuDstToSrcSSSE3_u16_8e;
+	u8 *oldGpuDstToSrcSSSE3_u32_4e = _gpuDstToSrcSSSE3_u32_4e;
+	
+	// For each native column: how many custom columns it covers and where they start.
+	for (size_t srcX = 0, currentPitchCount = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; srcX++)
+	{
+		const size_t pitch = (size_t)ceilf((srcX+1) * customWidthScale) - currentPitchCount;
+		_gpuDstPitchCount[srcX] = pitch;
+		_gpuDstPitchIndex[srcX] = currentPitchCount;
+		currentPitchCount += pitch;
+	}
+	
+	// For each native line: how many custom lines it covers and where they start.
+	for (size_t srcY = 0, currentLineCount = 0; srcY < GPU_FRAMEBUFFER_NATIVE_HEIGHT; srcY++)
+	{
+		const size_t lineCount = (size_t)ceilf((srcY+1) * customHeightScale) - currentLineCount;
+		_gpuDstLineCount[srcY] = lineCount;
+		_gpuDstLineIndex[srcY] = currentLineCount;
+		currentLineCount += lineCount;
+	}
+	
+	// Same mapping for the lines of a VRAM block, plus one extra entry so that index
+	// GPU_VRAM_BLOCK_LINES gives the total custom line count of a block.
+	for (size_t srcY = 0, currentLineCount = 0; srcY < GPU_VRAM_BLOCK_LINES + 1; srcY++)
+	{
+		const size_t lineCount = (size_t)ceilf((srcY+1) * customHeightScale) - currentLineCount;
+		_gpuCaptureLineCount[srcY] = lineCount;
+		_gpuCaptureLineIndex[srcY] = currentLineCount;
+		currentLineCount += lineCount;
+	}
+	
+	// Build the per-pixel map from custom framebuffer position to native pixel index.
+	u16 *newGpuDstToSrcIndex = (u16 *)malloc_alignedCacheLine(w * h * sizeof(u16));
+	u16 *newGpuDstToSrcPtr = newGpuDstToSrcIndex;
+	for (size_t y = 0, dstIdx = 0; y < GPU_FRAMEBUFFER_NATIVE_HEIGHT; y++)
+	{
+		if (_gpuDstLineCount[y] < 1)
+		{
+			continue;
+		}
+		
+		// Fill the first custom line generated by native line y...
+		for (size_t x = 0; x < GPU_FRAMEBUFFER_NATIVE_WIDTH; x++)
+		{
+			for (size_t p = 0; p < _gpuDstPitchCount[x]; p++)
+			{
+				newGpuDstToSrcIndex[dstIdx++] = (y * GPU_FRAMEBUFFER_NATIVE_WIDTH) + x;
+			}
+		}
+		
+		// ...then replicate it for the remaining custom lines of the same native line.
+		for (size_t l = 1; l < _gpuDstLineCount[y]; l++)
+		{
+			memcpy(newGpuDstToSrcPtr + (w * l), newGpuDstToSrcPtr, w * sizeof(u16));
+		}
+		
+		newGpuDstToSrcPtr += (w * _gpuDstLineCount[y]);
+		dstIdx += (w * (_gpuDstLineCount[y] - 1));
+	}
+	
+	// Byte-index tables derived from the first custom line of the map above, one per element
+	// size / element count combination.
+	u8 *newGpuDstToSrcSSSE3_u8_8e = (u8 *)malloc_alignedCacheLine(w * sizeof(u8));
+	u8 *newGpuDstToSrcSSSE3_u8_16e = (u8 *)malloc_alignedCacheLine(w * sizeof(u8));
+	u8 *newGpuDstToSrcSSSE3_u16_8e = (u8 *)malloc_alignedCacheLine(w * sizeof(u16));
+	u8 *newGpuDstToSrcSSSE3_u32_4e = (u8 *)malloc_alignedCacheLine(w * sizeof(u32));
+	
+	for (size_t i = 0; i < w; i++)
+	{
+		// Source index reduced modulo 4, 8 and 16 elements.
+		const u8 value_u8_4 = newGpuDstToSrcIndex[i] & 0x03;
+		const u8 value_u8_8 = newGpuDstToSrcIndex[i] & 0x07;
+		const u8 value_u8_16 = newGpuDstToSrcIndex[i] & 0x0F;
+		// Byte offsets of the element for 2-byte and 4-byte elements.
+		const u8 value_u16 = (value_u8_8 << 1);
+		const u8 value_u32 = (value_u8_4 << 2);
+		
+		newGpuDstToSrcSSSE3_u8_8e[i] = value_u8_8;
+		newGpuDstToSrcSSSE3_u8_16e[i] = value_u8_16;
+		
+		newGpuDstToSrcSSSE3_u16_8e[(i << 1) + 0] = value_u16 + 0;
+		newGpuDstToSrcSSSE3_u16_8e[(i << 1) + 1] = value_u16 + 1;
+		
+		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 0] = value_u32 + 0;
+		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 1] = value_u32 + 1;
+		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 2] = value_u32 + 2;
+		newGpuDstToSrcSSSE3_u32_4e[(i << 2) + 3] = value_u32 + 3;
+	}
+	
+	// Publish the new tables.
+	_gpuLargestDstLineCount = newGpuLargestDstLineCount;
+	_gpuVRAMBlockOffset = _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w;
+	_gpuDstToSrcIndex = newGpuDstToSrcIndex;
+	_gpuDstToSrcSSSE3_u8_8e = newGpuDstToSrcSSSE3_u8_8e;
+	_gpuDstToSrcSSSE3_u8_16e = newGpuDstToSrcSSSE3_u8_16e;
+	_gpuDstToSrcSSSE3_u16_8e = newGpuDstToSrcSSSE3_u16_8e;
+	_gpuDstToSrcSSSE3_u32_4e = newGpuDstToSrcSSSE3_u32_4e;
+	
+	this->_displayInfo.isCustomSizeRequested = ( (w != GPU_FRAMEBUFFER_NATIVE_WIDTH) || (h != GPU_FRAMEBUFFER_NATIVE_HEIGHT) );
+	this->_displayInfo.customWidth = w;
+	this->_displayInfo.customHeight = h;
+	
+	if (!this->_displayInfo.isCustomSizeRequested)
+	{
+		this->_engineMain->ResetCaptureLineStates();
+	}
+	
+	// A display that was rendered at custom size reports the new custom dimensions;
+	// otherwise it reports the native dimensions.
+	if (this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main])
+	{
+		this->_displayInfo.renderedWidth[NDSDisplayID_Main] = this->_displayInfo.customWidth;
+		this->_displayInfo.renderedHeight[NDSDisplayID_Main] = this->_displayInfo.customHeight;
+	}
+	else
+	{
+		this->_displayInfo.renderedWidth[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		this->_displayInfo.renderedHeight[NDSDisplayID_Main] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	}
+	
+	if (this->_displayInfo.didPerformCustomRender[NDSDisplayID_Touch])
+	{
+		this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = this->_displayInfo.customWidth;
+		this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = this->_displayInfo.customHeight;
+	}
+	else
+	{
+		this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_WIDTH;
+		this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
+	}
+	
+	this->_AllocateFramebuffers(this->_displayInfo.colorFormat, w, h);
+	
+	// The previous tables are no longer referenced.
+	free_aligned(oldGpuDstToSrcIndexPtr);
+	free_aligned(oldGpuDstToSrcSSSE3_u8_8e);
+	free_aligned(oldGpuDstToSrcSSSE3_u8_16e);
+	free_aligned(oldGpuDstToSrcSSSE3_u16_8e);
+	free_aligned(oldGpuDstToSrcSSSE3_u32_4e);
+}
+
+void GPUSubsystem::SetColorFormat(const NDSColorFormat outputFormat)
+{
+	// Switches the output color format and reallocates the framebuffers for the new pixel size.
+	// Requesting the format that is already active does nothing.
+	NDSDisplayInfo &info = this->_displayInfo;
+	
+	if (info.colorFormat != outputFormat)
+	{
+		info.colorFormat = outputFormat;
+		
+		// BGR555 uses 16-bit pixels; every other format uses FragmentColor-sized pixels.
+		if (outputFormat == NDSColorFormat_BGR555_Rev)
+		{
+			info.pixelBytes = sizeof(u16);
+		}
+		else
+		{
+			info.pixelBytes = sizeof(FragmentColor);
+		}
+		
+		if (!info.isCustomSizeRequested)
+		{
+			this->_engineMain->ResetCaptureLineStates();
+		}
+		
+		this->_AllocateFramebuffers(info.colorFormat, info.customWidth, info.customHeight);
+	}
+}
+
+NDSColorFormat GPUSubsystem::GetColorFormat() const
+{
+	// Output color format currently recorded in the display info.
+	return _displayInfo.colorFormat;
+}
+
+void GPUSubsystem::_AllocateFramebuffers(NDSColorFormat outputFormat, size_t w, size_t h)
+{
+	// Allocates a new master framebuffer and a new custom VRAM buffer for the given output
+	// format and custom size (w x h pixels), clears them, notifies the engines and renderers
+	// of the new size, and finally releases the buffers that were replaced.
+	void *oldMasterFramebuffer = this->_masterFramebuffer;
+	void *oldCustomVRAM = this->_customVRAM;
+	
+	const size_t pixelBytes = (outputFormat == NDSColorFormat_BGR555_Rev) ? sizeof(u16) : sizeof(FragmentColor);
+	const size_t newCustomVRAMBlockSize = _gpuCaptureLineIndex[GPU_VRAM_BLOCK_LINES] * w;
+	const size_t newCustomVRAMBlankSize = _gpuLargestDstLineCount * GPU_VRAM_BLANK_REGION_LINES * w;
+	const size_t nativeFramebufferSize = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * pixelBytes;
+	const size_t customFramebufferSize = w * h * pixelBytes;
+	
+	// Custom VRAM holds four blocks followed by the blank region (sizes are in pixels).
+	const size_t newCustomVRAMPixCount = (newCustomVRAMBlockSize * 4) + newCustomVRAMBlankSize;
+	
+	void *newCustomVRAM = NULL;
+	
+	// One page = two native buffers + two custom buffers; two pages are allocated.
+	this->_displayInfo.framebufferSize = (nativeFramebufferSize * 2) + (customFramebufferSize * 2);
+	this->_masterFramebuffer = malloc_alignedPage(this->_displayInfo.framebufferSize * 2);
+	this->_displayInfo.masterFramebufferHead = this->_masterFramebuffer;
+	this->_displayInfo.masterNativeBuffer = (u8 *)this->_masterFramebuffer + (this->_displayInfo.bufferIndex * this->_displayInfo.framebufferSize);
+	this->_displayInfo.masterCustomBuffer = (u8 *)this->_masterFramebuffer + (nativeFramebufferSize * 2) + (this->_displayInfo.bufferIndex * this->_displayInfo.framebufferSize);
+	
+	// NOTE(review): these copies are taken from the engines before the engines are told about
+	// the new size further below -- confirm the engine pointers are already valid here.
+	GPUEngineBase *mainEngine  = this->_display[NDSDisplayID_Main]->GetEngine();
+	this->_displayInfo.nativeBuffer[NDSDisplayID_Main]    = mainEngine->nativeBuffer;
+	this->_displayInfo.customBuffer[NDSDisplayID_Main]    = mainEngine->customBuffer;
+	this->_displayInfo.renderedBuffer[NDSDisplayID_Main]  = mainEngine->renderedBuffer;
+	this->_displayInfo.renderedWidth[NDSDisplayID_Main]   = mainEngine->renderedWidth;
+	this->_displayInfo.renderedHeight[NDSDisplayID_Main]  = mainEngine->renderedHeight;
+	
+	GPUEngineBase *touchEngine = this->_display[NDSDisplayID_Touch]->GetEngine();
+	this->_displayInfo.nativeBuffer[NDSDisplayID_Touch]   = touchEngine->nativeBuffer;
+	this->_displayInfo.customBuffer[NDSDisplayID_Touch]   = touchEngine->customBuffer;
+	this->_displayInfo.renderedBuffer[NDSDisplayID_Touch] = touchEngine->renderedBuffer;
+	this->_displayInfo.renderedWidth[NDSDisplayID_Touch]  = touchEngine->renderedWidth;
+	this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = touchEngine->renderedHeight;
+	
+	// Allocate and zero the custom VRAM, and fill the master framebuffer with the clear value
+	// for the format. BGR555 and BGR666 both keep custom VRAM as 16-bit pixels.
+	switch (outputFormat)
+	{
+		case NDSColorFormat_BGR555_Rev:
+			newCustomVRAM = (void *)malloc_alignedCacheLine(newCustomVRAMPixCount * sizeof(u16));
+			memset(newCustomVRAM, 0, newCustomVRAMPixCount * sizeof(u16));
+			memset_u16(this->_masterFramebuffer, 0x8000, (this->_displayInfo.framebufferSize * 2) / sizeof(u16));
+			this->_customVRAM = newCustomVRAM;
+			this->_customVRAMBlank = (u16 *)newCustomVRAM + (newCustomVRAMBlockSize * 4);
+			break;
+			
+		case NDSColorFormat_BGR666_Rev:
+			newCustomVRAM = (void *)malloc_alignedCacheLine(newCustomVRAMPixCount * sizeof(u16));
+			memset(newCustomVRAM, 0, newCustomVRAMPixCount * sizeof(u16));
+			memset_u32(this->_masterFramebuffer, 0x1F000000, (this->_displayInfo.framebufferSize * 2) / sizeof(FragmentColor));
+			this->_customVRAM = newCustomVRAM;
+			this->_customVRAMBlank = (u16 *)newCustomVRAM + (newCustomVRAMBlockSize * 4);
+			break;
+			
+		case NDSColorFormat_BGR888_Rev:
+			newCustomVRAM = (void *)malloc_alignedCacheLine(newCustomVRAMPixCount * sizeof(FragmentColor));
+			memset(newCustomVRAM, 0, newCustomVRAMPixCount * sizeof(FragmentColor));
+			memset_u32(this->_masterFramebuffer, 0xFF000000, (this->_displayInfo.framebufferSize * 2) / sizeof(FragmentColor));
+			this->_customVRAM = newCustomVRAM;
+			this->_customVRAMBlank = (FragmentColor *)newCustomVRAM + (newCustomVRAMBlockSize * 4);
+			break;
+			
+		default:
+			// Unknown format: no replacement is allocated and the existing custom VRAM stays in use.
+			break;
+	}
+	
+	this->_engineMain->SetCustomFramebufferSize(w, h);
+	this->_engineSub->SetCustomFramebufferSize(w, h);
+	
+	BaseRenderer->SetFramebufferSize(w, h); // Since BaseRenderer is persistent, we need to update this manually.
+	if (CurrentRenderer != BaseRenderer)
+	{
+		CurrentRenderer->RequestColorFormat(outputFormat);
+		CurrentRenderer->SetFramebufferSize(w, h);
+	}
+	
+	free_aligned(oldMasterFramebuffer);
+	
+	// Release the previous custom VRAM only if it was actually replaced. Otherwise
+	// this->_customVRAM still points at it and freeing it would leave a dangling pointer.
+	if (newCustomVRAM != NULL)
+	{
+		free_aligned(oldCustomVRAM);
+	}
+}
+
+void* GPUSubsystem::GetCustomVRAMBuffer()
+{
+	// Start of the custom VRAM allocation (NULL until one has been allocated).
+	return _customVRAM;
+}
+
+void* GPUSubsystem::GetCustomVRAMBlankBuffer()
+{
+	// Start of the blank region that follows the four blocks inside the custom VRAM allocation.
+	return _customVRAMBlank;
+}
+
+template <NDSColorFormat COLORFORMAT>
+void* GPUSubsystem::GetCustomVRAMAddressUsingMappedAddress(const u32 mappedAddr, const size_t offset)
+{
+	// Translates a mapped VRAM address into the matching location inside the custom VRAM.
+	// The address is first converted into a 16-bit pixel index relative to MMU.ARM9_LCD.
+	// Indices beyond the four VRAM blocks resolve to the blank region.
+	const size_t pixelsPerBlock = GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES;
+	const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(mappedAddr) - MMU.ARM9_LCD) / sizeof(u16);
+	
+	if (vramPixel >= (pixelsPerBlock * 4))
+	{
+		return this->_customVRAMBlank;
+	}
+	
+	// Split the pixel index into block, line within the block, and pixel within the line.
+	const size_t blockID = vramPixel / pixelsPerBlock;
+	const size_t blockPixel = vramPixel % pixelsPerBlock;
+	const size_t blockLine = blockPixel / GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	const size_t linePixel = blockPixel % GPU_FRAMEBUFFER_NATIVE_WIDTH;
+	
+	// Pixel offset inside the custom block: scaled line start + scaled column start + caller offset.
+	const size_t customPixelOffset = (_gpuCaptureLineIndex[blockLine] * this->_displayInfo.customWidth) + _gpuDstPitchIndex[linePixel] + offset;
+	void *customBlock = (void *)this->GetEngineMain()->GetCustomVRAMBlockPtr(blockID);
+	
+	// The element size of the custom VRAM depends on the color format.
+	if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		return (void *)((FragmentColor *)customBlock + customPixelOffset);
+	}
+	
+	return (void *)((u16 *)customBlock + customPixelOffset);
+}
+
+bool GPUSubsystem::GetWillPostprocessDisplays() const
+{
+	// Current value of the display postprocessing flag.
+	return _willPostprocessDisplays;
+}
+
+void GPUSubsystem::SetWillPostprocessDisplays(const bool willPostprocess)
+{
+	// Stores the display postprocessing flag.
+	_willPostprocessDisplays = willPostprocess;
+}
+
+void GPUSubsystem::PostprocessDisplay(const NDSDisplayID displayID, NDSDisplayInfo &mutableInfo)
+{
+	// Applies the pending postprocessing steps (color format conversion and master brightness)
+	// to the rendered buffer of one display, or zero-fills that buffer if the display is
+	// disabled. Both "need" flags of the display are cleared on exit.
+	const size_t renderedPixCount = mutableInfo.renderedWidth[displayID] * mutableInfo.renderedHeight[displayID];
+	
+	if (!mutableInfo.isDisplayEnabled[displayID])
+	{
+		// Disabled display: clear the rendered buffer using the pixel size of the color format.
+		const size_t bytesPerPixel = (mutableInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? sizeof(u16) : sizeof(u32);
+		memset(mutableInfo.renderedBuffer[displayID], 0, renderedPixCount * bytesPerPixel);
+	}
+	else if (mutableInfo.colorFormat == NDSColorFormat_BGR666_Rev)
+	{
+		// The 6665 data is converted to 8888 in place when requested; master brightness is
+		// then applied using the 888 variant.
+		if (mutableInfo.needConvertColorFormat[displayID])
+		{
+			u32 *pixels = (u32 *)mutableInfo.renderedBuffer[displayID];
+			ColorspaceConvertBuffer6665To8888<false, false>(pixels, pixels, renderedPixCount);
+		}
+		
+		if (mutableInfo.needApplyMasterBrightness[displayID])
+		{
+			this->_display[displayID]->GetEngine()->ApplyMasterBrightness<NDSColorFormat_BGR888_Rev>(mutableInfo);
+		}
+	}
+	else if (mutableInfo.needApplyMasterBrightness[displayID])
+	{
+		// Other formats: apply master brightness with the variant matching the color format.
+		GPUEngineBase *engine = this->_display[displayID]->GetEngine();
+		
+		switch (mutableInfo.colorFormat)
+		{
+			case NDSColorFormat_BGR555_Rev:
+				engine->ApplyMasterBrightness<NDSColorFormat_BGR555_Rev>(mutableInfo);
+				break;
+				
+			case NDSColorFormat_BGR666_Rev:
+				engine->ApplyMasterBrightness<NDSColorFormat_BGR666_Rev>(mutableInfo);
+				break;
+				
+			case NDSColorFormat_BGR888_Rev:
+				engine->ApplyMasterBrightness<NDSColorFormat_BGR888_Rev>(mutableInfo);
+				break;
+				
+			default:
+				break;
+		}
+	}
+	
+	mutableInfo.needConvertColorFormat[displayID] = false;
+	mutableInfo.needApplyMasterBrightness[displayID] = false;
+}
+
+void GPUSubsystem::ResolveDisplayToCustomFramebuffer(const NDSDisplayID displayID, NDSDisplayInfo &mutableInfo)
+{
+	// Delegates to the engine currently attached to the requested display.
+	GPUEngineBase *attachedEngine = this->_display[displayID]->GetEngine();
+	attachedEngine->ResolveToCustomFramebuffer(mutableInfo);
+}
+
+bool GPUSubsystem::GetWillAutoResolveToCustomBuffer() const
+{
+	// Current value of the auto-resolve flag.
+	return _willAutoResolveToCustomBuffer;
+}
+
+void GPUSubsystem::SetWillAutoResolveToCustomBuffer(const bool willAutoResolve)
+{
+	// Stores the auto-resolve flag.
+	_willAutoResolveToCustomBuffer = willAutoResolve;
+}
+
+template <NDSColorFormat OUTPUTFORMAT>
+void GPUSubsystem::RenderLine(const size_t l) // Processes scanline l on both engines; line 0 refreshes render properties and line 191 publishes the frame's display info.
+{
+	if (!this->_frameNeedsFinish) // No frame is currently open: signal the start of one.
+	{
+		u8 targetBufferIndex = this->_displayInfo.bufferIndex;
+		// For a non-skipped frame starting on line 0, the index passed to DidFrameBegin() is the other of the two values (0 <-> 1).
+		if ( (l == 0) && !this->_willFrameSkip )
+		{
+			targetBufferIndex = (targetBufferIndex + 1) & 0x01;
+		}
+		
+		this->_event->DidFrameBegin(this->_willFrameSkip, targetBufferIndex, l);
+		this->_frameNeedsFinish = true;
+	}
+	
+	this->_engineMain->UpdateRenderStates(l);
+	this->_engineSub->UpdateRenderStates(l);
+	
+	const bool isDisplayCaptureNeeded = this->_engineMain->WillDisplayCapture(l);
+	const bool isFramebufferRenderNeeded[2]	= { CommonSettings.showGpu.main, CommonSettings.showGpu.sub }; // Indexed below with GPUEngineID_Main / GPUEngineID_Sub.
+	
+	if (l == 0)
+	{
+		if (!this->_willFrameSkip)
+		{
+			this->UpdateRenderProperties();
+		}
+	}
+	// The main engine renders when its output is shown or a display capture needs it; otherwise only its per-line properties are updated.
+	if ( (isFramebufferRenderNeeded[GPUEngineID_Main] || isDisplayCaptureNeeded) && !this->_willFrameSkip )
+	{
+		// GPUEngineA::WillRender3DLayer() and GPUEngineA::WillCapture3DLayerDirect() both rely on register
+		// states that might change on a per-line basis. Therefore, we need to check these states on a
+		// per-line basis as well. While most games will set up these states by line 0 and keep these
+		// states constant all the way through line 191, this may not always be the case.
+		//
+		// Test case: If a conversation occurs in Advance Wars: Dual Strike where the conversation
+		// originates from the top of the screen, the BG0 layer will only be enabled at line 46. This
+		// means that we need to check the states at that particular time to ensure that the 3D renderer
+		// finishes before we read the 3D framebuffer. Otherwise, the map will render incorrectly.
+		
+		const bool need3DCaptureFramebuffer = this->_engineMain->WillCapture3DLayerDirect(l);
+		const bool need3DDisplayFramebuffer = this->_engineMain->WillRender3DLayer() || ((OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) && need3DCaptureFramebuffer);
+		
+		if (need3DCaptureFramebuffer || need3DDisplayFramebuffer)
+		{
+			if (CurrentRenderer->GetRenderNeedsFinish()) // Finish the pending 3D render before its framebuffer is read.
+			{
+				CurrentRenderer->RenderFinish();
+				CurrentRenderer->SetRenderNeedsFinish(false);
+				this->_event->DidRender3DEnd();
+			}
+			// Each flush flag is set only if that framebuffer is needed on this line and the renderer reports a pending flush for it.
+			CurrentRenderer->RenderFlush(need3DDisplayFramebuffer && CurrentRenderer->GetRenderNeedsFlushMain(),
+			                             need3DCaptureFramebuffer && CurrentRenderer->GetRenderNeedsFlush16());
+		}
+		
+		this->_engineMain->RenderLine<OUTPUTFORMAT>(l);
+	}
+	else
+	{
+		this->_engineMain->UpdatePropertiesWithoutRender(l);
+	}
+	// The sub engine renders only when its output is shown and the frame is not being skipped.
+	if (isFramebufferRenderNeeded[GPUEngineID_Sub] && !this->_willFrameSkip)
+	{
+		this->_engineSub->RenderLine<OUTPUTFORMAT>(l);
+	}
+	else
+	{
+		this->_engineSub->UpdatePropertiesWithoutRender(l);
+	}
+	// Line 191 triggers end-of-frame work: per-engine last-line processing, then publication of the display info.
+	if (l == 191)
+	{
+		this->_engineMain->LastLineProcess();
+		this->_engineSub->LastLineProcess();
+		
+		this->_UpdateFPSRender3D();
+		
+		if (!this->_willFrameSkip)
+		{
+			if (this->_displayInfo.isCustomSizeRequested)
+			{
+				this->_engineMain->ResolveCustomRendering<OUTPUTFORMAT>();
+				this->_engineSub->ResolveCustomRendering<OUTPUTFORMAT>();
+			}
+			// A display is marked custom-rendered when its engine output fewer than GPU_FRAMEBUFFER_NATIVE_HEIGHT native lines.
+			this->_displayInfo.didPerformCustomRender[NDSDisplayID_Main] = (this->_display[NDSDisplayID_Main]->GetEngine()->nativeLineOutputCount < GPU_FRAMEBUFFER_NATIVE_HEIGHT);
+			this->_displayInfo.renderedBuffer[NDSDisplayID_Main] = this->_display[NDSDisplayID_Main]->GetEngine()->renderedBuffer;
+			this->_displayInfo.renderedWidth[NDSDisplayID_Main] = this->_display[NDSDisplayID_Main]->GetEngine()->renderedWidth;
+			this->_displayInfo.renderedHeight[NDSDisplayID_Main] = this->_display[NDSDisplayID_Main]->GetEngine()->renderedHeight;
+			
+			this->_displayInfo.didPerformCustomRender[NDSDisplayID_Touch] = (this->_display[NDSDisplayID_Touch]->GetEngine()->nativeLineOutputCount < GPU_FRAMEBUFFER_NATIVE_HEIGHT);
+			this->_displayInfo.renderedBuffer[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngine()->renderedBuffer;
+			this->_displayInfo.renderedWidth[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngine()->renderedWidth;
+			this->_displayInfo.renderedHeight[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngine()->renderedHeight;
+			
+			this->_displayInfo.engineID[NDSDisplayID_Main]  = this->_display[NDSDisplayID_Main]->GetEngineID();
+			this->_displayInfo.engineID[NDSDisplayID_Touch] = this->_display[NDSDisplayID_Touch]->GetEngineID();
+			
+			this->_displayInfo.isDisplayEnabled[NDSDisplayID_Main]  = CommonSettings.showGpu.screens[this->_displayInfo.engineID[NDSDisplayID_Main]];
+			this->_displayInfo.isDisplayEnabled[NDSDisplayID_Touch] = CommonSettings.showGpu.screens[this->_displayInfo.engineID[NDSDisplayID_Touch]];
+			// Only BGR666 output is flagged as needing color format conversion.
+			this->_displayInfo.needConvertColorFormat[NDSDisplayID_Main]  = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev);
+			this->_displayInfo.needConvertColorFormat[NDSDisplayID_Touch] = (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev);
+			
+			this->_engineMain->UpdateMasterBrightnessDisplayInfo(this->_displayInfo);
+			this->_engineSub->UpdateMasterBrightnessDisplayInfo(this->_displayInfo);
+			// Optional steps, each gated by its own subsystem flag; both displays are handled, Main first.
+			if (this->_willPostprocessDisplays)
+			{
+				this->PostprocessDisplay(NDSDisplayID_Main,  this->_displayInfo);
+				this->PostprocessDisplay(NDSDisplayID_Touch, this->_displayInfo);
+			}
+			
+			if (this->_willAutoResolveToCustomBuffer)
+			{
+				this->ResolveDisplayToCustomFramebuffer(NDSDisplayID_Main,  this->_displayInfo);
+				this->ResolveDisplayToCustomFramebuffer(NDSDisplayID_Touch, this->_displayInfo);
+			}
+		}
+		// Close the frame opened by DidFrameBegin(); this sits outside the skip check, so it also runs for skipped frames.
+		if (this->_frameNeedsFinish)
+		{
+			this->_frameNeedsFinish = false;
+			this->_event->DidFrameEnd(this->_willFrameSkip, this->_displayInfo);
+		}
+	}
+}
+
+// Fills the master framebuffer with a single color.
+//
+// colorBGRA5551 - fill color as a 16-bit value. For BGR555 output the top bit
+//                 (0x8000) is forced on; for BGR666/BGR888 output the low 15
+//                 bits are expanded with the *_OPAQUE conversion macros.
+//
+// The fill width is chosen from _displayInfo.pixelBytes (2 or 4); any other
+// value leaves the framebuffer untouched. The element count handed to the
+// fill helper is (framebufferSize * 2) / pixelBytes.
+void GPUSubsystem::ClearWithColor(const u16 colorBGRA5551)
+{
+	u16 color16 = colorBGRA5551;
+	FragmentColor color32;
+	
+	// Give the 32-bit color a defined value up front. The color format switch
+	// below only assigns it for BGR666/BGR888, yet the pixelBytes switch reads
+	// it whenever pixelBytes is 4; without this, a mismatch between the two
+	// fields would fill the framebuffer with an indeterminate value.
+	color32.color = 0;
+	
+	switch (this->_displayInfo.colorFormat)
+	{
+		case NDSColorFormat_BGR555_Rev:
+			color16 = colorBGRA5551 | 0x8000;
+			break;
+			
+		case NDSColorFormat_BGR666_Rev:
+			color32.color = COLOR555TO6665_OPAQUE(colorBGRA5551 & 0x7FFF);
+			break;
+			
+		case NDSColorFormat_BGR888_Rev:
+			color32.color = COLOR555TO8888_OPAQUE(colorBGRA5551 & 0x7FFF);
+			break;
+			
+		default:
+			break;
+	}
+	
+	// The division stays inside each case so that an unexpected pixelBytes of
+	// zero falls through to the default branch instead of dividing by zero.
+	switch (this->_displayInfo.pixelBytes)
+	{
+		case 2:
+			memset_u16(this->_masterFramebuffer, color16, (this->_displayInfo.framebufferSize * 2) / this->_displayInfo.pixelBytes);
+			break;
+			
+		case 4:
+			memset_u32(this->_masterFramebuffer, color32.color, (this->_displayInfo.framebufferSize * 2) / this->_displayInfo.pixelBytes);
+			break;
+			
+		default:
+			break;
+	}
+}
+
+// Default constructor: no client data, last fetch index 0, and the first two
+// fetch display info entries zero-filled.
+GPUClientFetchObject::GPUClientFetchObject()
+{
+	this->_lastFetchIndex = 0;
+	this->_clientData = NULL;
+	
+	for (size_t i = 0; i < 2; i++)
+	{
+		memset(&this->_fetchDisplayInfo[i], 0, sizeof(NDSDisplayInfo));
+	}
+}
+
+// Initialization hook. This implementation performs no work.
+// NOTE(review): presumably meant to be overridden by client-specific subclasses -- confirm in GPU.h.
+void GPUClientFetchObject::Init()
+{
+	// Intentionally empty; any real setup is implementation dependent.
+}
+
+// Hook that receives the current display info. This implementation ignores
+// its argument and performs no work.
+// NOTE(review): presumably meant to be overridden by client-specific subclasses -- confirm in GPU.h.
+void GPUClientFetchObject::SetFetchBuffers(const NDSDisplayInfo &currentDisplayInfo)
+{
+	// Intentionally empty; buffer setup is implementation dependent.
+	(void)currentDisplayInfo;
+}
+
+// Fetches both displays from the display info stored at the given index, then
+// records that index as the last one fetched.
+//
+// For each display (Main first, then Touch): a disabled display is skipped; an
+// enabled one goes through the custom fetch path if it was custom-rendered and
+// through the native fetch path otherwise.
+//
+// index - position in _fetchDisplayInfo; used as given, without a range check.
+void GPUClientFetchObject::FetchFromBufferIndex(const u8 index)
+{
+	const NDSDisplayID fetchOrder[2] = { NDSDisplayID_Main, NDSDisplayID_Touch };
+	
+	for (size_t i = 0; i < 2; i++)
+	{
+		const NDSDisplayID displayID = fetchOrder[i];
+		
+		if (!this->_fetchDisplayInfo[index].isDisplayEnabled[displayID])
+		{
+			continue;
+		}
+		
+		if (this->_fetchDisplayInfo[index].didPerformCustomRender[displayID])
+		{
+			this->_FetchCustomDisplayByID(displayID, index);
+		}
+		else
+		{
+			this->_FetchNativeDisplayByID(displayID, index);
+		}
+	}
+	
+	this->SetLastFetchIndex(index);
+}
+
+// Fetch path that FetchFromBufferIndex() takes for an enabled display that was
+// not custom-rendered. This implementation ignores its arguments and does nothing.
+// NOTE(review): presumably meant to be overridden by client-specific subclasses -- confirm in GPU.h.
+void GPUClientFetchObject::_FetchNativeDisplayByID(const NDSDisplayID displayID, const u8 bufferIndex)
+{
+	// Intentionally empty; the native fetch is implementation dependent.
+	(void)displayID;
+	(void)bufferIndex;
+}
+
+// Fetch path that FetchFromBufferIndex() takes for an enabled display that was
+// custom-rendered. This implementation ignores its arguments and does nothing.
+// NOTE(review): presumably meant to be overridden by client-specific subclasses -- confirm in GPU.h.
+void GPUClientFetchObject::_FetchCustomDisplayByID(const NDSDisplayID displayID, const u8 bufferIndex)
+{
+	// Intentionally empty; the custom fetch is implementation dependent.
+	(void)displayID;
+	(void)bufferIndex;
+}
+
+// Returns a read-only reference to the fetch display info stored at bufferIndex.
+// The index is used as given, without a range check.
+const NDSDisplayInfo& GPUClientFetchObject::GetFetchDisplayInfoForBufferIndex(const u8 bufferIndex) const
+{
+	const NDSDisplayInfo &storedInfo = this->_fetchDisplayInfo[bufferIndex];
+	return storedInfo;
+}
+
+// Copies displayInfo into the fetch slot selected by displayInfo.bufferIndex.
+// The index is taken from the argument itself and is not range-checked.
+void GPUClientFetchObject::SetFetchDisplayInfo(const NDSDisplayInfo &displayInfo)
+{
+	NDSDisplayInfo &targetSlot = this->_fetchDisplayInfo[displayInfo.bufferIndex];
+	targetSlot = displayInfo;
+}
+
+// Returns the index most recently stored by SetLastFetchIndex()
+// (FetchFromBufferIndex() stores one after every fetch).
+u8 GPUClientFetchObject::GetLastFetchIndex() const
+{
+	const u8 lastIndex = this->_lastFetchIndex;
+	return lastIndex;
+}
+
+// Records fetchIndex as the last fetched index. The value is stored as given.
+void GPUClientFetchObject::SetLastFetchIndex(const u8 fetchIndex)
+{
+	_lastFetchIndex = fetchIndex;
+}
+
+// Returns the opaque client data pointer held by this object (NULL until one
+// is set via SetClientData()).
+void* GPUClientFetchObject::GetClientData() const
+{
+	void *storedClientData = this->_clientData;
+	return storedClientData;
+}
+
+// Stores the opaque client data pointer. The pointer is only kept; nothing in
+// this function dereferences or frees it.
+void GPUClientFetchObject::SetClientData(void *clientData)
+{
+	_clientData = clientData;
+}
+
+// Default constructor: the display takes the Main ID and has no engine attached.
+NDSDisplay::NDSDisplay()
+{
+	this->_gpu = NULL;
+	this->_ID = NDSDisplayID_Main;
+}
+
+// Constructs a display with the given ID and no engine attached.
+NDSDisplay::NDSDisplay(const NDSDisplayID displayID)
+{
+	this->_gpu = NULL;
+	this->_ID = displayID;
+}
+
+// Constructs a display with the given ID and attaches theEngine to it. The
+// engine pointer is stored as given; the engine itself is not notified here.
+NDSDisplay::NDSDisplay(const NDSDisplayID displayID, GPUEngineBase *theEngine)
+{
+	this->_gpu = theEngine;
+	this->_ID = displayID;
+}
+
+// Returns the engine currently attached to this display; NULL if the display
+// was constructed without one and none has been set since.
+GPUEngineBase* NDSDisplay::GetEngine()
+{
+	GPUEngineBase *attachedEngine = this->_gpu;
+	return attachedEngine;
+}
+
+// Attaches theEngine to this display by storing the pointer. Unlike
+// SetEngineByID(), this does not call SetDisplayByID() on the engine.
+void NDSDisplay::SetEngine(GPUEngineBase *theEngine)
+{
+	_gpu = theEngine;
+}
+
+// Returns the ID reported by the attached engine. The engine pointer is
+// dereferenced without a NULL check, so an engine must already be attached.
+GPUEngineID NDSDisplay::GetEngineID()
+{
+	GPUEngineBase *attachedEngine = this->_gpu;
+	return attachedEngine->GetEngineID();
+}
+
+// Attaches one of the global GPU subsystem's engines to this display and tells
+// that engine which display it now drives.
+//
+// theID - GPUEngineID_Main selects the main engine; any other value selects
+//         the sub engine.
+void NDSDisplay::SetEngineByID(const GPUEngineID theID)
+{
+	if (theID == GPUEngineID_Main)
+	{
+		this->_gpu = (GPUEngineBase *)GPU->GetEngineMain();
+	}
+	else
+	{
+		this->_gpu = (GPUEngineBase *)GPU->GetEngineSub();
+	}
+	
+	this->_gpu->SetDisplayByID(this->_ID);
+}
+
+template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG0>(); // Explicit instantiations of ParseReg_BGnHOFS for layers BG0-BG3.
+template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG1>();
+template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG2>();
+template void GPUEngineBase::ParseReg_BGnHOFS<GPULayerID_BG3>();
+// Explicit instantiations of ParseReg_BGnVOFS for layers BG0-BG3.
+template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG0>();
+template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG1>();
+template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG2>();
+template void GPUEngineBase::ParseReg_BGnVOFS<GPULayerID_BG3>();
+// Explicit instantiations of ParseReg_WINnH for template arguments 0 and 1.
+template void GPUEngineBase::ParseReg_WINnH<0>();
+template void GPUEngineBase::ParseReg_WINnH<1>();
+// Explicit instantiations of ParseReg_BGnX / ParseReg_BGnY; only BG2 and BG3 are instantiated here.
+template void GPUEngineBase::ParseReg_BGnX<GPULayerID_BG2>();
+template void GPUEngineBase::ParseReg_BGnY<GPULayerID_BG2>();
+template void GPUEngineBase::ParseReg_BGnX<GPULayerID_BG3>();
+template void GPUEngineBase::ParseReg_BGnY<GPULayerID_BG3>();
+// Explicit instantiations of GPUSubsystem::RenderLine for the BGR555, BGR666 and BGR888 output formats.
+template void GPUSubsystem::RenderLine<NDSColorFormat_BGR555_Rev>(const size_t l);
+template void GPUSubsystem::RenderLine<NDSColorFormat_BGR666_Rev>(const size_t l);
+template void GPUSubsystem::RenderLine<NDSColorFormat_BGR888_Rev>(const size_t l);
diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h
index bd808e1c9..c97189310 100644
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@@ -1437,8 +1437,8 @@ protected:
 #endif
 	
 	template<bool ISDEBUGRENDER> void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);
-	template<bool ISDEBUGRENDER> void _RenderSprite256(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);
-	template<bool ISDEBUGRENDER> void _RenderSprite16(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);
+	template<bool ISDEBUGRENDER, bool ISWINDOW> void _RenderSprite256(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);
+	template<bool ISDEBUGRENDER, bool ISWINDOW> void _RenderSprite16(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha);
 	void _RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir);
 	bool _ComputeSpriteVars(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, SpriteSize &sprSize, s32 &sprX, s32 &sprY, s32 &x, s32 &y, s32 &lg, s32 &xdir);