diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp
index 6025d0268..b94062edc 100755
--- a/desmume/src/FIFO.cpp
+++ b/desmume/src/FIFO.cpp
@@ -374,22 +374,35 @@ static void _DISP_FIFOrecv_LineAdvance()
 
 void DISP_FIFOrecv_Line16(u16 *__restrict dst)
 {
-#ifndef ENABLE_ALTIVEC // buffer_copy_fast() doesn't support endian swapping
-	if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000)
 #ifdef USEMANUALVECTORIZATION
-		&& (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1)))
-#endif
-		)
+	if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000) && (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1))) )
 	{
+#ifdef ENABLE_ALTIVEC
+		// Big-endian systems read the pixels in their correct bit order, but swap 16-bit chunks
+		// within 32-bit lanes, and so we can't use a standard buffer copy function here.
+		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=sizeof(v128u16))
+		{
+			v128u16 fifoColor = vec_ld(i, disp_fifo.buf + disp_fifo.head);
+			fifoColor = vec_perm( fifoColor, fifoColor, ((v128u8){2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}) );
+			vec_st(fifoColor, i, dst);
+		}
+#else
 		buffer_copy_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)>(dst, disp_fifo.buf + disp_fifo.head);
+#endif // ENABLE_ALTIVEC
+		
 		_DISP_FIFOrecv_LineAdvance();
 	}
 	else
-#endif // ENABLE_ALTIVEC
+#endif // USEMANUALVECTORIZATION
 	{
 		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
 		{
-			((u32 *)dst)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv_u32() );
+			const u32 src = DISP_FIFOrecv_u32();
+#ifdef MSB_FIRST
+			((u32 *)dst)[i] = (src >> 16) | (src << 16);
+#else
+			((u32 *)dst)[i] = src;
+#endif
 		}
 	}
 }
@@ -399,7 +412,7 @@ void DISP_FIFOrecv_Line16(u16 *__restrict dst)
 template <NDSColorFormat OUTPUTFORMAT>
 void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst)
 {
-#if defined(ENABLE_ALTIVEC)
+#ifdef ENABLE_ALTIVEC
 	// Big-endian systems read the pixels in their correct bit order, but swap 16-bit chunks
 	// within 32-bit lanes, and so we can't use a standard buffer copy function here.
 	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=sizeof(v128u16))
@@ -411,7 +424,7 @@ void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst)
 	}
 #else
 	buffer_copy_or_constant_s16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16), false>(dst, disp_fifo.buf + disp_fifo.head, 0x8000);
-#endif
+#endif // ENABLE_ALTIVEC
 	
 	_DISP_FIFOrecv_LineAdvance();
 }
@@ -419,7 +432,7 @@ void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst)
 template <NDSColorFormat OUTPUTFORMAT>
 void _DISP_FIFOrecv_LineOpaque32_vec(u32 *__restrict dst)
 {
-#if defined(ENABLE_ALTIVEC)
+#ifdef ENABLE_ALTIVEC
 	for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=16, d+=32)
 	{
 		v128u16 fifoColor = vec_ld(0, disp_fifo.buf + disp_fifo.head);
@@ -456,10 +469,10 @@ void _DISP_FIFOrecv_LineOpaque32_vec(u32 *__restrict dst)
 		ColorspaceConvertBuffer555To8888Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
 	}
 	_DISP_FIFOrecv_LineAdvance();
-#endif
+#endif // ENABLE_ALTIVEC
 }
 
-#endif
+#endif // USEMANUALVECTORIZATION
 
 template <NDSColorFormat OUTPUTFORMAT>
 void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst)
diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp
index 99e480655..c53fb3dcd 100644
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -3967,9 +3967,10 @@ void GPUEngineA::_RenderLine_DisplayCaptureCustom(const IOREG_DISPCAPCNT &DISPCA
 												  const void *srcBPtr,
 												  void *dstCustomPtr)
 {
+	const u32 captureSrcBits = LOCAL_TO_LE_32(DISPCAPCNT.value) & 0x63000000;
 	const size_t captureLengthExt = (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) ? lineInfo.widthCustom : lineInfo.widthCustom / 2;
 	
-	switch (DISPCAPCNT.value & 0x63000000)
+	switch (captureSrcBits)
 	{
 		case 0x00000000: // Display only - ((DISPCAPCNT.CaptureSrc == 0) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 0))
 		case 0x02000000: // Display only - ((DISPCAPCNT.CaptureSrc == 0) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 1))
@@ -4071,6 +4072,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI
 	
 	const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
 	const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
+	const u32 captureSrcBits = LOCAL_TO_LE_32(DISPCAPCNT.value) & 0x63000000;
 	
 	const size_t writeLineIndexWithOffset = (DISPCAPCNT.VRAMWriteOffset * 64) + compInfo.line.indexNative;
 	const size_t readLineIndexWithOffset = (this->_dispCapCnt.readOffset * 64) + compInfo.line.indexNative;
@@ -4107,7 +4109,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI
 		willReadNativeVRAM = this->_isLineCaptureNative[DISPCNT.VRAM_Block][readLineIndexWithOffset];
 	}
 	
-	switch (DISPCAPCNT.value & 0x63000000)
+	switch (captureSrcBits)
 	{
 		case 0x00000000: // Display only - ((DISPCAPCNT.CaptureSrc == 0) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 0))
 		case 0x02000000: // Display only - ((DISPCAPCNT.CaptureSrc == 0) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 1))
@@ -4283,7 +4285,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI
 			srcAPtr = this->_captureWorkingDisplay16;
 		}
 		
-		switch (DISPCAPCNT.value & 0x63000000)
+		switch (captureSrcBits)
 		{
 			case 0x00000000: // Display only - ((DISPCAPCNT.CaptureSrc == 0) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 0))
 			case 0x02000000: // Display only - ((DISPCAPCNT.CaptureSrc == 0) && (DISPCAPCNT.SrcA == 0) && (DISPCAPCNT.SrcB == 1))