diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp
index 94b04d51a..29accf506 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler_AVX2.cpp
@@ -495,30 +495,36 @@ size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__re
 		
 		if (SWAP_RB)
 		{
-			src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19,   15,11, 7, 3,   29,30,28,25,   26,24,21,22,   20,17,18,16,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0) );
-			src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(10, 8, 5, 6,    4, 1, 2, 0,   31,27,23,19,   15,11, 7, 3,   29,30,28,25,   26,24,21,22,   20,17,18,16,   13,14,12, 9) );
-			src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(20,17,18,16,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0,   31,27,23,19,   15,11, 7, 3,   29,30,28,25,   26,24,21,22) );
-			src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(29,30,28,25,   26,24,21,22,   20,17,18,16,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0,   31,27,23,19,   15,11, 7, 3) );
+			src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19,   29,30,28,25,   26,24,21,22,   20,17,18,16,   15,11, 7, 3,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0) );
+			src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(31,27,23,19,   29,30,28,25,   26,24,21,22,   20,17,18,16,   15,11, 7, 3,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0) );
+			src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(31,27,23,19,   29,30,28,25,   26,24,21,22,   20,17,18,16,   15,11, 7, 3,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0) );
+			src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(31,27,23,19,   29,30,28,25,   26,24,21,22,   20,17,18,16,   15,11, 7, 3,   13,14,12, 9,   10, 8, 5, 6,    4, 1, 2, 0) );
 		}
 		else
 		{
-			src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19,   15,11, 7, 3,   28,30,29,24,   26,25,20,22,   21,16,18,17,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1) );
-			src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(10, 9, 4, 6,    5, 0, 2, 1,   31,27,23,19,   15,11, 7, 3,   28,30,29,24,   26,25,20,22,   21,16,18,17,   12,14,13, 8) );
-			src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(21,16,18,17,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1,   31,27,23,19,   15,11, 7, 3,   28,30,29,24,   26,25,20,22) );
-			src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(28,30,29,24,   26,25,20,22,   21,16,18,17,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1,   31,27,23,19,   15,11, 7, 3) );
+			src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19,   28,30,29,24,   26,25,20,22,   21,16,18,17,   15,11, 7, 3,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1) );
+			src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(31,27,23,19,   28,30,29,24,   26,25,20,22,   21,16,18,17,   15,11, 7, 3,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1) );
+			src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(31,27,23,19,   28,30,29,24,   26,25,20,22,   21,16,18,17,   15,11, 7, 3,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1) );
+			src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(31,27,23,19,   28,30,29,24,   26,25,20,22,   21,16,18,17,   15,11, 7, 3,   12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1) );
 		}
 		
+		// This is necessary because vpshufb cannot shuffle bytes across 128-bit lanes, but vpermd can move 32-bit elements across them.
+		src_v256u32[0] = _mm256_permutevar8x32_epi32( src_v256u32[0], _mm256_set_epi32(7, 3, 6, 5, 4, 2, 1, 0) );
+		src_v256u32[1] = _mm256_permutevar8x32_epi32( src_v256u32[1], _mm256_set_epi32(1, 0, 7, 3, 6, 5, 4, 2) );
+		src_v256u32[2] = _mm256_permutevar8x32_epi32( src_v256u32[2], _mm256_set_epi32(4, 2, 1, 0, 7, 3, 6, 5) );
+		src_v256u32[3] = _mm256_permutevar8x32_epi32( src_v256u32[3], _mm256_set_epi32(6, 5, 4, 2, 1, 0, 7, 3) );
+		
 		if (IS_UNALIGNED)
 		{
-			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), src_v256u32[0]) );
-			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
-			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(                 src_v256u32[3],                                                                                                                    _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
+			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
+			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
+			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
 		}
 		else
 		{
-			_mm256_store_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), src_v256u32[0]) );
-			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
-			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(                 src_v256u32[3],                                                                                                                    _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
+			_mm256_store_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
+			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
+			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
 		}
 	}
 	
@@ -550,30 +556,36 @@ size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__re
 		
 		if (SWAP_RB)
 		{
-			src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19,   15,11, 7, 3,   28,29,30,24,   25,26,20,21,   22,16,17,18,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2));
-			src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8( 9,10, 4, 5,    6, 0, 1, 2,   31,27,23,19,   15,11, 7, 3,   28,29,30,24,   25,26,20,21,   22,16,17,18,   12,13,14, 8));
-			src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(22,16,17,18,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2,   31,27,23,19,   15,11, 7, 3,   28,29,30,24,   25,26,20,21));
-			src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(28,29,30,24,   25,26,20,21,   22,16,17,18,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2,   31,27,23,19,   15,11, 7, 3));
+			src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19,   28,29,30,24,   25,26,20,21,   22,16,17,18,   15,11, 7, 3,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2));
+			src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8(31,27,23,19,   28,29,30,24,   25,26,20,21,   22,16,17,18,   15,11, 7, 3,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2));
+			src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(31,27,23,19,   28,29,30,24,   25,26,20,21,   22,16,17,18,   15,11, 7, 3,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2));
+			src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(31,27,23,19,   28,29,30,24,   25,26,20,21,   22,16,17,18,   15,11, 7, 3,   12,13,14, 8,    9,10, 4, 5,    6, 0, 1, 2));
 		}
 		else
 		{
-			src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19,   15,11, 7, 3,   30,29,28,26,   25,24,22,21,   20,18,17,16,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0));
-			src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8( 9, 8, 6, 5,    4, 2, 1, 0,   31,27,23,19,   15,11, 7, 3,   30,29,28,26,   25,24,22,21,   20,18,17,16,   14,13,12,10));
-			src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(20,18,17,16,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0,   31,27,23,19,   15,11, 7, 3,   30,29,28,26,   25,24,22,21));
-			src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(30,29,28,26,   25,24,22,21,   20,18,17,16,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0,   31,27,23,19,   15,11, 7, 3));
+			src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19,   30,29,28,26,   25,24,22,21,   20,18,17,16,   15,11, 7, 3,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0));
+			src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8(31,27,23,19,   30,29,28,26,   25,24,22,21,   20,18,17,16,   15,11, 7, 3,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0));
+			src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(31,27,23,19,   30,29,28,26,   25,24,22,21,   20,18,17,16,   15,11, 7, 3,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0));
+			src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(31,27,23,19,   30,29,28,26,   25,24,22,21,   20,18,17,16,   15,11, 7, 3,   14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0));
 		}
 		
+		// This is necessary because vpshufb cannot shuffle bytes across 128-bit lanes, but vpermd can move 32-bit elements across them.
+		src_v256u32[0] = _mm256_permutevar8x32_epi32( src_v256u32[0], _mm256_set_epi32(7, 3, 6, 5, 4, 2, 1, 0) );
+		src_v256u32[1] = _mm256_permutevar8x32_epi32( src_v256u32[1], _mm256_set_epi32(1, 0, 7, 3, 6, 5, 4, 2) );
+		src_v256u32[2] = _mm256_permutevar8x32_epi32( src_v256u32[2], _mm256_set_epi32(4, 2, 1, 0, 7, 3, 6, 5) );
+		src_v256u32[3] = _mm256_permutevar8x32_epi32( src_v256u32[3], _mm256_set_epi32(6, 5, 4, 2, 1, 0, 7, 3) );
+		
 		if (IS_UNALIGNED)
 		{
-			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[0], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
-			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
-			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(_mm256_and_si256(src_v256u32[3], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
+			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
+			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
+			_mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
 		}
 		else
 		{
-			_mm256_store_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_or_si256(_mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[0], _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
-			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_or_si256(_mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[1], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
-			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_or_si256(_mm256_and_si256(src_v256u32[3], _mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm256_and_si256(src_v256u32[2], _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
+			_mm256_store_si256( (v256u8 *)(dst + (i * 3) +  0), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) );
+			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 32), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) );
+			_mm256_store_si256( (v256u8 *)(dst + (i * 3) + 64), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) );
 		}
 	}
 	
diff --git a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp
index f95c988a6..dc1d65056 100644
--- a/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp
+++ b/desmume/src/utils/colorspacehandler/colorspacehandler_SSE2.cpp
@@ -550,18 +550,33 @@ size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__r
 			src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(12,14,13, 8,   10, 9, 4, 6,    5, 0, 2, 1,   15,11, 7, 3) );
 		}
 		
+#ifdef ENABLE_SSE4_1
 		if (IS_UNALIGNED)
 		{
-			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
+		}
+		else
+		{
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
+		}
+#else
+		if (IS_UNALIGNED)
+		{
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)),               src_v128u32[0]) );
 			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
 			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(              src_v128u32[3],                                                                 _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
 		}
 		else
 		{
-			_mm_store_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) );
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)),               src_v128u32[0]) );
 			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
 			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(              src_v128u32[3],                                                                 _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
 		}
+#endif
 	}
 	
 	return i;
@@ -605,6 +620,20 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r
 			src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(14,13,12,10,    9, 8, 6, 5,    4, 2, 1, 0,   15,11, 7, 3));
 		}
 		
+#ifdef ENABLE_SSE4_1
+		if (IS_UNALIGNED)
+		{
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
+			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
+		}
+		else
+		{
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) );
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) );
+			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) );
+		}
+#else
 		if (IS_UNALIGNED)
 		{
 			_mm_storeu_si128( (v128u8 *)(dst + (i * 3) +  0), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) );
@@ -617,6 +646,7 @@ size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__r
 			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 16), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) );
 			_mm_store_si128( (v128u8 *)(dst + (i * 3) + 32), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) );
 		}
+#endif
 	}
 	
 	return i;