diff --git a/README.md b/README.md
index f3212cb..2a448e6 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,9 @@ Video-related settings.
 `float brightness` - Screen lift
 * Default: `0.0`
 
+`string colorProfile` - Color correction profile. `none`, `gba`, `nds` or `nds_white`.
+* Default: `none`
+
 ### Audio
 Audio settings.
 
@@ -255,8 +258,9 @@ You may use this under the terms of the GNU General Public License GPL v3 or the
 * **MAME**
 * **No-Intro**
 * **Wolfvak, Sono and all the other people in #GodMode9 on freenode/Discord**
-* **endrift, Extrems and all the other people in #mgba on freenode**
+* **endrift, Extrems and all the other people in #mgba on Libera.Chat**
 * **Oleh Prypin (oprypin) for nightly.link**
+* **hunterk and Pokefan531 for their amazing libretro shaders**
 * ...everyone who contributed to **3dbrew.org**
 
-Copyright (C) 2021 derrek, profi200, d0k3
+Copyright (C) 2024 derrek, profi200, d0k3
\ No newline at end of file
diff --git a/include/arm11/config.h b/include/arm11/config.h
index ef928df..6d5a3cd 100644
--- a/include/arm11/config.h
+++ b/include/arm11/config.h
@@ -46,6 +46,7 @@ typedef struct
 	float lcdGamma;
 	float contrast;
 	float brightness;
+	u8 colorProfile;    // 0 = none, 1 = GBA, 2 = DS phat, 3 = DS phat white.
 
 	// [audio]
 	u8 audioOut;        // 0 = auto, 1 = speakers, 2 = headphones.
diff --git a/include/arm11/fast_frame_convert.h b/include/arm11/fast_frame_convert.h
new file mode 100644
index 0000000..c3d6b99
--- /dev/null
+++ b/include/arm11/fast_frame_convert.h
@@ -0,0 +1,33 @@
+#pragma once
+
+/*
+ *   This file is part of open_agb_firm
+ *   Copyright (C) 2024 profi200
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void convert160pFrameFast(void); // Endless color conversion loop for 160 line frames (240 pixels per line). Never returns.
+void convert240pFrameFast(void); // Endless color conversion loop for 240 line frames (360 pixels per line). Never returns.
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/include/arm11/gpu_cmd_lists.h b/include/arm11/gpu_cmd_lists.h
index 168c9ce..17b7090 100644
--- a/include/arm11/gpu_cmd_lists.h
+++ b/include/arm11/gpu_cmd_lists.h
@@ -27,6 +27,8 @@ extern "C"
 #endif
 
 #define GPU_RENDER_BUF_ADDR  (0x18180000)
+#define GPU_TEXTURE_ADDR     (0x18200000)
+#define GPU_TEXTURE2_ADDR    (0x18300000)
 #define GBA_INIT_LIST_SIZE   (1136)
 #define GBA_LIST2_SIZE       (448)
 
@@ -36,7 +38,7 @@ extern u8 gbaGpuList2[GBA_LIST2_SIZE];
 
 
 
-void patchGbaGpuCmdList(u8 scaleType);
+void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/libraries/libn3ds b/libraries/libn3ds
index f5e5e8e..6259b6b 160000
--- a/libraries/libn3ds
+++ b/libraries/libn3ds
@@ -1 +1 @@
-Subproject commit f5e5e8efb3a33959ee5489e2f13f1aecf62f1e04
+Subproject commit 6259b6b8ffe4bf82481dc93aeadbcc96738c2b9f
diff --git a/source/arm11/config.c b/source/arm11/config.c
index e8925ff..0b08bb3 100644
--- a/source/arm11/config.c
+++ b/source/arm11/config.c
@@ -36,7 +36,8 @@
                         "gbaGamma=2.2\n"          \
                         "lcdGamma=1.54\n"         \
                         "contrast=1.0\n"          \
-                        "brightness=0.0\n\n"      \
+                        "brightness=0.0\n"        \
+                        "colorProfile=none\n\n"   \
                         "[audio]\n"               \
                         "audioOut=0\n"            \
                         "volume=127\n\n"          \
@@ -61,6 +62,7 @@ OafConfig g_oafConfig =
 	1.54f, // lcdGamma
 	1.f,   // contrast
 	0.f,   // brightness
+	0,     // colorProfile
 
 	// [audio]
 	0,     // Automatic audio output.
@@ -154,6 +156,19 @@ static int cfgIniCallback(void* user, const char* section, const char* name, con
 			config->contrast = str2float(value);
 		else if(strcmp(name, "brightness") == 0)
 			config->brightness = str2float(value);
+		else if(strcmp(name, "colorProfile") == 0)
+		{
+			if(strcmp(value, "none") == 0)
+				config->colorProfile = 0;
+			else if(strcmp(value, "gba") == 0)
+				config->colorProfile = 1;
+			else if(strcmp(value, "nds") == 0)
+				config->colorProfile = 2;
+			else if(strcmp(value, "nds_white") == 0)
+				config->colorProfile = 3;
+			//else if(strcmp(value, "custom") == 0) // TODO: Implement user provided profile.
+			//	config->colorProfile = 4;
+		}
 	}
 	else if(strcmp(section, "audio") == 0)
 	{
diff --git a/source/arm11/fast_frame_convert.s b/source/arm11/fast_frame_convert.s
new file mode 100644
index 0000000..ea4b5cc
--- /dev/null
+++ b/source/arm11/fast_frame_convert.s
@@ -0,0 +1,312 @@
+@ This file is part of open_agb_firm
+@ Copyright (C) 2024 profi200
+@
+@ This program is free software: you can redistribute it and/or modify
+@ it under the terms of the GNU General Public License as published by
+@ the Free Software Foundation, either version 3 of the License, or
+@ (at your option) any later version.
+@
+@ This program is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+@ GNU General Public License for more details.
+@
+@ You should have received a copy of the GNU General Public License
+@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "asm_macros.h"
+#include "mem_map.h"
+
+.syntax unified
+.cpu mpcore
+.fpu vfpv2
+
+
+
+@ Whole frame converter.
+/*BEGIN_ASM_FUNC convertFrameFast
+	@ Load frame, output and lookup table pointers.
+	@ Our frame is in a 512x512 texture. Same for the output.
+	@ The table is a 15 to 32-bit 3D lookup table with color correction pre-applied.
+	ldr  r0, =0x18200000                @ r0 = 0x18200000;
+	ldr  r1, =0x18300000                @ r1 = 0x18300000;
+	ldr  r2, =0x1FF00000                @ r2 = 0x1FF00000;
+
+	@ Prefetch first cache line, save registers, load color mask and load 8 line counter.
+	pld  [r0]                           @ Prefetch from r0.
+	stmfd sp!, {r4-r11, lr}             @ Save registers.
+	ldrh r12, =0x7FFF                   @ r12 = 0x7FFF;
+	mov  r11, #30                       @ r11 = 30;
+
+	@ Convert 8 lines each round until we have a whole frame.
+	convertFrameFast_8l_lp:
+		@ Load size of 8 lines in bytes.
+		mov  r3, #0x1680                @ r3 = 0x1680;
+
+		@ Convert 8 pixels each round until we have 8 lines.
+		convertFrameFast_8p_lp:
+			@ Load 8 pixels from frame.
+			ldmia  r0!, {r8-r10, lr}    @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+			@ Decrement size and extract first 2 pixels.
+			subs r3,  r3, #16           @ r3 -= 16;              // Updates flags.
+			and  r4, r12,  r8, lsr #1   @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+			lsr  r5,  r8, #17           @ r5 = r8>>17;
+
+			@ Look up pixel 1 and extract pixel 3.
+			ldr  r4, [r2,  r4, lsl #2]  @ r4 = r2[r4];           // u32.
+			and  r6, r12,  r9, lsr #1   @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+			@ Look up pixel 2 and extract pixel 4.
+			ldr  r5, [r2,  r5, lsl #2]  @ r5 = r2[r5]; // u32.
+			lsr  r7,  r9, #17           @ r7 = r9>>17;
+
+			@ Look up pixel 3 and extract pixel 5.
+			ldr  r6, [r2,  r6, lsl #2]  @ r6 = r2[r6];            // u32.
+			and  r8, r12, r10, lsr #1   @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+			@ Look up pixel 4 and extract pixel 6.
+			ldr  r7, [r2,  r7, lsl #2]  @ r7 = r2[r7];  // u32.
+			lsr  r9, r10, #17           @ r9 = r10>>17;
+
+			@ Look up pixel 5 and extract pixel 7.
+			ldr  r8, [r2,  r8, lsl #2]  @ r8 = r2[r8];            // u32.
+			and r10, r12,  lr, lsr #1   @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+			@ Look up pixel 6 and extract pixel 8.
+			ldr  r9, [r2,  r9, lsl #2]  @ r9 = r2[r9]; // u32.
+			lsr  lr,  lr, #17           @ lr = lr>>17;
+
+			@ Look up pixel 7 and 8.
+			ldr r10, [r2, r10, lsl #2]  @ r10 = r2[r10]; // u32.
+			ldr  lr, [r2,  lr, lsl #2]  @ lr = r2[lr];   // u32.
+
+			@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+			pld [r0, #32]               @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+			stmia  r1!, {r4-r10, lr}    @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+			bne convertFrameFast_8p_lp  @ if(r3 != 0) goto convertFrameFast_8p_lp;
+
+		@ Decrement 8 line counter, skip texture padding and jump back if we are not done yet.
+		subs r11, r11, #1               @ r11--;        // Updates flags.
+		add   r0,  r0, #0x980           @ r0 += 0x980;
+		add   r1,  r1, #0x1300          @ r1 += 0x1300;
+		bne convertFrameFast_8l_lp      @ if(r11 != 0) goto convertFrameFast_8l_lp;
+
+	ldmfd sp!, {r4-r11, pc}             @ Restore registers and return.
+END_ASM_FUNC*/
+
+@ Converts 160p frames through the color lookup table while they are being DMAd to memory. Loops forever and never returns.
+BEGIN_ASM_FUNC convert160pFrameFast
+	@ Enable top LCD LgyCap IRQs.
+	mov  r0, #77                                   @ r0 = 77; // id     IRQ_LGYCAP_TOP.
+	mov  r1, #0                                    @ r1 = 0;  // prio   0 (highest).
+	mov  r2, #0                                    @ r2 = 0;  // target 0 (this CPU).
+	mov  r3, #0                                    @ r3 = 0;  // isr    NULL.
+	blx IRQ_registerIsr                            @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
+
+	@ We will be using IRQs without our IRQ handler to minimize latency.
+	cpsid i                                        @ __disableIrq();
+
+	@ Load lookup table address and color mask.
+	ldr   r2, =0x1FF00000                          @ r2 = 0x1FF00000;
+	ldrh r12, =0x7FFF                              @ r12 = 0x7FFF;
+
+	convert160pFrameFast_frame_lp:
+		@ Load input and output addresses.
+		ldr  r0, =0x18200000                       @ r0 = 0x18200000;    // u32.
+		@ldr  r1, =0x18300000                       @ r1 = 0x18300000;    // u32.
+		add  r1,  r0, #0x100000                    @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
+
+		@ Convert 8 lines each round until we have a whole frame. r4/r5 are clobbered by the pixel loop, hence the reload each round.
+		convert160pFrameFast_8l_lp:
+			ldr  r4, =0x10111008                   @ r4 = &REG_LGYCAP1_STAT; // u32.
+			ldr  r5, =MPCORE_PRIV_BASE             @ r5 = MPCORE_PRIV_BASE;  // u32.
+
+			convert160pFrameFast_wait_irq:
+				@ Wait for LgyCap IRQs.
+				wfi                                @ __waitForInterrupt();
+
+				@ Acknowledge IRQ and extract line number.
+				ldr r11, [r4]                      @ r11 = REG_LGYCAP_STAT; // u32.
+				ldr  r7, [r5, #0x10C]              @ r7 = REG_GICC_INTACK;  // u32.
+				str r11, [r4]                      @ REG_LGYCAP_STAT = r11; // u32.
+				lsrs r11, r11, #16                 @ r11 >>= 16;            // Updates flags.
+				str  r7, [r5, #0x110]              @ REG_GICC_EOI = r7;     // u32.
+
+				@ Ignore DREQ IRQ for line 0.
+				beq convert160pFrameFast_wait_irq      @ if(r11 == 0) goto convert160pFrameFast_wait_irq; // r11 has already been shifted above.
+
+			convert160pFrameFast_skip_irq_wait:
+			@ Load size of 8 lines in bytes.
+			mov  r3, #0xF00                        @ r3 = 0xF00;
+
+			@ Convert 8 pixels each round until we have 8 lines.
+			convert160pFrameFast_8p_lp:
+				@ Load 8 pixels from frame.
+				ldmia  r0!, {r8-r10, lr}           @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+				@ Decrement size and extract first 2 pixels.
+				subs r3,  r3, #16                  @ r3 -= 16;              // Updates flags.
+				and  r4, r12,  r8, lsr #1          @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+				lsr  r5,  r8, #17                  @ r5 = r8>>17;
+
+				@ Look up pixel 1 and extract pixel 3.
+				ldr  r4, [r2,  r4, lsl #2]         @ r4 = r2[r4];           // u32.
+				and  r6, r12,  r9, lsr #1          @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 2 and extract pixel 4.
+				ldr  r5, [r2,  r5, lsl #2]         @ r5 = r2[r5]; // u32.
+				lsr  r7,  r9, #17                  @ r7 = r9>>17;
+
+				@ Look up pixel 3 and extract pixel 5.
+				ldr  r6, [r2,  r6, lsl #2]         @ r6 = r2[r6];            // u32.
+				and  r8, r12, r10, lsr #1          @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 4 and extract pixel 6.
+				ldr  r7, [r2,  r7, lsl #2]         @ r7 = r2[r7];  // u32.
+				lsr  r9, r10, #17                  @ r9 = r10>>17;
+
+				@ Look up pixel 5 and extract pixel 7.
+				ldr  r8, [r2,  r8, lsl #2]         @ r8 = r2[r8];            // u32.
+				and r10, r12,  lr, lsr #1          @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 6 and extract pixel 8.
+				ldr  r9, [r2,  r9, lsl #2]         @ r9 = r2[r9]; // u32.
+				lsr  lr,  lr, #17                  @ lr = lr>>17;
+
+				@ Look up pixel 7 and 8.
+				ldr r10, [r2, r10, lsl #2]         @ r10 = r2[r10]; // u32.
+				ldr  lr, [r2,  lr, lsl #2]         @ lr = r2[lr];   // u32.
+
+				@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+				pld [r0, #32]                      @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+				stmia  r1!, {r4-r10, lr}           @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+				bne convert160pFrameFast_8p_lp     @ if(r3 != 0) goto convert160pFrameFast_8p_lp;
+
+			@ Test if the line number is 152 (then one more round runs without waiting for an IRQ), skip texture padding and jump back if we are not done yet.
+			cmp r11, #152                          @ r11 - 152; // Updates flags.
+			add  r0,  r0, #0x1100                  @ r0 += 0x1100;
+			add  r1,  r1, #0x2200                  @ r1 += 0x2200;
+			moveq r11, #160                        @ if(r11 == 152) r11 = 160;
+			beq convert160pFrameFast_skip_irq_wait @ if(r11 == 152) goto convert160pFrameFast_skip_irq_wait;
+			bls convert160pFrameFast_8l_lp         @ if(r11 < 152) goto convert160pFrameFast_8l_lp; // The equal case was taken by the beq above.
+
+		@ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
+		@ Note: r3 has been decremented down to 0 previously and so it's safe to use.
+		mcr p15, 0, r3, c7, c14, 0                 @ Clean and Invalidate Entire Data Cache.
+		ldr  r4, =MPCORE_PRIV_BASE                 @ r4 = MPCORE_PRIV_BASE;  // u32.
+		mov  r5, #0x10000                          @ r5 = 0x10000;
+		orr  r5,  r5, #0xF                         @ r5 |= 0xF;
+		add  r4,  r4, #0x1F00                      @ r4 += 0x1F00; // REG_GICD_SOFTINT.
+		mcr p15, 0, r3, c7, c10, 4                 @ Data Synchronization Barrier.
+		str  r5, [r4]                              @ *r4 = r5; // u32.
+		b convert160pFrameFast_frame_lp            @ goto convert160pFrameFast_frame_lp;
+END_ASM_FUNC
+
+@ Converts 240p frames through the color lookup table while they are being DMAd to memory. Loops forever and never returns.
+BEGIN_ASM_FUNC convert240pFrameFast
+	@ Enable top LCD LgyCap IRQs.
+	mov  r0, #77                                   @ r0 = 77; // id     IRQ_LGYCAP_TOP.
+	mov  r1, #0                                    @ r1 = 0;  // prio   0 (highest).
+	mov  r2, #0                                    @ r2 = 0;  // target 0 (this CPU).
+	mov  r3, #0                                    @ r3 = 0;  // isr    NULL.
+	blx IRQ_registerIsr                            @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
+
+	@ We will be using IRQs without our IRQ handler to minimize latency.
+	cpsid i                                        @ __disableIrq();
+
+	@ Load lookup table address and color mask.
+	ldr   r2, =0x1FF00000                          @ r2 = 0x1FF00000;
+	ldrh r12, =0x7FFF                              @ r12 = 0x7FFF;
+
+	convert240pFrameFast_frame_lp:
+		@ Load input and output addresses.
+		ldr  r0, =0x18200000                       @ r0 = 0x18200000;    // u32.
+		@ldr  r1, =0x18300000                       @ r1 = 0x18300000;    // u32.
+		add  r1,  r0, #0x100000                    @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
+
+		@ Convert 8 lines each round until we have a whole frame. r4/r5 are clobbered by the pixel loop, hence the reload each round.
+		convert240pFrameFast_8l_lp:
+			ldr  r4, =0x10111008                   @ r4 = &REG_LGYCAP1_STAT; // u32.
+			ldr  r5, =MPCORE_PRIV_BASE             @ r5 = MPCORE_PRIV_BASE;  // u32.
+
+			convert240pFrameFast_wait_irq:
+				@ Wait for LgyCap IRQs.
+				wfi                                @ __waitForInterrupt();
+
+				@ Acknowledge IRQ and extract line number.
+				ldr r11, [r4]                      @ r11 = REG_LGYCAP_STAT; // u32.
+				ldr  r7, [r5, #0x10C]              @ r7 = REG_GICC_INTACK;  // u32.
+				str r11, [r4]                      @ REG_LGYCAP_STAT = r11; // u32.
+				lsrs r11, r11, #16                 @ r11 >>= 16;            // Updates flags.
+				str  r7, [r5, #0x110]              @ REG_GICC_EOI = r7;     // u32.
+
+				@ Ignore DREQ IRQ for line 0.
+				beq convert240pFrameFast_wait_irq      @ if(r11 == 0) goto convert240pFrameFast_wait_irq; // r11 has already been shifted above.
+
+			convert240pFrameFast_skip_irq_wait:
+			@ Load size of 8 lines in bytes.
+			mov  r3, #0x1680                       @ r3 = 0x1680;
+
+			@ Convert 8 pixels each round until we have 8 lines.
+			convert240pFrameFast_8p_lp:
+				@ Load 8 pixels from frame.
+				ldmia  r0!, {r8-r10, lr}           @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+				@ Decrement size and extract first 2 pixels.
+				subs r3,  r3, #16                  @ r3 -= 16;              // Updates flags.
+				and  r4, r12,  r8, lsr #1          @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+				lsr  r5,  r8, #17                  @ r5 = r8>>17;
+
+				@ Look up pixel 1 and extract pixel 3.
+				ldr  r4, [r2,  r4, lsl #2]         @ r4 = r2[r4];           // u32.
+				and  r6, r12,  r9, lsr #1          @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 2 and extract pixel 4.
+				ldr  r5, [r2,  r5, lsl #2]         @ r5 = r2[r5]; // u32.
+				lsr  r7,  r9, #17                  @ r7 = r9>>17;
+
+				@ Look up pixel 3 and extract pixel 5.
+				ldr  r6, [r2,  r6, lsl #2]         @ r6 = r2[r6];            // u32.
+				and  r8, r12, r10, lsr #1          @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 4 and extract pixel 6.
+				ldr  r7, [r2,  r7, lsl #2]         @ r7 = r2[r7];  // u32.
+				lsr  r9, r10, #17                  @ r9 = r10>>17;
+
+				@ Look up pixel 5 and extract pixel 7.
+				ldr  r8, [r2,  r8, lsl #2]         @ r8 = r2[r8];            // u32.
+				and r10, r12,  lr, lsr #1          @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 6 and extract pixel 8.
+				ldr  r9, [r2,  r9, lsl #2]         @ r9 = r2[r9]; // u32.
+				lsr  lr,  lr, #17                  @ lr = lr>>17;
+
+				@ Look up pixel 7 and 8.
+				ldr r10, [r2, r10, lsl #2]         @ r10 = r2[r10]; // u32.
+				ldr  lr, [r2,  lr, lsl #2]         @ lr = r2[lr];   // u32.
+
+				@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+				pld [r0, #32]                      @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+				stmia  r1!, {r4-r10, lr}           @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+				bne convert240pFrameFast_8p_lp     @ if(r3 != 0) goto convert240pFrameFast_8p_lp;
+
+			@ Test if the line number is 232 (then one more round runs without waiting for an IRQ), skip texture padding and jump back if we are not done yet.
+			cmp r11, #232                          @ r11 - 232; // Updates flags.
+			add  r0,  r0, #0x980                   @ r0 += 0x980;
+			add  r1,  r1, #0x1300                  @ r1 += 0x1300;
+			moveq r11, #240                        @ if(r11 == 232) r11 = 240;
+			beq convert240pFrameFast_skip_irq_wait @ if(r11 == 232) goto convert240pFrameFast_skip_irq_wait;
+			bls convert240pFrameFast_8l_lp         @ if(r11 < 232) goto convert240pFrameFast_8l_lp; // The equal case was taken by the beq above.
+
+		@ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
+		@ Note: r3 has been decremented down to 0 previously and so it's safe to use.
+		mcr p15, 0, r3, c7, c14, 0                 @ Clean and Invalidate Entire Data Cache.
+		ldr  r4, =MPCORE_PRIV_BASE                 @ r4 = MPCORE_PRIV_BASE;  // u32.
+		mov  r5, #0x10000                          @ r5 = 0x10000;
+		orr  r5,  r5, #0xF                         @ r5 |= 0xF;
+		add  r4,  r4, #0x1F00                      @ r4 += 0x1F00; // REG_GICD_SOFTINT.
+		mcr p15, 0, r3, c7, c10, 4                 @ Data Synchronization Barrier.
+		str  r5, [r4]                              @ *r4 = r5; // u32.
+		b convert240pFrameFast_frame_lp            @ goto convert240pFrameFast_frame_lp;
+END_ASM_FUNC
\ No newline at end of file
diff --git a/source/arm11/gpu_cmd_lists.c b/source/arm11/gpu_cmd_lists.c
index 615febb..603077b 100644
--- a/source/arm11/gpu_cmd_lists.c
+++ b/source/arm11/gpu_cmd_lists.c
@@ -167,8 +167,16 @@ alignas(16) u8 gbaGpuList2[GBA_LIST2_SIZE] =
 
 
 
-void patchGbaGpuCmdList(u8 scaleType)
+void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture)
 {
+	if(useSecondTexture)
+	{
+		u32 tmp = GPU_TEXTURE2_ADDR>>3;
+		memcpy(&gbaGpuInitList[580], &tmp, 4);
+		tmp = 0;
+		memcpy(&gbaGpuInitList[584], &tmp, 4);
+	}
+
 	if(scaleType == 0)
 	{
 		u32 tmp = 0x4440;
@@ -215,7 +223,7 @@ void patchGbaGpuCmdList(u8 scaleType)
 		memcpy(&gbaGpuList2[316], &tmp, 4);
 		memcpy(&gbaGpuList2[380], &tmp, 4);
 	}
-	else return; // Nothing to do.
+	// else nothing to do.
 
 	flushDCacheRange(gbaGpuInitList, sizeof(gbaGpuInitList));
 	flushDCacheRange(gbaGpuList2, sizeof(gbaGpuList2));
diff --git a/source/arm11/oaf_video.c b/source/arm11/oaf_video.c
index d3446f3..b03a162 100644
--- a/source/arm11/oaf_video.c
+++ b/source/arm11/oaf_video.c
@@ -21,6 +21,7 @@
 #include "types.h"
 #include "arm11/config.h"
 #include "arm11/drivers/gx.h"
+#include "drivers/cache.h"
 #include "util.h"
 #include "oaf_error_codes.h"
 #include "arm11/drivers/lgycap.h"
@@ -31,8 +32,17 @@
 #include "fsutil.h"
 #include "kernel.h"
 #include "kevent.h"
-#include "arm11/gpu_cmd_lists.h"
 #include "arm11/drivers/hid.h"
+#include "arm11/drivers/interrupt.h"
+#include "arm11/gpu_cmd_lists.h"
+#include "system.h"
+#include "arm11/fast_frame_convert.h"
+
+
+#define COLOR_LUT_ADDR (0x1FF00000u)
+
+
+static KHandle g_convFinishedEvent = 0;
 
 
 
@@ -58,6 +68,108 @@ static void adjustGammaTableForGba(void)
 	}
 }
 
+typedef struct
+{
+	float targetGamma;  // Exponent applied to the normalized input channels (conversion to linear gamma).
+	float lum;          // Factor the linearized channels are multiplied with before the matrix is applied.
+	float  r, gr, br;   // Matrix row producing the output red channel:   r*R  + gr*G + br*B.
+	float rg,  g, bg;   // Matrix row producing the output green channel: rg*R + g*G  + bg*B.
+	float rb, gb,  b;   // Matrix row producing the output blue channel:  rb*R + gb*G + b*B.
+	float displayGamma; // Exponent applied to the corrected channels (conversion to display gamma).
+} ColorProfile;
+
+static const ColorProfile g_colorProfiles[3] =
+{
+	{ // GBA color from the libretro shaders (sRGB). Credits: hunterk and Pokefan531.
+		.targetGamma  = 2.f + 0.5f,
+		.lum          = 0.93f,
+		.r  = 0.8f,   .gr = 0.275f, .br = -0.075f,
+		.rg = 0.135f, .g  = 0.64f,  .bg = 0.225f,
+		.rb = 0.195f, .gb = 0.155f, .b  = 0.65f,
+		.displayGamma = 1.f / 2.f
+	},
+	{ // DS phat from the libretro shaders (sRGB). Credits: hunterk and Pokefan531.
+		.targetGamma  = 2.f,
+		.lum          = 1.f,
+		.r  = 0.705f,  .gr = 0.235f,  .br = -0.075f,
+		.rg = 0.09f,   .g  = 0.585f,  .bg = 0.24f,
+		.rb = 0.1075f, .gb = 0.1725f, .b  = 0.72f,
+		.displayGamma = 1.f / 2.f
+	},
+	{ // DS phat white from the libretro shaders (sRGB). Credits: hunterk and Pokefan531.
+		.targetGamma  = 2.f,
+		.lum          = 0.915f,
+		.r  = 0.815f,  .gr = 0.275f,  .br = -0.09f,
+		.rg = 0.1f,    .g  = 0.64f,   .bg = 0.26f,
+		.rb = 0.1075f, .gb = 0.1725f, .b  = 0.72f,
+		.displayGamma = 1.f / 2.f
+	}
+};
+
+ALWAYS_INLINE float clamp_float(const float x, const float min, const float max)
+{
+	if(x < min) return min; else if(x > max) return max; else return x; // x limited to [min, max]. Lower bound is tested first.
+}
+
+// Builds the color correction lookup table. For each of the 32768 possible 15-bit input colors (index = r<<10 | g<<5 | b)
+// the corrected color is written as u32 (R<<24 | G<<16 | B<<8 | 0xFF) to the table at COLOR_LUT_ADDR.
+// p is the profile (gamma exponents, luminance factor and 3x3 matrix) to apply. The table is passed to flushDCacheRange() at the end.
+static void makeColorLut(const ColorProfile *const p)
+{
+	u32 *colorLut = (u32*)COLOR_LUT_ADDR;
+	for(u32 i = 0; i < 32768; i++)
+	{
+		// Convert to 8-bit and normalize.
+		float b = (float)rgbFive2Eight(i & 31u) / 255;
+		float g = (float)rgbFive2Eight((i>>5) & 31u) / 255;
+		float r = (float)rgbFive2Eight(i>>10) / 255;
+
+		// Convert to linear gamma.
+		const float targetGamma = p->targetGamma;
+		b = powf(b, targetGamma);
+		g = powf(g, targetGamma);
+		r = powf(r, targetGamma);
+
+		// Apply luminance.
+		const float lum = p->lum;
+		b = clamp_float(b * lum, 0.f, 1.f);
+		g = clamp_float(g * lum, 0.f, 1.f);
+		r = clamp_float(r * lum, 0.f, 1.f);
+
+		/*
+		 * Correction     Input   Output
+		 * [ r][gr][br]    [r]     [r]
+		 * [rg][ g][bg]  x [g]  =  [g]
+		 * [rb][gb][ b]    [b]     [b]
+		*/
+		// Assuming no alpha channel in original calculation.
+		float newB = p->rb * r + p->gb * g + p->b * b;
+		float newG = p->rg * r + p->g * g + p->bg * b;
+		float newR = p->r * r + p->gr * g + p->br * b;
+
+		newB = (newB < 0.f ? 0.f : newB); // The matrices contain negative coefficients so a result can drop below zero.
+		newG = (newG < 0.f ? 0.f : newG);
+		newR = (newR < 0.f ? 0.f : newR);
+
+		// Convert to display gamma.
+		const float displayGamma = p->displayGamma;
+		newB = powf(newB, displayGamma);
+		newG = powf(newG, displayGamma);
+		newR = powf(newR, displayGamma);
+
+		// Denormalize, clamp, convert to ABGR8 and write lut.
+		// The clamped values are cast to u32 before shifting because 255<<24 does not fit into a signed 32-bit integer.
+		u32 tmp = 0xFF; // Alpha.
+		tmp |= (u32)clamp_s32(lroundf(newB * 255), 0, 255)<<8;
+		tmp |= (u32)clamp_s32(lroundf(newG * 255), 0, 255)<<16;
+		tmp |= (u32)clamp_s32(lroundf(newR * 255), 0, 255)<<24;
+		*colorLut++ = tmp;
+	}
+
+	// 32768 entries * 4 bytes = 128 KiB.
+	flushDCacheRange((void*)COLOR_LUT_ADDR, 1024u * 128);
+}
+
 static Result dumpFrameTex(void)
 {
 	// Stop LgyCap before dumping the frame to prevent glitches.
@@ -110,7 +222,7 @@ static Result dumpFrameTex(void)
 	// Note: This is a race with the currently displaying frame buffer
 	//       because we just swapped buffers in the gfx handler function.
 	u32 *const tmpBuf = GFX_getBuffer(GFX_LCD_TOP, GFX_SIDE_LEFT);
-	GX_displayTransfer((u32*)0x18200000, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim,
+	GX_displayTransfer((u32*)GPU_TEXTURE_ADDR, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim,
 	                   PPF_O_FMT(GX_A1BGR5) | PPF_I_FMT(GX_A1BGR5) | PPF_CROP_EN);
 	memcpy(tmpBuf, &bmpHeaders, sizeof(bmpHeaders));
 	GFX_waitForPPF();
@@ -131,6 +243,11 @@ static Result dumpFrameTex(void)
 	return res;
 }
 
+static void convFinishedHandler(UNUSED const u32 intSource) // ISR registered for IRQ_IPI15 in OAF_videoInit().
+{
+	signalEvent(g_convFinishedEvent, false); // Signals the event the frame handler task is created with when a color profile is active.
+}
+
 static void gbaGfxHandler(void *args)
 {
 	const KHandle event = (KHandle)args;
@@ -181,7 +298,7 @@ static void gbaGfxHandler(void *args)
 	taskExit();
 }
 
-static KHandle setupFrameCapture(const u8 scaler)
+static KHandle setupFrameCapture(const u8 scaler, const bool colorCorrectionEnabled)
 {
 	const bool is240x160 = scaler < 2;
 	static s16 matrix[12 * 8] =
@@ -213,7 +330,7 @@ static KHandle setupFrameCapture(const u8 scaler)
 	gbaCfg.cnt   = LGYCAP_SWIZZLE | LGYCAP_ROT_NONE | LGYCAP_FMT_A1BGR5 | (is240x160 ? 0 : LGYCAP_HSCALE_EN | LGYCAP_VSCALE_EN);
 	gbaCfg.w     = (is240x160 ? 240 : 360);
 	gbaCfg.h     = (is240x160 ? 160 : 240);
-	gbaCfg.irq   = 0;
+	gbaCfg.irq   = (colorCorrectionEnabled ? LGYCAP_IRQ_DMA_REQ : 0); // We need the DMA request IRQ for core 1.
 	gbaCfg.vLen  = 6;
 	gbaCfg.vPatt = 0b00011011;
 	memcpy(gbaCfg.vMatrix, matrix, 6 * 8 * 2);
@@ -234,13 +351,41 @@ KHandle OAF_videoInit(void)
 		GFX_powerOffBacklight(GFX_BL_BOT);
 #endif
 
-	// Initialize frame capture and frame handler.
+	// Initialize frame capture.
 	const u8 scaler = g_oafConfig.scaler;
-	const KHandle frameReadyEvent = setupFrameCapture(scaler);
-	patchGbaGpuCmdList(scaler);
-	createTask(0x800, 3, gbaGfxHandler, (void*)frameReadyEvent);
+	const u8 colorProfile = g_oafConfig.colorProfile;
+	KHandle frameReadyEvent;
+	KHandle convFinishedEvent;
+	if(colorProfile > 0)
+	{
+		// Start capture hardware and create event handles.
+		frameReadyEvent = setupFrameCapture(scaler, true);
+		convFinishedEvent = createEvent(false);
+		g_convFinishedEvent = convFinishedEvent;
 
-	// Adjust gamma table and setup button overrides.
+		// Patch GPU cmd list with texture location 2.
+		patchGbaGpuCmdList(scaler, true);
+
+		// Compute the (linear) 3D lookup table.
+		makeColorLut(&g_colorProfiles[colorProfile - 1]);
+
+		// Register IPI handler and start core 1 for color conversion.
+		IRQ_registerIsr(IRQ_IPI15, 13, 0, convFinishedHandler);
+		__systemBootCore1((scaler < 2 ? convert160pFrameFast : convert240pFrameFast));
+	}
+	else
+	{
+		// Start capture hardware.
+		frameReadyEvent = setupFrameCapture(scaler, false);
+
+		// Patch GPU cmd list with texture location 1.
+		patchGbaGpuCmdList(scaler, false);
+	}
+
+	// Start frame handler.
+	createTask(0x800, 3, gbaGfxHandler, (void*)(colorProfile > 0 ? convFinishedEvent : frameReadyEvent));
+
+	// Adjust hardware gamma table.
 	adjustGammaTableForGba();
 
 	// Load border if any exists.
@@ -265,4 +410,9 @@ void OAF_videoExit(void)
 	// frameReadyEvent deleted by this function.
 	// gbaGfxHandler() will automatically terminate.
 	LGYCAP_deinit(LGYCAP_DEV_TOP);
+	if(g_convFinishedEvent != 0)
+	{
+		deleteEvent(g_convFinishedEvent);
+		g_convFinishedEvent = 0;
+	}
 }
\ No newline at end of file
diff --git a/source/arm11/open_agb_firm.c b/source/arm11/open_agb_firm.c
index af92636..5abaad7 100644
--- a/source/arm11/open_agb_firm.c
+++ b/source/arm11/open_agb_firm.c
@@ -363,6 +363,7 @@ void oafUpdate(void)
 	CODEC_runHeadphoneDetection();
 	updateBacklight();
 	waitForEvent(g_frameReadyEvent);
+	clearEvent(g_frameReadyEvent);
 }
 
 void oafFinish(void)