From b1c66e24a27e5c6ba4b40963ce7c96f0c1c646b7 Mon Sep 17 00:00:00 2001
From: profi200 <fd3194@gmx.de>
Date: Tue, 23 Jul 2024 00:23:14 +0200
Subject: [PATCH] Added experimental support for true color correction to mimic
 the look of the crappy GBA LCD. Thanks to hunterk and Pokefan531 for their
 work on the libretro shaders this is based on.

---
 README.md                          |   8 +-
 include/arm11/config.h             |   1 +
 include/arm11/fast_frame_convert.h |  33 +++
 include/arm11/gpu_cmd_lists.h      |   4 +-
 libraries/libn3ds                  |   2 +-
 source/arm11/config.c              |  17 +-
 source/arm11/fast_frame_convert.s  | 312 +++++++++++++++++++++++++++++
 source/arm11/gpu_cmd_lists.c       |  12 +-
 source/arm11/oaf_video.c           | 168 +++++++++++++++-
 source/arm11/open_agb_firm.c       |   1 +
 10 files changed, 542 insertions(+), 16 deletions(-)
 create mode 100644 include/arm11/fast_frame_convert.h
 create mode 100644 source/arm11/fast_frame_convert.s

diff --git a/README.md b/README.md
index f3212cb..2a448e6 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,9 @@ Video-related settings.
 `float brightness` - Screen lift
 * Default: `0.0`
 
+`string colorProfile` - Color correction profile. One of `none`, `gba` (GBA), `nds` (DS phat) or `nds_white` (DS phat white).
+* Default: `none`
+
 ### Audio
 Audio settings.
 
@@ -255,8 +258,9 @@ You may use this under the terms of the GNU General Public License GPL v3 or the
 * **MAME**
 * **No-Intro**
 * **Wolfvak, Sono and all the other people in #GodMode9 on freenode/Discord**
-* **endrift, Extrems and all the other people in #mgba on freenode**
+* **endrift, Extrems and all the other people in #mgba on Libera.Chat**
 * **Oleh Prypin (oprypin) for nightly.link**
+* **hunterk and Pokefan531 for their amazing libretro shaders**
 * ...everyone who contributed to **3dbrew.org**
 
-Copyright (C) 2021 derrek, profi200, d0k3
+Copyright (C) 2024 derrek, profi200, d0k3
\ No newline at end of file
diff --git a/include/arm11/config.h b/include/arm11/config.h
index ef928df..6d5a3cd 100644
--- a/include/arm11/config.h
+++ b/include/arm11/config.h
@@ -46,6 +46,7 @@ typedef struct
 	float lcdGamma;
 	float contrast;
 	float brightness;
+	u8 colorProfile;    // 0 = none, 1 = GBA, 2 = DS phat, 3 = DS phat white.
 
 	// [audio]
 	u8 audioOut;        // 0 = auto, 1 = speakers, 2 = headphones.
diff --git a/include/arm11/fast_frame_convert.h b/include/arm11/fast_frame_convert.h
new file mode 100644
index 0000000..c3d6b99
--- /dev/null
+++ b/include/arm11/fast_frame_convert.h
@@ -0,0 +1,33 @@
+#pragma once
+
+/*
+ *   This file is part of open_agb_firm
+ *   Copyright (C) 2024 profi200
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void convert160pFrameFast(void); // Converts 240x160 captures through the color lookup table. Loops forever (never returns).
+void convert240pFrameFast(void); // Converts 360x240 captures through the color lookup table. Loops forever (never returns).
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/include/arm11/gpu_cmd_lists.h b/include/arm11/gpu_cmd_lists.h
index 168c9ce..17b7090 100644
--- a/include/arm11/gpu_cmd_lists.h
+++ b/include/arm11/gpu_cmd_lists.h
@@ -27,6 +27,8 @@ extern "C"
 #endif
 
 #define GPU_RENDER_BUF_ADDR  (0x18180000)
+#define GPU_TEXTURE_ADDR     (0x18200000)
+#define GPU_TEXTURE2_ADDR    (0x18300000)
 #define GBA_INIT_LIST_SIZE   (1136)
 #define GBA_LIST2_SIZE       (448)
 
@@ -36,7 +38,7 @@ extern u8 gbaGpuList2[GBA_LIST2_SIZE];
 
 
 
-void patchGbaGpuCmdList(u8 scaleType);
+void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/libraries/libn3ds b/libraries/libn3ds
index f5e5e8e..6259b6b 160000
--- a/libraries/libn3ds
+++ b/libraries/libn3ds
@@ -1 +1 @@
-Subproject commit f5e5e8efb3a33959ee5489e2f13f1aecf62f1e04
+Subproject commit 6259b6b8ffe4bf82481dc93aeadbcc96738c2b9f
diff --git a/source/arm11/config.c b/source/arm11/config.c
index e8925ff..0b08bb3 100644
--- a/source/arm11/config.c
+++ b/source/arm11/config.c
@@ -36,7 +36,8 @@
                         "gbaGamma=2.2\n"          \
                         "lcdGamma=1.54\n"         \
                         "contrast=1.0\n"          \
-                        "brightness=0.0\n\n"      \
+                        "brightness=0.0\n"        \
+                        "colorProfile=none\n\n"   \
                         "[audio]\n"               \
                         "audioOut=0\n"            \
                         "volume=127\n\n"          \
@@ -61,6 +62,7 @@ OafConfig g_oafConfig =
 	1.54f, // lcdGamma
 	1.f,   // contrast
 	0.f,   // brightness
+	0,     // colorProfile
 
 	// [audio]
 	0,     // Automatic audio output.
@@ -154,6 +156,19 @@ static int cfgIniCallback(void* user, const char* section, const char* name, con
 			config->contrast = str2float(value);
 		else if(strcmp(name, "brightness") == 0)
 			config->brightness = str2float(value);
+		else if(strcmp(name, "colorProfile") == 0)
+		{
+			if(strcmp(value, "none") == 0)
+				config->colorProfile = 0;
+			else if(strcmp(value, "gba") == 0)
+				config->colorProfile = 1;
+			else if(strcmp(value, "nds") == 0)
+				config->colorProfile = 2;
+			else if(strcmp(value, "nds_white") == 0)
+				config->colorProfile = 3;
+			//else if(strcmp(value, "custom") == 0) // TODO: Implement user provided profile.
+			//	config->colorProfile = 4;
+		}
 	}
 	else if(strcmp(section, "audio") == 0)
 	{
diff --git a/source/arm11/fast_frame_convert.s b/source/arm11/fast_frame_convert.s
new file mode 100644
index 0000000..ea4b5cc
--- /dev/null
+++ b/source/arm11/fast_frame_convert.s
@@ -0,0 +1,312 @@
+@ This file is part of open_agb_firm
+@ Copyright (C) 2024 profi200
+@
+@ This program is free software: you can redistribute it and/or modify
+@ it under the terms of the GNU General Public License as published by
+@ the Free Software Foundation, either version 3 of the License, or
+@ (at your option) any later version.
+@
+@ This program is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+@ GNU General Public License for more details.
+@
+@ You should have received a copy of the GNU General Public License
+@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "asm_macros.h"
+#include "mem_map.h"
+
+.syntax unified
+.cpu mpcore
+.fpu vfpv2
+
+
+
+@ Whole frame converter.
+/*BEGIN_ASM_FUNC convertFrameFast
+	@ Load frame, output and lookup table pointers.
+	@ Our frame is in a 512x512 texture. Same for the output.
+	@ The table is a 15 to 32-bit 3D lookup table with color correction pre-applied.
+	ldr  r0, =0x18200000                @ r0 = 0x18200000;
+	ldr  r1, =0x18300000                @ r1 = 0x18300000;
+	ldr  r2, =0x1FF00000                @ r2 = 0x1FF00000;
+
+	@ Prefetch first cache line, save registers, load color mask and load 8 line counter.
+	pld  [r0]                           @ Prefetch from r0.
+	stmfd sp!, {r4-r11, lr}             @ Save registers.
+	ldrh r12, =0x7FFF                   @ r12 = 0x7FFF;
+	mov  r11, #30                       @ r11 = 30;
+
+	@ Convert 8 lines each round until we have a whole frame.
+	convertFrameFast_8l_lp:
+		@ Load size of 8 lines in bytes.
+		mov  r3, #0x1680                @ r3 = 0x1680;
+
+		@ Convert 8 pixels each round until we have 8 lines.
+		convertFrameFast_8p_lp:
+			@ Load 8 pixels from frame.
+			ldmia  r0!, {r8-r10, lr}    @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+			@ Decrement size and extract first 2 pixels.
+			subs r3,  r3, #16           @ r3 -= 16;              // Updates flags.
+			and  r4, r12,  r8, lsr #1   @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+			lsr  r5,  r8, #17           @ r5 = r8>>17;
+
+			@ Look up pixel 1 and extract pixel 3.
+			ldr  r4, [r2,  r4, lsl #2]  @ r4 = r2[r4];           // u32.
+			and  r6, r12,  r9, lsr #1   @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+			@ Look up pixel 2 and extract pixel 4.
+			ldr  r5, [r2,  r5, lsl #2]  @ r5 = r2[r5]; // u32.
+			lsr  r7,  r9, #17           @ r7 = r9>>17;
+
+			@ Look up pixel 3 and extract pixel 5.
+			ldr  r6, [r2,  r6, lsl #2]  @ r6 = r2[r6];            // u32.
+			and  r8, r12, r10, lsr #1   @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+			@ Look up pixel 4 and extract pixel 6.
+			ldr  r7, [r2,  r7, lsl #2]  @ r7 = r2[r7];  // u32.
+			lsr  r9, r10, #17           @ r9 = r10>>17;
+
+			@ Look up pixel 5 and extract pixel 7.
+			ldr  r8, [r2,  r8, lsl #2]  @ r8 = r2[r8];            // u32.
+			and r10, r12,  lr, lsr #1   @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+			@ Look up pixel 6 and extract pixel 8.
+			ldr  r9, [r2,  r9, lsl #2]  @ r9 = r2[r9]; // u32.
+			lsr  lr,  lr, #17           @ lr = lr>>17;
+
+			@ Look up pixel 7 and 8.
+			ldr r10, [r2, r10, lsl #2]  @ r10 = r2[r10]; // u32.
+			ldr  lr, [r2,  lr, lsl #2]  @ lr = r2[lr];   // u32.
+
+			@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+			pld [r0, #32]               @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+			stmia  r1!, {r4-r10, lr}    @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+			bne convertFrameFast_8p_lp  @ if(r3 != 0) goto convertFrameFast_8p_lp;
+
+		@ Decrement 8 line counter, skip texture padding and jump back if we are not done yet.
+		subs r11, r11, #1               @ r11--;        // Updates flags.
+		add   r0,  r0, #0x980           @ r0 += 0x980;
+		add   r1,  r1, #0x1300          @ r1 += 0x1300;
+		bne convertFrameFast_8l_lp      @ if(r11 != 0) goto convertFrameFast_8l_lp;
+
+	ldmfd sp!, {r4-r11, pc}             @ Restore registers and return.
+END_ASM_FUNC*/
+
+@ Converts a 160p frame while it's being DMAd to memory. Loops forever (never returns) and clobbers r0-r12 and lr without saving them.
+BEGIN_ASM_FUNC convert160pFrameFast
+	@ Enable top LCD LgyCap IRQs.
+	mov  r0, #77                                   @ r0 = 77; // id     IRQ_LGYCAP_TOP.
+	mov  r1, #0                                    @ r1 = 0;  // prio   0 (highest).
+	mov  r2, #0                                    @ r2 = 0;  // target 0 (this CPU).
+	mov  r3, #0                                    @ r3 = 0;  // isr    NULL.
+	blx IRQ_registerIsr                            @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
+
+	@ We will be using IRQs without our IRQ handler to minimize latency.
+	cpsid i                                        @ __disableIrq();
+
+	@ Load lookup table address and color mask.
+	ldr   r2, =0x1FF00000                          @ r2 = 0x1FF00000;
+	ldrh r12, =0x7FFF                              @ r12 = 0x7FFF;
+
+	convert160pFrameFast_frame_lp:
+		@ Load input and output addresses.
+		ldr  r0, =0x18200000                       @ r0 = 0x18200000;    // u32.
+		@ldr  r1, =0x18300000                       @ r1 = 0x18300000;    // u32.
+		add  r1,  r0, #0x100000                    @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
+
+		@ Convert 8 lines each round until we have a whole frame.
+		convert160pFrameFast_8l_lp:
+			ldr  r4, =0x10111008                   @ r4 = &REG_LGYCAP1_STAT; // u32. Reloaded every round because the pixel loop overwrites r4.
+			ldr  r5, =MPCORE_PRIV_BASE             @ r5 = MPCORE_PRIV_BASE;  // u32. Reloaded every round because the pixel loop overwrites r5.
+
+			convert160pFrameFast_wait_irq:
+				@ Wait for LgyCap IRQs.
+				wfi                                @ __waitForInterrupt();
+
+				@ Acknowledge IRQ and extract line number.
+				ldr r11, [r4]                      @ r11 = REG_LGYCAP_STAT; // u32.
+				ldr  r7, [r5, #0x10C]              @ r7 = REG_GICC_INTACK;  // u32.
+				str r11, [r4]                      @ REG_LGYCAP_STAT = r11; // u32.
+				lsrs r11, r11, #16                 @ r11 >>= 16;            // Updates flags.
+				str  r7, [r5, #0x110]              @ REG_GICC_EOI = r7;     // u32.
+
+				@ Ignore DREQ IRQ for line 0.
+				beq convert160pFrameFast_wait_irq      @ if(r11 == 0) goto convert160pFrameFast_wait_irq; // r11 was already shifted down by the lsrs above.
+
+			convert160pFrameFast_skip_irq_wait:
+			@ Load size of 8 lines in bytes.
+			mov  r3, #0xF00                        @ r3 = 0xF00; // 8 lines * 240 pixels * 2 bytes.
+
+			@ Convert 8 pixels each round until we have 8 lines.
+			convert160pFrameFast_8p_lp:
+				@ Load 8 pixels from frame.
+				ldmia  r0!, {r8-r10, lr}           @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+				@ Decrement size and extract first 2 pixels.
+				subs r3,  r3, #16                  @ r3 -= 16;              // Updates flags.
+				and  r4, r12,  r8, lsr #1          @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+				lsr  r5,  r8, #17                  @ r5 = r8>>17;
+
+				@ Look up pixel 1 and extract pixel 3.
+				ldr  r4, [r2,  r4, lsl #2]         @ r4 = r2[r4];           // u32.
+				and  r6, r12,  r9, lsr #1          @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 2 and extract pixel 4.
+				ldr  r5, [r2,  r5, lsl #2]         @ r5 = r2[r5]; // u32.
+				lsr  r7,  r9, #17                  @ r7 = r9>>17;
+
+				@ Look up pixel 3 and extract pixel 5.
+				ldr  r6, [r2,  r6, lsl #2]         @ r6 = r2[r6];            // u32.
+				and  r8, r12, r10, lsr #1          @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 4 and extract pixel 6.
+				ldr  r7, [r2,  r7, lsl #2]         @ r7 = r2[r7];  // u32.
+				lsr  r9, r10, #17                  @ r9 = r10>>17;
+
+				@ Look up pixel 5 and extract pixel 7.
+				ldr  r8, [r2,  r8, lsl #2]         @ r8 = r2[r8];            // u32.
+				and r10, r12,  lr, lsr #1          @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 6 and extract pixel 8.
+				ldr  r9, [r2,  r9, lsl #2]         @ r9 = r2[r9]; // u32.
+				lsr  lr,  lr, #17                  @ lr = lr>>17;
+
+				@ Look up pixel 7 and 8.
+				ldr r10, [r2, r10, lsl #2]         @ r10 = r2[r10]; // u32.
+				ldr  lr, [r2,  lr, lsl #2]         @ lr = r2[lr];   // u32.
+
+				@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+				pld [r0, #32]                      @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+				stmia  r1!, {r4-r10, lr}           @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+				bne convert160pFrameFast_8p_lp     @ if(r3 != 0) goto convert160pFrameFast_8p_lp; // Flags are still those of the subs above.
+
+			@ Test if the line number is 152 (the last 8 lines are then converted without waiting for another IRQ), skip texture padding and jump back if we are not done yet.
+			cmp r11, #152                          @ r11 - 152; // Updates flags.
+			add  r0,  r0, #0x1100                  @ r0 += 0x1100;
+			add  r1,  r1, #0x2200                  @ r1 += 0x2200;
+			moveq r11, #160                        @ if(r11 == 152) r11 = 160;
+			beq convert160pFrameFast_skip_irq_wait @ if(r11 == 152) goto convert160pFrameFast_skip_irq_wait;
+			bls convert160pFrameFast_8l_lp         @ if(r11 < 152) goto convert160pFrameFast_8l_lp; // The equal case was already taken by the beq above.
+
+		@ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
+		@ Note: r3 has been decremented down to 0 previously and so it's safe to use.
+		mcr p15, 0, r3, c7, c14, 0                 @ Clean and Invalidate Entire Data Cache.
+		ldr  r4, =MPCORE_PRIV_BASE                 @ r4 = MPCORE_PRIV_BASE;  // u32.
+		mov  r5, #0x10000                          @ r5 = 0x10000; // Presumably the CPU target list bit for core 0 - TODO confirm.
+		orr  r5,  r5, #0xF                         @ r5 |= 0xF;    // Presumably interrupt ID 15 (IPI15) - TODO confirm.
+		add  r4,  r4, #0x1F00                      @ r4 += 0x1F00; // REG_GICD_SOFTINT.
+		mcr p15, 0, r3, c7, c10, 4                 @ Data Synchronization Barrier.
+		str  r5, [r4]                              @ *r4 = r5; // u32.
+		b convert160pFrameFast_frame_lp            @ goto convert160pFrameFast_frame_lp;
+END_ASM_FUNC
+
+@ Converts a 240p frame while it's being DMAd to memory. Loops forever (never returns) and clobbers r0-r12 and lr without saving them.
+BEGIN_ASM_FUNC convert240pFrameFast
+	@ Enable top LCD LgyCap IRQs.
+	mov  r0, #77                                   @ r0 = 77; // id     IRQ_LGYCAP_TOP.
+	mov  r1, #0                                    @ r1 = 0;  // prio   0 (highest).
+	mov  r2, #0                                    @ r2 = 0;  // target 0 (this CPU).
+	mov  r3, #0                                    @ r3 = 0;  // isr    NULL.
+	blx IRQ_registerIsr                            @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
+
+	@ We will be using IRQs without our IRQ handler to minimize latency.
+	cpsid i                                        @ __disableIrq();
+
+	@ Load lookup table address and color mask.
+	ldr   r2, =0x1FF00000                          @ r2 = 0x1FF00000;
+	ldrh r12, =0x7FFF                              @ r12 = 0x7FFF;
+
+	convert240pFrameFast_frame_lp:
+		@ Load input and output addresses.
+		ldr  r0, =0x18200000                       @ r0 = 0x18200000;    // u32.
+		@ldr  r1, =0x18300000                       @ r1 = 0x18300000;    // u32.
+		add  r1,  r0, #0x100000                    @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
+
+		@ Convert 8 lines each round until we have a whole frame.
+		convert240pFrameFast_8l_lp:
+			ldr  r4, =0x10111008                   @ r4 = &REG_LGYCAP1_STAT; // u32. Reloaded every round because the pixel loop overwrites r4.
+			ldr  r5, =MPCORE_PRIV_BASE             @ r5 = MPCORE_PRIV_BASE;  // u32. Reloaded every round because the pixel loop overwrites r5.
+
+			convert240pFrameFast_wait_irq:
+				@ Wait for LgyCap IRQs.
+				wfi                                @ __waitForInterrupt();
+
+				@ Acknowledge IRQ and extract line number.
+				ldr r11, [r4]                      @ r11 = REG_LGYCAP_STAT; // u32.
+				ldr  r7, [r5, #0x10C]              @ r7 = REG_GICC_INTACK;  // u32.
+				str r11, [r4]                      @ REG_LGYCAP_STAT = r11; // u32.
+				lsrs r11, r11, #16                 @ r11 >>= 16;            // Updates flags.
+				str  r7, [r5, #0x110]              @ REG_GICC_EOI = r7;     // u32.
+
+				@ Ignore DREQ IRQ for line 0.
+				beq convert240pFrameFast_wait_irq      @ if(r11 == 0) goto convert240pFrameFast_wait_irq; // r11 was already shifted down by the lsrs above.
+
+			convert240pFrameFast_skip_irq_wait:
+			@ Load size of 8 lines in bytes.
+			mov  r3, #0x1680                       @ r3 = 0x1680; // 8 lines * 360 pixels * 2 bytes.
+
+			@ Convert 8 pixels each round until we have 8 lines.
+			convert240pFrameFast_8p_lp:
+				@ Load 8 pixels from frame.
+				ldmia  r0!, {r8-r10, lr}           @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+				@ Decrement size and extract first 2 pixels.
+				subs r3,  r3, #16                  @ r3 -= 16;              // Updates flags.
+				and  r4, r12,  r8, lsr #1          @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+				lsr  r5,  r8, #17                  @ r5 = r8>>17;
+
+				@ Look up pixel 1 and extract pixel 3.
+				ldr  r4, [r2,  r4, lsl #2]         @ r4 = r2[r4];           // u32.
+				and  r6, r12,  r9, lsr #1          @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 2 and extract pixel 4.
+				ldr  r5, [r2,  r5, lsl #2]         @ r5 = r2[r5]; // u32.
+				lsr  r7,  r9, #17                  @ r7 = r9>>17;
+
+				@ Look up pixel 3 and extract pixel 5.
+				ldr  r6, [r2,  r6, lsl #2]         @ r6 = r2[r6];            // u32.
+				and  r8, r12, r10, lsr #1          @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 4 and extract pixel 6.
+				ldr  r7, [r2,  r7, lsl #2]         @ r7 = r2[r7];  // u32.
+				lsr  r9, r10, #17                  @ r9 = r10>>17;
+
+				@ Look up pixel 5 and extract pixel 7.
+				ldr  r8, [r2,  r8, lsl #2]         @ r8 = r2[r8];            // u32.
+				and r10, r12,  lr, lsr #1          @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+				@ Look up pixel 6 and extract pixel 8.
+				ldr  r9, [r2,  r9, lsl #2]         @ r9 = r2[r9]; // u32.
+				lsr  lr,  lr, #17                  @ lr = lr>>17;
+
+				@ Look up pixel 7 and 8.
+				ldr r10, [r2, r10, lsl #2]         @ r10 = r2[r10]; // u32.
+				ldr  lr, [r2,  lr, lsl #2]         @ lr = r2[lr];   // u32.
+
+				@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+				pld [r0, #32]                      @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+				stmia  r1!, {r4-r10, lr}           @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+				bne convert240pFrameFast_8p_lp     @ if(r3 != 0) goto convert240pFrameFast_8p_lp; // Flags are still those of the subs above.
+
+			@ Test if the line number is 232 (the last 8 lines are then converted without waiting for another IRQ), skip texture padding and jump back if we are not done yet.
+			cmp r11, #232                          @ r11 - 232; // Updates flags.
+			add  r0,  r0, #0x980                   @ r0 += 0x980;
+			add  r1,  r1, #0x1300                  @ r1 += 0x1300;
+			moveq r11, #240                        @ if(r11 == 232) r11 = 240;
+			beq convert240pFrameFast_skip_irq_wait @ if(r11 == 232) goto convert240pFrameFast_skip_irq_wait;
+			bls convert240pFrameFast_8l_lp         @ if(r11 < 232) goto convert240pFrameFast_8l_lp; // The equal case was already taken by the beq above.
+
+		@ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
+		@ Note: r3 has been decremented down to 0 previously and so it's safe to use.
+		mcr p15, 0, r3, c7, c14, 0                 @ Clean and Invalidate Entire Data Cache.
+		ldr  r4, =MPCORE_PRIV_BASE                 @ r4 = MPCORE_PRIV_BASE;  // u32.
+		mov  r5, #0x10000                          @ r5 = 0x10000; // Presumably the CPU target list bit for core 0 - TODO confirm.
+		orr  r5,  r5, #0xF                         @ r5 |= 0xF;    // Presumably interrupt ID 15 (IPI15) - TODO confirm.
+		add  r4,  r4, #0x1F00                      @ r4 += 0x1F00; // REG_GICD_SOFTINT.
+		mcr p15, 0, r3, c7, c10, 4                 @ Data Synchronization Barrier.
+		str  r5, [r4]                              @ *r4 = r5; // u32.
+		b convert240pFrameFast_frame_lp            @ goto convert240pFrameFast_frame_lp;
+END_ASM_FUNC
\ No newline at end of file
diff --git a/source/arm11/gpu_cmd_lists.c b/source/arm11/gpu_cmd_lists.c
index 615febb..603077b 100644
--- a/source/arm11/gpu_cmd_lists.c
+++ b/source/arm11/gpu_cmd_lists.c
@@ -167,8 +167,16 @@ alignas(16) u8 gbaGpuList2[GBA_LIST2_SIZE] =
 
 
 
-void patchGbaGpuCmdList(u8 scaleType)
+void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture)
 {
+	if(useSecondTexture)
+	{
+		u32 tmp = GPU_TEXTURE2_ADDR>>3;
+		memcpy(&gbaGpuInitList[580], &tmp, 4);
+		tmp = 0;
+		memcpy(&gbaGpuInitList[584], &tmp, 4);
+	}
+
 	if(scaleType == 0)
 	{
 		u32 tmp = 0x4440;
@@ -215,7 +223,7 @@ void patchGbaGpuCmdList(u8 scaleType)
 		memcpy(&gbaGpuList2[316], &tmp, 4);
 		memcpy(&gbaGpuList2[380], &tmp, 4);
 	}
-	else return; // Nothing to do.
+	// else nothing to do.
 
 	flushDCacheRange(gbaGpuInitList, sizeof(gbaGpuInitList));
 	flushDCacheRange(gbaGpuList2, sizeof(gbaGpuList2));
diff --git a/source/arm11/oaf_video.c b/source/arm11/oaf_video.c
index d3446f3..b03a162 100644
--- a/source/arm11/oaf_video.c
+++ b/source/arm11/oaf_video.c
@@ -21,6 +21,7 @@
 #include "types.h"
 #include "arm11/config.h"
 #include "arm11/drivers/gx.h"
+#include "drivers/cache.h"
 #include "util.h"
 #include "oaf_error_codes.h"
 #include "arm11/drivers/lgycap.h"
@@ -31,8 +32,17 @@
 #include "fsutil.h"
 #include "kernel.h"
 #include "kevent.h"
-#include "arm11/gpu_cmd_lists.h"
 #include "arm11/drivers/hid.h"
+#include "arm11/drivers/interrupt.h"
+#include "arm11/gpu_cmd_lists.h"
+#include "system.h"
+#include "arm11/fast_frame_convert.h"
+
+
+#define COLOR_LUT_ADDR (0x1FF00000u)
+
+
+static KHandle g_convFinishedEvent = 0;
 
 
 
@@ -58,6 +68,108 @@ static void adjustGammaTableForGba(void)
 	}
 }
 
+typedef struct // Parameters of one color correction profile as consumed by makeColorLut().
+{
+	float targetGamma;  // Exponent applied to each normalized input channel ("convert to linear gamma").
+	float lum;          // Factor each linearized channel is multiplied with before the matrix.
+	float  r, gr, br;   // Matrix row producing the new red:   r*R  + gr*G + br*B.
+	float rg,  g, bg;   // Matrix row producing the new green: rg*R + g*G  + bg*B.
+	float rb, gb,  b;   // Matrix row producing the new blue:  rb*R + gb*G + b*B.
+	float displayGamma; // Exponent applied to each corrected channel ("convert to display gamma").
+} ColorProfile;
+
+static const ColorProfile g_colorProfiles[3] = // Entry n belongs to config colorProfile n + 1 (0 = none has no entry).
+{
+	{ // libretro GBA color (sRGB). Credits: hunterk and Pokefan531.
+		2.f + 0.5f,               // targetGamma
+		0.93f,                    // lum
+		0.8f,   0.275f, -0.075f,  //  r, gr, br
+		0.135f, 0.64f,   0.225f,  // rg,  g, bg
+		0.195f, 0.155f,  0.65f,   // rb, gb,  b
+		1.f / 2.f                 // displayGamma
+	},
+	{ // libretro DS phat (sRGB). Credits: hunterk and Pokefan531.
+		2.f,                        // targetGamma
+		1.f,                        // lum
+		0.705f,  0.235f,  -0.075f,  //  r, gr, br
+		0.09f,   0.585f,   0.24f,   // rg,  g, bg
+		0.1075f, 0.1725f,  0.72f,   // rb, gb,  b
+		1.f / 2.f                   // displayGamma
+	},
+	{ // libretro DS phat white (sRGB). Credits: hunterk and Pokefan531.
+		2.f,                        // targetGamma
+		0.915f,                     // lum
+		0.815f,  0.275f,  -0.09f,   //  r, gr, br
+		0.1f,    0.64f,    0.26f,   // rg,  g, bg
+		0.1075f, 0.1725f,  0.72f,   // rb, gb,  b
+		1.f / 2.f                   // displayGamma
+	}
+};
+
+ALWAYS_INLINE float clamp_float(const float x, const float min, const float max) // Returns min if x < min, else max if x > max, else x.
+{
+	return (x < min ? min : (x > max ? max : x)); // The lower bound is tested first.
+}
+
+static void makeColorLut(const ColorProfile *const p) // Fills the 32768 entry 15-bit color -> ABGR8 table at COLOR_LUT_ADDR from profile p.
+{
+	u32 *colorLut = (u32*)COLOR_LUT_ADDR; // Write cursor. Advances by one u32 per table entry.
+	for(u32 i = 0; i < 32768; i++)
+	{
+		// Convert to 8-bit and normalize. Index layout: blue in bits 0-4, green in bits 5-9, red in bits 10-14.
+		float b = (float)rgbFive2Eight(i & 31u) / 255;
+		float g = (float)rgbFive2Eight((i>>5) & 31u) / 255;
+		float r = (float)rgbFive2Eight(i>>10) / 255;
+
+		// Convert to linear gamma.
+		const float targetGamma = p->targetGamma;
+		b = powf(b, targetGamma);
+		g = powf(g, targetGamma);
+		r = powf(r, targetGamma);
+
+		// Apply luminance.
+		const float lum = p->lum;
+		b = clamp_float(b * lum, 0.f, 1.f);
+		g = clamp_float(g * lum, 0.f, 1.f);
+		r = clamp_float(r * lum, 0.f, 1.f);
+
+		/*
+		 *               Input
+		 *                [r]
+		 *                [g]
+		 *                [b]
+		 *
+		 * Correction    Output
+		 * [ r][gr][br]   [r]
+		 * [rg][ g][bg]   [g]
+		 * [rb][gb][ b]   [b]
+		*/
+		// Assuming no alpha channel in original calculation.
+		float newB = p->rb * r + p->gb * g + p->b * b;
+		float newG = p->rg * r + p->g * g + p->bg * b;
+		float newR = p->r * r + p->gr * g + p->br * b;
+
+		// Negative matrix coefficients can push a channel below zero. Clamp before the fractional powf() below.
+		newB = (newB < 0.f ? 0.f : newB);
+		newG = (newG < 0.f ? 0.f : newG);
+		newR = (newR < 0.f ? 0.f : newR);
+
+		// Convert to display gamma.
+		const float displayGamma = p->displayGamma;
+		newB = powf(newB, displayGamma);
+		newG = powf(newG, displayGamma);
+		newR = powf(newR, displayGamma);
+
+		// Denormalize, clamp, convert to ABGR8 and write lut. Cast to u32 before shifting: shifting a signed 255 left by 24 overflows int.
+		u32 tmp = 0xFF; // Alpha.
+		tmp |= (u32)clamp_s32(lroundf(newB * 255), 0, 255)<<8;
+		tmp |= (u32)clamp_s32(lroundf(newG * 255), 0, 255)<<16;
+		tmp |= (u32)clamp_s32(lroundf(newR * 255), 0, 255)<<24;
+		*colorLut++ = tmp;
+	}
+
+	flushDCacheRange((void*)COLOR_LUT_ADDR, 1024u * 128); // 32768 entries * 4 bytes = 128 KiB.
+}
+
 static Result dumpFrameTex(void)
 {
 	// Stop LgyCap before dumping the frame to prevent glitches.
@@ -110,7 +222,7 @@ static Result dumpFrameTex(void)
 	// Note: This is a race with the currently displaying frame buffer
 	//       because we just swapped buffers in the gfx handler function.
 	u32 *const tmpBuf = GFX_getBuffer(GFX_LCD_TOP, GFX_SIDE_LEFT);
-	GX_displayTransfer((u32*)0x18200000, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim,
+	GX_displayTransfer((u32*)GPU_TEXTURE_ADDR, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim,
 	                   PPF_O_FMT(GX_A1BGR5) | PPF_I_FMT(GX_A1BGR5) | PPF_CROP_EN);
 	memcpy(tmpBuf, &bmpHeaders, sizeof(bmpHeaders));
 	GFX_waitForPPF();
@@ -131,6 +243,11 @@ static Result dumpFrameTex(void)
 	return res;
 }
 
+static void convFinishedHandler(UNUSED const u32 intSource) // ISR registered for IRQ_IPI15 in OAF_videoInit(). The interrupt source is not used.
+{
+	signalEvent(g_convFinishedEvent, false); // Signals the event gbaGfxHandler() is given when color correction is enabled.
+}
+
 static void gbaGfxHandler(void *args)
 {
 	const KHandle event = (KHandle)args;
@@ -181,7 +298,7 @@ static void gbaGfxHandler(void *args)
 	taskExit();
 }
 
-static KHandle setupFrameCapture(const u8 scaler)
+static KHandle setupFrameCapture(const u8 scaler, const bool colorCorrectionEnabled)
 {
 	const bool is240x160 = scaler < 2;
 	static s16 matrix[12 * 8] =
@@ -213,7 +330,7 @@ static KHandle setupFrameCapture(const u8 scaler)
 	gbaCfg.cnt   = LGYCAP_SWIZZLE | LGYCAP_ROT_NONE | LGYCAP_FMT_A1BGR5 | (is240x160 ? 0 : LGYCAP_HSCALE_EN | LGYCAP_VSCALE_EN);
 	gbaCfg.w     = (is240x160 ? 240 : 360);
 	gbaCfg.h     = (is240x160 ? 160 : 240);
-	gbaCfg.irq   = 0;
+	gbaCfg.irq   = (colorCorrectionEnabled ? LGYCAP_IRQ_DMA_REQ : 0); // We need the DMA request IRQ for core 1.
 	gbaCfg.vLen  = 6;
 	gbaCfg.vPatt = 0b00011011;
 	memcpy(gbaCfg.vMatrix, matrix, 6 * 8 * 2);
@@ -234,13 +351,41 @@ KHandle OAF_videoInit(void)
 		GFX_powerOffBacklight(GFX_BL_BOT);
 #endif
 
-	// Initialize frame capture and frame handler.
+	// Initialize frame capture.
 	const u8 scaler = g_oafConfig.scaler;
-	const KHandle frameReadyEvent = setupFrameCapture(scaler);
-	patchGbaGpuCmdList(scaler);
-	createTask(0x800, 3, gbaGfxHandler, (void*)frameReadyEvent);
+	const u8 colorProfile = g_oafConfig.colorProfile;
+	KHandle frameReadyEvent;
+	KHandle convFinishedEvent;
+	if(colorProfile > 0)
+	{
+		// Start capture hardware and create event handles.
+		frameReadyEvent = setupFrameCapture(scaler, true);
+		convFinishedEvent = createEvent(false);
+		g_convFinishedEvent = convFinishedEvent;
 
-	// Adjust gamma table and setup button overrides.
+		// Patch GPU cmd list with texture location 2.
+		patchGbaGpuCmdList(scaler, true);
+
+		// Compute the (linear) 3D lookup table.
+		makeColorLut(&g_colorProfiles[colorProfile - 1]);
+
+		// Register IPI handler and start core 1 for color conversion.
+		IRQ_registerIsr(IRQ_IPI15, 13, 0, convFinishedHandler);
+		__systemBootCore1((scaler < 2 ? convert160pFrameFast : convert240pFrameFast));
+	}
+	else
+	{
+		// Start capture hardware.
+		frameReadyEvent = setupFrameCapture(scaler, false);
+
+		// Patch GPU cmd list with texture location 1.
+		patchGbaGpuCmdList(scaler, false);
+	}
+
+	// Start frame handler.
+	createTask(0x800, 3, gbaGfxHandler, (void*)(colorProfile > 0 ? convFinishedEvent : frameReadyEvent));
+
+	// Adjust hardware gamma table.
 	adjustGammaTableForGba();
 
 	// Load border if any exists.
@@ -265,4 +410,9 @@ void OAF_videoExit(void)
 	// frameReadyEvent deleted by this function.
 	// gbaGfxHandler() will automatically terminate.
 	LGYCAP_deinit(LGYCAP_DEV_TOP);
+	if(g_convFinishedEvent != 0)
+	{
+		deleteEvent(g_convFinishedEvent);
+		g_convFinishedEvent = 0;
+	}
 }
\ No newline at end of file
diff --git a/source/arm11/open_agb_firm.c b/source/arm11/open_agb_firm.c
index af92636..5abaad7 100644
--- a/source/arm11/open_agb_firm.c
+++ b/source/arm11/open_agb_firm.c
@@ -363,6 +363,7 @@ void oafUpdate(void)
 	CODEC_runHeadphoneDetection();
 	updateBacklight();
 	waitForEvent(g_frameReadyEvent);
+	clearEvent(g_frameReadyEvent);
 }
 
 void oafFinish(void)