diff --git a/README.md b/README.md
index f3212cb..2a448e6 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,9 @@ Video-related settings.
`float brightness` - Screen lift
* Default: `0.0`
+`string colorProfile` - Color correction profile. `none`, `gba`, `nds` or `nds_white`.
+* Default: `none`
+
### Audio
Audio settings.
@@ -255,8 +258,9 @@ You may use this under the terms of the GNU General Public License GPL v3 or the
* **MAME**
* **No-Intro**
* **Wolfvak, Sono and all the other people in #GodMode9 on freenode/Discord**
-* **endrift, Extrems and all the other people in #mgba on freenode**
+* **endrift, Extrems and all the other people in #mgba on Libera.Chat**
* **Oleh Prypin (oprypin) for nightly.link**
+* **hunterk and Pokefan531 for their amazing libretro shaders**
* ...everyone who contributed to **3dbrew.org**
-Copyright (C) 2021 derrek, profi200, d0k3
+Copyright (C) 2024 derrek, profi200, d0k3
\ No newline at end of file
diff --git a/include/arm11/config.h b/include/arm11/config.h
index ef928df..6d5a3cd 100644
--- a/include/arm11/config.h
+++ b/include/arm11/config.h
@@ -46,6 +46,7 @@ typedef struct
float lcdGamma;
float contrast;
float brightness;
+ u8 colorProfile; // 0 = none, 1 = GBA, 2 = DS phat, 3 = DS phat white.
// [audio]
u8 audioOut; // 0 = auto, 1 = speakers, 2 = headphones.
diff --git a/include/arm11/fast_frame_convert.h b/include/arm11/fast_frame_convert.h
new file mode 100644
index 0000000..c3d6b99
--- /dev/null
+++ b/include/arm11/fast_frame_convert.h
@@ -0,0 +1,33 @@
+#pragma once
+
+/*
+ * This file is part of open_agb_firm
+ * Copyright (C) 2024 profi200
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+// Both functions are implemented in assembly (source/arm11/fast_frame_convert.s).
+// They loop forever and never return, so they must only be used as the entry
+// point of a dedicated core (OAF_videoInit() passes them to __systemBootCore1()).
+// Each one waits for top screen LgyCap IRQs and translates the captured 16-bit
+// frame at 0x18200000 into a 32-bit frame at 0x18300000 using the lookup
+// table at 0x1FF00000. After every finished frame a software interrupt is sent.
+
+// Converter for unscaled captures: 240 pixels per line, 160 lines per frame.
+void convert160pFrameFast(void);
+// Converter for scaled captures: 360 pixels per line, 240 lines per frame.
+void convert240pFrameFast(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/include/arm11/gpu_cmd_lists.h b/include/arm11/gpu_cmd_lists.h
index 168c9ce..17b7090 100644
--- a/include/arm11/gpu_cmd_lists.h
+++ b/include/arm11/gpu_cmd_lists.h
@@ -27,6 +27,8 @@ extern "C"
#endif
#define GPU_RENDER_BUF_ADDR (0x18180000)
+#define GPU_TEXTURE_ADDR (0x18200000)
+#define GPU_TEXTURE2_ADDR (0x18300000)
#define GBA_INIT_LIST_SIZE (1136)
#define GBA_LIST2_SIZE (448)
@@ -36,7 +38,7 @@ extern u8 gbaGpuList2[GBA_LIST2_SIZE];
-void patchGbaGpuCmdList(u8 scaleType);
+void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture);
#ifdef __cplusplus
} // extern "C"
diff --git a/libraries/libn3ds b/libraries/libn3ds
index f5e5e8e..6259b6b 160000
--- a/libraries/libn3ds
+++ b/libraries/libn3ds
@@ -1 +1 @@
-Subproject commit f5e5e8efb3a33959ee5489e2f13f1aecf62f1e04
+Subproject commit 6259b6b8ffe4bf82481dc93aeadbcc96738c2b9f
diff --git a/source/arm11/config.c b/source/arm11/config.c
index e8925ff..0b08bb3 100644
--- a/source/arm11/config.c
+++ b/source/arm11/config.c
@@ -36,7 +36,8 @@
"gbaGamma=2.2\n" \
"lcdGamma=1.54\n" \
"contrast=1.0\n" \
- "brightness=0.0\n\n" \
+ "brightness=0.0\n" \
+ "colorProfile=none\n\n" \
"[audio]\n" \
"audioOut=0\n" \
"volume=127\n\n" \
@@ -61,6 +62,7 @@ OafConfig g_oafConfig =
1.54f, // lcdGamma
1.f, // contrast
0.f, // brightness
+ 0, // colorProfile
// [audio]
0, // Automatic audio output.
@@ -154,6 +156,19 @@ static int cfgIniCallback(void* user, const char* section, const char* name, con
config->contrast = str2float(value);
else if(strcmp(name, "brightness") == 0)
config->brightness = str2float(value);
+ else if(strcmp(name, "colorProfile") == 0)
+ {
+ if(strcmp(value, "none") == 0)
+ config->colorProfile = 0;
+ else if(strcmp(value, "gba") == 0)
+ config->colorProfile = 1;
+ else if(strcmp(value, "nds") == 0)
+ config->colorProfile = 2;
+ else if(strcmp(value, "nds_white") == 0)
+ config->colorProfile = 3;
+ //else if(strcmp(value, "custom") == 0) // TODO: Implement user provided profile.
+ // config->colorProfile = 4;
+ }
}
else if(strcmp(section, "audio") == 0)
{
diff --git a/source/arm11/fast_frame_convert.s b/source/arm11/fast_frame_convert.s
new file mode 100644
index 0000000..ea4b5cc
--- /dev/null
+++ b/source/arm11/fast_frame_convert.s
@@ -0,0 +1,312 @@
+@ This file is part of open_agb_firm
+@ Copyright (C) 2024 profi200
+@
+@ This program is free software: you can redistribute it and/or modify
+@ it under the terms of the GNU General Public License as published by
+@ the Free Software Foundation, either version 3 of the License, or
+@ (at your option) any later version.
+@
+@ This program is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+@ GNU General Public License for more details.
+@
+@ You should have received a copy of the GNU General Public License
+@ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "asm_macros.h"
+#include "mem_map.h"
+
+.syntax unified
+.cpu mpcore
+.fpu vfpv2
+
+
+
+@ Whole frame converter.
+/*BEGIN_ASM_FUNC convertFrameFast
+ @ Load frame, output and lookup table pointers.
+ @ Our frame is in a 512x512 texture. Same for the output.
+ @ The table is a 15 to 32-bit 3D lookup table with color correction pre-applied.
+ ldr r0, =0x18200000 @ r0 = 0x18200000;
+ ldr r1, =0x18300000 @ r1 = 0x18300000;
+ ldr r2, =0x1FF00000 @ r2 = 0x1FF00000;
+
+ @ Prefetch first cache line, save registers, load color mask and load 8 line counter.
+ pld [r0] @ Prefetch from r0.
+ stmfd sp!, {r4-r11, lr} @ Save registers.
+ ldrh r12, =0x7FFF @ r12 = 0x7FFF;
+ mov r11, #30 @ r11 = 30;
+
+ @ Convert 8 lines each round until we have a whole frame.
+ convertFrameFast_8l_lp:
+ @ Load size of 8 lines in bytes.
+ mov r3, #0x1680 @ r3 = 0x1680;
+
+ @ Convert 8 pixels each round until we have 8 lines.
+ convertFrameFast_8p_lp:
+ @ Load 8 pixels from frame.
+ ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+ @ Decrement size and extract first 2 pixels.
+ subs r3, r3, #16 @ r3 -= 16; // Updates flags.
+ and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+ lsr r5, r8, #17 @ r5 = r8>>17;
+
+ @ Look up pixel 1 and extract pixel 3.
+ ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32.
+ and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 2 and extract pixel 4.
+ ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32.
+ lsr r7, r9, #17 @ r7 = r9>>17;
+
+ @ Look up pixel 3 and extract pixel 5.
+ ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32.
+ and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 4 and extract pixel 6.
+ ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32.
+ lsr r9, r10, #17 @ r9 = r10>>17;
+
+ @ Look up pixel 5 and extract pixel 7.
+ ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32.
+ and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 6 and extract pixel 8.
+ ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32.
+ lsr lr, lr, #17 @ lr = lr>>17;
+
+ @ Look up pixel 7 and 8.
+ ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32.
+ ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32.
+
+ @ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+ pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+ stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+ bne convertFrameFast_8p_lp @ if(r3 != 0) goto convertFrameFast_8p_lp;
+
+ @ Decrement 8 line counter, skip texture padding and jump back if we are not done yet.
+ subs r11, r11, #1 @ r11--; // Updates flags.
+ add r0, r0, #0x980 @ r0 += 0x980;
+ add r1, r1, #0x1300 @ r1 += 0x1300;
+ bne convertFrameFast_8l_lp @ if(r11 != 0) goto convertFrameFast_8l_lp;
+
+ ldmfd sp!, {r4-r11, pc} @ Restore registers and return.
+END_ASM_FUNC*/
+
+@ Converts a 160p frame while it's being DMAd to memory.
+@
+@ This function loops forever and never returns. Every time LgyCap signals new
+@ lines it translates 8 lines of 16-bit pixels at 0x18200000 into 32-bit pixels
+@ at 0x18300000 through the lookup table at 0x1FF00000. After a whole frame the
+@ data cache is flushed and a software interrupt is sent.
+@
+@ Register usage inside the loops:
+@ r0 = input pointer, r1 = output pointer, r2 = lookup table base,
+@ r3 = bytes left in the current 8 line block, r11 = line number from LgyCap,
+@ r12 = 0x7FFF index mask, r4-r10 and lr = pixel scratch.
+BEGIN_ASM_FUNC convert160pFrameFast
+ @ Enable top LCD LgyCap IRQs.
+ mov r0, #77 @ r0 = 77; // id IRQ_LGYCAP_TOP.
+ mov r1, #0 @ r1 = 0; // prio 0 (highest).
+ mov r2, #0 @ r2 = 0; // target 0 (this CPU).
+ mov r3, #0 @ r3 = 0; // isr NULL.
+ blx IRQ_registerIsr @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
+
+ @ We will be using IRQs without our IRQ handler to minimize latency.
+ @ With IRQs masked the interrupt only wakes the wfi below and is then
+ @ acknowledged by hand.
+ cpsid i @ __disableIrq();
+
+ @ Load lookup table address and color mask.
+ ldr r2, =0x1FF00000 @ r2 = 0x1FF00000;
+ ldrh r12, =0x7FFF @ r12 = 0x7FFF;
+
+ convert160pFrameFast_frame_lp:
+ @ Load input and output addresses.
+ ldr r0, =0x18200000 @ r0 = 0x18200000; // u32.
+ @ldr r1, =0x18300000 @ r1 = 0x18300000; // u32.
+ add r1, r0, #0x100000 @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
+
+ @ Convert 8 lines each round until we have a whole frame.
+ @ r4 and r5 are reused as pixel scratch registers further down, which is why
+ @ the register addresses are reloaded at the start of every round.
+ convert160pFrameFast_8l_lp:
+ ldr r4, =0x10111008 @ r4 = &REG_LGYCAP1_STAT; // u32.
+ ldr r5, =MPCORE_PRIV_BASE @ r5 = MPCORE_PRIV_BASE; // u32.
+
+ convert160pFrameFast_wait_irq:
+ @ Wait for LgyCap IRQs.
+ wfi @ __waitForInterrupt();
+
+ @ Acknowledge IRQ and extract line number.
+ ldr r11, [r4] @ r11 = REG_LGYCAP_STAT; // u32.
+ ldr r7, [r5, #0x10C] @ r7 = REG_GICC_INTACK; // u32.
+ str r11, [r4] @ REG_LGYCAP_STAT = r11; // u32.
+ lsrs r11, r11, #16 @ r11 >>= 16; // Updates flags.
+ str r7, [r5, #0x110] @ REG_GICC_EOI = r7; // u32.
+
+ @ Ignore DREQ IRQ for line 0.
+ beq convert160pFrameFast_wait_irq @ if(r11 == 0) goto convert160pFrameFast_wait_irq; // r11 has already been shifted by the lsrs above.
+
+ convert160pFrameFast_skip_irq_wait:
+ @ Load size of 8 lines in bytes.
+ @ 0xF00 = 240 pixels * 2 bytes * 8 lines.
+ mov r3, #0xF00 @ r3 = 0xF00;
+
+ @ Convert 8 pixels each round until we have 8 lines.
+ @ Every loaded word holds two 16-bit pixels. Shifting right by 1 (low pixel)
+ @ or by 17 (high pixel) drops the lowest bit of the pixel and leaves a 15-bit
+ @ table index. NOTE(review): the dropped bit is presumably the alpha bit of
+ @ the A1BGR5 capture format, confirm.
+ convert160pFrameFast_8p_lp:
+ @ Load 8 pixels from frame.
+ ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+ @ Decrement size and extract first 2 pixels.
+ subs r3, r3, #16 @ r3 -= 16; // Updates flags.
+ and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+ lsr r5, r8, #17 @ r5 = r8>>17;
+
+ @ Look up pixel 1 and extract pixel 3.
+ ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32.
+ and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 2 and extract pixel 4.
+ ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32.
+ lsr r7, r9, #17 @ r7 = r9>>17;
+
+ @ Look up pixel 3 and extract pixel 5.
+ ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32.
+ and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 4 and extract pixel 6.
+ ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32.
+ lsr r9, r10, #17 @ r9 = r10>>17;
+
+ @ Look up pixel 5 and extract pixel 7.
+ ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32.
+ and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 6 and extract pixel 8.
+ ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32.
+ lsr lr, lr, #17 @ lr = lr>>17;
+
+ @ Look up pixel 7 and 8.
+ ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32.
+ ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32.
+
+ @ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+ @ The flags tested by bne still come from the subs above. None of the
+ @ instructions in between update them.
+ pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+ stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+ bne convert160pFrameFast_8p_lp @ if(r3 != 0) goto convert160pFrameFast_8p_lp;
+
+ @ Test if 8 line counter is 152, skip texture padding and jump back if we are not done yet.
+ @ Padding per 8 lines assuming 512 pixels wide textures:
+ @ input 0x1100 = (512 - 240) * 2 bytes * 8, output 0x2200 = (512 - 240) * 4 bytes * 8.
+ @ When the reported line is 152 the last 8 lines are converted right away
+ @ without waiting for another IRQ. r11 is set to 160 so that the second pass
+ @ falls through to the flush below.
+ cmp r11, #152 @ r11 - 152; // Updates flags.
+ add r0, r0, #0x1100 @ r0 += 0x1100;
+ add r1, r1, #0x2200 @ r1 += 0x2200;
+ moveq r11, #160 @ if(r11 == 152) r11 = 160;
+ beq convert160pFrameFast_skip_irq_wait @ if(r11 == 152) goto convert160pFrameFast_skip_irq_wait;
+ bls convert160pFrameFast_8l_lp @ if(r11 <= 152) goto convert160pFrameFast_8l_lp; // The equal case has been taken above so this is r11 < 152.
+
+ @ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
+ @ Note: r3 has been decremented down to 0 previously and so it's safe to use.
+ @ NOTE(review): the value 0x1000F written to REG_GICD_SOFTINT is presumably
+ @ "target core 0, interrupt ID 15", confirm against the MPCore manual.
+ mcr p15, 0, r3, c7, c14, 0 @ Clean and Invalidate Entire Data Cache.
+ ldr r4, =MPCORE_PRIV_BASE @ r4 = MPCORE_PRIV_BASE; // u32.
+ mov r5, #0x10000 @ r5 = 0x10000;
+ orr r5, r5, #0xF @ r5 |= 0xF;
+ add r4, r4, #0x1F00 @ r4 += 0x1F00; // REG_GICD_SOFTINT.
+ mcr p15, 0, r3, c7, c10, 4 @ Data Synchronization Barrier.
+ str r5, [r4] @ *r4 = r5; // u32.
+ b convert160pFrameFast_frame_lp @ goto convert160pFrameFast_frame_lp;
+END_ASM_FUNC
+
+@ Converts the frame while it's being DMAd to memory.
+@
+@ 240 line variant for scaled captures (360 pixels per line). This function
+@ loops forever and never returns. Every time LgyCap signals new lines it
+@ translates 8 lines of 16-bit pixels at 0x18200000 into 32-bit pixels at
+@ 0x18300000 through the lookup table at 0x1FF00000. After a whole frame the
+@ data cache is flushed and a software interrupt is sent.
+@
+@ Register usage inside the loops:
+@ r0 = input pointer, r1 = output pointer, r2 = lookup table base,
+@ r3 = bytes left in the current 8 line block, r11 = line number from LgyCap,
+@ r12 = 0x7FFF index mask, r4-r10 and lr = pixel scratch.
+BEGIN_ASM_FUNC convert240pFrameFast
+ @ Enable top LCD LgyCap IRQs.
+ mov r0, #77 @ r0 = 77; // id IRQ_LGYCAP_TOP.
+ mov r1, #0 @ r1 = 0; // prio 0 (highest).
+ mov r2, #0 @ r2 = 0; // target 0 (this CPU).
+ mov r3, #0 @ r3 = 0; // isr NULL.
+ blx IRQ_registerIsr @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
+
+ @ We will be using IRQs without our IRQ handler to minimize latency.
+ @ With IRQs masked the interrupt only wakes the wfi below and is then
+ @ acknowledged by hand.
+ cpsid i @ __disableIrq();
+
+ @ Load lookup table address and color mask.
+ ldr r2, =0x1FF00000 @ r2 = 0x1FF00000;
+ ldrh r12, =0x7FFF @ r12 = 0x7FFF;
+
+ convert240pFrameFast_frame_lp:
+ @ Load input and output addresses.
+ ldr r0, =0x18200000 @ r0 = 0x18200000; // u32.
+ @ldr r1, =0x18300000 @ r1 = 0x18300000; // u32.
+ add r1, r0, #0x100000 @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
+
+ @ Convert 8 lines each round until we have a whole frame.
+ @ r4 and r5 are reused as pixel scratch registers further down, which is why
+ @ the register addresses are reloaded at the start of every round.
+ convert240pFrameFast_8l_lp:
+ ldr r4, =0x10111008 @ r4 = &REG_LGYCAP1_STAT; // u32.
+ ldr r5, =MPCORE_PRIV_BASE @ r5 = MPCORE_PRIV_BASE; // u32.
+
+ convert240pFrameFast_wait_irq:
+ @ Wait for LgyCap IRQs.
+ wfi @ __waitForInterrupt();
+
+ @ Acknowledge IRQ and extract line number.
+ ldr r11, [r4] @ r11 = REG_LGYCAP_STAT; // u32.
+ ldr r7, [r5, #0x10C] @ r7 = REG_GICC_INTACK; // u32.
+ str r11, [r4] @ REG_LGYCAP_STAT = r11; // u32.
+ lsrs r11, r11, #16 @ r11 >>= 16; // Updates flags.
+ str r7, [r5, #0x110] @ REG_GICC_EOI = r7; // u32.
+
+ @ Ignore DREQ IRQ for line 0.
+ beq convert240pFrameFast_wait_irq @ if(r11 == 0) goto convert240pFrameFast_wait_irq; // r11 has already been shifted by the lsrs above.
+
+ convert240pFrameFast_skip_irq_wait:
+ @ Load size of 8 lines in bytes.
+ @ 0x1680 = 360 pixels * 2 bytes * 8 lines.
+ mov r3, #0x1680 @ r3 = 0x1680;
+
+ @ Convert 8 pixels each round until we have 8 lines.
+ @ Every loaded word holds two 16-bit pixels. Shifting right by 1 (low pixel)
+ @ or by 17 (high pixel) drops the lowest bit of the pixel and leaves a 15-bit
+ @ table index. NOTE(review): the dropped bit is presumably the alpha bit of
+ @ the A1BGR5 capture format, confirm.
+ convert240pFrameFast_8p_lp:
+ @ Load 8 pixels from frame.
+ ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
+
+ @ Decrement size and extract first 2 pixels.
+ subs r3, r3, #16 @ r3 -= 16; // Updates flags.
+ and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
+ lsr r5, r8, #17 @ r5 = r8>>17;
+
+ @ Look up pixel 1 and extract pixel 3.
+ ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32.
+ and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 2 and extract pixel 4.
+ ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32.
+ lsr r7, r9, #17 @ r7 = r9>>17;
+
+ @ Look up pixel 3 and extract pixel 5.
+ ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32.
+ and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 4 and extract pixel 6.
+ ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32.
+ lsr r9, r10, #17 @ r9 = r10>>17;
+
+ @ Look up pixel 5 and extract pixel 7.
+ ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32.
+ and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
+
+ @ Look up pixel 6 and extract pixel 8.
+ ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32.
+ lsr lr, lr, #17 @ lr = lr>>17;
+
+ @ Look up pixel 7 and 8.
+ ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32.
+ ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32.
+
+ @ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
+ @ The flags tested by bne still come from the subs above. None of the
+ @ instructions in between update them.
+ pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
+ stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
+ bne convert240pFrameFast_8p_lp @ if(r3 != 0) goto convert240pFrameFast_8p_lp;
+
+ @ Test if 8 line counter is 232, skip texture padding and jump back if we are not done yet.
+ @ Padding per 8 lines assuming 512 pixels wide textures:
+ @ input 0x980 = (512 - 360) * 2 bytes * 8, output 0x1300 = (512 - 360) * 4 bytes * 8.
+ @ When the reported line is 232 the last 8 lines are converted right away
+ @ without waiting for another IRQ. r11 is set to 240 so that the second pass
+ @ falls through to the flush below.
+ cmp r11, #232 @ r11 - 232; // Updates flags.
+ add r0, r0, #0x980 @ r0 += 0x980;
+ add r1, r1, #0x1300 @ r1 += 0x1300;
+ moveq r11, #240 @ if(r11 == 232) r11 = 240;
+ beq convert240pFrameFast_skip_irq_wait @ if(r11 == 232) goto convert240pFrameFast_skip_irq_wait;
+ bls convert240pFrameFast_8l_lp @ if(r11 <= 232) goto convert240pFrameFast_8l_lp; // The equal case has been taken above so this is r11 < 232.
+
+ @ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
+ @ Note: r3 has been decremented down to 0 previously and so it's safe to use.
+ @ NOTE(review): the value 0x1000F written to REG_GICD_SOFTINT is presumably
+ @ "target core 0, interrupt ID 15", confirm against the MPCore manual.
+ mcr p15, 0, r3, c7, c14, 0 @ Clean and Invalidate Entire Data Cache.
+ ldr r4, =MPCORE_PRIV_BASE @ r4 = MPCORE_PRIV_BASE; // u32.
+ mov r5, #0x10000 @ r5 = 0x10000;
+ orr r5, r5, #0xF @ r5 |= 0xF;
+ add r4, r4, #0x1F00 @ r4 += 0x1F00; // REG_GICD_SOFTINT.
+ mcr p15, 0, r3, c7, c10, 4 @ Data Synchronization Barrier.
+ str r5, [r4] @ *r4 = r5; // u32.
+ b convert240pFrameFast_frame_lp @ goto convert240pFrameFast_frame_lp;
+END_ASM_FUNC
\ No newline at end of file
diff --git a/source/arm11/gpu_cmd_lists.c b/source/arm11/gpu_cmd_lists.c
index 615febb..603077b 100644
--- a/source/arm11/gpu_cmd_lists.c
+++ b/source/arm11/gpu_cmd_lists.c
@@ -167,8 +167,16 @@ alignas(16) u8 gbaGpuList2[GBA_LIST2_SIZE] =
-void patchGbaGpuCmdList(u8 scaleType)
+void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture)
{
+ if(useSecondTexture)
+ {
+ u32 tmp = GPU_TEXTURE2_ADDR>>3;
+ memcpy(&gbaGpuInitList[580], &tmp, 4);
+ tmp = 0;
+ memcpy(&gbaGpuInitList[584], &tmp, 4);
+ }
+
if(scaleType == 0)
{
u32 tmp = 0x4440;
@@ -215,7 +223,7 @@ void patchGbaGpuCmdList(u8 scaleType)
memcpy(&gbaGpuList2[316], &tmp, 4);
memcpy(&gbaGpuList2[380], &tmp, 4);
}
- else return; // Nothing to do.
+ // else nothing to do.
flushDCacheRange(gbaGpuInitList, sizeof(gbaGpuInitList));
flushDCacheRange(gbaGpuList2, sizeof(gbaGpuList2));
diff --git a/source/arm11/oaf_video.c b/source/arm11/oaf_video.c
index d3446f3..b03a162 100644
--- a/source/arm11/oaf_video.c
+++ b/source/arm11/oaf_video.c
@@ -21,6 +21,7 @@
#include "types.h"
#include "arm11/config.h"
#include "arm11/drivers/gx.h"
+#include "drivers/cache.h"
#include "util.h"
#include "oaf_error_codes.h"
#include "arm11/drivers/lgycap.h"
@@ -31,8 +32,17 @@
#include "fsutil.h"
#include "kernel.h"
#include "kevent.h"
-#include "arm11/gpu_cmd_lists.h"
#include "arm11/drivers/hid.h"
+#include "arm11/drivers/interrupt.h"
+#include "arm11/gpu_cmd_lists.h"
+#include "system.h"
+#include "arm11/fast_frame_convert.h"
+
+
+// Fixed address of the color correction lookup table. makeColorLut() writes
+// 32768 u32 entries here. The assembly converters in fast_frame_convert.s
+// load the same address as their table base.
+#define COLOR_LUT_ADDR (0x1FF00000u)
+
+
+// Event signaled by convFinishedHandler(). Created in OAF_videoInit() only when
+// a color profile is enabled and deleted again in OAF_videoExit(). 0 = unused.
+static KHandle g_convFinishedEvent = 0;
@@ -58,6 +68,108 @@ static void adjustGammaTableForGba(void)
}
}
+// Parameters of one color correction profile as consumed by makeColorLut().
+typedef struct
+{
+    float targetGamma;  // Exponent applied to the normalized input channels.
+    float lum;          // Factor the channels are multiplied with before the matrix.
+
+    // Coefficients for the corrected red channel (multiplied with r, g, b input).
+    float r;
+    float gr;
+    float br;
+
+    // Coefficients for the corrected green channel (multiplied with r, g, b input).
+    float rg;
+    float g;
+    float bg;
+
+    // Coefficients for the corrected blue channel (multiplied with r, g, b input).
+    float rb;
+    float gb;
+    float b;
+
+    float displayGamma; // Exponent applied to the corrected channels.
+} ColorProfile;
+
+// Built-in color profiles. OAF_videoInit() indexes this table with
+// (colorProfile - 1) because config value 0 means "no correction".
+static const ColorProfile g_colorProfiles[3] =
+{
+    // libretro GBA color (sRGB). Credits: hunterk and Pokefan531.
+    [0] =
+    {
+        .targetGamma  = 2.f + 0.5f,
+        .lum          = 0.93f,
+        .r            = 0.8f,
+        .gr           = 0.275f,
+        .br           = -0.075f,
+        .rg           = 0.135f,
+        .g            = 0.64f,
+        .bg           = 0.225f,
+        .rb           = 0.195f,
+        .gb           = 0.155f,
+        .b            = 0.65f,
+        .displayGamma = 1.f / 2.f
+    },
+    // libretro DS phat (sRGB). Credits: hunterk and Pokefan531.
+    [1] =
+    {
+        .targetGamma  = 2.f,
+        .lum          = 1.f,
+        .r            = 0.705f,
+        .gr           = 0.235f,
+        .br           = -0.075f,
+        .rg           = 0.09f,
+        .g            = 0.585f,
+        .bg           = 0.24f,
+        .rb           = 0.1075f,
+        .gb           = 0.1725f,
+        .b            = 0.72f,
+        .displayGamma = 1.f / 2.f
+    },
+    // libretro DS phat white (sRGB). Credits: hunterk and Pokefan531.
+    [2] =
+    {
+        .targetGamma  = 2.f,
+        .lum          = 0.915f,
+        .r            = 0.815f,
+        .gr           = 0.275f,
+        .br           = -0.09f,
+        .rg           = 0.1f,
+        .g            = 0.64f,
+        .bg           = 0.26f,
+        .rb           = 0.1075f,
+        .gb           = 0.1725f,
+        .b            = 0.72f,
+        .displayGamma = 1.f / 2.f
+    }
+};
+
+// Limits x to the range [min, max]. Values inside the range (and NaN, for
+// which both comparisons are false) are returned unchanged.
+ALWAYS_INLINE float clamp_float(const float x, const float min, const float max)
+{
+    if(x < min)
+        return min;
+    if(x > max)
+        return max;
+
+    return x;
+}
+
+/**
+ * @brief Fills the color lookup table at COLOR_LUT_ADDR for the given profile.
+ *
+ * For each of the 32768 possible 15-bit colors (blue in bits 0-4, green in
+ * bits 5-9, red in bits 10-14 of the index) the corrected color is computed
+ * and stored as a 32-bit entry: alpha 0xFF in bits 0-7, blue in bits 8-15,
+ * green in bits 16-23 and red in bits 24-31. The table is flushed from the
+ * data cache afterwards.
+ *
+ * @param p Color profile to apply. Must not be NULL.
+ */
+static void makeColorLut(const ColorProfile *const p)
+{
+    // One entry per 15-bit input color.
+    const u32 numEntries = 32768;
+
+    // The profile does not change while the table is built. Read it once.
+    const float targetGamma  = p->targetGamma;
+    const float lum          = p->lum;
+    const float displayGamma = p->displayGamma;
+
+    u32 *colorLut = (u32*)COLOR_LUT_ADDR;
+    for(u32 i = 0; i < numEntries; i++)
+    {
+        // Convert to 8-bit and normalize.
+        float b = (float)rgbFive2Eight(i & 31u) / 255;
+        float g = (float)rgbFive2Eight((i>>5) & 31u) / 255;
+        float r = (float)rgbFive2Eight(i>>10) / 255;
+
+        // Convert to linear gamma.
+        b = powf(b, targetGamma);
+        g = powf(g, targetGamma);
+        r = powf(r, targetGamma);
+
+        // Apply luminance.
+        b = clamp_float(b * lum, 0.f, 1.f);
+        g = clamp_float(g * lum, 0.f, 1.f);
+        r = clamp_float(r * lum, 0.f, 1.f);
+
+        /*
+         * Input
+         * [r]
+         * [g]
+         * [b]
+         *
+         * Correction   Output
+         * [ r][gr][br] [r]
+         * [rg][ g][bg] [g]
+         * [rb][gb][ b] [b]
+         */
+        // Assuming no alpha channel in original calculation.
+        float newB = p->rb * r + p->gb * g + p->b * b;
+        float newG = p->rg * r + p->g * g + p->bg * b;
+        float newR = p->r * r + p->gr * g + p->br * b;
+
+        // Negative results (possible due to negative coefficients) must not reach powf().
+        newB = (newB < 0.f ? 0.f : newB);
+        newG = (newG < 0.f ? 0.f : newG);
+        newR = (newR < 0.f ? 0.f : newR);
+
+        // Convert to display gamma.
+        newB = powf(newB, displayGamma);
+        newG = powf(newG, displayGamma);
+        newR = powf(newR, displayGamma);
+
+        // Denormalize, clamp, convert to ABGR8 and write lut.
+        // The clamped values are converted to u32 before shifting. Shifting a
+        // signed int with value 255 left by 24 does not fit in 32 bits signed
+        // and is undefined behavior.
+        u32 tmp = 0xFF; // Alpha.
+        tmp |= (u32)clamp_s32(lroundf(newB * 255), 0, 255)<<8;
+        tmp |= (u32)clamp_s32(lroundf(newG * 255), 0, 255)<<16;
+        tmp |= (u32)clamp_s32(lroundf(newR * 255), 0, 255)<<24;
+        *colorLut++ = tmp;
+    }
+
+    // Make the table visible to readers that bypass this core's data cache.
+    flushDCacheRange((void*)COLOR_LUT_ADDR, numEntries * sizeof(u32));
+}
+
static Result dumpFrameTex(void)
{
// Stop LgyCap before dumping the frame to prevent glitches.
@@ -110,7 +222,7 @@ static Result dumpFrameTex(void)
// Note: This is a race with the currently displaying frame buffer
// because we just swapped buffers in the gfx handler function.
u32 *const tmpBuf = GFX_getBuffer(GFX_LCD_TOP, GFX_SIDE_LEFT);
- GX_displayTransfer((u32*)0x18200000, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim,
+ GX_displayTransfer((u32*)GPU_TEXTURE_ADDR, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim,
PPF_O_FMT(GX_A1BGR5) | PPF_I_FMT(GX_A1BGR5) | PPF_CROP_EN);
memcpy(tmpBuf, &bmpHeaders, sizeof(bmpHeaders));
GFX_waitForPPF();
@@ -131,6 +243,11 @@ static Result dumpFrameTex(void)
return res;
}
+// Interrupt handler registered for IRQ_IPI15 in OAF_videoInit().
+// Signals g_convFinishedEvent, which gbaGfxHandler() receives as its event
+// when color correction is enabled. The interrupt source is not used.
+static void convFinishedHandler(UNUSED const u32 intSource)
+{
+ signalEvent(g_convFinishedEvent, false);
+}
+
static void gbaGfxHandler(void *args)
{
const KHandle event = (KHandle)args;
@@ -181,7 +298,7 @@ static void gbaGfxHandler(void *args)
taskExit();
}
-static KHandle setupFrameCapture(const u8 scaler)
+static KHandle setupFrameCapture(const u8 scaler, const bool colorCorrectionEnabled)
{
const bool is240x160 = scaler < 2;
static s16 matrix[12 * 8] =
@@ -213,7 +330,7 @@ static KHandle setupFrameCapture(const u8 scaler)
gbaCfg.cnt = LGYCAP_SWIZZLE | LGYCAP_ROT_NONE | LGYCAP_FMT_A1BGR5 | (is240x160 ? 0 : LGYCAP_HSCALE_EN | LGYCAP_VSCALE_EN);
gbaCfg.w = (is240x160 ? 240 : 360);
gbaCfg.h = (is240x160 ? 160 : 240);
- gbaCfg.irq = 0;
+ gbaCfg.irq = (colorCorrectionEnabled ? LGYCAP_IRQ_DMA_REQ : 0); // We need the DMA request IRQ for core 1.
gbaCfg.vLen = 6;
gbaCfg.vPatt = 0b00011011;
memcpy(gbaCfg.vMatrix, matrix, 6 * 8 * 2);
@@ -234,13 +351,41 @@ KHandle OAF_videoInit(void)
GFX_powerOffBacklight(GFX_BL_BOT);
#endif
- // Initialize frame capture and frame handler.
+ // Initialize frame capture.
const u8 scaler = g_oafConfig.scaler;
- const KHandle frameReadyEvent = setupFrameCapture(scaler);
- patchGbaGpuCmdList(scaler);
- createTask(0x800, 3, gbaGfxHandler, (void*)frameReadyEvent);
+ const u8 colorProfile = g_oafConfig.colorProfile;
+ KHandle frameReadyEvent;
+ KHandle convFinishedEvent;
+ if(colorProfile > 0)
+ {
+ // Start capture hardware and create event handles.
+ frameReadyEvent = setupFrameCapture(scaler, true);
+ convFinishedEvent = createEvent(false);
+ g_convFinishedEvent = convFinishedEvent;
- // Adjust gamma table and setup button overrides.
+ // Patch GPU cmd list with texture location 2.
+ patchGbaGpuCmdList(scaler, true);
+
+ // Compute the (linear) 3D lookup table.
+ makeColorLut(&g_colorProfiles[colorProfile - 1]);
+
+ // Register IPI handler and start core 1 for color conversion.
+ IRQ_registerIsr(IRQ_IPI15, 13, 0, convFinishedHandler);
+ __systemBootCore1((scaler < 2 ? convert160pFrameFast : convert240pFrameFast));
+ }
+ else
+ {
+ // Start capture hardware.
+ frameReadyEvent = setupFrameCapture(scaler, false);
+
+ // Patch GPU cmd list with texture location 1.
+ patchGbaGpuCmdList(scaler, false);
+ }
+
+ // Start frame handler.
+ createTask(0x800, 3, gbaGfxHandler, (void*)(colorProfile > 0 ? convFinishedEvent : frameReadyEvent));
+
+ // Adjust hardware gamma table.
adjustGammaTableForGba();
// Load border if any exists.
@@ -265,4 +410,9 @@ void OAF_videoExit(void)
// frameReadyEvent deleted by this function.
// gbaGfxHandler() will automatically terminate.
LGYCAP_deinit(LGYCAP_DEV_TOP);
+ if(g_convFinishedEvent != 0)
+ {
+ deleteEvent(g_convFinishedEvent);
+ g_convFinishedEvent = 0;
+ }
}
\ No newline at end of file
diff --git a/source/arm11/open_agb_firm.c b/source/arm11/open_agb_firm.c
index af92636..5abaad7 100644
--- a/source/arm11/open_agb_firm.c
+++ b/source/arm11/open_agb_firm.c
@@ -363,6 +363,7 @@ void oafUpdate(void)
CODEC_runHeadphoneDetection();
updateBacklight();
waitForEvent(g_frameReadyEvent);
+ clearEvent(g_frameReadyEvent);
}
void oafFinish(void)