diff --git a/README.md b/README.md index f3212cb..2a448e6 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,9 @@ Video-related settings. `float brightness` - Screen lift * Default: `0.0` +`string colorProfile` - Color correction profile. `none`, `gba`, `nds` or `nds_white`. +* Default: `none` + ### Audio Audio settings. @@ -255,8 +258,9 @@ You may use this under the terms of the GNU General Public License GPL v3 or the * **MAME** * **No-Intro** * **Wolfvak, Sono and all the other people in #GodMode9 on freenode/Discord** -* **endrift, Extrems and all the other people in #mgba on freenode** +* **endrift, Extrems and all the other people in #mgba on Libera.Chat** * **Oleh Prypin (oprypin) for nightly.link** +* **hunterk and Pokefan531 for their amazing libretro shaders** * ...everyone who contributed to **3dbrew.org** -Copyright (C) 2021 derrek, profi200, d0k3 +Copyright (C) 2024 derrek, profi200, d0k3 \ No newline at end of file diff --git a/include/arm11/config.h b/include/arm11/config.h index ef928df..6d5a3cd 100644 --- a/include/arm11/config.h +++ b/include/arm11/config.h @@ -46,6 +46,7 @@ typedef struct float lcdGamma; float contrast; float brightness; + u8 colorProfile; // 0 = none, 1 = GBA, 2 = DS phat, 3 = DS phat white. // [audio] u8 audioOut; // 0 = auto, 1 = speakers, 2 = headphones. diff --git a/include/arm11/fast_frame_convert.h b/include/arm11/fast_frame_convert.h new file mode 100644 index 0000000..c3d6b99 --- /dev/null +++ b/include/arm11/fast_frame_convert.h @@ -0,0 +1,33 @@ +#pragma once + +/* + * This file is part of open_agb_firm + * Copyright (C) 2024 profi200 + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + + + +#ifdef __cplusplus +extern "C" +{ +#endif + +void convert160pFrameFast(void); +void convert240pFrameFast(void); + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file diff --git a/include/arm11/gpu_cmd_lists.h b/include/arm11/gpu_cmd_lists.h index 168c9ce..17b7090 100644 --- a/include/arm11/gpu_cmd_lists.h +++ b/include/arm11/gpu_cmd_lists.h @@ -27,6 +27,8 @@ extern "C" #endif #define GPU_RENDER_BUF_ADDR (0x18180000) +#define GPU_TEXTURE_ADDR (0x18200000) +#define GPU_TEXTURE2_ADDR (0x18300000) #define GBA_INIT_LIST_SIZE (1136) #define GBA_LIST2_SIZE (448) @@ -36,7 +38,7 @@ extern u8 gbaGpuList2[GBA_LIST2_SIZE]; -void patchGbaGpuCmdList(u8 scaleType); +void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture); #ifdef __cplusplus } // extern "C" diff --git a/libraries/libn3ds b/libraries/libn3ds index f5e5e8e..6259b6b 160000 --- a/libraries/libn3ds +++ b/libraries/libn3ds @@ -1 +1 @@ -Subproject commit f5e5e8efb3a33959ee5489e2f13f1aecf62f1e04 +Subproject commit 6259b6b8ffe4bf82481dc93aeadbcc96738c2b9f diff --git a/source/arm11/config.c b/source/arm11/config.c index e8925ff..0b08bb3 100644 --- a/source/arm11/config.c +++ b/source/arm11/config.c @@ -36,7 +36,8 @@ "gbaGamma=2.2\n" \ "lcdGamma=1.54\n" \ "contrast=1.0\n" \ - "brightness=0.0\n\n" \ + "brightness=0.0\n" \ + "colorProfile=none\n\n" \ "[audio]\n" \ "audioOut=0\n" \ "volume=127\n\n" \ @@ -61,6 +62,7 @@ OafConfig g_oafConfig = 1.54f, // lcdGamma 1.f, // contrast 0.f, // brightness + 0, // colorProfile // [audio] 0, // Automatic audio output. 
@@ -154,6 +156,19 @@ static int cfgIniCallback(void* user, const char* section, const char* name, con config->contrast = str2float(value); else if(strcmp(name, "brightness") == 0) config->brightness = str2float(value); + else if(strcmp(name, "colorProfile") == 0) + { + if(strcmp(value, "none") == 0) + config->colorProfile = 0; + else if(strcmp(value, "gba") == 0) + config->colorProfile = 1; + else if(strcmp(value, "nds") == 0) + config->colorProfile = 2; + else if(strcmp(value, "nds_white") == 0) + config->colorProfile = 3; + //else if(strcmp(value, "custom") == 0) // TODO: Implement user provided profile. + // config->colorProfile = 4; + } } else if(strcmp(section, "audio") == 0) { diff --git a/source/arm11/fast_frame_convert.s b/source/arm11/fast_frame_convert.s new file mode 100644 index 0000000..ea4b5cc --- /dev/null +++ b/source/arm11/fast_frame_convert.s @@ -0,0 +1,312 @@ +@ This file is part of open_agb_firm +@ Copyright (C) 2024 profi200 +@ +@ This program is free software: you can redistribute it and/or modify +@ it under the terms of the GNU General Public License as published by +@ the Free Software Foundation, either version 3 of the License, or +@ (at your option) any later version. +@ +@ This program is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +@ GNU General Public License for more details. +@ +@ You should have received a copy of the GNU General Public License +@ along with this program. If not, see <https://www.gnu.org/licenses/>. + +#include "asm_macros.h" +#include "mem_map.h" + +.syntax unified +.cpu mpcore +.fpu vfpv2 + + + +@ Whole frame converter. +/*BEGIN_ASM_FUNC convertFrameFast + @ Load frame, output and lookup table pointers. + @ Our frame is in a 512x512 texture. Same for the output. + @ The table is a 15 to 32-bit 3D lookup table with color correction pre-applied. 
+ ldr r0, =0x18200000 @ r0 = 0x18200000; + ldr r1, =0x18300000 @ r1 = 0x18300000; + ldr r2, =0x1FF00000 @ r2 = 0x1FF00000; + + @ Prefetch first cache line, save registers, load color mask and load 8 line counter. + pld [r0] @ Prefetch from r0. + stmfd sp!, {r4-r11, lr} @ Save registers. + ldrh r12, =0x7FFF @ r12 = 0x7FFF; + mov r11, #30 @ r11 = 30; + + @ Convert 8 lines each round until we have a whole frame. + convertFrameFast_8l_lp: + @ Load size of 8 lines in bytes. + mov r3, #0x1680 @ r3 = 0x1680; + + @ Convert 8 pixels each round until we have 8 lines. + convertFrameFast_8p_lp: + @ Load 8 pixels from frame. + ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16; + + @ Decrement size and extract first 2 pixels. + subs r3, r3, #16 @ r3 -= 16; // Updates flags. + and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF. + lsr r5, r8, #17 @ r5 = r8>>17; + + @ Look up pixel 1 and extract pixel 3. + ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32. + and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF. + + @ Look up pixel 2 and extract pixel 4. + ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32. + lsr r7, r9, #17 @ r7 = r9>>17; + + @ Look up pixel 3 and extract pixel 5. + ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32. + and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF. + + @ Look up pixel 4 and extract pixel 6. + ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32. + lsr r9, r10, #17 @ r9 = r10>>17; + + @ Look up pixel 5 and extract pixel 7. + ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32. + and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF. + + @ Look up pixel 6 and extract pixel 8. + ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32. + lsr lr, lr, #17 @ lr = lr>>17; + + @ Look up pixel 7 and 8. + ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32. + ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32. + + @ Prefetch next cache line, write 8 pixels and jump back if we are not done yet. 
+ pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64. + stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32; + bne convertFrameFast_8p_lp @ if(r3 != 0) goto convertFrameFast_8p_lp; + + @ Decrement 8 line counter, skip texture padding and jump back if we are not done yet. + subs r11, r11, #1 @ r11--; // Updates flags. + add r0, r0, #0x980 @ r0 += 0x980; + add r1, r1, #0x1300 @ r1 += 0x1300; + bne convertFrameFast_8l_lp @ if(r11 != 0) goto convertFrameFast_8l_lp; + + ldmfd sp!, {r4-r11, pc} @ Restore registers and return. +END_ASM_FUNC*/ + +@ Converts a 160p frame while it's being DMAd to memory. +BEGIN_ASM_FUNC convert160pFrameFast + @ Enable top LCD LgyCap IRQs. + mov r0, #77 @ r0 = 77; // id IRQ_LGYCAP_TOP. + mov r1, #0 @ r1 = 0; // prio 0 (highest). + mov r2, #0 @ r2 = 0; // target 0 (this CPU). + mov r3, #0 @ r3 = 0; // isr NULL. + blx IRQ_registerIsr @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL); + + @ We will be using IRQs without our IRQ handler to minimize latency. + cpsid i @ __disableIrq(); + + @ Load lookup table address and color mask. + ldr r2, =0x1FF00000 @ r2 = 0x1FF00000; + ldrh r12, =0x7FFF @ r12 = 0x7FFF; + + convert160pFrameFast_frame_lp: + @ Load input and output addresses. + ldr r0, =0x18200000 @ r0 = 0x18200000; // u32. + @ldr r1, =0x18300000 @ r1 = 0x18300000; // u32. + add r1, r0, #0x100000 @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes. + + @ Convert 8 lines each round until we have a whole frame. + convert160pFrameFast_8l_lp: + ldr r4, =0x10111008 @ r4 = &REG_LGYCAP1_STAT; // u32. + ldr r5, =MPCORE_PRIV_BASE @ r5 = MPCORE_PRIV_BASE; // u32. + + convert160pFrameFast_wait_irq: + @ Wait for LgyCap IRQs. + wfi @ __waitForInterrupt(); + + @ Acknowledge IRQ and extract line number. + ldr r11, [r4] @ r11 = REG_LGYCAP_STAT; // u32. + ldr r7, [r5, #0x10C] @ r7 = REG_GICC_INTACK; // u32. 
+ str r11, [r4] @ REG_LGYCAP_STAT = r11; // u32. + lsrs r11, r11, #16 @ r11 >>= 16; // Updates flags. + str r7, [r5, #0x110] @ REG_GICC_EOI = r7; // u32. + + @ Ignore DREQ IRQ for line 0. + beq convert160pFrameFast_wait_irq @ if((r11>>16) == 0) goto convert160pFrameFast_wait_irq; + + convert160pFrameFast_skip_irq_wait: + @ Load size of 8 lines in bytes. + mov r3, #0xF00 @ r3 = 0xF00; + + @ Convert 8 pixels each round until we have 8 lines. + convert160pFrameFast_8p_lp: + @ Load 8 pixels from frame. + ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16; + + @ Decrement size and extract first 2 pixels. + subs r3, r3, #16 @ r3 -= 16; // Updates flags. + and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF. + lsr r5, r8, #17 @ r5 = r8>>17; + + @ Look up pixel 1 and extract pixel 3. + ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32. + and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF. + + @ Look up pixel 2 and extract pixel 4. + ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32. + lsr r7, r9, #17 @ r7 = r9>>17; + + @ Look up pixel 3 and extract pixel 5. + ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32. + and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF. + + @ Look up pixel 4 and extract pixel 6. + ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32. + lsr r9, r10, #17 @ r9 = r10>>17; + + @ Look up pixel 5 and extract pixel 7. + ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32. + and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF. + + @ Look up pixel 6 and extract pixel 8. + ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32. + lsr lr, lr, #17 @ lr = lr>>17; + + @ Look up pixel 7 and 8. + ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32. + ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32. + + @ Prefetch next cache line, write 8 pixels and jump back if we are not done yet. + pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64. 
+ stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32; + bne convert160pFrameFast_8p_lp @ if(r3 != 0) goto convert160pFrameFast_8p_lp; + + @ Test if 8 line counter is 152, skip texture padding and jump back if we are not done yet. + cmp r11, #152 @ r11 - 152; // Updates flags. + add r0, r0, #0x1100 @ r0 += 0x1100; + add r1, r1, #0x2200 @ r1 += 0x2200; + moveq r11, #160 @ if(r11 == 152) r11 = 160; + beq convert160pFrameFast_skip_irq_wait @ if(r11 == 152) goto convert160pFrameFast_skip_irq_wait; + bls convert160pFrameFast_8l_lp @ if(r11 <= 152) goto convert160pFrameFast_8l_lp; + + @ Flush the D-Cache, wait for flush completion, notify core 0 and jump back. + @ Note: r3 has been decremented down to 0 previously and so it's safe to use. + mcr p15, 0, r3, c7, c14, 0 @ Clean and Invalidate Entire Data Cache. + ldr r4, =MPCORE_PRIV_BASE @ r4 = MPCORE_PRIV_BASE; // u32. + mov r5, #0x10000 @ r5 = 0x10000; + orr r5, r5, #0xF @ r5 |= 0xF; + add r4, r4, #0x1F00 @ r4 += 0x1F00; // REG_GICD_SOFTINT. + mcr p15, 0, r3, c7, c10, 4 @ Data Synchronization Barrier. + str r5, [r4] @ *r4 = r5; // u32. + b convert160pFrameFast_frame_lp @ goto convert160pFrameFast_frame_lp; +END_ASM_FUNC + +@ Converts the frame while it's being DMAd to memory. +BEGIN_ASM_FUNC convert240pFrameFast + @ Enable top LCD LgyCap IRQs. + mov r0, #77 @ r0 = 77; // id IRQ_LGYCAP_TOP. + mov r1, #0 @ r1 = 0; // prio 0 (highest). + mov r2, #0 @ r2 = 0; // target 0 (this CPU). + mov r3, #0 @ r3 = 0; // isr NULL. + blx IRQ_registerIsr @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL); + + @ We will be using IRQs without our IRQ handler to minimize latency. + cpsid i @ __disableIrq(); + + @ Load lookup table address and color mask. + ldr r2, =0x1FF00000 @ r2 = 0x1FF00000; + ldrh r12, =0x7FFF @ r12 = 0x7FFF; + + convert240pFrameFast_frame_lp: + @ Load input and output addresses. + ldr r0, =0x18200000 @ r0 = 0x18200000; // u32. + @ldr r1, =0x18300000 @ r1 = 0x18300000; // u32. 
+ add r1, r0, #0x100000 @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes. + + @ Convert 8 lines each round until we have a whole frame. + convert240pFrameFast_8l_lp: + ldr r4, =0x10111008 @ r4 = &REG_LGYCAP1_STAT; // u32. + ldr r5, =MPCORE_PRIV_BASE @ r5 = MPCORE_PRIV_BASE; // u32. + + convert240pFrameFast_wait_irq: + @ Wait for LgyCap IRQs. + wfi @ __waitForInterrupt(); + + @ Acknowledge IRQ and extract line number. + ldr r11, [r4] @ r11 = REG_LGYCAP_STAT; // u32. + ldr r7, [r5, #0x10C] @ r7 = REG_GICC_INTACK; // u32. + str r11, [r4] @ REG_LGYCAP_STAT = r11; // u32. + lsrs r11, r11, #16 @ r11 >>= 16; // Updates flags. + str r7, [r5, #0x110] @ REG_GICC_EOI = r7; // u32. + + @ Ignore DREQ IRQ for line 0. + beq convert240pFrameFast_wait_irq @ if((r11>>16) == 0) goto convert240pFrameFast_wait_irq; + + convert240pFrameFast_skip_irq_wait: + @ Load size of 8 lines in bytes. + mov r3, #0x1680 @ r3 = 0x1680; + + @ Convert 8 pixels each round until we have 8 lines. + convert240pFrameFast_8p_lp: + @ Load 8 pixels from frame. + ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16; + + @ Decrement size and extract first 2 pixels. + subs r3, r3, #16 @ r3 -= 16; // Updates flags. + and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF. + lsr r5, r8, #17 @ r5 = r8>>17; + + @ Look up pixel 1 and extract pixel 3. + ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32. + and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF. + + @ Look up pixel 2 and extract pixel 4. + ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32. + lsr r7, r9, #17 @ r7 = r9>>17; + + @ Look up pixel 3 and extract pixel 5. + ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32. + and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF. + + @ Look up pixel 4 and extract pixel 6. + ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32. + lsr r9, r10, #17 @ r9 = r10>>17; + + @ Look up pixel 5 and extract pixel 7. 
+ ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32. + and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF. + + @ Look up pixel 6 and extract pixel 8. + ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32. + lsr lr, lr, #17 @ lr = lr>>17; + + @ Look up pixel 7 and 8. + ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32. + ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32. + + @ Prefetch next cache line, write 8 pixels and jump back if we are not done yet. + pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64. + stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32; + bne convert240pFrameFast_8p_lp @ if(r3 != 0) goto convert240pFrameFast_8p_lp; + + @ Test if 8 line counter is 232, skip texture padding and jump back if we are not done yet. + cmp r11, #232 @ r11 - 232; // Updates flags. + add r0, r0, #0x980 @ r0 += 0x980; + add r1, r1, #0x1300 @ r1 += 0x1300; + moveq r11, #240 @ if(r11 == 232) r11 = 240; + beq convert240pFrameFast_skip_irq_wait @ if(r11 == 232) goto convert240pFrameFast_skip_irq_wait; + bls convert240pFrameFast_8l_lp @ if(r11 <= 232) goto convert240pFrameFast_8l_lp; + + @ Flush the D-Cache, wait for flush completion, notify core 0 and jump back. + @ Note: r3 has been decremented down to 0 previously and so it's safe to use. + mcr p15, 0, r3, c7, c14, 0 @ Clean and Invalidate Entire Data Cache. + ldr r4, =MPCORE_PRIV_BASE @ r4 = MPCORE_PRIV_BASE; // u32. + mov r5, #0x10000 @ r5 = 0x10000; + orr r5, r5, #0xF @ r5 |= 0xF; + add r4, r4, #0x1F00 @ r4 += 0x1F00; // REG_GICD_SOFTINT. + mcr p15, 0, r3, c7, c10, 4 @ Data Synchronization Barrier. + str r5, [r4] @ *r4 = r5; // u32. 
+ b convert240pFrameFast_frame_lp @ goto convert240pFrameFast_frame_lp; +END_ASM_FUNC \ No newline at end of file diff --git a/source/arm11/gpu_cmd_lists.c b/source/arm11/gpu_cmd_lists.c index 615febb..603077b 100644 --- a/source/arm11/gpu_cmd_lists.c +++ b/source/arm11/gpu_cmd_lists.c @@ -167,8 +167,16 @@ alignas(16) u8 gbaGpuList2[GBA_LIST2_SIZE] = -void patchGbaGpuCmdList(u8 scaleType) +void patchGbaGpuCmdList(const u8 scaleType, const bool useSecondTexture) { + if(useSecondTexture) + { + u32 tmp = GPU_TEXTURE2_ADDR>>3; + memcpy(&gbaGpuInitList[580], &tmp, 4); + tmp = 0; + memcpy(&gbaGpuInitList[584], &tmp, 4); + } + if(scaleType == 0) { u32 tmp = 0x4440; @@ -215,7 +223,7 @@ void patchGbaGpuCmdList(u8 scaleType) memcpy(&gbaGpuList2[316], &tmp, 4); memcpy(&gbaGpuList2[380], &tmp, 4); } - else return; // Nothing to do. + // else nothing to do. flushDCacheRange(gbaGpuInitList, sizeof(gbaGpuInitList)); flushDCacheRange(gbaGpuList2, sizeof(gbaGpuList2)); diff --git a/source/arm11/oaf_video.c b/source/arm11/oaf_video.c index d3446f3..b03a162 100644 --- a/source/arm11/oaf_video.c +++ b/source/arm11/oaf_video.c @@ -21,6 +21,7 @@ #include "types.h" #include "arm11/config.h" #include "arm11/drivers/gx.h" +#include "drivers/cache.h" #include "util.h" #include "oaf_error_codes.h" #include "arm11/drivers/lgycap.h" @@ -31,8 +32,17 @@ #include "fsutil.h" #include "kernel.h" #include "kevent.h" -#include "arm11/gpu_cmd_lists.h" #include "arm11/drivers/hid.h" +#include "arm11/drivers/interrupt.h" +#include "arm11/gpu_cmd_lists.h" +#include "system.h" +#include "arm11/fast_frame_convert.h" + + +#define COLOR_LUT_ADDR (0x1FF00000u) + + +static KHandle g_convFinishedEvent = 0; @@ -58,6 +68,108 @@ static void adjustGammaTableForGba(void) } } +typedef struct +{ + float targetGamma; + float lum; + float r, gr, br; + float rg, g, bg; + float rb, gb, b; + float displayGamma; +} ColorProfile; + +static const ColorProfile g_colorProfiles[3] = +{ + { // libretro GBA color (sRGB). 
Credits: hunterk and Pokefan531. + 2.f + 0.5f, + 0.93f, + 0.8f, 0.275f, -0.075f, + 0.135f, 0.64f, 0.225f, + 0.195f, 0.155f, 0.65f, + 1.f / 2.f + }, + { // libretro DS phat (sRGB). Credits: hunterk and Pokefan531. + 2.f, + 1.f, + 0.705f, 0.235f, -0.075f, + 0.09f, 0.585f, 0.24f, + 0.1075f, 0.1725f, 0.72f, + 1.f / 2.f + }, + { // libretro DS phat white (sRGB). Credits: hunterk and Pokefan531. + 2.f, + 0.915f, + 0.815f, 0.275f, -0.09f, + 0.1f, 0.64f, 0.26f, + 0.1075f, 0.1725f, 0.72f, + 1.f / 2.f + } +}; + +ALWAYS_INLINE float clamp_float(const float x, const float min, const float max) +{ + return (x < min ? min : (x > max ? max : x)); +} + +static void makeColorLut(const ColorProfile *const p) +{ + u32 *colorLut = (u32*)COLOR_LUT_ADDR; + for(u32 i = 0; i < 32768; i++) + { + // Convert to 8-bit and normalize. + float b = (float)rgbFive2Eight(i & 31u) / 255; + float g = (float)rgbFive2Eight((i>>5) & 31u) / 255; + float r = (float)rgbFive2Eight(i>>10) / 255; + + // Convert to linear gamma. + const float targetGamma = p->targetGamma; + b = powf(b, targetGamma); + g = powf(g, targetGamma); + r = powf(r, targetGamma); + + // Apply luminance. + const float lum = p->lum; + b = clamp_float(b * lum, 0.f, 1.f); + g = clamp_float(g * lum, 0.f, 1.f); + r = clamp_float(r * lum, 0.f, 1.f); + + /* + * Input + * [r] + * [g] + * [b] + * + * Correction Output + * [ r][gr][br] [r] + * [rg][ g][bg] [g] + * [rb][gb][ b] [b] + */ + // Assuming no alpha channel in original calculation. + float newB = p->rb * r + p->gb * g + p->b * b; + float newG = p->rg * r + p->g * g + p->bg * b; + float newR = p->r * r + p->gr * g + p->br * b; + + newB = (newB < 0.f ? 0.f : newB); + newG = (newG < 0.f ? 0.f : newG); + newR = (newR < 0.f ? 0.f : newR); + + // Convert to display gamma. + const float displayGamma = p->displayGamma; + newB = powf(newB, displayGamma); + newG = powf(newG, displayGamma); + newR = powf(newR, displayGamma); + + // Denormalize, clamp, convert to ABGR8 and write lut. 
+ u32 tmp = 0xFF; // Alpha. + tmp |= clamp_s32(lroundf(newB * 255), 0, 255)<<8; + tmp |= clamp_s32(lroundf(newG * 255), 0, 255)<<16; + tmp |= clamp_s32(lroundf(newR * 255), 0, 255)<<24; + *colorLut++ = tmp; + } + + flushDCacheRange((void*)COLOR_LUT_ADDR, 1024u * 128); +} + static Result dumpFrameTex(void) { // Stop LgyCap before dumping the frame to prevent glitches. @@ -110,7 +222,7 @@ static Result dumpFrameTex(void) // Note: This is a race with the currently displaying frame buffer // because we just swapped buffers in the gfx handler function. u32 *const tmpBuf = GFX_getBuffer(GFX_LCD_TOP, GFX_SIDE_LEFT); - GX_displayTransfer((u32*)0x18200000, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim, + GX_displayTransfer((u32*)GPU_TEXTURE_ADDR, PPF_DIM(512, 240), tmpBuf + (alignment / 4), outDim, PPF_O_FMT(GX_A1BGR5) | PPF_I_FMT(GX_A1BGR5) | PPF_CROP_EN); memcpy(tmpBuf, &bmpHeaders, sizeof(bmpHeaders)); GFX_waitForPPF(); @@ -131,6 +243,11 @@ static Result dumpFrameTex(void) return res; } +static void convFinishedHandler(UNUSED const u32 intSource) +{ + signalEvent(g_convFinishedEvent, false); +} + static void gbaGfxHandler(void *args) { const KHandle event = (KHandle)args; @@ -181,7 +298,7 @@ static void gbaGfxHandler(void *args) taskExit(); } -static KHandle setupFrameCapture(const u8 scaler) +static KHandle setupFrameCapture(const u8 scaler, const bool colorCorrectionEnabled) { const bool is240x160 = scaler < 2; static s16 matrix[12 * 8] = @@ -213,7 +330,7 @@ static KHandle setupFrameCapture(const u8 scaler) gbaCfg.cnt = LGYCAP_SWIZZLE | LGYCAP_ROT_NONE | LGYCAP_FMT_A1BGR5 | (is240x160 ? 0 : LGYCAP_HSCALE_EN | LGYCAP_VSCALE_EN); gbaCfg.w = (is240x160 ? 240 : 360); gbaCfg.h = (is240x160 ? 160 : 240); - gbaCfg.irq = 0; + gbaCfg.irq = (colorCorrectionEnabled ? LGYCAP_IRQ_DMA_REQ : 0); // We need the DMA request IRQ for core 1. 
gbaCfg.vLen = 6; gbaCfg.vPatt = 0b00011011; memcpy(gbaCfg.vMatrix, matrix, 6 * 8 * 2); @@ -234,13 +351,41 @@ KHandle OAF_videoInit(void) GFX_powerOffBacklight(GFX_BL_BOT); #endif - // Initialize frame capture and frame handler. + // Initialize frame capture. const u8 scaler = g_oafConfig.scaler; - const KHandle frameReadyEvent = setupFrameCapture(scaler); - patchGbaGpuCmdList(scaler); - createTask(0x800, 3, gbaGfxHandler, (void*)frameReadyEvent); + const u8 colorProfile = g_oafConfig.colorProfile; + KHandle frameReadyEvent; + KHandle convFinishedEvent; + if(colorProfile > 0) + { + // Start capture hardware and create event handles. + frameReadyEvent = setupFrameCapture(scaler, true); + convFinishedEvent = createEvent(false); + g_convFinishedEvent = convFinishedEvent; - // Adjust gamma table and setup button overrides. + // Patch GPU cmd list with texture location 2. + patchGbaGpuCmdList(scaler, true); + + // Compute the (linear) 3D lookup table. + makeColorLut(&g_colorProfiles[colorProfile - 1]); + + // Register IPI handler and start core 1 for color conversion. + IRQ_registerIsr(IRQ_IPI15, 13, 0, convFinishedHandler); + __systemBootCore1((scaler < 2 ? convert160pFrameFast : convert240pFrameFast)); + } + else + { + // Start capture hardware. + frameReadyEvent = setupFrameCapture(scaler, false); + + // Patch GPU cmd list with texture location 1. + patchGbaGpuCmdList(scaler, false); + } + + // Start frame handler. + createTask(0x800, 3, gbaGfxHandler, (void*)(colorProfile > 0 ? convFinishedEvent : frameReadyEvent)); + + // Adjust hardware gamma table. adjustGammaTableForGba(); // Load border if any exists. @@ -265,4 +410,9 @@ void OAF_videoExit(void) // frameReadyEvent deleted by this function. // gbaGfxHandler() will automatically terminate. 
LGYCAP_deinit(LGYCAP_DEV_TOP); + if(g_convFinishedEvent != 0) + { + deleteEvent(g_convFinishedEvent); + g_convFinishedEvent = 0; + } } \ No newline at end of file diff --git a/source/arm11/open_agb_firm.c b/source/arm11/open_agb_firm.c index af92636..5abaad7 100644 --- a/source/arm11/open_agb_firm.c +++ b/source/arm11/open_agb_firm.c @@ -363,6 +363,7 @@ void oafUpdate(void) CODEC_runHeadphoneDetection(); updateBacklight(); waitForEvent(g_frameReadyEvent); + clearEvent(g_frameReadyEvent); } void oafFinish(void)