open_agb_firm/source/arm11/fast_frame_convert.s

312 lines
15 KiB
ArmAsm

@ This file is part of open_agb_firm
@ Copyright (C) 2024 profi200
@
@ This program is free software: you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation, either version 3 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "asm_macros.h"
#include "mem_map.h"
.syntax unified
.cpu mpcore
.fpu vfpv2
@ Whole frame converter.
/*BEGIN_ASM_FUNC convertFrameFast
@ Load frame, output and lookup table pointers.
@ Our frame is in a 512x512 texture. Same for the output.
@ The table is a 15 to 32-bit 3D lookup table with color correction pre-applied.
ldr r0, =0x18200000 @ r0 = 0x18200000;
ldr r1, =0x18300000 @ r1 = 0x18300000;
ldr r2, =0x1FF00000 @ r2 = 0x1FF00000;
@ Prefetch first cache line, save registers, load color mask and load 8 line counter.
pld [r0] @ Prefetch from r0.
stmfd sp!, {r4-r11, lr} @ Save registers.
ldrh r12, =0x7FFF @ r12 = 0x7FFF;
mov r11, #30 @ r11 = 30;
@ Convert 8 lines each round until we have a whole frame.
convertFrameFast_8l_lp:
@ Load size of 8 lines in bytes.
mov r3, #0x1680 @ r3 = 0x1680;
@ Convert 8 pixels each round until we have 8 lines.
convertFrameFast_8p_lp:
@ Load 8 pixels from frame.
ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
@ Decrement size and extract first 2 pixels.
subs r3, r3, #16 @ r3 -= 16; // Updates flags.
and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
lsr r5, r8, #17 @ r5 = r8>>17;
@ Look up pixel 1 and extract pixel 3.
ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32.
and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
@ Look up pixel 2 and extract pixel 4.
ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32.
lsr r7, r9, #17 @ r7 = r9>>17;
@ Look up pixel 3 and extract pixel 5.
ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32.
and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
@ Look up pixel 4 and extract pixel 6.
ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32.
lsr r9, r10, #17 @ r9 = r10>>17;
@ Look up pixel 5 and extract pixel 7.
ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32.
and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
@ Look up pixel 6 and extract pixel 8.
ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32.
lsr lr, lr, #17 @ lr = lr>>17;
@ Look up pixel 7 and 8.
ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32.
ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32.
@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
bne convertFrameFast_8p_lp @ if(r3 != 0) goto convertFrameFast_8p_lp;
@ Decrement 8 line counter, skip texture padding and jump back if we are not done yet.
subs r11, r11, #1 @ r11--; // Updates flags.
add r0, r0, #0x980 @ r0 += 0x980;
add r1, r1, #0x1300 @ r1 += 0x1300;
bne convertFrameFast_8l_lp @ if(r11 != 0) goto convertFrameFast_8l_lp;
ldmfd sp!, {r4-r11, pc} @ Restore registers and return.
END_ASM_FUNC*/
@ Converts a 160p frame while it's being DMAd to memory.
BEGIN_ASM_FUNC convert160pFrameFast
@ Enable top LCD LgyCap IRQs.
mov r0, #77 @ r0 = 77; // id IRQ_LGYCAP_TOP.
mov r1, #0 @ r1 = 0; // prio 0 (highest).
mov r2, #0 @ r2 = 0; // target 0 (this CPU).
mov r3, #0 @ r3 = 0; // isr NULL.
blx IRQ_registerIsr @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
@ We will be using IRQs without our IRQ handler to minimize latency.
cpsid i @ __disableIrq();
@ Load lookup table address and color mask.
ldr r2, =0x1FF00000 @ r2 = 0x1FF00000;
ldrh r12, =0x7FFF @ r12 = 0x7FFF;
convert160pFrameFast_frame_lp:
@ Load input and output addresses.
ldr r0, =0x18200000 @ r0 = 0x18200000; // u32.
@ldr r1, =0x18300000 @ r1 = 0x18300000; // u32.
add r1, r0, #0x100000 @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
@ Convert 8 lines each round until we have a whole frame.
convert160pFrameFast_8l_lp:
ldr r4, =0x10111008 @ r4 = &REG_LGYCAP1_STAT; // u32.
ldr r5, =MPCORE_PRIV_BASE @ r5 = MPCORE_PRIV_BASE; // u32.
convert160pFrameFast_wait_irq:
@ Wait for LgyCap IRQs.
wfi @ __waitForInterrupt();
@ Acknowledge IRQ and extract line number.
ldr r11, [r4] @ r11 = REG_LGYCAP_STAT; // u32.
ldr r7, [r5, #0x10C] @ r7 = REG_GICC_INTACK; // u32.
str r11, [r4] @ REG_LGYCAP_STAT = r11; // u32.
lsrs r11, r11, #16 @ r11 >>= 16; // Updates flags.
str r7, [r5, #0x110] @ REG_GICC_EOI = r7; // u32.
@ Ignore DREQ IRQ for line 0.
beq convert160pFrameFast_wait_irq @ if((r11>>16) == 0) goto convert160pFrameFast_wait_irq;
convert160pFrameFast_skip_irq_wait:
@ Load size of 8 lines in bytes.
mov r3, #0xF00 @ r3 = 0xF00;
@ Convert 8 pixels each round until we have 8 lines.
convert160pFrameFast_8p_lp:
@ Load 8 pixels from frame.
ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
@ Decrement size and extract first 2 pixels.
subs r3, r3, #16 @ r3 -= 16; // Updates flags.
and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
lsr r5, r8, #17 @ r5 = r8>>17;
@ Look up pixel 1 and extract pixel 3.
ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32.
and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
@ Look up pixel 2 and extract pixel 4.
ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32.
lsr r7, r9, #17 @ r7 = r9>>17;
@ Look up pixel 3 and extract pixel 5.
ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32.
and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
@ Look up pixel 4 and extract pixel 6.
ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32.
lsr r9, r10, #17 @ r9 = r10>>17;
@ Look up pixel 5 and extract pixel 7.
ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32.
and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
@ Look up pixel 6 and extract pixel 8.
ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32.
lsr lr, lr, #17 @ lr = lr>>17;
@ Look up pixel 7 and 8.
ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32.
ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32.
@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
bne convert160pFrameFast_8p_lp @ if(r3 != 0) goto convert160pFrameFast_8p_lp;
@ Test if 8 line counter is 152, skip texture padding and jump back if we are not done yet.
cmp r11, #152 @ r11 - 152; // Updates flags.
add r0, r0, #0x1100 @ r0 += 0x1100;
add r1, r1, #0x2200 @ r1 += 0x2200;
moveq r11, #160 @ if(r11 == 152) r11 = 160;
beq convert160pFrameFast_skip_irq_wait @ if(r11 == 152) goto convert160pFrameFast_skip_irq_wait;
bls convert160pFrameFast_8l_lp @ if(r11 <= 152) goto convert160pFrameFast_8l_lp;
@ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
@ Note: r3 has been decremented down to 0 previously and so it's safe to use.
mcr p15, 0, r3, c7, c14, 0 @ Clean and Invalidate Entire Data Cache.
ldr r4, =MPCORE_PRIV_BASE @ r4 = MPCORE_PRIV_BASE; // u32.
mov r5, #0x10000 @ r5 = 0x10000;
orr r5, r5, #0xF @ r5 |= 0xF;
add r4, r4, #0x1F00 @ r4 += 0x1F00; // REG_GICD_SOFTINT.
mcr p15, 0, r3, c7, c10, 4 @ Data Synchronization Barrier.
str r5, [r4] @ *r4 = r5; // u32.
b convert160pFrameFast_frame_lp @ goto convert160pFrameFast_frame_lp;
END_ASM_FUNC
@ Converts the frame while it's being DMAd to memory.
BEGIN_ASM_FUNC convert240pFrameFast
@ Enable top LCD LgyCap IRQs.
mov r0, #77 @ r0 = 77; // id IRQ_LGYCAP_TOP.
mov r1, #0 @ r1 = 0; // prio 0 (highest).
mov r2, #0 @ r2 = 0; // target 0 (this CPU).
mov r3, #0 @ r3 = 0; // isr NULL.
blx IRQ_registerIsr @ IRQ_registerIsr(IRQ_LGYCAP_TOP, 0, 0, (IrqIsr)NULL);
@ We will be using IRQs without our IRQ handler to minimize latency.
cpsid i @ __disableIrq();
@ Load lookup table address and color mask.
ldr r2, =0x1FF00000 @ r2 = 0x1FF00000;
ldrh r12, =0x7FFF @ r12 = 0x7FFF;
convert240pFrameFast_frame_lp:
@ Load input and output addresses.
ldr r0, =0x18200000 @ r0 = 0x18200000; // u32.
@ldr r1, =0x18300000 @ r1 = 0x18300000; // u32.
add r1, r0, #0x100000 @ r1 = r0 + 0x100000; // Note: ldr would be faster here (result latency). Saves 4 bytes.
@ Convert 8 lines each round until we have a whole frame.
convert240pFrameFast_8l_lp:
ldr r4, =0x10111008 @ r4 = &REG_LGYCAP1_STAT; // u32.
ldr r5, =MPCORE_PRIV_BASE @ r5 = MPCORE_PRIV_BASE; // u32.
convert240pFrameFast_wait_irq:
@ Wait for LgyCap IRQs.
wfi @ __waitForInterrupt();
@ Acknowledge IRQ and extract line number.
ldr r11, [r4] @ r11 = REG_LGYCAP_STAT; // u32.
ldr r7, [r5, #0x10C] @ r7 = REG_GICC_INTACK; // u32.
str r11, [r4] @ REG_LGYCAP_STAT = r11; // u32.
lsrs r11, r11, #16 @ r11 >>= 16; // Updates flags.
str r7, [r5, #0x110] @ REG_GICC_EOI = r7; // u32.
@ Ignore DREQ IRQ for line 0.
beq convert240pFrameFast_wait_irq @ if((r11>>16) == 0) goto convert240pFrameFast_wait_irq;
convert240pFrameFast_skip_irq_wait:
@ Load size of 8 lines in bytes.
mov r3, #0x1680 @ r3 = 0x1680;
@ Convert 8 pixels each round until we have 8 lines.
convert240pFrameFast_8p_lp:
@ Load 8 pixels from frame.
ldmia r0!, {r8-r10, lr} @ r8_to_r10_lr = *((_16BytesBlock*)r0); r0 += 16;
@ Decrement size and extract first 2 pixels.
subs r3, r3, #16 @ r3 -= 16; // Updates flags.
and r4, r12, r8, lsr #1 @ r4 = 0x7FFF & (r8>>1); // r12 is 0x7FFF.
lsr r5, r8, #17 @ r5 = r8>>17;
@ Look up pixel 1 and extract pixel 3.
ldr r4, [r2, r4, lsl #2] @ r4 = r2[r4]; // u32.
and r6, r12, r9, lsr #1 @ r6 = 0x7FFF & (r9>>1); // r12 is 0x7FFF.
@ Look up pixel 2 and extract pixel 4.
ldr r5, [r2, r5, lsl #2] @ r5 = r2[r5]; // u32.
lsr r7, r9, #17 @ r7 = r9>>17;
@ Look up pixel 3 and extract pixel 5.
ldr r6, [r2, r6, lsl #2] @ r6 = r2[r6]; // u32.
and r8, r12, r10, lsr #1 @ r8 = 0x7FFF & (r10>>1); // r12 is 0x7FFF.
@ Look up pixel 4 and extract pixel 6.
ldr r7, [r2, r7, lsl #2] @ r7 = r2[r7]; // u32.
lsr r9, r10, #17 @ r9 = r10>>17;
@ Look up pixel 5 and extract pixel 7.
ldr r8, [r2, r8, lsl #2] @ r8 = r2[r8]; // u32.
and r10, r12, lr, lsr #1 @ r10 = 0x7FFF & (lr>>1); // r12 is 0x7FFF.
@ Look up pixel 6 and extract pixel 8.
ldr r9, [r2, r9, lsl #2] @ r9 = r2[r9]; // u32.
lsr lr, lr, #17 @ lr = lr>>17;
@ Look up pixel 7 and 8.
ldr r10, [r2, r10, lsl #2] @ r10 = r2[r10]; // u32.
ldr lr, [r2, lr, lsl #2] @ lr = r2[lr]; // u32.
@ Prefetch next cache line, write 8 pixels and jump back if we are not done yet.
pld [r0, #32] @ Prefetch from r0 + 32. // Offset 32 is a tiny bit better. Most of the time the result is the same as 64.
stmia r1!, {r4-r10, lr} @ *((_32BytesBlock*)r1) = r4_to_r10_lr; r1 += 32;
bne convert240pFrameFast_8p_lp @ if(r3 != 0) goto convert240pFrameFast_8p_lp;
@ Test if 8 line counter is 232, skip texture padding and jump back if we are not done yet.
cmp r11, #232 @ r11 - 232; // Updates flags.
add r0, r0, #0x980 @ r0 += 0x980;
add r1, r1, #0x1300 @ r1 += 0x1300;
moveq r11, #240 @ if(r11 == 232) r11 = 240;
beq convert240pFrameFast_skip_irq_wait @ if(r11 == 232) goto convert240pFrameFast_skip_irq_wait;
bls convert240pFrameFast_8l_lp @ if(r11 <= 232) goto convert240pFrameFast_8l_lp;
@ Flush the D-Cache, wait for flush completion, notify core 0 and jump back.
@ Note: r3 has been decremented down to 0 previously and so it's safe to use.
mcr p15, 0, r3, c7, c14, 0 @ Clean and Invalidate Entire Data Cache.
ldr r4, =MPCORE_PRIV_BASE @ r4 = MPCORE_PRIV_BASE; // u32.
mov r5, #0x10000 @ r5 = 0x10000;
orr r5, r5, #0xF @ r5 |= 0xF;
add r4, r4, #0x1F00 @ r4 += 0x1F00; // REG_GICD_SOFTINT.
mcr p15, 0, r3, c7, c10, 4 @ Data Synchronization Barrier.
str r5, [r4] @ *r4 = r5; // u32.
b convert240pFrameFast_frame_lp @ goto convert240pFrameFast_frame_lp;
END_ASM_FUNC