flycast/core/rec-ARM/ngen_arm.S

313 lines
6.2 KiB
ArmAsm
Raw Normal View History

2013-12-19 17:10:14 +00:00
@@
#include "build.h"
2013-12-19 17:10:14 +00:00
@ Assemble the whole file as 32-bit ARM (A32) instructions.
.arm
@ NOTE(review): with GNU as on ARM, .align takes a power-of-two exponent,
@ so this requests 256-byte alignment - confirm that is the intent.
.align 8
@ Cycle budget: preset into the cycle counter by ngen_mainloop and added
@ back to it by intc_sched.
.equ SH4_TIMESLICE, 448
@ Not referenced anywhere else in this file.
.equ BM_BLOCKLIST_MASK, 65532 @FFFC
@ Not referenced anywhere else in this file.
.equ CPU_RATIO, 5
2020-04-26 08:03:57 +00:00
@ Symbol decoration helpers.
@ Apple builds: C symbols get a leading underscore and the visibility
@ helper expands to nothing.
@ All other builds: C symbols are used as-is and the visibility helper
@ emits a .hidden directive for the decorated name.
#ifdef __APPLE__
#define CSYM(n) _##n
#define HIDDEN(n)
#else
#define CSYM(n) n
#define HIDDEN(n) .hidden CSYM(n)
#endif
2013-12-19 17:10:14 +00:00
@@@@@@@@@@ some helpers @@@@@@@@@@
.global CSYM(do_sqw_nommu_area_3)
HIDDEN(do_sqw_nommu_area_3)
@ do_sqw_nommu_area_3: copy one 32-byte store queue to RAM.
@ In:       r0 = addr
@           r1 = sq_both
@ Source:   r1 + (addr & 0x20)
@ Dest:     r1 + 0x0C000000 + 512 + (addr bits 5..24, kept 32-byte aligned)
@ Clobbers: r0-r3, d0-d3
CSYM(do_sqw_nommu_area_3):
        and     r2, r0, #0x20               @ SQ# selection, isolate
        ubfx    r0, r0, #5, #20             @ ram offset in 32-byte units
        add     r3, r1, #0x0C000000         @ ram ptr from r1, part 1 (uses the unmodified r1)
        add     r1, r1, r2                  @ r1 = selected SQ
        add     r3, r3, #512                @ ram ptr from r1, part 2
        add     r3, r3, r0, lsl #5          @ ram + offset
        vldm    r1, {d0-d3}                 @ read 32 bytes from the SQ
        vstm    r3, {d0-d3}                 @ write 32 bytes to ram
        bx      lr
.global CSYM(TAWriteSQ)
HIDDEN(TAWriteSQ)
@ TAWriteSQ: dispatch one store-queue write according to its destination.
@ In:    r0 = addr
@        r1 = sq_both
@ The selected queue is r1 + (addr & 0x20); the destination offset is addr
@ with its top seven bits cleared.
@   offset <  0x800000  : tail-call _Z13ta_vtx_data32Pv with r0 = selected queue
@   offset <  0x1000000 : tail-call _Z8YUV_dataPjj with r0 = selected queue, r1 = 1
@   otherwise           : VRAM copy path (hits the bkpt first)
@ Clobbers: r0-r3, flags, d0-d3, plus whatever the tail-called routines clobber.
CSYM(TAWriteSQ):
        bic     r3, r0, #0xFE000000         @ clear unused bits
        and     r0, r0, #0x20               @ SQ#, isolate
        cmp     r3, #0x800000               @ TA write?
        add     r0, r1, r0                  @ SQ#, add to SQ ptr
        bcc     CSYM(_Z13ta_vtx_data32Pv)   @ TA write?

CSYM(TAWriteSQ_yuv):
        cmp     r3, #0x1000000              @ Yuv write ?
        bcs     CSYM(TAWriteSQ_vram)
        mov     r1, #1
        b       CSYM(_Z8YUV_dataPjj)

CSYM(TAWriteSQ_vram):                       @ vram write ..
#ifdef TARGET_IPHONE
        bkpt    #0
#else
        bkpt
#endif
        @ r0 still holds the selected queue here. The offset is extracted
        @ into r2 so that r0 survives and the copy reads the queue chosen
        @ by address bit 5 rather than always reading from the sq_both base.
        ubfx    r2, r3, #5, #19             @ get vram offset
        @ should be only 18 bits for 8MB VRAM but it wraps around on dc
        add     r3, r1, #0x04000000         @ get vram ptr from r1, part 1
        add     r3, r3, #512                @ get vram ptr from r1, part 2
        add     r3, r3, r2, lsl #5          @ vram + offset
        vldm    r0, {d0-d3}                 @ 32 bytes from the selected queue
        vstm    r3, {d0-d3}
        bx      lr
2015-07-25 06:39:35 +00:00
#if FEAT_SHREC != DYNAREC_NONE
2013-12-19 17:10:14 +00:00
@@@@@@@@@@ ngen_LinkBlock_*****_stub @@@@@@@@@@
@ Link stubs. Each entry stub only sets r1 and then joins the shared tail:
@   Generic     : r1 = r4 (djump/pc, in case it is needed)
@   cond_Branch : r1 = 1
@   cond_Next   : r1 = 0
.global CSYM(ngen_LinkBlock_Generic_stub)
HIDDEN(ngen_LinkBlock_Generic_stub)
CSYM(ngen_LinkBlock_Generic_stub):
        mov     r1, r4
        b       CSYM(ngen_LinkBlock_Shared_stub)

.global CSYM(ngen_LinkBlock_cond_Branch_stub)
HIDDEN(ngen_LinkBlock_cond_Branch_stub)
CSYM(ngen_LinkBlock_cond_Branch_stub):
        mov     r1, #1
        b       CSYM(ngen_LinkBlock_Shared_stub)

.global CSYM(ngen_LinkBlock_cond_Next_stub)
HIDDEN(ngen_LinkBlock_cond_Next_stub)
CSYM(ngen_LinkBlock_cond_Next_stub):
        mov     r1, #0
        b       CSYM(ngen_LinkBlock_Shared_stub)

@ Shared tail: r0 = lr - 4 (the word just before the return address, i.e.
@ the call itself), r1 as set by the entry stub. Calls rdv_LinkBlock and
@ jumps to the address it returns in r0.
.global CSYM(ngen_LinkBlock_Shared_stub)
HIDDEN(ngen_LinkBlock_Shared_stub)
CSYM(ngen_LinkBlock_Shared_stub):
        sub     r0, lr, #4                  @ go before the call
        bl      CSYM(rdv_LinkBlock)
        bx      r0
@@@@@@@@@@ ngen_FailedToFindBlock_ @@@@@@@@@@
.global CSYM(ngen_FailedToFindBlock_)
HIDDEN(ngen_FailedToFindBlock_)
@ Passes r4 to rdv_FailedToFindBlock as its first argument and jumps to
@ the address returned in r0.
CSYM(ngen_FailedToFindBlock_):
        mov     r0, r4
        bl      CSYM(rdv_FailedToFindBlock)
        bx      r0
@@@@@@@@@@ ngen_blockcheckfail @@@@@@@@@@
.global CSYM(ngen_blockcheckfail)
HIDDEN(ngen_blockcheckfail)
@ Calls rdv_BlockCheckFail with the argument registers left as they were
@ on entry, then jumps to the address returned in r0.
CSYM(ngen_blockcheckfail):
        bl      CSYM(rdv_BlockCheckFail)
        bx      r0
@@@@@@@@@@ ngen_mainloop @@@@@@@@@@
@ you can load the address of the sh4 reg struct on the mainloop init
@ using (u8*)regptr-(u8*)Sh4cntx
@ all registers are < 1024 bytes from that
@ so you can use reg+imm forms for it
.global CSYM(ngen_mainloop)
HIDDEN(ngen_mainloop)
@ ngen_mainloop: entry point of the dispatch loop.
@ In: r0 = context pointer
@ Saves r4-r12 and lr (restored at the cleanup label), then sets up:
@   r8       = context pointer
@   r9 / r11 = cycle counter, preset to SH4_TIMESLICE (r11 on Apple builds)
@   r4       = pc, loaded from [r8, #-184]
@ and enters the loop at no_update.
CSYM(ngen_mainloop):
        push    {r4-r12, lr}
        mov     r8, r0                      @ load context
#ifdef __APPLE__
        mov     r11, #SH4_TIMESLICE         @ load cycle counter
#else
        mov     r9, #SH4_TIMESLICE          @ load cycle counter
#endif
        ldr     r4, [r8, #-184]             @ load pc
        b       CSYM(no_update)             @ go to mainloop
2013-12-19 17:10:14 +00:00
@this code is here for fall-through behavior of do_iter
.global CSYM(intc_sched)
HIDDEN(intc_sched)
@ intc_sched: refill the cycle counter and run UpdateSystem.
@ lr is parked in r4 across the call. If UpdateSystem returns 0 the
@ routine returns through lr. Otherwise it falls into do_iter, which
@ hands r4 (the saved lr) to rdv_DoInterrupts, keeps the result in r4 and
@ falls through to the label placed directly after this block.
CSYM(intc_sched):                           @ next_pc _MUST_ be on ram
#ifdef __APPLE__
        add     r11, r11, #SH4_TIMESLICE
#else
        add     r9, r9, #SH4_TIMESLICE
#endif
        mov     r4, lr
        bl      CSYM(UpdateSystem)
        cmp     r0, #0
        mov     lr, r4                      @ mov leaves the flags from cmp untouched
        bxeq    lr                          @ faster than bxeq r4 (as it should, call stack cache)

do_iter:
        mov     r0, r4
        bl      CSYM(rdv_DoInterrupts)
        mov     r4, r0
.global CSYM(no_update)
HIDDEN(no_update)
@ no_update: dispatch to the code for the pc held in r4.
@ Exits through cleanup when the word at [r8, #-156] (CpuRunning) is zero.
@ Otherwise a code pointer is fetched from a table of 4-byte entries that
@ starts a fixed distance below r8 and is indexed by pc bits 1 and up.
CSYM(no_update):                            @ next_pc _MUST_ be on r4 *R4 NOT R0 anymore*
        ldr     r0, [r8, #-156]             @ load CpuRunning
        cmp     r0, #0
        beq     CSYM(cleanup)
#if RAM_SIZE_MAX == (32 * 1024 * 1024)
        sub     r2, r8, #0x4100000
        ubfx    r1, r4, #1, #24             @ 24+1 bits: 32 MB
        @ RAM wraps around so if actual RAM size is 16MB, we won't overflow
#elif RAM_SIZE_MAX == (16 * 1024 * 1024)
        sub     r2, r8, #0x2100000
        ubfx    r1, r4, #1, #23             @ 23+1 bits: 16 MB
#else
#error "Define RAM_SIZE_MAX"
#endif
        ldr     pc, [r2, r1, lsl #2]
        @bic r1,r4,#0xFF000000
        @ldr pc,[r2,r1,lsl #1]

@ cleanup: restore the registers pushed by ngen_mainloop and return.
HIDDEN(cleanup)
CSYM(cleanup):
        pop     {r4-r12, lr}
        bx      lr

end_ngen_mainloop:
@@@@@@@@@@ arm_mainloop @@@@@@@@@@
#if FEAT_AREC == DYNAREC_JIT
.global CSYM(arm_compilecode)
HIDDEN(arm_compilecode)
@ Calls CompileCode and then re-enters the dispatcher at arm_dispatch.
CSYM(arm_compilecode):
        bl      CSYM(CompileCode)
        b       CSYM(arm_dispatch)
#endif
2013-12-19 17:10:14 +00:00
#ifdef TARGET_IPHONE
@ Literal words holding the addresses of arm_Reg and EntryPoints, loaded
@ pc-relative by arm_mainloop.
Xarm_Reg:       .word CSYM(arm_Reg)
XEntryPoints:   .word CSYM(EntryPoints)
#endif

.global CSYM(arm_mainloop)
HIDDEN(arm_mainloop)
@ arm_mainloop: run one timeslice.
@ In (argument order as the code below consumes it):
@   r0 = cycles for this timeslice
@   r1 = cntx         (ignored on TARGET_IPHONE, which loads arm_Reg instead)
@   r2 = lookup base  (ignored on TARGET_IPHONE, which loads EntryPoints instead)
@ Register roles while the loop runs:
@   r8 = cntx, r4 = lookup base, r5 = cycle count (kept at [r8, #192])
@ Stack: six registers are pushed; r3 is padding only. An even register
@ count keeps sp 8-byte aligned at the bl to CPUFiq (and at the bl to
@ CompileCode in arm_compilecode), which AAPCS requires at calls.
@ arm_exit pops the same list.
CSYM(arm_mainloop):
#if defined(__APPLE__)
        push    {r3, r4, r5, r8, r11, lr}
#else
        push    {r3, r4, r5, r8, r9, lr}
#endif

#ifdef TARGET_IPHONE
        ldr     r8, Xarm_Reg                @ load cntx
        ldr     r4, XEntryPoints            @ load lookup base
#else
        mov     r8, r1                      @ load cntx
        mov     r4, r2                      @ load lookup base
#endif

        ldr     r5, [r8, #192]              @ load cycle count
        add     r5, r0                      @ add cycles for this timeslice
        b       CSYM(arm_dispatch)

.global CSYM(arm_dispatch)
HIDDEN(arm_dispatch)
@ arm_dispatch: load next pc (r0) and the interrupt word (r1) from
@ [r8, #184]. If the interrupt word is non-zero, call CPUFiq and retry;
@ otherwise jump through the lookup table at r4, indexed by pc bits 2..22.
CSYM(arm_dispatch):
        ldrd    r0, r1, [r8, #184]          @ load: Next PC, interrupt
        ubfx    r2, r0, #2, #21             @ assuming 8 MB address space max (23 bits)
        cmp     r1, #0
        bne     arm_dofiq
        ldr     pc, [r4, r2, lsl #2]

arm_dofiq:
        bl      CSYM(CPUFiq)
        b       CSYM(arm_dispatch)

.global CSYM(arm_exit)
HIDDEN(arm_exit)
@ arm_exit: store the remaining cycle count and return to the caller of
@ arm_mainloop. The pop list must mirror the push in arm_mainloop.
CSYM(arm_exit):
        str     r5, [r8, #192]              @ if timeslice is over, save remaining cycles
#if defined(__APPLE__)
        pop     {r3, r4, r5, r8, r11, pc}
#else
        pop     {r3, r4, r5, r8, r9, pc}
#endif
2013-12-19 17:10:14 +00:00
@@@@@@
@matrix mul
#ifndef __ANDROID__
.global CSYM(ftrv_asm)
HIDDEN(ftrv_asm)
@ ftrv_asm: 4x4 matrix times 4-element vector, single precision.
@ In:       r0 = dst (4 floats written)
@           r1 = vec (4 floats read)
@           r2 = mtx (16 floats read, as four groups of four in q8..q11)
@ Result:   dst = q8*vec[0] + q9*vec[1] + q10*vec[2] + q11*vec[3]
@ Clobbers: d0-d1, d4-d5, d16-d23
CSYM(ftrv_asm):
        @3x vld1.32 might be faster
        vldm    r2, {d16-d23}               @ exactly the 64 bytes used as q8-q11
        vldm    r1, {d0-d1}
        vmul.f32 q2, q8, d0[0]
        vmla.f32 q2, q9, d0[1]
        vmla.f32 q2, q10, d1[0]
        vmla.f32 q2, q11, d1[1]
        vstm    r0, {d4,d5}
        bx      lr
.global CSYM(fipr_asm)
HIDDEN(fipr_asm)
@ fipr_asm: dot product of two 4-element single-precision vectors.
@ In:       r0 = first vector (4 floats), r1 = second vector (4 floats)
@ Out:      r0 = bit pattern of the sum of the four products (s0 holds it too)
@ Clobbers: d0-d3
@   idp  = fr[n+0]*fr[m+0];
@   idp += fr[n+1]*fr[m+1];
@   idp += fr[n+2]*fr[m+2];
@   idp += fr[n+3]*fr[m+3];
CSYM(fipr_asm):
        vldm    r0, {d0,d1}                 @ q0 = first vector
        vldm    r1, {d2,d3}                 @ q1 = second vector
        vmul.f32 q0, q0, q1                 @ four lane-wise products
        @NEON is quite nice actually ! if only its performance was good enough ...
        vpadd.f32 d0, d0, d1                @ d0 = {p0+p1, p2+p3}
        vpadd.f32 d0, d0, d0                @ both lanes of d0 = p0+p1+p2+p3
        vmov    r0, s0                      @ store to ret ..
        bx      lr
#endif
#endif