unit trap;

{
 Trap and system-call entry glue.

 This unit provides:
  - the trap number constants and their human readable names,
  - the interrupt lock helpers (sig_lock/sig_unlock and their raw variants),
  - the fast_syscall assembly entry point and its Pascal back end
    amd64_syscall,
  - the signal trampolines (host_sigcode, guest_sigcode, host_sigipi),
  - the page-fault trap handler (trap, trap_pfault).
}

{$mode ObjFPC}{$H+}
{$CALLING SysV_ABI_CDecl}

interface

uses
 sysutils,
 ucontext,
 kern_thr;

const
 T_PRIVINFLT = 1; // privileged instruction
 T_BPTFLT    = 3; // breakpoint instruction
 T_ARITHTRAP = 6; // arithmetic trap
 T_PROTFLT   = 9; // protection fault
 T_TRCTRAP   =10; // debug exception (sic)
 T_PAGEFLT   =12; // page fault
 T_ALIGNFLT  =14; // alignment fault
 T_DIVIDE    =18; // integer divide fault
 T_NMI       =19; // non-maskable trap
 T_OFLOW     =20; // overflow trap
 T_BOUND     =21; // bound instruction fault
 T_DNA       =22; // device not available fault
 T_DOUBLEFLT =23; // double fault
 T_FPOPFLT   =24; // fp coprocessor operand fetch fault
 T_TSSFLT    =25; // invalid tss fault
 T_SEGNPFLT  =26; // segment not present fault
 T_STKFLT    =27; // stack fault
 T_MCHK      =28; // machine check trap
 T_XMMFLT    =29; // SIMD floating-point exception
 T_RESERVED  =30; // reserved (unknown)
 T_DTRACE_RET=32; // DTrace pid return

 // XXX most of the following codes aren't used, but could be.

 // definitions for sys/signal.h
 ILL_PRIVIN_FAULT=T_PRIVINFLT;
 ILL_ALIGN_FAULT =T_ALIGNFLT ;
 ILL_FPOP_FAULT  =T_FPOPFLT  ; // coprocessor operand fault

 // codes for SIGBUS
 BUS_PAGE_FAULT  =T_PAGEFLT ; // page fault protection base
 BUS_SEGNP_FAULT =T_SEGNPFLT; // segment not present
 BUS_STK_FAULT   =T_STKFLT  ; // stack segment
 BUS_SEGM_FAULT  =T_RESERVED; // segment protection base

 // Traps coming from user mode
 T_USER=$100;

 // Highest trap number that has an entry in trap_msg
 MAX_TRAP_MSG=32;

 // Human readable trap names, indexed by trap number (0..MAX_TRAP_MSG).
 // Unused trap numbers map to an empty string.
 trap_msg:array[0..MAX_TRAP_MSG] of PChar=(
  '',                              //  0 unused
  'privileged instruction fault',  //  1 T_PRIVINFLT
  '',                              //  2 unused
  'breakpoint instruction fault',  //  3 T_BPTFLT
  '',                              //  4 unused
  '',                              //  5 unused
  'arithmetic trap',               //  6 T_ARITHTRAP
  '',                              //  7 unused
  '',                              //  8 unused
  'general protection fault',      //  9 T_PROTFLT
  'trace trap',                    // 10 T_TRCTRAP
  '',                              // 11 unused
  'page fault',                    // 12 T_PAGEFLT
  '',                              // 13 unused
  'alignment fault',               // 14 T_ALIGNFLT
  '',                              // 15 unused
  '',                              // 16 unused
  '',                              // 17 unused
  'integer divide fault',          // 18 T_DIVIDE
  'non-maskable interrupt trap',   // 19 T_NMI
  'overflow trap',                 // 20 T_OFLOW
  'FPU bounds check fault',        // 21 T_BOUND
  'FPU device not available',      // 22 T_DNA
  'double fault',                  // 23 T_DOUBLEFLT
  'FPU operand fetch fault',       // 24 T_FPOPFLT
  'invalid TSS fault',             // 25 T_TSSFLT
  'segment not present fault',     // 26 T_SEGNPFLT
  'stack fault',                   // 27 T_STKFLT
  'machine check trap',            // 28 T_MCHK
  'SIMD floating-point exception', // 29 T_XMMFLT
  'reserved (unknown) fault',      // 30 T_RESERVED
  '',                              // 31 unused (reserved)
  'DTrace pid return trap'         // 32 T_DTRACE_RET
 );

procedure _sig_lock;
procedure _sig_unlock;
procedure sig_lock;
procedure sig_unlock;

procedure fast_syscall;
procedure amd64_syscall;

procedure host_sigcode;
procedure host_sigipi;

function  IS_TRAP_FUNC(rip:qword):Boolean;

function  trap(frame:p_trapframe;usermode:Boolean):Integer;
function  trap_pfault(frame:p_trapframe;usermode:Boolean):Integer;

implementation

uses
 errno,
 vm,
 vmparam,
 vm_map,
 vm_fault,
 machdep,
 md_context,
 signal,
 kern_proc,
 sys_bootparam,
 subr_backtrace;

//

// Raw lock: atomically increments the 32-bit teb.iflag counter of the current
// thread (addressed through %gs). RFLAGS is saved/restored around the update
// so the caller's flags are not disturbed. No pending-AST check is done here.
procedure _sig_lock; assembler; nostackframe;
asm
 pushf
 lock incl %gs:teb.iflag //lock interrupt
 popf
end;

// Raw unlock: atomically decrements teb.iflag, preserving the caller's RFLAGS.
// No pending-AST check is done here.
procedure _sig_unlock; assembler; nostackframe;
asm
 pushf
 lock decl %gs:teb.iflag //unlock interrupt
 popf
end;

// Lock with AST check: atomically adds 1 to teb.iflag. If the previous value
// was zero (outermost lock) and the current thread has TDF_AST set, enters
// fast_syscall with rax=0. %rax and RFLAGS are preserved for the caller.
procedure sig_lock; assembler; nostackframe;
label
 _exit;
asm
 //prolog (debugger)
 pushq %rbp
 movq  %rsp,%rbp

 pushq %rax
 pushf

 movq  $1,%rax
 lock  xadd %rax,%gs:teb.iflag //lock interrupt; rax:=previous counter value
 test  %rax,%rax
 jnz   _exit                   //already locked by an outer caller

 movqq %gs:teb.thread,%rax     //curkthread
 testl TDF_AST,kthread.td_flags(%rax)
 je    _exit                   //no AST pending

 mov   $0,%rax
 call  fast_syscall

 _exit:

 //epilog (debugger)
 popf
 popq  %rax
 popq  %rbp
end;

// Unlock with AST check: atomically decrements teb.iflag. When the counter
// reaches zero (outermost unlock) and the current thread has TDF_AST set,
// enters fast_syscall with rax=0. %rax and RFLAGS are preserved for the caller.
procedure sig_unlock; assembler; nostackframe;
label
 _exit;
asm
 //prolog (debugger)
 pushq %rbp
 movq  %rsp,%rbp

 pushq %rax
 pushf

 lock  decl %gs:teb.iflag      //unlock interrupt
 jnz   _exit                   //still locked by an outer caller

 movqq %gs:teb.thread,%rax     //curkthread
 testl TDF_AST,kthread.td_flags(%rax)
 je    _exit                   //no AST pending

 mov   $0,%rax
 call  fast_syscall

 _exit:

 //epilog (debugger)
 popf
 popq  %rax
 popq  %rbp
end;

type
 // Signature used to invoke a syscall handler with its six register arguments.
 tsyscall=function(rdi,rsi,rdx,rcx,r8,r9:QWORD):Integer;

var
 // For syscall argument N (0..5): index, in QWORD units, of the trapframe
 // field that holds it. Note the 4th argument is read from tf_r10, not tf_rcx.
 sys_args_idx:array[0..5] of Byte=(
  Byte(ptruint(@p_trapframe(nil)^.tf_rdi) div SizeOf(QWORD)),
  Byte(ptruint(@p_trapframe(nil)^.tf_rsi) div SizeOf(QWORD)),
  Byte(ptruint(@p_trapframe(nil)^.tf_rdx) div SizeOf(QWORD)),
  Byte(ptruint(@p_trapframe(nil)^.tf_r10) div SizeOf(QWORD)),
  Byte(ptruint(@p_trapframe(nil)^.tf_r8 ) div SizeOf(QWORD)),
  Byte(ptruint(@p_trapframe(nil)^.tf_r9 ) div SizeOf(QWORD))
 );

// Writes `header` followed by one line per syscall argument to `f`.
//  f        - destination text file
//  header   - first line of the report
//  td_frame - trapframe holding the syscall number (tf_rax) and arguments
// The argument count is taken from the syscall table entry for tf_rax and is
// clamped to the number of register arguments described by sys_args_idx, so a
// bogus sy_narg can never index past that table.
procedure print_syscall_args(var f:text;const header:RawByteString;td_frame:p_trapframe);
var
 i,count:Integer;
 str:RawByteString; //dynamic string: a shortstring would silently truncate at 255 chars
begin
 str:=header+#13#10;

 count:=p_proc.p_sysent^.sv_table[td_frame^.tf_rax].sy_narg;

 //never read beyond the register arguments we know how to locate
 if (count>Length(sys_args_idx)) then
 begin
  count:=Length(sys_args_idx);
 end;

 if (count>0) then
 begin
  For i:=0 to count-1 do
  begin
   str:=str+' ['+IntToStr(i+1)+']:0x'+HexStr(PQWORD(td_frame)[sys_args_idx[i]],16)+#13#10;
  end;
 end;

 Write(f,str);
end;

// Reports an unhandled syscall: suspends all threads except p_host_ipc_td
// (per the thread_suspend_all argument), dumps the syscall arguments and a
// backtrace to StdErr, forwards the message through p_host_ipc.error and then
// fails an assertion carrying the syscall name.
// NOTE(review): the syscall number is printed in decimal (IntToStr) after a
// '0x' prefix - confirm whether hex was intended.
procedure print_error_syscall(td_frame:p_trapframe);
var
 count:integer;
 str:shortstring;
begin
 thread_suspend_all(p_host_ipc_td);

 count:=p_proc.p_sysent^.sv_table[td_frame^.tf_rax].sy_narg;
 Assert(count<=6);

 str:='Unhandled syscall:0x'+
      IntToSTr(td_frame^.tf_rax)+':'+
      p_proc.p_sysent^.sv_table[td_frame^.tf_rax].sy_name;

 print_syscall_args(StdErr,str,td_frame);

 print_backtrace_td(StdErr);

 p_host_ipc.error(str);

 Assert(false,p_proc.p_sysent^.sv_table[td_frame^.tf_rax].sy_name)
end;

// Pascal back end of fast_syscall. Snapshots the FPU control word and MXCSR,
// records them in the thread's fpstate, fetches the syscall arguments, and
// restores both control registers before returning. If the saved rip was
// changed it requests a full context restore through PCB_FULL_IRET.
procedure amd64_syscall;
var
 td:p_kthread;
 td_frame:p_trapframe;
 scall:tsyscall;
 rip:QWORD;
 error:Integer;
 is_guest:Boolean;
 //
 FPUCW:WORD;
 MXCSR:DWORD;
begin
 FPUCW:=Get8087CW;
 MXCSR:=GetMXCSR;

 //Call by ID table

 td:=curkthread;
 td_frame:=@td^.td_frame;

 if (FPUCW<>__INITIAL_FPUCW__) then
 begin
  //writeln('changed FPUCW(0x',HexStr(FPUCW,4),') on ',td^.td_name);
 end;

 if ((MXCSR and __INITIAL_MXCSR__)<>__INITIAL_MXCSR__) then
 begin
  writeln('changed MXCSR(0x',HexStr((MXCSR and __INITIAL_MXCSR__),8),') on ',td^.td_name);
 end;

 td^.td_fpstate.XMM_SAVE_AREA.ControlWord:=FPUCW;
 td^.td_fpstate.XMM_SAVE_AREA.MxCsr      :=MXCSR;

 cpu_fetch_syscall_args(td);

 rip:=td_frame^.tf_rip; //remember the return address seen on entry

 error:=0;
 scall:=nil;
 is_guest:=False;

 // NOTE(review): the statement below is reproduced exactly as it appears in
 // the reviewed text, which seems to have lost everything between "tf_rax"
 // and "td^.td_frame.tf_rip" (the uses of scall/error/is_guest and the
 // comparison of the saved rip are not visible). It is not valid Pascal as
 // written; restore the missing span from version control before building.
 // It is deliberately not reconstructed by guesswork here.
 if (td_frame^.tf_raxtd^.td_frame.tf_rip) then
 begin
  set_pcb_flags(td,PCB_FULL_IRET);
  //call ipi_sigreturn
 end;

 //move to pcb?
 Set8087CW(FPUCW);
 SetMXCSR (MXCSR);
end;

// Assembly syscall entry point.
//  in : rax = syscall number, rdi/rsi/rdx/rcx/r8/r9 = arguments as passed by
//       the caller (rcx is stored into the tf_r10 slot of the trapframe).
// Flow:
//  1. Saves rax/rcx/flags; fails with rax=14 and CF set if there is no
//     current thread.
//  2. Unless PCB_IS_HLE is set, switches to the thread's kernel stack and
//     clears PCB_FULL_IRET.
//  3. Fills td_frame from the live registers and the caller's frame
//     (rbp/rsp/rip are recovered through the %rbp chain set up by the prolog).
//  4. Calls amd64_syscall, handles pending ASTs, then either returns to the
//     caller with the registers from td_frame or, when PCB_FULL_IRET was
//     requested, performs a full restore via ipi_sigreturn.
// The 8-byte marker after the code is what IndexMarker scans for.
procedure fast_syscall; assembler; nostackframe;
label
 _align,
 _restore,
 _after_call,
 _doreti,
 _fail,
 _ast,
 _doreti_exit;
asm
 //prolog (debugger)
 pushq %rbp
 movq  %rsp,%rbp

 movqq %rax,%r11 //save rax
 movqq %rcx,%r10 //save rcx

 pushf
 pop   %rcx      //save flags to rcx

 movqq %gs:teb.thread,%rax //curkthread
 test  %rax,%rax
 jz    _fail

 testl PCB_IS_HLE,kthread.pcb_flags(%rax)
 jne   _align    //keep the current stack and pcb flags

 movqq kthread.td_kstack.stack(%rax),%rsp //td_kstack (Implicit lock interrupt)

 andl  NOT_PCB_FULL_IRET,kthread.pcb_flags(%rax) //clear PCB_FULL_IRET

 _align:

 andq  $-16,%rsp //align stack

 movqq $0  ,kthread.td_frame.tf_rflags(%rax) //clear
 //NOTE(review): %ch holds bits 8..15 of the saved RFLAGS but is written to
 //byte offset 0 of tf_rflags - confirm this is the intended layout.
 movb  %ch ,kthread.td_frame.tf_rflags(%rax) //save flags

 movqq %rdi,kthread.td_frame.tf_rdi   (%rax)
 movqq %rsi,kthread.td_frame.tf_rsi   (%rax)
 movqq %rdx,kthread.td_frame.tf_rdx   (%rax)
 movqq $0,kthread.td_frame.tf_rcx   (%rax)
 movqq %r8 ,kthread.td_frame.tf_r8    (%rax)
 movqq %r9 ,kthread.td_frame.tf_r9    (%rax)
 movqq %r11,kthread.td_frame.tf_rax   (%rax) //original rax (syscall number)
 movqq %rbx,kthread.td_frame.tf_rbx   (%rax)
 movqq %r10,kthread.td_frame.tf_r10   (%rax) //original rcx (4th argument)
 movqq $0,kthread.td_frame.tf_r11   (%rax)
 movqq %r12,kthread.td_frame.tf_r12   (%rax)
 movqq %r13,kthread.td_frame.tf_r13   (%rax)
 movqq %r14,kthread.td_frame.tf_r14   (%rax)
 movqq %r15,kthread.td_frame.tf_r15   (%rax)

 movqq $0  ,kthread.td_frame.tf_trapno(%rax)
 movqq $0  ,kthread.td_frame.tf_addr  (%rax)
 movqq $0  ,kthread.td_frame.tf_flags (%rax)
 movqq $5  ,kthread.td_frame.tf_err   (%rax) //sizeof(call $32)

 movqq (%rbp),%r11 //get prev rbp
 movqq %r11,kthread.td_frame.tf_rbp(%rax)

 lea   16(%rbp),%r11 //get prev rsp
 movqq %r11,kthread.td_frame.tf_rsp(%rax)

 movqq 8(%rbp),%r11 //get prev rip
 movqq %r11,kthread.td_frame.tf_rip(%rax)

 call  amd64_syscall

 _after_call:

 movqq %gs:teb.thread,%rcx //curkthread

 //Requested full context restore
 testl PCB_FULL_IRET,kthread.pcb_flags(%rcx)
 jnz   _doreti

 testl PCB_IS_HLE,kthread.pcb_flags(%rcx)
 jne   _restore

 testl TDF_AST,kthread.td_flags(%rcx)
 jne   _ast

 //Restore preserved registers.
 _restore:

 //get flags
 movqq kthread.td_frame.tf_rflags(%rcx),%rax
 push  %rax
 popf

 movqq kthread.td_frame.tf_rdi(%rcx),%rdi
 movqq kthread.td_frame.tf_rsi(%rcx),%rsi
 movqq kthread.td_frame.tf_rdx(%rcx),%rdx
 movqq kthread.td_frame.tf_rax(%rcx),%rax

 //tf_rsp was recorded as rbp+16; step back to the saved rbp slot
 movqq kthread.td_frame.tf_rsp(%rcx),%r11
 lea   -16(%r11),%r11

 movqq %r11,%rsp //restore rsp (Implicit unlock interrupt)

 movqq $0,%rcx
 movqq $0,%r11

 //epilog (debugger)
 popq  %rbp
 ret

 //fail (curkthread=nil)
 _fail:

 or    $1,%rcx //set CF
 push  %rcx
 popf

 movqq $14,%rax //EFAULT
 movqq $0,%rdx
 movqq $0,%rcx
 movqq $0,%r11

 popq  %rbp
 ret

 //ast
 _ast:

 call  ast
 jmp   _after_call

 //doreti
 _doreti:

 //Reload curkthread on every iteration: %rcx is caller-saved, so it is not
 //guaranteed to survive the "call ast" below before we loop back here.
 movqq %gs:teb.thread,%rcx //curkthread

 testl PCB_IS_HLE,kthread.pcb_flags(%rcx)
 jne   _doreti_exit

 //%rcx=curkthread
 testl TDF_AST,kthread.td_flags(%rcx)
 je    _doreti_exit

 call  ast
 jmp   _doreti

 _doreti_exit:

 //Restore full.
 call  ipi_sigreturn
 hlt

 //marker
 .quad 0xDEADC0DEDEADC0DE
end;

// Host-side signal trampoline: calls the handler stored in the sigframe at
// the top of the stack, then issues syscall 417 (sys_sigreturn) through
// fast_syscall with rdi pointing at the frame's sf_uc. The hlt is reached
// only if the syscall returns here.
procedure host_sigcode; assembler; nostackframe; public;
asm
 call  sigframe.sf_ahu(%rsp)
 lea   sigframe.sf_uc (%rsp),%rdi
 pushq $0
 movqq $417,%rax //sys_sigreturn
 call  fast_syscall
 hlt
end;

var
 // Pre-assembled guest counterpart of host_sigcode (uses SYSCALL instead of
 // a call to fast_syscall).
 guest_sigcode:array[0..21] of Byte=(
  $ff,$14,$24,                 //CALL  qword ptr [RSP]
  $48,$8d,$7c,$24,$40,         //LEA   RDI,[RSP + 0x40]
  $6a,$00,                     //PUSH  0x0
  $48,$c7,$c0,$a1,$01,$00,$00, //MOV   RAX,417
  $0f,$05,                     //SYSCALL
  $f4,                         //HLT
  $eb,$fd                      //JMP   -3
 ); public;

 // Size in bytes of guest_sigcode.
 guest_szsigcode:Integer=Length(guest_sigcode); public;

// Signal return path used for IPIs: calls sys_sigreturn directly on the
// frame's sf_uc, runs ast while the current thread has TDF_AST set
// (reloading curkthread each iteration), then performs a full restore via
// ipi_sigreturn.
procedure host_sigipi; assembler; nostackframe; public;
label
 _ast,
 _ast_exit;
asm
 lea   sigframe.sf_uc(%rsp),%rdi
 call  sys_sigreturn

 //ast
 _ast:

 movqq %gs:teb.thread,%rax //curkthread
 testl TDF_AST,kthread.td_flags(%rax)
 je    _ast_exit

 call  ast
 jmp   _ast

 _ast_exit:

 call  ipi_sigreturn
 hlt
end;

////

// Scans forward byte by byte from pbuf and returns the address of the first
// QWORD equal to $DEADC0DEDEADC0DE. The scan has no upper bound: the caller
// must guarantee the marker exists after pbuf.
function IndexMarker(pbuf:Pointer):Pointer;
begin
 Result:=nil;
 while True do
 begin
  if (PQWORD(pbuf)^=QWORD($DEADC0DEDEADC0DE)) then
  begin
   Break;
  end;
  Inc(pbuf);
 end;
 Result:=pbuf;
end;

var
 // Address of the end marker inside fast_syscall; resolved on first use.
 fast_syscall_end:Pointer=nil;

// Returns True when rip lies inside fast_syscall, i.e. between its entry
// address and the marker that terminates it (both inclusive). The marker
// address is located on the first call and cached in fast_syscall_end.
function IS_TRAP_FUNC(rip:qword):Boolean; public;
begin
 if (fast_syscall_end=nil) then
  fast_syscall_end:=IndexMarker(@fast_syscall);

 Result:=(
          (rip>=QWORD(@fast_syscall)) and
          (rip<=QWORD(fast_syscall_end)) //fast_syscall func size
         );
end;

{
function IS_USERMODE(td:p_kthread;frame:p_trapframe):Boolean; inline;
begin
 Result:=(frame^.tf_rsp>QWORD(td^.td_kstack.stack)) or (frame^.tf_rsp<=(QWORD(td^.td_kstack.sttop)));
end;
}

// Generic trap dispatcher.
//  frame    - trapframe describing the trap (tf_trapno selects the handler)
//  usermode - forwarded to the handler
// Returns the handler's result for T_PAGEFLT and 0 for every other trap
// number.
function trap(frame:p_trapframe;usermode:Boolean):Integer;
begin
 Result:=0;

 case frame^.tf_trapno of
  T_PAGEFLT:
   begin
    Result:=trap_pfault(frame,usermode);
    //print_backtrace_td(stderr);
    //writeln;
   end;
 end;
end;

// Prints a diagnostic dump for an unrecoverable trap to StdErr: trap number
// and name, mode, faulting address, rip/rsp/rbp and, when a current thread
// exists, its user and kernel stack bounds.
//  frame    - trapframe of the trap
//  eva      - faulting virtual address
//  usermode - selects the 'user'/'kernel' wording
procedure trap_fatal(frame:p_trapframe;eva:vm_offset_t;usermode:Boolean);
var
 td:p_kthread;
 trapno:Integer;
 msg,msg2:pchar;
begin
 trapno:=frame^.tf_trapno;

 //trapno is signed: check both bounds before indexing the message table
 if (trapno>=0) and (trapno<=MAX_TRAP_MSG) then
  msg:=trap_msg[trapno]
 else
  msg:='UNKNOWN';

 if usermode then
  msg2:='user'
 else
  msg2:='kernel';

 Writeln(StdErr,'Fatal trap ',trapno,': ',msg,' while in ',msg2,' mode');

 Writeln(StdErr,'fault virtual address = 0x',HexStr(eva,16));
 Writeln(StdErr,'instruction pointer   = 0x',HexStr(frame^.tf_rip,16));
 Writeln(StdErr,'stack pointer         = 0x',HexStr(frame^.tf_rsp,16));
 Writeln(StdErr,'frame pointer         = 0x',HexStr(frame^.tf_rbp,16));

 td:=curkthread;
 if (td<>nil) then
 begin
  Writeln(StdErr,'td_ustack.stack       = 0x',HexStr(td^.td_ustack.stack));
  Writeln(StdErr,'td_ustack.sttop       = 0x',HexStr(td^.td_ustack.sttop));
  Writeln(StdErr,'td_kstack.stack       = 0x',HexStr(td^.td_kstack.stack));
  Writeln(StdErr,'td_kstack.sttop       = 0x',HexStr(td^.td_kstack.sttop));
 end;
end;

// Page-fault handler.
//  frame    - trapframe of the fault (tf_addr is the faulting address)
//  usermode - True when the fault was raised from user mode
// Returns:
//   0       - fault resolved by vm_fault, or execution redirected to
//             td^.pcb_onfault (non-user mode only)
//   SIGBUS  - vm_fault reported KERN_PROTECTION_FAILURE (user mode)
//   SIGSEGV - any other failure, a non-guest address, or TDP_NOFAULTING set
//   -1      - unresolved fault outside user mode with no pcb_onfault handler
//             (trap_fatal has printed the diagnostics)
function trap_pfault(frame:p_trapframe;usermode:Boolean):Integer;
var
 td:p_kthread;
 eva:vm_offset_t;
 map:vm_map_t;
 rv:Integer;
begin
 Result:=SIGSEGV;

 td:=curkthread;
 eva:=frame^.tf_addr;

 if ((td^.td_pflags and TDP_NOFAULTING)<>0) then
 begin
  Writeln('TDP_NOFAULTING:',curkthread^.td_name);
  Exit(SIGSEGV);
 end;

 if is_guest_addr(eva) then
 begin
  map:=p_proc.p_vmspace;

  rv:=vm_fault.vm_fault(map,
                        frame^.tf_addr,
                        frame^.tf_rip,
                        frame^.tf_err,
                        VM_FAULT_NORMAL);

  case rv of
   0:Exit(0);
   KERN_PROTECTION_FAILURE:Result:=SIGBUS;
   else
    Result:=SIGSEGV;
  end;
 end else
 begin
  Writeln('not is_guest_addr:0x',HexStr(eva,16),':',curkthread^.td_name);
  Result:=SIGSEGV;
 end;

 if (not usermode) then
 begin
  if (td^.pcb_onfault<>nil) then
  begin
   //resume at the registered recovery address instead of signalling
   Writeln('pcb_onfault<:',HexStr(td^.pcb_onfault),':',curkthread^.td_name);
   frame^.tf_rip:=QWORD(td^.pcb_onfault);
   Exit(0);
  end else
  begin
   trap_fatal(frame, eva, usermode);
   Exit(-1);
  end;
 end;
end;

end.