unit vfs_subr; {$mode ObjFPC}{$H+} {$CALLING SysV_ABI_CDecl} interface uses mqueue, vmount, kern_param, sys_event, vfile, vstat, vnode, vnode_if, vdirent, vfcntl, kern_mtx, kern_condvar, kern_synch, time, kern_time, kern_thr; type t_insmntque1_dtr=procedure(v:p_vnode;p:Pointer); function makedev(x,y:Integer):Integer; function vfs_busy(mp:p_mount;flags:Integer):Integer; procedure vfs_unbusy(mp:p_mount); function vfs_getvfs(fsid:p_fsid):p_mount; procedure vfs_getnewfsid(mp:p_mount); procedure vfs_timestamp(tsp:p_timespec); procedure vattr_null(vap:p_vattr); procedure v_incr_usecount(vp:p_vnode); procedure vholdl(vp:p_vnode); procedure vdropl(vp:p_vnode); procedure vgonel(vp:p_vnode); procedure vhold(vp:p_vnode); procedure vdrop(vp:p_vnode); function vrecycle(vp:p_vnode):Integer; procedure vgone(vp:p_vnode); function vget(vp:p_vnode;flags:Integer):Integer; procedure vref(vp:p_vnode); function vrefcnt(vp:p_vnode):Integer; procedure vrele(vp:p_vnode); procedure vput(vp:p_vnode); procedure vunref(vp:p_vnode); procedure vinactive(vp:p_vnode); function vflush(mp:p_mount;rootrefs,flags:Integer):Integer; procedure assert_vi_locked (vp:p_vnode;str:PChar); procedure assert_vi_unlocked (vp:p_vnode;str:PChar); procedure assert_vop_locked (vp:p_vnode;str:PChar); procedure assert_vop_unlocked(vp:p_vnode;str:PChar); procedure assert_vop_elocked (vp:p_vnode;str:PChar); function VOP_WRITE_PRE(ap:p_vop_write_args;var osize,ooffset:Int64):Integer; procedure VOP_WRITE_POST(ap:p_vop_write_args;ret:Integer;var osize,ooffset:Int64); procedure vop_rename_fail(ap:p_vop_rename_args); procedure vop_rename_pre(ap:p_vop_rename_args); procedure vop_create_post(ap:p_vop_create_args;rc:Integer); procedure vop_link_post(ap:p_vop_link_args;rc:Integer); procedure vop_mkdir_post(ap:p_vop_mkdir_args;rc:Integer); procedure vop_mknod_post(ap:p_vop_mknod_args;rc:Integer); procedure vop_remove_post(ap:p_vop_remove_args;rc:Integer); procedure vop_rename_post(ap:p_vop_rename_args;rc:Integer); procedure vop_rmdir_post(ap:p_vop_rmdir_args;rc:Integer); procedure vop_setattr_post(ap:p_vop_setattr_args;rc:Integer); procedure vop_symlink_post(ap:p_vop_symlink_args;rc:Integer); procedure vfs_event_init(); //SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); procedure vfs_event_signal(fsid:p_fsid;event:DWORD;data:ptrint); function vfs_kqfilter(ap:p_vop_kqfilter_args):Integer; function vfs_read_dirent(ap:p_vop_readdir_args;dp:p_dirent;off:QWORD):Integer; procedure vfs_mark_atime(vp:p_vnode); function vfs_unixify_accmode(accmode:p_accmode_t):Integer; function vcount(vp:p_vnode):Integer; function count_dev(dev:Pointer):Integer; //cdev procedure vfs_msync(mp:p_mount;flags:Integer); procedure destroy_vpollinfo_free(vi:p_vpollinfo); procedure destroy_vpollinfo(vi:p_vpollinfo); procedure v_addpollinfo(vp:p_vnode); function vn_pollrecord(vp:p_vnode;events:Integer):Integer; function vn_isdisk(vp:p_vnode;errp:PInteger):Boolean; function vaccess(_type:vtype; file_mode:mode_t; file_uid:uid_t; file_gid:gid_t; accmode:accmode_t; privused:PInteger):Integer; function getnewvnode(tag:PChar;mp:p_mount;vops:p_vop_vector;vpp:pp_vnode):Integer; procedure insmntque_stddtr(vp:p_vnode;dtr_arg:Pointer); function insmntque1(vp:p_vnode;mp:p_mount;dtr:t_insmntque1_dtr;dtr_arg:Pointer):Integer; function insmntque(vp:p_vnode;mp:p_mount):Integer; function vinvalbuf(vp:p_vnode;flags,slpflag,slptimeo:Integer):Integer; function __mnt_vnode_next_all(mvp:pp_vnode;mp:p_mount):p_vnode; function __mnt_vnode_first_all(mvp:pp_vnode;mp:p_mount):p_vnode; procedure 
__mnt_vnode_markerfree_all(mvp:pp_vnode;mp:p_mount); procedure mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount); function mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode; function __mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode; function __mnt_vnode_first_active(mvp:pp_vnode;mp:p_mount):p_vnode; procedure __mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount); procedure vntblinit; //SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); procedure vnlru_proc(); //SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, @vnlru_kp); function filt_fsattach(kn:p_knote):Integer; procedure filt_fsdetach(kn:p_knote); function filt_fsevent(kn:p_knote;hint:QWORD):Integer; const fs_filtops:t_filterops=( f_isfd :0; f_attach:@filt_fsattach; f_detach:@filt_fsdetach; f_event :@filt_fsevent; ); { * List of vnodes that are ready for recycling. } var numvnodes:QWORD=0; vnode_free_list:TAILQ_HEAD=(tqh_first:nil;tqh_last:@vnode_free_list.tqh_first); //vnode mntid_mtx:mtx; vnode_free_list_mtx:mtx; syncer_delayno:Integer; syncer_mask:QWORD; //LIST_HEAD(synclist, bufobj); //static struct synclist *syncer_workitem_pending[2]; sync_mtx:mtx; sync_wakeup:t_cv; syncer_maxdelay:Integer=32; syncdelay:Integer=30; filedelay:Integer=30; dirdelay:Integer=29; metadelay:Integer=28; rushjob:Integer; stat_rush_requests:Integer; fs_knlist:t_knlist; const SYNCER_SHUTDOWN_SPEEDUP=4; var sync_vnode_count:Integer; syncer_worklist_len:Integer; type syncer_state=(SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY); var desiredvnodes :Integer=0; wantfreevnodes:Integer=0; freevnodes :Integer=0; vnlru_nowhere :Integer=0; implementation uses errno, vfs_vnops, subr_uio, sys_vm_object, vsys_generic, kern_rangelock, rtprio, sys_conf; // var dead_vnodeops:vop_vector; external; // { * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. } function VCANRECYCLE(vp:p_vnode):Boolean; inline; begin Result:=((vp^.v_iflag and VI_FREE)<>0) and (vp^.v_holdcnt=0); end; function VSHOULDFREE(vp:p_vnode):Boolean; inline; begin Result:=((vp^.v_iflag and VI_FREE)=0) and (vp^.v_holdcnt=0); end; function VSHOULDBUSY(vp:p_vnode):Boolean; inline; begin Result:=((vp^.v_iflag and VI_FREE)<>0) and (vp^.v_holdcnt<>0); end; var { Shift count for (uintptr_t)vp to initialize vp^.v_hash. } vnsz2log:Integer; { * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of physical pages to vnodes approaches sixteen to one. } const MAXVNODES_MAX=(512 * (1024 * 1024 * 1024 div (16*1024) div 16)); v_page_count=524288; procedure vntblinit; var i:DWORD; begin desiredvnodes:=10000; if (desiredvnodes > MAXVNODES_MAX) then begin desiredvnodes:=MAXVNODES_MAX; end; wantfreevnodes:=desiredvnodes div 4; mtx_init(mntid_mtx,'mntid'); mtx_init(vnode_free_list_mtx,'vnode_free_list'); { * Initialize the filesystem syncer. 
} //syncer_workitem_pending[WI_MPSAFEQ]:=hashinit(syncer_maxdelay, M_VNODE,&syncer_mask); //syncer_workitem_pending[WI_GIANTQ]:=hashinit(syncer_maxdelay, M_VNODE,&syncer_mask); syncer_maxdelay:=syncer_mask + 1; mtx_init(sync_mtx,'Syncer mtx'); cv_init(@sync_wakeup,'syncer'); i:=1; While (i<=sizeof(t_vnode)) do begin Inc(vnsz2log); i:=i shl 1; end; Dec(vnsz2log); end; function vfs_busy(mp:p_mount;flags:Integer):Integer; begin MNT_ILOCK(mp); MNT_REF(mp); while ((mp^.mnt_kern_flag and MNTK_UNMOUNT)<>0) do begin if ((flags and MBF_NOWAIT)<>0) or ((mp^.mnt_kern_flag and MNTK_REFEXPIRE)<>0) then begin MNT_REL(mp); MNT_IUNLOCK(mp); Exit(ENOENT); end; if ((flags and MBF_MNTLSTLOCK)<>0) then begin mtx_unlock(mountlist_mtx); end; mp^.mnt_kern_flag:=mp^.mnt_kern_flag or MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS or PDROP,'vfs_busy', 0); if ((flags and MBF_MNTLSTLOCK)<>0) then begin mtx_lock(mountlist_mtx); end; MNT_ILOCK(mp); end; if ((flags and MBF_MNTLSTLOCK)<>0) then begin mtx_unlock(mountlist_mtx); end; Inc(mp^.mnt_lockref); MNT_IUNLOCK(mp); Exit(0); end; { * Free a busy filesystem. } procedure vfs_unbusy(mp:p_mount); begin MNT_ILOCK(mp); MNT_REL(mp); Assert(mp^.mnt_lockref>0,'negative mnt_lockref'); Dec(mp^.mnt_lockref); if (mp^.mnt_lockref=0) and ((mp^.mnt_kern_flag and MNTK_DRAINING)<>0) then begin mp^.mnt_kern_flag:=mp^.mnt_kern_flag and (not MNTK_DRAINING); wakeup(@mp^.mnt_lockref); end; MNT_IUNLOCK(mp); end; { * Lookup a mount point by filesystem identifier. } function vfs_getvfs(fsid:p_fsid):p_mount; var mp:p_mount; begin mtx_lock(mountlist_mtx); mp:=TAILQ_FIRST(@mountlist); while (mp<>nil) do begin if (mp^.mnt_stat.f_fsid.val[0]=fsid^.val[0]) and (mp^.mnt_stat.f_fsid.val[1]=fsid^.val[1]) then begin MNT_REF(mp); mtx_unlock(mountlist_mtx); Exit(mp); end; mp:=TAILQ_NEXT(mp,@mp^.mnt_list); end; mtx_unlock(mountlist_mtx); Exit(nil); end; function makedev(x,y:Integer):Integer; inline; begin Result:=(x shl 8) or y; end; procedure vfs_getnewfsid(mp:p_mount); var mntid_base:Word; nmp:p_mount; tfsid:fsid_t; mtype:Integer; begin mtx_lock(mntid_mtx); mtype:=mp^.mnt_vfc^.vfc_typenum; tfsid.val[1]:=mtype; mntid_base:=0; mtype:=(mtype and $FF) shl 24; repeat tfsid.val[0]:=makedev(255,mtype or ((mntid_base and $FF00) shl 8) or (mntid_base and $FF)); Inc(mntid_base); nmp:=vfs_getvfs(@tfsid); if (nmp=nil) then break; MNT_REL(nmp); until false; mp^.mnt_stat.f_fsid.val[0]:=tfsid.val[0]; mp^.mnt_stat.f_fsid.val[1]:=tfsid.val[1]; mtx_unlock(mntid_mtx); end; { * Get a current timestamp. 
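 *
 * Usage sketch (illustrative, not part of this unit): a filesystem stamping
 * times in its setattr/write path might do
 *
 *   var ts:timespec;
 *   ...
 *   vfs_timestamp(@ts);
 *   node^.mtime:=ts;   // 'node' is a hypothetical per-fs inode record
 *
 * The timespec type name is assumed from the time unit in the uses clause.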
} procedure vfs_timestamp(tsp:p_timespec); begin getnanotime(tsp); end; { * Set vnode attributes to VNOVAL } procedure vattr_null(vap:p_vattr); begin vap^.va_type :=VNON; vap^.va_size :=VNOVAL; vap^.va_bytes :=VNOVAL; vap^.va_mode :=VNOVAL; vap^.va_nlink :=VNOVAL; vap^.va_uid :=VNOVAL; vap^.va_gid :=VNOVAL; vap^.va_fsid :=VNOVAL; vap^.va_fileid :=VNOVAL; vap^.va_blocksize :=VNOVAL; vap^.va_rdev :=VNOVAL; vap^.va_atime.tv_sec :=VNOVAL; vap^.va_atime.tv_nsec :=VNOVAL; vap^.va_mtime.tv_sec :=VNOVAL; vap^.va_mtime.tv_nsec :=VNOVAL; vap^.va_ctime.tv_sec :=VNOVAL; vap^.va_ctime.tv_nsec :=VNOVAL; vap^.va_birthtime.tv_sec :=VNOVAL; vap^.va_birthtime.tv_nsec:=VNOVAL; vap^.va_flags :=VNOVAL; vap^.va_gen :=VNOVAL; vap^.va_vaflags :=0; end; function vlrureclaim(mp:p_mount):Integer; label next_iter, next_iter_mntunlocked, yield, relock_mnt; var vp:p_vnode; done :Integer; trigger :Integer; usevnodes:Integer; count :Integer; begin usevnodes:=desiredvnodes; if (usevnodes <= 0) then usevnodes:=1; trigger:=v_page_count * 2 div usevnodes; done:=0; vn_start_write(nil, @mp, V_WAIT); MNT_ILOCK(mp); count:=mp^.mnt_nvnodelistsize div 10 + 1; while (count<>0) do begin vp:=TAILQ_FIRST(@mp^.mnt_nvnodelist); while (vp<>nil) do begin if (vp^.v_type<>VMARKER) then Break; vp:=TAILQ_NEXT(vp,@vp^.v_nmntvnodes); end; if (vp=nil) then break; TAILQ_REMOVE (@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes); TAILQ_INSERT_TAIL(@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes); Dec(count); if (not VI_TRYLOCK(vp)) then begin goto next_iter; end; { * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. } if (vp^.v_usecount<>0) or {((vlru_allow_cache_src=0) and (not LIST_EMPTY(@vp^.v_cache_src))) or} ((vp^.v_iflag and VI_DOOMED)<>0) {or ((vp^.v_object<>nil) and (vp^.v_object^.resident_page_count > trigger))} then begin VI_UNLOCK(vp); goto next_iter; end; MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK or LK_EXCLUSIVE or LK_NOWAIT,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%})<>0) then begin vdrop(vp); goto next_iter_mntunlocked; end; VI_LOCK(vp); { * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. 
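 *
 * The full reclaim dance above, in order (a sketch of the surrounding code,
 * not a separate API):
 *
 *   if (not VI_TRYLOCK(vp)) then goto next_iter;  // interlock, no sleeping
 *   ...skip checks...
 *   MNT_IUNLOCK(mp);
 *   vholdl(vp);                                   // pin vp across the locking
 *   VOP_LOCK(vp, LK_INTERLOCK or LK_EXCLUSIVE or LK_NOWAIT, ...);
 *   VI_LOCK(vp);                                  // re-take the interlock,
 *   ...recheck v_usecount, then vgonel(vp)...     // recheck, then doom it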
  }
  if (vp^.v_usecount<>0)
     {or ((vlru_allow_cache_src=0) and (not LIST_EMPTY(@vp^.v_cache_src)))
      or ((vp^.v_object<>nil) and (vp^.v_object^.resident_page_count > trigger))} then
  begin
   VOP_UNLOCK(vp, LK_INTERLOCK);
   goto next_iter_mntunlocked;
  end;
  Assert((vp^.v_iflag and VI_DOOMED)=0,'VI_DOOMED unexpectedly detected in vlrureclaim()');
  //atomic_add_long(@recycles_count, 1);
  vgonel(vp);
  VOP_UNLOCK(vp, 0);
  vdropl(vp);
  Inc(done);
next_iter_mntunlocked:
  //if (not should_yield()) then goto relock_mnt;
  goto yield;
next_iter:
  //if (not should_yield()) then continue;
  MNT_IUNLOCK(mp);
yield:
  kern_yield(PRI_UNCHANGED);
relock_mnt:
  MNT_ILOCK(mp);
 end;
 MNT_IUNLOCK(mp);
 vn_finished_write(mp);
 Exit(done);
end;

function vtryrecycle(vp:p_vnode):Integer; forward;

procedure vnlru_free(count:Integer);
var
 vp:p_vnode;
 vfslocked:Integer;
begin
 mtx_assert(vnode_free_list_mtx);
 For count:=count downto 1 do
 begin
  vp:=TAILQ_FIRST(@vnode_free_list);
  {
   * The list can be modified while the free_list_mtx
   * has been dropped and vp could be nil here.
  }
  if (vp=nil) then break;
  Assert(vp^.v_op<>nil,'vnlru_free: vnode already reclaimed.');
  Assert((vp^.v_iflag and VI_FREE)<>0,'Removing vnode not on freelist');
  Assert((vp^.v_iflag and VI_ACTIVE)=0,'Mangling active vnode');
  TAILQ_REMOVE(@vnode_free_list,vp,@vp^.v_actfreelist);
  {
   * Don't recycle if we can't get the interlock.
  }
  if (not VI_TRYLOCK(vp)) then
  begin
   TAILQ_INSERT_TAIL(@vnode_free_list,vp,@vp^.v_actfreelist);
   continue;
  end;
  Assert(VCANRECYCLE(vp),'vp inconsistent on freelist');
  Dec(freevnodes);
  vp^.v_iflag:=vp^.v_iflag and (not VI_FREE);
  vholdl(vp);
  mtx_unlock(vnode_free_list_mtx);
  VI_UNLOCK(vp);
  vfslocked:=VFS_LOCK_GIANT(vp^.v_mount);
  vtryrecycle(vp);
  VFS_UNLOCK_GIANT(vfslocked);
  {
   * If the recycle succeeded this vdrop will actually free
   * the vnode.  If not it will simply place it back on
   * the free list.
  }
  vdrop(vp);
  mtx_lock(vnode_free_list_mtx);
 end;
end;

//static struct proc *vnlruproc;
var
 vnlruproc_sig:Integer=0;

procedure vnlru_proc();
var
 mp,nmp:p_mount;
 done,vfslocked:Integer;
begin
 //EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST);
 //kproc_suspend_check(p);
 mtx_lock(vnode_free_list_mtx);
 if (freevnodes > wantfreevnodes) then
 begin
  vnlru_free(freevnodes - wantfreevnodes);
 end;
 if (numvnodes <= desiredvnodes * 9 div 10) then
 begin
  vnlruproc_sig:=0;
  wakeup(@vnlruproc_sig);
  //
  mtx_unlock(vnode_free_list_mtx);
  Exit;
 end;
 mtx_unlock(vnode_free_list_mtx);
 done:=0;
 mtx_lock(mountlist_mtx);
 mp:=TAILQ_FIRST(@mountlist);
 While (mp<>nil) do
 begin
  if (vfs_busy(mp, MBF_NOWAIT or MBF_MNTLSTLOCK)<>0) then
  begin
   nmp:=TAILQ_NEXT(mp,@mp^.mnt_list);
   mp:=nmp;
   continue;
  end;
  vfslocked:=VFS_LOCK_GIANT(mp);
  Inc(done,vlrureclaim(mp));
  VFS_UNLOCK_GIANT(vfslocked);
  mtx_lock(mountlist_mtx);
  nmp:=TAILQ_NEXT(mp,@mp^.mnt_list);
  vfs_unbusy(mp);
  //
  mp:=nmp
 end;
 mtx_unlock(mountlist_mtx);
 if (done=0) then
 begin
  Inc(vnlru_nowhere);
 end;
end;
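{
 * Try to recycle a held, otherwise-unreferenced vnode.  Result sketch,
 * summarizing the code below: 0 means the vnode was doomed (or already
 * was), EWOULDBLOCK means its lock could not be taken without sleeping,
 * EBUSY means its filesystem is suspending or it regained a use reference.
}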
function vtryrecycle(vp:p_vnode):Integer;
var
 vnmp:p_mount;
begin
 Assert(vp^.v_holdcnt<>0,'vtryrecycle: Recycling vp %p without a reference.');
 {
  * This vnode may be found and locked via some other list, if so we
  * can't recycle it yet.
 }
 if (VOP_LOCK(vp, LK_EXCLUSIVE or LK_NOWAIT,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%})<>0) then
 begin
  Exit(EWOULDBLOCK);
 end;
 {
  * Don't recycle if its filesystem is being suspended.
 }
 if (vn_start_write(vp, @vnmp, V_NOWAIT)<>0) then
 begin
  VOP_UNLOCK(vp, 0);
  Exit(EBUSY);
 end;
 {
  * If we got this far, we need to acquire the interlock and see if
  * anyone picked up this vnode from another list.  If not, we will
  * mark it with DOOMED via vgonel() so that anyone who does find it
  * will skip over it.
 }
 VI_LOCK(vp);
 if (vp^.v_usecount<>0) then
 begin
  VOP_UNLOCK(vp, LK_INTERLOCK);
  vn_finished_write(vnmp);
  Exit(EBUSY);
 end;
 if ((vp^.v_iflag and VI_DOOMED)=0) then
 begin
  //atomic_add_long(@recycles_count, 1);
  vgonel(vp);
 end;
 VOP_UNLOCK(vp, LK_INTERLOCK);
 vn_finished_write(vnmp);
 Exit(0);
end;

function getnewvnode_wait(suspended:Integer):Integer;
begin
 mtx_assert(vnode_free_list_mtx);
 if (curkthread<>nil) and (numvnodes > desiredvnodes) then
 begin
  if (suspended<>0) then
  begin
   {
    * File system is being suspended, we cannot risk a
    * deadlock here, so allocate new vnode anyway.
   }
   if (freevnodes > wantfreevnodes) then
   begin
    vnlru_free(freevnodes - wantfreevnodes);
   end;
   Exit(0);
  end;
  if (vnlruproc_sig=0) then
  begin
   vnlruproc_sig:=1; { avoid unnecessary wakeups }
   wakeup(@vnlruproc_sig);
  end;
  msleep(@vnlruproc_sig,@vnode_free_list_mtx,PVFS,'vlruwk', hz);
 end;
 if (numvnodes>desiredvnodes) then
  Exit(ENFILE)
 else
  Exit(0);
end;

procedure getnewvnode_reserve(count:DWORD);
var
 td:p_kthread;
begin
 td:=curkthread;
 { First try to be quick and racy. }
 if (System.InterlockedExchangeAdd64(numvnodes,count) + count <= desiredvnodes) then
 begin
  Inc(td^.td_vp_reserv,count);
  Exit;
 end else
 begin
  System.InterlockedExchangeAdd64(numvnodes, -count);
 end;
 mtx_lock(vnode_free_list_mtx);
 while (count > 0) do
 begin
  if (getnewvnode_wait(0)=0) then
  begin
   Dec(count);
   Inc(td^.td_vp_reserv);
   System.InterlockedIncrement64(numvnodes);
  end;
 end;
 mtx_unlock(vnode_free_list_mtx);
end;

procedure getnewvnode_drop_reserve();
var
 td:p_kthread;
begin
 td:=curkthread;
 System.InterlockedExchangeAdd64(numvnodes,-td^.td_vp_reserv);
 td^.td_vp_reserv:=0;
end;
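{
 * Usage sketch (illustrative): a caller that must create vnodes while
 * holding locks can pre-reserve against the numvnodes limit and return
 * any unused reservation afterwards:
 *
 *   getnewvnode_reserve(4);
 *   ...
 *   error:=getnewvnode('myfs', mp, @myfs_vnodeops, @vp); // consumes td_vp_reserv
 *   ...
 *   getnewvnode_drop_reserve();
 *
 * 'myfs' and myfs_vnodeops are hypothetical names; any tag and vop_vector
 * will do.
}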
{
 * Return the next vnode from the free list.
}
function getnewvnode(tag:PChar;mp:p_mount;vops:p_vop_vector;vpp:pp_vnode):Integer;
label
 alloc;
var
 td:p_kthread;
 vp:p_vnode;
 //struct bufobj *bo;
 error,susp:Integer;
begin
 vp:=nil;
 td:=curkthread;
 if (td<>nil) then
 begin
  if (td^.td_vp_reserv>0) then
  begin
   Dec(td^.td_vp_reserv,1);
   goto alloc;
  end;
 end;
 mtx_lock(vnode_free_list_mtx);
 {
  * Lend our context to reclaim vnodes if they've exceeded the max.
 }
 if (freevnodes > wantfreevnodes) then
 begin
  vnlru_free(1);
 end;
 susp:=ord(False);
 if (mp<>nil) then
 begin
  susp:=ord(((mp^.mnt_kern_flag and MNTK_SUSPEND)<>0));
 end;
 error:=getnewvnode_wait(susp);
 System.InterlockedIncrement64(numvnodes);
 mtx_unlock(vnode_free_list_mtx);
alloc:
 //atomic_add_long(@vnodes_created, 1);
 vp:=AllocMem(SizeOf(t_vnode));
 {
  * Setup locks.
 }
 vp^.v_vnlock:=@vp^.v_lock;
 mtx_init(vp^.v_interlock,'vnode interlock');
 {
  * By default, don't allow shared locks unless filesystems
  * opt-in.
 }
 mtx_init(vp^.v_vnlock^,'PVFS');
 //lockinit(vp^.v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
 {
  * Initialize bufobj.
 }
 //bo:=@vp^.v_bufobj;
 //bo^.__bo_vnode:=vp;
 //mtx_init(BO_MTX(bo), "bufobj interlock", nil, MTX_DEF);
 //bo^.bo_ops:=@buf_ops_bio;
 //bo^.bo_private:=vp;
 //TAILQ_INIT(@bo^.bo_clean.bv_hd);
 //TAILQ_INIT(@bo^.bo_dirty.bv_hd);
 {
  * Initialize namecache.
 }
 //LIST_INIT(@vp^.v_cache_src);
 //TAILQ_INIT(@vp^.v_cache_dst);
 {
  * Finalize various vnode identity bits.
 }
 vp^.v_type:=VNON;
 vp^.v_tag:=tag;
 vp^.v_op:=vops;
 v_incr_usecount(vp);
 vp^.v_data:=nil;
 //mac_vnode_init(vp);
 //if (mp<>nil and (mp^.mnt_flag and MNT_MULTILABEL)=0)
 // mac_vnode_associate_singlelabel(mp, vp);
 //else if (mp=nil and vops<>@dead_vnodeops)
 // printf'nil mp in getnewvnode()\n';
 if (mp<>nil) then
 begin
  //bo^.bo_bsize:=mp^.mnt_stat.f_iosize;
  if ((mp^.mnt_kern_flag and MNTK_NOKNOTE)<>0) then
  begin
   vp^.v_vflag:=vp^.v_vflag or VV_NOKNOTE;
  end;
 end;
 rangelock_init(@vp^.v_rl);
 {
  * For the filesystems which do not use vfs_hash_insert(),
  * still initialize v_hash to have vfs_hash_index() useful.
  * E.g., nullfs uses vfs_hash_index() on the lower vnode for
  * its own hashing.
 }
 vp^.v_hash:=ptruint(vp) shr vnsz2log;
 vpp^:=vp;
 Exit(0);
end;

{
 * Delete from old mount point vnode list, if on one.
}
procedure delmntque(vp:p_vnode);
var
 mp:p_mount;
 active:Integer;
begin
 mp:=vp^.v_mount;
 if (mp=nil) then Exit;
 MNT_ILOCK(mp);
 VI_LOCK(vp);
 Assert(mp^.mnt_activevnodelistsize <= mp^.mnt_nvnodelistsize,
  'Active vnode list size %d > Vnode list size %d');
 active:=vp^.v_iflag and VI_ACTIVE;
 vp^.v_iflag:=vp^.v_iflag and (not VI_ACTIVE);
 if (active<>0) then
 begin
  mtx_lock(vnode_free_list_mtx);
  TAILQ_REMOVE(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
  Dec(mp^.mnt_activevnodelistsize);
  mtx_unlock(vnode_free_list_mtx);
 end;
 vp^.v_mount:=nil;
 VI_UNLOCK(vp);
 Assert(mp^.mnt_nvnodelistsize > 0,'bad mount point vnode list size');
 TAILQ_REMOVE(@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes);
 Dec(mp^.mnt_nvnodelistsize);
 MNT_REL(mp);
 MNT_IUNLOCK(mp);
end;

procedure insmntque_stddtr(vp:p_vnode;dtr_arg:Pointer);
begin
 vp^.v_data:=nil;
 vp^.v_op:=@dead_vnodeops;
 { XXX non mp-safe fs may still call insmntque with vnode unlocked }
 if (VOP_ISLOCKED(vp)=0) then
 begin
  vn_lock(vp, LK_EXCLUSIVE or LK_RETRY,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
 end;
 vgone(vp);
 vput(vp);
end;

{
 * Insert into list of vnodes for the new mount point, if available.
}
function insmntque1(vp:p_vnode;mp:p_mount;dtr:t_insmntque1_dtr;dtr_arg:Pointer):Integer;
var
 locked:Integer;
begin
 Assert(vp^.v_mount=nil,'insmntque: vnode already on per mount vnode list');
 Assert(mp<>nil, 'Don''t call insmntque(foo, nil)');
 {
  * We acquire the vnode interlock early to ensure that the
  * vnode cannot be recycled by another process releasing a
  * holdcnt on it before we get it on both the vnode list
  * and the active vnode list. The mount mutex protects only
  * manipulation of the vnode list and the vnode freelist
  * mutex protects only manipulation of the active vnode list.
  * Hence the need to hold the vnode interlock throughout.
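 *
 * Typical creation sequence (illustrative sketch): a filesystem allocates
 * a vnode and then hangs it on its mount point, letting the standard dtr
 * destroy it if the mount is going away:
 *
 *   error:=getnewvnode('myfs', mp, @myfs_vnodeops, @vp); // hypothetical tag/ops
 *   if (error=0) then
 *    error:=insmntque(vp, mp);  // = insmntque1(vp, mp, @insmntque_stddtr, nil)
 *   // on EBUSY the vnode was already destroyed by insmntque_stddtr()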
} MNT_ILOCK(mp); VI_LOCK(vp); if ((mp^.mnt_kern_flag and MNTK_NOINSMNTQ)<>0) and (((mp^.mnt_kern_flag and MNTK_UNMOUNTF)<>0) or (mp^.mnt_nvnodelistsize=0)) then begin locked:=VOP_ISLOCKED(vp); if (locked=0) or ((locked=LK_EXCLUSIVE) and ((vp^.v_vflag and VV_FORCEINSMQ)=0)) then begin VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr<>nil) then begin dtr(vp, dtr_arg); end; Exit(EBUSY); end; end; vp^.v_mount:=mp; MNT_REF(mp); TAILQ_INSERT_TAIL(@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes); Assert(mp^.mnt_nvnodelistsize >= 0,'neg mount point vnode list size'); Inc(mp^.mnt_nvnodelistsize); Assert((vp^.v_iflag and VI_ACTIVE)=0,'Activating already active vnode'); vp^.v_iflag:=vp^.v_iflag or VI_ACTIVE; mtx_lock(vnode_free_list_mtx); TAILQ_INSERT_HEAD(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist); Inc(mp^.mnt_activevnodelistsize); mtx_unlock(vnode_free_list_mtx); VI_UNLOCK(vp); MNT_IUNLOCK(mp); Exit(0); end; function insmntque(vp:p_vnode;mp:p_mount):Integer; begin Exit(insmntque1(vp, mp, @insmntque_stddtr, nil)); end; { { * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. } int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) begin int error; BO_LOCK(bo); if (flags and V_SAVE) begin error:=bufobj_wwait(bo, slpflag, slptimeo); if (error) begin BO_UNLOCK(bo); Exit(error); end; if (bo^.bo_dirty.bv_cnt > 0) begin BO_UNLOCK(bo); if ((error:=BO_SYNC(bo, MNT_WAIT))<>0) Exit(error); { * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS } BO_LOCK(bo); if (bo^.bo_numoutput > 0 or bo^.bo_dirty.bv_cnt > 0) panic'vinvalbuf: dirty bufs'; end; end; { * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. } do begin error:=flushbuflist(@bo^.bo_clean, flags, bo, slpflag, slptimeo); if (error=0 and !(flags and V_CLEANONLY)) error:=flushbuflist(@bo^.bo_dirty, flags, bo, slpflag, slptimeo); if (error<>0 and error<>EAGAIN) begin BO_UNLOCK(bo); Exit(error); end; end; while (error<>0); { * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. } do begin bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo^.bo_object<>nil) begin VM_OBJECT_LOCK(bo^.bo_object); vm_object_pip_wait(bo^.bo_object, "bovlbx'; VM_OBJECT_UNLOCK(bo^.bo_object); end; BO_LOCK(bo); end; while (bo^.bo_numoutput > 0); BO_UNLOCK(bo); { * Destroy the copy in the VM cache, too. } if (bo^.bo_object<>nil and (flags and (V_ALT or V_NORMAL or V_CLEANONLY))=0) begin VM_OBJECT_LOCK(bo^.bo_object); vm_object_page_remove(bo^.bo_object, 0, 0, (flags and V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_UNLOCK(bo^.bo_object); end; #ifdef INVARIANTS BO_LOCK(bo); if ((flags and (V_ALT or V_NORMAL or V_CLEANONLY))=0 and (bo^.bo_dirty.bv_cnt > 0 or bo^.bo_clean.bv_cnt > 0)) panic'vinvalbuf: flush failed'; BO_UNLOCK(bo); #endif Exit(0); end; } { * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. } function vinvalbuf(vp:p_vnode;flags,slpflag,slptimeo:Integer):Integer; begin ASSERT_VOP_LOCKED(vp, 'vinvalbuf'); if (vp^.v_object<>nil) then if (vm_object_t(vp^.v_object)^.handle<>vp) then begin Exit(0); end; //Exit(bufobj_invalbuf(@vp^.v_bufobj, flags, slpflag, slptimeo)); Result:=0; end; { { * Flush out buffers on the specified list. 
* } static int flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) begin struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval:=0; TAILQ_FOREACH_SAFE(bp, &bufv^.bv_hd, b_bobufs, nbp) begin if (((flags and V_NORMAL) and (bp^.b_xflags and BX_ALTDATA)) or ((flags and V_ALT) and (bp^.b_xflags and BX_ALTDATA)=0)) begin continue; end; lblkno:=0; xflags:=0; if (nbp<>nil) begin lblkno:=nbp^.b_lblkno; xflags:=nbp^.b_xflags & (BX_BKGRDMARKER or BX_VNDIRTY or BX_VNCLEAN); end; retval:=EAGAIN; error:=BUF_TIMELOCK(bp, LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) begin BO_LOCK(bo); Exit(error<>ENOLCK ? error : EAGAIN); end; Assert(bp^.b_bufobj=bo, 'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo)); if (bp^.b_bufobj<>bo) begin { XXX: necessary ? } BUF_UNLOCK(bp); BO_LOCK(bo); Exit(EAGAIN); end; { * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. } if (((bp^.b_flags and (B_DELWRI or B_INVAL))=B_DELWRI) and (flags and V_SAVE)) begin BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp^.b_flags:= or B_ASYNC; bwrite(bp); BO_LOCK(bo); Exit(EAGAIN); { XXX: why not loop ? } end; BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp^.b_flags:= or (B_INVAL or B_RELBUF); bp^.b_flags:= and ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp<>nil and (nbp^.b_bufobj<>bo or nbp^.b_lblkno<>lblkno or (nbp^.b_xflags & (BX_BKGRDMARKER or BX_VNDIRTY or BX_VNCLEAN))<>xflags)) break; { nbp invalid } end; Exit(retval); end; { * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. } int vtruncbuf(vp:p_vnode, struct ucred *cred, struct thread *td, off_t length, int blksize) begin struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", {$I %LINE%}, vp, cred, blksize, (uintmax_t)length); { * Round up to the *next* lbn. } trunclbn:=(length + blksize - 1) div blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf'; restart: bo:=@vp^.v_bufobj; BO_LOCK(bo); anyfreed:=1; for (;anyfreed;) begin anyfreed:=0; TAILQ_FOREACH_SAFE(bp, &bo^.bo_clean.bv_hd, b_bobufs, nbp) begin if (bp^.b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK, BO_MTX(bo))=ENOLCK) goto restart; BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp^.b_flags:= or (B_INVAL or B_RELBUF); bp^.b_flags:= and ~B_ASYNC; brelse(bp); anyfreed:=1; BO_LOCK(bo); if (nbp<>nil and (((nbp^.b_xflags and BX_VNCLEAN)=0) or (nbp^.b_vp<>vp) or (nbp^.b_flags and B_DELWRI))) begin BO_UNLOCK(bo); goto restart; end; end; TAILQ_FOREACH_SAFE(bp, &bo^.bo_dirty.bv_hd, b_bobufs, nbp) begin if (bp^.b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK, BO_MTX(bo))=ENOLCK) goto restart; BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp^.b_flags:= or (B_INVAL or B_RELBUF); bp^.b_flags:= and ~B_ASYNC; brelse(bp); anyfreed:=1; BO_LOCK(bo); if (nbp<>nil and (((nbp^.b_xflags and BX_VNDIRTY)=0) or (nbp^.b_vp<>vp) or (nbp^.b_flags and B_DELWRI)=0)) begin BO_UNLOCK(bo); goto restart; end; end; end; if (length > 0) begin restartsync: TAILQ_FOREACH_SAFE(bp, &bo^.bo_dirty.bv_hd, b_bobufs, nbp) begin if (bp^.b_lblkno > 0) continue; { * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. 
} if (BUF_LOCK(bp, LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK, BO_MTX(bo))=ENOLCK) begin goto restart; end; Assert((bp^.b_flags and B_DELWRI), vp, 'buf(%p) on dirty queue without DELWRI", bp)); BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bawrite(bp); BO_LOCK(bo); goto restartsync; end; end; bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); Exit(0); end; { * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. } static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) begin struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root=nil) Exit(nil); lefttreemax:=righttreemin:=@dummy; for (;;) begin if (lblkno < root^.b_lblkno or (lblkno=root^.b_lblkno and (xflags and BX_BKGRDMARKER) < (root^.b_xflags and BX_BKGRDMARKER))) begin if ((y:=root^.b_left)=nil) break; if (lblkno < y^.b_lblkno) begin { Rotate right. } root^.b_left:=y^.b_right; y^.b_right:=root; root:=y; if ((y:=root^.b_left)=nil) break; end; { Link into the new root's right tree. } righttreemin^.b_left:=root; righttreemin:=root; end; else if (lblkno > root^.b_lblkno or (lblkno=root^.b_lblkno and (xflags and BX_BKGRDMARKER) > (root^.b_xflags and BX_BKGRDMARKER))) begin if ((y:=root^.b_right)=nil) break; if (lblkno > y^.b_lblkno) begin { Rotate left. } root^.b_right:=y^.b_left; y^.b_left:=root; root:=y; if ((y:=root^.b_right)=nil) break; end; { Link into the new root's left tree. } lefttreemax^.b_right:=root; lefttreemax:=root; end; else begin break; end; root:=y; end; { Assemble the new root. } lefttreemax^.b_right:=root^.b_left; righttreemin^.b_left:=root^.b_right; root^.b_left:=dummy.b_right; root^.b_right:=dummy.b_left; Exit(root); end; static void buf_vlist_remove(struct buf *bp) begin struct buf *root; struct bufv *bv; Assert(bp^.b_bufobj<>nil, 'No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp^.b_bufobj); Assert((bp^.b_xflags and (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), 'buf_vlist_remove: Buf %p is on two lists", bp)); if (bp^.b_xflags and BX_VNDIRTY) bv:=@bp^.b_bufobj^.bo_dirty; else bv:=@bp^.b_bufobj^.bo_clean; if (bp<>bv^.bv_root) begin root:=buf_splay(bp^.b_lblkno, bp^.b_xflags, bv^.bv_root); Assert(root=bp, 'splay lookup failed in remove'); end; if (bp^.b_left=nil) begin root:=bp^.b_right; end; else begin root:=buf_splay(bp^.b_lblkno, bp^.b_xflags, bp^.b_left); root^.b_right:=bp^.b_right; end; bv^.bv_root:=root; TAILQ_REMOVE(@bv^.bv_hd, bp, b_bobufs); bv^.bv_cnt--; bp^.b_xflags:= and ~(BX_VNDIRTY or BX_VNCLEAN); end; { * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! 
} static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) begin struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); Assert((bp^.b_xflags and (BX_VNDIRTY|BX_VNCLEAN))=0, 'buf_vlist_add: Buf %p has existing xflags %d", bp, bp^.b_xflags)); bp^.b_xflags:= or xflags; if (xflags and BX_VNDIRTY) bv:=@bo^.bo_dirty; else bv:=@bo^.bo_clean; root:=buf_splay(bp^.b_lblkno, bp^.b_xflags, bv^.bv_root); if (root=nil) begin bp^.b_left:=nil; bp^.b_right:=nil; TAILQ_INSERT_TAIL(@bv^.bv_hd, bp, b_bobufs); end; else if (bp^.b_lblkno < root^.b_lblkno or (bp^.b_lblkno=root^.b_lblkno and (bp^.b_xflags and BX_BKGRDMARKER) < (root^.b_xflags and BX_BKGRDMARKER))) begin bp^.b_left:=root^.b_left; bp^.b_right:=root; root^.b_left:=nil; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); end; else begin bp^.b_right:=root^.b_right; bp^.b_left:=root; root^.b_right:=nil; TAILQ_INSERT_AFTER(@bv^.bv_hd, root, bp, b_bobufs); end; bv^.bv_cnt++; bv^.bv_root:=bp; end; { * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. } struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) begin struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp:=bo^.bo_clean.bv_root)<>nil and bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER)) Exit(bp); if ((bp:=bo^.bo_dirty.bv_root)<>nil and bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER)) Exit(bp); if ((bp:=bo^.bo_clean.bv_root)<>nil) begin bo^.bo_clean.bv_root:=bp:=buf_splay(lblkno, 0, bp); if (bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER)) Exit(bp); end; if ((bp:=bo^.bo_dirty.bv_root)<>nil) begin bo^.bo_dirty.bv_root:=bp:=buf_splay(lblkno, 0, bp); if (bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER)) Exit(bp); end; Exit(nil); end; { * Associate a buffer with a vnode. } void bgetvp(vp:p_vnode, struct buf *bp) begin struct bufobj *bo; bo:=@vp^.v_bufobj; ASSERT_BO_LOCKED(bo); Assert(bp^.b_vp=nil, bp^.b_vp, 'bgetvp: not free'); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp^.b_flags); Assert((bp^.b_xflags and (BX_VNDIRTY|BX_VNCLEAN))=0, vp, 'bgetvp: bp already attached! %p", bp)); vhold(vp); if (VFS_NEEDSGIANT(vp^.v_mount) or bo^.bo_flag and BO_NEEDSGIANT) bp^.b_flags:= or B_NEEDSGIANT; bp^.b_vp:=vp; bp^.b_bufobj:=bo; { * Insert onto list for new vnode. } buf_vlist_add(bp, bo, BX_VNCLEAN); end; { * Disassociate a buffer from a vnode. } void brelvp(struct buf *bp) begin struct bufobj *bo; vp:p_vnode; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp^.b_vp, bp^.b_flags); Assert(bp^.b_vp<>nil, 'brelvp: nil'); { * Delete from old vnode list, if on one. } vp:=bp^.b_vp; { XXX } bo:=bp^.b_bufobj; BO_LOCK(bo); if (bp^.b_xflags and (BX_VNDIRTY or BX_VNCLEAN)) buf_vlist_remove(bp); else panic'brelvp: Buffer %p not on queue.", bp); if ((bo^.bo_flag and BO_ONWORKLST) and bo^.bo_dirty.bv_cnt=0) begin bo^.bo_flag:= and ~BO_ONWORKLST; mtx_lock(@sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(@sync_mtx); end; bp^.b_flags:= and ~B_NEEDSGIANT; bp^.b_vp:=nil; bp^.b_bufobj:=nil; BO_UNLOCK(bo); vdrop(vp); end; { * Add an item to the syncer work queue. 
} static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) begin int queue, slot; ASSERT_BO_LOCKED(bo); mtx_lock(@sync_mtx); if (bo^.bo_flag and BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else begin bo^.bo_flag:= or BO_ONWORKLST; syncer_worklist_len++; end; if (delay > syncer_maxdelay - 2) delay:=syncer_maxdelay - 2; slot:=(syncer_delayno + delay) and syncer_mask; queue:=VFS_NEEDSGIANT(bo^.__bo_vnode^.v_mount) ? WI_GIANTQ : WI_MPSAFEQ; LIST_INSERT_HEAD(@syncer_workitem_pending[queue][slot], bo, bo_synclist); mtx_unlock(@sync_mtx); end; static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) begin int error, len; mtx_lock(@sync_mtx); len:=syncer_worklist_len - sync_vnode_count; mtx_unlock(@sync_mtx); error:=SYSCTL_OUT(req, &len, sizeof(len)); Exit(error); end; SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT or CTLFLAG_RD, nil, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length'; static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp:=begin "syncer", sched_sync, &updateproc end; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) begin vp:p_vnode; mp:p_mount; *bo:=LIST_FIRST(slp); if (*bo=nil) Exit(0); vp:=(*bo)^.__bo_vnode; { XXX } if (VOP_ISLOCKED(vp)<>0 or VI_TRYLOCK(vp)=0) Exit(1); { * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. } vholdl(vp); mtx_unlock(@sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT)<>0) begin vdrop(vp); mtx_lock(@sync_mtx); Exit(*bo=LIST_FIRST(slp)); end; vn_lock(vp, LK_EXCLUSIVE or LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)^.bo_flag and BO_ONWORKLST)<>0) begin { * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. } vn_syncer_add_to_worklist(*bo, syncdelay); end; BO_UNLOCK(*bo); vdrop(vp); mtx_lock(@sync_mtx); Exit(0); end; { * System filesystem synchronizer daemon. } static void sched_sync(void) begin struct synclist *gnext, *next; struct synclist *gslp, *slp; struct bufobj *bo; long starttime; struct thread *td:=curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; last_work_seen:=0; syncer_final_iter:=0; first_printf:=1; syncer_state:=SYNCER_RUNNING; starttime:=time_uptime; td^.td_pflags:= or TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td^.td_proc, SHUTDOWN_PRI_LAST); mtx_lock(@sync_mtx); for (;;) begin if (syncer_state=SYNCER_FINAL_DELAY and syncer_final_iter=0) begin mtx_unlock(@sync_mtx); kproc_suspend_check(td^.td_proc); mtx_lock(@sync_mtx); end; net_worklist_len:=syncer_worklist_len - sync_vnode_count; if (syncer_state<>SYNCER_RUNNING and starttime<>time_uptime) begin if (first_printf) begin printf'\nSyncing disks, vnodes remaining...'; first_printf:=0; end; printf'%d ", net_worklist_len); end; starttime:=time_uptime; { * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. 
} do begin slp:=@syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; gslp:=@syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; syncer_delayno += 1; if (syncer_delayno=syncer_maxdelay) syncer_delayno:=0; next:=@syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; gnext:=@syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; { * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. } if (syncer_state=SYNCER_SHUTTING_DOWN and net_worklist_len=0 and last_work_seen=syncer_delayno) begin syncer_state:=SYNCER_FINAL_DELAY; syncer_final_iter:=SYNCER_SHUTDOWN_SPEEDUP; end; end; while (syncer_state<>SYNCER_RUNNING and LIST_EMPTY(slp) and LIST_EMPTY(gslp) and syncer_worklist_len > 0); { * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Exitto the SHUTTING_DOWN state if any * new work appears. } if (net_worklist_len > 0 or syncer_state=SYNCER_RUNNING) last_work_seen:=syncer_delayno; if (net_worklist_len > 0 and syncer_state=SYNCER_FINAL_DELAY) syncer_state:=SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) begin error:=sync_vnode(slp, &bo, td); if (error=1) begin LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; end; if (first_printf=0) wdog_kern_pat(WD_LASTVAL); end; if (!LIST_EMPTY(gslp)) begin mtx_unlock(@sync_mtx); mtx_lock(@Giant); mtx_lock(@sync_mtx); while (!LIST_EMPTY(gslp)) begin error:=sync_vnode(gslp, &bo, td); if (error=1) begin LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(gnext, bo, bo_synclist); continue; end; end; mtx_unlock(@Giant); end; if (syncer_state=SYNCER_FINAL_DELAY and syncer_final_iter > 0) syncer_final_iter--; { * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. } if (rushjob > 0) begin rushjob -= 1; continue; end; { * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. } if (syncer_state<>SYNCER_RUNNING or time_uptime=starttime) begin thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); end; if (syncer_state<>SYNCER_RUNNING) cv_timedwait(@sync_wakeup, &sync_mtx, hz div SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime=starttime) cv_timedwait(@sync_wakeup, &sync_mtx, hz); end; end; { * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. } int speedup_syncer(void) begin int ret:=0; mtx_lock(@sync_mtx); if (rushjob < syncdelay div 2) begin rushjob += 1; stat_rush_requests += 1; ret:=1; end; mtx_unlock(@sync_mtx); cv_broadcast(@sync_wakeup); Exit(ret); end; { * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. 
}
static void syncer_shutdown(void *arg, int howto)
begin
	if (howto and RB_NOSYNC)
		Exit;
	mtx_lock(@sync_mtx);
	syncer_state:=SYNCER_SHUTTING_DOWN;
	rushjob:=0;
	mtx_unlock(@sync_mtx);
	cv_broadcast(@sync_wakeup);
	kproc_shutdown(arg, howto);
end;

{
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
}
void reassignbuf(struct buf *bp)
begin
	vp:p_vnode;
	struct bufobj *bo;
	int delay;
#ifdef INVARIANTS
	struct bufv *bv;
#endif

	vp:=bp^.b_vp;
	bo:=bp^.b_bufobj;
	++reassignbufcalls;

	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp^.b_vp, bp^.b_flags);

	{
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	}
	if (bp^.b_flags and B_PAGING)
		panic'cannot reassign paging buffer';

	{
	 * Delete from old vnode list, if on one.
	}
	BO_LOCK(bo);
	if (bp^.b_xflags and (BX_VNDIRTY or BX_VNCLEAN))
		buf_vlist_remove(bp);
	else
		panic'reassignbuf: Buffer %p not on queue.", bp);
	{
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	}
	if (bp^.b_flags and B_DELWRI) begin
		if ((bo^.bo_flag and BO_ONWORKLST)=0) begin
			switch (vp^.v_type) begin
			case VDIR:
				delay:=dirdelay;
				break;
			case VCHR:
				delay:=metadelay;
				break;
			default:
				delay:=filedelay;
			end;
			vn_syncer_add_to_worklist(bo, delay);
		end;
		buf_vlist_add(bp, bo, BX_VNDIRTY);
	end; else begin
		buf_vlist_add(bp, bo, BX_VNCLEAN);

		if ((bo^.bo_flag and BO_ONWORKLST) and bo^.bo_dirty.bv_cnt=0) begin
			mtx_lock(@sync_mtx);
			LIST_REMOVE(bo, bo_synclist);
			syncer_worklist_len--;
			mtx_unlock(@sync_mtx);
			bo^.bo_flag:= and ~BO_ONWORKLST;
		end;
	end;
#ifdef INVARIANTS
	bv:=@bo^.bo_clean;
	bp:=TAILQ_FIRST(@bv^.bv_hd);
	Assert(bp=nil or bp^.b_bufobj=bo, 'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
	bp:=TAILQ_LAST(@bv^.bv_hd, buflists);
	Assert(bp=nil or bp^.b_bufobj=bo, 'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
	bv:=@bo^.bo_dirty;
	bp:=TAILQ_FIRST(@bv^.bv_hd);
	Assert(bp=nil or bp^.b_bufobj=bo, 'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
	bp:=TAILQ_LAST(@bv^.bv_hd, buflists);
	Assert(bp=nil or bp^.b_bufobj=bo, 'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
#endif
	BO_UNLOCK(bo);
end;
}

{
 * Increment the use and hold counts on the vnode, taking care to reference
 * the driver's usecount if this is a chardev.  The vholdl() will remove
 * the vnode from the free list if it is presently free.  Requires the
 * vnode interlock and returns with it held.
}
procedure v_incr_usecount(vp:p_vnode);
begin
 Inc(vp^.v_usecount);
 if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
 begin
  dev_lock();
  Inc(p_cdev(vp^.v_rdev)^.si_usecount);
  dev_unlock();
 end;
 vholdl(vp);
end;

{
 * Turn a holdcnt into a use+holdcnt such that only one call to
 * v_decr_usecount is needed.
}
procedure v_upgrade_usecount(vp:p_vnode);
begin
 Inc(vp^.v_usecount);
 if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
 begin
  dev_lock();
  Inc(p_cdev(vp^.v_rdev)^.si_usecount);
  dev_unlock();
 end;
end;

{
 * Decrement the vnode use and hold count along with the driver's usecount
 * if this is a chardev.  The vdropl() below releases the vnode interlock
 * as it may free the vnode.
}
procedure v_decr_usecount(vp:p_vnode);
begin
 ASSERT_VI_LOCKED(vp,{$I %LINE%});
 Assert(vp^.v_usecount>0,'v_decr_usecount: negative usecount');
 Dec(vp^.v_usecount);
 if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
 begin
  dev_lock();
  Dec(p_cdev(vp^.v_rdev)^.si_usecount);
  dev_unlock();
 end;
 vdropl(vp);
end;
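{
 * Reference pairing sketch (illustrative): the two counts are taken and
 * released with matching calls,
 *
 *   vref(vp);  ... vrele(vp);   // use count: keeps the vnode active
 *   vhold(vp); ... vdrop(vp);   // hold count: only keeps it allocated
 *
 * and a use reference implies a hold reference, since v_incr_usecount()
 * above ends with vholdl().
}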
{
 * Decrement only the use count and driver use count.  This is intended to
 * be paired with a follow on vdropl() to release the remaining hold count.
 * In this way we may vgone() a vnode with a 0 usecount without risk of
 * having it end up on a free list because the hold count is kept above 0.
}
procedure v_decr_useonly(vp:p_vnode);
begin
 ASSERT_VI_LOCKED(vp,{$I %LINE%});
 Assert(vp^.v_usecount>0,'v_decr_useonly: negative usecount');
 Dec(vp^.v_usecount);
 if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
 begin
  dev_lock();
  Dec(p_cdev(vp^.v_rdev)^.si_usecount);
  dev_unlock();
 end;
end;

{
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  VI_DOOMED is set if the vnode
 * is being destroyed.  Only callers who specify LK_RETRY will
 * see doomed vnodes.  If inactive processing was delayed in
 * vput try to do it here.
}
function vget(vp:p_vnode;flags:Integer):Integer;
var
 error:Integer;
begin
 error:=0;
 VFS_ASSERT_GIANT(vp^.v_mount);
 Assert((flags and LK_TYPE_MASK)<>0,'vget: invalid lock operation');
 if ((flags and LK_INTERLOCK)=0) then
 begin
  VI_LOCK(vp);
 end;
 vholdl(vp);
 error:=vn_lock(vp,flags or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
 if (error<>0) then
 begin
  vdrop(vp);
  Exit(error);
 end;
 if ((vp^.v_iflag and VI_DOOMED)<>0) and ((flags and LK_RETRY)=0) then
 begin
  Assert(false,'vget: vn_lock failed to return ENOENT');
 end;
 VI_LOCK(vp);
 { Upgrade our holdcnt to a usecount. }
 v_upgrade_usecount(vp);
 {
  * We don't guarantee that any particular close will
  * trigger inactive processing so just make a best effort
  * here at preventing a reference to a removed file.  If
  * we don't succeed no harm is done.
 }
 if ((vp^.v_iflag and VI_OWEINACT)<>0) then
 begin
  if (VOP_ISLOCKED(vp)=LK_EXCLUSIVE) and ((flags and LK_NOWAIT)=0) then
  begin
   vinactive(vp);
  end;
  vp^.v_iflag:=vp^.v_iflag and (not VI_OWEINACT);
 end;
 VI_UNLOCK(vp);
 Exit(0);
end;

{
 * Increase the reference count of a vnode.
}
procedure vref(vp:p_vnode); public;
begin
 VI_LOCK(vp);
 v_incr_usecount(vp);
 VI_UNLOCK(vp);
end;

{
 * Return the reference count of a vnode.
 *
 * The results of this call are only guaranteed when some mechanism other
 * than the VI lock is used to stop other processes from gaining references
 * to the vnode.  This may be the case if the caller holds the only reference.
 * This is also useful when stale data is acceptable as race conditions may
 * be accounted for by some other means.
}
function vrefcnt(vp:p_vnode):Integer;
begin
 VI_LOCK(vp);
 Result:=vp^.v_usecount;
 VI_UNLOCK(vp);
end;

const
 VPUTX_VRELE =1;
 VPUTX_VPUT  =2;
 VPUTX_VUNREF=3;

procedure vputx(vp:p_vnode;func:Integer);
var
 error:Integer;
begin
 Assert(vp<>nil,'vputx: nil vp');
 if (func=VPUTX_VUNREF) then
  ASSERT_VOP_LOCKED(vp,'vunref')
 else
 if (func=VPUTX_VPUT) then
  ASSERT_VOP_LOCKED(vp,'vput')
 else
  Assert(func=VPUTX_VRELE,'vputx: wrong func');
 VFS_ASSERT_GIANT(vp^.v_mount);
 VI_LOCK(vp);
 { Skip this v_writecount check if we're going to panic below. }
 Assert((vp^.v_writecount < vp^.v_usecount) or (vp^.v_usecount < 1),'vputx: missed vn_close');
 error:=0;
 if (vp^.v_usecount > 1) or (((vp^.v_iflag and VI_DOINGINACT)<>0) and (vp^.v_usecount=1)) then
 begin
  if (func=VPUTX_VPUT) then
  begin
   VOP_UNLOCK(vp, 0);
  end;
  v_decr_usecount(vp);
  Exit;
 end;
 if (vp^.v_usecount<>1) then
 begin
  Assert(false,'vputx: negative ref cnt');
 end;
 {
  * We want to hold the vnode until the inactive finishes to
  * prevent vgone() races.  We drop the use count here and the
  * hold count below when we're done.
 }
 v_decr_useonly(vp);
 {
  * We must call VOP_INACTIVE with the node locked.  Mark
  * as VI_DOINGINACT to avoid recursion.
 }
 vp^.v_iflag:=vp^.v_iflag or VI_OWEINACT;
 case (func) of
  VPUTX_VRELE:
   begin
    error:=vn_lock(vp, LK_EXCLUSIVE or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
    VI_LOCK(vp);
   end;
  VPUTX_VPUT:
   begin
    if (VOP_ISLOCKED(vp)<>LK_EXCLUSIVE) then
    begin
     error:=VOP_LOCK(vp, LK_UPGRADE or LK_INTERLOCK or LK_NOWAIT,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
     VI_LOCK(vp);
    end;
   end;
  VPUTX_VUNREF:
   begin
    if (VOP_ISLOCKED(vp)<>LK_EXCLUSIVE) then
    begin
     error:=VOP_LOCK(vp, LK_TRYUPGRADE or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
     VI_LOCK(vp);
    end;
   end;
 end;
 if (vp^.v_usecount > 0) then
 begin
  vp^.v_iflag:=vp^.v_iflag and (not VI_OWEINACT);
 end;
 if (error=0) then
 begin
  if ((vp^.v_iflag and VI_OWEINACT)<>0) then
  begin
   vinactive(vp);
  end;
  if (func<>VPUTX_VUNREF) then
  begin
   VOP_UNLOCK(vp, 0);
  end;
 end;
 vdropl(vp);
end;
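{
 * Locking contract sketch for the three wrappers below, as implemented by
 * vputx() above:
 *
 *   vrele(vp);   // vp unlocked on entry; locked internally for VOP_INACTIVE
 *   vput(vp);    // vp locked on entry; unlocked on return
 *   vunref(vp);  // vp exclusively locked on entry; still locked on return
}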
{
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
}
procedure vrele(vp:p_vnode);
begin
 vputx(vp, VPUTX_VRELE);
end;

{
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
}
procedure vput(vp:p_vnode);
begin
 vputx(vp, VPUTX_VPUT);
end;

{
 * Release an exclusively locked vnode.  Do not unlock the vnode lock.
}
procedure vunref(vp:p_vnode);
begin
 vputx(vp, VPUTX_VUNREF);
end;

{
 * Somebody doesn't want the vnode recycled.
}
procedure vhold(vp:p_vnode);
begin
 VI_LOCK(vp);
 vholdl(vp);
 VI_UNLOCK(vp);
end;

{
 * Increase the hold count and activate if this is the first reference.
}
procedure vholdl(vp:p_vnode);
var
 mp:p_mount;
begin
 Inc(vp^.v_holdcnt);
 if (not VSHOULDBUSY(vp)) then Exit;
 ASSERT_VI_LOCKED(vp,'vholdl');
 Assert((vp^.v_iflag and VI_FREE)<>0,'vnode not free');
 Assert(vp^.v_op<>nil,'vholdl: vnode already reclaimed.');
 {
  * Remove a vnode from the free list, mark it as in use,
  * and put it on the active list.
 }
 mtx_lock(vnode_free_list_mtx);
 TAILQ_REMOVE(@vnode_free_list,vp,@vp^.v_actfreelist);
 Dec(freevnodes);
 vp^.v_iflag:=vp^.v_iflag and (not (VI_FREE or VI_AGE));
 Assert((vp^.v_iflag and VI_ACTIVE)=0,'Activating already active vnode');
 vp^.v_iflag:=vp^.v_iflag or VI_ACTIVE;
 mp:=vp^.v_mount;
 TAILQ_INSERT_HEAD(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
 Inc(mp^.mnt_activevnodelistsize);
 mtx_unlock(vnode_free_list_mtx);
end;
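{
 * Free-list state sketch, in terms of the predicates defined near the top
 * of this unit:
 *
 *   VI_FREE set,   v_holdcnt=0  -> on the free list, recyclable (VCANRECYCLE)
 *   VI_FREE clear, v_holdcnt=0  -> should move to the free list (VSHOULDFREE)
 *   VI_FREE set,   v_holdcnt>0  -> vholdl() must reactivate it (VSHOULDBUSY)
}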
{
 * Note that there is one less who cares about this vnode.
 * vdrop() is the opposite of vhold().
}
procedure vdrop(vp:p_vnode);
begin
 VI_LOCK(vp);
 vdropl(vp);
end;

{
 * Drop the hold count of the vnode.  If this is the last reference to
 * the vnode we place it on the free list unless it has been vgone'd
 * (marked VI_DOOMED) in which case we will free it.
}
procedure vdropl(vp:p_vnode);
var
 //struct bufobj *bo;
 mp:p_mount;
 active:Integer;
begin
 ASSERT_VI_LOCKED(vp,'vdropl');
 if (vp^.v_holdcnt <= 0) then
 begin
  Assert(false,'vdrop: holdcnt');
 end;
 Dec(vp^.v_holdcnt);
 if (vp^.v_holdcnt > 0) then
 begin
  VI_UNLOCK(vp);
  Exit;
 end;
 if ((vp^.v_iflag and VI_DOOMED)=0) then
 begin
  {
   * Mark a vnode as free: remove it from its active list
   * and put it up for recycling on the freelist.
  }
  Assert(vp^.v_op<>nil,'vdropl: vnode already reclaimed.');
  Assert((vp^.v_iflag and VI_FREE)=0,'vnode already free');
  Assert(VSHOULDFREE(vp),'vdropl: freeing when we shouldnt');
  active:=vp^.v_iflag and VI_ACTIVE;
  vp^.v_iflag:=vp^.v_iflag and (not VI_ACTIVE);
  mp:=vp^.v_mount;
  mtx_lock(vnode_free_list_mtx);
  if (active<>0) then
  begin
   TAILQ_REMOVE(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
   Dec(mp^.mnt_activevnodelistsize);
  end;
  if ((vp^.v_iflag and VI_AGE)<>0) then
  begin
   TAILQ_INSERT_HEAD(@vnode_free_list,vp,@vp^.v_actfreelist);
  end else
  begin
   TAILQ_INSERT_TAIL(@vnode_free_list,vp,@vp^.v_actfreelist);
  end;
  Inc(freevnodes);
  vp^.v_iflag:=vp^.v_iflag and (not VI_AGE);
  vp^.v_iflag:=vp^.v_iflag or VI_FREE;
  mtx_unlock(vnode_free_list_mtx);
  VI_UNLOCK(vp);
  Exit;
 end;
 {
  * The vnode has been marked for destruction, so free it.
 }
 System.InterlockedDecrement64(numvnodes);
 //bo:=@vp^.v_bufobj;
 Assert((vp^.v_iflag and VI_FREE)=0,'cleaned vnode still on the free list.');
 Assert(vp^.v_data=nil,    'cleaned vnode isnt');
 Assert(vp^.v_holdcnt=0,   'Non-zero hold count');
 Assert(vp^.v_usecount=0,  'Non-zero use count');
 Assert(vp^.v_writecount=0,'Non-zero write count');
 //Assert(bo^.bo_numoutput=0, 'Clean vnode has pending I/Os');
 //Assert(bo^.bo_clean.bv_cnt=0, 'cleanbufcnt not 0');
 //Assert(bo^.bo_clean.bv_root=nil, 'cleanblkroot not nil');
 //Assert(bo^.bo_dirty.bv_cnt=0, 'dirtybufcnt not 0');
 //Assert(bo^.bo_dirty.bv_root=nil, 'dirtyblkroot not nil');
 //Assert(TAILQ_EMPTY(@vp^.v_cache_dst), 'vp has namecache dst');
 //Assert(LIST_EMPTY(@vp^.v_cache_src), 'vp has namecache src');
 //Assert(vp^.v_cache_dd=nil, 'vp has namecache for ..');
 VI_UNLOCK(vp);
 //mac_vnode_destroy(vp);
 if (vp^.v_pollinfo<>nil) then
 begin
  destroy_vpollinfo(vp^.v_pollinfo);
 end;
 { XXX Elsewhere we detect an already freed vnode via nil v_op. }
 vp^.v_op:=nil;
 rangelock_destroy(@vp^.v_rl);
 //lockdestroy(vp^.v_vnlock);
 mtx_destroy(vp^.v_vnlock^);
 mtx_destroy(vp^.v_interlock);
 mtx_destroy(vp^.v_lock);
 //mtx_destroy(BO_MTX(bo));
 FreeMem(vp);
end;

{
 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
 * OWEINACT tracks whether a vnode missed a call to inactive due to a
 * failed lock upgrade.
}
procedure vinactive(vp:p_vnode);
var
 obj:vm_object_t;
begin
 ASSERT_VOP_ELOCKED(vp,'vinactive');
 ASSERT_VI_LOCKED(vp,'vinactive');
 Assert((vp^.v_iflag and VI_DOINGINACT)=0,'vinactive: recursed on VI_DOINGINACT');
 vp^.v_iflag:=vp^.v_iflag or VI_DOINGINACT;
 vp^.v_iflag:=vp^.v_iflag and (not VI_OWEINACT);
 VI_UNLOCK(vp);
 {
  * Before moving off the active list, we must be sure that any
  * modified pages are on the vnode's dirty list since these will
  * no longer be checked once the vnode is on the inactive list.
  * Because the vnode vm object keeps a hold reference on the vnode
  * if there is at least one resident non-cached page, the vnode
  * cannot leave the active list without the page cleanup done.
 }
 obj:=vp^.v_object;
 if (obj<>nil) then
 if ((obj^.flags and OBJ_MIGHTBEDIRTY)<>0) then
 begin
  VM_OBJECT_LOCK(obj);
  vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
  VM_OBJECT_UNLOCK(obj);
 end;
 VOP_INACTIVE(vp);
 VI_LOCK(vp);
 Assert((vp^.v_iflag and VI_DOINGINACT)<>0,'vinactive: lost VI_DOINGINACT');
 vp^.v_iflag:=vp^.v_iflag and (not VI_DOINGINACT);
end;

{
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones;
 * return an error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem.  The root vnode is considered busy if its
 * v_usecount exceeds this value.  On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
}
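{
 * Usage sketch (illustrative): typical callers look like
 *
 *   error:=vflush(mp, 0, WRITECLOSE or SKIPSYSTEM);  // sync pass, rootrefs=0
 *   error:=vflush(mp, 1, FORCECLOSE);                // forced unmount path
 *
 * where rootrefs=1 accounts for the mount's own reference on its root
 * vnode, as described above.
}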
function vflush(mp:p_mount;rootrefs,flags:Integer):Integer;
label
 loop;
var
 vp,mvp,rootvp:p_vnode;
 vattr:t_vattr;
 busy,error:Integer;
begin
 rootvp:=nil;
 busy:=0;
 if (rootrefs > 0) then
 begin
  Assert((flags and (SKIPSYSTEM or WRITECLOSE))=0,'vflush: bad args');
  {
   * Get the filesystem root vnode. We can vput() it
   * immediately, since with rootrefs > 0, it won't go away.
  }
  error:=VFS_ROOT(mp, LK_EXCLUSIVE, @rootvp);
  if (error<>0) then
  begin
   Exit(error);
  end;
  vput(rootvp);
 end;
loop:
 vp:=__mnt_vnode_first_all(@mvp,mp);
 while (vp<>nil) do
 begin
  vholdl(vp);
  error:=vn_lock(vp, LK_INTERLOCK or LK_EXCLUSIVE,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
  if (error<>0) then
  begin
   vdrop(vp);
   //MNT_VNODE_FOREACH_ALL_ABORT
   MNT_ILOCK(mp);
   __mnt_vnode_markerfree_all(@mvp,mp);
   //MNT_VNODE_FOREACH_ALL_ABORT
   goto loop;
  end;
  {
   * Skip over vnodes marked VV_SYSTEM.
  }
  if (((flags and SKIPSYSTEM)<>0) and ((vp^.v_vflag and VV_SYSTEM)<>0)) then
  begin
   VOP_UNLOCK(vp, 0);
   vdrop(vp);
   //
   vp:=__mnt_vnode_next_all(@mvp,mp);
   continue;
  end;
  {
   * If WRITECLOSE is set, flush out unlinked but still open
   * files (even if open only for reading) and regular file
   * vnodes open for writing.
  }
  if ((flags and WRITECLOSE)<>0) then
  begin
   if (vp^.v_object<>nil) then
   begin
    VM_OBJECT_LOCK(vp^.v_object);
    vm_object_page_clean(vp^.v_object, 0, 0, 0);
    VM_OBJECT_UNLOCK(vp^.v_object);
   end;
   error:=VOP_FSYNC(vp, MNT_WAIT);
   if (error<>0) then
   begin
    VOP_UNLOCK(vp, 0);
    vdrop(vp);
    //MNT_VNODE_FOREACH_ALL_ABORT
    MNT_ILOCK(mp);
    __mnt_vnode_markerfree_all(@mvp,mp);
    //MNT_VNODE_FOREACH_ALL_ABORT
    Exit(error);
   end;
   error:=VOP_GETATTR(vp, @vattr);
   VI_LOCK(vp);
   if ((vp^.v_type=VNON) or ((error=0) and (vattr.va_nlink > 0))) and
      ((vp^.v_writecount=0) or (vp^.v_type<>VREG)) then
   begin
    VOP_UNLOCK(vp, 0);
    vdropl(vp);
    //
    vp:=__mnt_vnode_next_all(@mvp,mp);
    continue;
   end;
  end else
   VI_LOCK(vp);
  {
   * With v_usecount=0, all we need to do is clear out the
   * vnode data structures and we are done.
   *
   * If FORCECLOSE is set, forcibly close the vnode.
  }
  if (vp^.v_usecount=0) or ((flags and FORCECLOSE)<>0) then
  begin
   Assert((vp^.v_usecount=0) or
          ((vp^.v_type<>VCHR) and (vp^.v_type<>VBLK)),'device VNODE %p is FORCECLOSED');
   vgonel(vp);
  end else
  begin
   Inc(busy);
  end;
  VOP_UNLOCK(vp, 0);
  vdropl(vp);
  //
  vp:=__mnt_vnode_next_all(@mvp,mp);
 end;
 if (rootrefs > 0) and ((flags and FORCECLOSE)=0) then
 begin
  {
   * If just the root vnode is busy, and if its refcount
   * is equal to `rootrefs', then go ahead and kill it.
  }
  VI_LOCK(rootvp);
  Assert(busy > 0, 'vflush: not busy');
  Assert(rootvp^.v_usecount >= rootrefs,'vflush: usecount %d < rootrefs %d');
  if (busy=1) and (rootvp^.v_usecount=rootrefs) then
  begin
   VOP_LOCK(rootvp, LK_EXCLUSIVE or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
   vgone(rootvp);
   VOP_UNLOCK(rootvp, 0);
   busy:=0;
  end else
   VI_UNLOCK(rootvp);
 end;
 if (busy<>0) then
 begin
  Exit(EBUSY);
 end;
 while (rootrefs > 0) do
 begin
  vrele(rootvp);
  Dec(rootrefs);
 end;
 Exit(0);
end;

{
 * Recycle an unused vnode to the front of the free list.
} function vrecycle(vp:p_vnode):Integer; var recycled:Integer; begin ASSERT_VOP_ELOCKED(vp, 'vrecycle'); recycled:=0; VI_LOCK(vp); if (vp^.v_usecount=0)then begin recycled:=1; vgonel(vp); end; VI_UNLOCK(vp); Exit(recycled); end; { * Eliminate all activity associated with a vnode * in preparation for reuse. } procedure vgone(vp:p_vnode); begin VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); end; { * vgone, with the vp interlock held. } procedure vgonel(vp:p_vnode); var oweinact:Integer; active:Integer; mp:p_mount; begin ASSERT_VOP_ELOCKED(vp, 'vgonel'); ASSERT_VI_LOCKED(vp, 'vgonel'); Assert(vp^.v_holdcnt<>0,'vgonel: vp %p has no reference.'); { * Don't vgonel if we're already doomed. } if ((vp^.v_iflag and VI_DOOMED)<>0) then Exit; vp^.v_iflag:=vp^.v_iflag or VI_DOOMED; { * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). } active:=vp^.v_usecount; oweinact:=(vp^.v_iflag and VI_OWEINACT); VI_UNLOCK(vp); { * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. } mp:=nil; //if (not TAILQ_EMPTY(@vp^.v_bufobj.bo_dirty.bv_hd)) then // vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0)<>0) then begin vinvalbuf(vp, 0, 0, 0); end; { * If purging an active vnode, it must be closed and * deactivated before being reclaimed. } if (active<>0) then begin VOP_CLOSE(vp, FNONBLOCK); end; if (oweinact<>0) or (active<>0) then begin VI_LOCK(vp); if ((vp^.v_iflag and VI_DOINGINACT)=0) then begin vinactive(vp); end; VI_UNLOCK(vp); end; //if (vp^.v_type=VSOCK) then // vfs_unp_reclaim(vp); { * Reclaim the vnode. } if (VOP_RECLAIM(vp)<>0) then begin Assert(false,'vgone: cannot reclaim'); end; //if (mp<>nil) then // vn_finished_secondary_write(mp); //Assert(vp^.v_object=nil,'vop_reclaim left v_object vp=%p, tag=%s')); { * Clear the advisory locks and wake up waiting threads. } VOP_ADVLOCKPURGE(vp); { * Delete from old mount point vnode list. } delmntque(vp); //cache_purge(vp); { * Done with purge, reset to the standard lock and invalidate * the vnode. } VI_LOCK(vp); vp^.v_vnlock:=@vp^.v_lock; vp^.v_op :=@dead_vnodeops; vp^.v_tag :='none'; vp^.v_type :=VBAD; end; { * Calculate the total number of references to a special device. } function vcount(vp:p_vnode):Integer; begin dev_lock(); Result:=p_cdev(vp^.v_rdev)^.si_usecount; dev_unlock(); end; { * Same as above, but using the struct cdev *as argument } function count_dev(dev:Pointer):Integer; //cdev begin dev_lock(); Result:=p_cdev(dev)^.si_usecount; dev_unlock(); end; { * perform msync on all vnodes under a mount point * the mount point must be locked. 
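 *
 * A hedged usage sketch, mirroring the lazy-sync path in sync_fsync
 * below ('mp' is a mount point reference held by the caller; the
 * vn_start_write step of the full path is omitted here for brevity):
 *
 *   if (vfs_busy(mp, MBF_NOWAIT)=0) then
 *   begin
 *    vfs_msync(mp, MNT_NOWAIT);
 *    vfs_unbusy(mp);
 *   end;
 *
 * MNT_WAIT requests a synchronous page clean (OBJPC_SYNC); any other
 * flag value cleans asynchronously (OBJPC_NOSYNC).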
}
procedure vfs_msync(mp:p_mount;flags:Integer);
var
 vp,mvp:p_vnode;
 obj:vm_object_t;
begin
 vp:=__mnt_vnode_first_active(@mvp,mp);
 While (vp<>nil) do
 begin
  obj:=vp^.v_object;
  if (obj<>nil) and
     ((obj^.flags and OBJ_MIGHTBEDIRTY)<>0) and
     ((flags=MNT_WAIT) or (VOP_ISLOCKED(vp)=0)) then
  begin
   if (vget(vp, LK_EXCLUSIVE or LK_RETRY or LK_INTERLOCK)=0) then
   begin
    if ((vp^.v_vflag and VV_NOSYNC)<>0) then
    begin
     { unlinked }
     vput(vp);
     //
     vp:=__mnt_vnode_next_active(@mvp,mp);
     continue;
    end;
    obj:=vp^.v_object;
    if (obj<>nil) then
    begin
     VM_OBJECT_LOCK(obj);
     if (flags=MNT_WAIT) then
     begin
      vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
     end else
     begin
      vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
     end;
     VM_OBJECT_UNLOCK(obj);
    end;
    vput(vp);
   end;
  end else
  begin
   VI_UNLOCK(vp);
  end;
  //
  vp:=__mnt_vnode_next_active(@mvp,mp);
 end;
end;

procedure destroy_vpollinfo_free(vi:p_vpollinfo);
begin
 knlist_destroy(@vi^.vpi_selinfo.si_note);
 mtx_destroy(vi^.vpi_lock);
 FreeMem(vi);
end;

procedure destroy_vpollinfo(vi:p_vpollinfo);
begin
 knlist_clear(@vi^.vpi_selinfo.si_note, 1);
 seldrain(@vi^.vpi_selinfo);
 destroy_vpollinfo_free(vi);
end;

procedure vfs_knllock(arg:Pointer); forward;
procedure vfs_knlunlock(arg:Pointer); forward;
procedure vfs_knl_assert_locked(arg:Pointer); forward;
procedure vfs_knl_assert_unlocked(arg:Pointer); forward;

{
 * Initialize per-vnode helper structure to hold poll-related state.
}
procedure v_addpollinfo(vp:p_vnode);
var
 vi:p_vpollinfo;
begin
 if (vp^.v_pollinfo<>nil) then
 begin
  Exit;
 end;
 vi:=AllocMem(SizeOf(vpollinfo));
 mtx_init(vi^.vpi_lock,'vnode pollinfo');
 knlist_init(@vi^.vpi_selinfo.si_note, vp, @vfs_knllock, @vfs_knlunlock, @vfs_knl_assert_locked, @vfs_knl_assert_unlocked);
 VI_LOCK(vp);
 if (vp^.v_pollinfo<>nil) then
 begin
  VI_UNLOCK(vp);
  destroy_vpollinfo_free(vi);
  Exit;
 end;
 vp^.v_pollinfo:=vi;
 VI_UNLOCK(vp);
end;

{
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
}
function vn_pollrecord(vp:p_vnode;events:Integer):Integer;
begin
 v_addpollinfo(vp);
 mtx_lock(vp^.v_pollinfo^.vpi_lock);
 if ((vp^.v_pollinfo^.vpi_revents and events)<>0) then
 begin
  {
   * This leaves events we are not interested
   * in available for the other process which
   * presumably had requested them
   * (otherwise they would never have been
   * recorded).
  }
  events:=events and vp^.v_pollinfo^.vpi_revents;
  vp^.v_pollinfo^.vpi_revents:=vp^.v_pollinfo^.vpi_revents and (not events);
  mtx_unlock(vp^.v_pollinfo^.vpi_lock);
  Exit(events);
 end;
 vp^.v_pollinfo^.vpi_events:=vp^.v_pollinfo^.vpi_events or events;
 selrecord(curkthread, @vp^.v_pollinfo^.vpi_selinfo);
 mtx_unlock(vp^.v_pollinfo^.vpi_lock);
 Exit(0);
end;

{
 * Routine to create and manage a filesystem syncer vnode.
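 *
 * The commented-out reference code below scatters syncer vnodes across
 * the delay wheel with a self-tuning stride.  As a worked example
 * (assuming the defaults declared in this unit, syncer_maxdelay=32 and
 * syncdelay=30), successive allocations pick worklist slots
 * 16, 8, 24, 4, 12, 20, 28, ... halving the stride each time the
 * running position overflows syncer_maxdelay.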
}
{
#define sync_close ((int (*)(struct vop_close_args *))nilop)

function sync_fsync(ap:p_vop_fsync_args):Integer; forward;
function sync_inactive(ap:p_vop_inactive_args):Integer; forward;
function sync_reclaim(ap:p_vop_reclaim_args):Integer; forward;

const
 sync_vnodeops:vop_vector=(
  vop_bypass  :@VOP_EOPNOTSUPP;
  vop_close   :@sync_close;      { close }
  vop_fsync   :@sync_fsync;      { fsync }
  vop_inactive:@sync_inactive;   { inactive }
  vop_reclaim :@sync_reclaim;    { reclaim }
  vop_lock1   :@vop_stdlock;     { lock }
  vop_unlock  :@vop_stdunlock;   { unlock }
  vop_islocked:@vop_stdislocked; { islocked }
 );

{
 * Create a new filesystem syncer vnode for the specified mount point.
}
procedure vfs_allocate_syncvnode(mp:p_mount);
var
 vp:p_vnode;
 bo:p_bufobj;
 start,incr,next:Integer; // static in the C original
 error:Integer;
begin
 { Allocate a new vnode }
 error:=getnewvnode('syncer', mp, @sync_vnodeops, @vp);
 if (error<>0) then
  panic('vfs_allocate_syncvnode: getnewvnode() failed');
 vp^.v_type:=VNON;
 vn_lock(vp, LK_EXCLUSIVE or LK_RETRY);
 vp^.v_vflag:=vp^.v_vflag or VV_FORCEINSMQ;
 error:=insmntque(vp, mp);
 if (error<>0) then
  panic('vfs_allocate_syncvnode: insmntque() failed');
 vp^.v_vflag:=vp^.v_vflag and (not VV_FORCEINSMQ);
 VOP_UNLOCK(vp, 0);
 {
  * Place the vnode onto the syncer worklist. We attempt to
  * scatter them about on the list so that they will go off
  * at evenly distributed times even if all the filesystems
  * are mounted at once.
 }
 next:=next+incr;
 if (next=0) or (next > syncer_maxdelay) then
 begin
  start:=start div 2;
  incr:=incr div 2;
  if (start=0) then
  begin
   start:=syncer_maxdelay div 2;
   incr:=syncer_maxdelay;
  end;
  next:=start;
 end;
 bo:=@vp^.v_bufobj;
 BO_LOCK(bo);
 if (syncdelay > 0) then
  vn_syncer_add_to_worklist(bo, next mod syncdelay)
 else
  vn_syncer_add_to_worklist(bo, 0);
 { XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. }
 mtx_lock(@sync_mtx);
 Inc(sync_vnode_count);
 if (mp^.mnt_syncer=nil) then
 begin
  mp^.mnt_syncer:=vp;
  vp:=nil;
 end;
 mtx_unlock(@sync_mtx);
 BO_UNLOCK(bo);
 if (vp<>nil) then
 begin
  vn_lock(vp, LK_EXCLUSIVE or LK_RETRY);
  vgone(vp);
  vput(vp);
 end;
end;

procedure vfs_deallocate_syncvnode(mp:p_mount);
var
 vp:p_vnode;
begin
 mtx_lock(@sync_mtx);
 vp:=mp^.mnt_syncer;
 if (vp<>nil) then
  mp^.mnt_syncer:=nil;
 mtx_unlock(@sync_mtx);
 if (vp<>nil) then
  vrele(vp);
end;

{
 * Do a lazy sync of the filesystem.
}
function sync_fsync(ap:p_vop_fsync_args):Integer;
var
 syncvp:p_vnode;
 mp:p_mount;
 error,save:Integer;
 bo:p_bufobj;
begin
 syncvp:=ap^.a_vp;
 mp:=syncvp^.v_mount;
 {
  * We only need to do something if this is a lazy evaluation.
 }
 if (ap^.a_waitfor<>MNT_LAZY) then
  Exit(0);
 {
  * Move ourselves to the back of the sync list.
 }
 bo:=@syncvp^.v_bufobj;
 BO_LOCK(bo);
 vn_syncer_add_to_worklist(bo, syncdelay);
 BO_UNLOCK(bo);
 {
  * Walk the list of vnodes pushing all that are dirty and
  * not already on the sync list.
 }
 if (vfs_busy(mp, MBF_NOWAIT)<>0) then
  Exit(0);
 if (vn_start_write(nil, @mp, V_NOWAIT)<>0) then
 begin
  vfs_unbusy(mp);
  Exit(0);
 end;
 save:=curthread_pflags_set(TDP_SYNCIO);
 vfs_msync(mp, MNT_NOWAIT);
 error:=VFS_SYNC(mp, MNT_LAZY);
 curthread_pflags_restore(save);
 vn_finished_write(mp);
 vfs_unbusy(mp);
 Exit(error);
end;

{
 * The syncer vnode is no longer referenced.
}
function sync_inactive(ap:p_vop_inactive_args):Integer;
begin
 vgone(ap^.a_vp);
 Exit(0);
end;

{
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
}
function sync_reclaim(ap:p_vop_reclaim_args):Integer;
var
 vp:p_vnode;
 bo:p_bufobj;
begin
 vp:=ap^.a_vp;
 bo:=@vp^.v_bufobj;
 BO_LOCK(bo);
 mtx_lock(@sync_mtx);
 if (vp^.v_mount^.mnt_syncer=vp) then
  vp^.v_mount^.mnt_syncer:=nil;
 if ((bo^.bo_flag and BO_ONWORKLST)<>0) then
 begin
  LIST_REMOVE(bo, bo_synclist);
  Dec(syncer_worklist_len);
  Dec(sync_vnode_count);
  bo^.bo_flag:=bo^.bo_flag and (not BO_ONWORKLST);
 end;
 mtx_unlock(@sync_mtx);
 BO_UNLOCK(bo);
 Exit(0);
end;
}

{
 * Check if vnode represents a disk device
}
function vn_isdisk(vp:p_vnode;errp:PInteger):Boolean;
var
 error:Integer;
begin
 error:=0;
 dev_lock();
 if (vp^.v_type<>VCHR) then
 begin
  error:=ENOTBLK
 end else
 if (vp^.v_rdev=nil) then
 begin
  error:=ENXIO
 end else
 if (p_cdev(vp^.v_rdev)^.si_devsw=nil) then
 begin
  error:=ENXIO
 end else
 if ((p_cdevsw(p_cdev(vp^.v_rdev)^.si_devsw)^.d_flags and D_DISK)=0) then
 begin
  error:=ENOTBLK;
 end;
 dev_unlock();
 if (errp<>nil) then
 begin
  errp^:=error;
 end;
 Exit(error=0);
end;

{
 * Common filesystem object access control check routine.  Accepts a
 * vnode's type, "mode", uid and gid, the requested access mode, and an
 * optional call-by-reference privused argument allowing vaccess()
 * to indicate to the caller whether privilege was used to satisfy the
 * request (obsoleted).  Returns 0 on success, or an errno on failure.
 * Credential checks are stubbed out here: the caller is always treated
 * as the file owner.
}
function vaccess(_type:vtype;
                 file_mode:mode_t;
                 file_uid:uid_t;
                 file_gid:gid_t;
                 accmode:accmode_t;
                 privused:PInteger):Integer;
label
 privcheck;
var
 dac_granted :accmode_t;
 priv_granted:accmode_t;
begin
 Assert((accmode and (not (VEXEC or VWRITE or VREAD or VADMIN or VAPPEND)))=0,'invalid bit in accmode');
 Assert(((accmode and VAPPEND)=0) or ((accmode and VWRITE)<>0),'VAPPEND without VWRITE');
 {
  * Look for a normal, non-privileged way to access the file/directory
  * as requested.  If it exists, go with that.
 }
 if (privused<>nil) then
 begin
  privused^:=0;
 end;
 dac_granted:=0;
 { Check the owner. }
 if {(cred^.cr_uid=file_uid)} True then
 begin
  dac_granted:=dac_granted or VADMIN;
  if ((file_mode and S_IXUSR)<>0) then
   dac_granted:=dac_granted or VEXEC;
  if ((file_mode and S_IRUSR)<>0) then
   dac_granted:=dac_granted or VREAD;
  if ((file_mode and S_IWUSR)<>0) then
   dac_granted:=dac_granted or (VWRITE or VAPPEND);
  if ((accmode and dac_granted)=accmode) then
  begin
   Exit(0);
  end;
  goto privcheck;
 end;
 { Otherwise, check the groups (first match) }
 if {(groupmember(file_gid, cred))} True then
 begin
  if ((file_mode and S_IXGRP)<>0) then
   dac_granted:=dac_granted or VEXEC;
  if ((file_mode and S_IRGRP)<>0) then
   dac_granted:=dac_granted or VREAD;
  if ((file_mode and S_IWGRP)<>0) then
   dac_granted:=dac_granted or (VWRITE or VAPPEND);
  if ((accmode and dac_granted)=accmode) then
  begin
   Exit(0);
  end;
  goto privcheck;
 end;
 { Otherwise, check everyone else. }
 if ((file_mode and S_IXOTH)<>0) then
  dac_granted:=dac_granted or VEXEC;
 if ((file_mode and S_IROTH)<>0) then
  dac_granted:=dac_granted or VREAD;
 if ((file_mode and S_IWOTH)<>0) then
  dac_granted:=dac_granted or (VWRITE or VAPPEND);
 if ((accmode and dac_granted)=accmode) then
 begin
  Exit(0);
 end;
privcheck:
 {
  * Build a privilege mask to determine if the set of privileges
  * satisfies the requirements when combined with the granted mask
  * from above.  For each privilege, if the privilege is required,
  * bitwise or the request type onto the priv_granted mask.
 }
 priv_granted:=0;
 if (_type=VDIR) then
 begin
  {
   * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
   * requests, instead of PRIV_VFS_EXEC.
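   * A privileged directory lookup is considered harmless; granting
   * VEXEC on a non-executable regular file is not, which is why the
   * else-branch below additionally requires at least one execute bit.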
} if ((accmode and VEXEC)<>0) and ((dac_granted and VEXEC)=0) {and (priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)=0)} then begin priv_granted:=priv_granted or VEXEC; end; end else begin { * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. } if ((accmode and VEXEC)<>0) and ((dac_granted and VEXEC)=0) and ((file_mode and (S_IXUSR or S_IXGRP or S_IXOTH))<>0) {and (priv_check_cred(cred, PRIV_VFS_EXEC, 0)=0)} then begin priv_granted:=priv_granted or VEXEC; end; end; if ((accmode and VREAD)<>0) and ((dac_granted and VREAD)=0) {and (priv_check_cred(cred, PRIV_VFS_READ, 0)=0)} then begin priv_granted:=priv_granted or VREAD; end; if ((accmode and VWRITE)<>0) and ((dac_granted and VWRITE)=0) {and (priv_check_cred(cred, PRIV_VFS_WRITE, 0)=0)} then begin priv_granted:=priv_granted or (VWRITE or VAPPEND); end; if ((accmode and VADMIN)<>0) and ((dac_granted and VADMIN)=0) {and (priv_check_cred(cred, PRIV_VFS_ADMIN, 0)=0)} then begin priv_granted:=priv_granted or VADMIN; end; if ((accmode and (priv_granted or dac_granted))=accmode) then begin { XXX audit: privilege used } if (privused<>nil) then begin privused^:=1; end; Exit(0); end; if ((accmode and VADMIN)<>0) then Exit(EPERM) else Exit(EACCES); end; procedure vfs_badlock(msg,str:PChar;vp:p_vnode); begin Writeln(msg,' ',str); Assert(false,RawByteString(msg)+' '+RawByteString(str)); end; procedure assert_vi_locked(vp:p_vnode;str:PChar); begin if {vfs_badlock_mutex and} (not mtx_owned(VI_MTX(vp)^)) then vfs_badlock('interlock is not locked but should be', str, vp); end; procedure assert_vi_unlocked(vp:p_vnode;str:PChar); begin if {vfs_badlock_mutex and} mtx_owned(VI_MTX(vp)^) then vfs_badlock('interlock is locked but should not be', str, vp); end; procedure assert_vop_locked(vp:p_vnode;str:PChar); var locked:Integer; begin if (not IGNORE_LOCK(vp)) then begin locked:=VOP_ISLOCKED(vp); if (locked=0) or (locked=LK_EXCLOTHER) then vfs_badlock('is not locked but should be', str, vp); end; end; procedure assert_vop_unlocked(vp:p_vnode;str:PChar); begin if (not IGNORE_LOCK(vp)) and (VOP_ISLOCKED(vp)=LK_EXCLUSIVE) then vfs_badlock('is locked but should not be', str, vp); end; procedure assert_vop_elocked(vp:p_vnode;str:PChar); begin if (not IGNORE_LOCK(vp)) and (VOP_ISLOCKED(vp)<>LK_EXCLUSIVE) then vfs_badlock('is not exclusive locked but should be', str, vp); end; function VOP_WRITE_PRE(ap:p_vop_write_args;var osize,ooffset:Int64):Integer; public; var va:t_vattr; error:Integer; begin Result :=0; osize :=0; ooffset:=0; if (not VN_KNLIST_EMPTY(ap^.a_vp)) then begin error:=VOP_GETATTR(ap^.a_vp, @va); if (error<>0) then Exit(error); ooffset:=ap^.a_uio^.uio_offset; osize:=va.va_size; end; end; procedure VOP_WRITE_POST(ap:p_vop_write_args;ret:Integer;var osize,ooffset:Int64); public; var noffset:Int64; begin noffset:=ap^.a_uio^.uio_offset; if (noffset>ooffset) and (not VN_KNLIST_EMPTY(ap^.a_vp)) then begin if (noffset>osize) then begin VFS_KNOTE_LOCKED(ap^.a_vp, NOTE_WRITE or NOTE_EXTEND); end else begin VFS_KNOTE_LOCKED(ap^.a_vp, NOTE_WRITE); end; end; end; procedure vop_rename_fail(ap:p_vop_rename_args); begin if (ap^.a_tvp<>nil) then vput(ap^.a_tvp); if (ap^.a_tdvp=ap^.a_tvp) then vrele(ap^.a_tdvp) else vput(ap^.a_tdvp); vrele(ap^.a_fdvp); vrele(ap^.a_fvp); end; procedure vop_rename_pre(ap:p_vop_rename_args); public; begin if (ap^.a_tdvp<>ap^.a_fdvp) then vhold(ap^.a_fdvp); if (ap^.a_tvp<>ap^.a_fvp) then vhold(ap^.a_fvp); vhold(ap^.a_tdvp); if 
(ap^.a_tvp<>nil) then
  vhold(ap^.a_tvp);
end;

procedure vop_create_post(ap:p_vop_create_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
 end;
end;

procedure vop_link_post(ap:p_vop_link_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_vp , NOTE_LINK);
  VFS_KNOTE_LOCKED(ap^.a_tdvp, NOTE_WRITE);
 end;
end;

procedure vop_mkdir_post(ap:p_vop_mkdir_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE or NOTE_LINK);
 end;
end;

procedure vop_mknod_post(ap:p_vop_mknod_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
 end;
end;

procedure vop_remove_post(ap:p_vop_remove_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
  VFS_KNOTE_LOCKED(ap^.a_vp , NOTE_DELETE);
 end;
end;

procedure vop_rename_post(ap:p_vop_rename_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_UNLOCKED(ap^.a_fdvp, NOTE_WRITE);
  VFS_KNOTE_UNLOCKED(ap^.a_tdvp, NOTE_WRITE);
  VFS_KNOTE_UNLOCKED(ap^.a_fvp , NOTE_RENAME);
  if (ap^.a_tvp<>nil) then
  begin
   VFS_KNOTE_UNLOCKED(ap^.a_tvp, NOTE_DELETE);
  end;
 end;
 if (ap^.a_tdvp<>ap^.a_fdvp) then
 begin
  vdrop(ap^.a_fdvp);
 end;
 if (ap^.a_tvp<>ap^.a_fvp) then
 begin
  vdrop(ap^.a_fvp);
 end;
 vdrop(ap^.a_tdvp);
 if (ap^.a_tvp<>nil) then
 begin
  vdrop(ap^.a_tvp);
 end;
end;

procedure vop_rmdir_post(ap:p_vop_rmdir_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE or NOTE_LINK);
  VFS_KNOTE_LOCKED(ap^.a_vp , NOTE_DELETE);
 end;
end;

procedure vop_setattr_post(ap:p_vop_setattr_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_vp, NOTE_ATTRIB);
 end;
end;

procedure vop_symlink_post(ap:p_vop_symlink_args;rc:Integer); public;
begin
 if (rc=0) then
 begin
  VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
 end;
end;

procedure vfs_event_init();
begin
 knlist_init_mtx(@fs_knlist, nil);
end;

procedure vfs_event_signal(fsid:p_fsid;event:DWORD;data:ptrint);
begin
 KNOTE_UNLOCKED(@fs_knlist, event);
end;

function filt_fsattach(kn:p_knote):Integer;
begin
 kn^.kn_flags:=kn^.kn_flags or EV_CLEAR;
 knlist_add(@fs_knlist, kn, 0);
 Exit(0);
end;

procedure filt_fsdetach(kn:p_knote);
begin
 knlist_remove(@fs_knlist, kn, 0);
end;

function filt_fsevent(kn:p_knote;hint:QWORD):Integer;
begin
 kn^.kn_fflags:=kn^.kn_fflags or hint;
 Exit(ord(kn^.kn_fflags<>0));
end;

{
function sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS):Integer;
var
 vc:vfsidctl;
 error:Integer;
 mp:p_mount;
begin
 error:=SYSCTL_IN(req, @vc, sizeof(vc));
 if (error<>0) then
  Exit(error);
 if (vc.vc_vers<>VFS_CTL_VERS1) then
  Exit(EINVAL);
 mp:=vfs_getvfs(@vc.vc_fsid);
 if (mp=nil) then
  Exit(ENOENT);
 { ensure that a specific sysctl goes to the right filesystem. }
 if (strcmp(vc.vc_fstypename, '*')<>0) and
    (strcmp(vc.vc_fstypename, mp^.mnt_vfc^.vfc_name)<>0) then
 begin
  vfs_rel(mp);
  Exit(EINVAL);
 end;
 VCTLTOREQ(@vc, req);
 error:=VFS_SYSCTL(mp, vc.vc_op, req);
 vfs_rel(mp);
 Exit(error);
end;
SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE or CTLFLAG_WR, nil, 0, sysctl_vfs_ctl, '', 'Sysctl by fsid');

{
 * Function to initialize a va_filerev field sensibly.
 * XXX: Wouldn't a random number make a lot more sense ??
}
function init_va_filerev():u_quad_t;
var
 bt:bintime;
begin
 getbinuptime(@bt);
 Exit((u_quad_t(bt.sec) shl 32) or (bt.frac shr 32));
end;
}

procedure filt_vfsdetach(kn:p_knote); forward;
function filt_vfsread (kn:p_knote;hint:QWORD):Integer; forward;
function filt_vfswrite (kn:p_knote;hint:QWORD):Integer; forward;
function filt_vfsvnode (kn:p_knote;hint:QWORD):Integer; forward;

const
 vfsread_filtops:t_filterops=(
  f_isfd :1;
  f_detach:@filt_vfsdetach;
  f_event :@filt_vfsread;
 );
 vfswrite_filtops:t_filterops=(
  f_isfd :1;
  f_detach:@filt_vfsdetach;
  f_event :@filt_vfswrite;
 );
 vfsvnode_filtops:t_filterops=(
  f_isfd :1;
  f_detach:@filt_vfsdetach;
  f_event :@filt_vfsvnode;
 );

procedure vfs_knllock(arg:Pointer);
begin
 vn_lock(p_vnode(arg), LK_EXCLUSIVE or LK_RETRY,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
end;

procedure vfs_knlunlock(arg:Pointer);
begin
 VOP_UNLOCK(p_vnode(arg), 0);
end;

procedure vfs_knl_assert_locked(arg:Pointer);
begin
 //
end;

procedure vfs_knl_assert_unlocked(arg:Pointer);
begin
 //
end;

function vfs_kqfilter(ap:p_vop_kqfilter_args):Integer;
var
 vp:p_vnode;
 kn:p_knote;
 knl:p_knlist;
begin
 vp:=ap^.a_vp;
 kn:=ap^.a_kn;
 case (kn^.kn_filter) of
  EVFILT_READ :kn^.kn_fop:=@vfsread_filtops;
  EVFILT_WRITE:kn^.kn_fop:=@vfswrite_filtops;
  EVFILT_VNODE:kn^.kn_fop:=@vfsvnode_filtops;
  else
   Exit(EINVAL);
 end;
 kn^.kn_hook:=vp;
 v_addpollinfo(vp);
 if (vp^.v_pollinfo=nil) then
 begin
  Exit(ENOMEM);
 end;
 knl:=@vp^.v_pollinfo^.vpi_selinfo.si_note;
 vhold(vp);
 knlist_add(knl, kn, 0);
 Exit(0);
end;

{
 * Detach knote from vnode
}
procedure filt_vfsdetach(kn:p_knote);
var
 vp:p_vnode;
begin
 vp:=kn^.kn_hook;
 Assert(vp^.v_pollinfo<>nil, 'Missing v_pollinfo');
 knlist_remove(@vp^.v_pollinfo^.vpi_selinfo.si_note, kn, 0);
 vdrop(vp);
end;

{ARGSUSED}
function filt_vfsread(kn:p_knote;hint:QWORD):Integer;
var
 vp:p_vnode;
 va:t_vattr;
 res:Integer;
begin
 vp:=kn^.kn_hook;
 {
  * filesystem is gone, so set the EOF flag and schedule
  * the knote for deletion.
 }
 if (hint=NOTE_REVOKE) then
 begin
  VI_LOCK(vp);
  kn^.kn_flags:=kn^.kn_flags or (EV_EOF or EV_ONESHOT);
  VI_UNLOCK(vp);
  Exit(1);
 end;
 if (VOP_GETATTR(vp, @va)<>0) then
 begin
  Exit(0);
 end;
 VI_LOCK(vp);
 kn^.kn_data:=va.va_size - p_file(kn^.kn_fp)^.f_offset;
 res:=ord(kn^.kn_data<>0);
 VI_UNLOCK(vp);
 Exit(res);
end;

{ARGSUSED}
function filt_vfswrite(kn:p_knote;hint:QWORD):Integer;
var
 vp:p_vnode;
begin
 vp:=kn^.kn_hook;
 VI_LOCK(vp);
 {
  * filesystem is gone, so set the EOF flag and schedule
  * the knote for deletion.
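  * Otherwise a vnode is always considered writable: kn_data is left
  * at zero and the filter unconditionally reports the event as ready.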
}
 if (hint=NOTE_REVOKE) then
 begin
  kn^.kn_flags:=kn^.kn_flags or (EV_EOF or EV_ONESHOT);
 end;
 kn^.kn_data:=0;
 VI_UNLOCK(vp);
 Exit(1);
end;

function filt_vfsvnode(kn:p_knote;hint:QWORD):Integer;
var
 vp:p_vnode;
 res:Integer;
begin
 vp:=kn^.kn_hook;
 VI_LOCK(vp);
 if ((kn^.kn_sfflags and hint)<>0) then
 begin
  kn^.kn_fflags:=kn^.kn_fflags or hint;
 end;
 if (hint=NOTE_REVOKE) then
 begin
  kn^.kn_flags:=kn^.kn_flags or EV_EOF;
  VI_UNLOCK(vp);
  Exit(1);
 end;
 res:=ord(kn^.kn_fflags<>0);
 VI_UNLOCK(vp);
 Exit(res);
end;

function vfs_read_dirent(ap:p_vop_readdir_args;dp:p_dirent;off:QWORD):Integer;
var
 error:Integer;
begin
 if (dp^.d_reclen > ap^.a_uio^.uio_resid) then
 begin
  Exit(ENAMETOOLONG);
 end;
 error:=uiomove(dp, dp^.d_reclen, ap^.a_uio);
 if (error<>0) then
 begin
  if (ap^.a_ncookies<>nil) then
  begin
   if (ap^.a_cookies<>nil) then
   begin
    FreeMem(ap^.a_cookies^);
    ap^.a_cookies^:=nil;
   end;
   ap^.a_ncookies^:=0;
  end;
  Exit(error);
 end;
 if (ap^.a_ncookies=nil) then
  Exit(0);
 Assert(ap^.a_cookies<>nil,'null ap^.a_cookies value with non-null ap^.a_ncookies!');
 ap^.a_cookies^:=ReAllocMem(ap^.a_cookies^,(ap^.a_ncookies^ + 1) * sizeof(QWORD));
 ap^.a_cookies^[ap^.a_ncookies^]:=off;
 Inc(ap^.a_ncookies^);
 Exit(0);
end;

{
 * Mark for update the access time of the file if the filesystem
 * supports VOP_MARKATIME.  This functionality is used by execve and
 * mmap, so we want to avoid the I/O implied by directly setting
 * va_atime for the sake of efficiency.
}
procedure vfs_mark_atime(vp:p_vnode);
var
 mp:p_mount;
begin
 mp:=vp^.v_mount;
 VFS_ASSERT_GIANT(mp);
 ASSERT_VOP_LOCKED(vp,'vfs_mark_atime');
 if (mp<>nil) then
 if ((mp^.mnt_flag and (MNT_NOATIME or MNT_RDONLY))=0) then
 begin
  VOP_MARKATIME(vp);
 end;
end;

{
 * The purpose of this routine is to remove granularity from accmode_t,
 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 * VADMIN and VAPPEND.
 *
 * If it returns 0, the caller is supposed to continue with the usual
 * access checks using 'accmode' as modified by this routine.  If it
 * returns a nonzero value, the caller is supposed to return that value
 * as errno.
 *
 * Note that after this routine runs, accmode may be zero.
}
function vfs_unixify_accmode(accmode:p_accmode_t):Integer;
begin
 {
  * There is no way to specify explicit "deny" rule using
  * file mode or POSIX.1e ACLs.
 }
 if ((accmode^ and VEXPLICIT_DENY)<>0) then
 begin
  accmode^:=0;
  Exit(0);
 end;
 {
  * None of these can be translated into usual access bits.
  * Also, the common case for NFSv4 ACLs is to not contain
  * either of these bits.  Caller should check for VWRITE
  * on the containing directory instead.
 }
 if ((accmode^ and (VDELETE_CHILD or VDELETE))<>0) then
 begin
  Exit(EPERM);
 end;
 if ((accmode^ and VADMIN_PERMS)<>0) then
 begin
  accmode^:=accmode^ and (not VADMIN_PERMS);
  accmode^:=accmode^ or VADMIN;
 end;
 {
  * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
  * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
 }
 accmode^:=accmode^ and (not (VSTAT_PERMS or VSYNCHRONIZE));
 Exit(0);
end;

{
 * These are helper functions for filesystems to traverse all
 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 *
 * This interface replaces MNT_VNODE_FOREACH.
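 *
 * A hedged usage sketch (the same pattern vflush() uses above; the
 * iterator returns each vnode with its interlock held, so the body
 * must drop it before advancing):
 *
 *   vp:=__mnt_vnode_first_all(@mvp,mp);
 *   while (vp<>nil) do
 *   begin
 *    ... inspect vp (VI_LOCK held) ...
 *    VI_UNLOCK(vp);
 *    vp:=__mnt_vnode_next_all(@mvp,mp);
 *   end;
 *
 * To abort early, take MNT_ILOCK(mp) and call
 * __mnt_vnode_markerfree_all(@mvp,mp), which releases the mount
 * interlock and frees the marker.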
} function __mnt_vnode_next_all(mvp:pp_vnode;mp:p_mount):p_vnode; var vp:p_vnode; begin //if (should_yield()) kern_yield(PRI_UNCHANGED); MNT_ILOCK(mp); Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch'); vp:=TAILQ_NEXT(mvp^,@mvp^^.v_nmntvnodes); while (vp<>nil) do begin if not ((vp^.v_type=VMARKER) or ((vp^.v_iflag and VI_DOOMED)<>0)) then Break; vp:=TAILQ_NEXT(vp,@vp^.v_nmntvnodes); end; { Check if we are done } if (vp=nil) then begin __mnt_vnode_markerfree_all(mvp, mp); { MNT_IUNLOCK(mp); -- done in above function } Exit(nil); end; TAILQ_REMOVE(@mp^.mnt_nvnodelist,mvp^,@mvp^^.v_nmntvnodes); TAILQ_INSERT_AFTER(@mp^.mnt_nvnodelist,vp,mvp^,@mvp^^.v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); Exit(vp); end; function __mnt_vnode_first_all(mvp:pp_vnode;mp:p_mount):p_vnode; var vp:p_vnode; begin mvp^:=AllocMem(sizeof(t_vnode)); MNT_ILOCK(mp); MNT_REF(mp); mvp^^.v_type:=VMARKER; vp:=TAILQ_FIRST(@mp^.mnt_nvnodelist); while (vp<>nil) and ((vp^.v_type=VMARKER) or ((vp^.v_iflag and VI_DOOMED)<>0)) do begin vp:=TAILQ_NEXT(vp,@vp^.v_nmntvnodes); end; { Check if we are done } if (vp=nil) then begin MNT_REL(mp); MNT_IUNLOCK(mp); FreeMem(mvp^); mvp^:=nil; Exit(nil); end; mvp^^.v_mount:=mp; TAILQ_INSERT_AFTER(@mp^.mnt_nvnodelist,vp,mvp^,@mvp^^.v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); Exit(vp); end; procedure __mnt_vnode_markerfree_all(mvp:pp_vnode;mp:p_mount); begin if (mvp^=nil) then begin MNT_IUNLOCK(mp); Exit; end; mtx_assert(MNT_MTX(mp)^); Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch'); TAILQ_REMOVE(@mp^.mnt_nvnodelist,mvp^,@mvp^^.v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); FreeMem(mvp^); mvp^:=nil; end; { * These are helper functions for filesystems to traverse their * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h } procedure mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount); begin Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch'); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); FreeMem(mvp^); mvp^:=nil; end; function mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode; label restart; var vp,nvp:p_vnode; begin mtx_assert(vnode_free_list_mtx); Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch'); restart: vp:=TAILQ_NEXT(mvp^,@mvp^^.v_actfreelist); TAILQ_REMOVE(@mp^.mnt_activevnodelist,mvp^,@mvp^^.v_actfreelist); while (vp<>nil) do begin if (vp^.v_type=VMARKER) then begin vp:=TAILQ_NEXT(vp,@vp^.v_actfreelist); continue; end; if (not VI_TRYLOCK(vp)) then begin continue; end; Assert(vp^.v_type<>VMARKER, 'locked marker %p'); Assert((vp^.v_mount=mp) or (vp^.v_mount=nil),'alien vnode on the active list %p %p'); if (vp^.v_mount=mp) and ((vp^.v_iflag and VI_DOOMED)=0) then begin break; end; nvp:=TAILQ_NEXT(vp,@vp^.v_actfreelist); VI_UNLOCK(vp); vp:=nvp; end; { Check if we are done } if (vp=nil) then begin mtx_unlock(vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); Exit(nil); end; TAILQ_INSERT_AFTER(@mp^.mnt_activevnodelist,vp,mvp^,@mvp^^.v_actfreelist); mtx_unlock(vnode_free_list_mtx); ASSERT_VI_LOCKED(vp, 'active iter'); Assert((vp^.v_iflag and VI_ACTIVE)<>0, 'Non-active vp %p'); Exit(vp); end; function __mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode; begin //if (should_yield()) kern_yield(PRI_UNCHANGED); mtx_lock(vnode_free_list_mtx); Exit(mnt_vnode_next_active(mvp, mp)); end; function __mnt_vnode_first_active(mvp:pp_vnode;mp:p_mount):p_vnode; var vp:p_vnode; begin mvp^:=AllocMem(sizeof(t_vnode)); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mvp^^.v_type:=VMARKER; mvp^^.v_mount:=mp; mtx_lock(vnode_free_list_mtx); 
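{
 The marker was created above with a mount reference held; it is now
 inserted just before the first active vnode and the scan is delegated
 to mnt_vnode_next_active(), which returns the first vnode whose
 interlock it can acquire, or nil once the active list is exhausted.
}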
vp:=TAILQ_FIRST(@mp^.mnt_activevnodelist); if (vp=nil) then begin mtx_unlock(vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); Exit(nil); end; TAILQ_INSERT_BEFORE(vp, mvp^,@mvp^^.v_actfreelist); Exit(mnt_vnode_next_active(mvp, mp)); end; procedure __mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount); begin if (mvp^=nil) then Exit; mtx_lock(vnode_free_list_mtx); TAILQ_REMOVE(@mp^.mnt_activevnodelist,mvp^,@mvp^^.v_actfreelist); mtx_unlock(vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); end; end.