{ FPPS4/sys/vfs/vfs_subr.pas }
unit vfs_subr;
{$mode ObjFPC}{$H+}
{$CALLING SysV_ABI_CDecl}
interface
uses
mqueue,
vmount,
kern_param,
sys_event,
vfile,
vstat,
vnode,
vnode_if,
vdirent,
vfcntl,
kern_mtx,
kern_condvar,
kern_synch,
time,
kern_time,
kern_thr;
type
t_insmntque1_dtr=procedure(v:p_vnode;p:Pointer);
function makedev(x,y:Integer):Integer;
function vfs_busy(mp:p_mount;flags:Integer):Integer;
procedure vfs_unbusy(mp:p_mount);
function vfs_getvfs(fsid:p_fsid):p_mount;
procedure vfs_getnewfsid(mp:p_mount);
procedure vfs_timestamp(tsp:p_timespec);
procedure vattr_null(vap:p_vattr);
procedure v_incr_usecount(vp:p_vnode);
procedure vholdl(vp:p_vnode);
procedure vdropl(vp:p_vnode);
procedure vgonel(vp:p_vnode);
procedure vhold(vp:p_vnode);
procedure vdrop(vp:p_vnode);
function vrecycle(vp:p_vnode):Integer;
procedure vgone(vp:p_vnode);
function vget(vp:p_vnode;flags:Integer):Integer;
procedure vref(vp:p_vnode);
function vrefcnt(vp:p_vnode):Integer;
procedure vrele(vp:p_vnode);
procedure vput(vp:p_vnode);
procedure vunref(vp:p_vnode);
procedure vinactive(vp:p_vnode);
function vflush(mp:p_mount;rootrefs,flags:Integer):Integer;
procedure assert_vi_locked (vp:p_vnode;str:PChar);
procedure assert_vi_unlocked (vp:p_vnode;str:PChar);
procedure assert_vop_locked (vp:p_vnode;str:PChar);
procedure assert_vop_unlocked(vp:p_vnode;str:PChar);
procedure assert_vop_elocked (vp:p_vnode;str:PChar);
function VOP_WRITE_PRE(ap:p_vop_write_args;var osize,ooffset:Int64):Integer;
procedure VOP_WRITE_POST(ap:p_vop_write_args;ret:Integer;var osize,ooffset:Int64);
procedure vop_rename_fail(ap:p_vop_rename_args);
procedure vop_rename_pre(ap:p_vop_rename_args);
procedure vop_create_post(ap:p_vop_create_args;rc:Integer);
procedure vop_link_post(ap:p_vop_link_args;rc:Integer);
procedure vop_mkdir_post(ap:p_vop_mkdir_args;rc:Integer);
procedure vop_mknod_post(ap:p_vop_mknod_args;rc:Integer);
procedure vop_remove_post(ap:p_vop_remove_args;rc:Integer);
procedure vop_rename_post(ap:p_vop_rename_args;rc:Integer);
procedure vop_rmdir_post(ap:p_vop_rmdir_args;rc:Integer);
procedure vop_setattr_post(ap:p_vop_setattr_args;rc:Integer);
procedure vop_symlink_post(ap:p_vop_symlink_args;rc:Integer);
procedure vfs_event_init(); //SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
procedure vfs_event_signal(fsid:p_fsid;event:DWORD;data:ptrint);
function vfs_kqfilter(ap:p_vop_kqfilter_args):Integer;
function vfs_read_dirent(ap:p_vop_readdir_args;dp:p_dirent;off:QWORD):Integer;
procedure vfs_mark_atime(vp:p_vnode);
function vfs_unixify_accmode(accmode:p_accmode_t):Integer;
function vcount(vp:p_vnode):Integer;
function count_dev(dev:Pointer):Integer; //cdev
procedure vfs_msync(mp:p_mount;flags:Integer);
procedure destroy_vpollinfo_free(vi:p_vpollinfo);
procedure destroy_vpollinfo(vi:p_vpollinfo);
procedure v_addpollinfo(vp:p_vnode);
function vn_pollrecord(vp:p_vnode;events:Integer):Integer;
function vn_isdisk(vp:p_vnode;errp:PInteger):Boolean;
function vaccess(_type:vtype;
file_mode:mode_t;
file_uid:uid_t;
file_gid:gid_t;
accmode:accmode_t;
privused:PInteger):Integer;
function getnewvnode(tag:PChar;mp:p_mount;vops:p_vop_vector;vpp:pp_vnode):Integer;
procedure insmntque_stddtr(vp:p_vnode;dtr_arg:Pointer);
function insmntque1(vp:p_vnode;mp:p_mount;dtr:t_insmntque1_dtr;dtr_arg:Pointer):Integer;
function insmntque(vp:p_vnode;mp:p_mount):Integer;
function vinvalbuf(vp:p_vnode;flags,slpflag,slptimeo:Integer):Integer;
function __mnt_vnode_next_all(mvp:pp_vnode;mp:p_mount):p_vnode;
function __mnt_vnode_first_all(mvp:pp_vnode;mp:p_mount):p_vnode;
procedure __mnt_vnode_markerfree_all(mvp:pp_vnode;mp:p_mount);
procedure mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount);
function mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode;
function __mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode;
function __mnt_vnode_first_active(mvp:pp_vnode;mp:p_mount):p_vnode;
procedure __mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount);
procedure vntblinit; //SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
procedure vnlru_proc(); //SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, @vnlru_kp);
function filt_fsattach(kn:p_knote):Integer;
procedure filt_fsdetach(kn:p_knote);
function filt_fsevent(kn:p_knote;hint:QWORD):Integer;
const
fs_filtops:t_filterops=(
f_isfd :0;
f_attach:@filt_fsattach;
f_detach:@filt_fsdetach;
f_event :@filt_fsevent;
);
{
* List of vnodes that are ready for recycling.
}
var
numvnodes:QWORD=0;
vnode_free_list:TAILQ_HEAD=(tqh_first:nil;tqh_last:@vnode_free_list.tqh_first); //vnode
mntid_mtx:mtx;
vnode_free_list_mtx:mtx;
syncer_delayno:Integer;
syncer_mask:QWORD;
//LIST_HEAD(synclist, bufobj);
//static struct synclist *syncer_workitem_pending[2];
sync_mtx:mtx;
sync_wakeup:t_cv;
syncer_maxdelay:Integer=32;
syncdelay:Integer=30;
filedelay:Integer=30;
dirdelay:Integer=29;
metadelay:Integer=28;
rushjob:Integer;
stat_rush_requests:Integer;
fs_knlist:t_knlist;
const
SYNCER_SHUTDOWN_SPEEDUP=4;
var
sync_vnode_count:Integer;
syncer_worklist_len:Integer;
type
syncer_state=(SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY);
var
desiredvnodes :Integer=0;
wantfreevnodes:Integer=0;
freevnodes :Integer=0;
vnlru_nowhere :Integer=0;
implementation
uses
errno,
vfs_vnops,
subr_uio,
sys_vm_object,
vsys_generic,
kern_rangelock,
rtprio,
sys_conf;
//
var
dead_vnodeops:vop_vector; external;
//
{
* Macros to control when a vnode is freed and recycled. All require
* the vnode interlock.
}
function VCANRECYCLE(vp:p_vnode):Boolean; inline;
begin
Result:=((vp^.v_iflag and VI_FREE)<>0) and (vp^.v_holdcnt=0);
end;
function VSHOULDFREE(vp:p_vnode):Boolean; inline;
begin
Result:=((vp^.v_iflag and VI_FREE)=0) and (vp^.v_holdcnt=0);
end;
function VSHOULDBUSY(vp:p_vnode):Boolean; inline;
begin
Result:=((vp^.v_iflag and VI_FREE)<>0) and (vp^.v_holdcnt<>0);
end;
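{
* Reading aid (not upstream text) for the three predicates above:
* VCANRECYCLE - on the free list (VI_FREE set) with no holds: safe to recycle now.
* VSHOULDFREE - not on the free list and no holds: it should be moved to the free list.
* VSHOULDBUSY - on the free list but held: it should be taken back off the free list.
}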
var
{ Shift count for (uintptr_t)vp to initialize vp^.v_hash. }
vnsz2log:Integer;
{
* Initialize the vnode management data structures.
*
* Reevaluate the following cap on the number of vnodes after the physical
* memory size exceeds 512GB. In the limit, as the physical memory size
* grows, the ratio of physical pages to vnodes approaches sixteen to one.
}
const
MAXVNODES_MAX=(512 * (1024 * 1024 * 1024 div (16*1024) div 16));
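{ Worked out: 1024*1024*1024 div (16*1024) div 16 = 4096, so MAXVNODES_MAX = 512*4096 = 2097152 vnodes. }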
v_page_count=524288;
procedure vntblinit;
var
i:DWORD;
begin
desiredvnodes:=10000;
if (desiredvnodes > MAXVNODES_MAX) then
begin
desiredvnodes:=MAXVNODES_MAX;
end;
wantfreevnodes:=desiredvnodes div 4;
mtx_init(mntid_mtx,'mntid');
mtx_init(vnode_free_list_mtx,'vnode_free_list');
{
* Initialize the filesystem syncer.
}
//syncer_workitem_pending[WI_MPSAFEQ]:=hashinit(syncer_maxdelay, M_VNODE,&syncer_mask);
//syncer_workitem_pending[WI_GIANTQ]:=hashinit(syncer_maxdelay, M_VNODE,&syncer_mask);
syncer_maxdelay:=syncer_mask + 1;
mtx_init(sync_mtx,'Syncer mtx');
cv_init(@sync_wakeup,'syncer');
i:=1;
While (i<=sizeof(t_vnode)) do
begin
Inc(vnsz2log);
i:=i shl 1;
end;
Dec(vnsz2log);
end;
function vfs_busy(mp:p_mount;flags:Integer):Integer;
begin
MNT_ILOCK(mp);
MNT_REF(mp);
while ((mp^.mnt_kern_flag and MNTK_UNMOUNT)<>0) do
begin
if ((flags and MBF_NOWAIT)<>0) or ((mp^.mnt_kern_flag and MNTK_REFEXPIRE)<>0) then
begin
MNT_REL(mp);
MNT_IUNLOCK(mp);
Exit(ENOENT);
end;
if ((flags and MBF_MNTLSTLOCK)<>0) then
begin
mtx_unlock(mountlist_mtx);
end;
mp^.mnt_kern_flag:=mp^.mnt_kern_flag or MNTK_MWAIT;
msleep(mp, MNT_MTX(mp), PVFS or PDROP,'vfs_busy', 0);
if ((flags and MBF_MNTLSTLOCK)<>0) then
begin
mtx_lock(mountlist_mtx);
end;
MNT_ILOCK(mp);
end;
if ((flags and MBF_MNTLSTLOCK)<>0) then
begin
mtx_unlock(mountlist_mtx);
end;
Inc(mp^.mnt_lockref);
MNT_IUNLOCK(mp);
Exit(0);
end;
{
* Free a busy filesystem.
}
procedure vfs_unbusy(mp:p_mount);
begin
MNT_ILOCK(mp);
MNT_REL(mp);
Assert(mp^.mnt_lockref>0,'negative mnt_lockref');
Dec(mp^.mnt_lockref);
if (mp^.mnt_lockref=0) and ((mp^.mnt_kern_flag and MNTK_DRAINING)<>0) then
begin
mp^.mnt_kern_flag:=mp^.mnt_kern_flag and (not MNTK_DRAINING);
wakeup(@mp^.mnt_lockref);
end;
MNT_IUNLOCK(mp);
end;
{
* Lookup a mount point by filesystem identifier.
}
function vfs_getvfs(fsid:p_fsid):p_mount;
var
mp:p_mount;
begin
mtx_lock(mountlist_mtx);
mp:=TAILQ_FIRST(@mountlist);
while (mp<>nil) do
begin
if (mp^.mnt_stat.f_fsid.val[0]=fsid^.val[0]) and
(mp^.mnt_stat.f_fsid.val[1]=fsid^.val[1]) then
begin
MNT_REF(mp);
mtx_unlock(mountlist_mtx);
Exit(mp);
end;
mp:=TAILQ_NEXT(mp,@mp^.mnt_list);
end;
mtx_unlock(mountlist_mtx);
Exit(nil);
end;
function makedev(x,y:Integer):Integer; inline;
begin
Result:=(x shl 8) or y;
end;
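{
* Illustration (not upstream text): makedev(255,y) = $FF00 or y, so vfs_getnewfsid() below
* builds val[0] with $FF in bits 8..15, the filesystem type number in bits 24..31 and the
* 16-bit mntid split across bits 0..7 and bits 16..23.
}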
procedure vfs_getnewfsid(mp:p_mount);
var
mntid_base:Word;
nmp:p_mount;
tfsid:fsid_t;
mtype:Integer;
begin
mtx_lock(mntid_mtx);
mtype:=mp^.mnt_vfc^.vfc_typenum;
tfsid.val[1]:=mtype;
mntid_base:=0;
mtype:=(mtype and $FF) shl 24;
repeat
tfsid.val[0]:=makedev(255,mtype or ((mntid_base and $FF00) shl 8) or (mntid_base and $FF));
Inc(mntid_base);
nmp:=vfs_getvfs(@tfsid);
if (nmp=nil) then break;
MNT_REL(nmp);
until false;
mp^.mnt_stat.f_fsid.val[0]:=tfsid.val[0];
mp^.mnt_stat.f_fsid.val[1]:=tfsid.val[1];
mtx_unlock(mntid_mtx);
end;
{
* Get a current timestamp.
}
procedure vfs_timestamp(tsp:p_timespec);
begin
getnanotime(tsp);
end;
{
* Set vnode attributes to VNOVAL
}
procedure vattr_null(vap:p_vattr);
begin
vap^.va_type :=VNON;
vap^.va_size :=VNOVAL;
vap^.va_bytes :=VNOVAL;
vap^.va_mode :=VNOVAL;
vap^.va_nlink :=VNOVAL;
vap^.va_uid :=VNOVAL;
vap^.va_gid :=VNOVAL;
vap^.va_fsid :=VNOVAL;
vap^.va_fileid :=VNOVAL;
vap^.va_blocksize :=VNOVAL;
vap^.va_rdev :=VNOVAL;
vap^.va_atime.tv_sec :=VNOVAL;
vap^.va_atime.tv_nsec :=VNOVAL;
vap^.va_mtime.tv_sec :=VNOVAL;
vap^.va_mtime.tv_nsec :=VNOVAL;
vap^.va_ctime.tv_sec :=VNOVAL;
vap^.va_ctime.tv_nsec :=VNOVAL;
vap^.va_birthtime.tv_sec :=VNOVAL;
vap^.va_birthtime.tv_nsec:=VNOVAL;
vap^.va_flags :=VNOVAL;
vap^.va_gen :=VNOVAL;
vap^.va_vaflags :=0;
end;
function vlrureclaim(mp:p_mount):Integer;
label
next_iter,
next_iter_mntunlocked,
yield,
relock_mnt;
var
vp:p_vnode;
done :Integer;
trigger :Integer;
usevnodes:Integer;
count :Integer;
begin
usevnodes:=desiredvnodes;
if (usevnodes <= 0) then usevnodes:=1;
trigger:=v_page_count * 2 div usevnodes;
done:=0;
vn_start_write(nil, @mp, V_WAIT);
MNT_ILOCK(mp);
count:=mp^.mnt_nvnodelistsize div 10 + 1;
while (count<>0) do
begin
vp:=TAILQ_FIRST(@mp^.mnt_nvnodelist);
while (vp<>nil) do
begin
if (vp^.v_type<>VMARKER) then Break;
vp:=TAILQ_NEXT(vp,@vp^.v_nmntvnodes);
end;
if (vp=nil) then break;
TAILQ_REMOVE (@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes);
TAILQ_INSERT_TAIL(@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes);
Dec(count);
if (not VI_TRYLOCK(vp)) then
begin
goto next_iter;
end;
{
* If it's been deconstructed already, it's still
* referenced, or it exceeds the trigger, skip it.
}
if (vp^.v_usecount<>0) or
{((vlru_allow_cache_src=0) and (not LIST_EMPTY(@vp^.v_cache_src))) or}
((vp^.v_iflag and VI_DOOMED)<>0) {or
((vp^.v_object<>nil) and (vp^.v_object^.resident_page_count > trigger))} then
begin
VI_UNLOCK(vp);
goto next_iter;
end;
MNT_IUNLOCK(mp);
vholdl(vp);
if (VOP_LOCK(vp, LK_INTERLOCK or LK_EXCLUSIVE or LK_NOWAIT,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%})<>0) then
begin
vdrop(vp);
goto next_iter_mntunlocked;
end;
VI_LOCK(vp);
{
* v_usecount may have been bumped after VOP_LOCK() dropped
* the vnode interlock and before it was locked again.
*
* It is not necessary to recheck VI_DOOMED because it can
* only be set by another thread that holds both the vnode
* lock and vnode interlock. If another thread has the
* vnode lock before we get to VOP_LOCK() and obtains the
* vnode interlock after VOP_LOCK() drops the vnode
* interlock, the other thread will be unable to drop the
* vnode lock before our VOP_LOCK() call fails.
}
if (vp^.v_usecount<>0) {or
((vlru_allow_cache_src=0) and (not LIST_EMPTY(@vp^.v_cache_src))) or
((vp^.v_object<>nil) and (vp^.v_object^.resident_page_count > trigger))} then
begin
VOP_UNLOCK(vp, LK_INTERLOCK);
goto next_iter_mntunlocked;
end;
Assert((vp^.v_iflag and VI_DOOMED)=0,'VI_DOOMED unexpectedly detected in vlrureclaim()');
//atomic_add_long(@recycles_count, 1);
vgonel(vp);
VOP_UNLOCK(vp, 0);
vdropl(vp);
Inc(done);
next_iter_mntunlocked:
//if (not should_yield()) then
goto relock_mnt;
//goto yield;
next_iter:
//if (not should_yield()) then
continue;
MNT_IUNLOCK(mp);
yield:
kern_yield(PRI_UNCHANGED);
relock_mnt:
MNT_ILOCK(mp);
end;
MNT_IUNLOCK(mp);
vn_finished_write(mp);
Exit(done);
end;
function vtryrecycle(vp:p_vnode):Integer; forward;
procedure vnlru_free(count:Integer);
var
vp:p_vnode;
vfslocked:Integer;
begin
mtx_assert(vnode_free_list_mtx);
For count:=count downto 1 do //recycle at most count vnodes (mirrors the C for(;count>0;count--) loop)
begin
vp:=TAILQ_FIRST(@vnode_free_list);
{
* The list can be modified while the free_list_mtx
* has been dropped and vp could be nil here.
}
if (vp=nil) then
break;
Assert(vp^.v_op<>nil,'vnlru_free: vnode already reclaimed.');
Assert((vp^.v_iflag and VI_FREE)<>0,'Removing vnode not on freelist');
Assert((vp^.v_iflag and VI_ACTIVE)=0,'Mangling active vnode');
TAILQ_REMOVE(@vnode_free_list,vp,@vp^.v_actfreelist);
{
* Don't recycle if we can't get the interlock.
}
if (not VI_TRYLOCK(vp)) then
begin
TAILQ_INSERT_TAIL(@vnode_free_list,vp,@vp^.v_actfreelist);
continue;
end;
Assert(VCANRECYCLE(vp),'vp inconsistent on freelist');
Dec(freevnodes);
vp^.v_iflag:=vp^.v_iflag and (not VI_FREE);
vholdl(vp);
mtx_unlock(vnode_free_list_mtx);
VI_UNLOCK(vp);
vfslocked:=VFS_LOCK_GIANT(vp^.v_mount);
vtryrecycle(vp);
VFS_UNLOCK_GIANT(vfslocked);
{
* If the recycle succeeded this vdrop will actually free
* the vnode. If not it will simply place it back on
* the free list.
}
vdrop(vp);
mtx_lock(vnode_free_list_mtx);
end;
end;
//static struct proc *vnlruproc;
var
vnlruproc_sig:Integer=0;
procedure vnlru_proc();
var
mp,nmp:p_mount;
done,vfslocked:Integer;
begin
//EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST);
//kproc_suspend_check(p);
mtx_lock(vnode_free_list_mtx);
if (freevnodes > wantfreevnodes) then
begin
vnlru_free(freevnodes - wantfreevnodes);
end;
if (numvnodes <= desiredvnodes * 9 div 10) then
begin
vnlruproc_sig:=0;
wakeup(@vnlruproc_sig);
//
mtx_unlock(vnode_free_list_mtx);
Exit;
end;
mtx_unlock(vnode_free_list_mtx);
done:=0;
mtx_lock(mountlist_mtx);
mp:=TAILQ_FIRST(@mountlist);
While (mp<>nil) do
begin
if (vfs_busy(mp, MBF_NOWAIT or MBF_MNTLSTLOCK)<>0) then
begin
//advance before continue; otherwise mp never changes and the loop spins forever
mp:=TAILQ_NEXT(mp,@mp^.mnt_list);
continue;
end;
vfslocked:=VFS_LOCK_GIANT(mp);
Inc(done,vlrureclaim(mp));
VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(mountlist_mtx);
nmp:=TAILQ_NEXT(mp,@mp^.mnt_list);
vfs_unbusy(mp);
//
mp:=nmp
end;
mtx_unlock(mountlist_mtx);
if (done=0) then
begin
Inc(vnlru_nowhere);
end;
end;
function vtryrecycle(vp:p_vnode):Integer;
var
vnmp:p_mount;
begin
Assert(vp^.v_holdcnt<>0,'vtryrecycle: Recycling vp %p without a reference.');
{
* This vnode may be found and locked via some other list; if so we
* can't recycle it yet.
}
if (VOP_LOCK(vp, LK_EXCLUSIVE or LK_NOWAIT,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%})<>0) then
begin
Exit(EWOULDBLOCK);
end;
{
* Don't recycle if its filesystem is being suspended.
}
if (vn_start_write(vp, @vnmp, V_NOWAIT)<>0) then
begin
VOP_UNLOCK(vp, 0);
Exit(EBUSY);
end;
{
* If we got this far, we need to acquire the interlock and see if
* anyone picked up this vnode from another list. If not, we will
* mark it with DOOMED via vgonel() so that anyone who does find it
* will skip over it.
}
VI_LOCK(vp);
if (vp^.v_usecount<>0) then
begin
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
Exit(EBUSY);
end;
if ((vp^.v_iflag and VI_DOOMED)=0) then
begin
//atomic_add_long(@recycles_count, 1);
vgonel(vp);
end;
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
Exit(0);
end;
function getnewvnode_wait(suspended:Integer):Integer;
begin
mtx_assert(vnode_free_list_mtx);
if (curkthread<>nil) and (numvnodes > desiredvnodes) then
begin
if (suspended<>0) then
begin
{
* File system is being suspended; we cannot risk a
* deadlock here, so allocate new vnode anyway.
}
if (freevnodes > wantfreevnodes) then
begin
vnlru_free(freevnodes - wantfreevnodes);
end;
Exit(0);
end;
if (vnlruproc_sig=0) then
begin
vnlruproc_sig:=1; { avoid unnecessary wakeups }
wakeup(@vnlruproc_sig);
end;
msleep(@vnlruproc_sig,@vnode_free_list_mtx,PVFS,'vlruwk', hz);
end;
if (numvnodes>desiredvnodes) then
Exit(ENFILE)
else
Exit(0);
end;
procedure getnewvnode_reserve(count:DWORD);
var
td:p_kthread;
begin
td:=curkthread;
{ First try to be quick and racy. }
if (System.InterlockedExchangeAdd64(numvnodes,count) + count <= desiredvnodes) then
begin
Inc(td^.td_vp_reserv,count);
Exit;
end else
begin
System.InterlockedExchangeAdd64(numvnodes, -count);
end;
mtx_lock(vnode_free_list_mtx);
while (count > 0) do
begin
if (getnewvnode_wait(0)=0) then
begin
Dec(count);
Inc(td^.td_vp_reserv);
System.InterlockedIncrement64(numvnodes);
end;
end;
mtx_unlock(vnode_free_list_mtx);
end;
procedure getnewvnode_drop_reserve();
var
td:p_kthread;
begin
td:=curkthread;
System.InterlockedExchangeAdd64(numvnodes,-td^.td_vp_reserv);
td^.td_vp_reserv:=0;
end;
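{
* Reading aid (not upstream text): getnewvnode_reserve()/getnewvnode_drop_reserve()
* pre-charge numvnodes and stash the credit in td^.td_vp_reserv, so a later call to
* getnewvnode() can jump straight to its allocation path without sleeping in
* getnewvnode_wait().
}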
{
* Return the next vnode from the free list.
}
function getnewvnode(tag:PChar;mp:p_mount;vops:p_vop_vector;vpp:pp_vnode):Integer;
label
alloc;
var
td:p_kthread;
vp:p_vnode;
//struct bufobj *bo;
error,susp:Integer;
begin
vp:=nil;
td:=curkthread;
if (td<>nil) then
begin
if (td^.td_vp_reserv>0) then
begin
Dec(td^.td_vp_reserv,1);
goto alloc;
end;
end;
mtx_lock(vnode_free_list_mtx);
{
* Lend our context to reclaim vnodes if they've exceeded the max.
}
if (freevnodes > wantfreevnodes) then
begin
vnlru_free(1);
end;
susp:=ord(False);
if (mp<>nil) then
begin
susp:=ord(((mp^.mnt_kern_flag and MNTK_SUSPEND)<>0));
end;
error:=getnewvnode_wait(susp);
System.InterlockedIncrement64(numvnodes);
mtx_unlock(vnode_free_list_mtx);
alloc:
//atomic_add_long(@vnodes_created, 1);
vp:=AllocMem(SizeOf(t_vnode));
{
* Setup locks.
}
vp^.v_vnlock:=@vp^.v_lock;
mtx_init(vp^.v_interlock,'vnode interlock');
{
* By default, don't allow shared locks unless filesystems
* opt-in.
}
mtx_init(vp^.v_vnlock^,'PVFS');
//lockinit(vp^.v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
{
* Initialize bufobj.
}
//bo:=@vp^.v_bufobj;
//bo^.__bo_vnode:=vp;
//mtx_init(BO_MTX(bo), "bufobj interlock", nil, MTX_DEF);
//bo^.bo_ops:=@buf_ops_bio;
//bo^.bo_private:=vp;
//TAILQ_INIT(@bo^.bo_clean.bv_hd);
//TAILQ_INIT(@bo^.bo_dirty.bv_hd);
{
* Initialize namecache.
}
//LIST_INIT(@vp^.v_cache_src);
//TAILQ_INIT(@vp^.v_cache_dst);
{
* Finalize various vnode identity bits.
}
vp^.v_type:=VNON;
vp^.v_tag:=tag;
vp^.v_op:=vops;
v_incr_usecount(vp);
vp^.v_data:=nil;
//mac_vnode_init(vp);
//if (mp<>nil and (mp^.mnt_flag and MNT_MULTILABEL)=0)
// mac_vnode_associate_singlelabel(mp, vp);
//else if (mp=nil and vops<>@dead_vnodeops)
// printf'nil mp in getnewvnode()\n';
if (mp<>nil) then
begin
//bo^.bo_bsize:=mp^.mnt_stat.f_iosize;
if ((mp^.mnt_kern_flag and MNTK_NOKNOTE)<>0) then
begin
vp^.v_vflag:=vp^.v_vflag or VV_NOKNOTE;
end;
end;
rangelock_init(@vp^.v_rl);
{
* For the filesystems which do not use vfs_hash_insert(),
* still initialize v_hash to have vfs_hash_index() useful.
* E.g., nilfs uses vfs_hash_index() on the lower vnode for
* its own hashing.
}
vp^.v_hash:=ptruint(vp) shr vnsz2log;
vpp^:=vp;
Exit(0);
end;
{
* Delete from old mount point vnode list, if on one.
}
procedure delmntque(vp:p_vnode);
var
mp:p_mount;
active:Integer;
begin
mp:=vp^.v_mount;
if (mp=nil) then Exit;
MNT_ILOCK(mp);
VI_LOCK(vp);
Assert(mp^.mnt_activevnodelistsize <= mp^.mnt_nvnodelistsize,
'Active vnode list size %d > Vnode list size %d');
active:=vp^.v_iflag and VI_ACTIVE;
vp^.v_iflag:=vp^.v_iflag and (not VI_ACTIVE);
if (active<>0) then
begin
mtx_lock(vnode_free_list_mtx);
TAILQ_REMOVE(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
Dec(mp^.mnt_activevnodelistsize);
mtx_unlock(vnode_free_list_mtx);
end;
vp^.v_mount:=nil;
VI_UNLOCK(vp);
Assert(mp^.mnt_nvnodelistsize > 0,'bad mount point vnode list size');
TAILQ_REMOVE(@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes);
Dec(mp^.mnt_nvnodelistsize);
MNT_REL(mp);
MNT_IUNLOCK(mp);
end;
procedure insmntque_stddtr(vp:p_vnode;dtr_arg:Pointer);
begin
vp^.v_data:=nil;
vp^.v_op:=@dead_vnodeops;
{ XXX non mp-safe fs may still call insmntque with vnode
unlocked }
if (VOP_ISLOCKED(vp)=0) then
begin
vn_lock(vp, LK_EXCLUSIVE or LK_RETRY,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
end;
vgone(vp);
vput(vp);
end;
{
* Insert into list of vnodes for the new mount point, if available.
}
function insmntque1(vp:p_vnode;mp:p_mount;dtr:t_insmntque1_dtr;dtr_arg:Pointer):Integer;
var
locked:Integer;
begin
Assert(vp^.v_mount=nil,'insmntque: vnode already on per mount vnode list');
Assert(mp<>nil, 'Dont call insmntque(foo, nil)');
{
* We acquire the vnode interlock early to ensure that the
* vnode cannot be recycled by another process releasing a
* holdcnt on it before we get it on both the vnode list
* and the active vnode list. The mount mutex protects only
* manipulation of the vnode list and the vnode freelist
* mutex protects only manipulation of the active vnode list.
* Hence the need to hold the vnode interlock throughout.
}
MNT_ILOCK(mp);
VI_LOCK(vp);
if ((mp^.mnt_kern_flag and MNTK_NOINSMNTQ)<>0) and
(((mp^.mnt_kern_flag and MNTK_UNMOUNTF)<>0) or
(mp^.mnt_nvnodelistsize=0)) then
begin
locked:=VOP_ISLOCKED(vp);
if (locked=0) or
((locked=LK_EXCLUSIVE) and
((vp^.v_vflag and VV_FORCEINSMQ)=0)) then
begin
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
if (dtr<>nil) then
begin
dtr(vp, dtr_arg);
end;
Exit(EBUSY);
end;
end;
vp^.v_mount:=mp;
MNT_REF(mp);
TAILQ_INSERT_TAIL(@mp^.mnt_nvnodelist,vp,@vp^.v_nmntvnodes);
Assert(mp^.mnt_nvnodelistsize >= 0,'neg mount point vnode list size');
Inc(mp^.mnt_nvnodelistsize);
Assert((vp^.v_iflag and VI_ACTIVE)=0,'Activating already active vnode');
vp^.v_iflag:=vp^.v_iflag or VI_ACTIVE;
mtx_lock(vnode_free_list_mtx);
TAILQ_INSERT_HEAD(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
Inc(mp^.mnt_activevnodelistsize);
mtx_unlock(vnode_free_list_mtx);
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
Exit(0);
end;
function insmntque(vp:p_vnode;mp:p_mount):Integer;
begin
Exit(insmntque1(vp, mp, @insmntque_stddtr, nil));
end;
{
{
* Flush out and invalidate all buffers associated with a bufobj
* Called with the underlying object locked.
}
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
begin
int error;
BO_LOCK(bo);
if (flags and V_SAVE) begin
error:=bufobj_wwait(bo, slpflag, slptimeo);
if (error) begin
BO_UNLOCK(bo);
Exit(error);
end;
if (bo^.bo_dirty.bv_cnt > 0) begin
BO_UNLOCK(bo);
if ((error:=BO_SYNC(bo, MNT_WAIT))<>0)
Exit(error);
{
* XXX We could save a lock/unlock if this was only
* enabled under INVARIANTS
}
BO_LOCK(bo);
if (bo^.bo_numoutput > 0 or bo^.bo_dirty.bv_cnt > 0)
panic'vinvalbuf: dirty bufs';
end;
end;
{
* If you alter this loop please notice that interlock is dropped and
* reacquired in flushbuflist. Special care is needed to ensure that
* no race conditions occur from this.
}
do begin
error:=flushbuflist(@bo^.bo_clean,
flags, bo, slpflag, slptimeo);
if (error=0 and !(flags and V_CLEANONLY))
error:=flushbuflist(@bo^.bo_dirty,
flags, bo, slpflag, slptimeo);
if (error<>0 and error<>EAGAIN) begin
BO_UNLOCK(bo);
Exit(error);
end;
end; while (error<>0);
{
* Wait for I/O to complete. XXX needs cleaning up. The vnode can
* have write I/O in-progress but if there is a VM object then the
* VM object can also have read-I/O in-progress.
}
do begin
bufobj_wwait(bo, 0, 0);
BO_UNLOCK(bo);
if (bo^.bo_object<>nil) begin
VM_OBJECT_LOCK(bo^.bo_object);
vm_object_pip_wait(bo^.bo_object, "bovlbx';
VM_OBJECT_UNLOCK(bo^.bo_object);
end;
BO_LOCK(bo);
end; while (bo^.bo_numoutput > 0);
BO_UNLOCK(bo);
{
* Destroy the copy in the VM cache, too.
}
if (bo^.bo_object<>nil and
(flags and (V_ALT or V_NORMAL or V_CLEANONLY))=0) begin
VM_OBJECT_LOCK(bo^.bo_object);
vm_object_page_remove(bo^.bo_object, 0, 0, (flags and V_SAVE) ?
OBJPR_CLEANONLY : 0);
VM_OBJECT_UNLOCK(bo^.bo_object);
end;
#ifdef INVARIANTS
BO_LOCK(bo);
if ((flags and (V_ALT or V_NORMAL or V_CLEANONLY))=0 and
(bo^.bo_dirty.bv_cnt > 0 or bo^.bo_clean.bv_cnt > 0))
panic'vinvalbuf: flush failed';
BO_UNLOCK(bo);
#endif
Exit(0);
end;
}
{
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying object locked.
}
function vinvalbuf(vp:p_vnode;flags,slpflag,slptimeo:Integer):Integer;
begin
ASSERT_VOP_LOCKED(vp, 'vinvalbuf');
if (vp^.v_object<>nil) then
if (vm_object_t(vp^.v_object)^.handle<>vp) then
begin
Exit(0);
end;
//Exit(bufobj_invalbuf(@vp^.v_bufobj, flags, slpflag, slptimeo));
Result:=0;
end;
{
{
* Flush out buffers on the specified list.
*
}
static int
flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
int slptimeo)
begin
struct buf *bp, *nbp;
int retval, error;
daddr_t lblkno;
b_xflags_t xflags;
ASSERT_BO_LOCKED(bo);
retval:=0;
TAILQ_FOREACH_SAFE(bp, &bufv^.bv_hd, b_bobufs, nbp) begin
if (((flags and V_NORMAL) and (bp^.b_xflags and BX_ALTDATA)) or
((flags and V_ALT) and (bp^.b_xflags and BX_ALTDATA)=0)) begin
continue;
end;
lblkno:=0;
xflags:=0;
if (nbp<>nil) begin
lblkno:=nbp^.b_lblkno;
xflags:=nbp^.b_xflags &
(BX_BKGRDMARKER or BX_VNDIRTY or BX_VNCLEAN);
end;
retval:=EAGAIN;
error:=BUF_TIMELOCK(bp,
LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK, BO_MTX(bo),
"flushbuf", slpflag, slptimeo);
if (error) begin
BO_LOCK(bo);
Exit(error<>ENOLCK ? error : EAGAIN);
end;
Assert(bp^.b_bufobj=bo,
'bp %p wrong b_bufobj %p should be %p",
bp, bp^.b_bufobj, bo));
if (bp^.b_bufobj<>bo) begin { XXX: necessary ? }
BUF_UNLOCK(bp);
BO_LOCK(bo);
Exit(EAGAIN);
end;
{
* XXX Since there are no node locks for NFS, I
* believe there is a slight chance that a delayed
* write will occur while sleeping just above, so
* check for it.
}
if (((bp^.b_flags and (B_DELWRI or B_INVAL))=B_DELWRI) and
(flags and V_SAVE)) begin
BO_LOCK(bo);
bremfree(bp);
BO_UNLOCK(bo);
bp^.b_flags:= or B_ASYNC;
bwrite(bp);
BO_LOCK(bo);
Exit(EAGAIN); { XXX: why not loop ? }
end;
BO_LOCK(bo);
bremfree(bp);
BO_UNLOCK(bo);
bp^.b_flags:= or (B_INVAL or B_RELBUF);
bp^.b_flags:= and ~B_ASYNC;
brelse(bp);
BO_LOCK(bo);
if (nbp<>nil and
(nbp^.b_bufobj<>bo or
nbp^.b_lblkno<>lblkno or
(nbp^.b_xflags &
(BX_BKGRDMARKER or BX_VNDIRTY or BX_VNCLEAN))<>xflags))
break; { nbp invalid }
end;
Exit(retval);
end;
{
* Truncate a file's buffer and pages to a specified length. This
* is in lieu of the old vinvalbuf mechanism, which performed unneeded
* sync activity.
}
int
vtruncbuf(vp:p_vnode, struct ucred *cred, struct thread *td,
off_t length, int blksize)
begin
struct buf *bp, *nbp;
int anyfreed;
int trunclbn;
struct bufobj *bo;
CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", {$I %LINE%},
vp, cred, blksize, (uintmax_t)length);
{
* Round up to the *next* lbn.
}
trunclbn:=(length + blksize - 1) div blksize;
ASSERT_VOP_LOCKED(vp, "vtruncbuf';
restart:
bo:=@vp^.v_bufobj;
BO_LOCK(bo);
anyfreed:=1;
for (;anyfreed;) begin
anyfreed:=0;
TAILQ_FOREACH_SAFE(bp, &bo^.bo_clean.bv_hd, b_bobufs, nbp) begin
if (bp^.b_lblkno < trunclbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK,
BO_MTX(bo))=ENOLCK)
goto restart;
BO_LOCK(bo);
bremfree(bp);
BO_UNLOCK(bo);
bp^.b_flags:= or (B_INVAL or B_RELBUF);
bp^.b_flags:= and ~B_ASYNC;
brelse(bp);
anyfreed:=1;
BO_LOCK(bo);
if (nbp<>nil and
(((nbp^.b_xflags and BX_VNCLEAN)=0) or
(nbp^.b_vp<>vp) or
(nbp^.b_flags and B_DELWRI))) begin
BO_UNLOCK(bo);
goto restart;
end;
end;
TAILQ_FOREACH_SAFE(bp, &bo^.bo_dirty.bv_hd, b_bobufs, nbp) begin
if (bp^.b_lblkno < trunclbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK,
BO_MTX(bo))=ENOLCK)
goto restart;
BO_LOCK(bo);
bremfree(bp);
BO_UNLOCK(bo);
bp^.b_flags:= or (B_INVAL or B_RELBUF);
bp^.b_flags:= and ~B_ASYNC;
brelse(bp);
anyfreed:=1;
BO_LOCK(bo);
if (nbp<>nil and
(((nbp^.b_xflags and BX_VNDIRTY)=0) or
(nbp^.b_vp<>vp) or
(nbp^.b_flags and B_DELWRI)=0)) begin
BO_UNLOCK(bo);
goto restart;
end;
end;
end;
if (length > 0) begin
restartsync:
TAILQ_FOREACH_SAFE(bp, &bo^.bo_dirty.bv_hd, b_bobufs, nbp) begin
if (bp^.b_lblkno > 0)
continue;
{
* Since we hold the vnode lock this should only
* fail if we're racing with the buf daemon.
}
if (BUF_LOCK(bp,
LK_EXCLUSIVE or LK_SLEEPFAIL or LK_INTERLOCK,
BO_MTX(bo))=ENOLCK) begin
goto restart;
end;
Assert((bp^.b_flags and B_DELWRI), vp,
'buf(%p) on dirty queue without DELWRI", bp));
BO_LOCK(bo);
bremfree(bp);
BO_UNLOCK(bo);
bawrite(bp);
BO_LOCK(bo);
goto restartsync;
end;
end;
bufobj_wwait(bo, 0, 0);
BO_UNLOCK(bo);
vnode_pager_setsize(vp, length);
Exit(0);
end;
{
* buf_splay() - splay tree core for the clean/dirty list of buffers in
* a vnode.
*
* NOTE: We have to deal with the special case of a background bitmap
* buffer, a situation where two buffers will have the same logical
* block offset. We want (1) only the foreground buffer to be accessed
* in a lookup and (2) must differentiate between the foreground and
* background buffer in the splay tree algorithm because the splay
* tree cannot normally handle multiple entities with the same 'index'.
* We accomplish this by adding differentiating flags to the splay tree's
* numerical domain.
}
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
begin
struct buf dummy;
struct buf *lefttreemax, *righttreemin, *y;
if (root=nil)
Exit(nil);
lefttreemax:=righttreemin:=@dummy;
for (;;) begin
if (lblkno < root^.b_lblkno or
(lblkno=root^.b_lblkno and
(xflags and BX_BKGRDMARKER) < (root^.b_xflags and BX_BKGRDMARKER))) begin
if ((y:=root^.b_left)=nil)
break;
if (lblkno < y^.b_lblkno) begin
{ Rotate right. }
root^.b_left:=y^.b_right;
y^.b_right:=root;
root:=y;
if ((y:=root^.b_left)=nil)
break;
end;
{ Link into the new root's right tree. }
righttreemin^.b_left:=root;
righttreemin:=root;
end; else if (lblkno > root^.b_lblkno or
(lblkno=root^.b_lblkno and
(xflags and BX_BKGRDMARKER) > (root^.b_xflags and BX_BKGRDMARKER))) begin
if ((y:=root^.b_right)=nil)
break;
if (lblkno > y^.b_lblkno) begin
{ Rotate left. }
root^.b_right:=y^.b_left;
y^.b_left:=root;
root:=y;
if ((y:=root^.b_right)=nil)
break;
end;
{ Link into the new root's left tree. }
lefttreemax^.b_right:=root;
lefttreemax:=root;
end; else begin
break;
end;
root:=y;
end;
{ Assemble the new root. }
lefttreemax^.b_right:=root^.b_left;
righttreemin^.b_left:=root^.b_right;
root^.b_left:=dummy.b_right;
root^.b_right:=dummy.b_left;
Exit(root);
end;
static void
buf_vlist_remove(struct buf *bp)
begin
struct buf *root;
struct bufv *bv;
Assert(bp^.b_bufobj<>nil, 'No b_bufobj %p", bp));
ASSERT_BO_LOCKED(bp^.b_bufobj);
Assert((bp^.b_xflags and (BX_VNDIRTY|BX_VNCLEAN)) !=
(BX_VNDIRTY|BX_VNCLEAN),
'buf_vlist_remove: Buf %p is on two lists", bp));
if (bp^.b_xflags and BX_VNDIRTY)
bv:=@bp^.b_bufobj^.bo_dirty;
else
bv:=@bp^.b_bufobj^.bo_clean;
if (bp<>bv^.bv_root) begin
root:=buf_splay(bp^.b_lblkno, bp^.b_xflags, bv^.bv_root);
Assert(root=bp, 'splay lookup failed in remove');
end;
if (bp^.b_left=nil) begin
root:=bp^.b_right;
end; else begin
root:=buf_splay(bp^.b_lblkno, bp^.b_xflags, bp^.b_left);
root^.b_right:=bp^.b_right;
end;
bv^.bv_root:=root;
TAILQ_REMOVE(@bv^.bv_hd, bp, b_bobufs);
bv^.bv_cnt--;
bp^.b_xflags:= and ~(BX_VNDIRTY or BX_VNCLEAN);
end;
{
* Add the buffer to the sorted clean or dirty block list using a
* splay tree algorithm.
*
* NOTE: xflags is passed as a constant, optimizing this inline function!
}
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
begin
struct buf *root;
struct bufv *bv;
ASSERT_BO_LOCKED(bo);
Assert((bp^.b_xflags and (BX_VNDIRTY|BX_VNCLEAN))=0,
'buf_vlist_add: Buf %p has existing xflags %d", bp, bp^.b_xflags));
bp^.b_xflags:= or xflags;
if (xflags and BX_VNDIRTY)
bv:=@bo^.bo_dirty;
else
bv:=@bo^.bo_clean;
root:=buf_splay(bp^.b_lblkno, bp^.b_xflags, bv^.bv_root);
if (root=nil) begin
bp^.b_left:=nil;
bp^.b_right:=nil;
TAILQ_INSERT_TAIL(@bv^.bv_hd, bp, b_bobufs);
end; else if (bp^.b_lblkno < root^.b_lblkno or
(bp^.b_lblkno=root^.b_lblkno and
(bp^.b_xflags and BX_BKGRDMARKER) < (root^.b_xflags and BX_BKGRDMARKER))) begin
bp^.b_left:=root^.b_left;
bp^.b_right:=root;
root^.b_left:=nil;
TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
end; else begin
bp^.b_right:=root^.b_right;
bp^.b_left:=root;
root^.b_right:=nil;
TAILQ_INSERT_AFTER(@bv^.bv_hd, root, bp, b_bobufs);
end;
bv^.bv_cnt++;
bv^.bv_root:=bp;
end;
{
* Lookup a buffer using the splay tree. Note that we specifically avoid
* shadow buffers used in background bitmap writes.
*
* This code isn't quite efficient as it could be because we are maintaining
* two sorted lists and do not know which list the block resides in.
*
* During a "make buildworld" the desired buffer is found at one of
* the roots more than 60% of the time. Thus, checking both roots
* before performing either splay eliminates unnecessary splays on the
* first tree splayed.
}
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
begin
struct buf *bp;
ASSERT_BO_LOCKED(bo);
if ((bp:=bo^.bo_clean.bv_root)<>nil and
bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER))
Exit(bp);
if ((bp:=bo^.bo_dirty.bv_root)<>nil and
bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER))
Exit(bp);
if ((bp:=bo^.bo_clean.bv_root)<>nil) begin
bo^.bo_clean.bv_root:=bp:=buf_splay(lblkno, 0, bp);
if (bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER))
Exit(bp);
end;
if ((bp:=bo^.bo_dirty.bv_root)<>nil) begin
bo^.bo_dirty.bv_root:=bp:=buf_splay(lblkno, 0, bp);
if (bp^.b_lblkno=lblkno and !(bp^.b_xflags and BX_BKGRDMARKER))
Exit(bp);
end;
Exit(nil);
end;
{
* Associate a buffer with a vnode.
}
void
bgetvp(vp:p_vnode, struct buf *bp)
begin
struct bufobj *bo;
bo:=@vp^.v_bufobj;
ASSERT_BO_LOCKED(bo);
Assert(bp^.b_vp=nil, bp^.b_vp, 'bgetvp: not free');
CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp^.b_flags);
Assert((bp^.b_xflags and (BX_VNDIRTY|BX_VNCLEAN))=0, vp,
'bgetvp: bp already attached! %p", bp));
vhold(vp);
if (VFS_NEEDSGIANT(vp^.v_mount) or bo^.bo_flag and BO_NEEDSGIANT)
bp^.b_flags:= or B_NEEDSGIANT;
bp^.b_vp:=vp;
bp^.b_bufobj:=bo;
{
* Insert onto list for new vnode.
}
buf_vlist_add(bp, bo, BX_VNCLEAN);
end;
{
* Disassociate a buffer from a vnode.
}
void
brelvp(struct buf *bp)
begin
struct bufobj *bo;
vp:p_vnode;
CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp^.b_vp, bp^.b_flags);
Assert(bp^.b_vp<>nil, 'brelvp: nil');
{
* Delete from old vnode list, if on one.
}
vp:=bp^.b_vp; { XXX }
bo:=bp^.b_bufobj;
BO_LOCK(bo);
if (bp^.b_xflags and (BX_VNDIRTY or BX_VNCLEAN))
buf_vlist_remove(bp);
else
panic'brelvp: Buffer %p not on queue.", bp);
if ((bo^.bo_flag and BO_ONWORKLST) and bo^.bo_dirty.bv_cnt=0) begin
bo^.bo_flag:= and ~BO_ONWORKLST;
mtx_lock(@sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(@sync_mtx);
end;
bp^.b_flags:= and ~B_NEEDSGIANT;
bp^.b_vp:=nil;
bp^.b_bufobj:=nil;
BO_UNLOCK(bo);
vdrop(vp);
end;
{
* Add an item to the syncer work queue.
}
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
begin
int queue, slot;
ASSERT_BO_LOCKED(bo);
mtx_lock(@sync_mtx);
if (bo^.bo_flag and BO_ONWORKLST)
LIST_REMOVE(bo, bo_synclist);
else begin
bo^.bo_flag:= or BO_ONWORKLST;
syncer_worklist_len++;
end;
if (delay > syncer_maxdelay - 2)
delay:=syncer_maxdelay - 2;
slot:=(syncer_delayno + delay) and syncer_mask;
queue:=VFS_NEEDSGIANT(bo^.__bo_vnode^.v_mount) ? WI_GIANTQ :
WI_MPSAFEQ;
LIST_INSERT_HEAD(@syncer_workitem_pending[queue][slot], bo,
bo_synclist);
mtx_unlock(@sync_mtx);
end;
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
begin
int error, len;
mtx_lock(@sync_mtx);
len:=syncer_worklist_len - sync_vnode_count;
mtx_unlock(@sync_mtx);
error:=SYSCTL_OUT(req, &len, sizeof(len));
Exit(error);
end;
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT or CTLFLAG_RD, nil, 0,
sysctl_vfs_worklist_len, "I", "Syncer thread worklist length';
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp:=begin
"syncer",
sched_sync,
&updateproc
end;
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
begin
vp:p_vnode;
mp:p_mount;
*bo:=LIST_FIRST(slp);
if (*bo=nil)
Exit(0);
vp:=(*bo)^.__bo_vnode; { XXX }
if (VOP_ISLOCKED(vp)<>0 or VI_TRYLOCK(vp)=0)
Exit(1);
{
* We use vhold in case the vnode does not
* successfully sync. vhold prevents the vnode from
* going away when we unlock the sync_mtx so that
* we can acquire the vnode interlock.
}
vholdl(vp);
mtx_unlock(@sync_mtx);
VI_UNLOCK(vp);
if (vn_start_write(vp, &mp, V_NOWAIT)<>0) begin
vdrop(vp);
mtx_lock(@sync_mtx);
Exit(*bo=LIST_FIRST(slp));
end;
vn_lock(vp, LK_EXCLUSIVE or LK_RETRY);
(void) VOP_FSYNC(vp, MNT_LAZY, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
BO_LOCK(*bo);
if (((*bo)^.bo_flag and BO_ONWORKLST)<>0) begin
{
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
}
vn_syncer_add_to_worklist(*bo, syncdelay);
end;
BO_UNLOCK(*bo);
vdrop(vp);
mtx_lock(@sync_mtx);
Exit(0);
end;
{
* System filesystem synchronizer daemon.
}
static void
sched_sync(void)
begin
struct synclist *gnext, *next;
struct synclist *gslp, *slp;
struct bufobj *bo;
long starttime;
struct thread *td:=curthread;
int last_work_seen;
int net_worklist_len;
int syncer_final_iter;
int first_printf;
int error;
last_work_seen:=0;
syncer_final_iter:=0;
first_printf:=1;
syncer_state:=SYNCER_RUNNING;
starttime:=time_uptime;
td^.td_pflags:= or TDP_NORUNNINGBUF;
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td^.td_proc,
SHUTDOWN_PRI_LAST);
mtx_lock(@sync_mtx);
for (;;) begin
if (syncer_state=SYNCER_FINAL_DELAY and
syncer_final_iter=0) begin
mtx_unlock(@sync_mtx);
kproc_suspend_check(td^.td_proc);
mtx_lock(@sync_mtx);
end;
net_worklist_len:=syncer_worklist_len - sync_vnode_count;
if (syncer_state<>SYNCER_RUNNING and
starttime<>time_uptime) begin
if (first_printf) begin
printf'\nSyncing disks, vnodes remaining...';
first_printf:=0;
end;
printf'%d ", net_worklist_len);
end;
starttime:=time_uptime;
{
* Push files whose dirty time has expired. Be careful
* of interrupt race on slp queue.
*
* Skip over empty worklist slots when shutting down.
}
do begin
slp:=@syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
gslp:=@syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno=syncer_maxdelay)
syncer_delayno:=0;
next:=@syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
gnext:=@syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
{
* If the worklist has wrapped since the
* it was emptied of all but syncer vnodes,
* switch to the FINAL_DELAY state and run
* for one more second.
}
if (syncer_state=SYNCER_SHUTTING_DOWN and
net_worklist_len=0 and
last_work_seen=syncer_delayno) begin
syncer_state:=SYNCER_FINAL_DELAY;
syncer_final_iter:=SYNCER_SHUTDOWN_SPEEDUP;
end;
end; while (syncer_state<>SYNCER_RUNNING and LIST_EMPTY(slp) and
LIST_EMPTY(gslp) and syncer_worklist_len > 0);
{
* Keep track of the last time there was anything
* on the worklist other than syncer vnodes.
* Exitto the SHUTTING_DOWN state if any
* new work appears.
}
if (net_worklist_len > 0 or syncer_state=SYNCER_RUNNING)
last_work_seen:=syncer_delayno;
if (net_worklist_len > 0 and syncer_state=SYNCER_FINAL_DELAY)
syncer_state:=SYNCER_SHUTTING_DOWN;
while (!LIST_EMPTY(slp)) begin
error:=sync_vnode(slp, &bo, td);
if (error=1) begin
LIST_REMOVE(bo, bo_synclist);
LIST_INSERT_HEAD(next, bo, bo_synclist);
continue;
end;
if (first_printf=0)
wdog_kern_pat(WD_LASTVAL);
end;
if (!LIST_EMPTY(gslp)) begin
mtx_unlock(@sync_mtx);
mtx_lock(@Giant);
mtx_lock(@sync_mtx);
while (!LIST_EMPTY(gslp)) begin
error:=sync_vnode(gslp, &bo, td);
if (error=1) begin
LIST_REMOVE(bo, bo_synclist);
LIST_INSERT_HEAD(gnext, bo,
bo_synclist);
continue;
end;
end;
mtx_unlock(@Giant);
end;
if (syncer_state=SYNCER_FINAL_DELAY and syncer_final_iter > 0)
syncer_final_iter--;
{
* The variable rushjob allows the kernel to speed up the
* processing of the filesystem syncer process. A rushjob
* value of N tells the filesystem syncer to process the next
* N seconds worth of work on its queue ASAP. Currently rushjob
* is used by the soft update code to speed up the filesystem
* syncer process when the incore state is getting so far
* ahead of the disk that the kernel memory pool is being
* threatened with exhaustion.
}
if (rushjob > 0) begin
rushjob -= 1;
continue;
end;
{
* Just sleep for a short period of time between
* iterations when shutting down to allow some I/O
* to happen.
*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
}
if (syncer_state<>SYNCER_RUNNING or
time_uptime=starttime) begin
thread_lock(td);
sched_prio(td, PPAUSE);
thread_unlock(td);
end;
if (syncer_state<>SYNCER_RUNNING)
cv_timedwait(@sync_wakeup, &sync_mtx,
hz div SYNCER_SHUTDOWN_SPEEDUP);
else if (time_uptime=starttime)
cv_timedwait(@sync_wakeup, &sync_mtx, hz);
end;
end;
{
* Request the syncer daemon to speed up its work.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
}
int
speedup_syncer(void)
begin
int ret:=0;
mtx_lock(@sync_mtx);
if (rushjob < syncdelay div 2) begin
rushjob += 1;
stat_rush_requests += 1;
ret:=1;
end;
mtx_unlock(@sync_mtx);
cv_broadcast(@sync_wakeup);
Exit(ret);
end;
{
* Tell the syncer to speed up its work and run though its work
* list several times, then tell it to shut down.
}
static void
syncer_shutdown(void *arg, int howto)
begin
if (howto and RB_NOSYNC)
Exit;
mtx_lock(@sync_mtx);
syncer_state:=SYNCER_SHUTTING_DOWN;
rushjob:=0;
mtx_unlock(@sync_mtx);
cv_broadcast(@sync_wakeup);
kproc_shutdown(arg, howto);
end;
{
* Reassign a buffer from one vnode to another.
* Used to assign file specific control information
* (indirect blocks) to the vnode to which they belong.
}
void
reassignbuf(struct buf *bp)
begin
vp:p_vnode;
struct bufobj *bo;
int delay;
#ifdef INVARIANTS
struct bufv *bv;
#endif
vp:=bp^.b_vp;
bo:=bp^.b_bufobj;
++reassignbufcalls;
CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
bp, bp^.b_vp, bp^.b_flags);
{
* B_PAGING flagged buffers cannot be reassigned because their vp
* is not fully linked in.
}
if (bp^.b_flags and B_PAGING)
panic'cannot reassign paging buffer';
{
* Delete from old vnode list, if on one.
}
BO_LOCK(bo);
if (bp^.b_xflags and (BX_VNDIRTY or BX_VNCLEAN))
buf_vlist_remove(bp);
else
panic'reassignbuf: Buffer %p not on queue.", bp);
{
* If dirty, put on list of dirty buffers; otherwise insert onto list
* of clean buffers.
}
if (bp^.b_flags and B_DELWRI) begin
if ((bo^.bo_flag and BO_ONWORKLST)=0) begin
switch (vp^.v_type) begin
case VDIR:
delay:=dirdelay;
break;
case VCHR:
delay:=metadelay;
break;
default:
delay:=filedelay;
end;
vn_syncer_add_to_worklist(bo, delay);
end;
buf_vlist_add(bp, bo, BX_VNDIRTY);
end; else begin
buf_vlist_add(bp, bo, BX_VNCLEAN);
if ((bo^.bo_flag and BO_ONWORKLST) and bo^.bo_dirty.bv_cnt=0) begin
mtx_lock(@sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(@sync_mtx);
bo^.bo_flag:= and ~BO_ONWORKLST;
end;
end;
#ifdef INVARIANTS
bv:=@bo^.bo_clean;
bp:=TAILQ_FIRST(@bv^.bv_hd);
Assert(bp=nil or bp^.b_bufobj=bo,
'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
bp:=TAILQ_LAST(@bv^.bv_hd, buflists);
Assert(bp=nil or bp^.b_bufobj=bo,
'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
bv:=@bo^.bo_dirty;
bp:=TAILQ_FIRST(@bv^.bv_hd);
Assert(bp=nil or bp^.b_bufobj=bo,
'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
bp:=TAILQ_LAST(@bv^.bv_hd, buflists);
Assert(bp=nil or bp^.b_bufobj=bo,
'bp %p wrong b_bufobj %p should be %p", bp, bp^.b_bufobj, bo));
#endif
BO_UNLOCK(bo);
end;
}
{
* Increment the use and hold counts on the vnode, taking care to reference
* the driver's usecount if this is a chardev. The vholdl() will remove
* the vnode from the free list if it is presently free. Requires the
* vnode interlock and returns with it held.
}
procedure v_incr_usecount(vp:p_vnode);
begin
Inc(vp^.v_usecount);
if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
begin
dev_lock();
Inc(p_cdev(vp^.v_rdev)^.si_usecount);
dev_unlock();
end;
vholdl(vp);
end;
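{
* Reading aid (not upstream text): v_usecount counts active users (vref/vrele, vget/vput)
* while v_holdcnt counts anyone who merely needs the vnode to stay allocated (vhold/vdrop).
* Every usecount implicitly carries a holdcnt, which is why v_incr_usecount() ends in
* vholdl() and v_decr_usecount() ends in vdropl().
}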
{
* Turn a holdcnt into a use+holdcnt such that only one call to
* v_decr_usecount is needed.
}
procedure v_upgrade_usecount(vp:p_vnode);
begin
Inc(vp^.v_usecount);
if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
begin
dev_lock();
Inc(p_cdev(vp^.v_rdev)^.si_usecount);
dev_unlock();
end;
end;
{
* Decrement the vnode use and hold count along with the driver's usecount
* if this is a chardev. The vdropl() below releases the vnode interlock
* as it may free the vnode.
}
procedure v_decr_usecount(vp:p_vnode);
begin
ASSERT_VI_LOCKED(vp,{$I %LINE%});
Assert(vp^.v_usecount>0,'v_decr_usecount: negative usecount');
Dec(vp^.v_usecount);
if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
begin
dev_lock();
Dec(p_cdev(vp^.v_rdev)^.si_usecount);
dev_unlock();
end;
vdropl(vp);
end;
{
* Decrement only the use count and driver use count. This is intended to
* be paired with a follow on vdropl() to release the remaining hold count.
* In this way we may vgone() a vnode with a 0 usecount without risk of
* having it end up on a free list because the hold count is kept above 0.
}
procedure v_decr_useonly(vp:p_vnode);
begin
ASSERT_VI_LOCKED(vp,{$I %LINE%});
Assert(vp^.v_usecount>0,'v_decr_useonly: negative usecount');
Dec(vp^.v_usecount);
if (vp^.v_type=VCHR) and (vp^.v_rdev<>nil) then
begin
dev_lock();
Dec(p_cdev(vp^.v_rdev)^.si_usecount);
dev_unlock();
end;
end;
{
* Grab a particular vnode from the free list, increment its
* reference count and lock it. VI_DOOMED is set if the vnode
* is being destroyed. Only callers who specify LK_RETRY will
* see doomed vnodes. If inactive processing was delayed in
* vput try to do it here.
}
function vget(vp:p_vnode;flags:Integer):Integer;
var
error:Integer;
begin
error:=0;
VFS_ASSERT_GIANT(vp^.v_mount);
Assert((flags and LK_TYPE_MASK)<>0,'vget: invalid lock operation');
if ((flags and LK_INTERLOCK)=0) then
begin
VI_LOCK(vp);
end;
vholdl(vp);
error:=vn_lock(vp,flags or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
if (error<>0) then
begin
vdrop(vp);
Exit(error);
end;
if ((vp^.v_iflag and VI_DOOMED)<>0) and ((flags and LK_RETRY)=0) then
begin
Assert(false,'vget: vn_lock failed to return ENOENT');
end;
VI_LOCK(vp);
{ Upgrade our holdcnt to a usecount. }
v_upgrade_usecount(vp);
{
* We don't guarantee that any particular close will
* trigger inactive processing so just make a best effort
* here at preventing a reference to a removed file. If
* we don't succeed no harm is done.
}
if ((vp^.v_iflag and VI_OWEINACT)<>0) then
begin
if (VOP_ISLOCKED(vp)=LK_EXCLUSIVE) and
((flags and LK_NOWAIT)=0) then
begin
vinactive(vp);
end;
vp^.v_iflag:=vp^.v_iflag and (not VI_OWEINACT);
end;
VI_UNLOCK(vp);
Exit(0);
end;
{
* Increase the reference count of a vnode.
}
procedure vref(vp:p_vnode); public;
begin
VI_LOCK(vp);
v_incr_usecount(vp);
VI_UNLOCK(vp);
end;
{
* Return the reference count of a vnode.
*
* The results of this call are only guaranteed when some mechanism other
* than the VI lock is used to stop other processes from gaining references
* to the vnode. This may be the case if the caller holds the only reference.
* This is also useful when stale data is acceptable as race conditions may
* be accounted for by some other means.
}
function vrefcnt(vp:p_vnode):Integer;
begin
VI_LOCK(vp);
Result:=vp^.v_usecount;
VI_UNLOCK(vp);
end;
const
VPUTX_VRELE =1;
VPUTX_VPUT =2;
VPUTX_VUNREF=3;
procedure vputx(vp:p_vnode;func:Integer);
var
error:Integer;
begin
Assert(vp<>nil,'vputx: nil vp');
if (func=VPUTX_VUNREF) then
ASSERT_VOP_LOCKED(vp,'vunref')
else
if (func=VPUTX_VPUT) then
ASSERT_VOP_LOCKED(vp,'vput')
else
Assert(func=VPUTX_VRELE,'vputx: wrong func');
VFS_ASSERT_GIANT(vp^.v_mount);
VI_LOCK(vp);
{ Skip this v_writecount check if we're going to panic below. }
Assert((vp^.v_writecount < vp^.v_usecount) or (vp^.v_usecount < 1),'vputx: missed vn_close');
error:=0;
if (vp^.v_usecount > 1) or (((vp^.v_iflag and VI_DOINGINACT)<>0) and (vp^.v_usecount=1)) then
begin
if (func=VPUTX_VPUT) then
begin
VOP_UNLOCK(vp, 0);
end;
v_decr_usecount(vp);
Exit;
end;
if (vp^.v_usecount<>1) then
begin
Assert(false,'vputx: negative ref cnt');
end;
{
* We want to hold the vnode until the inactive finishes to
* prevent vgone() races. We drop the use count here and the
* hold count below when we're done.
}
v_decr_useonly(vp);
{
* We must call VOP_INACTIVE with the node locked. Mark
* as VI_DOINGINACT to avoid recursion.
}
vp^.v_iflag:=vp^.v_iflag or VI_OWEINACT;
case (func) of
VPUTX_VRELE:
begin
error:=vn_lock(vp, LK_EXCLUSIVE or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
VI_LOCK(vp);
end;
VPUTX_VPUT:
begin
if (VOP_ISLOCKED(vp)<>LK_EXCLUSIVE) then
begin
error:=VOP_LOCK(vp, LK_UPGRADE or LK_INTERLOCK or LK_NOWAIT,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
VI_LOCK(vp);
end;
end;
VPUTX_VUNREF:
begin
if (VOP_ISLOCKED(vp)<>LK_EXCLUSIVE) then
begin
error:=VOP_LOCK(vp, LK_TRYUPGRADE or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
VI_LOCK(vp);
end;
end;
end;
if (vp^.v_usecount > 0) then
begin
vp^.v_iflag:=vp^.v_iflag and (not VI_OWEINACT);
end;
if (error=0) then
begin
if ((vp^.v_iflag and VI_OWEINACT)<>0) then
begin
vinactive(vp);
end;
if (func<>VPUTX_VUNREF) then
begin
VOP_UNLOCK(vp, 0);
end;
end;
vdropl(vp);
end;
{
* Vnode put/release.
* If count drops to zero, call inactive routine and return to freelist.
}
procedure vrele(vp:p_vnode);
begin
vputx(vp, VPUTX_VRELE);
end;
{
* Release an already locked vnode. This gives the same effect as
* unlock+vrele(), but takes less time and avoids releasing and
* re-acquiring the lock (as vrele() acquires the lock internally.)
}
procedure vput(vp:p_vnode);
begin
vputx(vp, VPUTX_VPUT);
end;
{
* Release an exclusively locked vnode. Do not unlock the vnode lock.
}
procedure vunref(vp:p_vnode);
begin
vputx(vp, VPUTX_VUNREF);
end;
{
* Somebody doesn't want the vnode recycled.
}
procedure vhold(vp:p_vnode);
begin
VI_LOCK(vp);
vholdl(vp);
VI_UNLOCK(vp);
end;
{
* Increase the hold count and activate if this is the first reference.
}
procedure vholdl(vp:p_vnode);
var
mp:p_mount;
begin
Inc(vp^.v_holdcnt);
if (not VSHOULDBUSY(vp)) then Exit;
ASSERT_VI_LOCKED(vp,'vholdl');
Assert((vp^.v_iflag and VI_FREE)<>0,'vnode not free');
Assert(vp^.v_op<>nil,'vholdl: vnode already reclaimed.');
{
* Remove a vnode from the free list, mark it as in use,
* and put it on the active list.
}
mtx_lock(vnode_free_list_mtx);
TAILQ_REMOVE(@vnode_free_list,vp,@vp^.v_actfreelist);
Dec(freevnodes);
vp^.v_iflag:=vp^.v_iflag and (not (VI_FREE or VI_AGE));
Assert((vp^.v_iflag and VI_ACTIVE)=0,'Activating already active vnode');
vp^.v_iflag:=vp^.v_iflag or VI_ACTIVE;
mp:=vp^.v_mount;
TAILQ_INSERT_HEAD(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
Inc(mp^.mnt_activevnodelistsize);
mtx_unlock(vnode_free_list_mtx);
end;
{
* Note that there is one less who cares about this vnode.
* vdrop() is the opposite of vhold().
}
procedure vdrop(vp:p_vnode);
begin
VI_LOCK(vp);
vdropl(vp);
end;
{
* Drop the hold count of the vnode. If this is the last reference to
* the vnode we place it on the free list unless it has been vgone'd
* (marked VI_DOOMED) in which case we will free it.
}
procedure vdropl(vp:p_vnode);
var
//struct bufobj *bo;
mp:p_mount;
active:Integer;
begin
ASSERT_VI_LOCKED(vp,'vdropl');
if (vp^.v_holdcnt <= 0) then
begin
Assert(false,'vdrop: holdcnt');
end;
Dec(vp^.v_holdcnt);
if (vp^.v_holdcnt > 0) then
begin
VI_UNLOCK(vp);
Exit;
end;
if ((vp^.v_iflag and VI_DOOMED)=0) then
begin
{
* Mark a vnode as free: remove it from its active list
* and put it up for recycling on the freelist.
}
Assert(vp^.v_op<>nil,'vdropl: vnode already reclaimed.');
Assert((vp^.v_iflag and VI_FREE)=0,'vnode already free');
Assert(VSHOULDFREE(vp),'vdropl: freeing when we shouldnt');
active:=vp^.v_iflag and VI_ACTIVE;
vp^.v_iflag:=vp^.v_iflag and (not VI_ACTIVE);
mp:=vp^.v_mount;
mtx_lock(vnode_free_list_mtx);
if (active<>0) then
begin
TAILQ_REMOVE(@mp^.mnt_activevnodelist,vp,@vp^.v_actfreelist);
Dec(mp^.mnt_activevnodelistsize);
end;
if ((vp^.v_iflag and VI_AGE)<>0) then
begin
TAILQ_INSERT_HEAD(@vnode_free_list,vp,@vp^.v_actfreelist);
end else
begin
TAILQ_INSERT_TAIL(@vnode_free_list,vp,@vp^.v_actfreelist);
end;
Inc(freevnodes);
vp^.v_iflag:=vp^.v_iflag and (not VI_AGE);
vp^.v_iflag:=vp^.v_iflag or VI_FREE;
mtx_unlock(vnode_free_list_mtx);
VI_UNLOCK(vp);
Exit;
end;
{
* The vnode has been marked for destruction, so free it.
}
System.InterlockedDecrement64(numvnodes);
//bo:=@vp^.v_bufobj;
Assert((vp^.v_iflag and VI_FREE)=0,'cleaned vnode still on the free list.');
Assert(vp^.v_data=nil, 'cleaned vnode isnt');
Assert(vp^.v_holdcnt=0, 'Non-zero hold count');
Assert(vp^.v_usecount=0, 'Non-zero use count');
Assert(vp^.v_writecount=0, 'Non-zero write count');
//Assert(bo^.bo_numoutput=0, 'Clean vnode has pending I/Os');
//Assert(bo^.bo_clean.bv_cnt=0, 'cleanbufcnt not 0');
//Assert(bo^.bo_clean.bv_root=nil, 'cleanblkroot not nil');
//Assert(bo^.bo_dirty.bv_cnt=0, 'dirtybufcnt not 0');
//Assert(bo^.bo_dirty.bv_root=nil, 'dirtyblkroot not nil');
//Assert(TAILQ_EMPTY(@vp^.v_cache_dst), 'vp has namecache dst');
//Assert(LIST_EMPTY(@vp^.v_cache_src), 'vp has namecache src');
//Assert(vp^.v_cache_dd=nil, 'vp has namecache for ..');
VI_UNLOCK(vp);
//mac_vnode_destroy(vp);
if (vp^.v_pollinfo<>nil) then
begin
destroy_vpollinfo(vp^.v_pollinfo);
end;
{ XXX Elsewhere we detect an already freed vnode via nil v_op. }
vp^.v_op:=nil;
rangelock_destroy(@vp^.v_rl);
//lockdestroy(vp^.v_vnlock);
mtx_destroy(vp^.v_vnlock^);
mtx_destroy(vp^.v_interlock);
mtx_destroy(vp^.v_lock);
//mtx_destroy(BO_MTX(bo));
FreeMem(vp);
end;
{
* Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
* flags. DOINGINACT prevents us from recursing in calls to vinactive.
* OWEINACT tracks whether a vnode missed a call to inactive due to a
* failed lock upgrade.
}
procedure vinactive(vp:p_vnode);
var
obj:vm_object_t;
begin
ASSERT_VOP_ELOCKED(vp,'vinactive');
ASSERT_VI_LOCKED(vp,'vinactive');
Assert((vp^.v_iflag and VI_DOINGINACT)=0,'vinactive: recursed on VI_DOINGINACT');
vp^.v_iflag:=vp^.v_iflag or VI_DOINGINACT;
vp^.v_iflag:=vp^.v_iflag and (not VI_OWEINACT);
VI_UNLOCK(vp);
{
* Before moving off the active list, we must be sure that any
* modified pages are on the vnode's dirty list since these will
* no longer be checked once the vnode is on the inactive list.
* Because the vnode vm object keeps a hold reference on the vnode
* if there is at least one resident non-cached page, the vnode
* cannot leave the active list without the page cleanup done.
}
obj:=vp^.v_object;
if (obj<>nil) then
if ((obj^.flags and OBJ_MIGHTBEDIRTY)<>0) then
begin
VM_OBJECT_LOCK(obj);
vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
VM_OBJECT_UNLOCK(obj);
end;
VOP_INACTIVE(vp);
VI_LOCK(vp);
Assert((vp^.v_iflag and VI_DOINGINACT)<>0,'vinactive: lost VI_DOINGINACT');
vp^.v_iflag:=vp^.v_iflag and (not VI_DOINGINACT);
end;
{
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active ones,
* return an error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*
* `rootrefs' specifies the base reference count for the root vnode
* of this filesystem. The root vnode is considered busy if its
* v_usecount exceeds this value. On a successful return, vflush()
* will call vrele() on the root vnode exactly rootrefs times.
* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
* be zero.
}
function vflush(mp:p_mount;rootrefs,flags:Integer):Integer;
label
loop;
var
vp,mvp,rootvp:p_vnode;
vattr:t_vattr;
busy,error:Integer;
begin
rootvp:=nil;
busy:=0;
if (rootrefs > 0) then
begin
Assert((flags and (SKIPSYSTEM or WRITECLOSE))=0,'vflush: bad args');
{
* Get the filesystem root vnode. We can vput() it
* immediately, since with rootrefs > 0, it won't go away.
}
error:=VFS_ROOT(mp, LK_EXCLUSIVE, @rootvp);
if (error<>0) then
begin
Exit(error);
end;
vput(rootvp);
end;
loop:
vp:=__mnt_vnode_first_all(@mvp,mp);
while (vp<>nil) do
begin
vholdl(vp);
error:=vn_lock(vp, LK_INTERLOCK or LK_EXCLUSIVE,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
if (error<>0) then
begin
vdrop(vp);
//MNT_VNODE_FOREACH_ALL_ABORT
MNT_ILOCK(mp);
__mnt_vnode_markerfree_all(@mvp,mp);
//MNT_VNODE_FOREACH_ALL_ABORT
goto loop;
end;
{
* Skip over vnodes marked VV_SYSTEM.
}
if (((flags and SKIPSYSTEM)<>0) and ((vp^.v_vflag and VV_SYSTEM)<>0)) then
begin
VOP_UNLOCK(vp, 0);
vdrop(vp);
//
vp:=__mnt_vnode_next_all(@mvp,mp);
continue;
end;
{
* If WRITECLOSE is set, flush out unlinked but still open
* files (even if open only for reading) and regular file
* vnodes open for writing.
}
if ((flags and WRITECLOSE)<>0) then
begin
if (vp^.v_object<>nil) then
begin
VM_OBJECT_LOCK(vp^.v_object);
vm_object_page_clean(vp^.v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp^.v_object);
end;
error:=VOP_FSYNC(vp, MNT_WAIT);
if (error<>0) then
begin
VOP_UNLOCK(vp, 0);
vdrop(vp);
//MNT_VNODE_FOREACH_ALL_ABORT
MNT_ILOCK(mp);
__mnt_vnode_markerfree_all(@mvp,mp);
//MNT_VNODE_FOREACH_ALL_ABORT
Exit(error);
end;
error:=VOP_GETATTR(vp, @vattr);
VI_LOCK(vp);
if ((vp^.v_type=VNON) or
((error=0) and (vattr.va_nlink > 0))) and
((vp^.v_writecount=0) or (vp^.v_type<>VREG)) then
begin
VOP_UNLOCK(vp, 0);
vdropl(vp);
//
vp:=__mnt_vnode_next_all(@mvp,mp);
continue;
end;
end else
VI_LOCK(vp);
{
* With v_usecount=0, all we need to do is clear out the
* vnode data structures and we are done.
*
* If FORCECLOSE is set, forcibly close the vnode.
}
 if (vp^.v_usecount=0) or ((flags and FORCECLOSE)<>0) then
begin
Assert((vp^.v_usecount=0) or
((vp^.v_type<>VCHR) and (vp^.v_type<>VBLK)),'device VNODE %p is FORCECLOSED');
vgonel(vp);
end else
begin
Inc(busy);
end;
VOP_UNLOCK(vp, 0);
vdropl(vp);
//
vp:=__mnt_vnode_next_all(@mvp,mp);
end;
if (rootrefs > 0) and ((flags and FORCECLOSE)=0) then
begin
{
* If just the root vnode is busy, and if its refcount
* is equal to `rootrefs', then go ahead and kill it.
}
VI_LOCK(rootvp);
Assert(busy > 0, 'vflush: not busy');
Assert(rootvp^.v_usecount >= rootrefs,'vflush: usecount %d < rootrefs %d');
if (busy=1) and (rootvp^.v_usecount=rootrefs) then
begin
VOP_LOCK(rootvp, LK_EXCLUSIVE or LK_INTERLOCK,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
vgone(rootvp);
VOP_UNLOCK(rootvp, 0);
busy:=0;
end else
VI_UNLOCK(rootvp);
end;
if (busy<>0) then
begin
Exit(EBUSY);
end;
while (rootrefs > 0) do
begin
vrele(rootvp);
Dec(rootrefs);
end;
Exit(0);
end;
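{
 * Illustrative usage sketch (comment only, not compiled): a typical
 * caller is a filesystem unmount routine. Only vflush() and FORCECLOSE
 * come from this unit; example_unmount, MNT_FORCE and the assumption
 * that the mount keeps one extra reference on its root vnode (hence
 * rootrefs=1) are for illustration only.
 *
 *  function example_unmount(mp:p_mount;mntflags:Integer):Integer;
 *  var
 *   flags,error:Integer;
 *  begin
 *   flags:=0;
 *   if ((mntflags and MNT_FORCE)<>0) then
 *   begin
 *    flags:=flags or FORCECLOSE;      // detach even active vnodes
 *   end;
 *   error:=vflush(mp, 1, flags);      // vrele()s the root vnode once on success
 *   Exit(error);
 *  end;
 }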
{
* Recycle an unused vnode to the front of the free list.
}
function vrecycle(vp:p_vnode):Integer;
var
recycled:Integer;
begin
ASSERT_VOP_ELOCKED(vp, 'vrecycle');
recycled:=0;
VI_LOCK(vp);
if (vp^.v_usecount=0)then
begin
recycled:=1;
vgonel(vp);
end;
VI_UNLOCK(vp);
Exit(recycled);
end;
{
* Eliminate all activity associated with a vnode
* in preparation for reuse.
}
procedure vgone(vp:p_vnode);
begin
VI_LOCK(vp);
vgonel(vp);
VI_UNLOCK(vp);
end;
{
* vgone, with the vp interlock held.
}
procedure vgonel(vp:p_vnode);
var
oweinact:Integer;
active:Integer;
mp:p_mount;
begin
ASSERT_VOP_ELOCKED(vp, 'vgonel');
ASSERT_VI_LOCKED(vp, 'vgonel');
Assert(vp^.v_holdcnt<>0,'vgonel: vp %p has no reference.');
{
* Don't vgonel if we're already doomed.
}
if ((vp^.v_iflag and VI_DOOMED)<>0) then Exit;
vp^.v_iflag:=vp^.v_iflag or VI_DOOMED;
{
* Check to see if the vnode is in use. If so, we have to call
* VOP_CLOSE() and VOP_INACTIVE().
}
active:=vp^.v_usecount;
oweinact:=(vp^.v_iflag and VI_OWEINACT);
VI_UNLOCK(vp);
{
* Clean out any buffers associated with the vnode.
* If the flush fails, just toss the buffers.
}
mp:=nil;
//if (not TAILQ_EMPTY(@vp^.v_bufobj.bo_dirty.bv_hd)) then
// vn_start_secondary_write(vp, &mp, V_WAIT);
if (vinvalbuf(vp, V_SAVE, 0, 0)<>0) then
begin
vinvalbuf(vp, 0, 0, 0);
end;
{
* If purging an active vnode, it must be closed and
* deactivated before being reclaimed.
}
if (active<>0) then
begin
VOP_CLOSE(vp, FNONBLOCK);
end;
if (oweinact<>0) or (active<>0) then
begin
VI_LOCK(vp);
if ((vp^.v_iflag and VI_DOINGINACT)=0) then
begin
vinactive(vp);
end;
VI_UNLOCK(vp);
end;
//if (vp^.v_type=VSOCK) then
// vfs_unp_reclaim(vp);
{
* Reclaim the vnode.
}
if (VOP_RECLAIM(vp)<>0) then
begin
Assert(false,'vgone: cannot reclaim');
end;
//if (mp<>nil) then
// vn_finished_secondary_write(mp);
//Assert(vp^.v_object=nil,'vop_reclaim left v_object vp=%p, tag=%s'));
{
* Clear the advisory locks and wake up waiting threads.
}
VOP_ADVLOCKPURGE(vp);
{
* Delete from old mount point vnode list.
}
delmntque(vp);
//cache_purge(vp);
{
* Done with purge, reset to the standard lock and invalidate
* the vnode.
}
VI_LOCK(vp);
vp^.v_vnlock:=@vp^.v_lock;
vp^.v_op :=@dead_vnodeops;
vp^.v_tag :='none';
vp^.v_type :=VBAD;
end;
{
* Calculate the total number of references to a special device.
}
function vcount(vp:p_vnode):Integer;
begin
dev_lock();
Result:=p_cdev(vp^.v_rdev)^.si_usecount;
dev_unlock();
end;
{
 * Same as above, but using the struct cdev * as the argument.
}
function count_dev(dev:Pointer):Integer; //cdev
begin
dev_lock();
Result:=p_cdev(dev)^.si_usecount;
dev_unlock();
end;
{
 * Perform msync on all vnodes under a mount point;
 * the mount point must be locked.
}
procedure vfs_msync(mp:p_mount;flags:Integer);
var
vp,mvp:p_vnode;
obj:vm_object_t;
begin
vp:=__mnt_vnode_first_active(@mvp,mp);
While (vp<>nil) do
begin
obj:=vp^.v_object;
if (obj<>nil) and
((obj^.flags and OBJ_MIGHTBEDIRTY)<>0) and
((flags=MNT_WAIT) or (VOP_ISLOCKED(vp)=0)) then
begin
if (vget(vp, LK_EXCLUSIVE or LK_RETRY or LK_INTERLOCK)=0) then
begin
if ((vp^.v_vflag and VV_NOSYNC)<>0) then
begin { unlinked }
vput(vp);
//
vp:=__mnt_vnode_next_active(@mvp,mp);
continue;
end;
obj:=vp^.v_object;
if (obj<>nil) then
begin
VM_OBJECT_LOCK(obj);
if (flags=MNT_WAIT) then
begin
vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
end else
begin
vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
end;
VM_OBJECT_UNLOCK(obj);
end;
vput(vp);
end;
end else
begin
VI_UNLOCK(vp);
end;
//
vp:=__mnt_vnode_next_active(@mvp,mp);
end;
end;
procedure destroy_vpollinfo_free(vi:p_vpollinfo);
begin
knlist_destroy(@vi^.vpi_selinfo.si_note);
mtx_destroy(vi^.vpi_lock);
FreeMem(vi);
end;
procedure destroy_vpollinfo(vi:p_vpollinfo);
begin
knlist_clear(@vi^.vpi_selinfo.si_note, 1);
seldrain(@vi^.vpi_selinfo);
destroy_vpollinfo_free(vi);
end;
procedure vfs_knllock(arg:Pointer); forward;
procedure vfs_knlunlock(arg:Pointer); forward;
procedure vfs_knl_assert_locked(arg:Pointer); forward;
procedure vfs_knl_assert_unlocked(arg:Pointer); forward;
{
 * Initialize per-vnode helper structure to hold poll-related state.
}
procedure v_addpollinfo(vp:p_vnode);
var
vi:p_vpollinfo;
begin
if (vp^.v_pollinfo<>nil) then
begin
Exit;
end;
vi:=AllocMem(SizeOf(vpollinfo));
mtx_init(vi^.vpi_lock,'vnode pollinfo');
knlist_init(@vi^.vpi_selinfo.si_note, vp, @vfs_knllock, @vfs_knlunlock, @vfs_knl_assert_locked, @vfs_knl_assert_unlocked);
VI_LOCK(vp);
if (vp^.v_pollinfo<>nil) then
begin
VI_UNLOCK(vp);
destroy_vpollinfo_free(vi);
Exit;
end;
vp^.v_pollinfo:=vi;
VI_UNLOCK(vp);
end;
{
* Record a process's interest in events which might happen to
* a vnode. Because poll uses the historic select-style interface
* internally, this routine serves as both the ``check for any
* pending events'' and the ``record my interest in future events''
* functions. (These are done together, while the lock is held,
* to avoid race conditions.)
}
function vn_pollrecord(vp:p_vnode;events:Integer):Integer;
begin
v_addpollinfo(vp);
mtx_lock(vp^.v_pollinfo^.vpi_lock);
if ((vp^.v_pollinfo^.vpi_revents and events)<>0) then
begin
{
* This leaves events we are not interested
 * in available for the other process, which
 * presumably had requested them
* (otherwise they would never have been
* recorded).
}
events:=events and vp^.v_pollinfo^.vpi_revents;
vp^.v_pollinfo^.vpi_revents:=vp^.v_pollinfo^.vpi_revents and (not events);
mtx_unlock(vp^.v_pollinfo^.vpi_lock);
Exit(events);
end;
vp^.v_pollinfo^.vpi_events:=vp^.v_pollinfo^.vpi_events or events;
selrecord(curkthread, @vp^.v_pollinfo^.vpi_selinfo);
mtx_unlock(vp^.v_pollinfo^.vpi_lock);
Exit(0);
end;
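{
 * Illustrative usage sketch (comment only, not compiled): a vop_poll-style
 * handler that answers the standard events immediately and only records
 * interest in anything unusual. example_vop_poll and the POLL* constants
 * are assumptions for illustration; vn_pollrecord() is the only name
 * taken from this unit.
 *
 *  function example_vop_poll(vp:p_vnode;events:Integer):Integer;
 *  begin
 *   if ((events and (not POLLSTANDARD))<>0) then
 *    Exit(vn_pollrecord(vp, events))   // check-and-record under vpi_lock
 *   else
 *    Exit(events and (POLLIN or POLLOUT or POLLRDNORM or POLLWRNORM));
 *  end;
 }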
{
* Routine to create and manage a filesystem syncer vnode.
}
{
#define sync_close ((int (*)(struct vop_close_args *))nilop)
static int sync_fsync(struct vop_fsync_args *);
static int sync_inactive(struct vop_inactive_args *);
static int sync_reclaim(struct vop_reclaim_args *);
static struct vop_vector sync_vnodeops:=begin
.vop_bypass:=VOP_EOPNOTSUPP,
.vop_close:=sync_close, { close }
.vop_fsync:=sync_fsync, { fsync }
.vop_inactive:=sync_inactive, { inactive }
.vop_reclaim:=sync_reclaim, { reclaim }
.vop_lock1:=vop_stdlock, { lock }
.vop_unlock:=vop_stdunlock, { unlock }
.vop_islocked:=vop_stdislocked, { islocked }
end;
{
* Create a new filesystem syncer vnode for the specified mount point.
}
void
vfs_allocate_syncvnode(mp:p_mount)
begin
vp:p_vnode;
struct bufobj *bo;
static long start, incr, next;
int error;
{ Allocate a new vnode }
error:=getnewvnode'syncer", mp, &sync_vnodeops, &vp);
if (error<>0)
panic'vfs_allocate_syncvnode: getnewvnode() failed';
vp^.v_type:=VNON;
vn_lock(vp, LK_EXCLUSIVE or LK_RETRY);
vp^.v_vflag:= or VV_FORCEINSMQ;
error:=insmntque(vp, mp);
if (error<>0)
panic'vfs_allocate_syncvnode: insmntque() failed';
vp^.v_vflag:= and ~VV_FORCEINSMQ;
VOP_UNLOCK(vp, 0);
{
* Place the vnode onto the syncer worklist. We attempt to
* scatter them about on the list so that they will go off
* at evenly distributed times even if all the filesystems
* are mounted at once.
}
next += incr;
if (next=0 or next > syncer_maxdelay) begin
start /= 2;
incr /= 2;
if (start=0) begin
start:=syncer_maxdelay div 2;
incr:=syncer_maxdelay;
end;
next:=start;
end;
bo:=@vp^.v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
{ XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. }
mtx_lock(@sync_mtx);
sync_vnode_count++;
if (mp^.mnt_syncer=nil) begin
mp^.mnt_syncer:=vp;
vp:=nil;
end;
mtx_unlock(@sync_mtx);
BO_UNLOCK(bo);
if (vp<>nil) begin
vn_lock(vp, LK_EXCLUSIVE or LK_RETRY);
vgone(vp);
vput(vp);
end;
end;
void
vfs_deallocate_syncvnode(mp:p_mount)
begin
vp:p_vnode;
mtx_lock(@sync_mtx);
vp:=mp^.mnt_syncer;
if (vp<>nil)
mp^.mnt_syncer:=nil;
mtx_unlock(@sync_mtx);
if (vp<>nil)
vrele(vp);
end;
{
* Do a lazy sync of the filesystem.
}
static int
sync_fsync(struct vop_fsync_args *ap)
begin
struct vnode *syncvp:=ap^.a_vp;
mp:p_mount:=syncvp^.v_mount;
int error, save;
struct bufobj *bo;
{
* We only need to do something if this is a lazy evaluation.
}
if (ap^.a_waitfor<>MNT_LAZY)
Exit(0);
{
* Move ourselves to the back of the sync list.
}
bo:=@syncvp^.v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay);
BO_UNLOCK(bo);
{
* Walk the list of vnodes pushing all that are dirty and
* not already on the sync list.
}
if (vfs_busy(mp, MBF_NOWAIT)<>0)
Exit(0);
if (vn_start_write(nil, &mp, V_NOWAIT)<>0) begin
vfs_unbusy(mp);
Exit(0);
end;
save:=curthread_pflags_set(TDP_SYNCIO);
vfs_msync(mp, MNT_NOWAIT);
error:=VFS_SYNC(mp, MNT_LAZY);
curthread_pflags_restore(save);
vn_finished_write(mp);
vfs_unbusy(mp);
Exit(error);
end;
{
 * The syncer vnode is no longer referenced.
}
static int
sync_inactive(struct vop_inactive_args *ap)
begin
vgone(ap^.a_vp);
Exit(0);
end;
{
* The syncer vnode is no longer needed and is being decommissioned.
*
* Modifications to the worklist must be protected by sync_mtx.
}
static int
sync_reclaim(struct vop_reclaim_args *ap)
begin
vp:p_vnode:=ap^.a_vp;
struct bufobj *bo;
bo:=@vp^.v_bufobj;
BO_LOCK(bo);
mtx_lock(@sync_mtx);
if (vp^.v_mount^.mnt_syncer=vp)
vp^.v_mount^.mnt_syncer:=nil;
if (bo^.bo_flag and BO_ONWORKLST) begin
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
sync_vnode_count--;
bo^.bo_flag:= and ~BO_ONWORKLST;
end;
mtx_unlock(@sync_mtx);
BO_UNLOCK(bo);
Exit(0);
end;
}
{
* Check if vnode represents a disk device
}
function vn_isdisk(vp:p_vnode;errp:PInteger):Boolean;
var
error:Integer;
begin
error:=0;
dev_lock();
if (vp^.v_type<>VCHR) then
begin
error:=ENOTBLK
end else
if (vp^.v_rdev=nil) then
begin
error:=ENXIO
end else
if (p_cdev(vp^.v_rdev)^.si_devsw=nil) then
begin
error:=ENXIO
end else
if ((p_cdevsw(p_cdev(vp^.v_rdev)^.si_devsw)^.d_flags and D_DISK)=0) then
begin
error:=ENOTBLK;
end;
dev_unlock();
if (errp<>nil) then
begin
errp^:=error;
end;
Exit(error=0);
end;
{
* Common filesystem object access control check routine. Accepts a
 * vnode's type, "mode", uid and gid, requested access mode, and an
 * optional call-by-reference privused argument allowing vaccess()
* to indicate to the caller whether privilege was used to satisfy the
* request (obsoleted). Returns 0 on success, or an errno on failure.
}
function vaccess(_type:vtype;
file_mode:mode_t;
file_uid:uid_t;
file_gid:gid_t;
accmode:accmode_t;
privused:PInteger):Integer;
label
privcheck;
var
dac_granted :accmode_t;
priv_granted:accmode_t;
begin
Assert((accmode and (not (VEXEC or VWRITE or VREAD or VADMIN or VAPPEND)))=0,'invalid bit in accmode');
Assert(((accmode and VAPPEND)=0) or ((accmode and VWRITE)<>0),'VAPPEND without VWRITE');
{
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
}
if (privused<>nil) then
begin
privused^:=0;
end;
dac_granted:=0;
{ Check the owner. }
if {(cred^.cr_uid=file_uid)} True then
begin
dac_granted:=dac_granted or VADMIN;
if ((file_mode and S_IXUSR)<>0) then
dac_granted:=dac_granted or VEXEC;
if ((file_mode and S_IRUSR)<>0) then
dac_granted:=dac_granted or VREAD;
if ((file_mode and S_IWUSR)<>0) then
dac_granted:=dac_granted or (VWRITE or VAPPEND);
if ((accmode and dac_granted)=accmode) then
begin
Exit(0);
end;
goto privcheck;
end;
{ Otherwise, check the groups (first match) }
if {(groupmember(file_gid, cred))} True then
begin
if ((file_mode and S_IXGRP)<>0) then
dac_granted:=dac_granted or VEXEC;
if ((file_mode and S_IRGRP)<>0) then
dac_granted:=dac_granted or VREAD;
if ((file_mode and S_IWGRP)<>0) then
dac_granted:=dac_granted or (VWRITE or VAPPEND);
if ((accmode and dac_granted)=accmode) then
begin
Exit(0);
end;
goto privcheck;
end;
{ Otherwise, check everyone else. }
if ((file_mode and S_IXOTH)<>0) then
dac_granted:=dac_granted or VEXEC;
if ((file_mode and S_IROTH)<>0) then
dac_granted:=dac_granted or VREAD;
if ((file_mode and S_IWOTH)<>0) then
dac_granted:=dac_granted or (VWRITE or VAPPEND);
if ((accmode and dac_granted)=accmode) then
begin
Exit(0);
end;
privcheck:
{
* Build a privilege mask to determine if the set of privileges
* satisfies the requirements when combined with the granted mask
* from above. For each privilege, if the privilege is required,
* bitwise or the request type onto the priv_granted mask.
}
priv_granted:=0;
if (_type=VDIR) then
begin
{
* For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
* requests, instead of PRIV_VFS_EXEC.
}
if ((accmode and VEXEC)<>0) and
((dac_granted and VEXEC)=0) {and
(priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)=0)} then
begin
priv_granted:=priv_granted or VEXEC;
end;
end else
begin
{
* Ensure that at least one execute bit is on. Otherwise,
* a privileged user will always succeed, and we don't want
* this to happen unless the file really is executable.
}
if ((accmode and VEXEC)<>0) and
((dac_granted and VEXEC)=0) and
((file_mode and (S_IXUSR or S_IXGRP or S_IXOTH))<>0) {and
(priv_check_cred(cred, PRIV_VFS_EXEC, 0)=0)} then
begin
priv_granted:=priv_granted or VEXEC;
end;
end;
if ((accmode and VREAD)<>0) and
((dac_granted and VREAD)=0) {and
(priv_check_cred(cred, PRIV_VFS_READ, 0)=0)} then
begin
priv_granted:=priv_granted or VREAD;
end;
if ((accmode and VWRITE)<>0) and
((dac_granted and VWRITE)=0) {and
(priv_check_cred(cred, PRIV_VFS_WRITE, 0)=0)} then
begin
priv_granted:=priv_granted or (VWRITE or VAPPEND);
end;
if ((accmode and VADMIN)<>0) and
((dac_granted and VADMIN)=0) {and
(priv_check_cred(cred, PRIV_VFS_ADMIN, 0)=0)} then
begin
priv_granted:=priv_granted or VADMIN;
end;
if ((accmode and (priv_granted or dac_granted))=accmode) then
begin
{ XXX audit: privilege used }
if (privused<>nil) then
begin
privused^:=1;
end;
Exit(0);
end;
if ((accmode and VADMIN)<>0) then
Exit(EPERM)
else
Exit(EACCES);
end;
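{
 * Illustrative usage sketch (comment only, not compiled): a vop_access-style
 * handler that feeds file attributes into vaccess(). example_vop_access is
 * an assumption for illustration, and the va_mode/va_uid/va_gid field names
 * assume the usual vattr layout; vaccess() and VOP_GETATTR() are the only
 * names taken from this unit.
 *
 *  function example_vop_access(vp:p_vnode;accmode:accmode_t):Integer;
 *  var
 *   va:t_vattr;
 *   error:Integer;
 *  begin
 *   error:=VOP_GETATTR(vp, @va);
 *   if (error<>0) then Exit(error);
 *   Exit(vaccess(vp^.v_type, va.va_mode, va.va_uid, va.va_gid, accmode, nil));
 *  end;
 }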
procedure vfs_badlock(msg,str:PChar;vp:p_vnode);
begin
Writeln(msg,' ',str);
Assert(false,RawByteString(msg)+' '+RawByteString(str));
end;
procedure assert_vi_locked(vp:p_vnode;str:PChar);
begin
if {vfs_badlock_mutex and} (not mtx_owned(VI_MTX(vp)^)) then
vfs_badlock('interlock is not locked but should be', str, vp);
end;
procedure assert_vi_unlocked(vp:p_vnode;str:PChar);
begin
if {vfs_badlock_mutex and} mtx_owned(VI_MTX(vp)^) then
vfs_badlock('interlock is locked but should not be', str, vp);
end;
procedure assert_vop_locked(vp:p_vnode;str:PChar);
var
locked:Integer;
begin
if (not IGNORE_LOCK(vp)) then
begin
locked:=VOP_ISLOCKED(vp);
if (locked=0) or (locked=LK_EXCLOTHER) then
vfs_badlock('is not locked but should be', str, vp);
end;
end;
procedure assert_vop_unlocked(vp:p_vnode;str:PChar);
begin
if (not IGNORE_LOCK(vp)) and (VOP_ISLOCKED(vp)=LK_EXCLUSIVE) then
vfs_badlock('is locked but should not be', str, vp);
end;
procedure assert_vop_elocked(vp:p_vnode;str:PChar);
begin
if (not IGNORE_LOCK(vp)) and (VOP_ISLOCKED(vp)<>LK_EXCLUSIVE) then
vfs_badlock('is not exclusive locked but should be', str, vp);
end;
function VOP_WRITE_PRE(ap:p_vop_write_args;var osize,ooffset:Int64):Integer; public;
var
va:t_vattr;
error:Integer;
begin
Result :=0;
osize :=0;
ooffset:=0;
if (not VN_KNLIST_EMPTY(ap^.a_vp)) then
begin
error:=VOP_GETATTR(ap^.a_vp, @va);
if (error<>0) then Exit(error);
ooffset:=ap^.a_uio^.uio_offset;
osize:=va.va_size;
end;
end;
procedure VOP_WRITE_POST(ap:p_vop_write_args;ret:Integer;var osize,ooffset:Int64); public;
var
noffset:Int64;
begin
noffset:=ap^.a_uio^.uio_offset;
if (noffset>ooffset) and (not VN_KNLIST_EMPTY(ap^.a_vp)) then
begin
if (noffset>osize) then
begin
VFS_KNOTE_LOCKED(ap^.a_vp, NOTE_WRITE or NOTE_EXTEND);
end else
begin
VFS_KNOTE_LOCKED(ap^.a_vp, NOTE_WRITE);
end;
end;
end;
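{
 * Illustrative usage sketch (comment only, not compiled): the intended
 * bracket around a write implementation, so kevent listeners receive
 * NOTE_WRITE (and NOTE_EXTEND on growth) after a successful write.
 * do_write is a placeholder for the real filesystem write routine.
 *
 *  function example_write(ap:p_vop_write_args):Integer;
 *  var
 *   osize,ooffset:Int64;
 *   error:Integer;
 *  begin
 *   error:=VOP_WRITE_PRE(ap, osize, ooffset);    // snapshot size and offset
 *   if (error<>0) then Exit(error);
 *   error:=do_write(ap);                         // placeholder
 *   VOP_WRITE_POST(ap, error, osize, ooffset);   // post the knotes
 *   Exit(error);
 *  end;
 }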
procedure vop_rename_fail(ap:p_vop_rename_args);
begin
if (ap^.a_tvp<>nil) then
vput(ap^.a_tvp);
if (ap^.a_tdvp=ap^.a_tvp) then
vrele(ap^.a_tdvp)
else
vput(ap^.a_tdvp);
vrele(ap^.a_fdvp);
vrele(ap^.a_fvp);
end;
procedure vop_rename_pre(ap:p_vop_rename_args); public;
begin
if (ap^.a_tdvp<>ap^.a_fdvp) then
vhold(ap^.a_fdvp);
if (ap^.a_tvp<>ap^.a_fvp) then
vhold(ap^.a_fvp);
vhold(ap^.a_tdvp);
if (ap^.a_tvp<>nil) then
vhold(ap^.a_tvp);
end;
procedure vop_create_post(ap:p_vop_create_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
end;
end;
procedure vop_link_post(ap:p_vop_link_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_vp , NOTE_LINK);
VFS_KNOTE_LOCKED(ap^.a_tdvp, NOTE_WRITE);
end;
end;
procedure vop_mkdir_post(ap:p_vop_mkdir_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE or NOTE_LINK);
end;
end;
procedure vop_mknod_post(ap:p_vop_mknod_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
end;
end;
procedure vop_remove_post(ap:p_vop_remove_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
VFS_KNOTE_LOCKED(ap^.a_vp , NOTE_DELETE);
end;
end;
procedure vop_rename_post(ap:p_vop_rename_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_UNLOCKED(ap^.a_fdvp, NOTE_WRITE);
VFS_KNOTE_UNLOCKED(ap^.a_tdvp, NOTE_WRITE);
VFS_KNOTE_UNLOCKED(ap^.a_fvp , NOTE_RENAME);
if (ap^.a_tvp<>nil) then
begin
VFS_KNOTE_UNLOCKED(ap^.a_tvp, NOTE_DELETE);
end;
end;
if (ap^.a_tdvp<>ap^.a_fdvp) then
begin
vdrop(ap^.a_fdvp);
end;
if (ap^.a_tvp<>ap^.a_fvp) then
begin
vdrop(ap^.a_fvp);
end;
vdrop(ap^.a_tdvp);
if (ap^.a_tvp<>nil) then
begin
vdrop(ap^.a_tvp);
end;
end;
procedure vop_rmdir_post(ap:p_vop_rmdir_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE or NOTE_LINK);
VFS_KNOTE_LOCKED(ap^.a_vp , NOTE_DELETE);
end;
end;
procedure vop_setattr_post(ap:p_vop_setattr_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_vp, NOTE_ATTRIB);
end;
end;
procedure vop_symlink_post(ap:p_vop_symlink_args;rc:Integer); public;
begin
if (rc=0) then
begin
VFS_KNOTE_LOCKED(ap^.a_dvp, NOTE_WRITE);
end;
end;
procedure vfs_event_init();
begin
knlist_init_mtx(@fs_knlist, nil);
end;
procedure vfs_event_signal(fsid:p_fsid;event:DWORD;data:ptrint);
begin
KNOTE_UNLOCKED(@fs_knlist, event);
end;
function filt_fsattach(kn:p_knote):Integer;
begin
kn^.kn_flags:=kn^.kn_flags or EV_CLEAR;
knlist_add(@fs_knlist, kn, 0);
Exit(0);
end;
procedure filt_fsdetach(kn:p_knote);
begin
knlist_remove(@fs_knlist, kn, 0);
end;
function filt_fsevent(kn:p_knote;hint:QWORD):Integer;
begin
kn^.kn_fflags:=kn^.kn_fflags or hint;
Exit(ord(kn^.kn_fflags<>0));
end;
{
static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
begin
struct vfsidctl vc;
int error;
mp:p_mount;
error:=SYSCTL_IN(req, &vc, sizeof(vc));
if (error)
Exit(error);
if (vc.vc_vers<>VFS_CTL_VERS1)
Exit(EINVAL);
mp:=vfs_getvfs(@vc.vc_fsid);
if (mp=nil)
Exit(ENOENT);
{ ensure that a specific sysctl goes to the right filesystem. }
if (strcmp(vc.vc_fstypename, "*'<>0 and
strcmp(vc.vc_fstypename, mp^.mnt_vfc^.vfc_name)<>0) begin
vfs_rel(mp);
Exit(EINVAL);
end;
VCTLTOREQ(@vc, req);
error:=VFS_SYSCTL(mp, vc.vc_op, req);
vfs_rel(mp);
Exit(error);
end;
SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE or CTLFLAG_WR, nil, 0, sysctl_vfs_ctl, "", "Sysctl by fsid';
{
* Function to initialize a va_filerev field sensibly.
* XXX: Wouldn't a random number make a lot more sense ??
}
u_quad_t
init_va_filerev(void)
begin
struct bintime bt;
getbinuptime(@bt);
Exit(((u_quad_t)bt.sec shl 32LL) or (bt.frac shr 32LL));
end;
}
procedure filt_vfsdetach(kn:p_knote); forward;
function filt_vfsread (kn:p_knote;hint:QWORD):Integer; forward;
function filt_vfswrite (kn:p_knote;hint:QWORD):Integer; forward;
function filt_vfsvnode (kn:p_knote;hint:QWORD):Integer; forward;
const
vfsread_filtops:t_filterops=(
f_isfd :1;
f_detach:@filt_vfsdetach;
f_event :@filt_vfsread;
);
vfswrite_filtops:t_filterops=(
f_isfd :1;
f_detach:@filt_vfsdetach;
f_event :@filt_vfswrite;
);
vfsvnode_filtops:t_filterops=(
f_isfd :1;
f_detach:@filt_vfsdetach;
f_event :@filt_vfsvnode;
);
procedure vfs_knllock(arg:Pointer);
begin
vn_lock(p_vnode(arg), LK_EXCLUSIVE or LK_RETRY,{$INCLUDE %FILE%},{$INCLUDE %LINENUM%});
end;
procedure vfs_knlunlock(arg:Pointer);
begin
VOP_UNLOCK(p_vnode(arg), 0);
end;
procedure vfs_knl_assert_locked(arg:Pointer);
begin
//
end;
procedure vfs_knl_assert_unlocked(arg:Pointer);
begin
//
end;
function vfs_kqfilter(ap:p_vop_kqfilter_args):Integer;
var
vp:p_vnode;
kn:p_knote;
knl:p_knlist;
begin
vp:=ap^.a_vp;
kn:=ap^.a_kn;
case (kn^.kn_filter) of
EVFILT_READ :kn^.kn_fop:=@vfsread_filtops;
EVFILT_WRITE:kn^.kn_fop:=@vfswrite_filtops;
EVFILT_VNODE:kn^.kn_fop:=@vfsvnode_filtops;
else
Exit(EINVAL);
end;
kn^.kn_hook:=vp;
v_addpollinfo(vp);
if (vp^.v_pollinfo=nil) then
begin
Exit(ENOMEM);
end;
knl:=@vp^.v_pollinfo^.vpi_selinfo.si_note;
vhold(vp);
knlist_add(knl, kn, 0);
Exit(0);
end;
{
* Detach knote from vnode
}
procedure filt_vfsdetach(kn:p_knote);
var
vp:p_vnode;
begin
vp:=kn^.kn_hook;
Assert(vp^.v_pollinfo<>nil, 'Missing v_pollinfo');
knlist_remove(@vp^.v_pollinfo^.vpi_selinfo.si_note, kn, 0);
vdrop(vp);
end;
{ARGSUSED}
function filt_vfsread(kn:p_knote;hint:QWORD):Integer;
var
vp:p_vnode;
va:t_vattr;
res:Integer;
begin
vp:=kn^.kn_hook;
{
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
}
if (hint=NOTE_REVOKE) then
begin
VI_LOCK(vp);
kn^.kn_flags:=kn^.kn_flags or (EV_EOF or EV_ONESHOT);
VI_UNLOCK(vp);
Exit(1);
end;
if (VOP_GETATTR(vp, @va)<>0) then
begin
Exit(0);
end;
VI_LOCK(vp);
kn^.kn_data:=va.va_size - p_file(kn^.kn_fp)^.f_offset;
res:=ord(kn^.kn_data<>0);
VI_UNLOCK(vp);
Exit(res);
end;
{ARGSUSED}
function filt_vfswrite(kn:p_knote;hint:QWORD):Integer;
var
vp:p_vnode;
begin
vp:=kn^.kn_hook;
VI_LOCK(vp);
{
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
}
if (hint=NOTE_REVOKE) then
begin
kn^.kn_flags:=kn^.kn_flags or (EV_EOF or EV_ONESHOT);
end;
kn^.kn_data:=0;
VI_UNLOCK(vp);
Exit(1);
end;
function filt_vfsvnode(kn:p_knote;hint:QWORD):Integer;
var
vp:p_vnode;
res:Integer;
begin
vp:=kn^.kn_hook;
VI_LOCK(vp);
if ((kn^.kn_sfflags and hint)<>0) then
begin
kn^.kn_fflags:=kn^.kn_fflags or hint;
end;
if (hint=NOTE_REVOKE) then
begin
kn^.kn_flags:=kn^.kn_flags or EV_EOF;
VI_UNLOCK(vp);
Exit(1);
end;
res:=ord(kn^.kn_fflags<>0);
VI_UNLOCK(vp);
Exit(res);
end;
function vfs_read_dirent(ap:p_vop_readdir_args;dp:p_dirent;off:QWORD):Integer;
var
error:Integer;
begin
if (dp^.d_reclen > ap^.a_uio^.uio_resid) then
begin
Exit(ENAMETOOLONG);
end;
error:=uiomove(dp, dp^.d_reclen, ap^.a_uio);
if (error<>0) then
begin
if (ap^.a_ncookies<>nil) then
begin
if (ap^.a_cookies<>nil) then
begin
FreeMem(ap^.a_cookies);
end;
ap^.a_cookies:=nil;
ap^.a_ncookies^:=0;
end;
Exit(error);
end;
if (ap^.a_ncookies=nil) then Exit(0);
Assert(ap^.a_cookies<>nil,'null ap^.a_cookies value with non-null ap^.a_ncookies!');
ap^.a_cookies^:=ReAllocMem(ap^.a_cookies^,(ap^.a_ncookies^ + 1) * sizeof(QWORD));
ap^.a_cookies^[ap^.a_ncookies^]:=off;
Inc(ap^.a_ncookies^);
Exit(0);
end;
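{
 * Illustrative usage sketch (comment only, not compiled): a vop_readdir
 * implementation fills a dirent record and hands it to vfs_read_dirent()
 * together with the seek offset to record as this entry's cookie. The
 * dirent field names and next_off are assumptions for illustration.
 *
 *  // dp:t_dirent; next_off:QWORD; -- filled in by the filesystem
 *  error:=vfs_read_dirent(ap, @dp, next_off);
 *  if (error<>0) then
 *   Exit(error);     // ENAMETOOLONG means the uio had no room for d_reclen
 }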
{
* Mark for update the access time of the file if the filesystem
* supports VOP_MARKATIME. This functionality is used by execve and
* mmap, so we want to avoid the I/O implied by directly setting
* va_atime for the sake of efficiency.
}
procedure vfs_mark_atime(vp:p_vnode);
var
mp:p_mount;
begin
mp:=vp^.v_mount;
VFS_ASSERT_GIANT(mp);
ASSERT_VOP_LOCKED(vp,'vfs_mark_atime');
if (mp<>nil) then
if ((mp^.mnt_flag and (MNT_NOATIME or MNT_RDONLY))=0) then
begin
VOP_MARKATIME(vp);
end;
end;
{
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
 * returns a nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
}
function vfs_unixify_accmode(accmode:p_accmode_t):Integer;
begin
{
* There is no way to specify explicit "deny" rule using
* file mode or POSIX.1e ACLs.
}
if ((accmode^ and VEXPLICIT_DENY)<>0) then
begin
accmode^:=0;
Exit(0);
end;
{
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
}
if ((accmode^ and (VDELETE_CHILD or VDELETE))<>0) then
begin
Exit(EPERM);
end;
if ((accmode^ and VADMIN_PERMS)<>0) then
begin
accmode^:=accmode^ and (not VADMIN_PERMS);
accmode^:=accmode^ or VADMIN;
end;
{
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
}
accmode^:=accmode^ and (not (VSTAT_PERMS or VSYNCHRONIZE));
Exit(0);
end;
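{
 * Illustrative usage sketch (comment only, not compiled): the caller
 * contract described above, i.e. reduce an NFSv4-style accmode first and
 * only then run the usual checks (for instance vaccess(), as sketched
 * earlier). example_check is an assumption for illustration.
 *
 *  function example_check(vp:p_vnode;accmode:accmode_t):Integer;
 *  var
 *   error:Integer;
 *  begin
 *   error:=vfs_unixify_accmode(@accmode);
 *   if (error<>0) then Exit(error);   // e.g. EPERM for VDELETE/VDELETE_CHILD
 *   if (accmode=0) then Exit(0);      // nothing left to check
 *   // ... continue with vaccess() using the reduced accmode ...
 *   Exit(0);
 *  end;
 }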
{
* These are helper functions for filesystems to traverse all
* their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
*
* This interface replaces MNT_VNODE_FOREACH.
}
function __mnt_vnode_next_all(mvp:pp_vnode;mp:p_mount):p_vnode;
var
vp:p_vnode;
begin
//if (should_yield())
kern_yield(PRI_UNCHANGED);
MNT_ILOCK(mp);
Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch');
vp:=TAILQ_NEXT(mvp^,@mvp^^.v_nmntvnodes);
while (vp<>nil) do
begin
if not ((vp^.v_type=VMARKER) or ((vp^.v_iflag and VI_DOOMED)<>0)) then Break;
vp:=TAILQ_NEXT(vp,@vp^.v_nmntvnodes);
end;
{ Check if we are done }
if (vp=nil) then
begin
__mnt_vnode_markerfree_all(mvp, mp);
{ MNT_IUNLOCK(mp); -- done in above function }
Exit(nil);
end;
TAILQ_REMOVE(@mp^.mnt_nvnodelist,mvp^,@mvp^^.v_nmntvnodes);
TAILQ_INSERT_AFTER(@mp^.mnt_nvnodelist,vp,mvp^,@mvp^^.v_nmntvnodes);
VI_LOCK(vp);
MNT_IUNLOCK(mp);
Exit(vp);
end;
function __mnt_vnode_first_all(mvp:pp_vnode;mp:p_mount):p_vnode;
var
vp:p_vnode;
begin
mvp^:=AllocMem(sizeof(t_vnode));
MNT_ILOCK(mp);
MNT_REF(mp);
mvp^^.v_type:=VMARKER;
vp:=TAILQ_FIRST(@mp^.mnt_nvnodelist);
while (vp<>nil) and
((vp^.v_type=VMARKER) or ((vp^.v_iflag and VI_DOOMED)<>0)) do
begin
vp:=TAILQ_NEXT(vp,@vp^.v_nmntvnodes);
end;
{ Check if we are done }
if (vp=nil) then
begin
MNT_REL(mp);
MNT_IUNLOCK(mp);
FreeMem(mvp^);
mvp^:=nil;
Exit(nil);
end;
mvp^^.v_mount:=mp;
TAILQ_INSERT_AFTER(@mp^.mnt_nvnodelist,vp,mvp^,@mvp^^.v_nmntvnodes);
VI_LOCK(vp);
MNT_IUNLOCK(mp);
Exit(vp);
end;
procedure __mnt_vnode_markerfree_all(mvp:pp_vnode;mp:p_mount);
begin
if (mvp^=nil) then
begin
MNT_IUNLOCK(mp);
Exit;
end;
mtx_assert(MNT_MTX(mp)^);
Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch');
TAILQ_REMOVE(@mp^.mnt_nvnodelist,mvp^,@mvp^^.v_nmntvnodes);
MNT_REL(mp);
MNT_IUNLOCK(mp);
FreeMem(mvp^);
mvp^:=nil;
end;
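{
 * Illustrative usage sketch (comment only, not compiled): the traversal
 * pattern these helpers implement, matching FreeBSD's
 * MNT_VNODE_FOREACH_ALL / MNT_VNODE_FOREACH_ALL_ABORT macros; vflush()
 * above is a real user of the same pattern.
 *
 *  vp:=__mnt_vnode_first_all(@mvp, mp);
 *  while (vp<>nil) do
 *  begin
 *   // ... work on vp; its interlock is held on return ...
 *   VI_UNLOCK(vp);
 *   vp:=__mnt_vnode_next_all(@mvp, mp);
 *  end;
 *  // to abort early:
 *  // MNT_ILOCK(mp);
 *  // __mnt_vnode_markerfree_all(@mvp, mp);   // also drops the mount interlock
 }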
{
* These are helper functions for filesystems to traverse their
* active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
}
procedure mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount);
begin
Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch');
MNT_ILOCK(mp);
MNT_REL(mp);
MNT_IUNLOCK(mp);
FreeMem(mvp^);
mvp^:=nil;
end;
function mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode;
label
restart;
var
vp,nvp:p_vnode;
begin
mtx_assert(vnode_free_list_mtx);
Assert(mvp^^.v_mount=mp, 'marker vnode mount list mismatch');
restart:
vp:=TAILQ_NEXT(mvp^,@mvp^^.v_actfreelist);
TAILQ_REMOVE(@mp^.mnt_activevnodelist,mvp^,@mvp^^.v_actfreelist);
while (vp<>nil) do
begin
if (vp^.v_type=VMARKER) then
begin
vp:=TAILQ_NEXT(vp,@vp^.v_actfreelist);
continue;
end;
if (not VI_TRYLOCK(vp)) then
begin
continue;
end;
Assert(vp^.v_type<>VMARKER, 'locked marker %p');
Assert((vp^.v_mount=mp) or (vp^.v_mount=nil),'alien vnode on the active list %p %p');
if (vp^.v_mount=mp) and ((vp^.v_iflag and VI_DOOMED)=0) then
begin
break;
end;
nvp:=TAILQ_NEXT(vp,@vp^.v_actfreelist);
VI_UNLOCK(vp);
vp:=nvp;
end;
{ Check if we are done }
if (vp=nil) then
begin
mtx_unlock(vnode_free_list_mtx);
mnt_vnode_markerfree_active(mvp, mp);
Exit(nil);
end;
TAILQ_INSERT_AFTER(@mp^.mnt_activevnodelist,vp,mvp^,@mvp^^.v_actfreelist);
mtx_unlock(vnode_free_list_mtx);
ASSERT_VI_LOCKED(vp, 'active iter');
Assert((vp^.v_iflag and VI_ACTIVE)<>0, 'Non-active vp %p');
Exit(vp);
end;
function __mnt_vnode_next_active(mvp:pp_vnode;mp:p_mount):p_vnode;
begin
//if (should_yield())
kern_yield(PRI_UNCHANGED);
mtx_lock(vnode_free_list_mtx);
Exit(mnt_vnode_next_active(mvp, mp));
end;
function __mnt_vnode_first_active(mvp:pp_vnode;mp:p_mount):p_vnode;
var
vp:p_vnode;
begin
mvp^:=AllocMem(sizeof(t_vnode));
MNT_ILOCK(mp);
MNT_REF(mp);
MNT_IUNLOCK(mp);
mvp^^.v_type:=VMARKER;
mvp^^.v_mount:=mp;
mtx_lock(vnode_free_list_mtx);
vp:=TAILQ_FIRST(@mp^.mnt_activevnodelist);
if (vp=nil) then
begin
mtx_unlock(vnode_free_list_mtx);
mnt_vnode_markerfree_active(mvp, mp);
Exit(nil);
end;
TAILQ_INSERT_BEFORE(vp, mvp^,@mvp^^.v_actfreelist);
Exit(mnt_vnode_next_active(mvp, mp));
end;
procedure __mnt_vnode_markerfree_active(mvp:pp_vnode;mp:p_mount);
begin
if (mvp^=nil) then Exit;
mtx_lock(vnode_free_list_mtx);
TAILQ_REMOVE(@mp^.mnt_activevnodelist,mvp^,@mvp^^.v_actfreelist);
mtx_unlock(vnode_free_list_mtx);
mnt_vnode_markerfree_active(mvp, mp);
end;
end.