illumos-gate New usr/src/uts/common/fs/vfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38 
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/errno.h>
  43 #include <sys/user.h>
  44 #include <sys/fstyp.h>
  45 #include <sys/kmem.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/mount.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vfs_opreg.h>
  51 #include <sys/fem.h>
  52 #include <sys/mntent.h>
  53 #include <sys/stat.h>
  54 #include <sys/statvfs.h>
  55 #include <sys/statfs.h>
  56 #include <sys/cred.h>
  57 #include <sys/vnode.h>
  58 #include <sys/rwstlock.h>
  59 #include <sys/dnlc.h>
  60 #include <sys/file.h>
  61 #include <sys/time.h>
  62 #include <sys/atomic.h>
  63 #include <sys/cmn_err.h>
  64 #include <sys/buf.h>
  65 #include <sys/swap.h>
  66 #include <sys/debug.h>
  67 #include <sys/vnode.h>
  68 #include <sys/modctl.h>
  69 #include <sys/ddi.h>
  70 #include <sys/pathname.h>
  71 #include <sys/bootconf.h>
  72 #include <sys/dumphdr.h>
  73 #include <sys/dc_ki.h>
  74 #include <sys/poll.h>
  75 #include <sys/sunddi.h>
  76 #include <sys/sysmacros.h>
  77 #include <sys/zone.h>
  78 #include <sys/policy.h>
  79 #include <sys/ctfs.h>
  80 #include <sys/objfs.h>
  81 #include <sys/console.h>
  82 #include <sys/reboot.h>
  83 #include <sys/attr.h>
  84 #include <sys/zio.h>
  85 #include <sys/spa.h>
  86 #include <sys/lofi.h>
  87 #include <sys/bootprops.h>
  88 #include <sys/avl.h>
  89 
  90 #include <vm/page.h>
  91 
  92 #include <fs/fs_subr.h>
/*
 * Private interfaces to create vopstats-related data structures.
 * They are defined outside this file; vfs_mountroot() uses them to set
 * up statistics for the root vfs.
 */
extern void             initialize_vopstats(vopstats_t *);
extern vopstats_t       *get_fstype_vopstats(struct vfs *, struct vfssw *);
extern vsk_anchor_t     *get_vskstat_anchor(struct vfs *);
  97 
/*
 * Forward declarations of file-local (static) helpers: mount option
 * table manipulation, mnttab bookkeeping and root file system discovery.
 */
static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
static void vfs_setmntopt_nolock(mntopts_t *, const char *,
    const char *, int, int);
static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
static void vfs_freemnttab(struct vfs *);
static void vfs_freeopt(mntopt_t *);
static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
static void vfs_createopttbl_extend(mntopts_t *, const char *,
    const mntopts_t *);
static char **vfs_copycancelopt_extend(char **const, int);
static void vfs_freecancelopt(char **);
static void getrootfs(char **, char **);
static int getmacpath(dev_info_t *, void *);
static void vfs_mnttabvp_setup(void);
 114 
/*
 * Singly linked list node pairing a device with a vfs.
 * NOTE(review): presumably tracks "in-progress" mounts (see vfs_miplist
 * below) -- confirm against the code that manipulates the list.
 */
struct ipmnt {
        struct ipmnt    *mip_next;      /* next node in the list */
        dev_t           mip_dev;        /* device of this entry */
        struct vfs      *mip_vfsp;      /* vfs associated with mip_dev */
};
 120 
/*
 * Head and tail of the struct ipmnt list.
 * NOTE(review): vfs_miplist_mutex presumably protects both pointers --
 * confirm against the list manipulation code.
 */
static kmutex_t         vfs_miplist_mutex;
static struct ipmnt     *vfs_miplist = NULL;
static struct ipmnt     *vfs_miplist_end = NULL;

static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
 126 
/*
 * VFS global data.
 */
vnode_t *rootdir;               /* pointer to root inode vnode. */
vnode_t *devicesdir;            /* pointer to inode of devices root */
vnode_t *devdir;                /* pointer to inode of dev root */

char *server_rootpath;          /* root path for diskless clients */
char *server_hostname;          /* hostname of diskless server */

/* Statically allocated vfs structures for /, /devices and /dev. */
static struct vfs root;
static struct vfs devices;
static struct vfs dev;
struct vfs *rootvfs = &root;        /* pointer to root vfs; head of VFS list. */
avl_tree_t vfs_by_dev;          /* avl tree to index mounted VFSs by dev */
avl_tree_t vfs_by_mntpnt;       /* avl tree to index mounted VFSs by mntpnt */
/*
 * Counter to provide a unique mntix for entries in the above avl trees.
 * Protected by the vfslist lock.
 */
uint64_t vfs_curr_mntix;
rvfs_t *rvfs_list;              /* array of vfs ptrs for vfs hash list */
int vfshsz = 512;               /* # of heads/locks in vfs hash arrays */
                                /* must be power of 2!  */
timespec_t vfs_mnttab_ctime;    /* mnttab created time */
timespec_t vfs_mnttab_mtime;    /* mnttab last modified time */
char *vfs_dummyfstype = "\0";
struct pollhead vfs_pollhd;     /* for mnttab pollers */
struct vnode *vfs_mntdummyvp;   /* to fake mnttab read/write for file events */
int     mntfstype;              /* will be set once mnt fs is mounted */
 155 
/*
 * VFS mount options table.
 *
 * Table for generic options recognized in the VFS layer and acted
 * on at this level before parsing file system specific options.
 * The nosuid option is stronger than any of the devices and setuid
 * options, so those are canceled when nosuid is seen.
 *
 * All options which are added here need to be added to the
 * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 *
 * Each xxx_cancel[] array below is the NULL-terminated list of options
 * canceled by option xxx.
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 181 
/*
 * One entry per generic option.  Every entry supplies five initializers;
 * the last one is always (void *)0 in this table.
 * NOTE(review): the fifth field is presumably a per-option private data
 * pointer -- confirm against the mntopt_t definition.
 */
static const mntopt_t mntopts[] = {
/*
 *      option name             cancel options          default arg     flags
 *              (fifth field on the continuation line)
 */
        { MNTOPT_REMOUNT,       NULL,                   NULL,
                MO_NODISPLAY, (void *)0 },
        { MNTOPT_RO,            ro_cancel,              NULL,           0,
                (void *)0 },
        { MNTOPT_RW,            rw_cancel,              NULL,           0,
                (void *)0 },
        { MNTOPT_SUID,          suid_cancel,            NULL,           0,
                (void *)0 },
        { MNTOPT_NOSUID,        nosuid_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_DEVICES,       devices_cancel,         NULL,           0,
                (void *)0 },
        { MNTOPT_NODEVICES,     nodevices_cancel,       NULL,           0,
                (void *)0 },
        { MNTOPT_SETUID,        setuid_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_NOSETUID,      nosetuid_cancel,        NULL,           0,
                (void *)0 },
        { MNTOPT_NBMAND,        nbmand_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_NONBMAND,      nonbmand_cancel,        NULL,           0,
                (void *)0 },
        { MNTOPT_EXEC,          exec_cancel,            NULL,           0,
                (void *)0 },
        { MNTOPT_NOEXEC,        noexec_cancel,          NULL,           0,
                (void *)0 },
};
 213 
/*
 * Exported descriptor for the generic option table: the number of
 * entries followed by a pointer to the first one.  The cast drops the
 * const qualifier of mntopts[]; the descriptor itself is const.
 */
const mntopts_t vfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        (mntopt_t *)&mntopts[0]
};
 218 
 219 /*
 220  * File system operation dispatch functions.
 221  */
 222 
 223 int
 224 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 225 {
 226         return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 227 }
 228 
 229 int
 230 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 231 {
 232         return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
 233 }
 234 
 235 int
 236 fsop_root(vfs_t *vfsp, vnode_t **vpp)
 237 {
 238         refstr_t *mntpt;
 239         int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
 240         /*
 241          * Make sure this root has a path.  With lofs, it is possible to have
 242          * a NULL mountpoint.
 243          */
 244         if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
 245                 mntpt = vfs_getmntpoint(vfsp);
 246                 vn_setpath_str(*vpp, refstr_value(mntpt),
 247                     strlen(refstr_value(mntpt)));
 248                 refstr_rele(mntpt);
 249         }
 250 
 251         return (ret);
 252 }
 253 
 254 int
 255 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 256 {
 257         return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
 258 }
 259 
 260 int
 261 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 262 {
 263         return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
 264 }
 265 
 266 int
 267 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 268 {
 269         /*
 270          * In order to handle system attribute fids in a manner
 271          * transparent to the underlying fs, we embed the fid for
 272          * the sysattr parent object in the sysattr fid and tack on
 273          * some extra bytes that only the sysattr layer knows about.
 274          *
 275          * This guarantees that sysattr fids are larger than other fids
 276          * for this vfs. If the vfs supports the sysattr view interface
 277          * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 278          * collision with XATTR_FIDSZ.
 279          */
 280         if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 281             fidp->fid_len == XATTR_FIDSZ)
 282                 return (xattr_dir_vget(vfsp, vpp, fidp));
 283 
 284         return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
 285 }
 286 
 287 int
 288 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 289 {
 290         return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
 291 }
 292 
 293 void
 294 fsop_freefs(vfs_t *vfsp)
 295 {
 296         (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
 297 }
 298 
 299 int
 300 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 301 {
 302         return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
 303 }
 304 
 305 int
 306 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 307 {
 308         ASSERT((fstype >= 0) && (fstype < nfstype));
 309 
 310         if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
 311                 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
 312         else
 313                 return (ENOTSUP);
 314 }
 315 
 316 /*
 317  * File system initialization.  vfs_setfsops() must be called from a file
 318  * system's init routine.
 319  */
 320 
 321 static int
 322 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
 323     int *unused_ops)
 324 {
 325         static const fs_operation_trans_def_t vfs_ops_table[] = {
 326                 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
 327                         fs_nosys, fs_nosys,
 328 
 329                 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
 330                         fs_nosys, fs_nosys,
 331 
 332                 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
 333                         fs_nosys, fs_nosys,
 334 
 335                 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
 336                         fs_nosys, fs_nosys,
 337 
 338                 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
 339                         (fs_generic_func_p) fs_sync,
 340                         (fs_generic_func_p) fs_sync,    /* No errors allowed */
 341 
 342                 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
 343                         fs_nosys, fs_nosys,
 344 
 345                 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
 346                         fs_nosys, fs_nosys,
 347 
 348                 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
 349                         (fs_generic_func_p)fs_freevfs,
 350                         (fs_generic_func_p)fs_freevfs,  /* Shouldn't fail */
 351 
 352                 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
 353                         (fs_generic_func_p)fs_nosys,
 354                         (fs_generic_func_p)fs_nosys,
 355 
 356                 NULL, 0, NULL, NULL
 357         };
 358 
 359         return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
 360 }
 361 
 362 void
 363 zfs_boot_init() {
 364 
 365         if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 366                 spa_boot_init();
 367 }
 368 
 369 int
 370 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
 371 {
 372         int error;
 373         int unused_ops;
 374 
 375         /*
 376          * Verify that fstype refers to a valid fs.  Note that
 377          * 0 is valid since it's used to set "stray" ops.
 378          */
 379         if ((fstype < 0) || (fstype >= nfstype))
 380                 return (EINVAL);
 381 
 382         if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 383                 return (EINVAL);
 384 
 385         /* Set up the operations vector. */
 386 
 387         error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
 388 
 389         if (error != 0)
 390                 return (error);
 391 
 392         vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 393 
 394         if (actual != NULL)
 395                 *actual = &vfssw[fstype].vsw_vfsops;
 396 
 397 #if DEBUG
 398         if (unused_ops != 0)
 399                 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
 400                     "but not used", vfssw[fstype].vsw_name, unused_ops);
 401 #endif
 402 
 403         return (0);
 404 }
 405 
 406 int
 407 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
 408 {
 409         int error;
 410         int unused_ops;
 411 
 412         *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
 413 
 414         error = fs_copyfsops(template, *actual, &unused_ops);
 415         if (error != 0) {
 416                 kmem_free(*actual, sizeof (vfsops_t));
 417                 *actual = NULL;
 418                 return (error);
 419         }
 420 
 421         return (0);
 422 }
 423 
 424 /*
 425  * Free a vfsops structure created as a result of vfs_makefsops().
 426  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
 427  * vfs_freevfsops_by_type().
 428  */
 429 void
 430 vfs_freevfsops(vfsops_t *vfsops)
 431 {
 432         kmem_free(vfsops, sizeof (vfsops_t));
 433 }
 434 
 435 /*
 436  * Since the vfsops structure is part of the vfssw table and wasn't
 437  * really allocated, we're not really freeing anything.  We keep
 438  * the name for consistency with vfs_freevfsops().  We do, however,
 439  * need to take care of a little bookkeeping.
 440  * NOTE: For a vfsops structure created by vfs_setfsops(), use
 441  * vfs_freevfsops_by_type().
 442  */
 443 int
 444 vfs_freevfsops_by_type(int fstype)
 445 {
 446 
 447         /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 448         if ((fstype <= 0) || (fstype >= nfstype))
 449                 return (EINVAL);
 450 
 451         WLOCK_VFSSW();
 452         if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 453                 WUNLOCK_VFSSW();
 454                 return (EINVAL);
 455         }
 456 
 457         vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 458         WUNLOCK_VFSSW();
 459 
 460         return (0);
 461 }
 462 
 463 /* Support routines used to reference vfs_op */
 464 
 465 /* Set the operations vector for a vfs */
 466 void
 467 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
 468 {
 469         vfsops_t        *op;
 470 
 471         ASSERT(vfsp != NULL);
 472         ASSERT(vfsops != NULL);
 473 
 474         op = vfsp->vfs_op;
 475         membar_consumer();
 476         if (vfsp->vfs_femhead == NULL &&
 477             atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
 478                 return;
 479         }
 480         fsem_setvfsops(vfsp, vfsops);
 481 }
 482 
 483 /* Retrieve the operations vector for a vfs */
 484 vfsops_t *
 485 vfs_getops(vfs_t *vfsp)
 486 {
 487         vfsops_t        *op;
 488 
 489         ASSERT(vfsp != NULL);
 490 
 491         op = vfsp->vfs_op;
 492         membar_consumer();
 493         if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
 494                 return (op);
 495         } else {
 496                 return (fsem_getvfsops(vfsp));
 497         }
 498 }
 499 
 500 /*
 501  * Returns non-zero (1) if the vfsops matches that of the vfs.
 502  * Returns zero (0) if not.
 503  */
 504 int
 505 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
 506 {
 507         return (vfs_getops(vfsp) == vfsops);
 508 }
 509 
 510 /*
 511  * Returns non-zero (1) if the file system has installed a non-default,
 512  * non-error vfs_sync routine.  Returns zero (0) otherwise.
 513  */
 514 int
 515 vfs_can_sync(vfs_t *vfsp)
 516 {
 517         /* vfs_sync() routine is not the default/error function */
 518         return (vfs_getops(vfsp)->vfs_sync != fs_sync);
 519 }
 520 
 521 /*
 522  * Initialize a vfs structure.
 523  */
 524 void
 525 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
 526 {
 527         /* Other initialization has been moved to vfs_alloc() */
 528         vfsp->vfs_count = 0;
 529         vfsp->vfs_next = vfsp;
 530         vfsp->vfs_prev = vfsp;
 531         vfsp->vfs_zone_next = vfsp;
 532         vfsp->vfs_zone_prev = vfsp;
 533         vfsp->vfs_lofi_minor = 0;
 534         sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 535         vfsimpl_setup(vfsp);
 536         vfsp->vfs_data = (data);
 537         vfs_setops((vfsp), (op));
 538 }
 539 
 540 /*
 541  * Allocate and initialize the vfs implementation private data
 542  * structure, vfs_impl_t.
 543  */
 544 void
 545 vfsimpl_setup(vfs_t *vfsp)
 546 {
 547         int i;
 548 
 549         if (vfsp->vfs_implp != NULL) {
 550                 return;
 551         }
 552 
 553         vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 554         /* Note that these are #define'd in vfs.h */
 555         vfsp->vfs_vskap = NULL;
 556         vfsp->vfs_fstypevsp = NULL;
 557 
 558         /* Set size of counted array, then zero the array */
 559         vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 560         for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 561                 vfsp->vfs_featureset[i] = 0;
 562         }
 563 }
 564 
 565 /*
 566  * Release the vfs_impl_t structure, if it exists. Some unbundled
 567  * filesystems may not use the newer version of vfs and thus
 568  * would not contain this implementation private data structure.
 569  */
 570 void
 571 vfsimpl_teardown(vfs_t *vfsp)
 572 {
 573         vfs_impl_t      *vip = vfsp->vfs_implp;
 574 
 575         if (vip == NULL)
 576                 return;
 577 
 578         kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 579         vfsp->vfs_implp = NULL;
 580 }
 581 
 582 /*
 583  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 584  * fstatvfs, and sysfs moved to common/syscall.
 585  */
 586 
 587 /*
 588  * Update every mounted file system.  We call the vfs_sync operation of
 589  * each file system type, passing it a NULL vfsp to indicate that all
 590  * mounted file systems of that type should be updated.
 591  */
 592 void
 593 vfs_sync(int flag)
 594 {
 595         struct vfssw *vswp;
 596         RLOCK_VFSSW();
 597         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 598                 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 599                         vfs_refvfssw(vswp);
 600                         RUNLOCK_VFSSW();
 601                         (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
 602                             CRED());
 603                         vfs_unrefvfssw(vswp);
 604                         RLOCK_VFSSW();
 605                 }
 606         }
 607         RUNLOCK_VFSSW();
 608 }
 609 
 610 void
 611 sync(void)
 612 {
 613         vfs_sync(0);
 614 }
 615 
 616 /*
 617  * compare function for vfs_by_dev avl tree. compare dev first, then mntix
 618  */
 619 static int
 620 vfs_cmp_dev(const void *aa, const void *bb)
 621 {
 622         const vfs_t *a = aa;
 623         const vfs_t *b = bb;
 624 
 625         if (a->vfs_dev < b->vfs_dev)
 626                 return (-1);
 627         if (a->vfs_dev > b->vfs_dev)
 628                 return (1);
 629         if (a->vfs_mntix < b->vfs_mntix)
 630                 return (-1);
 631         if (a->vfs_mntix > b->vfs_mntix)
 632                 return (1);
 633         return (0);
 634 }
 635 
 636 /*
 637  * compare function for vfs_by_mntpnt avl tree. compare mntpnt first, then mntix
 638  */
 639 static int
 640 vfs_cmp_mntpnt(const void *aa, const void *bb)
 641 {
 642         const vfs_t *a = aa;
 643         const vfs_t *b = bb;
 644         int ret;
 645 
 646         ret = strcmp(refstr_value(a->vfs_mntpt), refstr_value(b->vfs_mntpt));
 647         if (ret < 0)
 648                 return (-1);
 649         if (ret > 0)
 650                 return (1);
 651         if (a->vfs_mntix < b->vfs_mntix)
 652                 return (-1);
 653         if (a->vfs_mntix > b->vfs_mntix)
 654                 return (1);
 655         return (0);
 656 }
 657 
 658 /*
 659  * External routines.
 660  */
 661 
/* Lock accesses to vfssw.  Initialized in vfs_mountroot(). */
krwlock_t vfssw_lock;   /* lock accesses to vfssw */

/*
 * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 * but otherwise should be accessed only via vfs_list_lock() and
 * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 */
static krwlock_t vfslist;
 670 
 671 /*
 672  * Mount devfs on /devices. This is done right after root is mounted
 673  * to provide device access support for the system
 674  */
 675 static void
 676 vfs_mountdevices(void)
 677 {
 678         struct vfssw *vsw;
 679         struct vnode *mvp;
 680         struct mounta mounta = {        /* fake mounta for devfs_mount() */
 681                 NULL,
 682                 NULL,
 683                 MS_SYSSPACE,
 684                 NULL,
 685                 NULL,
 686                 0,
 687                 NULL,
 688                 0
 689         };
 690 
 691         /*
 692          * _init devfs module to fill in the vfssw
 693          */
 694         if (modload("fs", "devfs") == -1)
 695                 panic("Cannot _init devfs module");
 696 
 697         /*
 698          * Hold vfs
 699          */
 700         RLOCK_VFSSW();
 701         vsw = vfs_getvfsswbyname("devfs");
 702         VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
 703         VFS_HOLD(&devices);
 704 
 705         /*
 706          * Locate mount point
 707          */
 708         if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 709                 panic("Cannot find /devices");
 710 
 711         /*
 712          * Perform the mount of /devices
 713          */
 714         if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
 715                 panic("Cannot mount /devices");
 716 
 717         RUNLOCK_VFSSW();
 718 
 719         /*
 720          * Set appropriate members and add to vfs list for mnttab display
 721          */
 722         vfs_setresource(&devices, "/devices", 0);
 723         vfs_setmntpoint(&devices, "/devices", 0);
 724 
 725         /*
 726          * Hold the root of /devices so it won't go away
 727          */
 728         if (VFS_ROOT(&devices, &devicesdir))
 729                 panic("vfs_mountdevices: not devices root");
 730 
 731         if (vfs_lock(&devices) != 0) {
 732                 VN_RELE(devicesdir);
 733                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
 734                 return;
 735         }
 736 
 737         if (vn_vfswlock(mvp) != 0) {
 738                 vfs_unlock(&devices);
 739                 VN_RELE(devicesdir);
 740                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
 741                 return;
 742         }
 743 
 744         vfs_add(mvp, &devices, 0);
 745         vn_vfsunlock(mvp);
 746         vfs_unlock(&devices);
 747         VN_RELE(devicesdir);
 748 }
 749 
 750 /*
 751  * mount the first instance of /dev  to root and remain mounted
 752  */
 753 static void
 754 vfs_mountdev1(void)
 755 {
 756         struct vfssw *vsw;
 757         struct vnode *mvp;
 758         struct mounta mounta = {        /* fake mounta for sdev_mount() */
 759                 NULL,
 760                 NULL,
 761                 MS_SYSSPACE | MS_OVERLAY,
 762                 NULL,
 763                 NULL,
 764                 0,
 765                 NULL,
 766                 0
 767         };
 768 
 769         /*
 770          * _init dev module to fill in the vfssw
 771          */
 772         if (modload("fs", "dev") == -1)
 773                 cmn_err(CE_PANIC, "Cannot _init dev module\n");
 774 
 775         /*
 776          * Hold vfs
 777          */
 778         RLOCK_VFSSW();
 779         vsw = vfs_getvfsswbyname("dev");
 780         VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
 781         VFS_HOLD(&dev);
 782 
 783         /*
 784          * Locate mount point
 785          */
 786         if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 787                 cmn_err(CE_PANIC, "Cannot find /dev\n");
 788 
 789         /*
 790          * Perform the mount of /dev
 791          */
 792         if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
 793                 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
 794 
 795         RUNLOCK_VFSSW();
 796 
 797         /*
 798          * Set appropriate members and add to vfs list for mnttab display
 799          */
 800         vfs_setresource(&dev, "/dev", 0);
 801         vfs_setmntpoint(&dev, "/dev", 0);
 802 
 803         /*
 804          * Hold the root of /dev so it won't go away
 805          */
 806         if (VFS_ROOT(&dev, &devdir))
 807                 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
 808 
 809         if (vfs_lock(&dev) != 0) {
 810                 VN_RELE(devdir);
 811                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
 812                 return;
 813         }
 814 
 815         if (vn_vfswlock(mvp) != 0) {
 816                 vfs_unlock(&dev);
 817                 VN_RELE(devdir);
 818                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
 819                 return;
 820         }
 821 
 822         vfs_add(mvp, &dev, 0);
 823         vn_vfsunlock(mvp);
 824         vfs_unlock(&dev);
 825         VN_RELE(devdir);
 826 }
 827 
 828 /*
 829  * Mount required filesystem. This is done right after root is mounted.
 830  */
 831 static void
 832 vfs_mountfs(char *module, char *spec, char *path)
 833 {
 834         struct vnode *mvp;
 835         struct mounta mounta;
 836         vfs_t *vfsp;
 837 
 838         mounta.flags = MS_SYSSPACE | MS_DATA;
 839         mounta.fstype = module;
 840         mounta.spec = spec;
 841         mounta.dir = path;
 842         if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 843                 cmn_err(CE_WARN, "Cannot find %s", path);
 844                 return;
 845         }
 846         if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 847                 cmn_err(CE_WARN, "Cannot mount %s", path);
 848         else
 849                 VFS_RELE(vfsp);
 850         VN_RELE(mvp);
 851 }
 852 
 853 /*
 854  * vfs_mountroot is called by main() to mount the root filesystem.
 855  */
 856 void
 857 vfs_mountroot(void)
 858 {
 859         struct vnode    *rvp = NULL;
 860         char            *path;
 861         size_t          plen;
 862         struct vfssw    *vswp;
 863         proc_t          *p;
 864 
 865         rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 866         rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
 867 
 868         /*
 869          * Alloc the avl trees for quick indexing via dev and mountpoint
 870          */
 871         avl_create(&vfs_by_dev, vfs_cmp_dev, sizeof(vfs_t),
 872             offsetof(vfs_t, vfs_avldev));
 873         avl_create(&vfs_by_mntpnt, vfs_cmp_mntpnt, sizeof(vfs_t),
 874             offsetof(vfs_t, vfs_avlmntpnt));
 875 
 876         /*
 877          * Alloc the vfs hash bucket array and locks
 878          */
 879         rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
 880 
 881         /*
 882          * Call machine-dependent routine "rootconf" to choose a root
 883          * file system type.
 884          */
 885         if (rootconf())
 886                 panic("vfs_mountroot: cannot mount root");
 887         /*
 888          * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
 889          * to point to it.  These are used by lookuppn() so that it
 890          * knows where to start from ('/' or '.').
 891          */
 892         vfs_setmntpoint(rootvfs, "/", 0);
 893         if (VFS_ROOT(rootvfs, &rootdir))
 894                 panic("vfs_mountroot: no root vnode");
 895 
 896         /*
 897          * At this point, the process tree consists of p0 and possibly some
 898          * direct children of p0.  (i.e. there are no grandchildren)
 899          *
 900          * Walk through them all, setting their current directory.
 901          */
 902         mutex_enter(&pidlock);
 903         for (p = practive; p != NULL; p = p->p_next) {
 904                 ASSERT(p == &p0 || p->p_parent == &p0);
 905 
 906                 PTOU(p)->u_cdir = rootdir;
 907                 VN_HOLD(PTOU(p)->u_cdir);
 908                 PTOU(p)->u_rdir = NULL;
 909         }
 910         mutex_exit(&pidlock);
 911 
 912         /*
 913          * Setup the global zone's rootvp, now that it exists.
 914          */
 915         global_zone->zone_rootvp = rootdir;
 916         VN_HOLD(global_zone->zone_rootvp);
 917 
 918         /*
 919          * Notify the module code that it can begin using the
 920          * root filesystem instead of the boot program's services.
 921          */
 922         modrootloaded = 1;
 923 
 924         /*
 925          * Special handling for a ZFS root file system.
 926          */
 927         zfs_boot_init();
 928 
 929         /*
 930          * Set up mnttab information for root
 931          */
 932         vfs_setresource(rootvfs, rootfs.bo_name, 0);
 933 
 934         /*
 935          * Notify cluster software that the root filesystem is available.
 936          */
 937         clboot_mountroot();
 938 
 939         /* Now that we're all done with the root FS, set up its vopstats */
 940         if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
 941                 /* Set flag for statistics collection */
 942                 if (vswp->vsw_flag & VSW_STATS) {
 943                         initialize_vopstats(&rootvfs->vfs_vopstats);
 944                         rootvfs->vfs_flag |= VFS_STATS;
 945                         rootvfs->vfs_fstypevsp =
 946                             get_fstype_vopstats(rootvfs, vswp);
 947                         rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
 948                 }
 949                 vfs_unrefvfssw(vswp);
 950         }
 951 
 952         /*
 953          * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
 954          * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
 955          */
 956         vfs_mountdevices();
 957         vfs_mountdev1();
 958 
 959         vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
 960         vfs_mountfs("proc", "/proc", "/proc");
 961         vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
 962         vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
 963         vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
 964 
 965         if (getzoneid() == GLOBAL_ZONEID) {
 966                 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
 967         }
 968 
 969 #ifdef __sparc
 970         /*
 971          * This bit of magic can go away when we convert sparc to
 972          * the new boot architecture based on ramdisk.
 973          *
 974          * Booting off a mirrored root volume:
 975          * At this point, we have booted and mounted root on a
 976          * single component of the mirror.  Complete the boot
 977          * by configuring SVM and converting the root to the
 978          * dev_t of the mirrored root device.  This dev_t conversion
 979          * only works because the underlying device doesn't change.
 980          */
 981         if (root_is_svm) {
 982                 if (svm_rootconf()) {
 983                         panic("vfs_mountroot: cannot remount root");
 984                 }
 985 
 986                 /*
 987                  * mnttab should reflect the new root device
 988                  */
 989                 vfs_lock_wait(rootvfs);
 990                 vfs_setresource(rootvfs, rootfs.bo_name, 0);
 991                 vfs_unlock(rootvfs);
 992         }
 993 #endif /* __sparc */
 994 
 995         if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
 996                 /*
 997                  * Look up the root device via devfs so that a dv_node is
 998                  * created for it. The vnode is never VN_RELE()ed.
 999                  * We allocate more than MAXPATHLEN so that the
1000                  * buffer passed to i_ddi_prompath_to_devfspath() is
1001                  * exactly MAXPATHLEN (the function expects a buffer
1002                  * of that length).
1003                  */
1004                 plen = strlen("/devices");
1005                 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
1006                 (void) strcpy(path, "/devices");
1007 
1008                 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
1009                     != DDI_SUCCESS ||
1010                     lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
1011 
1012                         /* NUL terminate in case "path" has garbage */
1013                         path[plen + MAXPATHLEN - 1] = '\0';
1014 #ifdef  DEBUG
1015                         cmn_err(CE_WARN, "!Cannot lookup root device: %s",
1016                             path);
1017 #endif
1018                 }
1019                 kmem_free(path, plen + MAXPATHLEN);
1020         }
1021 
1022         vfs_mnttabvp_setup();
1023 }
1024 
1025 /*
1026  * Check to see if our "block device" is actually a file.  If so,
1027  * automatically add a lofi device, and keep track of this fact.
1028  */
1029 static int
1030 lofi_add(const char *fsname, struct vfs *vfsp,
1031     mntopts_t *mntopts, struct mounta *uap)
1032 {
1033         int fromspace = (uap->flags & MS_SYSSPACE) ?
1034             UIO_SYSSPACE : UIO_USERSPACE;
1035         struct lofi_ioctl *li = NULL;
1036         struct vnode *vp = NULL;
1037         struct pathname pn = { NULL };
1038         ldi_ident_t ldi_id;
1039         ldi_handle_t ldi_hdl;
1040         vfssw_t *vfssw;
1041         int minor;
1042         int err = 0;
1043 
1044         if ((vfssw = vfs_getvfssw(fsname)) == NULL)
1045                 return (0);
1046 
1047         if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
1048                 vfs_unrefvfssw(vfssw);
1049                 return (0);
1050         }
1051 
1052         vfs_unrefvfssw(vfssw);
1053         vfssw = NULL;
1054 
1055         if (pn_get(uap->spec, fromspace, &pn) != 0)
1056                 return (0);
1057 
1058         if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
1059                 goto out;
1060 
1061         if (vp->v_type != VREG)
1062                 goto out;
1063 
1064         /* OK, this is a lofi mount. */
1065 
1066         if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
1067             vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
1068             vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
1069             vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
1070                 err = EINVAL;
1071                 goto out;
1072         }
1073 
1074         ldi_id = ldi_ident_from_anon();
1075         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1076         (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1077 
1078         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1079             &ldi_hdl, ldi_id);
1080 
1081         if (err)
1082                 goto out2;
1083 
1084         err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1085             FREAD | FWRITE | FKIOCTL, kcred, &minor);
1086 
1087         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1088 
1089         if (!err)
1090                 vfsp->vfs_lofi_minor = minor;
1091 
1092 out2:
1093         ldi_ident_release(ldi_id);
1094 out:
1095         if (li != NULL)
1096                 kmem_free(li, sizeof (*li));
1097         if (vp != NULL)
1098                 VN_RELE(vp);
1099         pn_free(&pn);
1100         return (err);
1101 }
1102 
1103 static void
1104 lofi_remove(struct vfs *vfsp)
1105 {
1106         struct lofi_ioctl *li = NULL;
1107         ldi_ident_t ldi_id;
1108         ldi_handle_t ldi_hdl;
1109         int err;
1110 
1111         if (vfsp->vfs_lofi_minor == 0)
1112                 return;
1113 
1114         ldi_id = ldi_ident_from_anon();
1115 
1116         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1117         li->li_minor = vfsp->vfs_lofi_minor;
1118         li->li_cleanup = B_TRUE;
1119 
1120         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1121             &ldi_hdl, ldi_id);
1122 
1123         if (err)
1124                 goto out;
1125 
1126         err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1127             FREAD | FWRITE | FKIOCTL, kcred, NULL);
1128 
1129         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1130 
1131         if (!err)
1132                 vfsp->vfs_lofi_minor = 0;
1133 
1134 out:
1135         ldi_ident_release(ldi_id);
1136         if (li != NULL)
1137                 kmem_free(li, sizeof (*li));
1138 }
1139 
1140 /*
1141  * Common mount code.  Called from the system call entry point, from autofs,
1142  * nfsv4 trigger mounts, and from pxfs.
1143  *
1144  * Takes the effective file system type, mount arguments, the mount point
1145  * vnode, flags specifying whether the mount is a remount and whether it
1146  * should be entered into the vfs list, and credentials.  Fills in its vfspp
1147  * parameter with the mounted file system instance's vfs.
1148  *
1149  * Note that the effective file system type is specified as a string.  It may
1150  * be null, in which case it's determined from the mount arguments, and may
1151  * differ from the type specified in the mount arguments; this is a hook to
1152  * allow interposition when instantiating file system instances.
1153  *
1154  * The caller is responsible for releasing its own hold on the mount point
1155  * vp (this routine does its own hold when necessary).
1156  * Also note that for remounts, the mount point vp should be the vnode for
1157  * the root of the file system rather than the vnode that the file system
1158  * is mounted on top of.
1159  */
1160 int
1161 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1162         struct vfs **vfspp)
1163 {
1164         struct vfssw    *vswp;
1165         vfsops_t        *vfsops;
1166         struct vfs      *vfsp;
1167         struct vnode    *bvp;
1168         dev_t           bdev = 0;
1169         mntopts_t       mnt_mntopts;
1170         int             error = 0;
1171         int             copyout_error = 0;
1172         int             ovflags;
1173         char            *opts = uap->optptr;
1174         char            *inargs = opts;
1175         int             optlen = uap->optlen;
1176         int             remount;
1177         int             rdonly;
1178         int             nbmand = 0;
1179         int             delmip = 0;
1180         int             addmip = 0;
1181         int             splice = ((uap->flags & MS_NOSPLICE) == 0);
1182         int             fromspace = (uap->flags & MS_SYSSPACE) ?
1183             UIO_SYSSPACE : UIO_USERSPACE;
1184         char            *resource = NULL, *mountpt = NULL;
1185         refstr_t        *oldresource, *oldmntpt;
1186         struct pathname pn, rpn;
1187         vsk_anchor_t    *vskap;
1188         char fstname[FSTYPSZ];
1189         zone_t          *zone;
1190 
1191         /*
1192          * The v_flag value for the mount point vp is permanently set
1193          * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1194          * for mount point locking.
1195          */
1196         mutex_enter(&vp->v_lock);
1197         vp->v_flag |= VVFSLOCK;
1198         mutex_exit(&vp->v_lock);
1199 
1200         mnt_mntopts.mo_count = 0;
1201         /*
1202          * Find the ops vector to use to invoke the file system-specific mount
1203          * method.  If the fsname argument is non-NULL, use it directly.
1204          * Otherwise, dig the file system type information out of the mount
1205          * arguments.
1206          *
1207          * A side effect is to hold the vfssw entry.
1208          *
1209          * Mount arguments can be specified in several ways, which are
1210          * distinguished by flag bit settings.  The preferred way is to set
1211          * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1212          * type supplied as a character string and the last two arguments
1213          * being a pointer to a character buffer and the size of the buffer.
1214          * On entry, the buffer holds a null terminated list of options; on
1215          * return, the string is the list of options the file system
1216          * recognized. If MS_DATA is set arguments five and six point to a
1217          * block of binary data which the file system interprets.
1218          * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1219          * consistently with these conventions.  To handle them, we check to
1220          * see whether the pointer to the file system name has a numeric value
1221          * less than 256.  If so, we treat it as an index.
1222          */
1223         if (fsname != NULL) {
1224                 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1225                         return (EINVAL);
1226                 }
1227         } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1228                 size_t n;
1229                 uint_t fstype;
1230 
1231                 fsname = fstname;
1232 
1233                 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1234                         RLOCK_VFSSW();
1235                         if (fstype == 0 || fstype >= nfstype ||
1236                             !ALLOCATED_VFSSW(&vfssw[fstype])) {
1237                                 RUNLOCK_VFSSW();
1238                                 return (EINVAL);
1239                         }
1240                         (void) strcpy(fsname, vfssw[fstype].vsw_name);
1241                         RUNLOCK_VFSSW();
1242                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1243                                 return (EINVAL);
1244                 } else {
1245                         /*
1246                          * Handle either kernel or user address space.
1247                          */
1248                         if (uap->flags & MS_SYSSPACE) {
1249                                 error = copystr(uap->fstype, fsname,
1250                                     FSTYPSZ, &n);
1251                         } else {
1252                                 error = copyinstr(uap->fstype, fsname,
1253                                     FSTYPSZ, &n);
1254                         }
1255                         if (error) {
1256                                 if (error == ENAMETOOLONG)
1257                                         return (EINVAL);
1258                                 return (error);
1259                         }
1260                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1261                                 return (EINVAL);
1262                 }
1263         } else {
1264                 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1265                         return (EINVAL);
1266                 fsname = vswp->vsw_name;
1267         }
1268         if (!VFS_INSTALLED(vswp))
1269                 return (EINVAL);
1270 
1271         if ((error = secpolicy_fs_allowed_mount(fsname)) != 0)  {
1272                 vfs_unrefvfssw(vswp);
1273                 return (error);
1274         }
1275 
1276         vfsops = &vswp->vsw_vfsops;
1277 
1278         vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1279         /*
1280          * Fetch mount options and parse them for generic vfs options
1281          */
1282         if (uap->flags & MS_OPTIONSTR) {
1283                 /*
1284                  * Limit the buffer size
1285                  */
1286                 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1287                         error = EINVAL;
1288                         goto errout;
1289                 }
1290                 if ((uap->flags & MS_SYSSPACE) == 0) {
1291                         inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1292                         inargs[0] = '\0';
1293                         if (optlen) {
1294                                 error = copyinstr(opts, inargs, (size_t)optlen,
1295                                     NULL);
1296                                 if (error) {
1297                                         goto errout;
1298                                 }
1299                         }
1300                 }
1301                 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1302         }
1303         /*
1304          * Flag bits override the options string.
1305          */
1306         if (uap->flags & MS_REMOUNT)
1307                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1308         if (uap->flags & MS_RDONLY)
1309                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1310         if (uap->flags & MS_NOSUID)
1311                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1312 
1313         /*
1314          * Check if this is a remount; must be set in the option string and
1315          * the file system must support a remount option.
1316          */
1317         if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1318             MNTOPT_REMOUNT, NULL)) {
1319                 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1320                         error = ENOTSUP;
1321                         goto errout;
1322                 }
1323                 uap->flags |= MS_REMOUNT;
1324         }
1325 
1326         /*
1327          * uap->flags and vfs_optionisset() should agree.
1328          */
1329         if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1330                 uap->flags |= MS_RDONLY;
1331         }
1332         if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1333                 uap->flags |= MS_NOSUID;
1334         }
1335         nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1336         ASSERT(splice || !remount);
1337         /*
1338          * If we are splicing the fs into the namespace,
1339          * perform mount point checks.
1340          *
1341          * We want to resolve the path for the mount point to eliminate
1342          * '.' and ".." and symlinks in mount points; we can't do the
1343          * same for the resource string, since it would turn
1344          * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1345          * this before grabbing vn_vfswlock(), because otherwise we
1346          * would deadlock with lookuppn().
1347          */
1348         if (splice) {
1349                 ASSERT(vp->v_count > 0);
1350 
1351                 /*
1352                  * Pick up mount point and device from appropriate space.
1353                  */
1354                 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1355                         resource = kmem_alloc(pn.pn_pathlen + 1,
1356                             KM_SLEEP);
1357                         (void) strcpy(resource, pn.pn_path);
1358                         pn_free(&pn);
1359                 }
1360                 /*
1361                  * Do a lookupname prior to taking the
1362                  * writelock. Mark this as completed if
1363                  * successful for later cleanup and addition to
1364                  * the mount in progress table.
1365                  */
1366                 if ((uap->flags & MS_GLOBAL) == 0 &&
1367                     lookupname(uap->spec, fromspace,
1368                     FOLLOW, NULL, &bvp) == 0) {
1369                         addmip = 1;
1370                 }
1371 
1372                 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1373                         pathname_t *pnp;
1374 
1375                         if (*pn.pn_path != '/') {
1376                                 error = EINVAL;
1377                                 pn_free(&pn);
1378                                 goto errout;
1379                         }
1380                         pn_alloc(&rpn);
1381                         /*
1382                          * Kludge to prevent autofs from deadlocking with
1383                          * itself when it calls domount().
1384                          *
1385                          * If autofs is calling, it is because it is doing
1386                          * (autofs) mounts in the process of an NFS mount.  A
1387                          * lookuppn() here would cause us to block waiting for
1388                          * said NFS mount to complete, which can't since this
1389                          * is the thread that was supposed to doing it.
1390                          */
1391                         if (fromspace == UIO_USERSPACE) {
1392                                 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1393                                     NULL)) == 0) {
1394                                         pnp = &rpn;
1395                                 } else {
1396                                         /*
1397                                          * The file disappeared or otherwise
1398                                          * became inaccessible since we opened
1399                                          * it; might as well fail the mount
1400                                          * since the mount point is no longer
1401                                          * accessible.
1402                                          */
1403                                         pn_free(&rpn);
1404                                         pn_free(&pn);
1405                                         goto errout;
1406                                 }
1407                         } else {
1408                                 pnp = &pn;
1409                         }
1410                         mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1411                         (void) strcpy(mountpt, pnp->pn_path);
1412 
1413                         /*
1414                          * If the addition of the zone's rootpath
1415                          * would push us over a total path length
1416                          * of MAXPATHLEN, we fail the mount with
1417                          * ENAMETOOLONG, which is what we would have
1418                          * gotten if we were trying to perform the same
1419                          * mount in the global zone.
1420                          *
1421                          * strlen() doesn't count the trailing
1422                          * '\0', but zone_rootpathlen counts both a
1423                          * trailing '/' and the terminating '\0'.
1424                          */
1425                         if ((curproc->p_zone->zone_rootpathlen - 1 +
1426                             strlen(mountpt)) > MAXPATHLEN ||
1427                             (resource != NULL &&
1428                             (curproc->p_zone->zone_rootpathlen - 1 +
1429                             strlen(resource)) > MAXPATHLEN)) {
1430                                 error = ENAMETOOLONG;
1431                         }
1432 
1433                         pn_free(&rpn);
1434                         pn_free(&pn);
1435                 }
1436 
1437                 if (error)
1438                         goto errout;
1439 
1440                 /*
1441                  * Prevent path name resolution from proceeding past
1442                  * the mount point.
1443                  */
1444                 if (vn_vfswlock(vp) != 0) {
1445                         error = EBUSY;
1446                         goto errout;
1447                 }
1448 
1449                 /*
1450                  * Verify that it's legitimate to establish a mount on
1451                  * the prospective mount point.
1452                  */
1453                 if (vn_mountedvfs(vp) != NULL) {
1454                         /*
1455                          * The mount point lock was obtained after some
1456                          * other thread raced through and established a mount.
1457                          */
1458                         vn_vfsunlock(vp);
1459                         error = EBUSY;
1460                         goto errout;
1461                 }
1462                 if (vp->v_flag & VNOMOUNT) {
1463                         vn_vfsunlock(vp);
1464                         error = EINVAL;
1465                         goto errout;
1466                 }
1467         }
1468         if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1469                 uap->dataptr = NULL;
1470                 uap->datalen = 0;
1471         }
1472 
1473         /*
1474          * If this is a remount, we don't want to create a new VFS.
1475          * Instead, we pass the existing one with a remount flag.
1476          */
1477         if (remount) {
1478                 /*
1479                  * Confirm that the mount point is the root vnode of the
1480                  * file system that is being remounted.
1481                  * This can happen if the user specifies a different
1482                  * mount point directory pathname in the (re)mount command.
1483                  *
1484                  * Code below can only be reached if splice is true, so it's
1485                  * safe to do vn_vfsunlock() here.
1486                  */
1487                 if ((vp->v_flag & VROOT) == 0) {
1488                         vn_vfsunlock(vp);
1489                         error = ENOENT;
1490                         goto errout;
1491                 }
1492                 /*
1493                  * Disallow making file systems read-only unless file system
1494                  * explicitly allows it in its vfssw.  Ignore other flags.
1495                  */
1496                 if (rdonly && vn_is_readonly(vp) == 0 &&
1497                     (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1498                         vn_vfsunlock(vp);
1499                         error = EINVAL;
1500                         goto errout;
1501                 }
1502                 /*
1503                  * Disallow changing the NBMAND disposition of the file
1504                  * system on remounts.
1505                  */
1506                 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1507                     (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1508                         vn_vfsunlock(vp);
1509                         error = EINVAL;
1510                         goto errout;
1511                 }
1512                 vfsp = vp->v_vfsp;
1513                 ovflags = vfsp->vfs_flag;
1514                 vfsp->vfs_flag |= VFS_REMOUNT;
1515                 vfsp->vfs_flag &= ~VFS_RDONLY;
1516         } else {
1517                 vfsp = vfs_alloc(KM_SLEEP);
1518                 VFS_INIT(vfsp, vfsops, NULL);
1519         }
1520 
1521         VFS_HOLD(vfsp);
1522 
1523         if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1524                 if (!remount) {
1525                         if (splice)
1526                                 vn_vfsunlock(vp);
1527                         vfs_free(vfsp);
1528                 } else {
1529                         vn_vfsunlock(vp);
1530                         VFS_RELE(vfsp);
1531                 }
1532                 goto errout;
1533         }
1534 
1535         /*
1536          * PRIV_SYS_MOUNT doesn't mean you can become root.
1537          */
1538         if (vfsp->vfs_lofi_minor != 0) {
1539                 uap->flags |= MS_NOSUID;
1540                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1541         }
1542 
1543         /*
1544          * The vfs_reflock is not used anymore the code below explicitly
1545          * holds it preventing others accesing it directly.
1546          */
1547         if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1548             !(vfsp->vfs_flag & VFS_REMOUNT))
1549                 cmn_err(CE_WARN,
1550                     "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1551 
1552         /*
1553          * Lock the vfs. If this is a remount we want to avoid spurious umount
1554          * failures that happen as a side-effect of fsflush() and other mount
1555          * and unmount operations that might be going on simultaneously and
1556          * may have locked the vfs currently. To not return EBUSY immediately
1557          * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1558          */
1559         if (!remount) {
1560                 if (error = vfs_lock(vfsp)) {
1561                         vfsp->vfs_flag = ovflags;
1562 
1563                         lofi_remove(vfsp);
1564 
1565                         if (splice)
1566                                 vn_vfsunlock(vp);
1567                         vfs_free(vfsp);
1568                         goto errout;
1569                 }
1570         } else {
1571                 vfs_lock_wait(vfsp);
1572         }
1573 
1574         /*
1575          * Add device to mount in progress table, global mounts require special
1576          * handling. It is possible that we have already done the lookupname
1577          * on a spliced, non-global fs. If so, we don't want to do it again
1578          * since we cannot do a lookupname after taking the
1579          * wlock above. This case is for a non-spliced, non-global filesystem.
1580          */
1581         if (!addmip) {
1582                 if ((uap->flags & MS_GLOBAL) == 0 &&
1583                     lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1584                         addmip = 1;
1585                 }
1586         }
1587 
1588         if (addmip) {
1589                 vnode_t *lvp = NULL;
1590 
1591                 error = vfs_get_lofi(vfsp, &lvp);
1592                 if (error > 0) {
1593                         lofi_remove(vfsp);
1594 
1595                         if (splice)
1596                                 vn_vfsunlock(vp);
1597                         vfs_unlock(vfsp);
1598 
1599                         if (remount) {
1600                                 VFS_RELE(vfsp);
1601                         } else {
1602                                 vfs_free(vfsp);
1603                         }
1604 
1605                         goto errout;
1606                 } else if (error == -1) {
1607                         bdev = bvp->v_rdev;
1608                         VN_RELE(bvp);
1609                 } else {
1610                         bdev = lvp->v_rdev;
1611                         VN_RELE(lvp);
1612                         VN_RELE(bvp);
1613                 }
1614 
1615                 vfs_addmip(bdev, vfsp);
1616                 addmip = 0;
1617                 delmip = 1;
1618         }
1619         /*
1620          * Invalidate cached entry for the mount point.
1621          */
1622         if (splice)
1623                 dnlc_purge_vp(vp);
1624 
1625         /*
1626          * If have an option string but the filesystem doesn't supply a
1627          * prototype options table, create a table with the global
1628          * options and sufficient room to accept all the options in the
1629          * string.  Then parse the passed in option string
1630          * accepting all the options in the string.  This gives us an
1631          * option table with all the proper cancel properties for the
1632          * global options.
1633          *
1634          * Filesystems that supply a prototype options table are handled
1635          * earlier in this function.
1636          */
1637         if (uap->flags & MS_OPTIONSTR) {
1638                 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1639                         mntopts_t tmp_mntopts;
1640 
1641                         tmp_mntopts.mo_count = 0;
1642                         vfs_createopttbl_extend(&tmp_mntopts, inargs,
1643                             &mnt_mntopts);
1644                         vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1645                         vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1646                         vfs_freeopttbl(&tmp_mntopts);
1647                 }
1648         }
1649 
1650         /*
1651          * Serialize with zone state transitions.
1652          * See vfs_list_add; zone mounted into is:
1653          *      zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1654          * not the zone doing the mount (curproc->p_zone), but if we're already
1655          * inside a NGZ, then we know what zone we are.
1656          */
1657         if (INGLOBALZONE(curproc)) {
1658                 zone = zone_find_by_path(mountpt);
1659                 ASSERT(zone != NULL);
1660         } else {
1661                 zone = curproc->p_zone;
1662                 /*
1663                  * zone_find_by_path does a hold, so do one here too so that
1664                  * we can do a zone_rele after mount_completed.
1665                  */
1666                 zone_hold(zone);
1667         }
1668         mount_in_progress(zone);
1669         /*
1670          * Instantiate (or reinstantiate) the file system.  If appropriate,
1671          * splice it into the file system name space.
1672          *
1673          * We want VFS_MOUNT() to be able to override the vfs_resource
1674          * string if necessary (ie, mntfs), and also for a remount to
1675          * change the same (necessary when remounting '/' during boot).
1676          * So we set up vfs_mntpt and vfs_resource to what we think they
1677          * should be, then hand off control to VFS_MOUNT() which can
1678          * override this.
1679          *
1680          * For safety's sake, when changing vfs_resource or vfs_mntpt of
1681          * a vfs which is on the vfs list (i.e. during a remount), we must
1682          * never set those fields to NULL. Several bits of code make
1683          * assumptions that the fields are always valid.
1684          */
1685         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1686         if (remount) {
1687                 if ((oldresource = vfsp->vfs_resource) != NULL)
1688                         refstr_hold(oldresource);
1689                 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1690                         refstr_hold(oldmntpt);
1691         }
1692         vfs_setresource(vfsp, resource, 0);
1693         vfs_setmntpoint(vfsp, mountpt, 0);
1694 
1695         /*
1696          * going to mount on this vnode, so notify.
1697          */
1698         vnevent_mountedover(vp, NULL);
1699         error = VFS_MOUNT(vfsp, vp, uap, credp);
1700 
1701         if (uap->flags & MS_RDONLY)
1702                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1703         if (uap->flags & MS_NOSUID)
1704                 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1705         if (uap->flags & MS_GLOBAL)
1706                 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1707 
1708         if (error) {
1709                 lofi_remove(vfsp);
1710 
1711                 if (remount) {
1712                         /* put back pre-remount options */
1713                         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1714                         vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1715                             VFSSP_VERBATIM);
1716                         if (oldmntpt)
1717                                 refstr_rele(oldmntpt);
1718                         vfs_setresource(vfsp, refstr_value(oldresource),
1719                             VFSSP_VERBATIM);
1720                         if (oldresource)
1721                                 refstr_rele(oldresource);
1722                         vfsp->vfs_flag = ovflags;
1723                         vfs_unlock(vfsp);
1724                         VFS_RELE(vfsp);
1725                 } else {
1726                         vfs_unlock(vfsp);
1727                         vfs_freemnttab(vfsp);
1728                         vfs_free(vfsp);
1729                 }
1730         } else {
1731                 /*
1732                  * Set the mount time to now
1733                  */
1734                 vfsp->vfs_mtime = ddi_get_time();
1735                 if (remount) {
1736                         vfsp->vfs_flag &= ~VFS_REMOUNT;
1737                         if (oldresource)
1738                                 refstr_rele(oldresource);
1739                         if (oldmntpt)
1740                                 refstr_rele(oldmntpt);
1741                 } else if (splice) {
1742                         /*
1743                          * Link vfsp into the name space at the mount
1744                          * point. Vfs_add() is responsible for
1745                          * holding the mount point which will be
1746                          * released when vfs_remove() is called.
1747                          */
1748                         vfs_add(vp, vfsp, uap->flags);
1749                 } else {
1750                         /*
1751                          * Hold the reference to file system which is
1752                          * not linked into the name space.
1753                          */
1754                         vfsp->vfs_zone = NULL;
1755                         VFS_HOLD(vfsp);
1756                         vfsp->vfs_vnodecovered = NULL;
1757                 }
1758                 /*
1759                  * Set flags for global options encountered
1760                  */
1761                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1762                         vfsp->vfs_flag |= VFS_RDONLY;
1763                 else
1764                         vfsp->vfs_flag &= ~VFS_RDONLY;
1765                 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1766                         vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1767                 } else {
1768                         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1769                                 vfsp->vfs_flag |= VFS_NODEVICES;
1770                         else
1771                                 vfsp->vfs_flag &= ~VFS_NODEVICES;
1772                         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1773                                 vfsp->vfs_flag |= VFS_NOSETUID;
1774                         else
1775                                 vfsp->vfs_flag &= ~VFS_NOSETUID;
1776                 }
1777                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1778                         vfsp->vfs_flag |= VFS_NBMAND;
1779                 else
1780                         vfsp->vfs_flag &= ~VFS_NBMAND;
1781 
1782                 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1783                         vfsp->vfs_flag |= VFS_XATTR;
1784                 else
1785                         vfsp->vfs_flag &= ~VFS_XATTR;
1786 
1787                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1788                         vfsp->vfs_flag |= VFS_NOEXEC;
1789                 else
1790                         vfsp->vfs_flag &= ~VFS_NOEXEC;
1791 
1792                 /*
1793                  * Now construct the output option string of options
1794                  * we recognized.
1795                  */
1796                 if (uap->flags & MS_OPTIONSTR) {
1797                         vfs_list_read_lock();
1798                         copyout_error = vfs_buildoptionstr(
1799                             &vfsp->vfs_mntopts, inargs, optlen);
1800                         vfs_list_unlock();
1801                         if (copyout_error == 0 &&
1802                             (uap->flags & MS_SYSSPACE) == 0) {
1803                                 copyout_error = copyoutstr(inargs, opts,
1804                                     optlen, NULL);
1805                         }
1806                 }
1807 
1808                 /*
1809                  * If this isn't a remount, set up the vopstats before
1810                  * anyone can touch this. We only allow spliced file
1811                  * systems (file systems which are in the namespace) to
1812                  * have the VFS_STATS flag set.
1813                  * NOTE: PxFS mounts the underlying file system with
1814                  * MS_NOSPLICE set and copies those vfs_flags to its private
1815                  * vfs structure. As a result, PxFS should never have
1816                  * the VFS_STATS flag or else we might access the vfs
1817                  * statistics-related fields prior to them being
1818                  * properly initialized.
1819                  */
1820                 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1821                         initialize_vopstats(&vfsp->vfs_vopstats);
1822                         /*
1823                          * We need to set vfs_vskap to NULL because there's
1824                          * a chance it won't be set below.  This is checked
1825                          * in teardown_vopstats() so we can't have garbage.
1826                          */
1827                         vfsp->vfs_vskap = NULL;
1828                         vfsp->vfs_flag |= VFS_STATS;
1829                         vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1830                 }
1831 
1832                 if (vswp->vsw_flag & VSW_XID)
1833                         vfsp->vfs_flag |= VFS_XID;
1834 
1835                 vfs_unlock(vfsp);
1836         }
1837         mount_completed(zone);
1838         zone_rele(zone);
1839         if (splice)
1840                 vn_vfsunlock(vp);
1841 
1842         if ((error == 0) && (copyout_error == 0)) {
1843                 if (!remount) {
1844                         /*
1845                          * Don't call get_vskstat_anchor() while holding
1846                          * locks since it allocates memory and calls
1847                          * VFS_STATVFS().  For NFS, the latter can generate
1848                          * an over-the-wire call.
1849                          */
1850                         vskap = get_vskstat_anchor(vfsp);
1851                         /* Only take the lock if we have something to do */
1852                         if (vskap != NULL) {
1853                                 vfs_lock_wait(vfsp);
1854                                 if (vfsp->vfs_flag & VFS_STATS) {
1855                                         vfsp->vfs_vskap = vskap;
1856                                 }
1857                                 vfs_unlock(vfsp);
1858                         }
1859                 }
1860                 /* Return vfsp to caller. */
1861                 *vfspp = vfsp;
1862         }
1863 errout:
1864         vfs_freeopttbl(&mnt_mntopts);
1865         if (resource != NULL)
1866                 kmem_free(resource, strlen(resource) + 1);
1867         if (mountpt != NULL)
1868                 kmem_free(mountpt, strlen(mountpt) + 1);
1869         /*
1870          * It is possible we errored prior to adding to mount in progress
1871          * table. Must free vnode we acquired with successful lookupname.
1872          */
1873         if (addmip)
1874                 VN_RELE(bvp);
1875         if (delmip)
1876                 vfs_delmip(vfsp);
1877         ASSERT(vswp != NULL);
1878         vfs_unrefvfssw(vswp);
1879         if (inargs != opts)
1880                 kmem_free(inargs, MAX_MNTOPT_STR);
1881         if (copyout_error) {
1882                 lofi_remove(vfsp);
1883                 VFS_RELE(vfsp);
1884                 error = copyout_error;
1885         }
1886         return (error);
1887 }
1888 
1889 static void
1890 vfs_setpath(
1891     struct vfs *vfsp,           /* vfs being updated */
1892     refstr_t **refp,            /* Ref-count string to contain the new path */
1893     const char *newpath,        /* Path to add to refp (above) */
1894     uint32_t flag)              /* flag */
1895 {
1896         size_t len;
1897         refstr_t *ref;
1898         zone_t *zone = curproc->p_zone;
1899         char *sp;
1900         int have_list_lock = 0;
1901 
1902         ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1903 
1904         /*
1905          * New path must be less than MAXPATHLEN because mntfs
1906          * will only display up to MAXPATHLEN bytes. This is currently
1907          * safe, because domount() uses pn_get(), and other callers
1908          * similarly cap the size to fewer than MAXPATHLEN bytes.
1909          */
1910 
1911         ASSERT(strlen(newpath) < MAXPATHLEN);
1912 
1913         /* mntfs requires consistency while vfs list lock is held */
1914 
1915         if (VFS_ON_LIST(vfsp)) {
1916                 have_list_lock = 1;
1917                 vfs_list_lock();
1918         }
1919 
1920         if (*refp != NULL)
1921                 refstr_rele(*refp);
1922 
1923         /*
1924          * If we are in a non-global zone then we prefix the supplied path,
1925          * newpath, with the zone's root path, with two exceptions. The first
1926          * is where we have been explicitly directed to avoid doing so; this
1927          * will be the case following a failed remount, where the path supplied
1928          * will be a saved version which must now be restored. The second
1929          * exception is where newpath is not a pathname but a descriptive name,
1930          * e.g. "procfs".
1931          */
1932         if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1933                 ref = refstr_alloc(newpath);
1934                 goto out;
1935         }
1936 
1937         /*
1938          * Truncate the trailing '/' in the zoneroot, and merge
1939          * in the zone's rootpath with the "newpath" (resource
1940          * or mountpoint) passed in.
1941          *
1942          * The size of the required buffer is thus the size of
1943          * the buffer required for the passed-in newpath
1944          * (strlen(newpath) + 1), plus the size of the buffer
1945          * required to hold zone_rootpath (zone_rootpathlen)
1946          * minus one for one of the now-superfluous NUL
1947          * terminations, minus one for the trailing '/'.
1948          *
1949          * That gives us:
1950          *
1951          * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1952          *
1953          * Which is what we have below.
1954          */
1955 
1956         len = strlen(newpath) + zone->zone_rootpathlen - 1;
1957         sp = kmem_alloc(len, KM_SLEEP);
1958 
1959         /*
1960          * Copy everything including the trailing slash, which
1961          * we then overwrite with the NUL character.
1962          */
1963 
1964         (void) strcpy(sp, zone->zone_rootpath);
1965         sp[zone->zone_rootpathlen - 2] = '\0';
1966         (void) strcat(sp, newpath);
1967 
1968         ref = refstr_alloc(sp);
1969         kmem_free(sp, len);
1970 out:
1971         *refp = ref;
1972 
1973         if (have_list_lock) {
1974                 vfs_mnttab_modtimeupd();
1975                 vfs_list_unlock();
1976         }
1977 }
1978 
1979 /*
1980  * Record a mounted resource name in a vfs structure.
1981  * If vfsp is already mounted, caller must hold the vfs lock.
1982  */
1983 void
1984 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1985 {
1986         if (resource == NULL || resource[0] == '\0')
1987                 resource = VFS_NORESOURCE;
1988         vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1989 }
1990 
1991 /*
1992  * Record a mount point name in a vfs structure.
1993  * If vfsp is already mounted, caller must hold the vfs lock.
1994  */
1995 void
1996 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1997 {
1998         if (mntpt == NULL || mntpt[0] == '\0')
1999                 mntpt = VFS_NOMNTPT;
2000         vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
2001 }
2002 
2003 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
2004 
2005 refstr_t *
2006 vfs_getresource(const struct vfs *vfsp)
2007 {
2008         refstr_t *resource;
2009 
2010         vfs_list_read_lock();
2011         resource = vfsp->vfs_resource;
2012         refstr_hold(resource);
2013         vfs_list_unlock();
2014 
2015         return (resource);
2016 }
2017 
2018 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
2019 
2020 refstr_t *
2021 vfs_getmntpoint(const struct vfs *vfsp)
2022 {
2023         refstr_t *mntpt;
2024 
2025         vfs_list_read_lock();
2026         mntpt = vfsp->vfs_mntpt;
2027         refstr_hold(mntpt);
2028         vfs_list_unlock();
2029 
2030         return (mntpt);
2031 }
2032 
2033 /*
2034  * Create an empty options table with enough empty slots to hold all
2035  * The options in the options string passed as an argument.
2036  * Potentially prepend another options table.
2037  *
2038  * Note: caller is responsible for locking the vfs list, if needed,
2039  *       to protect mops.
2040  */
2041 static void
2042 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
2043     const mntopts_t *mtmpl)
2044 {
2045         const char *s = opts;
2046         uint_t count;
2047 
2048         if (opts == NULL || *opts == '\0') {
2049                 count = 0;
2050         } else {
2051                 count = 1;
2052 
2053                 /*
2054                  * Count number of options in the string
2055                  */
2056                 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
2057                         count++;
2058                         s++;
2059                 }
2060         }
2061         vfs_copyopttbl_extend(mtmpl, mops, count);
2062 }
2063 
2064 /*
2065  * Create an empty options table with enough empty slots to hold all
2066  * The options in the options string passed as an argument.
2067  *
2068  * This function is *not* for general use by filesystems.
2069  *
2070  * Note: caller is responsible for locking the vfs list, if needed,
2071  *       to protect mops.
2072  */
2073 void
2074 vfs_createopttbl(mntopts_t *mops, const char *opts)
2075 {
2076         vfs_createopttbl_extend(mops, opts, NULL);
2077 }
2078 
2079 
2080 /*
2081  * Swap two mount options tables
2082  */
2083 static void
2084 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2085 {
2086         uint_t tmpcnt;
2087         mntopt_t *tmplist;
2088 
2089         tmpcnt = optbl2->mo_count;
2090         tmplist = optbl2->mo_list;
2091         optbl2->mo_count = optbl1->mo_count;
2092         optbl2->mo_list = optbl1->mo_list;
2093         optbl1->mo_count = tmpcnt;
2094         optbl1->mo_list = tmplist;
2095 }
2096 
2097 static void
2098 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2099 {
2100         vfs_list_lock();
2101         vfs_swapopttbl_nolock(optbl1, optbl2);
2102         vfs_mnttab_modtimeupd();
2103         vfs_list_unlock();
2104 }
2105 
2106 static char **
2107 vfs_copycancelopt_extend(char **const moc, int extend)
2108 {
2109         int i = 0;
2110         int j;
2111         char **result;
2112 
2113         if (moc != NULL) {
2114                 for (; moc[i] != NULL; i++)
2115                         /* count number of options to cancel */;
2116         }
2117 
2118         if (i + extend == 0)
2119                 return (NULL);
2120 
2121         result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2122 
2123         for (j = 0; j < i; j++) {
2124                 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2125                 (void) strcpy(result[j], moc[j]);
2126         }
2127         for (; j <= i + extend; j++)
2128                 result[j] = NULL;
2129 
2130         return (result);
2131 }
2132 
2133 static void
2134 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2135 {
2136         char *sp, *dp;
2137 
2138         d->mo_flags = s->mo_flags;
2139         d->mo_data = s->mo_data;
2140         sp = s->mo_name;
2141         if (sp != NULL) {
2142                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2143                 (void) strcpy(dp, sp);
2144                 d->mo_name = dp;
2145         } else {
2146                 d->mo_name = NULL; /* should never happen */
2147         }
2148 
2149         d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2150 
2151         sp = s->mo_arg;
2152         if (sp != NULL) {
2153                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2154                 (void) strcpy(dp, sp);
2155                 d->mo_arg = dp;
2156         } else {
2157                 d->mo_arg = NULL;
2158         }
2159 }
2160 
2161 /*
2162  * Copy a mount options table, possibly allocating some spare
2163  * slots at the end.  It is permissible to copy_extend the NULL table.
2164  */
2165 static void
2166 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2167 {
2168         uint_t i, count;
2169         mntopt_t *motbl;
2170 
2171         /*
2172          * Clear out any existing stuff in the options table being initialized
2173          */
2174         vfs_freeopttbl(dmo);
2175         count = (smo == NULL) ? 0 : smo->mo_count;
2176         if ((count + extra) == 0)       /* nothing to do */
2177                 return;
2178         dmo->mo_count = count + extra;
2179         motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2180         dmo->mo_list = motbl;
2181         for (i = 0; i < count; i++) {
2182                 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2183         }
2184         for (i = count; i < count + extra; i++) {
2185                 motbl[i].mo_flags = MO_EMPTY;
2186         }
2187 }
2188 
2189 /*
2190  * Copy a mount options table.
2191  *
2192  * This function is *not* for general use by filesystems.
2193  *
2194  * Note: caller is responsible for locking the vfs list, if needed,
2195  *       to protect smo and dmo.
2196  */
2197 void
2198 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2199 {
2200         vfs_copyopttbl_extend(smo, dmo, 0);
2201 }
2202 
2203 static char **
2204 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2205 {
2206         int c1 = 0;
2207         int c2 = 0;
2208         char **result;
2209         char **sp1, **sp2, **dp;
2210 
2211         /*
2212          * First we count both lists of cancel options.
2213          * If either is NULL or has no elements, we return a copy of
2214          * the other.
2215          */
2216         if (mop1->mo_cancel != NULL) {
2217                 for (; mop1->mo_cancel[c1] != NULL; c1++)
2218                         /* count cancel options in mop1 */;
2219         }
2220 
2221         if (c1 == 0)
2222                 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2223 
2224         if (mop2->mo_cancel != NULL) {
2225                 for (; mop2->mo_cancel[c2] != NULL; c2++)
2226                         /* count cancel options in mop2 */;
2227         }
2228 
2229         result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2230 
2231         if (c2 == 0)
2232                 return (result);
2233 
2234         /*
2235          * When we get here, we've got two sets of cancel options;
2236          * we need to merge the two sets.  We know that the result
2237          * array has "c1+c2+1" entries and in the end we might shrink
2238          * it.
2239          * Result now has a copy of the c1 entries from mop1; we'll
2240          * now lookup all the entries of mop2 in mop1 and copy it if
2241          * it is unique.
2242          * This operation is O(n^2) but it's only called once per
2243          * filesystem per duplicate option.  This is a situation
2244          * which doesn't arise with the filesystems in ON and
2245          * n is generally 1.
2246          */
2247 
2248         dp = &result[c1];
2249         for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2250                 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2251                         if (strcmp(*sp1, *sp2) == 0)
2252                                 break;
2253                 }
2254                 if (*sp1 == NULL) {
2255                         /*
2256                          * Option *sp2 not found in mop1, so copy it.
2257                          * The calls to vfs_copycancelopt_extend()
2258                          * guarantee that there's enough room.
2259                          */
2260                         *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2261                         (void) strcpy(*dp++, *sp2);
2262                 }
2263         }
2264         if (dp != &result[c1+c2]) {
2265                 size_t bytes = (dp - result + 1) * sizeof (char *);
2266                 char **nres = kmem_alloc(bytes, KM_SLEEP);
2267 
2268                 bcopy(result, nres, bytes);
2269                 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2270                 result = nres;
2271         }
2272         return (result);
2273 }
2274 
2275 /*
2276  * Merge two mount option tables (outer and inner) into one.  This is very
2277  * similar to "merging" global variables and automatic variables in C.
2278  *
2279  * This isn't (and doesn't have to be) fast.
2280  *
2281  * This function is *not* for general use by filesystems.
2282  *
2283  * Note: caller is responsible for locking the vfs list, if needed,
2284  *       to protect omo, imo & dmo.
2285  */
2286 void
2287 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2288 {
2289         uint_t i, count;
2290         mntopt_t *mop, *motbl;
2291         uint_t freeidx;
2292 
2293         /*
2294          * First determine how much space we need to allocate.
2295          */
2296         count = omo->mo_count;
2297         for (i = 0; i < imo->mo_count; i++) {
2298                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2299                         continue;
2300                 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2301                         count++;
2302         }
2303         ASSERT(count >= omo->mo_count &&
2304             count <= omo->mo_count + imo->mo_count);
2305         motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2306         for (i = 0; i < omo->mo_count; i++)
2307                 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2308         freeidx = omo->mo_count;
2309         for (i = 0; i < imo->mo_count; i++) {
2310                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2311                         continue;
2312                 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2313                         char **newcanp;
2314                         uint_t index = mop - omo->mo_list;
2315 
2316                         newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2317 
2318                         vfs_freeopt(&motbl[index]);
2319                         vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2320 
2321                         vfs_freecancelopt(motbl[index].mo_cancel);
2322                         motbl[index].mo_cancel = newcanp;
2323                 } else {
2324                         /*
2325                          * If it's a new option, just copy it over to the first
2326                          * free location.
2327                          */
2328                         vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2329                 }
2330         }
2331         dmo->mo_count = count;
2332         dmo->mo_list = motbl;
2333 }
2334 
2335 /*
2336  * Functions to set and clear mount options in a mount options table.
2337  */
2338 
2339 /*
2340  * Clear a mount option, if it exists.
2341  *
2342  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2343  * the vfs list.
2344  */
2345 static void
2346 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2347 {
2348         struct mntopt *mop;
2349         uint_t i, count;
2350 
2351         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2352 
2353         count = mops->mo_count;
2354         for (i = 0; i < count; i++) {
2355                 mop = &mops->mo_list[i];
2356 
2357                 if (mop->mo_flags & MO_EMPTY)
2358                         continue;
2359                 if (strcmp(opt, mop->mo_name))
2360                         continue;
2361                 mop->mo_flags &= ~MO_SET;
2362                 if (mop->mo_arg != NULL) {
2363                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2364                 }
2365                 mop->mo_arg = NULL;
2366                 if (update_mnttab)
2367                         vfs_mnttab_modtimeupd();
2368                 break;
2369         }
2370 }
2371 
2372 void
2373 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2374 {
2375         int gotlock = 0;
2376 
2377         if (VFS_ON_LIST(vfsp)) {
2378                 gotlock = 1;
2379                 vfs_list_lock();
2380         }
2381         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2382         if (gotlock)
2383                 vfs_list_unlock();
2384 }
2385 
2386 
2387 /*
2388  * Set a mount option on.  If it's not found in the table, it's silently
2389  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2390  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2391  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2392  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2393  * MO_EMPTY set is created as the option passed in.
2394  *
2395  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2396  * the vfs list.
2397  */
2398 static void
2399 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2400     const char *arg, int flags, int update_mnttab)
2401 {
2402         mntopt_t *mop;
2403         uint_t i, count;
2404         char *sp;
2405 
2406         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2407 
2408         if (flags & VFS_CREATEOPT) {
2409                 if (vfs_hasopt(mops, opt) != NULL) {
2410                         flags &= ~VFS_CREATEOPT;
2411                 }
2412         }
2413         count = mops->mo_count;
2414         for (i = 0; i < count; i++) {
2415                 mop = &mops->mo_list[i];
2416 
2417                 if (mop->mo_flags & MO_EMPTY) {
2418                         if ((flags & VFS_CREATEOPT) == 0)
2419                                 continue;
2420                         sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2421                         (void) strcpy(sp, opt);
2422                         mop->mo_name = sp;
2423                         if (arg != NULL)
2424                                 mop->mo_flags = MO_HASVALUE;
2425                         else
2426                                 mop->mo_flags = 0;
2427                 } else if (strcmp(opt, mop->mo_name)) {
2428                         continue;
2429                 }
2430                 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2431                         break;
2432                 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2433                         sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2434                         (void) strcpy(sp, arg);
2435                 } else {
2436                         sp = NULL;
2437                 }
2438                 if (mop->mo_arg != NULL)
2439                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2440                 mop->mo_arg = sp;
2441                 if (flags & VFS_DISPLAY)
2442                         mop->mo_flags &= ~MO_NODISPLAY;
2443                 if (flags & VFS_NODISPLAY)
2444                         mop->mo_flags |= MO_NODISPLAY;
2445                 mop->mo_flags |= MO_SET;
2446                 if (mop->mo_cancel != NULL) {
2447                         char **cp;
2448 
2449                         for (cp = mop->mo_cancel; *cp != NULL; cp++)
2450                                 vfs_clearmntopt_nolock(mops, *cp, 0);
2451                 }
2452                 if (update_mnttab)
2453                         vfs_mnttab_modtimeupd();
2454                 break;
2455         }
2456 }
2457 
2458 void
2459 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2460 {
2461         int gotlock = 0;
2462 
2463         if (VFS_ON_LIST(vfsp)) {
2464                 gotlock = 1;
2465                 vfs_list_lock();
2466         }
2467         vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2468         if (gotlock)
2469                 vfs_list_unlock();
2470 }
2471 
2472 
2473 /*
2474  * Add a "tag" option to a mounted file system's options list.
2475  *
2476  * Note: caller is responsible for locking the vfs list, if needed,
2477  *       to protect mops.
2478  */
2479 static mntopt_t *
2480 vfs_addtag(mntopts_t *mops, const char *tag)
2481 {
2482         uint_t count;
2483         mntopt_t *mop, *motbl;
2484 
2485         count = mops->mo_count + 1;
2486         motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2487         if (mops->mo_count) {
2488                 size_t len = (count - 1) * sizeof (mntopt_t);
2489 
2490                 bcopy(mops->mo_list, motbl, len);
2491                 kmem_free(mops->mo_list, len);
2492         }
2493         mops->mo_count = count;
2494         mops->mo_list = motbl;
2495         mop = &motbl[count - 1];
2496         mop->mo_flags = MO_TAG;
2497         mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2498         (void) strcpy(mop->mo_name, tag);
2499         return (mop);
2500 }
2501 
2502 /*
2503  * Allow users to set arbitrary "tags" in a vfs's mount options.
2504  * Broader use within the kernel is discouraged.
2505  */
2506 int
2507 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2508     cred_t *cr)
2509 {
2510         vfs_t *vfsp;
2511         mntopts_t *mops;
2512         mntopt_t *mop;
2513         int found = 0;
2514         dev_t dev = makedevice(major, minor);
2515         int err = 0;
2516         char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2517 
2518         /*
2519          * Find the desired mounted file system
2520          */
2521         vfs_list_lock();
2522         vfsp = rootvfs;
2523         do {
2524                 if (vfsp->vfs_dev == dev &&
2525                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2526                         found = 1;
2527                         break;
2528                 }
2529                 vfsp = vfsp->vfs_next;
2530         } while (vfsp != rootvfs);
2531 
2532         if (!found) {
2533                 err = EINVAL;
2534                 goto out;
2535         }
2536         err = secpolicy_fs_config(cr, vfsp);
2537         if (err != 0)
2538                 goto out;
2539 
2540         mops = &vfsp->vfs_mntopts;
2541         /*
2542          * Add tag if it doesn't already exist
2543          */
2544         if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2545                 int len;
2546 
2547                 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2548                 len = strlen(buf);
2549                 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2550                         err = ENAMETOOLONG;
2551                         goto out;
2552                 }
2553                 mop = vfs_addtag(mops, tag);
2554         }
2555         if ((mop->mo_flags & MO_TAG) == 0) {
2556                 err = EINVAL;
2557                 goto out;
2558         }
2559         vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2560 out:
2561         vfs_list_unlock();
2562         kmem_free(buf, MAX_MNTOPT_STR);
2563         return (err);
2564 }
2565 
2566 /*
2567  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2568  * Broader use within the kernel is discouraged.
2569  */
2570 int
2571 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2572     cred_t *cr)
2573 {
2574         vfs_t *vfsp;
2575         mntopt_t *mop;
2576         int found = 0;
2577         dev_t dev = makedevice(major, minor);
2578         int err = 0;
2579 
2580         /*
2581          * Find the desired mounted file system
2582          */
2583         vfs_list_lock();
2584         vfsp = rootvfs;
2585         do {
2586                 if (vfsp->vfs_dev == dev &&
2587                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2588                         found = 1;
2589                         break;
2590                 }
2591                 vfsp = vfsp->vfs_next;
2592         } while (vfsp != rootvfs);
2593 
2594         if (!found) {
2595                 err = EINVAL;
2596                 goto out;
2597         }
2598         err = secpolicy_fs_config(cr, vfsp);
2599         if (err != 0)
2600                 goto out;
2601 
2602         if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2603                 err = EINVAL;
2604                 goto out;
2605         }
2606         if ((mop->mo_flags & MO_TAG) == 0) {
2607                 err = EINVAL;
2608                 goto out;
2609         }
2610         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2611 out:
2612         vfs_list_unlock();
2613         return (err);
2614 }
2615 
2616 /*
2617  * Function to parse an option string and fill in a mount options table.
2618  * Unknown options are silently ignored.  The input option string is modified
2619  * by replacing separators with nulls.  If the create flag is set, options
2620  * not found in the table are just added on the fly.  The table must have
2621  * an option slot marked MO_EMPTY to add an option on the fly.
2622  *
2623  * This function is *not* for general use by filesystems.
2624  *
2625  * Note: caller is responsible for locking the vfs list, if needed,
2626  *       to protect mops..
2627  */
2628 void
2629 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2630 {
2631         char *s = osp, *p, *nextop, *valp, *cp, *ep;
2632         int setflg = VFS_NOFORCEOPT;
2633 
2634         if (osp == NULL)
2635                 return;
2636         while (*s != '\0') {
2637                 p = strchr(s, ',');     /* find next option */
2638                 if (p == NULL) {
2639                         cp = NULL;
2640                         p = s + strlen(s);
2641                 } else {
2642                         cp = p;         /* save location of comma */
2643                         *p++ = '\0';    /* mark end and point to next option */
2644                 }
2645                 nextop = p;
2646                 p = strchr(s, '=');     /* look for value */
2647                 if (p == NULL) {
2648                         valp = NULL;    /* no value supplied */
2649                 } else {
2650                         ep = p;         /* save location of equals */
2651                         *p++ = '\0';    /* end option and point to value */
2652                         valp = p;
2653                 }
2654                 /*
2655                  * set option into options table
2656                  */
2657                 if (create)
2658                         setflg |= VFS_CREATEOPT;
2659                 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2660                 if (cp != NULL)
2661                         *cp = ',';      /* restore the comma */
2662                 if (valp != NULL)
2663                         *ep = '=';      /* restore the equals */
2664                 s = nextop;
2665         }
2666 }
2667 
2668 /*
2669  * Function to inquire if an option exists in a mount options table.
2670  * Returns a pointer to the option if it exists, else NULL.
2671  *
2672  * This function is *not* for general use by filesystems.
2673  *
2674  * Note: caller is responsible for locking the vfs list, if needed,
2675  *       to protect mops.
2676  */
2677 struct mntopt *
2678 vfs_hasopt(const mntopts_t *mops, const char *opt)
2679 {
2680         struct mntopt *mop;
2681         uint_t i, count;
2682 
2683         count = mops->mo_count;
2684         for (i = 0; i < count; i++) {
2685                 mop = &mops->mo_list[i];
2686 
2687                 if (mop->mo_flags & MO_EMPTY)
2688                         continue;
2689                 if (strcmp(opt, mop->mo_name) == 0)
2690                         return (mop);
2691         }
2692         return (NULL);
2693 }
2694 
2695 /*
2696  * Function to inquire if an option is set in a mount options table.
2697  * Returns non-zero if set and fills in the arg pointer with a pointer to
2698  * the argument string or NULL if there is no argument string.
2699  */
2700 static int
2701 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2702 {
2703         struct mntopt *mop;
2704         uint_t i, count;
2705 
2706         count = mops->mo_count;
2707         for (i = 0; i < count; i++) {
2708                 mop = &mops->mo_list[i];
2709 
2710                 if (mop->mo_flags & MO_EMPTY)
2711                         continue;
2712                 if (strcmp(opt, mop->mo_name))
2713                         continue;
2714                 if ((mop->mo_flags & MO_SET) == 0)
2715                         return (0);
2716                 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2717                         *argp = mop->mo_arg;
2718                 return (1);
2719         }
2720         return (0);
2721 }
2722 
2723 
2724 int
2725 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2726 {
2727         int ret;
2728 
2729         vfs_list_read_lock();
2730         ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2731         vfs_list_unlock();
2732         return (ret);
2733 }
2734 
2735 
2736 /*
2737  * Construct a comma separated string of the options set in the given
2738  * mount table, return the string in the given buffer.  Return non-zero if
2739  * the buffer would overflow.
2740  *
2741  * This function is *not* for general use by filesystems.
2742  *
2743  * Note: caller is responsible for locking the vfs list, if needed,
2744  *       to protect mp.
2745  */
2746 int
2747 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2748 {
2749         char *cp;
2750         uint_t i;
2751 
2752         buf[0] = '\0';
2753         cp = buf;
2754         for (i = 0; i < mp->mo_count; i++) {
2755                 struct mntopt *mop;
2756 
2757                 mop = &mp->mo_list[i];
2758                 if (mop->mo_flags & MO_SET) {
2759                         int optlen, comma = 0;
2760 
2761                         if (buf[0] != '\0')
2762                                 comma = 1;
2763                         optlen = strlen(mop->mo_name);
2764                         if (strlen(buf) + comma + optlen + 1 > len)
2765                                 goto err;
2766                         if (comma)
2767                                 *cp++ = ',';
2768                         (void) strcpy(cp, mop->mo_name);
2769                         cp += optlen;
2770                         /*
2771                          * Append option value if there is one
2772                          */
2773                         if (mop->mo_arg != NULL) {
2774                                 int arglen;
2775 
2776                                 arglen = strlen(mop->mo_arg);
2777                                 if (strlen(buf) + arglen + 2 > len)
2778                                         goto err;
2779                                 *cp++ = '=';
2780                                 (void) strcpy(cp, mop->mo_arg);
2781                                 cp += arglen;
2782                         }
2783                 }
2784         }
2785         return (0);
2786 err:
2787         return (EOVERFLOW);
2788 }
2789 
2790 static void
2791 vfs_freecancelopt(char **moc)
2792 {
2793         if (moc != NULL) {
2794                 int ccnt = 0;
2795                 char **cp;
2796 
2797                 for (cp = moc; *cp != NULL; cp++) {
2798                         kmem_free(*cp, strlen(*cp) + 1);
2799                         ccnt++;
2800                 }
2801                 kmem_free(moc, (ccnt + 1) * sizeof (char *));
2802         }
2803 }
2804 
2805 static void
2806 vfs_freeopt(mntopt_t *mop)
2807 {
2808         if (mop->mo_name != NULL)
2809                 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2810 
2811         vfs_freecancelopt(mop->mo_cancel);
2812 
2813         if (mop->mo_arg != NULL)
2814                 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2815 }
2816 
2817 /*
2818  * Free a mount options table
2819  *
2820  * This function is *not* for general use by filesystems.
2821  *
2822  * Note: caller is responsible for locking the vfs list, if needed,
2823  *       to protect mp.
2824  */
2825 void
2826 vfs_freeopttbl(mntopts_t *mp)
2827 {
2828         uint_t i, count;
2829 
2830         count = mp->mo_count;
2831         for (i = 0; i < count; i++) {
2832                 vfs_freeopt(&mp->mo_list[i]);
2833         }
2834         if (count) {
2835                 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2836                 mp->mo_count = 0;
2837                 mp->mo_list = NULL;
2838         }
2839 }
2840 
2841 
/*
 * Read entry point installed on the dummy mnttab vnode.  It ignores all of
 * its arguments, transfers no data and always returns 0.
 */
/* ARGSUSED */
static int
vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
        caller_context_t *ct)
{
        return (0);
}
2849 
/*
 * Write entry point installed on the dummy mnttab vnode.  It ignores all
 * of its arguments, transfers no data and always returns 0.
 */
/* ARGSUSED */
static int
vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
        caller_context_t *ct)
{
        return (0);
}
2857 
2858 /*
2859  * The dummy vnode is currently used only by file events notification
2860  * module which is just interested in the timestamps.
2861  */
2862 /* ARGSUSED */
2863 static int
2864 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2865     caller_context_t *ct)
2866 {
2867         bzero(vap, sizeof (vattr_t));
2868         vap->va_type = VREG;
2869         vap->va_nlink = 1;
2870         vap->va_ctime = vfs_mnttab_ctime;
2871         /*
2872          * it is ok to just copy mtime as the time will be monotonically
2873          * increasing.
2874          */
2875         vap->va_mtime = vfs_mnttab_mtime;
2876         vap->va_atime = vap->va_mtime;
2877         return (0);
2878 }
2879 
2880 static void
2881 vfs_mnttabvp_setup(void)
2882 {
2883         vnode_t *tvp;
2884         vnodeops_t *vfs_mntdummyvnops;
2885         const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2886                 VOPNAME_READ,           { .vop_read = vfs_mntdummyread },
2887                 VOPNAME_WRITE,          { .vop_write = vfs_mntdummywrite },
2888                 VOPNAME_GETATTR,        { .vop_getattr = vfs_mntdummygetattr },
2889                 VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
2890                 NULL,                   NULL
2891         };
2892 
2893         if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2894             &vfs_mntdummyvnops) != 0) {
2895                 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2896                 /* Shouldn't happen, but not bad enough to panic */
2897                 return;
2898         }
2899 
2900         /*
2901          * A global dummy vnode is allocated to represent mntfs files.
2902          * The mntfs file (/etc/mnttab) can be monitored for file events
2903          * and receive an event when mnttab changes. Dummy VOP calls
2904          * will be made on this vnode. The file events notification module
2905          * intercepts this vnode and delivers relevant events.
2906          */
2907         tvp = vn_alloc(KM_SLEEP);
2908         tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2909         vn_setops(tvp, vfs_mntdummyvnops);
2910         tvp->v_type = VREG;
2911         /*
2912          * The mnt dummy ops do not reference v_data.
2913          * No other module intercepting this vnode should either.
2914          * Just set it to point to itself.
2915          */
2916         tvp->v_data = (caddr_t)tvp;
2917         tvp->v_vfsp = rootvfs;
2918         vfs_mntdummyvp = tvp;
2919 }
2920 
2921 /*
2922  * performs fake read/write ops
2923  */
2924 static void
2925 vfs_mnttab_rwop(int rw)
2926 {
2927         struct uio      uio;
2928         struct iovec    iov;
2929         char    buf[1];
2930 
2931         if (vfs_mntdummyvp == NULL)
2932                 return;
2933 
2934         bzero(&uio, sizeof (uio));
2935         bzero(&iov, sizeof (iov));
2936         iov.iov_base = buf;
2937         iov.iov_len = 0;
2938         uio.uio_iov = &iov;
2939         uio.uio_iovcnt = 1;
2940         uio.uio_loffset = 0;
2941         uio.uio_segflg = UIO_SYSSPACE;
2942         uio.uio_resid = 0;
2943         if (rw) {
2944                 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2945         } else {
2946                 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2947         }
2948 }
2949 
/*
 * Generate a write operation: issues a fake, zero-length write on the
 * dummy mnttab vnode via vfs_mnttab_rwop().
 */
void
vfs_mnttab_writeop(void)
{
        vfs_mnttab_rwop(1);
}
2958 
/*
 * Generate a read operation: issues a fake, zero-length read on the
 * dummy mnttab vnode via vfs_mnttab_rwop().
 */
void
vfs_mnttab_readop(void)
{
        vfs_mnttab_rwop(0);
}
2967 
/*
 * Free any mnttab information recorded in the vfs struct: the mount point
 * and resource reference strings are released (and their pointers
 * cleared), and the mount options table is freed.
 * The vfs must not be on the vfs list.
 */
static void
vfs_freemnttab(struct vfs *vfsp)
{
        ASSERT(!VFS_ON_LIST(vfsp));

        /*
         * Free device and mount point information
         */
        if (vfsp->vfs_mntpt != NULL) {
                refstr_rele(vfsp->vfs_mntpt);
                vfsp->vfs_mntpt = NULL;
        }
        if (vfsp->vfs_resource != NULL) {
                refstr_rele(vfsp->vfs_resource);
                vfsp->vfs_resource = NULL;
        }
        /*
         * Now free mount options information
         */
        vfs_freeopttbl(&vfsp->vfs_mntopts);
}
2993 
/*
 * Return the last mnttab modification time in *ts.
 * The caller must hold the vfs list lock (asserted below).
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
        ASSERT(RW_LOCK_HELD(&vfslist));
        *ts = vfs_mnttab_mtime;
}
3003 
3004 /*
3005  * See if mnttab is changed
3006  */
3007 void
3008 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
3009 {
3010         int changed;
3011 
3012         *phpp = (struct pollhead *)NULL;
3013 
3014         /*
3015          * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
3016          * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
3017          * to not grab the vfs list lock because tv_sec is monotonically
3018          * increasing.
3019          */
3020 
3021         changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
3022             (old->tv_sec != vfs_mnttab_mtime.tv_sec);
3023         if (!changed) {
3024                 *phpp = &vfs_pollhd;
3025         }
3026 }
3027 
3028 /* Provide a unique and monotonically-increasing timestamp. */
3029 void
3030 vfs_mono_time(timespec_t *ts)
3031 {
3032         static volatile hrtime_t hrt;           /* The saved time. */
3033         hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
3034         timespec_t      newts;
3035 
3036         /*
3037          * Try gethrestime() first, but be prepared to fabricate a sensible
3038          * answer at the first sign of any trouble.
3039          */
3040         gethrestime(&newts);
3041         newhrt = ts2hrt(&newts);
3042         for (;;) {
3043                 oldhrt = hrt;
3044                 if (newhrt <= hrt)
3045                         newhrt = hrt + 1;
3046                 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
3047                         break;
3048         }
3049         hrt2ts(newhrt, ts);
3050 }
3051 
3052 /*
3053  * Update the mnttab modification time and wake up any waiters for
3054  * mnttab changes
3055  */
3056 void
3057 vfs_mnttab_modtimeupd()
3058 {
3059         hrtime_t oldhrt, newhrt;
3060 
3061         ASSERT(RW_WRITE_HELD(&vfslist));
3062         oldhrt = ts2hrt(&vfs_mnttab_mtime);
3063         gethrestime(&vfs_mnttab_mtime);
3064         newhrt = ts2hrt(&vfs_mnttab_mtime);
3065         if (oldhrt == (hrtime_t)0)
3066                 vfs_mnttab_ctime = vfs_mnttab_mtime;
3067         /*
3068          * Attempt to provide unique mtime (like uniqtime but not).
3069          */
3070         if (newhrt == oldhrt) {
3071                 newhrt++;
3072                 hrt2ts(newhrt, &vfs_mnttab_mtime);
3073         }
3074         pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
3075         vfs_mnttab_writeop();
3076 }
3077 
3078 int
3079 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3080 {
3081         vnode_t *coveredvp;
3082         int error;
3083         extern void teardown_vopstats(vfs_t *);
3084 
3085         /*
3086          * Get covered vnode. This will be NULL if the vfs is not linked
3087          * into the file system name space (i.e., domount() with MNT_NOSPICE).
3088          */
3089         coveredvp = vfsp->vfs_vnodecovered;
3090         ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3091 
3092         /*
3093          * Purge all dnlc entries for this vfs.
3094          */
3095         (void) dnlc_purge_vfsp(vfsp, 0);
3096 
3097         /* For forcible umount, skip VFS_SYNC() since it may hang */
3098         if ((flag & MS_FORCE) == 0)
3099                 (void) VFS_SYNC(vfsp, 0, cr);
3100 
3101         /*
3102          * Lock the vfs to maintain fs status quo during unmount.  This
3103          * has to be done after the sync because ufs_update tries to acquire
3104          * the vfs_reflock.
3105          */
3106         vfs_lock_wait(vfsp);
3107 
3108         if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3109                 vfs_unlock(vfsp);
3110                 if (coveredvp != NULL)
3111                         vn_vfsunlock(coveredvp);
3112         } else if (coveredvp != NULL) {
3113                 teardown_vopstats(vfsp);
3114                 /*
3115                  * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3116                  * when it frees vfsp so we do a VN_HOLD() so we can
3117                  * continue to use coveredvp afterwards.
3118                  */
3119                 VN_HOLD(coveredvp);
3120                 vfs_remove(vfsp);
3121                 vn_vfsunlock(coveredvp);
3122                 VN_RELE(coveredvp);
3123         } else {
3124                 teardown_vopstats(vfsp);
3125                 /*
3126                  * Release the reference to vfs that is not linked
3127                  * into the name space.
3128                  */
3129                 vfs_unlock(vfsp);
3130                 VFS_RELE(vfsp);
3131         }
3132         return (error);
3133 }
3134 
3135 
/*
 * Vfs_unmountall() is called by uadmin() to unmount all
 * mounted file systems (except the root file system) during shutdown.
 * It follows the existing locking protocol when traversing the vfs list
 * to sync and unmount vfses. Even though there should be no
 * other thread running while the system is shutting down, it is prudent
 * to still follow the locking protocol.
 *
 * The list is walked backwards, from the entry before rootvfs towards
 * rootvfs.  Entries whose vfs lock or covered-vnode lock cannot be
 * obtained are skipped; unmount failures are ignored.
 */
void
vfs_unmountall(void)
{
        struct vfs *vfsp;
        struct vfs *prev_vfsp = NULL;
        int error;

        /*
         * Toss all dnlc entries now so that the per-vfs sync
         * and unmount operations don't have to slog through
         * a bunch of uninteresting vnodes over and over again.
         */
        dnlc_purge();

        vfs_list_lock();
        for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
                /* Remember where to continue before vfsp can go away. */
                prev_vfsp = vfsp->vfs_prev;

                /* Skip file systems that are busy being locked elsewhere. */
                if (vfs_lock(vfsp) != 0)
                        continue;
                /*
                 * Take the covered vnode's vfs write lock; the vfs lock
                 * itself is only held while doing so.
                 */
                error = vn_vfswlock(vfsp->vfs_vnodecovered);
                vfs_unlock(vfsp);
                if (error)
                        continue;

                /* The sync and unmount run without the vfs list lock. */
                vfs_list_unlock();

                (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
                (void) dounmount(vfsp, 0, CRED());

                /*
                 * Since we dropped the vfslist lock above we must
                 * verify that prev_vfsp still exists, else start over
                 * from the end of the list.
                 */
                vfs_list_lock();
                for (vfsp = rootvfs->vfs_prev;
                    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
                        if (vfsp == prev_vfsp)
                                break;
                if (vfsp == rootvfs && prev_vfsp != rootvfs)
                        prev_vfsp = rootvfs->vfs_prev;
        }
        vfs_list_unlock();
}
3188 
3189 /*
3190  * Called to add an entry to the end of the vfs mount in progress list
3191  */
3192 void
3193 vfs_addmip(dev_t dev, struct vfs *vfsp)
3194 {
3195         struct ipmnt *mipp;
3196 
3197         mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3198         mipp->mip_next = NULL;
3199         mipp->mip_dev = dev;
3200         mipp->mip_vfsp = vfsp;
3201         mutex_enter(&vfs_miplist_mutex);
3202         if (vfs_miplist_end != NULL)
3203                 vfs_miplist_end->mip_next = mipp;
3204         else
3205                 vfs_miplist = mipp;
3206         vfs_miplist_end = mipp;
3207         mutex_exit(&vfs_miplist_mutex);
3208 }
3209 
3210 /*
3211  * Called to remove an entry from the mount in progress list
3212  * Either because the mount completed or it failed.
3213  */
3214 void
3215 vfs_delmip(struct vfs *vfsp)
3216 {
3217         struct ipmnt *mipp, *mipprev;
3218 
3219         mutex_enter(&vfs_miplist_mutex);
3220         mipprev = NULL;
3221         for (mipp = vfs_miplist;
3222             mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3223                 mipprev = mipp;
3224         }
3225         if (mipp == NULL)
3226                 return; /* shouldn't happen */
3227         if (mipp == vfs_miplist_end)
3228                 vfs_miplist_end = mipprev;
3229         if (mipprev == NULL)
3230                 vfs_miplist = mipp->mip_next;
3231         else
3232                 mipprev->mip_next = mipp->mip_next;
3233         mutex_exit(&vfs_miplist_mutex);
3234         kmem_free(mipp, sizeof (struct ipmnt));
3235 }
3236 
3237 /*
3238  * vfs_add is called by a specific filesystem's mount routine to add
3239  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3240  * The vfs should already have been locked by the caller.
3241  *
3242  * coveredvp is NULL if this is the root.
3243  */
3244 void
3245 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3246 {
3247         int newflag;
3248 
3249         ASSERT(vfs_lock_held(vfsp));
3250         VFS_HOLD(vfsp);
3251         newflag = vfsp->vfs_flag;
3252         if (mflag & MS_RDONLY)
3253                 newflag |= VFS_RDONLY;
3254         else
3255                 newflag &= ~VFS_RDONLY;
3256         if (mflag & MS_NOSUID)
3257                 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3258         else
3259                 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3260         if (mflag & MS_NOMNTTAB)
3261                 newflag |= VFS_NOMNTTAB;
3262         else
3263                 newflag &= ~VFS_NOMNTTAB;
3264 
3265         if (coveredvp != NULL) {
3266                 ASSERT(vn_vfswlock_held(coveredvp));
3267                 coveredvp->v_vfsmountedhere = vfsp;
3268                 VN_HOLD(coveredvp);
3269         }
3270         vfsp->vfs_vnodecovered = coveredvp;
3271         vfsp->vfs_flag = newflag;
3272 
3273         vfs_list_add(vfsp);
3274 }
3275 
3276 /*
3277  * Remove a vfs from the vfs list, null out the pointer from the
3278  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3279  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3280  * reference to the vfs and to the covered vnode.
3281  *
3282  * Called from dounmount after it's confirmed with the file system
3283  * that the unmount is legal.
3284  */
3285 void
3286 vfs_remove(struct vfs *vfsp)
3287 {
3288         vnode_t *vp;
3289 
3290         ASSERT(vfs_lock_held(vfsp));
3291 
3292         /*
3293          * Can't unmount root.  Should never happen because fs will
3294          * be busy.
3295          */
3296         if (vfsp == rootvfs)
3297                 panic("vfs_remove: unmounting root");
3298 
3299         vfs_list_remove(vfsp);
3300 
3301         /*
3302          * Unhook from the file system name space.
3303          */
3304         vp = vfsp->vfs_vnodecovered;
3305         ASSERT(vn_vfswlock_held(vp));
3306         vp->v_vfsmountedhere = NULL;
3307         vfsp->vfs_vnodecovered = NULL;
3308         VN_RELE(vp);
3309 
3310         /*
3311          * Release lock and wakeup anybody waiting.
3312          */
3313         vfs_unlock(vfsp);
3314         VFS_RELE(vfsp);
3315 }
3316 
3317 /*
3318  * Lock a filesystem to prevent access to it while mounting,
3319  * unmounting and syncing.  Return EBUSY immediately if lock
3320  * can't be acquired.
3321  */
3322 int
3323 vfs_lock(vfs_t *vfsp)
3324 {
3325         vn_vfslocks_entry_t *vpvfsentry;
3326 
3327         vpvfsentry = vn_vfslocks_getlock(vfsp);
3328         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3329                 return (0);
3330 
3331         vn_vfslocks_rele(vpvfsentry);
3332         return (EBUSY);
3333 }
3334 
3335 int
3336 vfs_rlock(vfs_t *vfsp)
3337 {
3338         vn_vfslocks_entry_t *vpvfsentry;
3339 
3340         vpvfsentry = vn_vfslocks_getlock(vfsp);
3341 
3342         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3343                 return (0);
3344 
3345         vn_vfslocks_rele(vpvfsentry);
3346         return (EBUSY);
3347 }
3348 
3349 void
3350 vfs_lock_wait(vfs_t *vfsp)
3351 {
3352         vn_vfslocks_entry_t *vpvfsentry;
3353 
3354         vpvfsentry = vn_vfslocks_getlock(vfsp);
3355         rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3356 }
3357 
3358 void
3359 vfs_rlock_wait(vfs_t *vfsp)
3360 {
3361         vn_vfslocks_entry_t *vpvfsentry;
3362 
3363         vpvfsentry = vn_vfslocks_getlock(vfsp);
3364         rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3365 }
3366 
/*
 * Unlock a locked filesystem.  This is a no-op once the system is
 * panicking.
 */
void
vfs_unlock(vfs_t *vfsp)
{
        vn_vfslocks_entry_t *vpvfsentry;

        /*
         * vfs_unlock will mimic sema_v behaviour to fix 4748018.
         * And these changes should remain for the patch changes as it is.
         */
        if (panicstr)
                return;

        /*
         * ve_refcount needs to be dropped twice here.
         * 1. To release the reference taken by the call to
         *    vn_vfslocks_getlock() just below.
         * 2. To release the reference still held from the locking routine
         *    that acquired the lock (vfs_lock, vfs_rlock, vfs_lock_wait
         *    or vfs_rlock_wait).
         */

        vpvfsentry = vn_vfslocks_getlock(vfsp);
        vn_vfslocks_rele(vpvfsentry);

        /* Drop the lock itself before giving up the locker's reference. */
        rwst_exit(&vpvfsentry->ve_lock);
        vn_vfslocks_rele(vpvfsentry);
}
3395 
3396 /*
3397  * Utility routine that allows a filesystem to construct its
3398  * fsid in "the usual way" - by munging some underlying dev_t and
3399  * the filesystem type number into the 64-bit fsid.  Note that
3400  * this implicitly relies on dev_t persistence to make filesystem
3401  * id's persistent.
3402  *
3403  * There's nothing to prevent an individual fs from constructing its
3404  * fsid in a different way, and indeed they should.
3405  *
3406  * Since we want fsids to be 32-bit quantities (so that they can be
3407  * exported identically by either 32-bit or 64-bit APIs, as well as
3408  * the fact that fsid's are "known" to NFS), we compress the device
3409  * number given down to 32-bits, and panic if that isn't possible.
3410  */
3411 void
3412 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3413 {
3414         if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3415                 panic("device number too big for fsid!");
3416         fsi->val[1] = val;
3417 }
3418 
3419 int
3420 vfs_lock_held(vfs_t *vfsp)
3421 {
3422         int held;
3423         vn_vfslocks_entry_t *vpvfsentry;
3424 
3425         /*
3426          * vfs_lock_held will mimic sema_held behaviour
3427          * if panicstr is set. And these changes should remain
3428          * for the patch changes as it is.
3429          */
3430         if (panicstr)
3431                 return (1);
3432 
3433         vpvfsentry = vn_vfslocks_getlock(vfsp);
3434         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3435 
3436         vn_vfslocks_rele(vpvfsentry);
3437         return (held);
3438 }
3439 
3440 struct _kthread *
3441 vfs_lock_owner(vfs_t *vfsp)
3442 {
3443         struct _kthread *owner;
3444         vn_vfslocks_entry_t *vpvfsentry;
3445 
3446         /*
3447          * vfs_wlock_held will mimic sema_held behaviour
3448          * if panicstr is set. And these changes should remain
3449          * for the patch changes as it is.
3450          */
3451         if (panicstr)
3452                 return (NULL);
3453 
3454         vpvfsentry = vn_vfslocks_getlock(vfsp);
3455         owner = rwst_owner(&vpvfsentry->ve_lock);
3456 
3457         vn_vfslocks_rele(vpvfsentry);
3458         return (owner);
3459 }
3460 
3461 /*
3462  * vfs list locking.
3463  *
3464  * Rather than manipulate the vfslist lock directly, we abstract into lock
3465  * and unlock routines to allow the locking implementation to be changed for
3466  * clustering.
3467  *
3468  * Whenever the vfs list is modified through its hash links, the overall list
3469  * lock must be obtained before locking the relevant hash bucket.  But to see
3470  * whether a given vfs is on the list, it suffices to obtain the lock for the
3471  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3472  */
3473 
3474 void
3475 vfs_list_lock()
3476 {
3477         rw_enter(&vfslist, RW_WRITER);
3478 }
3479 
3480 void
3481 vfs_list_read_lock()
3482 {
3483         rw_enter(&vfslist, RW_READER);
3484 }
3485 
3486 void
3487 vfs_list_unlock()
3488 {
3489         rw_exit(&vfslist);
3490 }
3491 
3492 /*
3493  * Low level worker routines for adding entries to and removing entries from
3494  * the vfs list.
3495  */
3496 
3497 static void
3498 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3499 {
3500         int vhno;
3501         struct vfs **hp;
3502         dev_t dev;
3503 
3504         ASSERT(RW_WRITE_HELD(&vfslist));
3505 
3506         dev = expldev(vfsp->vfs_fsid.val[0]);
3507         vhno = VFSHASH(getmajor(dev), getminor(dev));
3508 
3509         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3510 
3511         /*
3512          * Link into the hash table, inserting it at the end, so that LOFS
3513          * with the same fsid as UFS (or other) file systems will not hide the
3514          * UFS.
3515          */
3516         if (insert_at_head) {
3517                 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3518                 rvfs_list[vhno].rvfs_head = vfsp;
3519         } else {
3520                 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3521                     hp = &(*hp)->vfs_hash)
3522                         continue;
3523                 /*
3524                  * hp now contains the address of the pointer to update
3525                  * to effect the insertion.
3526                  */
3527                 vfsp->vfs_hash = NULL;
3528                 *hp = vfsp;
3529         }
3530 
3531         rvfs_list[vhno].rvfs_len++;
3532         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3533 }
3534 
3535 
3536 static void
3537 vfs_hash_remove(struct vfs *vfsp)
3538 {
3539         int vhno;
3540         struct vfs *tvfsp;
3541         dev_t dev;
3542 
3543         ASSERT(RW_WRITE_HELD(&vfslist));
3544 
3545         dev = expldev(vfsp->vfs_fsid.val[0]);
3546         vhno = VFSHASH(getmajor(dev), getminor(dev));
3547 
3548         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3549 
3550         /*
3551          * Remove from hash.
3552          */
3553         if (rvfs_list[vhno].rvfs_head == vfsp) {
3554                 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3555                 rvfs_list[vhno].rvfs_len--;
3556                 goto foundit;
3557         }
3558         for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3559             tvfsp = tvfsp->vfs_hash) {
3560                 if (tvfsp->vfs_hash == vfsp) {
3561                         tvfsp->vfs_hash = vfsp->vfs_hash;
3562                         rvfs_list[vhno].rvfs_len--;
3563                         goto foundit;
3564                 }
3565         }
3566         cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3567 
3568 foundit:
3569 
3570         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3571 }
3572 
3573 
3574 void
3575 vfs_list_add(struct vfs *vfsp)
3576 {
3577         zone_t *zone;
3578 
3579         /*
3580          * Typically, the vfs_t will have been created on behalf of the file
3581          * system in vfs_init, where it will have been provided with a
3582          * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3583          * by an unbundled file system. We therefore check for such an example
3584          * before stamping the vfs_t with its creation time for the benefit of
3585          * mntfs.
3586          */
3587         if (vfsp->vfs_implp == NULL)
3588                 vfsimpl_setup(vfsp);
3589         vfs_mono_time(&vfsp->vfs_hrctime);
3590 
3591         /*
3592          * The zone that owns the mount is the one that performed the mount.
3593          * Note that this isn't necessarily the same as the zone mounted into.
3594          * The corresponding zone_rele_ref() will be done when the vfs_t
3595          * is being free'd.
3596          */
3597         vfsp->vfs_zone = curproc->p_zone;
3598         zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3599         zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3600             ZONE_REF_VFS);
3601 
3602         /*
3603          * Find the zone mounted into, and put this mount on its vfs list.
3604          */
3605         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3606         ASSERT(zone != NULL);
3607         /*
3608          * Special casing for the root vfs.  This structure is allocated
3609          * statically and hooked onto rootvfs at link time.  During the
3610          * vfs_mountroot call at system startup time, the root file system's
3611          * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3612          * as argument.  The code below must detect and handle this special
3613          * case.  The only apparent justification for this special casing is
3614          * to ensure that the root file system appears at the head of the
3615          * list.
3616          *
3617          * XXX: I'm assuming that it's ok to do normal list locking when
3618          *      adding the entry for the root file system (this used to be
3619          *      done with no locks held).
3620          */
3621         vfs_list_lock();
3622         /*
3623          * Link into the vfs list proper.
3624          */
3625         if (vfsp == &root) {
3626                 /*
3627                  * Assert: This vfs is already on the list as its first entry.
3628                  * Thus, there's nothing to do.
3629                  */
3630                 ASSERT(rootvfs == vfsp);
3631                 /*
3632                  * Add it to the head of the global zone's vfslist.
3633                  */
3634                 ASSERT(zone == global_zone);
3635                 ASSERT(zone->zone_vfslist == NULL);
3636                 zone->zone_vfslist = vfsp;
3637         } else {
3638                 /*
3639                  * Link to end of list using vfs_prev (as rootvfs is now a
3640                  * doubly linked circular list) so list is in mount order for
3641                  * mnttab use.
3642                  */
3643                 rootvfs->vfs_prev->vfs_next = vfsp;
3644                 vfsp->vfs_prev = rootvfs->vfs_prev;
3645                 rootvfs->vfs_prev = vfsp;
3646                 vfsp->vfs_next = rootvfs;
3647 
3648                 /*
3649                  * Do it again for the zone-private list (which may be NULL).
3650                  */
3651                 if (zone->zone_vfslist == NULL) {
3652                         ASSERT(zone != global_zone);
3653                         zone->zone_vfslist = vfsp;
3654                 } else {
3655                         zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3656                         vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3657                         zone->zone_vfslist->vfs_zone_prev = vfsp;
3658                         vfsp->vfs_zone_next = zone->zone_vfslist;
3659                 }
3660         }
3661 
3662         /*
3663          * Link into the hash table, inserting it at the end, so that LOFS
3664          * with the same fsid as UFS (or other) file systems will not hide
3665          * the UFS.
3666          */
3667         vfs_hash_add(vfsp, 0);
3668 
3669         /*
3670          * Link into tree indexed by mntpoint, for vfs_mntpoint2vfsp
3671          * mntix discerns entries with the same key
3672          */
3673         vfsp->vfs_mntix = ++vfs_curr_mntix;
3674         avl_add(&vfs_by_dev, vfsp);
3675 
3676         /*
3677          * Link into tree indexed by dev, for vfs_devismounted
3678          */
3679         avl_add(&vfs_by_mntpnt, vfsp);
3680 
3681         /*
3682          * update the mnttab modification time
3683          */
3684         vfs_mnttab_modtimeupd();
3685         vfs_list_unlock();
3686         zone_rele(zone);
3687 }
3688 
3689 void
3690 vfs_list_remove(struct vfs *vfsp)
3691 {
3692         zone_t *zone;
3693 
3694         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3695         ASSERT(zone != NULL);
3696         /*
3697          * Callers are responsible for preventing attempts to unmount the
3698          * root.
3699          */
3700         ASSERT(vfsp != rootvfs);
3701 
3702         vfs_list_lock();
3703 
3704         /*
3705          * Remove from avl trees
3706          */
3707         avl_remove(&vfs_by_mntpnt, vfsp);
3708         avl_remove(&vfs_by_dev, vfsp);
3709 
3710         /*
3711          * Remove from hash.
3712          */
3713         vfs_hash_remove(vfsp);
3714 
3715         /*
3716          * Remove from vfs list.
3717          */
3718         vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3719         vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3720         vfsp->vfs_next = vfsp->vfs_prev = NULL;
3721 
3722         /*
3723          * Remove from zone-specific vfs list.
3724          */
3725         if (zone->zone_vfslist == vfsp)
3726                 zone->zone_vfslist = vfsp->vfs_zone_next;
3727 
3728         if (vfsp->vfs_zone_next == vfsp) {
3729                 ASSERT(vfsp->vfs_zone_prev == vfsp);
3730                 ASSERT(zone->zone_vfslist == vfsp);
3731                 zone->zone_vfslist = NULL;
3732         }
3733 
3734         vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3735         vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3736         vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3737 
3738         /*
3739          * update the mnttab modification time
3740          */
3741         vfs_mnttab_modtimeupd();
3742         vfs_list_unlock();
3743         zone_rele(zone);
3744 }
3745 
3746 struct vfs *
3747 getvfs(fsid_t *fsid)
3748 {
3749         struct vfs *vfsp;
3750         int val0 = fsid->val[0];
3751         int val1 = fsid->val[1];
3752         dev_t dev = expldev(val0);
3753         int vhno = VFSHASH(getmajor(dev), getminor(dev));
3754         kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3755 
3756         mutex_enter(hmp);
3757         for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3758                 if (vfsp->vfs_fsid.val[0] == val0 &&
3759                     vfsp->vfs_fsid.val[1] == val1) {
3760                         VFS_HOLD(vfsp);
3761                         mutex_exit(hmp);
3762                         return (vfsp);
3763                 }
3764         }
3765         mutex_exit(hmp);
3766         return (NULL);
3767 }
3768 
3769 /*
3770  * Search the vfs mount in progress list for a specified device/vfs entry.
3771  * Returns 0 if the first entry in the list that the device matches has the
3772  * given vfs pointer as well.  If the device matches but a different vfs
3773  * pointer is encountered in the list before the given vfs pointer then
3774  * a 1 is returned.
3775  */
3776 
3777 int
3778 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3779 {
3780         int retval = 0;
3781         struct ipmnt *mipp;
3782 
3783         mutex_enter(&vfs_miplist_mutex);
3784         for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3785                 if (mipp->mip_dev == dev) {
3786                         if (mipp->mip_vfsp != vfsp)
3787                                 retval = 1;
3788                         break;
3789                 }
3790         }
3791         mutex_exit(&vfs_miplist_mutex);
3792         return (retval);
3793 }
3794 
3795 /*
3796  * Search the vfs list for a specified device.  Returns 1, if entry is found
3797  * or 0 if no suitable entry is found.
3798  */
3799 
3800 int
3801 vfs_devismounted(dev_t dev)
3802 {
3803         struct vfs *vfsp;
3804         int found = 0;
3805         struct vfs search;
3806         avl_index_t index;
3807 
3808         search.vfs_dev = dev;
3809         search.vfs_mntix = 0;
3810 
3811         vfs_list_read_lock();
3812 
3813         /*
3814          * there might be several entries with the same dev in the tree,
3815          * only discerned by mntix. To find the first, we start with a mntix
3816          * of 0. The search will fail. The following avl_nearest will give
3817          * us the actual first entry.
3818          */
3819         VERIFY(avl_find(&vfs_by_dev, &search, &index) == NULL);
3820         vfsp = avl_nearest(&vfs_by_dev, index, AVL_AFTER);
3821 
3822         if (vfsp != NULL && vfsp->vfs_dev == dev)
3823                 found = 1;
3824 
3825         vfs_list_unlock();
3826         return (found);
3827 }
3828 
3829 /*
3830  * Search the vfs list for a specified device.  Returns a pointer to it
3831  * or NULL if no suitable entry is found. The caller of this routine
3832  * is responsible for releasing the returned vfs pointer.
3833  */
3834 struct vfs *
3835 vfs_dev2vfsp(dev_t dev)
3836 {
3837         struct vfs *vfsp;
3838         int found;
3839         struct vfs search;
3840         avl_index_t index;
3841 
3842         search.vfs_dev = dev;
3843         search.vfs_mntix = 0;
3844 
3845         vfs_list_read_lock();
3846 
3847         /*
3848          * there might be several entries with the same dev in the tree,
3849          * only discerned by mntix. To find the first, we start with a mntix
3850          * of 0. The search will fail. The following avl_nearest will give
3851          * us the actual first entry.
3852          */
3853         VERIFY(avl_find(&vfs_by_dev, &search, &index) == NULL);
3854         vfsp = avl_nearest(&vfs_by_dev, index, AVL_AFTER);
3855 
3856         found = 0;
3857         while (vfsp != NULL && vfsp->vfs_dev == dev) {
3858                 /*
3859                  * The following could be made more efficient by making
3860                  * the entire loop use vfs_zone_next if the call is from
3861                  * a zone.  The only callers, however, ustat(2) and
3862                  * umount2(2), don't seem to justify the added
3863                  * complexity at present.
3864                  */
3865                 if (ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3866                     curproc->p_zone)) {
3867                         VFS_HOLD(vfsp);
3868                         found = 1;
3869                         break;
3870                 }
3871                 vfsp = AVL_NEXT(&vfs_by_dev, vfsp);
3872         }
3873         vfs_list_unlock();
3874         return (found ? vfsp : NULL);
3875 }
3876 
3877 /*
3878  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3879  * or NULL if no suitable entry is found. The caller of this routine
3880  * is responsible for releasing the returned vfs pointer.
3881  *
3882  * Note that if multiple mntpoints match, the last one matching is
3883  * returned in an attempt to return the "top" mount when overlay
3884  * mounts are covering the same mount point.  This is accomplished by starting
3885  * at the end of the list and working our way backwards, stopping at the first
3886  * matching mount.
3887  */
3888 struct vfs *
3889 vfs_mntpoint2vfsp(const char *mp)
3890 {
3891         struct vfs *vfsp;
3892         struct vfs *retvfsp = NULL;
3893         zone_t *zone = curproc->p_zone;
3894         struct vfs *list;
3895 
3896         vfs_list_read_lock();
3897         if (getzoneid() == GLOBAL_ZONEID) {
3898                 /*
3899                  * The global zone may see filesystems in any zone.
3900                  */
3901                 struct vfs search;
3902                 search.vfs_mntpt = refstr_alloc(mp);
3903                 search.vfs_mntix = UINT64_MAX;
3904                 avl_index_t index;
3905 
3906                 /*
3907                  * there might be several entries with the same mntpnt in the
3908                  * tree, only discerned by mntix. To find the last, we start
3909                  * with a mntix of UINT64_MAX. The search will fail. The
3910                  * following avl_nearest will give  us the actual last entry
3911                  * matching the mntpnt.
3912                  */
3913                 VERIFY(avl_find(&vfs_by_mntpnt, &search, &index) == 0);
3914                 vfsp = avl_nearest(&vfs_by_mntpnt, index, AVL_BEFORE);
3915 
3916                 refstr_rele(search.vfs_mntpt);
3917 
3918                 if (vfsp != NULL &&
3919                     strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0)
3920                         retvfsp = vfsp;
3921         } else if ((list = zone->zone_vfslist) != NULL) {
3922                 const char *mntpt;
3923 
3924                 vfsp = list->vfs_zone_prev;
3925                 do {
3926                         mntpt = refstr_value(vfsp->vfs_mntpt);
3927                         mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3928                         if (strcmp(mntpt, mp) == 0) {
3929                                 retvfsp = vfsp;
3930                                 break;
3931                         }
3932                         vfsp = vfsp->vfs_zone_prev;
3933                 } while (vfsp != list->vfs_zone_prev);
3934         }
3935         if (retvfsp)
3936                 VFS_HOLD(retvfsp);
3937         vfs_list_unlock();
3938         return (retvfsp);
3939 }
3940 
3941 /*
3942  * Search the vfs list for a specified vfsops.
3943  * if vfs entry is found then return 1, else 0.
3944  */
3945 int
3946 vfs_opsinuse(vfsops_t *ops)
3947 {
3948         struct vfs *vfsp;
3949         int found;
3950 
3951         vfs_list_read_lock();
3952         vfsp = rootvfs;
3953         found = 0;
3954         do {
3955                 if (vfs_getops(vfsp) == ops) {
3956                         found = 1;
3957                         break;
3958                 }
3959                 vfsp = vfsp->vfs_next;
3960         } while (vfsp != rootvfs);
3961         vfs_list_unlock();
3962         return (found);
3963 }
3964 
3965 /*
3966  * Allocate an entry in vfssw for a file system type
3967  */
3968 struct vfssw *
3969 allocate_vfssw(const char *type)
3970 {
3971         struct vfssw *vswp;
3972 
3973         if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3974                 /*
3975                  * The vfssw table uses the empty string to identify an
3976                  * available entry; we cannot add any type which has
3977                  * a leading NUL. The string length is limited to
3978                  * the size of the st_fstype array in struct stat.
3979                  */
3980                 return (NULL);
3981         }
3982 
3983         ASSERT(VFSSW_WRITE_LOCKED());
3984         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3985                 if (!ALLOCATED_VFSSW(vswp)) {
3986                         vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3987                         (void) strcpy(vswp->vsw_name, type);
3988                         ASSERT(vswp->vsw_count == 0);
3989                         vswp->vsw_count = 1;
3990                         mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3991                         return (vswp);
3992                 }
3993         return (NULL);
3994 }
3995 
/*
 * Translate a vfs type name into the name of the module that implements
 * it: "proc" maps to "procfs", "fd" maps to "fdfs", and any name that
 * begins with "nfs" maps to "nfs".  Every other name is returned
 * unchanged (the same pointer that was passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
        if (strcmp(vfstype, "proc") == 0)
                return ("procfs");

        if (strcmp(vfstype, "fd") == 0)
                return ("fdfs");

        /* Prefix match: covers "nfs" itself and names that extend it. */
        if (strncmp(vfstype, "nfs", 3) == 0)
                return ("nfs");

        return (vfstype);
}
4013 
4014 /*
4015  * Find a vfssw entry given a file system type name.
4016  * Try to autoload the filesystem if it's not found.
4017  * If it's installed, return the vfssw locked to prevent unloading.
4018  */
4019 struct vfssw *
4020 vfs_getvfssw(const char *type)
4021 {
4022         struct vfssw *vswp;
4023         const char *modname;
4024 
4025         RLOCK_VFSSW();
4026         vswp = vfs_getvfsswbyname(type);
4027         modname = vfs_to_modname(type);
4028 
4029         if (rootdir == NULL) {
4030                 /*
4031                  * If we haven't yet loaded the root file system, then our
4032                  * _init won't be called until later. Allocate vfssw entry,
4033                  * because mod_installfs won't be called.
4034                  */
4035                 if (vswp == NULL) {
4036                         RUNLOCK_VFSSW();
4037                         WLOCK_VFSSW();
4038                         if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
4039                                 if ((vswp = allocate_vfssw(type)) == NULL) {
4040                                         WUNLOCK_VFSSW();
4041                                         return (NULL);
4042                                 }
4043                         }
4044                         WUNLOCK_VFSSW();
4045                         RLOCK_VFSSW();
4046                 }
4047                 if (!VFS_INSTALLED(vswp)) {
4048                         RUNLOCK_VFSSW();
4049                         (void) modloadonly("fs", modname);
4050                 } else
4051                         RUNLOCK_VFSSW();
4052                 return (vswp);
4053         }
4054 
4055         /*
4056          * Try to load the filesystem.  Before calling modload(), we drop
4057          * our lock on the VFS switch table, and pick it up after the
4058          * module is loaded.  However, there is a potential race:  the
4059          * module could be unloaded after the call to modload() completes
4060          * but before we pick up the lock and drive on.  Therefore,
4061          * we keep reloading the module until we've loaded the module
4062          * _and_ we have the lock on the VFS switch table.
4063          */
4064         while (vswp == NULL || !VFS_INSTALLED(vswp)) {
4065                 RUNLOCK_VFSSW();
4066                 if (modload("fs", modname) == -1)
4067                         return (NULL);
4068                 RLOCK_VFSSW();
4069                 if (vswp == NULL)
4070                         if ((vswp = vfs_getvfsswbyname(type)) == NULL)
4071                                 break;
4072         }
4073         RUNLOCK_VFSSW();
4074 
4075         return (vswp);
4076 }
4077 
4078 /*
4079  * Find a vfssw entry given a file system type name.
4080  */
4081 struct vfssw *
4082 vfs_getvfsswbyname(const char *type)
4083 {
4084         struct vfssw *vswp;
4085 
4086         ASSERT(VFSSW_LOCKED());
4087         if (type == NULL || *type == '\0')
4088                 return (NULL);
4089 
4090         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4091                 if (strcmp(type, vswp->vsw_name) == 0) {
4092                         vfs_refvfssw(vswp);
4093                         return (vswp);
4094                 }
4095         }
4096 
4097         return (NULL);
4098 }
4099 
4100 /*
4101  * Find a vfssw entry given a set of vfsops.
4102  */
4103 struct vfssw *
4104 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
4105 {
4106         struct vfssw *vswp;
4107 
4108         RLOCK_VFSSW();
4109         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4110                 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
4111                         vfs_refvfssw(vswp);
4112                         RUNLOCK_VFSSW();
4113                         return (vswp);
4114                 }
4115         }
4116         RUNLOCK_VFSSW();
4117 
4118         return (NULL);
4119 }
4120 
4121 /*
4122  * Reference a vfssw entry.
4123  */
4124 void
4125 vfs_refvfssw(struct vfssw *vswp)
4126 {
4127 
4128         mutex_enter(&vswp->vsw_lock);
4129         vswp->vsw_count++;
4130         mutex_exit(&vswp->vsw_lock);
4131 }
4132 
4133 /*
4134  * Unreference a vfssw entry.
4135  */
4136 void
4137 vfs_unrefvfssw(struct vfssw *vswp)
4138 {
4139 
4140         mutex_enter(&vswp->vsw_lock);
4141         vswp->vsw_count--;
4142         mutex_exit(&vswp->vsw_lock);
4143 }
4144 
4145 int sync_timeout = 30;          /* timeout for syncing a page during panic */
4146 int sync_timeleft;              /* portion of sync_timeout remaining */
4147 
4148 static int sync_retries = 20;   /* number of retries when not making progress */
4149 static int sync_triesleft;      /* portion of sync_retries remaining */
4150 
4151 static pgcnt_t old_pgcnt, new_pgcnt;
4152 static int new_bufcnt, old_bufcnt;
4153 
4154 /*
4155  * Sync all of the mounted filesystems, and then wait for the actual i/o to
4156  * complete.  We wait by counting the number of dirty pages and buffers,
4157  * pushing them out using bio_busy() and page_busy(), and then counting again.
4158  * This routine is used during both the uadmin A_SHUTDOWN code as well as
4159  * the SYNC phase of the panic code (see comments in panic.c).  It should only
4160  * be used after some higher-level mechanism has quiesced the system so that
4161  * new writes are not being initiated while we are waiting for completion.
4162  *
4163  * To ensure finite running time, our algorithm uses two timeout mechanisms:
4164  * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
4165  * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
4166  * Together these ensure that syncing completes if our i/o paths are stuck.
4167  * The counters are declared above so they can be found easily in the debugger.
4168  *
4169  * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
4170  * vfs_syncprogress() subroutine whenever we make progress through the lists of
4171  * pages and buffers.  It is decremented and expired by the deadman() cyclic.
4172  * When vfs_syncall() decides it is done, we disable the deadman() counter by
4173  * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
4174  * deadlocking or hanging inside of a broken filesystem or driver routine.
4175  *
4176  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4177  * sync_retries consecutive calls to bio_busy() and page_busy() without
4178  * decreasing either the number of dirty buffers or dirty pages below the
4179  * lowest count we have seen so far, we give up and return from vfs_syncall().
4180  *
4181  * Each loop iteration ends with a call to delay() one second to allow time for
4182  * i/o completion and to permit the user time to read our progress messages.
4183  */
4184 void
4185 vfs_syncall(void)
4186 {
4187         if (rootdir == NULL && !modrootloaded)
4188                 return; /* panic during boot - no filesystems yet */
4189 
4190         printf("syncing file systems...");
4191         vfs_syncprogress();
4192         sync();
4193 
4194         vfs_syncprogress();
4195         sync_triesleft = sync_retries;
4196 
4197         old_bufcnt = new_bufcnt = INT_MAX;
4198         old_pgcnt = new_pgcnt = ULONG_MAX;
4199 
4200         while (sync_triesleft > 0) {
4201                 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4202                 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4203 
4204                 new_bufcnt = bio_busy(B_TRUE);
4205                 new_pgcnt = page_busy(B_TRUE);
4206                 vfs_syncprogress();
4207 
4208                 if (new_bufcnt == 0 && new_pgcnt == 0)
4209                         break;
4210 
4211                 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4212                         sync_triesleft = sync_retries;
4213                 else
4214                         sync_triesleft--;
4215 
4216                 if (new_bufcnt)
4217                         printf(" [%d]", new_bufcnt);
4218                 if (new_pgcnt)
4219                         printf(" %lu", new_pgcnt);
4220 
4221                 delay(hz);
4222         }
4223 
4224         if (new_bufcnt != 0 || new_pgcnt != 0)
4225                 printf(" done (not all i/o completed)\n");
4226         else
4227                 printf(" done\n");
4228 
4229         sync_timeleft = 0;
4230         delay(hz);
4231 }
4232 
4233 /*
4234  * If we are in the middle of the sync phase of panic, reset sync_timeleft to
4235  * sync_timeout to indicate that we are making progress and the deadman()
4236  * omnipresent cyclic should not yet time us out.  Note that it is safe to
4237  * store to sync_timeleft here since the deadman() is firing at high-level
4238  * on top of us.  If we are racing with the deadman(), either the deadman()
4239  * will decrement the old value and then we will reset it, or we will
4240  * reset it and then the deadman() will immediately decrement it.  In either
4241  * case, correct behavior results.
4242  */
4243 void
4244 vfs_syncprogress(void)
4245 {
4246         if (panicstr)
4247                 sync_timeleft = sync_timeout;
4248 }
4249 
4250 /*
4251  * Map VFS flags to statvfs flags.  These shouldn't really be separate
4252  * flags at all.
4253  */
4254 uint_t
4255 vf_to_stf(uint_t vf)
4256 {
4257         uint_t stf = 0;
4258 
4259         if (vf & VFS_RDONLY)
4260                 stf |= ST_RDONLY;
4261         if (vf & VFS_NOSETUID)
4262                 stf |= ST_NOSUID;
4263         if (vf & VFS_NOTRUNC)
4264                 stf |= ST_NOTRUNC;
4265 
4266         return (stf);
4267 }
4268 
4269 /*
4270  * Entries for (illegal) fstype 0.
4271  */
4272 /* ARGSUSED */
4273 int
4274 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4275 {
4276         cmn_err(CE_PANIC, "stray vfs operation");
4277         return (0);
4278 }
4279 
4280 /*
4281  * Entries for (illegal) fstype 0.
4282  */
4283 int
4284 vfsstray(void)
4285 {
4286         cmn_err(CE_PANIC, "stray vfs operation");
4287         return (0);
4288 }
4289 
/*
 * Operation that always fails with EIO.  It exists to support forced UFS
 * unmount and its interaction with LOFS, but could be used by any
 * filesystem.  See bug 1203132.
 */
int
vfs_EIO(void)
{
        int error = EIO;

        return (error);
}
4300 
/*
 * Sync flavour of vfs_EIO(): ignores its arguments and always fails with
 * EIO.
 *
 * The op for sync has to be defined separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when a
 * "short" argument is present and spits out a warning.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
        int error = EIO;

        return (error);
}
4312 
/*
 * The dummy EIO file system.  EIO_vfsops is filled in by vfsinit() from a
 * template in which every operation returns EIO.
 * NOTE(review): EIO_vfs is presumably the vfs that carries those ops;
 * its initialization is not visible here -- confirm.
 */
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
4315 
4316 /*
4317  * Called from startup() to initialize all loaded vfs's
4318  */
4319 void
4320 vfsinit(void)
4321 {
4322         struct vfssw *vswp;
4323         int error;
4324         extern int vopstats_enabled;
4325         extern void vopstats_startup();
4326 
4327         static const fs_operation_def_t EIO_vfsops_template[] = {
4328                 VFSNAME_MOUNT,          { .error = vfs_EIO },
4329                 VFSNAME_UNMOUNT,        { .error = vfs_EIO },
4330                 VFSNAME_ROOT,           { .error = vfs_EIO },
4331                 VFSNAME_STATVFS,        { .error = vfs_EIO },
4332                 VFSNAME_SYNC,           { .vfs_sync = vfs_EIO_sync },
4333                 VFSNAME_VGET,           { .error = vfs_EIO },
4334                 VFSNAME_MOUNTROOT,      { .error = vfs_EIO },
4335                 VFSNAME_FREEVFS,        { .error = vfs_EIO },
4336                 VFSNAME_VNSTATE,        { .error = vfs_EIO },
4337                 NULL, NULL
4338         };
4339 
4340         static const fs_operation_def_t stray_vfsops_template[] = {
4341                 VFSNAME_MOUNT,          { .error = vfsstray },
4342                 VFSNAME_UNMOUNT,        { .error = vfsstray },
4343                 VFSNAME_ROOT,           { .error = vfsstray },
4344                 VFSNAME_STATVFS,        { .error = vfsstray },
4345                 VFSNAME_SYNC,           { .vfs_sync = vfsstray_sync },
4346                 VFSNAME_VGET,           { .error = vfsstray },
4347                 VFSNAME_MOUNTROOT,      { .error = vfsstray },
4348                 VFSNAME_FREEVFS,        { .error = vfsstray },
4349                 VFSNAME_VNSTATE,        { .error = vfsstray },
4350                 NULL, NULL
4351         };
4352 
4353         /* Create vfs cache */
4354         vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4355             sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4356 
4357         /* Initialize the vnode cache (file systems may use it during init). */
4358         vn_create_cache();
4359 
4360         /* Setup event monitor framework */
4361         fem_init();
4362 
4363         /* Initialize the dummy stray file system type. */
4364         error = vfs_setfsops(0, stray_vfsops_template, NULL);
4365 
4366         /* Initialize the dummy EIO file system. */
4367         error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4368         if (error != 0) {
4369                 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4370                 /* Shouldn't happen, but not bad enough to panic */
4371         }
4372 
4373         VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4374 
4375         /*
4376          * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4377          * on this vfs can immediately notice it's invalid.
4378          */
4379         EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4380 
4381         /*
4382          * Call the init routines of non-loadable filesystems only.
4383          * Filesystems which are loaded as separate modules will be
4384          * initialized by the module loading code instead.
4385          */
4386 
4387         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4388                 RLOCK_VFSSW();
4389                 if (vswp->vsw_init != NULL)
4390                         (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4391                 RUNLOCK_VFSSW();
4392         }
4393 
4394         vopstats_startup();
4395 
4396         if (vopstats_enabled) {
4397                 /* EIO_vfs can collect stats, but we don't retrieve them */
4398                 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4399                 EIO_vfs.vfs_fstypevsp = NULL;
4400                 EIO_vfs.vfs_vskap = NULL;
4401                 EIO_vfs.vfs_flag |= VFS_STATS;
4402         }
4403 
4404         xattr_init();
4405 
4406         reparse_point_init();
4407 }
4408 
4409 vfs_t *
4410 vfs_alloc(int kmflag)
4411 {
4412         vfs_t *vfsp;
4413 
4414         vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4415 
4416         /*
4417          * Do the simplest initialization here.
4418          * Everything else gets done in vfs_init()
4419          */
4420         bzero(vfsp, sizeof (vfs_t));
4421         return (vfsp);
4422 }
4423 
4424 void
4425 vfs_free(vfs_t *vfsp)
4426 {
4427         /*
4428          * One would be tempted to assert that "vfsp->vfs_count == 0".
4429          * The problem is that this gets called out of domount() with
4430          * a partially initialized vfs and a vfs_count of 1.  This is
4431          * also called from vfs_rele() with a vfs_count of 0.  We can't
4432          * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4433          * returned.  This is because VFS_MOUNT() fully initializes the
4434          * vfs structure and its associated data.  VFS_RELE() will call
4435          * VFS_FREEVFS() which may panic the system if the data structures
4436          * aren't fully initialized from a successful VFS_MOUNT()).
4437          */
4438 
4439         /* If FEM was in use, make sure everything gets cleaned up */
4440         if (vfsp->vfs_femhead) {
4441                 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4442                 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4443                 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4444                 vfsp->vfs_femhead = NULL;
4445         }
4446 
4447         if (vfsp->vfs_implp)
4448                 vfsimpl_teardown(vfsp);
4449         sema_destroy(&vfsp->vfs_reflock);
4450         kmem_cache_free(vfs_cache, vfsp);
4451 }
4452 
4453 /*
4454  * Increments the vfs reference count by one atomically.
4455  */
4456 void
4457 vfs_hold(vfs_t *vfsp)
4458 {
4459         atomic_inc_32(&vfsp->vfs_count);
4460         ASSERT(vfsp->vfs_count != 0);
4461 }
4462 
4463 /*
4464  * Decrements the vfs reference count by one atomically. When
4465  * vfs reference count becomes zero, it calls the file system
4466  * specific vfs_freevfs() to free up the resources.
4467  */
4468 void
4469 vfs_rele(vfs_t *vfsp)
4470 {
4471         ASSERT(vfsp->vfs_count != 0);
4472         if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4473                 VFS_FREEVFS(vfsp);
4474                 lofi_remove(vfsp);
4475                 if (vfsp->vfs_zone)
4476                         zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4477                             ZONE_REF_VFS);
4478                 vfs_freemnttab(vfsp);
4479                 vfs_free(vfsp);
4480         }
4481 }
4482 
4483 /*
4484  * Generic operations vector support.
4485  *
4486  * This is used to build operations vectors for both the vfs and vnode.
4487  * It's normally called only when a file system is loaded.
4488  *
4489  * There are many possible algorithms for this, including the following:
4490  *
4491  *   (1) scan the list of known operations; for each, see if the file system
4492  *       includes an entry for it, and fill it in as appropriate.
4493  *
4494  *   (2) set up defaults for all known operations.  scan the list of ops
4495  *       supplied by the file system; for each which is both supplied and
4496  *       known, fill it in.
4497  *
4498  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4499  *       in entries as we go.
4500  *
4501  * we choose (1) for simplicity, and because performance isn't critical here.
4502  * note that (2) could be sped up using a precomputed hash table on known ops.
4503  * (3) could be faster than either, but only if the lists were very large or
4504  * supplied in sorted order.
4505  *
4506  */
4507 
4508 int
4509 fs_build_vector(void *vector, int *unused_ops,
4510     const fs_operation_trans_def_t *translation,
4511     const fs_operation_def_t *operations)
4512 {
4513         int i, num_trans, num_ops, used;
4514 
4515         /*
4516          * Count the number of translations and the number of supplied
4517          * operations.
4518          */
4519 
4520         {
4521                 const fs_operation_trans_def_t *p;
4522 
4523                 for (num_trans = 0, p = translation;
4524                     p->name != NULL;
4525                     num_trans++, p++)
4526                         ;
4527         }
4528 
4529         {
4530                 const fs_operation_def_t *p;
4531 
4532                 for (num_ops = 0, p = operations;
4533                     p->name != NULL;
4534                     num_ops++, p++)
4535                         ;
4536         }
4537 
4538         /* Walk through each operation known to our caller.  There will be */
4539         /* one entry in the supplied "translation table" for each. */
4540 
4541         used = 0;
4542 
4543         for (i = 0; i < num_trans; i++) {
4544                 int j, found;
4545                 char *curname;
4546                 fs_generic_func_p result;
4547                 fs_generic_func_p *location;
4548 
4549                 curname = translation[i].name;
4550 
4551                 /* Look for a matching operation in the list supplied by the */
4552                 /* file system. */
4553 
4554                 found = 0;
4555 
4556                 for (j = 0; j < num_ops; j++) {
4557                         if (strcmp(operations[j].name, curname) == 0) {
4558                                 used++;
4559                                 found = 1;
4560                                 break;
4561                         }
4562                 }
4563 
4564                 /*
4565                  * If the file system is using a "placeholder" for default
4566                  * or error functions, grab the appropriate function out of
4567                  * the translation table.  If the file system didn't supply
4568                  * this operation at all, use the default function.
4569                  */
4570 
4571                 if (found) {
4572                         result = operations[j].func.fs_generic;
4573                         if (result == fs_default) {
4574                                 result = translation[i].defaultFunc;
4575                         } else if (result == fs_error) {
4576                                 result = translation[i].errorFunc;
4577                         } else if (result == NULL) {
4578                                 /* Null values are PROHIBITED */
4579                                 return (EINVAL);
4580                         }
4581                 } else {
4582                         result = translation[i].defaultFunc;
4583                 }
4584 
4585                 /* Now store the function into the operations vector. */
4586 
4587                 location = (fs_generic_func_p *)
4588                     (((char *)vector) + translation[i].offset);
4589 
4590                 *location = result;
4591         }
4592 
4593         *unused_ops = num_ops - used;
4594 
4595         return (0);
4596 }
4597 
4598 /* Placeholder functions, should never be called. */
4599 
4600 int
4601 fs_error(void)
4602 {
4603         cmn_err(CE_PANIC, "fs_error called");
4604         return (0);
4605 }
4606 
4607 int
4608 fs_default(void)
4609 {
4610         cmn_err(CE_PANIC, "fs_default called");
4611         return (0);
4612 }
4613 
4614 #ifdef __sparc
4615 
4616 /*
4617  * Part of the implementation of booting off a mirrored root
4618  * involves a change of dev_t for the root device.  To
4619  * accomplish this, first remove the existing hash table
4620  * entry for the root device, convert to the new dev_t,
4621  * then re-insert in the hash table at the head of the list.
4622  */
4623 void
4624 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4625 {
4626         vfs_list_lock();
4627 
4628         vfs_hash_remove(vfsp);
4629 
4630         vfsp->vfs_dev = ndev;
4631         vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4632 
4633         vfs_hash_add(vfsp, 1);
4634 
4635         vfs_list_unlock();
4636 }
4637 
4638 #else /* x86 NEWBOOT */
4639 
4640 #if defined(__x86)
4641 extern int hvmboot_rootconf();
4642 #endif /* __x86 */
4643 
4644 extern ib_boot_prop_t *iscsiboot_prop;
4645 
4646 int
4647 rootconf()
4648 {
4649         int error;
4650         struct vfssw *vsw;
4651         extern void pm_init();
4652         char *fstyp, *fsmod;
4653         int ret = -1;
4654 
4655         getrootfs(&fstyp, &fsmod);
4656 
4657 #if defined(__x86)
4658         /*
4659          * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4660          * which lives in /platform/i86hvm, and hence is only available when
4661          * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4662          * is not available then the modstub for this function will return 0.
4663          * If the hvm_bootstrap misc module is available it will be loaded
4664          * and hvmboot_rootconf() will be invoked.
4665          */
4666         if (error = hvmboot_rootconf())
4667                 return (error);
4668 #endif /* __x86 */
4669 
4670         if (error = clboot_rootconf())
4671                 return (error);
4672 
4673         if (modload("fs", fsmod) == -1)
4674                 panic("Cannot _init %s module", fsmod);
4675 
4676         RLOCK_VFSSW();
4677         vsw = vfs_getvfsswbyname(fstyp);
4678         RUNLOCK_VFSSW();
4679         if (vsw == NULL) {
4680                 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4681                 return (ENXIO);
4682         }
4683         VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4684         VFS_HOLD(rootvfs);
4685 
4686         /* always mount readonly first */
4687         rootvfs->vfs_flag |= VFS_RDONLY;
4688 
4689         pm_init();
4690 
4691         if (netboot && iscsiboot_prop) {
4692                 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4693                     " shouldn't happen in the same time");
4694                 return (EINVAL);
4695         }
4696 
4697         if (netboot || iscsiboot_prop) {
4698                 ret = strplumb();
4699                 if (ret != 0) {
4700                         cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4701                         return (EFAULT);
4702                 }
4703         }
4704 
4705         if ((ret == 0) && iscsiboot_prop) {
4706                 ret = modload("drv", "iscsi");
4707                 /* -1 indicates fail */
4708                 if (ret == -1) {
4709                         cmn_err(CE_WARN, "Failed to load iscsi module");
4710                         iscsi_boot_prop_free();
4711                         return (EINVAL);
4712                 } else {
4713                         if (!i_ddi_attach_pseudo_node("iscsi")) {
4714                                 cmn_err(CE_WARN,
4715                                     "Failed to attach iscsi driver");
4716                                 iscsi_boot_prop_free();
4717                                 return (ENODEV);
4718                         }
4719                 }
4720         }
4721 
4722         error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4723         vfs_unrefvfssw(vsw);
4724         rootdev = rootvfs->vfs_dev;
4725 
4726         if (error)
4727                 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4728                     rootfs.bo_name, fstyp);
4729         else
4730                 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4731                     rootfs.bo_name, fstyp);
4732         return (error);
4733 }
4734 
4735 /*
4736  * XXX this is called by nfs only and should probably be removed
4737  * If booted with ASKNAME, prompt on the console for a filesystem
4738  * name and return it.
4739  */
4740 void
4741 getfsname(char *askfor, char *name, size_t namelen)
4742 {
4743         if (boothowto & RB_ASKNAME) {
4744                 printf("%s name: ", askfor);
4745                 console_gets(name, namelen);
4746         }
4747 }
4748 
4749 /*
4750  * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4751  * property.
4752  *
4753  * Filesystem types starting with the prefix "nfs" are diskless clients;
4754  * init the root filename name (rootfs.bo_name), too.
4755  *
4756  * If we are booting via NFS we currently have these options:
4757  *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4758  *      nfs2 -  force NFS V2
4759  *      nfs3 -  force NFS V3
4760  *      nfs4 -  force NFS V4
4761  * Because we need to maintain backward compatibility with the naming
4762  * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4763  * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4764  * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4765  * This is only for root filesystems, all other uses such as cachefs
4766  * will expect that "nfs" == NFS V2.
4767  */
4768 static void
4769 getrootfs(char **fstypp, char **fsmodp)
4770 {
4771         extern char *strplumb_get_netdev_path(void);
4772         char *propstr = NULL;
4773 
4774         /*
4775          * Check fstype property; for diskless it should be one of "nfs",
4776          * "nfs2", "nfs3" or "nfs4".
4777          */
4778         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4779             DDI_PROP_DONTPASS, "fstype", &propstr)
4780             == DDI_SUCCESS) {
4781                 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4782                 ddi_prop_free(propstr);
4783 
4784         /*
4785          * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4786          * assume the type of this root filesystem is 'zfs'.
4787          */
4788         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4789             DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4790             == DDI_SUCCESS) {
4791                 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4792                 ddi_prop_free(propstr);
4793         }
4794 
4795         if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4796                 *fstypp = *fsmodp = rootfs.bo_fstype;
4797                 return;
4798         }
4799 
4800         ++netboot;
4801 
4802         if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4803                 (void) strcpy(rootfs.bo_fstype, "nfs");
4804         else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4805                 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4806 
4807         /*
4808          * check if path to network interface is specified in bootpath
4809          * or by a hypervisor domain configuration file.
4810          * XXPV - enable strlumb_get_netdev_path()
4811          */
4812         if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4813             "xpv-nfsroot")) {
4814                 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4815         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4816             DDI_PROP_DONTPASS, "bootpath", &propstr)
4817             == DDI_SUCCESS) {
4818                 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4819                 ddi_prop_free(propstr);
4820         } else {
4821                 /* attempt to determine netdev_path via boot_mac address */
4822                 netdev_path = strplumb_get_netdev_path();
4823                 if (netdev_path == NULL)
4824                         panic("cannot find boot network interface");
4825                 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4826         }
4827         *fstypp = rootfs.bo_fstype;
4828         *fsmodp = "nfs";
4829 }
4830 #endif
4831 
4832 /*
4833  * VFS feature routines
4834  */
4835 
/*
 * A feature value is split in two: bits 32 and up select a word of
 * vfs_featureset[] (VFTINDEX) and the low 32 bits are the mask applied
 * to that word (VFTBITS).
 */
#define VFTINDEX(feature)       (((feature) >> 32) & 0xFFFFFFFF)
#define VFTBITS(feature)        ((feature) & 0xFFFFFFFFLL)
4838 
4839 /* Register a feature in the vfs */
4840 void
4841 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4842 {
4843         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4844         if (vfsp->vfs_implp == NULL)
4845                 return;
4846 
4847         vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4848 }
4849 
4850 void
4851 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4852 {
4853         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4854         if (vfsp->vfs_implp == NULL)
4855                 return;
4856         vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4857 }
4858 
4859 /*
4860  * Query a vfs for a feature.
4861  * Returns 1 if feature is present, 0 if not
4862  */
4863 int
4864 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4865 {
4866         int     ret = 0;
4867 
4868         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4869         if (vfsp->vfs_implp == NULL)
4870                 return (ret);
4871 
4872         if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4873                 ret = 1;
4874 
4875         return (ret);
4876 }
4877 
4878 /*
4879  * Propagate feature set from one vfs to another
4880  */
4881 void
4882 vfs_propagate_features(vfs_t *from, vfs_t *to)
4883 {
4884         int i;
4885 
4886         if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4887                 return;
4888 
4889         for (i = 1; i <= to->vfs_featureset[0]; i++) {
4890                 to->vfs_featureset[i] = from->vfs_featureset[i];
4891         }
4892 }
4893 
4894 #define LOFINODE_PATH "/dev/lofi/%d"
4895 
4896 /*
4897  * Return the vnode for the lofi node if there's a lofi mount in place.
4898  * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4899  * failure.
4900  */
4901 int
4902 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4903 {
4904         char *path = NULL;
4905         int strsize;
4906         int err;
4907 
4908         if (vfsp->vfs_lofi_minor == 0) {
4909                 *vpp = NULL;
4910                 return (-1);
4911         }
4912 
4913         strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_minor);
4914         path = kmem_alloc(strsize + 1, KM_SLEEP);
4915         (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_minor);
4916 
4917         /*
4918          * We may be inside a zone, so we need to use the /dev path, but
4919          * it's created asynchronously, so we wait here.
4920          */
4921         for (;;) {
4922                 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4923 
4924                 if (err != ENOENT)
4925                         break;
4926 
4927                 if ((err = delay_sig(hz / 8)) == EINTR)
4928                         break;
4929         }
4930 
4931         if (err)
4932                 *vpp = NULL;
4933 
4934         kmem_free(path, strsize + 1);
4935         return (err);
4936 }