illumos-gate Old usr/src/uts/common/fs/vfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38 
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/errno.h>
  43 #include <sys/user.h>
  44 #include <sys/fstyp.h>
  45 #include <sys/kmem.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/mount.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vfs_opreg.h>
  51 #include <sys/fem.h>
  52 #include <sys/mntent.h>
  53 #include <sys/stat.h>
  54 #include <sys/statvfs.h>
  55 #include <sys/statfs.h>
  56 #include <sys/cred.h>
  57 #include <sys/vnode.h>
  58 #include <sys/rwstlock.h>
  59 #include <sys/dnlc.h>
  60 #include <sys/file.h>
  61 #include <sys/time.h>
  62 #include <sys/atomic.h>
  63 #include <sys/cmn_err.h>
  64 #include <sys/buf.h>
  65 #include <sys/swap.h>
  66 #include <sys/debug.h>
  67 #include <sys/vnode.h>
  68 #include <sys/modctl.h>
  69 #include <sys/ddi.h>
  70 #include <sys/pathname.h>
  71 #include <sys/bootconf.h>
  72 #include <sys/dumphdr.h>
  73 #include <sys/dc_ki.h>
  74 #include <sys/poll.h>
  75 #include <sys/sunddi.h>
  76 #include <sys/sysmacros.h>
  77 #include <sys/zone.h>
  78 #include <sys/policy.h>
  79 #include <sys/ctfs.h>
  80 #include <sys/objfs.h>
  81 #include <sys/console.h>
  82 #include <sys/reboot.h>
  83 #include <sys/attr.h>
  84 #include <sys/zio.h>
  85 #include <sys/spa.h>
  86 #include <sys/lofi.h>
  87 #include <sys/bootprops.h>
  88 
  89 #include <vm/page.h>
  90 
  91 #include <fs/fs_subr.h>
  92 /* Private interfaces to create vopstats-related data structures */
  93 extern void             initialize_vopstats(vopstats_t *);
  94 extern vopstats_t       *get_fstype_vopstats(struct vfs *, struct vfssw *);
  95 extern vsk_anchor_t     *get_vskstat_anchor(struct vfs *);
  96 
  97 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
  98 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
  99     const char *, int, int);
 100 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
 101 static void vfs_freemnttab(struct vfs *);
 102 static void vfs_freeopt(mntopt_t *);
 103 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
 104 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
 105 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
 106 static void vfs_createopttbl_extend(mntopts_t *, const char *,
 107     const mntopts_t *);
 108 static char **vfs_copycancelopt_extend(char **const, int);
 109 static void vfs_freecancelopt(char **);
 110 static void getrootfs(char **, char **);
 111 static int getmacpath(dev_info_t *, void *);
 112 static void vfs_mnttabvp_setup(void);
 113 
/*
 * Node of a singly linked list that pairs a device number with a vfs.
 * NOTE(review): the code that manipulates this list is not visible in this
 * part of the file; the "mip" prefix presumably stands for "mount in
 * progress" -- confirm against the list's users.
 */
struct ipmnt {
        struct ipmnt    *mip_next;      /* next node in the list */
        dev_t           mip_dev;        /* device number for this entry */
        struct vfs      *mip_vfsp;      /* vfs recorded with mip_dev */
};

/* NOTE(review): presumably serializes access to vfs_miplist -- confirm. */
static kmutex_t         vfs_miplist_mutex;
/* Judging by the names, the first and last nodes of the ipmnt list. */
static struct ipmnt     *vfs_miplist = NULL;
static struct ipmnt     *vfs_miplist_end = NULL;

static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
 125 
 126 /*
 127  * VFS global data.
 128  */
vnode_t *rootdir;               /* pointer to root inode vnode. */
vnode_t *devicesdir;            /* pointer to inode of devices root */
vnode_t *devdir;                /* pointer to inode of dev root */

char *server_rootpath;          /* root path for diskless clients */
char *server_hostname;          /* hostname of diskless server */

/*
 * Statically allocated vfs structures: "root" backs rootvfs below, while
 * "devices" and "dev" are the vfs structures mounted on /devices and /dev
 * by vfs_mountdevices() and vfs_mountdev1().
 */
static struct vfs root;
static struct vfs devices;
static struct vfs dev;
struct vfs *rootvfs = &root;        /* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;              /* array of vfs ptrs for vfs hash list */
int vfshsz = 512;               /* # of heads/locks in vfs hash arrays */
                                /* must be power of 2!  */
timespec_t vfs_mnttab_ctime;    /* mnttab created time */
timespec_t vfs_mnttab_mtime;    /* mnttab last modified time */
/* Literal holding an explicit NUL, i.e. it reads as an empty string. */
char *vfs_dummyfstype = "\0";
struct pollhead vfs_pollhd;     /* for mnttab pollers */
struct vnode *vfs_mntdummyvp;   /* to fake mnttab read/write for file events */
int     mntfstype;              /* will be set once mnt fs is mounted */
 149 
 150 /*
 151  * Table for generic options recognized in the VFS layer and acted
 152  * on at this level before parsing file system specific options.
 153  * The nosuid option is stronger than any of the devices and setuid
 154  * options, so those are canceled when nosuid is seen.
 155  *
 156  * All options which are added here need to be added to the
 157  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 158  */
 159 /*
 160  * VFS Mount options table
 161  */
/*
 * Cancel lists.  Each array is a NULL-terminated list of option names and is
 * referenced from the "cancel options" column of the mntopts[] row for the
 * option it is named after (e.g. ro_cancel belongs to MNTOPT_RO).
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };

/*
 * The generic option table itself.  Only MNTOPT_REMOUNT carries a flag
 * (MO_NODISPLAY) and has no cancel list; every other row has a cancel list,
 * no default argument and no flags.  The trailing (void *)0 fills the last
 * member of each mntopt_t.
 */
static const mntopt_t mntopts[] = {
/*
 *      option name             cancel options          default arg     flags
 */
        { MNTOPT_REMOUNT,       NULL,                   NULL,
                MO_NODISPLAY, (void *)0 },
        { MNTOPT_RO,            ro_cancel,              NULL,           0,
                (void *)0 },
        { MNTOPT_RW,            rw_cancel,              NULL,           0,
                (void *)0 },
        { MNTOPT_SUID,          suid_cancel,            NULL,           0,
                (void *)0 },
        { MNTOPT_NOSUID,        nosuid_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_DEVICES,       devices_cancel,         NULL,           0,
                (void *)0 },
        { MNTOPT_NODEVICES,     nodevices_cancel,       NULL,           0,
                (void *)0 },
        { MNTOPT_SETUID,        setuid_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_NOSETUID,      nosetuid_cancel,        NULL,           0,
                (void *)0 },
        { MNTOPT_NBMAND,        nbmand_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_NONBMAND,      nonbmand_cancel,        NULL,           0,
                (void *)0 },
        { MNTOPT_EXEC,          exec_cancel,            NULL,           0,
                (void *)0 },
        { MNTOPT_NOEXEC,        noexec_cancel,          NULL,           0,
                (void *)0 },
};

/*
 * Exported descriptor for the table above: the number of rows followed by a
 * pointer to the first row.  The cast drops the const qualifier of mntopts[].
 */
const mntopts_t vfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        (mntopt_t *)&mntopts[0]
};
 212 
 213 /*
 214  * File system operation dispatch functions.
 215  */
 216 
 217 int
 218 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 219 {
 220         return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 221 }
 222 
 223 int
 224 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 225 {
 226         return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
 227 }
 228 
 229 int
 230 fsop_root(vfs_t *vfsp, vnode_t **vpp)
 231 {
 232         refstr_t *mntpt;
 233         int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
 234         /*
 235          * Make sure this root has a path.  With lofs, it is possible to have
 236          * a NULL mountpoint.
 237          */
 238         if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
 239                 mntpt = vfs_getmntpoint(vfsp);
 240                 vn_setpath_str(*vpp, refstr_value(mntpt),
 241                     strlen(refstr_value(mntpt)));
 242                 refstr_rele(mntpt);
 243         }
 244 
 245         return (ret);
 246 }
 247 
 248 int
 249 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 250 {
 251         return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
 252 }
 253 
 254 int
 255 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 256 {
 257         return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
 258 }
 259 
 260 int
 261 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 262 {
 263         /*
 264          * In order to handle system attribute fids in a manner
 265          * transparent to the underlying fs, we embed the fid for
 266          * the sysattr parent object in the sysattr fid and tack on
 267          * some extra bytes that only the sysattr layer knows about.
 268          *
 269          * This guarantees that sysattr fids are larger than other fids
 270          * for this vfs. If the vfs supports the sysattr view interface
 271          * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 272          * collision with XATTR_FIDSZ.
 273          */
 274         if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 275             fidp->fid_len == XATTR_FIDSZ)
 276                 return (xattr_dir_vget(vfsp, vpp, fidp));
 277 
 278         return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
 279 }
 280 
 281 int
 282 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 283 {
 284         return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
 285 }
 286 
 287 void
 288 fsop_freefs(vfs_t *vfsp)
 289 {
 290         (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
 291 }
 292 
 293 int
 294 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 295 {
 296         return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
 297 }
 298 
 299 int
 300 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 301 {
 302         ASSERT((fstype >= 0) && (fstype < nfstype));
 303 
 304         if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
 305                 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
 306         else
 307                 return (ENOTSUP);
 308 }
 309 
 310 /*
 311  * File system initialization.  vfs_setfsops() must be called from a file
 312  * system's init routine.
 313  */
 314 
 315 static int
 316 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
 317     int *unused_ops)
 318 {
 319         static const fs_operation_trans_def_t vfs_ops_table[] = {
 320                 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
 321                         fs_nosys, fs_nosys,
 322 
 323                 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
 324                         fs_nosys, fs_nosys,
 325 
 326                 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
 327                         fs_nosys, fs_nosys,
 328 
 329                 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
 330                         fs_nosys, fs_nosys,
 331 
 332                 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
 333                         (fs_generic_func_p) fs_sync,
 334                         (fs_generic_func_p) fs_sync,    /* No errors allowed */
 335 
 336                 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
 337                         fs_nosys, fs_nosys,
 338 
 339                 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
 340                         fs_nosys, fs_nosys,
 341 
 342                 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
 343                         (fs_generic_func_p)fs_freevfs,
 344                         (fs_generic_func_p)fs_freevfs,  /* Shouldn't fail */
 345 
 346                 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
 347                         (fs_generic_func_p)fs_nosys,
 348                         (fs_generic_func_p)fs_nosys,
 349 
 350                 NULL, 0, NULL, NULL
 351         };
 352 
 353         return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
 354 }
 355 
 356 void
 357 zfs_boot_init() {
 358 
 359         if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 360                 spa_boot_init();
 361 }
 362 
 363 int
 364 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
 365 {
 366         int error;
 367         int unused_ops;
 368 
 369         /*
 370          * Verify that fstype refers to a valid fs.  Note that
 371          * 0 is valid since it's used to set "stray" ops.
 372          */
 373         if ((fstype < 0) || (fstype >= nfstype))
 374                 return (EINVAL);
 375 
 376         if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 377                 return (EINVAL);
 378 
 379         /* Set up the operations vector. */
 380 
 381         error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
 382 
 383         if (error != 0)
 384                 return (error);
 385 
 386         vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 387 
 388         if (actual != NULL)
 389                 *actual = &vfssw[fstype].vsw_vfsops;
 390 
 391 #if DEBUG
 392         if (unused_ops != 0)
 393                 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
 394                     "but not used", vfssw[fstype].vsw_name, unused_ops);
 395 #endif
 396 
 397         return (0);
 398 }
 399 
 400 int
 401 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
 402 {
 403         int error;
 404         int unused_ops;
 405 
 406         *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
 407 
 408         error = fs_copyfsops(template, *actual, &unused_ops);
 409         if (error != 0) {
 410                 kmem_free(*actual, sizeof (vfsops_t));
 411                 *actual = NULL;
 412                 return (error);
 413         }
 414 
 415         return (0);
 416 }
 417 
 418 /*
 419  * Free a vfsops structure created as a result of vfs_makefsops().
 420  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
 421  * vfs_freevfsops_by_type().
 422  */
 423 void
 424 vfs_freevfsops(vfsops_t *vfsops)
 425 {
 426         kmem_free(vfsops, sizeof (vfsops_t));
 427 }
 428 
 429 /*
 430  * Since the vfsops structure is part of the vfssw table and wasn't
 431  * really allocated, we're not really freeing anything.  We keep
 432  * the name for consistency with vfs_freevfsops().  We do, however,
 433  * need to take care of a little bookkeeping.
 * NOTE: For a vfsops structure created by vfs_makefsops(), use
 * vfs_freevfsops().
 436  */
 437 int
 438 vfs_freevfsops_by_type(int fstype)
 439 {
 440 
 441         /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 442         if ((fstype <= 0) || (fstype >= nfstype))
 443                 return (EINVAL);
 444 
 445         WLOCK_VFSSW();
 446         if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 447                 WUNLOCK_VFSSW();
 448                 return (EINVAL);
 449         }
 450 
 451         vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 452         WUNLOCK_VFSSW();
 453 
 454         return (0);
 455 }
 456 
 457 /* Support routines used to reference vfs_op */
 458 
/*
 * Set the operations vector for a vfs.
 *
 * The current vfs_op value is sampled first.  If vfs_femhead is NULL, the
 * new vector is installed with a compare-and-swap against the sampled value
 * and the function returns as soon as that swap succeeds.  When vfs_femhead
 * is non-NULL, or the swap fails because vfs_op no longer holds the sampled
 * value, the update is handed to fsem_setvfsops() instead.
 */
void
vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
{
        vfsops_t        *op;

        ASSERT(vfsp != NULL);
        ASSERT(vfsops != NULL);

        op = vfsp->vfs_op;
        /* Barrier between sampling vfs_op and examining vfs_femhead. */
        membar_consumer();
        if (vfsp->vfs_femhead == NULL &&
            atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
                /* Fast path: the vector was swapped in directly. */
                return;
        }
        /* Slow path: let the fsem layer install the vector. */
        fsem_setvfsops(vfsp, vfsops);
}
 476 
/*
 * Retrieve the operations vector for a vfs.
 *
 * vfs_op is sampled, and the sample is returned directly when vfs_femhead
 * is NULL and vfs_op still holds the sampled value on the second read.
 * In every other case the answer comes from fsem_getvfsops().
 */
vfsops_t *
vfs_getops(vfs_t *vfsp)
{
        vfsops_t        *op;

        ASSERT(vfsp != NULL);

        op = vfsp->vfs_op;
        /* Barrier between sampling vfs_op and examining vfs_femhead. */
        membar_consumer();
        if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
                return (op);
        } else {
                return (fsem_getvfsops(vfsp));
        }
}
 493 
 494 /*
 495  * Returns non-zero (1) if the vfsops matches that of the vfs.
 496  * Returns zero (0) if not.
 497  */
 498 int
 499 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
 500 {
 501         return (vfs_getops(vfsp) == vfsops);
 502 }
 503 
 504 /*
 505  * Returns non-zero (1) if the file system has installed a non-default,
 506  * non-error vfs_sync routine.  Returns zero (0) otherwise.
 507  */
 508 int
 509 vfs_can_sync(vfs_t *vfsp)
 510 {
 511         /* vfs_sync() routine is not the default/error function */
 512         return (vfs_getops(vfsp)->vfs_sync != fs_sync);
 513 }
 514 
 515 /*
 516  * Initialize a vfs structure.
 517  */
 518 void
 519 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
 520 {
 521         /* Other initialization has been moved to vfs_alloc() */
 522         vfsp->vfs_count = 0;
 523         vfsp->vfs_next = vfsp;
 524         vfsp->vfs_prev = vfsp;
 525         vfsp->vfs_zone_next = vfsp;
 526         vfsp->vfs_zone_prev = vfsp;
 527         vfsp->vfs_lofi_minor = 0;
 528         sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 529         vfsimpl_setup(vfsp);
 530         vfsp->vfs_data = (data);
 531         vfs_setops((vfsp), (op));
 532 }
 533 
 534 /*
 535  * Allocate and initialize the vfs implementation private data
 536  * structure, vfs_impl_t.
 537  */
 538 void
 539 vfsimpl_setup(vfs_t *vfsp)
 540 {
 541         int i;
 542 
 543         if (vfsp->vfs_implp != NULL) {
 544                 return;
 545         }
 546 
 547         vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 548         /* Note that these are #define'd in vfs.h */
 549         vfsp->vfs_vskap = NULL;
 550         vfsp->vfs_fstypevsp = NULL;
 551 
 552         /* Set size of counted array, then zero the array */
 553         vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 554         for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 555                 vfsp->vfs_featureset[i] = 0;
 556         }
 557 }
 558 
 559 /*
 560  * Release the vfs_impl_t structure, if it exists. Some unbundled
 561  * filesystems may not use the newer version of vfs and thus
 562  * would not contain this implementation private data structure.
 563  */
 564 void
 565 vfsimpl_teardown(vfs_t *vfsp)
 566 {
 567         vfs_impl_t      *vip = vfsp->vfs_implp;
 568 
 569         if (vip == NULL)
 570                 return;
 571 
 572         kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 573         vfsp->vfs_implp = NULL;
 574 }
 575 
 576 /*
 577  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 578  * fstatvfs, and sysfs moved to common/syscall.
 579  */
 580 
 581 /*
 582  * Update every mounted file system.  We call the vfs_sync operation of
 583  * each file system type, passing it a NULL vfsp to indicate that all
 584  * mounted file systems of that type should be updated.
 585  */
 586 void
 587 vfs_sync(int flag)
 588 {
 589         struct vfssw *vswp;
 590         RLOCK_VFSSW();
 591         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 592                 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 593                         vfs_refvfssw(vswp);
 594                         RUNLOCK_VFSSW();
 595                         (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
 596                             CRED());
 597                         vfs_unrefvfssw(vswp);
 598                         RLOCK_VFSSW();
 599                 }
 600         }
 601         RUNLOCK_VFSSW();
 602 }
 603 
/*
 * Update all mounted file systems by calling vfs_sync() with a flag
 * value of 0.
 */
void
sync(void)
{
        vfs_sync(0);
}
 609 
 610 /*
 611  * External routines.
 612  */
 613 
 614 krwlock_t vfssw_lock;   /* lock accesses to vfssw */
 615 
 616 /*
 617  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 618  * but otherwise should be accessed only via vfs_list_lock() and
 619  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 620  */
 621 static krwlock_t vfslist;
 622 
/*
 * Mount devfs on /devices. This is done right after root is mounted
 * to provide device access support for the system
 *
 * Failure to load the module, find the mount point, perform the mount or
 * obtain the root vnode panics the system.  Failure to take the vfs lock or
 * the covered vnode's vfs write lock only logs a notice and returns without
 * adding the vfs to the vfs list.
 */
static void
vfs_mountdevices(void)
{
        struct vfssw *vsw;
        struct vnode *mvp;
        struct mounta mounta = {        /* fake mounta for devfs_mount() */
                NULL,
                NULL,
                MS_SYSSPACE,
                NULL,
                NULL,
                0,
                NULL,
                0
        };

        /*
         * _init devfs module to fill in the vfssw
         */
        if (modload("fs", "devfs") == -1)
                panic("Cannot _init devfs module");

        /*
         * Hold vfs
         *
         * The vfssw read lock taken here stays held across the lookup and
         * the mount and is dropped only after VFS_MOUNT() has succeeded.
         * NOTE(review): vsw is dereferenced without a NULL check;
         * presumably the successful modload() above guarantees that the
         * "devfs" entry exists -- confirm.
         */
        RLOCK_VFSSW();
        vsw = vfs_getvfsswbyname("devfs");
        VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
        VFS_HOLD(&devices);

        /*
         * Locate mount point
         */
        if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
                panic("Cannot find /devices");

        /*
         * Perform the mount of /devices
         */
        if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
                panic("Cannot mount /devices");

        RUNLOCK_VFSSW();

        /*
         * Set appropriate members and add to vfs list for mnttab display
         */
        vfs_setresource(&devices, "/devices", 0);
        vfs_setmntpoint(&devices, "/devices", 0);

        /*
         * Hold the root of /devices so it won't go away
         */
        if (VFS_ROOT(&devices, &devicesdir))
                panic("vfs_mountdevices: not devices root");

        /*
         * Both early returns below release devicesdir but not mvp.
         * NOTE(review): the vnode returned by lookupname() stays held on
         * these paths -- confirm that this is intended.
         */
        if (vfs_lock(&devices) != 0) {
                VN_RELE(devicesdir);
                cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
                return;
        }

        if (vn_vfswlock(mvp) != 0) {
                vfs_unlock(&devices);
                VN_RELE(devicesdir);
                cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
                return;
        }

        /* Add the vfs to the vfs list, then undo the locks in reverse. */
        vfs_add(mvp, &devices, 0);
        vn_vfsunlock(mvp);
        vfs_unlock(&devices);
        VN_RELE(devicesdir);
}
 701 
/*
 * mount the first instance of /dev  to root and remain mounted
 *
 * The sequence mirrors vfs_mountdevices(), using the "dev" module, the
 * static "dev" vfs and the /dev mount point, and passing MS_OVERLAY in
 * addition to MS_SYSSPACE.  Fatal conditions are raised through
 * cmn_err(CE_PANIC, ...); failure to take either lock only logs a notice
 * and returns without adding the vfs to the vfs list.
 */
static void
vfs_mountdev1(void)
{
        struct vfssw *vsw;
        struct vnode *mvp;
        struct mounta mounta = {        /* fake mounta for sdev_mount() */
                NULL,
                NULL,
                MS_SYSSPACE | MS_OVERLAY,
                NULL,
                NULL,
                0,
                NULL,
                0
        };

        /*
         * _init dev module to fill in the vfssw
         */
        if (modload("fs", "dev") == -1)
                cmn_err(CE_PANIC, "Cannot _init dev module\n");

        /*
         * Hold vfs
         *
         * The vfssw read lock taken here stays held across the lookup and
         * the mount and is dropped only after VFS_MOUNT() has succeeded.
         * NOTE(review): vsw is dereferenced without a NULL check;
         * presumably the successful modload() above guarantees that the
         * "dev" entry exists -- confirm.
         */
        RLOCK_VFSSW();
        vsw = vfs_getvfsswbyname("dev");
        VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
        VFS_HOLD(&dev);

        /*
         * Locate mount point
         */
        if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
                cmn_err(CE_PANIC, "Cannot find /dev\n");

        /*
         * Perform the mount of /dev
         */
        if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
                cmn_err(CE_PANIC, "Cannot mount /dev 1\n");

        RUNLOCK_VFSSW();

        /*
         * Set appropriate members and add to vfs list for mnttab display
         */
        vfs_setresource(&dev, "/dev", 0);
        vfs_setmntpoint(&dev, "/dev", 0);

        /*
         * Hold the root of /dev so it won't go away
         */
        if (VFS_ROOT(&dev, &devdir))
                cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");

        /*
         * Both early returns below release devdir but not mvp.
         * NOTE(review): the vnode returned by lookupname() stays held on
         * these paths -- confirm that this is intended.
         */
        if (vfs_lock(&dev) != 0) {
                VN_RELE(devdir);
                cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
                return;
        }

        if (vn_vfswlock(mvp) != 0) {
                vfs_unlock(&dev);
                VN_RELE(devdir);
                cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
                return;
        }

        /* Add the vfs to the vfs list, then undo the locks in reverse. */
        vfs_add(mvp, &dev, 0);
        vn_vfsunlock(mvp);
        vfs_unlock(&dev);
        VN_RELE(devdir);
}
 779 
 780 /*
 781  * Mount required filesystem. This is done right after root is mounted.
 782  */
 783 static void
 784 vfs_mountfs(char *module, char *spec, char *path)
 785 {
 786         struct vnode *mvp;
 787         struct mounta mounta;
 788         vfs_t *vfsp;
 789 
 790         mounta.flags = MS_SYSSPACE | MS_DATA;
 791         mounta.fstype = module;
 792         mounta.spec = spec;
 793         mounta.dir = path;
 794         if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 795                 cmn_err(CE_WARN, "Cannot find %s", path);
 796                 return;
 797         }
 798         if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 799                 cmn_err(CE_WARN, "Cannot mount %s", path);
 800         else
 801                 VFS_RELE(vfsp);
 802         VN_RELE(mvp);
 803 }
 804 
 805 /*
 806  * vfs_mountroot is called by main() to mount the root filesystem.
 807  */
 808 void
 809 vfs_mountroot(void)
 810 {
 811         struct vnode    *rvp = NULL;
 812         char            *path;
 813         size_t          plen;
 814         struct vfssw    *vswp;
 815         proc_t          *p;
 816 
 817         rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 818         rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
 819 
 820         /*
 821          * Alloc the vfs hash bucket array and locks
 822          */
 823         rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
 824 
 825         /*
 826          * Call machine-dependent routine "rootconf" to choose a root
 827          * file system type.
 828          */
 829         if (rootconf())
 830                 panic("vfs_mountroot: cannot mount root");
 831         /*
 832          * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
 833          * to point to it.  These are used by lookuppn() so that it
 834          * knows where to start from ('/' or '.').
 835          */
 836         vfs_setmntpoint(rootvfs, "/", 0);
 837         if (VFS_ROOT(rootvfs, &rootdir))
 838                 panic("vfs_mountroot: no root vnode");
 839 
 840         /*
 841          * At this point, the process tree consists of p0 and possibly some
 842          * direct children of p0.  (i.e. there are no grandchildren)
 843          *
 844          * Walk through them all, setting their current directory.
 845          */
 846         mutex_enter(&pidlock);
 847         for (p = practive; p != NULL; p = p->p_next) {
 848                 ASSERT(p == &p0 || p->p_parent == &p0);
 849 
 850                 PTOU(p)->u_cdir = rootdir;
 851                 VN_HOLD(PTOU(p)->u_cdir);
 852                 PTOU(p)->u_rdir = NULL;
 853         }
 854         mutex_exit(&pidlock);
 855 
 856         /*
 857          * Setup the global zone's rootvp, now that it exists.
 858          */
 859         global_zone->zone_rootvp = rootdir;
 860         VN_HOLD(global_zone->zone_rootvp);
 861 
 862         /*
 863          * Notify the module code that it can begin using the
 864          * root filesystem instead of the boot program's services.
 865          */
 866         modrootloaded = 1;
 867 
 868         /*
 869          * Special handling for a ZFS root file system.
 870          */
 871         zfs_boot_init();
 872 
 873         /*
 874          * Set up mnttab information for root
 875          */
 876         vfs_setresource(rootvfs, rootfs.bo_name, 0);
 877 
 878         /*
 879          * Notify cluster software that the root filesystem is available.
 880          */
 881         clboot_mountroot();
 882 
 883         /* Now that we're all done with the root FS, set up its vopstats */
 884         if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
 885                 /* Set flag for statistics collection */
 886                 if (vswp->vsw_flag & VSW_STATS) {
 887                         initialize_vopstats(&rootvfs->vfs_vopstats);
 888                         rootvfs->vfs_flag |= VFS_STATS;
 889                         rootvfs->vfs_fstypevsp =
 890                             get_fstype_vopstats(rootvfs, vswp);
 891                         rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
 892                 }
 893                 vfs_unrefvfssw(vswp);
 894         }
 895 
 896         /*
 897          * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
 898          * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
 899          */
 900         vfs_mountdevices();
 901         vfs_mountdev1();
 902 
 903         vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
 904         vfs_mountfs("proc", "/proc", "/proc");
 905         vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
 906         vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
 907         vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
 908 
 909         if (getzoneid() == GLOBAL_ZONEID) {
 910                 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
 911         }
 912 
 913 #ifdef __sparc
 914         /*
 915          * This bit of magic can go away when we convert sparc to
 916          * the new boot architecture based on ramdisk.
 917          *
 918          * Booting off a mirrored root volume:
 919          * At this point, we have booted and mounted root on a
 920          * single component of the mirror.  Complete the boot
 921          * by configuring SVM and converting the root to the
 922          * dev_t of the mirrored root device.  This dev_t conversion
 923          * only works because the underlying device doesn't change.
 924          */
 925         if (root_is_svm) {
 926                 if (svm_rootconf()) {
 927                         panic("vfs_mountroot: cannot remount root");
 928                 }
 929 
 930                 /*
 931                  * mnttab should reflect the new root device
 932                  */
 933                 vfs_lock_wait(rootvfs);
 934                 vfs_setresource(rootvfs, rootfs.bo_name, 0);
 935                 vfs_unlock(rootvfs);
 936         }
 937 #endif /* __sparc */
 938 
 939         if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
 940                 /*
 941                  * Look up the root device via devfs so that a dv_node is
 942                  * created for it. The vnode is never VN_RELE()ed.
 943                  * We allocate more than MAXPATHLEN so that the
 944                  * buffer passed to i_ddi_prompath_to_devfspath() is
 945                  * exactly MAXPATHLEN (the function expects a buffer
 946                  * of that length).
 947                  */
 948                 plen = strlen("/devices");
 949                 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
 950                 (void) strcpy(path, "/devices");
 951 
 952                 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
 953                     != DDI_SUCCESS ||
 954                     lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
 955 
 956                         /* NUL terminate in case "path" has garbage */
 957                         path[plen + MAXPATHLEN - 1] = '\0';
 958 #ifdef  DEBUG
 959                         cmn_err(CE_WARN, "!Cannot lookup root device: %s",
 960                             path);
 961 #endif
 962                 }
 963                 kmem_free(path, plen + MAXPATHLEN);
 964         }
 965 
 966         vfs_mnttabvp_setup();
 967 }
 968 
 969 /*
 970  * Check to see if our "block device" is actually a file.  If so,
 971  * automatically add a lofi device, and keep track of this fact.
 972  */
 973 static int
 974 lofi_add(const char *fsname, struct vfs *vfsp,
 975     mntopts_t *mntopts, struct mounta *uap)
 976 {
 977         int fromspace = (uap->flags & MS_SYSSPACE) ?
 978             UIO_SYSSPACE : UIO_USERSPACE;
 979         struct lofi_ioctl *li = NULL;
 980         struct vnode *vp = NULL;
 981         struct pathname pn = { NULL };
 982         ldi_ident_t ldi_id;
 983         ldi_handle_t ldi_hdl;
 984         vfssw_t *vfssw;
 985         int minor;
 986         int err = 0;
 987 
 988         if ((vfssw = vfs_getvfssw(fsname)) == NULL)
 989                 return (0);
 990 
 991         if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
 992                 vfs_unrefvfssw(vfssw);
 993                 return (0);
 994         }
 995 
 996         vfs_unrefvfssw(vfssw);
 997         vfssw = NULL;
 998 
 999         if (pn_get(uap->spec, fromspace, &pn) != 0)
1000                 return (0);
1001 
1002         if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
1003                 goto out;
1004 
1005         if (vp->v_type != VREG)
1006                 goto out;
1007 
1008         /* OK, this is a lofi mount. */
1009 
1010         if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
1011             vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
1012             vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
1013             vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
1014                 err = EINVAL;
1015                 goto out;
1016         }
1017 
1018         ldi_id = ldi_ident_from_anon();
1019         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1020         (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1021 
1022         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1023             &ldi_hdl, ldi_id);
1024 
1025         if (err)
1026                 goto out2;
1027 
1028         err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1029             FREAD | FWRITE | FKIOCTL, kcred, &minor);
1030 
1031         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1032 
1033         if (!err)
1034                 vfsp->vfs_lofi_minor = minor;
1035 
1036 out2:
1037         ldi_ident_release(ldi_id);
1038 out:
1039         if (li != NULL)
1040                 kmem_free(li, sizeof (*li));
1041         if (vp != NULL)
1042                 VN_RELE(vp);
1043         pn_free(&pn);
1044         return (err);
1045 }
1046 
1047 static void
1048 lofi_remove(struct vfs *vfsp)
1049 {
1050         struct lofi_ioctl *li = NULL;
1051         ldi_ident_t ldi_id;
1052         ldi_handle_t ldi_hdl;
1053         int err;
1054 
1055         if (vfsp->vfs_lofi_minor == 0)
1056                 return;
1057 
1058         ldi_id = ldi_ident_from_anon();
1059 
1060         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1061         li->li_minor = vfsp->vfs_lofi_minor;
1062         li->li_cleanup = B_TRUE;
1063 
1064         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1065             &ldi_hdl, ldi_id);
1066 
1067         if (err)
1068                 goto out;
1069 
1070         err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1071             FREAD | FWRITE | FKIOCTL, kcred, NULL);
1072 
1073         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1074 
1075         if (!err)
1076                 vfsp->vfs_lofi_minor = 0;
1077 
1078 out:
1079         ldi_ident_release(ldi_id);
1080         if (li != NULL)
1081                 kmem_free(li, sizeof (*li));
1082 }
1083 
1084 /*
1085  * Common mount code.  Called from the system call entry point, from autofs,
1086  * nfsv4 trigger mounts, and from pxfs.
1087  *
1088  * Takes the effective file system type, mount arguments, the mount point
1089  * vnode, flags specifying whether the mount is a remount and whether it
1090  * should be entered into the vfs list, and credentials.  Fills in its vfspp
1091  * parameter with the mounted file system instance's vfs.
1092  *
1093  * Note that the effective file system type is specified as a string.  It may
1094  * be null, in which case it's determined from the mount arguments, and may
1095  * differ from the type specified in the mount arguments; this is a hook to
1096  * allow interposition when instantiating file system instances.
1097  *
1098  * The caller is responsible for releasing its own hold on the mount point
1099  * vp (this routine does its own hold when necessary).
1100  * Also note that for remounts, the mount point vp should be the vnode for
1101  * the root of the file system rather than the vnode that the file system
1102  * is mounted on top of.
1103  */
1104 int
1105 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1106         struct vfs **vfspp)
1107 {
1108         struct vfssw    *vswp;
1109         vfsops_t        *vfsops;
1110         struct vfs      *vfsp;
1111         struct vnode    *bvp;
1112         dev_t           bdev = 0;
1113         mntopts_t       mnt_mntopts;
1114         int             error = 0;
1115         int             copyout_error = 0;
1116         int             ovflags;
1117         char            *opts = uap->optptr;
1118         char            *inargs = opts;
1119         int             optlen = uap->optlen;
1120         int             remount;
1121         int             rdonly;
1122         int             nbmand = 0;
1123         int             delmip = 0;
1124         int             addmip = 0;
1125         int             splice = ((uap->flags & MS_NOSPLICE) == 0);
1126         int             fromspace = (uap->flags & MS_SYSSPACE) ?
1127             UIO_SYSSPACE : UIO_USERSPACE;
1128         char            *resource = NULL, *mountpt = NULL;
1129         refstr_t        *oldresource, *oldmntpt;
1130         struct pathname pn, rpn;
1131         vsk_anchor_t    *vskap;
1132         char fstname[FSTYPSZ];
1133         zone_t          *zone;
1134 
1135         /*
1136          * The v_flag value for the mount point vp is permanently set
1137          * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1138          * for mount point locking.
1139          */
1140         mutex_enter(&vp->v_lock);
1141         vp->v_flag |= VVFSLOCK;
1142         mutex_exit(&vp->v_lock);
1143 
1144         mnt_mntopts.mo_count = 0;
1145         /*
1146          * Find the ops vector to use to invoke the file system-specific mount
1147          * method.  If the fsname argument is non-NULL, use it directly.
1148          * Otherwise, dig the file system type information out of the mount
1149          * arguments.
1150          *
1151          * A side effect is to hold the vfssw entry.
1152          *
1153          * Mount arguments can be specified in several ways, which are
1154          * distinguished by flag bit settings.  The preferred way is to set
1155          * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1156          * type supplied as a character string and the last two arguments
1157          * being a pointer to a character buffer and the size of the buffer.
1158          * On entry, the buffer holds a null terminated list of options; on
1159          * return, the string is the list of options the file system
1160          * recognized. If MS_DATA is set arguments five and six point to a
1161          * block of binary data which the file system interprets.
1162          * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1163          * consistently with these conventions.  To handle them, we check to
1164          * see whether the pointer to the file system name has a numeric value
1165          * less than 256.  If so, we treat it as an index.
1166          */
1167         if (fsname != NULL) {
1168                 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1169                         return (EINVAL);
1170                 }
1171         } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1172                 size_t n;
1173                 uint_t fstype;
1174 
1175                 fsname = fstname;
1176 
1177                 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1178                         RLOCK_VFSSW();
1179                         if (fstype == 0 || fstype >= nfstype ||
1180                             !ALLOCATED_VFSSW(&vfssw[fstype])) {
1181                                 RUNLOCK_VFSSW();
1182                                 return (EINVAL);
1183                         }
1184                         (void) strcpy(fsname, vfssw[fstype].vsw_name);
1185                         RUNLOCK_VFSSW();
1186                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1187                                 return (EINVAL);
1188                 } else {
1189                         /*
1190                          * Handle either kernel or user address space.
1191                          */
1192                         if (uap->flags & MS_SYSSPACE) {
1193                                 error = copystr(uap->fstype, fsname,
1194                                     FSTYPSZ, &n);
1195                         } else {
1196                                 error = copyinstr(uap->fstype, fsname,
1197                                     FSTYPSZ, &n);
1198                         }
1199                         if (error) {
1200                                 if (error == ENAMETOOLONG)
1201                                         return (EINVAL);
1202                                 return (error);
1203                         }
1204                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1205                                 return (EINVAL);
1206                 }
1207         } else {
1208                 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1209                         return (EINVAL);
1210                 fsname = vswp->vsw_name;
1211         }
1212         if (!VFS_INSTALLED(vswp))
1213                 return (EINVAL);
1214 
1215         if ((error = secpolicy_fs_allowed_mount(fsname)) != 0)  {
1216                 vfs_unrefvfssw(vswp);
1217                 return (error);
1218         }
1219 
1220         vfsops = &vswp->vsw_vfsops;
1221 
1222         vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1223         /*
1224          * Fetch mount options and parse them for generic vfs options
1225          */
1226         if (uap->flags & MS_OPTIONSTR) {
1227                 /*
1228                  * Limit the buffer size
1229                  */
1230                 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1231                         error = EINVAL;
1232                         goto errout;
1233                 }
1234                 if ((uap->flags & MS_SYSSPACE) == 0) {
1235                         inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1236                         inargs[0] = '\0';
1237                         if (optlen) {
1238                                 error = copyinstr(opts, inargs, (size_t)optlen,
1239                                     NULL);
1240                                 if (error) {
1241                                         goto errout;
1242                                 }
1243                         }
1244                 }
1245                 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1246         }
1247         /*
1248          * Flag bits override the options string.
1249          */
1250         if (uap->flags & MS_REMOUNT)
1251                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1252         if (uap->flags & MS_RDONLY)
1253                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1254         if (uap->flags & MS_NOSUID)
1255                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1256 
1257         /*
1258          * Check if this is a remount; must be set in the option string and
1259          * the file system must support a remount option.
1260          */
1261         if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1262             MNTOPT_REMOUNT, NULL)) {
1263                 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1264                         error = ENOTSUP;
1265                         goto errout;
1266                 }
1267                 uap->flags |= MS_REMOUNT;
1268         }
1269 
1270         /*
1271          * uap->flags and vfs_optionisset() should agree.
1272          */
1273         if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1274                 uap->flags |= MS_RDONLY;
1275         }
1276         if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1277                 uap->flags |= MS_NOSUID;
1278         }
1279         nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1280         ASSERT(splice || !remount);
1281         /*
1282          * If we are splicing the fs into the namespace,
1283          * perform mount point checks.
1284          *
1285          * We want to resolve the path for the mount point to eliminate
1286          * '.' and ".." and symlinks in mount points; we can't do the
1287          * same for the resource string, since it would turn
1288          * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1289          * this before grabbing vn_vfswlock(), because otherwise we
1290          * would deadlock with lookuppn().
1291          */
1292         if (splice) {
1293                 ASSERT(vp->v_count > 0);
1294 
1295                 /*
1296                  * Pick up mount point and device from appropriate space.
1297                  */
1298                 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1299                         resource = kmem_alloc(pn.pn_pathlen + 1,
1300                             KM_SLEEP);
1301                         (void) strcpy(resource, pn.pn_path);
1302                         pn_free(&pn);
1303                 }
1304                 /*
1305                  * Do a lookupname prior to taking the
1306                  * writelock. Mark this as completed if
1307                  * successful for later cleanup and addition to
1308                  * the mount in progress table.
1309                  */
1310                 if ((uap->flags & MS_GLOBAL) == 0 &&
1311                     lookupname(uap->spec, fromspace,
1312                     FOLLOW, NULL, &bvp) == 0) {
1313                         addmip = 1;
1314                 }
1315 
1316                 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1317                         pathname_t *pnp;
1318 
1319                         if (*pn.pn_path != '/') {
1320                                 error = EINVAL;
1321                                 pn_free(&pn);
1322                                 goto errout;
1323                         }
1324                         pn_alloc(&rpn);
1325                         /*
1326                          * Kludge to prevent autofs from deadlocking with
1327                          * itself when it calls domount().
1328                          *
1329                          * If autofs is calling, it is because it is doing
1330                          * (autofs) mounts in the process of an NFS mount.  A
1331                          * lookuppn() here would cause us to block waiting for
1332                          * said NFS mount to complete, which can't since this
1333                          * is the thread that was supposed to be doing it.
1334                          */
1335                         if (fromspace == UIO_USERSPACE) {
1336                                 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1337                                     NULL)) == 0) {
1338                                         pnp = &rpn;
1339                                 } else {
1340                                         /*
1341                                          * The file disappeared or otherwise
1342                                          * became inaccessible since we opened
1343                                          * it; might as well fail the mount
1344                                          * since the mount point is no longer
1345                                          * accessible.
1346                                          */
1347                                         pn_free(&rpn);
1348                                         pn_free(&pn);
1349                                         goto errout;
1350                                 }
1351                         } else {
1352                                 pnp = &pn;
1353                         }
1354                         mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1355                         (void) strcpy(mountpt, pnp->pn_path);
1356 
1357                         /*
1358                          * If the addition of the zone's rootpath
1359                          * would push us over a total path length
1360                          * of MAXPATHLEN, we fail the mount with
1361                          * ENAMETOOLONG, which is what we would have
1362                          * gotten if we were trying to perform the same
1363                          * mount in the global zone.
1364                          *
1365                          * strlen() doesn't count the trailing
1366                          * '\0', but zone_rootpathlen counts both a
1367                          * trailing '/' and the terminating '\0'.
1368                          */
1369                         if ((curproc->p_zone->zone_rootpathlen - 1 +
1370                             strlen(mountpt)) > MAXPATHLEN ||
1371                             (resource != NULL &&
1372                             (curproc->p_zone->zone_rootpathlen - 1 +
1373                             strlen(resource)) > MAXPATHLEN)) {
1374                                 error = ENAMETOOLONG;
1375                         }
1376 
1377                         pn_free(&rpn);
1378                         pn_free(&pn);
1379                 }
1380 
1381                 if (error)
1382                         goto errout;
1383 
1384                 /*
1385                  * Prevent path name resolution from proceeding past
1386                  * the mount point.
1387                  */
1388                 if (vn_vfswlock(vp) != 0) {
1389                         error = EBUSY;
1390                         goto errout;
1391                 }
1392 
1393                 /*
1394                  * Verify that it's legitimate to establish a mount on
1395                  * the prospective mount point.
1396                  */
1397                 if (vn_mountedvfs(vp) != NULL) {
1398                         /*
1399                          * The mount point lock was obtained after some
1400                          * other thread raced through and established a mount.
1401                          */
1402                         vn_vfsunlock(vp);
1403                         error = EBUSY;
1404                         goto errout;
1405                 }
1406                 if (vp->v_flag & VNOMOUNT) {
1407                         vn_vfsunlock(vp);
1408                         error = EINVAL;
1409                         goto errout;
1410                 }
1411         }
1412         if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1413                 uap->dataptr = NULL;
1414                 uap->datalen = 0;
1415         }
1416 
1417         /*
1418          * If this is a remount, we don't want to create a new VFS.
1419          * Instead, we pass the existing one with a remount flag.
1420          */
1421         if (remount) {
1422                 /*
1423                  * Confirm that the mount point is the root vnode of the
1424                  * file system that is being remounted.
1425                  * This can happen if the user specifies a different
1426                  * mount point directory pathname in the (re)mount command.
1427                  *
1428                  * Code below can only be reached if splice is true, so it's
1429                  * safe to do vn_vfsunlock() here.
1430                  */
1431                 if ((vp->v_flag & VROOT) == 0) {
1432                         vn_vfsunlock(vp);
1433                         error = ENOENT;
1434                         goto errout;
1435                 }
1436                 /*
1437                  * Disallow making file systems read-only unless file system
1438                  * explicitly allows it in its vfssw.  Ignore other flags.
1439                  */
1440                 if (rdonly && vn_is_readonly(vp) == 0 &&
1441                     (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1442                         vn_vfsunlock(vp);
1443                         error = EINVAL;
1444                         goto errout;
1445                 }
1446                 /*
1447                  * Disallow changing the NBMAND disposition of the file
1448                  * system on remounts.
1449                  */
1450                 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1451                     (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1452                         vn_vfsunlock(vp);
1453                         error = EINVAL;
1454                         goto errout;
1455                 }
1456                 vfsp = vp->v_vfsp;
1457                 ovflags = vfsp->vfs_flag;
1458                 vfsp->vfs_flag |= VFS_REMOUNT;
1459                 vfsp->vfs_flag &= ~VFS_RDONLY;
1460         } else {
1461                 vfsp = vfs_alloc(KM_SLEEP);
1462                 VFS_INIT(vfsp, vfsops, NULL);
1463         }
1464 
1465         VFS_HOLD(vfsp);
1466 
1467         if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1468                 if (!remount) {
1469                         if (splice)
1470                                 vn_vfsunlock(vp);
1471                         vfs_free(vfsp);
1472                 } else {
1473                         vn_vfsunlock(vp);
1474                         VFS_RELE(vfsp);
1475                 }
1476                 goto errout;
1477         }
1478 
1479         /*
1480          * PRIV_SYS_MOUNT doesn't mean you can become root.
1481          */
1482         if (vfsp->vfs_lofi_minor != 0) {
1483                 uap->flags |= MS_NOSUID;
1484                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1485         }
1486 
1487         /*
1488          * The vfs_reflock is not used anymore; the code below explicitly
1489          * holds it, preventing others from accessing it directly.
1490          */
1491         if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1492             !(vfsp->vfs_flag & VFS_REMOUNT))
1493                 cmn_err(CE_WARN,
1494                     "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1495 
1496         /*
1497          * Lock the vfs. If this is a remount we want to avoid spurious umount
1498          * failures that happen as a side-effect of fsflush() and other mount
1499          * and unmount operations that might be going on simultaneously and
1500          * may have locked the vfs currently. To not return EBUSY immediately
1501          * here we use vfs_lock_wait(), not vfs_lock(), for the remount case.
1502          */
1503         if (!remount) {
1504                 if (error = vfs_lock(vfsp)) {
1505                         vfsp->vfs_flag = ovflags;
1506 
1507                         lofi_remove(vfsp);
1508 
1509                         if (splice)
1510                                 vn_vfsunlock(vp);
1511                         vfs_free(vfsp);
1512                         goto errout;
1513                 }
1514         } else {
1515                 vfs_lock_wait(vfsp);
1516         }
1517 
1518         /*
1519          * Add device to mount in progress table, global mounts require special
1520          * handling. It is possible that we have already done the lookupname
1521          * on a spliced, non-global fs. If so, we don't want to do it again
1522          * since we cannot do a lookupname after taking the
1523          * wlock above. This case is for a non-spliced, non-global filesystem.
1524          */
1525         if (!addmip) {
1526                 if ((uap->flags & MS_GLOBAL) == 0 &&
1527                     lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1528                         addmip = 1;
1529                 }
1530         }
1531 
1532         if (addmip) {
1533                 vnode_t *lvp = NULL;
1534 
1535                 error = vfs_get_lofi(vfsp, &lvp);
1536                 if (error > 0) {
1537                         lofi_remove(vfsp);
1538 
1539                         if (splice)
1540                                 vn_vfsunlock(vp);
1541                         vfs_unlock(vfsp);
1542 
1543                         if (remount) {
1544                                 VFS_RELE(vfsp);
1545                         } else {
1546                                 vfs_free(vfsp);
1547                         }
1548 
1549                         goto errout;
1550                 } else if (error == -1) {
1551                         bdev = bvp->v_rdev;
1552                         VN_RELE(bvp);
1553                 } else {
1554                         bdev = lvp->v_rdev;
1555                         VN_RELE(lvp);
1556                         VN_RELE(bvp);
1557                 }
1558 
1559                 vfs_addmip(bdev, vfsp);
1560                 addmip = 0;
1561                 delmip = 1;
1562         }
1563         /*
1564          * Invalidate cached entry for the mount point.
1565          */
1566         if (splice)
1567                 dnlc_purge_vp(vp);
1568 
1569         /*
1570          * If have an option string but the filesystem doesn't supply a
1571          * prototype options table, create a table with the global
1572          * options and sufficient room to accept all the options in the
1573          * string.  Then parse the passed in option string
1574          * accepting all the options in the string.  This gives us an
1575          * option table with all the proper cancel properties for the
1576          * global options.
1577          *
1578          * Filesystems that supply a prototype options table are handled
1579          * earlier in this function.
1580          */
1581         if (uap->flags & MS_OPTIONSTR) {
1582                 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1583                         mntopts_t tmp_mntopts;
1584 
1585                         tmp_mntopts.mo_count = 0;
1586                         vfs_createopttbl_extend(&tmp_mntopts, inargs,
1587                             &mnt_mntopts);
1588                         vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1589                         vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1590                         vfs_freeopttbl(&tmp_mntopts);
1591                 }
1592         }
1593 
1594         /*
1595          * Serialize with zone state transitions.
1596          * See vfs_list_add; zone mounted into is:
1597          *      zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1598          * not the zone doing the mount (curproc->p_zone), but if we're already
1599          * inside a NGZ, then we know what zone we are.
1600          */
1601         if (INGLOBALZONE(curproc)) {
1602                 zone = zone_find_by_path(mountpt);
1603                 ASSERT(zone != NULL);
1604         } else {
1605                 zone = curproc->p_zone;
1606                 /*
1607                  * zone_find_by_path does a hold, so do one here too so that
1608                  * we can do a zone_rele after mount_completed.
1609                  */
1610                 zone_hold(zone);
1611         }
1612         mount_in_progress(zone);
1613         /*
1614          * Instantiate (or reinstantiate) the file system.  If appropriate,
1615          * splice it into the file system name space.
1616          *
1617          * We want VFS_MOUNT() to be able to override the vfs_resource
1618          * string if necessary (ie, mntfs), and also for a remount to
1619          * change the same (necessary when remounting '/' during boot).
1620          * So we set up vfs_mntpt and vfs_resource to what we think they
1621          * should be, then hand off control to VFS_MOUNT() which can
1622          * override this.
1623          *
1624          * For safety's sake, when changing vfs_resource or vfs_mntpt of
1625          * a vfs which is on the vfs list (i.e. during a remount), we must
1626          * never set those fields to NULL. Several bits of code make
1627          * assumptions that the fields are always valid.
1628          */
1629         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1630         if (remount) {
1631                 if ((oldresource = vfsp->vfs_resource) != NULL)
1632                         refstr_hold(oldresource);
1633                 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1634                         refstr_hold(oldmntpt);
1635         }
1636         vfs_setresource(vfsp, resource, 0);
1637         vfs_setmntpoint(vfsp, mountpt, 0);
1638 
1639         /*
1640          * going to mount on this vnode, so notify.
1641          */
1642         vnevent_mountedover(vp, NULL);
1643         error = VFS_MOUNT(vfsp, vp, uap, credp);
1644 
1645         if (uap->flags & MS_RDONLY)
1646                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1647         if (uap->flags & MS_NOSUID)
1648                 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1649         if (uap->flags & MS_GLOBAL)
1650                 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1651 
1652         if (error) {
1653                 lofi_remove(vfsp);
1654 
1655                 if (remount) {
1656                         /* put back pre-remount options */
1657                         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1658                         vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1659                             VFSSP_VERBATIM);
1660                         if (oldmntpt)
1661                                 refstr_rele(oldmntpt);
1662                         vfs_setresource(vfsp, refstr_value(oldresource),
1663                             VFSSP_VERBATIM);
1664                         if (oldresource)
1665                                 refstr_rele(oldresource);
1666                         vfsp->vfs_flag = ovflags;
1667                         vfs_unlock(vfsp);
1668                         VFS_RELE(vfsp);
1669                 } else {
1670                         vfs_unlock(vfsp);
1671                         vfs_freemnttab(vfsp);
1672                         vfs_free(vfsp);
1673                 }
1674         } else {
1675                 /*
1676                  * Set the mount time to now
1677                  */
1678                 vfsp->vfs_mtime = ddi_get_time();
1679                 if (remount) {
1680                         vfsp->vfs_flag &= ~VFS_REMOUNT;
1681                         if (oldresource)
1682                                 refstr_rele(oldresource);
1683                         if (oldmntpt)
1684                                 refstr_rele(oldmntpt);
1685                 } else if (splice) {
1686                         /*
1687                          * Link vfsp into the name space at the mount
1688                          * point. Vfs_add() is responsible for
1689                          * holding the mount point which will be
1690                          * released when vfs_remove() is called.
1691                          */
1692                         vfs_add(vp, vfsp, uap->flags);
1693                 } else {
1694                         /*
1695                          * Hold the reference to file system which is
1696                          * not linked into the name space.
1697                          */
1698                         vfsp->vfs_zone = NULL;
1699                         VFS_HOLD(vfsp);
1700                         vfsp->vfs_vnodecovered = NULL;
1701                 }
1702                 /*
1703                  * Set flags for global options encountered
1704                  */
1705                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1706                         vfsp->vfs_flag |= VFS_RDONLY;
1707                 else
1708                         vfsp->vfs_flag &= ~VFS_RDONLY;
1709                 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1710                         vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1711                 } else {
1712                         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1713                                 vfsp->vfs_flag |= VFS_NODEVICES;
1714                         else
1715                                 vfsp->vfs_flag &= ~VFS_NODEVICES;
1716                         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1717                                 vfsp->vfs_flag |= VFS_NOSETUID;
1718                         else
1719                                 vfsp->vfs_flag &= ~VFS_NOSETUID;
1720                 }
1721                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1722                         vfsp->vfs_flag |= VFS_NBMAND;
1723                 else
1724                         vfsp->vfs_flag &= ~VFS_NBMAND;
1725 
1726                 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1727                         vfsp->vfs_flag |= VFS_XATTR;
1728                 else
1729                         vfsp->vfs_flag &= ~VFS_XATTR;
1730 
1731                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1732                         vfsp->vfs_flag |= VFS_NOEXEC;
1733                 else
1734                         vfsp->vfs_flag &= ~VFS_NOEXEC;
1735 
1736                 /*
1737                  * Now construct the output option string of options
1738                  * we recognized.
1739                  */
1740                 if (uap->flags & MS_OPTIONSTR) {
1741                         vfs_list_read_lock();
1742                         copyout_error = vfs_buildoptionstr(
1743                             &vfsp->vfs_mntopts, inargs, optlen);
1744                         vfs_list_unlock();
1745                         if (copyout_error == 0 &&
1746                             (uap->flags & MS_SYSSPACE) == 0) {
1747                                 copyout_error = copyoutstr(inargs, opts,
1748                                     optlen, NULL);
1749                         }
1750                 }
1751 
1752                 /*
1753                  * If this isn't a remount, set up the vopstats before
1754                  * anyone can touch this. We only allow spliced file
1755                  * systems (file systems which are in the namespace) to
1756                  * have the VFS_STATS flag set.
1757                  * NOTE: PxFS mounts the underlying file system with
1758                  * MS_NOSPLICE set and copies those vfs_flags to its private
1759                  * vfs structure. As a result, PxFS should never have
1760                  * the VFS_STATS flag or else we might access the vfs
1761                  * statistics-related fields prior to them being
1762                  * properly initialized.
1763                  */
1764                 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1765                         initialize_vopstats(&vfsp->vfs_vopstats);
1766                         /*
1767                          * We need to set vfs_vskap to NULL because there's
1768                          * a chance it won't be set below.  This is checked
1769                          * in teardown_vopstats() so we can't have garbage.
1770                          */
1771                         vfsp->vfs_vskap = NULL;
1772                         vfsp->vfs_flag |= VFS_STATS;
1773                         vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1774                 }
1775 
1776                 if (vswp->vsw_flag & VSW_XID)
1777                         vfsp->vfs_flag |= VFS_XID;
1778 
1779                 vfs_unlock(vfsp);
1780         }
1781         mount_completed(zone);
1782         zone_rele(zone);
1783         if (splice)
1784                 vn_vfsunlock(vp);
1785 
1786         if ((error == 0) && (copyout_error == 0)) {
1787                 if (!remount) {
1788                         /*
1789                          * Don't call get_vskstat_anchor() while holding
1790                          * locks since it allocates memory and calls
1791                          * VFS_STATVFS().  For NFS, the latter can generate
1792                          * an over-the-wire call.
1793                          */
1794                         vskap = get_vskstat_anchor(vfsp);
1795                         /* Only take the lock if we have something to do */
1796                         if (vskap != NULL) {
1797                                 vfs_lock_wait(vfsp);
1798                                 if (vfsp->vfs_flag & VFS_STATS) {
1799                                         vfsp->vfs_vskap = vskap;
1800                                 }
1801                                 vfs_unlock(vfsp);
1802                         }
1803                 }
1804                 /* Return vfsp to caller. */
1805                 *vfspp = vfsp;
1806         }
1807 errout:
1808         vfs_freeopttbl(&mnt_mntopts);
1809         if (resource != NULL)
1810                 kmem_free(resource, strlen(resource) + 1);
1811         if (mountpt != NULL)
1812                 kmem_free(mountpt, strlen(mountpt) + 1);
1813         /*
1814          * It is possible we errored prior to adding to mount in progress
1815          * table. Must free vnode we acquired with successful lookupname.
1816          */
1817         if (addmip)
1818                 VN_RELE(bvp);
1819         if (delmip)
1820                 vfs_delmip(vfsp);
1821         ASSERT(vswp != NULL);
1822         vfs_unrefvfssw(vswp);
1823         if (inargs != opts)
1824                 kmem_free(inargs, MAX_MNTOPT_STR);
1825         if (copyout_error) {
1826                 lofi_remove(vfsp);
1827                 VFS_RELE(vfsp);
1828                 error = copyout_error;
1829         }
1830         return (error);
1831 }
1832 
1833 static void
1834 vfs_setpath(
1835     struct vfs *vfsp,           /* vfs being updated */
1836     refstr_t **refp,            /* Ref-count string to contain the new path */
1837     const char *newpath,        /* Path to add to refp (above) */
1838     uint32_t flag)              /* flag */
1839 {
1840         size_t len;
1841         refstr_t *ref;
1842         zone_t *zone = curproc->p_zone;
1843         char *sp;
1844         int have_list_lock = 0;
1845 
1846         ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1847 
1848         /*
1849          * New path must be less than MAXPATHLEN because mntfs
1850          * will only display up to MAXPATHLEN bytes. This is currently
1851          * safe, because domount() uses pn_get(), and other callers
1852          * similarly cap the size to fewer than MAXPATHLEN bytes.
1853          */
1854 
1855         ASSERT(strlen(newpath) < MAXPATHLEN);
1856 
1857         /* mntfs requires consistency while vfs list lock is held */
1858 
1859         if (VFS_ON_LIST(vfsp)) {
1860                 have_list_lock = 1;
1861                 vfs_list_lock();
1862         }
1863 
1864         if (*refp != NULL)
1865                 refstr_rele(*refp);
1866 
1867         /*
1868          * If we are in a non-global zone then we prefix the supplied path,
1869          * newpath, with the zone's root path, with two exceptions. The first
1870          * is where we have been explicitly directed to avoid doing so; this
1871          * will be the case following a failed remount, where the path supplied
1872          * will be a saved version which must now be restored. The second
1873          * exception is where newpath is not a pathname but a descriptive name,
1874          * e.g. "procfs".
1875          */
1876         if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1877                 ref = refstr_alloc(newpath);
1878                 goto out;
1879         }
1880 
1881         /*
1882          * Truncate the trailing '/' in the zoneroot, and merge
1883          * in the zone's rootpath with the "newpath" (resource
1884          * or mountpoint) passed in.
1885          *
1886          * The size of the required buffer is thus the size of
1887          * the buffer required for the passed-in newpath
1888          * (strlen(newpath) + 1), plus the size of the buffer
1889          * required to hold zone_rootpath (zone_rootpathlen)
1890          * minus one for one of the now-superfluous NUL
1891          * terminations, minus one for the trailing '/'.
1892          *
1893          * That gives us:
1894          *
1895          * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1896          *
1897          * Which is what we have below.
1898          */
1899 
1900         len = strlen(newpath) + zone->zone_rootpathlen - 1;
1901         sp = kmem_alloc(len, KM_SLEEP);
1902 
1903         /*
1904          * Copy everything including the trailing slash, which
1905          * we then overwrite with the NUL character.
1906          */
1907 
1908         (void) strcpy(sp, zone->zone_rootpath);
1909         sp[zone->zone_rootpathlen - 2] = '\0';
1910         (void) strcat(sp, newpath);
1911 
1912         ref = refstr_alloc(sp);
1913         kmem_free(sp, len);
1914 out:
1915         *refp = ref;
1916 
1917         if (have_list_lock) {
1918                 vfs_mnttab_modtimeupd();
1919                 vfs_list_unlock();
1920         }
1921 }
1922 
1923 /*
1924  * Record a mounted resource name in a vfs structure.
1925  * If vfsp is already mounted, caller must hold the vfs lock.
1926  */
1927 void
1928 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1929 {
1930         if (resource == NULL || resource[0] == '\0')
1931                 resource = VFS_NORESOURCE;
1932         vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1933 }
1934 
1935 /*
1936  * Record a mount point name in a vfs structure.
1937  * If vfsp is already mounted, caller must hold the vfs lock.
1938  */
1939 void
1940 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1941 {
1942         if (mntpt == NULL || mntpt[0] == '\0')
1943                 mntpt = VFS_NOMNTPT;
1944         vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1945 }
1946 
1947 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1948 
1949 refstr_t *
1950 vfs_getresource(const struct vfs *vfsp)
1951 {
1952         refstr_t *resource;
1953 
1954         vfs_list_read_lock();
1955         resource = vfsp->vfs_resource;
1956         refstr_hold(resource);
1957         vfs_list_unlock();
1958 
1959         return (resource);
1960 }
1961 
1962 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1963 
1964 refstr_t *
1965 vfs_getmntpoint(const struct vfs *vfsp)
1966 {
1967         refstr_t *mntpt;
1968 
1969         vfs_list_read_lock();
1970         mntpt = vfsp->vfs_mntpt;
1971         refstr_hold(mntpt);
1972         vfs_list_unlock();
1973 
1974         return (mntpt);
1975 }
1976 
1977 /*
1978  * Create an empty options table with enough empty slots to hold all
1979  * The options in the options string passed as an argument.
1980  * Potentially prepend another options table.
1981  *
1982  * Note: caller is responsible for locking the vfs list, if needed,
1983  *       to protect mops.
1984  */
1985 static void
1986 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1987     const mntopts_t *mtmpl)
1988 {
1989         const char *s = opts;
1990         uint_t count;
1991 
1992         if (opts == NULL || *opts == '\0') {
1993                 count = 0;
1994         } else {
1995                 count = 1;
1996 
1997                 /*
1998                  * Count number of options in the string
1999                  */
2000                 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
2001                         count++;
2002                         s++;
2003                 }
2004         }
2005         vfs_copyopttbl_extend(mtmpl, mops, count);
2006 }
2007 
2008 /*
2009  * Create an empty options table with enough empty slots to hold all
2010  * The options in the options string passed as an argument.
2011  *
2012  * This function is *not* for general use by filesystems.
2013  *
2014  * Note: caller is responsible for locking the vfs list, if needed,
2015  *       to protect mops.
2016  */
2017 void
2018 vfs_createopttbl(mntopts_t *mops, const char *opts)
2019 {
2020         vfs_createopttbl_extend(mops, opts, NULL);
2021 }
2022 
2023 
2024 /*
2025  * Swap two mount options tables
2026  */
2027 static void
2028 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2029 {
2030         uint_t tmpcnt;
2031         mntopt_t *tmplist;
2032 
2033         tmpcnt = optbl2->mo_count;
2034         tmplist = optbl2->mo_list;
2035         optbl2->mo_count = optbl1->mo_count;
2036         optbl2->mo_list = optbl1->mo_list;
2037         optbl1->mo_count = tmpcnt;
2038         optbl1->mo_list = tmplist;
2039 }
2040 
2041 static void
2042 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2043 {
2044         vfs_list_lock();
2045         vfs_swapopttbl_nolock(optbl1, optbl2);
2046         vfs_mnttab_modtimeupd();
2047         vfs_list_unlock();
2048 }
2049 
2050 static char **
2051 vfs_copycancelopt_extend(char **const moc, int extend)
2052 {
2053         int i = 0;
2054         int j;
2055         char **result;
2056 
2057         if (moc != NULL) {
2058                 for (; moc[i] != NULL; i++)
2059                         /* count number of options to cancel */;
2060         }
2061 
2062         if (i + extend == 0)
2063                 return (NULL);
2064 
2065         result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2066 
2067         for (j = 0; j < i; j++) {
2068                 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2069                 (void) strcpy(result[j], moc[j]);
2070         }
2071         for (; j <= i + extend; j++)
2072                 result[j] = NULL;
2073 
2074         return (result);
2075 }
2076 
2077 static void
2078 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2079 {
2080         char *sp, *dp;
2081 
2082         d->mo_flags = s->mo_flags;
2083         d->mo_data = s->mo_data;
2084         sp = s->mo_name;
2085         if (sp != NULL) {
2086                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2087                 (void) strcpy(dp, sp);
2088                 d->mo_name = dp;
2089         } else {
2090                 d->mo_name = NULL; /* should never happen */
2091         }
2092 
2093         d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2094 
2095         sp = s->mo_arg;
2096         if (sp != NULL) {
2097                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2098                 (void) strcpy(dp, sp);
2099                 d->mo_arg = dp;
2100         } else {
2101                 d->mo_arg = NULL;
2102         }
2103 }
2104 
2105 /*
2106  * Copy a mount options table, possibly allocating some spare
2107  * slots at the end.  It is permissible to copy_extend the NULL table.
2108  */
2109 static void
2110 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2111 {
2112         uint_t i, count;
2113         mntopt_t *motbl;
2114 
2115         /*
2116          * Clear out any existing stuff in the options table being initialized
2117          */
2118         vfs_freeopttbl(dmo);
2119         count = (smo == NULL) ? 0 : smo->mo_count;
2120         if ((count + extra) == 0)       /* nothing to do */
2121                 return;
2122         dmo->mo_count = count + extra;
2123         motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2124         dmo->mo_list = motbl;
2125         for (i = 0; i < count; i++) {
2126                 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2127         }
2128         for (i = count; i < count + extra; i++) {
2129                 motbl[i].mo_flags = MO_EMPTY;
2130         }
2131 }
2132 
2133 /*
2134  * Copy a mount options table.
2135  *
2136  * This function is *not* for general use by filesystems.
2137  *
2138  * Note: caller is responsible for locking the vfs list, if needed,
2139  *       to protect smo and dmo.
2140  */
2141 void
2142 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2143 {
2144         vfs_copyopttbl_extend(smo, dmo, 0);
2145 }
2146 
2147 static char **
2148 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2149 {
2150         int c1 = 0;
2151         int c2 = 0;
2152         char **result;
2153         char **sp1, **sp2, **dp;
2154 
2155         /*
2156          * First we count both lists of cancel options.
2157          * If either is NULL or has no elements, we return a copy of
2158          * the other.
2159          */
2160         if (mop1->mo_cancel != NULL) {
2161                 for (; mop1->mo_cancel[c1] != NULL; c1++)
2162                         /* count cancel options in mop1 */;
2163         }
2164 
2165         if (c1 == 0)
2166                 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2167 
2168         if (mop2->mo_cancel != NULL) {
2169                 for (; mop2->mo_cancel[c2] != NULL; c2++)
2170                         /* count cancel options in mop2 */;
2171         }
2172 
2173         result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2174 
2175         if (c2 == 0)
2176                 return (result);
2177 
2178         /*
2179          * When we get here, we've got two sets of cancel options;
2180          * we need to merge the two sets.  We know that the result
2181          * array has "c1+c2+1" entries and in the end we might shrink
2182          * it.
2183          * Result now has a copy of the c1 entries from mop1; we'll
2184          * now lookup all the entries of mop2 in mop1 and copy it if
2185          * it is unique.
2186          * This operation is O(n^2) but it's only called once per
2187          * filesystem per duplicate option.  This is a situation
2188          * which doesn't arise with the filesystems in ON and
2189          * n is generally 1.
2190          */
2191 
2192         dp = &result[c1];
2193         for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2194                 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2195                         if (strcmp(*sp1, *sp2) == 0)
2196                                 break;
2197                 }
2198                 if (*sp1 == NULL) {
2199                         /*
2200                          * Option *sp2 not found in mop1, so copy it.
2201                          * The calls to vfs_copycancelopt_extend()
2202                          * guarantee that there's enough room.
2203                          */
2204                         *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2205                         (void) strcpy(*dp++, *sp2);
2206                 }
2207         }
2208         if (dp != &result[c1+c2]) {
2209                 size_t bytes = (dp - result + 1) * sizeof (char *);
2210                 char **nres = kmem_alloc(bytes, KM_SLEEP);
2211 
2212                 bcopy(result, nres, bytes);
2213                 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2214                 result = nres;
2215         }
2216         return (result);
2217 }
2218 
2219 /*
2220  * Merge two mount option tables (outer and inner) into one.  This is very
2221  * similar to "merging" global variables and automatic variables in C.
2222  *
2223  * This isn't (and doesn't have to be) fast.
2224  *
2225  * This function is *not* for general use by filesystems.
2226  *
2227  * Note: caller is responsible for locking the vfs list, if needed,
2228  *       to protect omo, imo & dmo.
2229  */
2230 void
2231 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2232 {
2233         uint_t i, count;
2234         mntopt_t *mop, *motbl;
2235         uint_t freeidx;
2236 
2237         /*
2238          * First determine how much space we need to allocate.
2239          */
2240         count = omo->mo_count;
2241         for (i = 0; i < imo->mo_count; i++) {
2242                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2243                         continue;
2244                 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2245                         count++;
2246         }
2247         ASSERT(count >= omo->mo_count &&
2248             count <= omo->mo_count + imo->mo_count);
2249         motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2250         for (i = 0; i < omo->mo_count; i++)
2251                 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2252         freeidx = omo->mo_count;
2253         for (i = 0; i < imo->mo_count; i++) {
2254                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2255                         continue;
2256                 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2257                         char **newcanp;
2258                         uint_t index = mop - omo->mo_list;
2259 
2260                         newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2261 
2262                         vfs_freeopt(&motbl[index]);
2263                         vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2264 
2265                         vfs_freecancelopt(motbl[index].mo_cancel);
2266                         motbl[index].mo_cancel = newcanp;
2267                 } else {
2268                         /*
2269                          * If it's a new option, just copy it over to the first
2270                          * free location.
2271                          */
2272                         vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2273                 }
2274         }
2275         dmo->mo_count = count;
2276         dmo->mo_list = motbl;
2277 }
2278 
2279 /*
2280  * Functions to set and clear mount options in a mount options table.
2281  */
2282 
2283 /*
2284  * Clear a mount option, if it exists.
2285  *
2286  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2287  * the vfs list.
2288  */
2289 static void
2290 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2291 {
2292         struct mntopt *mop;
2293         uint_t i, count;
2294 
2295         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2296 
2297         count = mops->mo_count;
2298         for (i = 0; i < count; i++) {
2299                 mop = &mops->mo_list[i];
2300 
2301                 if (mop->mo_flags & MO_EMPTY)
2302                         continue;
2303                 if (strcmp(opt, mop->mo_name))
2304                         continue;
2305                 mop->mo_flags &= ~MO_SET;
2306                 if (mop->mo_arg != NULL) {
2307                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2308                 }
2309                 mop->mo_arg = NULL;
2310                 if (update_mnttab)
2311                         vfs_mnttab_modtimeupd();
2312                 break;
2313         }
2314 }
2315 
2316 void
2317 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2318 {
2319         int gotlock = 0;
2320 
2321         if (VFS_ON_LIST(vfsp)) {
2322                 gotlock = 1;
2323                 vfs_list_lock();
2324         }
2325         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2326         if (gotlock)
2327                 vfs_list_unlock();
2328 }
2329 
2330 
2331 /*
2332  * Set a mount option on.  If it's not found in the table, it's silently
2333  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2334  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2335  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2336  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2337  * MO_EMPTY set is created as the option passed in.
2338  *
2339  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2340  * the vfs list.
2341  */
2342 static void
2343 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2344     const char *arg, int flags, int update_mnttab)
2345 {
2346         mntopt_t *mop;
2347         uint_t i, count;
2348         char *sp;
2349 
2350         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2351 
2352         if (flags & VFS_CREATEOPT) {
2353                 if (vfs_hasopt(mops, opt) != NULL) {
2354                         flags &= ~VFS_CREATEOPT;
2355                 }
2356         }
2357         count = mops->mo_count;
2358         for (i = 0; i < count; i++) {
2359                 mop = &mops->mo_list[i];
2360 
2361                 if (mop->mo_flags & MO_EMPTY) {
2362                         if ((flags & VFS_CREATEOPT) == 0)
2363                                 continue;
2364                         sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2365                         (void) strcpy(sp, opt);
2366                         mop->mo_name = sp;
2367                         if (arg != NULL)
2368                                 mop->mo_flags = MO_HASVALUE;
2369                         else
2370                                 mop->mo_flags = 0;
2371                 } else if (strcmp(opt, mop->mo_name)) {
2372                         continue;
2373                 }
2374                 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2375                         break;
2376                 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2377                         sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2378                         (void) strcpy(sp, arg);
2379                 } else {
2380                         sp = NULL;
2381                 }
2382                 if (mop->mo_arg != NULL)
2383                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2384                 mop->mo_arg = sp;
2385                 if (flags & VFS_DISPLAY)
2386                         mop->mo_flags &= ~MO_NODISPLAY;
2387                 if (flags & VFS_NODISPLAY)
2388                         mop->mo_flags |= MO_NODISPLAY;
2389                 mop->mo_flags |= MO_SET;
2390                 if (mop->mo_cancel != NULL) {
2391                         char **cp;
2392 
2393                         for (cp = mop->mo_cancel; *cp != NULL; cp++)
2394                                 vfs_clearmntopt_nolock(mops, *cp, 0);
2395                 }
2396                 if (update_mnttab)
2397                         vfs_mnttab_modtimeupd();
2398                 break;
2399         }
2400 }
2401 
2402 void
2403 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2404 {
2405         int gotlock = 0;
2406 
2407         if (VFS_ON_LIST(vfsp)) {
2408                 gotlock = 1;
2409                 vfs_list_lock();
2410         }
2411         vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2412         if (gotlock)
2413                 vfs_list_unlock();
2414 }
2415 
2416 
2417 /*
2418  * Add a "tag" option to a mounted file system's options list.
2419  *
2420  * Note: caller is responsible for locking the vfs list, if needed,
2421  *       to protect mops.
2422  */
2423 static mntopt_t *
2424 vfs_addtag(mntopts_t *mops, const char *tag)
2425 {
2426         uint_t count;
2427         mntopt_t *mop, *motbl;
2428 
2429         count = mops->mo_count + 1;
2430         motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2431         if (mops->mo_count) {
2432                 size_t len = (count - 1) * sizeof (mntopt_t);
2433 
2434                 bcopy(mops->mo_list, motbl, len);
2435                 kmem_free(mops->mo_list, len);
2436         }
2437         mops->mo_count = count;
2438         mops->mo_list = motbl;
2439         mop = &motbl[count - 1];
2440         mop->mo_flags = MO_TAG;
2441         mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2442         (void) strcpy(mop->mo_name, tag);
2443         return (mop);
2444 }
2445 
2446 /*
2447  * Allow users to set arbitrary "tags" in a vfs's mount options.
2448  * Broader use within the kernel is discouraged.
2449  */
2450 int
2451 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2452     cred_t *cr)
2453 {
2454         vfs_t *vfsp;
2455         mntopts_t *mops;
2456         mntopt_t *mop;
2457         int found = 0;
2458         dev_t dev = makedevice(major, minor);
2459         int err = 0;
2460         char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2461 
2462         /*
2463          * Find the desired mounted file system
2464          */
2465         vfs_list_lock();
2466         vfsp = rootvfs;
2467         do {
2468                 if (vfsp->vfs_dev == dev &&
2469                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2470                         found = 1;
2471                         break;
2472                 }
2473                 vfsp = vfsp->vfs_next;
2474         } while (vfsp != rootvfs);
2475 
2476         if (!found) {
2477                 err = EINVAL;
2478                 goto out;
2479         }
2480         err = secpolicy_fs_config(cr, vfsp);
2481         if (err != 0)
2482                 goto out;
2483 
2484         mops = &vfsp->vfs_mntopts;
2485         /*
2486          * Add tag if it doesn't already exist
2487          */
2488         if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2489                 int len;
2490 
2491                 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2492                 len = strlen(buf);
2493                 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2494                         err = ENAMETOOLONG;
2495                         goto out;
2496                 }
2497                 mop = vfs_addtag(mops, tag);
2498         }
2499         if ((mop->mo_flags & MO_TAG) == 0) {
2500                 err = EINVAL;
2501                 goto out;
2502         }
2503         vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2504 out:
2505         vfs_list_unlock();
2506         kmem_free(buf, MAX_MNTOPT_STR);
2507         return (err);
2508 }
2509 
2510 /*
2511  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2512  * Broader use within the kernel is discouraged.
2513  */
2514 int
2515 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2516     cred_t *cr)
2517 {
2518         vfs_t *vfsp;
2519         mntopt_t *mop;
2520         int found = 0;
2521         dev_t dev = makedevice(major, minor);
2522         int err = 0;
2523 
2524         /*
2525          * Find the desired mounted file system
2526          */
2527         vfs_list_lock();
2528         vfsp = rootvfs;
2529         do {
2530                 if (vfsp->vfs_dev == dev &&
2531                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2532                         found = 1;
2533                         break;
2534                 }
2535                 vfsp = vfsp->vfs_next;
2536         } while (vfsp != rootvfs);
2537 
2538         if (!found) {
2539                 err = EINVAL;
2540                 goto out;
2541         }
2542         err = secpolicy_fs_config(cr, vfsp);
2543         if (err != 0)
2544                 goto out;
2545 
2546         if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2547                 err = EINVAL;
2548                 goto out;
2549         }
2550         if ((mop->mo_flags & MO_TAG) == 0) {
2551                 err = EINVAL;
2552                 goto out;
2553         }
2554         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2555 out:
2556         vfs_list_unlock();
2557         return (err);
2558 }
2559 
2560 /*
2561  * Function to parse an option string and fill in a mount options table.
2562  * Unknown options are silently ignored.  The input option string is modified
2563  * by replacing separators with nulls.  If the create flag is set, options
2564  * not found in the table are just added on the fly.  The table must have
2565  * an option slot marked MO_EMPTY to add an option on the fly.
2566  *
2567  * This function is *not* for general use by filesystems.
2568  *
2569  * Note: caller is responsible for locking the vfs list, if needed,
2570  *       to protect mops..
2571  */
2572 void
2573 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2574 {
2575         char *s = osp, *p, *nextop, *valp, *cp, *ep;
2576         int setflg = VFS_NOFORCEOPT;
2577 
2578         if (osp == NULL)
2579                 return;
2580         while (*s != '\0') {
2581                 p = strchr(s, ',');     /* find next option */
2582                 if (p == NULL) {
2583                         cp = NULL;
2584                         p = s + strlen(s);
2585                 } else {
2586                         cp = p;         /* save location of comma */
2587                         *p++ = '\0';    /* mark end and point to next option */
2588                 }
2589                 nextop = p;
2590                 p = strchr(s, '=');     /* look for value */
2591                 if (p == NULL) {
2592                         valp = NULL;    /* no value supplied */
2593                 } else {
2594                         ep = p;         /* save location of equals */
2595                         *p++ = '\0';    /* end option and point to value */
2596                         valp = p;
2597                 }
2598                 /*
2599                  * set option into options table
2600                  */
2601                 if (create)
2602                         setflg |= VFS_CREATEOPT;
2603                 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2604                 if (cp != NULL)
2605                         *cp = ',';      /* restore the comma */
2606                 if (valp != NULL)
2607                         *ep = '=';      /* restore the equals */
2608                 s = nextop;
2609         }
2610 }
2611 
2612 /*
2613  * Function to inquire if an option exists in a mount options table.
2614  * Returns a pointer to the option if it exists, else NULL.
2615  *
2616  * This function is *not* for general use by filesystems.
2617  *
2618  * Note: caller is responsible for locking the vfs list, if needed,
2619  *       to protect mops.
2620  */
2621 struct mntopt *
2622 vfs_hasopt(const mntopts_t *mops, const char *opt)
2623 {
2624         struct mntopt *mop;
2625         uint_t i, count;
2626 
2627         count = mops->mo_count;
2628         for (i = 0; i < count; i++) {
2629                 mop = &mops->mo_list[i];
2630 
2631                 if (mop->mo_flags & MO_EMPTY)
2632                         continue;
2633                 if (strcmp(opt, mop->mo_name) == 0)
2634                         return (mop);
2635         }
2636         return (NULL);
2637 }
2638 
2639 /*
2640  * Function to inquire if an option is set in a mount options table.
2641  * Returns non-zero if set and fills in the arg pointer with a pointer to
2642  * the argument string or NULL if there is no argument string.
2643  */
2644 static int
2645 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2646 {
2647         struct mntopt *mop;
2648         uint_t i, count;
2649 
2650         count = mops->mo_count;
2651         for (i = 0; i < count; i++) {
2652                 mop = &mops->mo_list[i];
2653 
2654                 if (mop->mo_flags & MO_EMPTY)
2655                         continue;
2656                 if (strcmp(opt, mop->mo_name))
2657                         continue;
2658                 if ((mop->mo_flags & MO_SET) == 0)
2659                         return (0);
2660                 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2661                         *argp = mop->mo_arg;
2662                 return (1);
2663         }
2664         return (0);
2665 }
2666 
2667 
2668 int
2669 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2670 {
2671         int ret;
2672 
2673         vfs_list_read_lock();
2674         ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2675         vfs_list_unlock();
2676         return (ret);
2677 }
2678 
2679 
2680 /*
2681  * Construct a comma separated string of the options set in the given
2682  * mount table, return the string in the given buffer.  Return non-zero if
2683  * the buffer would overflow.
2684  *
2685  * This function is *not* for general use by filesystems.
2686  *
2687  * Note: caller is responsible for locking the vfs list, if needed,
2688  *       to protect mp.
2689  */
2690 int
2691 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2692 {
2693         char *cp;
2694         uint_t i;
2695 
2696         buf[0] = '\0';
2697         cp = buf;
2698         for (i = 0; i < mp->mo_count; i++) {
2699                 struct mntopt *mop;
2700 
2701                 mop = &mp->mo_list[i];
2702                 if (mop->mo_flags & MO_SET) {
2703                         int optlen, comma = 0;
2704 
2705                         if (buf[0] != '\0')
2706                                 comma = 1;
2707                         optlen = strlen(mop->mo_name);
2708                         if (strlen(buf) + comma + optlen + 1 > len)
2709                                 goto err;
2710                         if (comma)
2711                                 *cp++ = ',';
2712                         (void) strcpy(cp, mop->mo_name);
2713                         cp += optlen;
2714                         /*
2715                          * Append option value if there is one
2716                          */
2717                         if (mop->mo_arg != NULL) {
2718                                 int arglen;
2719 
2720                                 arglen = strlen(mop->mo_arg);
2721                                 if (strlen(buf) + arglen + 2 > len)
2722                                         goto err;
2723                                 *cp++ = '=';
2724                                 (void) strcpy(cp, mop->mo_arg);
2725                                 cp += arglen;
2726                         }
2727                 }
2728         }
2729         return (0);
2730 err:
2731         return (EOVERFLOW);
2732 }
2733 
/*
 * Free a NULL-terminated array of cancel-option strings: each string is
 * released with its own length plus the terminator, then the pointer
 * array itself (entries plus the trailing NULL slot).  A NULL array is
 * ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
        char **slot;
        int nstr = 0;

        if (moc == NULL)
                return;

        slot = moc;
        while (*slot != NULL) {
                kmem_free(*slot, strlen(*slot) + 1);
                nstr++;
                slot++;
        }
        kmem_free(moc, (nstr + 1) * sizeof (char *));
}
2748 
2749 static void
2750 vfs_freeopt(mntopt_t *mop)
2751 {
2752         if (mop->mo_name != NULL)
2753                 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2754 
2755         vfs_freecancelopt(mop->mo_cancel);
2756 
2757         if (mop->mo_arg != NULL)
2758                 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2759 }
2760 
2761 /*
2762  * Free a mount options table
2763  *
2764  * This function is *not* for general use by filesystems.
2765  *
2766  * Note: caller is responsible for locking the vfs list, if needed,
2767  *       to protect mp.
2768  */
2769 void
2770 vfs_freeopttbl(mntopts_t *mp)
2771 {
2772         uint_t i, count;
2773 
2774         count = mp->mo_count;
2775         for (i = 0; i < count; i++) {
2776                 vfs_freeopt(&mp->mo_list[i]);
2777         }
2778         if (count) {
2779                 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2780                 mp->mo_count = 0;
2781                 mp->mo_list = NULL;
2782         }
2783 }
2784 
2785 
/*
 * Read entry point installed on the dummy mnttab vnode.  It ignores all
 * of its arguments and simply reports success.
 */
/* ARGSUSED */
static int
vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
        caller_context_t *ct)
{
        return (0);
}
2793 
/*
 * Write entry point installed on the dummy mnttab vnode.  It ignores all
 * of its arguments and simply reports success.
 */
/* ARGSUSED */
static int
vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
        caller_context_t *ct)
{
        return (0);
}
2801 
2802 /*
2803  * The dummy vnode is currently used only by file events notification
2804  * module which is just interested in the timestamps.
2805  */
2806 /* ARGSUSED */
2807 static int
2808 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2809     caller_context_t *ct)
2810 {
2811         bzero(vap, sizeof (vattr_t));
2812         vap->va_type = VREG;
2813         vap->va_nlink = 1;
2814         vap->va_ctime = vfs_mnttab_ctime;
2815         /*
2816          * it is ok to just copy mtime as the time will be monotonically
2817          * increasing.
2818          */
2819         vap->va_mtime = vfs_mnttab_mtime;
2820         vap->va_atime = vap->va_mtime;
2821         return (0);
2822 }
2823 
/*
 * Create the global dummy vnode (vfs_mntdummyvp) that stands in for the
 * mnttab file.  If the vnode operations cannot be built, a warning is
 * logged and vfs_mntdummyvp is left unset.
 */
static void
vfs_mnttabvp_setup(void)
{
        vnode_t *tvp;
        vnodeops_t *vfs_mntdummyvnops;
        /* Operations provided by the dummy vnode; NULL pair ends the list. */
        const fs_operation_def_t mnt_dummyvnodeops_template[] = {
                VOPNAME_READ,           { .vop_read = vfs_mntdummyread },
                VOPNAME_WRITE,          { .vop_write = vfs_mntdummywrite },
                VOPNAME_GETATTR,        { .vop_getattr = vfs_mntdummygetattr },
                VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
                NULL,                   NULL
        };

        if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
            &vfs_mntdummyvnops) != 0) {
                cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
                /* Shouldn't happen, but not bad enough to panic */
                return;
        }

        /*
         * A global dummy vnode is allocated to represent mntfs files.
         * The mntfs file (/etc/mnttab) can be monitored for file events
         * and receive an event when mnttab changes. Dummy VOP calls
         * will be made on this vnode. The file events notification module
         * intercepts this vnode and delivers relevant events.
         */
        tvp = vn_alloc(KM_SLEEP);
        tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
        vn_setops(tvp, vfs_mntdummyvnops);
        tvp->v_type = VREG;
        /*
         * The mnt dummy ops do not reference v_data.
         * No other module intercepting this vnode should either.
         * Just set it to point to itself.
         */
        tvp->v_data = (caddr_t)tvp;
        tvp->v_vfsp = rootvfs;
        /* Publish the vnode only once it is fully set up. */
        vfs_mntdummyvp = tvp;
}
2864 
2865 /*
2866  * performs fake read/write ops
2867  */
2868 static void
2869 vfs_mnttab_rwop(int rw)
2870 {
2871         struct uio      uio;
2872         struct iovec    iov;
2873         char    buf[1];
2874 
2875         if (vfs_mntdummyvp == NULL)
2876                 return;
2877 
2878         bzero(&uio, sizeof (uio));
2879         bzero(&iov, sizeof (iov));
2880         iov.iov_base = buf;
2881         iov.iov_len = 0;
2882         uio.uio_iov = &iov;
2883         uio.uio_iovcnt = 1;
2884         uio.uio_loffset = 0;
2885         uio.uio_segflg = UIO_SYSSPACE;
2886         uio.uio_resid = 0;
2887         if (rw) {
2888                 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2889         } else {
2890                 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2891         }
2892 }
2893 
/*
 * Generate a write operation on the dummy mnttab vnode by calling
 * vfs_mnttab_rwop() with a non-zero rw argument.
 */
void
vfs_mnttab_writeop(void)
{
        vfs_mnttab_rwop(1);
}
2902 
/*
 * Generate a read operation on the dummy mnttab vnode by calling
 * vfs_mnttab_rwop() with a zero rw argument.
 */
void
vfs_mnttab_readop(void)
{
        vfs_mnttab_rwop(0);
}
2911 
/*
 * Free any mnttab information recorded in the vfs struct: the mount
 * point and resource reference strings are released and their pointers
 * cleared, then the mount options table is freed.
 * The vfs must not be on the vfs list.
 */
static void
vfs_freemnttab(struct vfs *vfsp)
{
        ASSERT(!VFS_ON_LIST(vfsp));

        /*
         * Free device and mount point information
         */
        if (vfsp->vfs_mntpt != NULL) {
                refstr_rele(vfsp->vfs_mntpt);
                vfsp->vfs_mntpt = NULL;
        }
        if (vfsp->vfs_resource != NULL) {
                refstr_rele(vfsp->vfs_resource);
                vfsp->vfs_resource = NULL;
        }
        /*
         * Now free mount options information
         */
        vfs_freeopttbl(&vfsp->vfs_mntopts);
}
2937 
/*
 * Return the last mnttab modification time by copying vfs_mnttab_mtime
 * into *ts.  The caller must hold the vfs list lock (asserted).
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
        ASSERT(RW_LOCK_HELD(&vfslist));
        *ts = vfs_mnttab_mtime;
}
2947 
2948 /*
2949  * See if mnttab is changed
2950  */
2951 void
2952 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2953 {
2954         int changed;
2955 
2956         *phpp = (struct pollhead *)NULL;
2957 
2958         /*
2959          * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2960          * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2961          * to not grab the vfs list lock because tv_sec is monotonically
2962          * increasing.
2963          */
2964 
2965         changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2966             (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2967         if (!changed) {
2968                 *phpp = &vfs_pollhd;
2969         }
2970 }
2971 
/*
 * Provide a unique and monotonically-increasing timestamp.
 *
 * The last value handed out is kept in a static variable that is only
 * updated with a compare-and-swap, so no lock is taken here.  The result
 * is stored in *ts.
 */
void
vfs_mono_time(timespec_t *ts)
{
        static volatile hrtime_t hrt;           /* The saved time. */
        hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
        timespec_t      newts;

        /*
         * Try gethrestime() first, but be prepared to fabricate a sensible
         * answer at the first sign of any trouble.
         */
        gethrestime(&newts);
        newhrt = ts2hrt(&newts);
        for (;;) {
                oldhrt = hrt;
                /* Never hand out a value at or below the saved one. */
                if (newhrt <= hrt)
                        newhrt = hrt + 1;
                /* Retry if the saved time moved since it was sampled. */
                if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
                        break;
        }
        hrt2ts(newhrt, ts);
}
2995 
2996 /*
2997  * Update the mnttab modification time and wake up any waiters for
2998  * mnttab changes
2999  */
3000 void
3001 vfs_mnttab_modtimeupd()
3002 {
3003         hrtime_t oldhrt, newhrt;
3004 
3005         ASSERT(RW_WRITE_HELD(&vfslist));
3006         oldhrt = ts2hrt(&vfs_mnttab_mtime);
3007         gethrestime(&vfs_mnttab_mtime);
3008         newhrt = ts2hrt(&vfs_mnttab_mtime);
3009         if (oldhrt == (hrtime_t)0)
3010                 vfs_mnttab_ctime = vfs_mnttab_mtime;
3011         /*
3012          * Attempt to provide unique mtime (like uniqtime but not).
3013          */
3014         if (newhrt == oldhrt) {
3015                 newhrt++;
3016                 hrt2ts(newhrt, &vfs_mnttab_mtime);
3017         }
3018         pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
3019         vfs_mnttab_writeop();
3020 }
3021 
/*
 * Unmount the given vfs.  Returns 0 on success or the error from
 * VFS_UNMOUNT().  If the vfs covers a vnode, that vnode's vfs write lock
 * is expected to be held on entry (asserted) and is dropped before
 * returning, on both the success and the failure path.
 */
int
dounmount(struct vfs *vfsp, int flag, cred_t *cr)
{
        vnode_t *coveredvp;
        int error;
        extern void teardown_vopstats(vfs_t *);

        /*
         * Get covered vnode. This will be NULL if the vfs is not linked
         * into the file system name space (i.e., domount() with MNT_NOSPICE).
         */
        coveredvp = vfsp->vfs_vnodecovered;
        ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));

        /*
         * Purge all dnlc entries for this vfs.
         */
        (void) dnlc_purge_vfsp(vfsp, 0);

        /* For forcible umount, skip VFS_SYNC() since it may hang */
        if ((flag & MS_FORCE) == 0)
                (void) VFS_SYNC(vfsp, 0, cr);

        /*
         * Lock the vfs to maintain fs status quo during unmount.  This
         * has to be done after the sync because ufs_update tries to acquire
         * the vfs_reflock.
         */
        vfs_lock_wait(vfsp);

        if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
                /* Unmount refused: undo the locking and report the error. */
                vfs_unlock(vfsp);
                if (coveredvp != NULL)
                        vn_vfsunlock(coveredvp);
        } else if (coveredvp != NULL) {
                teardown_vopstats(vfsp);
                /*
                 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
                 * when it frees vfsp so we do a VN_HOLD() so we can
                 * continue to use coveredvp afterwards.
                 */
                VN_HOLD(coveredvp);
                vfs_remove(vfsp);
                vn_vfsunlock(coveredvp);
                VN_RELE(coveredvp);
        } else {
                teardown_vopstats(vfsp);
                /*
                 * Release the reference to vfs that is not linked
                 * into the name space.
                 */
                vfs_unlock(vfsp);
                VFS_RELE(vfsp);
        }
        return (error);
}
3078 
3079 
/*
 * Vfs_unmountall() is called by uadmin() to unmount all
 * mounted file systems (except the root file system) during shutdown.
 * It follows the existing locking protocol when traversing the vfs list
 * to sync and unmount vfses. Even though there should be no
 * other thread running while the system is shutting down, it is prudent
 * to still follow the locking protocol.
 *
 * The list is walked backwards from the entry just before the root.
 * Entries whose vfs lock or covered-vnode lock cannot be obtained are
 * skipped.
 */
void
vfs_unmountall(void)
{
        struct vfs *vfsp;
        struct vfs *prev_vfsp = NULL;
        int error;

        /*
         * Toss all dnlc entries now so that the per-vfs sync
         * and unmount operations don't have to slog through
         * a bunch of uninteresting vnodes over and over again.
         */
        dnlc_purge();

        vfs_list_lock();
        for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
                /* Remember where to go next before vfsp can disappear. */
                prev_vfsp = vfsp->vfs_prev;

                if (vfs_lock(vfsp) != 0)
                        continue;
                error = vn_vfswlock(vfsp->vfs_vnodecovered);
                vfs_unlock(vfsp);
                if (error)
                        continue;

                /* The list lock is not held across the sync and unmount. */
                vfs_list_unlock();

                (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
                (void) dounmount(vfsp, 0, CRED());

                /*
                 * Since we dropped the vfslist lock above we must
                 * verify that prev_vfsp still exists, else start over.
                 */
                vfs_list_lock();
                for (vfsp = rootvfs->vfs_prev;
                    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
                        if (vfsp == prev_vfsp)
                                break;
                /* Not found on the list: restart from the tail. */
                if (vfsp == rootvfs && prev_vfsp != rootvfs)
                        prev_vfsp = rootvfs->vfs_prev;
        }
        vfs_list_unlock();
}
3132 
3133 /*
3134  * Called to add an entry to the end of the vfs mount in progress list
3135  */
3136 void
3137 vfs_addmip(dev_t dev, struct vfs *vfsp)
3138 {
3139         struct ipmnt *mipp;
3140 
3141         mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3142         mipp->mip_next = NULL;
3143         mipp->mip_dev = dev;
3144         mipp->mip_vfsp = vfsp;
3145         mutex_enter(&vfs_miplist_mutex);
3146         if (vfs_miplist_end != NULL)
3147                 vfs_miplist_end->mip_next = mipp;
3148         else
3149                 vfs_miplist = mipp;
3150         vfs_miplist_end = mipp;
3151         mutex_exit(&vfs_miplist_mutex);
3152 }
3153 
3154 /*
3155  * Called to remove an entry from the mount in progress list
3156  * Either because the mount completed or it failed.
3157  */
3158 void
3159 vfs_delmip(struct vfs *vfsp)
3160 {
3161         struct ipmnt *mipp, *mipprev;
3162 
3163         mutex_enter(&vfs_miplist_mutex);
3164         mipprev = NULL;
3165         for (mipp = vfs_miplist;
3166             mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3167                 mipprev = mipp;
3168         }
3169         if (mipp == NULL)
3170                 return; /* shouldn't happen */
3171         if (mipp == vfs_miplist_end)
3172                 vfs_miplist_end = mipprev;
3173         if (mipprev == NULL)
3174                 vfs_miplist = mipp->mip_next;
3175         else
3176                 mipprev->mip_next = mipp->mip_next;
3177         mutex_exit(&vfs_miplist_mutex);
3178         kmem_free(mipp, sizeof (struct ipmnt));
3179 }
3180 
3181 /*
3182  * vfs_add is called by a specific filesystem's mount routine to add
3183  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3184  * The vfs should already have been locked by the caller.
3185  *
3186  * coveredvp is NULL if this is the root.
3187  */
3188 void
3189 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3190 {
3191         int newflag;
3192 
3193         ASSERT(vfs_lock_held(vfsp));
3194         VFS_HOLD(vfsp);
3195         newflag = vfsp->vfs_flag;
3196         if (mflag & MS_RDONLY)
3197                 newflag |= VFS_RDONLY;
3198         else
3199                 newflag &= ~VFS_RDONLY;
3200         if (mflag & MS_NOSUID)
3201                 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3202         else
3203                 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3204         if (mflag & MS_NOMNTTAB)
3205                 newflag |= VFS_NOMNTTAB;
3206         else
3207                 newflag &= ~VFS_NOMNTTAB;
3208 
3209         if (coveredvp != NULL) {
3210                 ASSERT(vn_vfswlock_held(coveredvp));
3211                 coveredvp->v_vfsmountedhere = vfsp;
3212                 VN_HOLD(coveredvp);
3213         }
3214         vfsp->vfs_vnodecovered = coveredvp;
3215         vfsp->vfs_flag = newflag;
3216 
3217         vfs_list_add(vfsp);
3218 }
3219 
/*
 * Remove a vfs from the vfs list, null out the pointer from the
 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
 * from the vfs to the covered vnode (vfs_vnodecovered). Release the
 * reference to the vfs and to the covered vnode.
 *
 * The caller must hold the vfs lock and the covered vnode's vfs write
 * lock (both asserted); the vfs lock is dropped here.
 *
 * Called from dounmount after it's confirmed with the file system
 * that the unmount is legal.
 */
void
vfs_remove(struct vfs *vfsp)
{
        vnode_t *vp;

        ASSERT(vfs_lock_held(vfsp));

        /*
         * Can't unmount root.  Should never happen because fs will
         * be busy.
         */
        if (vfsp == rootvfs)
                panic("vfs_remove: unmounting root");

        vfs_list_remove(vfsp);

        /*
         * Unhook from the file system name space.
         */
        vp = vfsp->vfs_vnodecovered;
        ASSERT(vn_vfswlock_held(vp));
        vp->v_vfsmountedhere = NULL;
        vfsp->vfs_vnodecovered = NULL;
        VN_RELE(vp);

        /*
         * Release lock and wakeup anybody waiting.
         */
        vfs_unlock(vfsp);
        VFS_RELE(vfsp);
}
3260 
3261 /*
3262  * Lock a filesystem to prevent access to it while mounting,
3263  * unmounting and syncing.  Return EBUSY immediately if lock
3264  * can't be acquired.
3265  */
3266 int
3267 vfs_lock(vfs_t *vfsp)
3268 {
3269         vn_vfslocks_entry_t *vpvfsentry;
3270 
3271         vpvfsentry = vn_vfslocks_getlock(vfsp);
3272         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3273                 return (0);
3274 
3275         vn_vfslocks_rele(vpvfsentry);
3276         return (EBUSY);
3277 }
3278 
3279 int
3280 vfs_rlock(vfs_t *vfsp)
3281 {
3282         vn_vfslocks_entry_t *vpvfsentry;
3283 
3284         vpvfsentry = vn_vfslocks_getlock(vfsp);
3285 
3286         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3287                 return (0);
3288 
3289         vn_vfslocks_rele(vpvfsentry);
3290         return (EBUSY);
3291 }
3292 
/*
 * Lock a filesystem as writer, waiting for the lock if necessary.
 * The lock entry reference obtained here is not released in this
 * function.
 */
void
vfs_lock_wait(vfs_t *vfsp)
{
        vn_vfslocks_entry_t *vpvfsentry;

        vpvfsentry = vn_vfslocks_getlock(vfsp);
        rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
}
3301 
/*
 * Lock a filesystem as reader, waiting for the lock if necessary.
 * The lock entry reference obtained here is not released in this
 * function.
 */
void
vfs_rlock_wait(vfs_t *vfsp)
{
        vn_vfslocks_entry_t *vpvfsentry;

        vpvfsentry = vn_vfslocks_getlock(vfsp);
        rwst_enter(&vpvfsentry->ve_lock, RW_READER);
}
3310 
/*
 * Unlock a locked filesystem.  Does nothing once the system is
 * panicking (panicstr set).
 */
void
vfs_unlock(vfs_t *vfsp)
{
        vn_vfslocks_entry_t *vpvfsentry;

        /*
         * vfs_unlock will mimic sema_v behaviour to fix 4748018.
         * And these changes should remain for the patch changes as it is.
         */
        if (panicstr)
                return;

        /*
         * ve_refcount needs to be dropped twice here.
         * 1. To release the reference taken by the call to
         *    vn_vfslocks_getlock() below.
         * 2. To release the reference left by the locking routines such
         *    as vfs_lock(), vfs_rlock(), vfs_lock_wait() and
         *    vfs_rlock_wait().
         */

        vpvfsentry = vn_vfslocks_getlock(vfsp);
        vn_vfslocks_rele(vpvfsentry);

        rwst_exit(&vpvfsentry->ve_lock);
        vn_vfslocks_rele(vpvfsentry);
}
3339 
3340 /*
3341  * Utility routine that allows a filesystem to construct its
3342  * fsid in "the usual way" - by munging some underlying dev_t and
3343  * the filesystem type number into the 64-bit fsid.  Note that
3344  * this implicitly relies on dev_t persistence to make filesystem
3345  * id's persistent.
3346  *
3347  * There's nothing to prevent an individual fs from constructing its
3348  * fsid in a different way, and indeed they should.
3349  *
3350  * Since we want fsids to be 32-bit quantities (so that they can be
3351  * exported identically by either 32-bit or 64-bit APIs, as well as
3352  * the fact that fsid's are "known" to NFS), we compress the device
3353  * number given down to 32-bits, and panic if that isn't possible.
3354  */
3355 void
3356 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3357 {
3358         if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3359                 panic("device number too big for fsid!");
3360         fsi->val[1] = val;
3361 }
3362 
3363 int
3364 vfs_lock_held(vfs_t *vfsp)
3365 {
3366         int held;
3367         vn_vfslocks_entry_t *vpvfsentry;
3368 
3369         /*
3370          * vfs_lock_held will mimic sema_held behaviour
3371          * if panicstr is set. And these changes should remain
3372          * for the patch changes as it is.
3373          */
3374         if (panicstr)
3375                 return (1);
3376 
3377         vpvfsentry = vn_vfslocks_getlock(vfsp);
3378         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3379 
3380         vn_vfslocks_rele(vpvfsentry);
3381         return (held);
3382 }
3383 
3384 struct _kthread *
3385 vfs_lock_owner(vfs_t *vfsp)
3386 {
3387         struct _kthread *owner;
3388         vn_vfslocks_entry_t *vpvfsentry;
3389 
3390         /*
3391          * vfs_wlock_held will mimic sema_held behaviour
3392          * if panicstr is set. And these changes should remain
3393          * for the patch changes as it is.
3394          */
3395         if (panicstr)
3396                 return (NULL);
3397 
3398         vpvfsentry = vn_vfslocks_getlock(vfsp);
3399         owner = rwst_owner(&vpvfsentry->ve_lock);
3400 
3401         vn_vfslocks_rele(vpvfsentry);
3402         return (owner);
3403 }
3404 
3405 /*
3406  * vfs list locking.
3407  *
3408  * Rather than manipulate the vfslist lock directly, we abstract into lock
3409  * and unlock routines to allow the locking implementation to be changed for
3410  * clustering.
3411  *
3412  * Whenever the vfs list is modified through its hash links, the overall list
3413  * lock must be obtained before locking the relevant hash bucket.  But to see
3414  * whether a given vfs is on the list, it suffices to obtain the lock for the
3415  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3416  */
3417 
3418 void
3419 vfs_list_lock()
3420 {
3421         rw_enter(&vfslist, RW_WRITER);
3422 }
3423 
3424 void
3425 vfs_list_read_lock()
3426 {
3427         rw_enter(&vfslist, RW_READER);
3428 }
3429 
3430 void
3431 vfs_list_unlock()
3432 {
3433         rw_exit(&vfslist);
3434 }
3435 
3436 /*
3437  * Low level worker routines for adding entries to and removing entries from
3438  * the vfs list.
3439  */
3440 
3441 static void
3442 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3443 {
3444         int vhno;
3445         struct vfs **hp;
3446         dev_t dev;
3447 
3448         ASSERT(RW_WRITE_HELD(&vfslist));
3449 
3450         dev = expldev(vfsp->vfs_fsid.val[0]);
3451         vhno = VFSHASH(getmajor(dev), getminor(dev));
3452 
3453         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3454 
3455         /*
3456          * Link into the hash table, inserting it at the end, so that LOFS
3457          * with the same fsid as UFS (or other) file systems will not hide the
3458          * UFS.
3459          */
3460         if (insert_at_head) {
3461                 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3462                 rvfs_list[vhno].rvfs_head = vfsp;
3463         } else {
3464                 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3465                     hp = &(*hp)->vfs_hash)
3466                         continue;
3467                 /*
3468                  * hp now contains the address of the pointer to update
3469                  * to effect the insertion.
3470                  */
3471                 vfsp->vfs_hash = NULL;
3472                 *hp = vfsp;
3473         }
3474 
3475         rvfs_list[vhno].rvfs_len++;
3476         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3477 }
3478 
3479 
3480 static void
3481 vfs_hash_remove(struct vfs *vfsp)
3482 {
3483         int vhno;
3484         struct vfs *tvfsp;
3485         dev_t dev;
3486 
3487         ASSERT(RW_WRITE_HELD(&vfslist));
3488 
3489         dev = expldev(vfsp->vfs_fsid.val[0]);
3490         vhno = VFSHASH(getmajor(dev), getminor(dev));
3491 
3492         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3493 
3494         /*
3495          * Remove from hash.
3496          */
3497         if (rvfs_list[vhno].rvfs_head == vfsp) {
3498                 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3499                 rvfs_list[vhno].rvfs_len--;
3500                 goto foundit;
3501         }
3502         for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3503             tvfsp = tvfsp->vfs_hash) {
3504                 if (tvfsp->vfs_hash == vfsp) {
3505                         tvfsp->vfs_hash = vfsp->vfs_hash;
3506                         rvfs_list[vhno].rvfs_len--;
3507                         goto foundit;
3508                 }
3509         }
3510         cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3511 
3512 foundit:
3513 
3514         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3515 }
3516 
3517 
3518 void
3519 vfs_list_add(struct vfs *vfsp)
3520 {
3521         zone_t *zone;
3522 
3523         /*
3524          * Typically, the vfs_t will have been created on behalf of the file
3525          * system in vfs_init, where it will have been provided with a
3526          * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3527          * by an unbundled file system. We therefore check for such an example
3528          * before stamping the vfs_t with its creation time for the benefit of
3529          * mntfs.
3530          */
3531         if (vfsp->vfs_implp == NULL)
3532                 vfsimpl_setup(vfsp);
3533         vfs_mono_time(&vfsp->vfs_hrctime);
3534 
3535         /*
3536          * The zone that owns the mount is the one that performed the mount.
3537          * Note that this isn't necessarily the same as the zone mounted into.
3538          * The corresponding zone_rele_ref() will be done when the vfs_t
3539          * is being free'd.
3540          */
3541         vfsp->vfs_zone = curproc->p_zone;
3542         zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3543         zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3544             ZONE_REF_VFS);
3545 
3546         /*
3547          * Find the zone mounted into, and put this mount on its vfs list.
3548          */
3549         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3550         ASSERT(zone != NULL);
3551         /*
3552          * Special casing for the root vfs.  This structure is allocated
3553          * statically and hooked onto rootvfs at link time.  During the
3554          * vfs_mountroot call at system startup time, the root file system's
3555          * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3556          * as argument.  The code below must detect and handle this special
3557          * case.  The only apparent justification for this special casing is
3558          * to ensure that the root file system appears at the head of the
3559          * list.
3560          *
3561          * XXX: I'm assuming that it's ok to do normal list locking when
3562          *      adding the entry for the root file system (this used to be
3563          *      done with no locks held).
3564          */
3565         vfs_list_lock();
3566         /*
3567          * Link into the vfs list proper.
3568          */
3569         if (vfsp == &root) {
3570                 /*
3571                  * Assert: This vfs is already on the list as its first entry.
3572                  * Thus, there's nothing to do.
3573                  */
3574                 ASSERT(rootvfs == vfsp);
3575                 /*
3576                  * Add it to the head of the global zone's vfslist.
3577                  */
3578                 ASSERT(zone == global_zone);
3579                 ASSERT(zone->zone_vfslist == NULL);
3580                 zone->zone_vfslist = vfsp;
3581         } else {
3582                 /*
3583                  * Link to end of list using vfs_prev (as rootvfs is now a
3584                  * doubly linked circular list) so list is in mount order for
3585                  * mnttab use.
3586                  */
3587                 rootvfs->vfs_prev->vfs_next = vfsp;
3588                 vfsp->vfs_prev = rootvfs->vfs_prev;
3589                 rootvfs->vfs_prev = vfsp;
3590                 vfsp->vfs_next = rootvfs;
3591 
3592                 /*
3593                  * Do it again for the zone-private list (which may be NULL).
3594                  */
3595                 if (zone->zone_vfslist == NULL) {
3596                         ASSERT(zone != global_zone);
3597                         zone->zone_vfslist = vfsp;
3598                 } else {
3599                         zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3600                         vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3601                         zone->zone_vfslist->vfs_zone_prev = vfsp;
3602                         vfsp->vfs_zone_next = zone->zone_vfslist;
3603                 }
3604         }
3605 
3606         /*
3607          * Link into the hash table, inserting it at the end, so that LOFS
3608          * with the same fsid as UFS (or other) file systems will not hide
3609          * the UFS.
3610          */
3611         vfs_hash_add(vfsp, 0);
3612 
3613         /*
3614          * update the mnttab modification time
3615          */
3616         vfs_mnttab_modtimeupd();
3617         vfs_list_unlock();
3618         zone_rele(zone);
3619 }
3620 
3621 void
3622 vfs_list_remove(struct vfs *vfsp)
3623 {
3624         zone_t *zone;
3625 
3626         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3627         ASSERT(zone != NULL);
3628         /*
3629          * Callers are responsible for preventing attempts to unmount the
3630          * root.
3631          */
3632         ASSERT(vfsp != rootvfs);
3633 
3634         vfs_list_lock();
3635 
3636         /*
3637          * Remove from hash.
3638          */
3639         vfs_hash_remove(vfsp);
3640 
3641         /*
3642          * Remove from vfs list.
3643          */
3644         vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3645         vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3646         vfsp->vfs_next = vfsp->vfs_prev = NULL;
3647 
3648         /*
3649          * Remove from zone-specific vfs list.
3650          */
3651         if (zone->zone_vfslist == vfsp)
3652                 zone->zone_vfslist = vfsp->vfs_zone_next;
3653 
3654         if (vfsp->vfs_zone_next == vfsp) {
3655                 ASSERT(vfsp->vfs_zone_prev == vfsp);
3656                 ASSERT(zone->zone_vfslist == vfsp);
3657                 zone->zone_vfslist = NULL;
3658         }
3659 
3660         vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3661         vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3662         vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3663 
3664         /*
3665          * update the mnttab modification time
3666          */
3667         vfs_mnttab_modtimeupd();
3668         vfs_list_unlock();
3669         zone_rele(zone);
3670 }
3671 
3672 struct vfs *
3673 getvfs(fsid_t *fsid)
3674 {
3675         struct vfs *vfsp;
3676         int val0 = fsid->val[0];
3677         int val1 = fsid->val[1];
3678         dev_t dev = expldev(val0);
3679         int vhno = VFSHASH(getmajor(dev), getminor(dev));
3680         kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3681 
3682         mutex_enter(hmp);
3683         for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3684                 if (vfsp->vfs_fsid.val[0] == val0 &&
3685                     vfsp->vfs_fsid.val[1] == val1) {
3686                         VFS_HOLD(vfsp);
3687                         mutex_exit(hmp);
3688                         return (vfsp);
3689                 }
3690         }
3691         mutex_exit(hmp);
3692         return (NULL);
3693 }
3694 
3695 /*
3696  * Search the vfs mount in progress list for a specified device/vfs entry.
3697  * Returns 0 if the first entry in the list that the device matches has the
3698  * given vfs pointer as well.  If the device matches but a different vfs
3699  * pointer is encountered in the list before the given vfs pointer then
3700  * a 1 is returned.
3701  */
3702 
3703 int
3704 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3705 {
3706         int retval = 0;
3707         struct ipmnt *mipp;
3708 
3709         mutex_enter(&vfs_miplist_mutex);
3710         for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3711                 if (mipp->mip_dev == dev) {
3712                         if (mipp->mip_vfsp != vfsp)
3713                                 retval = 1;
3714                         break;
3715                 }
3716         }
3717         mutex_exit(&vfs_miplist_mutex);
3718         return (retval);
3719 }
3720 
3721 /*
3722  * Search the vfs list for a specified device.  Returns 1, if entry is found
3723  * or 0 if no suitable entry is found.
3724  */
3725 
3726 int
3727 vfs_devismounted(dev_t dev)
3728 {
3729         struct vfs *vfsp;
3730         int found;
3731 
3732         vfs_list_read_lock();
3733         vfsp = rootvfs;
3734         found = 0;
3735         do {
3736                 if (vfsp->vfs_dev == dev) {
3737                         found = 1;
3738                         break;
3739                 }
3740                 vfsp = vfsp->vfs_next;
3741         } while (vfsp != rootvfs);
3742 
3743         vfs_list_unlock();
3744         return (found);
3745 }
3746 
3747 /*
3748  * Search the vfs list for a specified device.  Returns a pointer to it
3749  * or NULL if no suitable entry is found. The caller of this routine
3750  * is responsible for releasing the returned vfs pointer.
3751  */
3752 struct vfs *
3753 vfs_dev2vfsp(dev_t dev)
3754 {
3755         struct vfs *vfsp;
3756         int found;
3757 
3758         vfs_list_read_lock();
3759         vfsp = rootvfs;
3760         found = 0;
3761         do {
3762                 /*
3763                  * The following could be made more efficient by making
3764                  * the entire loop use vfs_zone_next if the call is from
3765                  * a zone.  The only callers, however, ustat(2) and
3766                  * umount2(2), don't seem to justify the added
3767                  * complexity at present.
3768                  */
3769                 if (vfsp->vfs_dev == dev &&
3770                     ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3771                     curproc->p_zone)) {
3772                         VFS_HOLD(vfsp);
3773                         found = 1;
3774                         break;
3775                 }
3776                 vfsp = vfsp->vfs_next;
3777         } while (vfsp != rootvfs);
3778         vfs_list_unlock();
3779         return (found ? vfsp: NULL);
3780 }
3781 
3782 /*
3783  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3784  * or NULL if no suitable entry is found. The caller of this routine
3785  * is responsible for releasing the returned vfs pointer.
3786  *
3787  * Note that if multiple mntpoints match, the last one matching is
3788  * returned in an attempt to return the "top" mount when overlay
3789  * mounts are covering the same mount point.  This is accomplished by starting
3790  * at the end of the list and working our way backwards, stopping at the first
3791  * matching mount.
3792  */
3793 struct vfs *
3794 vfs_mntpoint2vfsp(const char *mp)
3795 {
3796         struct vfs *vfsp;
3797         struct vfs *retvfsp = NULL;
3798         zone_t *zone = curproc->p_zone;
3799         struct vfs *list;
3800 
3801         vfs_list_read_lock();
3802         if (getzoneid() == GLOBAL_ZONEID) {
3803                 /*
3804                  * The global zone may see filesystems in any zone.
3805                  */
3806                 vfsp = rootvfs->vfs_prev;
3807                 do {
3808                         if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3809                                 retvfsp = vfsp;
3810                                 break;
3811                         }
3812                         vfsp = vfsp->vfs_prev;
3813                 } while (vfsp != rootvfs->vfs_prev);
3814         } else if ((list = zone->zone_vfslist) != NULL) {
3815                 const char *mntpt;
3816 
3817                 vfsp = list->vfs_zone_prev;
3818                 do {
3819                         mntpt = refstr_value(vfsp->vfs_mntpt);
3820                         mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3821                         if (strcmp(mntpt, mp) == 0) {
3822                                 retvfsp = vfsp;
3823                                 break;
3824                         }
3825                         vfsp = vfsp->vfs_zone_prev;
3826                 } while (vfsp != list->vfs_zone_prev);
3827         }
3828         if (retvfsp)
3829                 VFS_HOLD(retvfsp);
3830         vfs_list_unlock();
3831         return (retvfsp);
3832 }
3833 
3834 /*
3835  * Search the vfs list for a specified vfsops.
3836  * if vfs entry is found then return 1, else 0.
3837  */
3838 int
3839 vfs_opsinuse(vfsops_t *ops)
3840 {
3841         struct vfs *vfsp;
3842         int found;
3843 
3844         vfs_list_read_lock();
3845         vfsp = rootvfs;
3846         found = 0;
3847         do {
3848                 if (vfs_getops(vfsp) == ops) {
3849                         found = 1;
3850                         break;
3851                 }
3852                 vfsp = vfsp->vfs_next;
3853         } while (vfsp != rootvfs);
3854         vfs_list_unlock();
3855         return (found);
3856 }
3857 
3858 /*
3859  * Allocate an entry in vfssw for a file system type
3860  */
3861 struct vfssw *
3862 allocate_vfssw(const char *type)
3863 {
3864         struct vfssw *vswp;
3865 
3866         if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3867                 /*
3868                  * The vfssw table uses the empty string to identify an
3869                  * available entry; we cannot add any type which has
3870                  * a leading NUL. The string length is limited to
3871                  * the size of the st_fstype array in struct stat.
3872                  */
3873                 return (NULL);
3874         }
3875 
3876         ASSERT(VFSSW_WRITE_LOCKED());
3877         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3878                 if (!ALLOCATED_VFSSW(vswp)) {
3879                         vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3880                         (void) strcpy(vswp->vsw_name, type);
3881                         ASSERT(vswp->vsw_count == 0);
3882                         vswp->vsw_count = 1;
3883                         mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3884                         return (vswp);
3885                 }
3886         return (NULL);
3887 }
3888 
/*
 * Translate a vfstype name into the name of the module that implements
 * it: "proc" maps to "procfs", "fd" maps to "fdfs", and any name
 * beginning with "nfs" maps to "nfs".  Every other name is returned
 * unchanged (the same pointer that was passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
        if (strcmp(vfstype, "proc") == 0)
                return ("procfs");

        if (strcmp(vfstype, "fd") == 0)
                return ("fdfs");

        if (strncmp(vfstype, "nfs", 3) == 0)
                return ("nfs");

        return (vfstype);
}
3906 
3907 /*
3908  * Find a vfssw entry given a file system type name.
3909  * Try to autoload the filesystem if it's not found.
3910  * If it's installed, return the vfssw locked to prevent unloading.
3911  */
3912 struct vfssw *
3913 vfs_getvfssw(const char *type)
3914 {
3915         struct vfssw *vswp;
3916         const char *modname;
3917 
3918         RLOCK_VFSSW();
3919         vswp = vfs_getvfsswbyname(type);
3920         modname = vfs_to_modname(type);
3921 
3922         if (rootdir == NULL) {
3923                 /*
3924                  * If we haven't yet loaded the root file system, then our
3925                  * _init won't be called until later. Allocate vfssw entry,
3926                  * because mod_installfs won't be called.
3927                  */
3928                 if (vswp == NULL) {
3929                         RUNLOCK_VFSSW();
3930                         WLOCK_VFSSW();
3931                         if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3932                                 if ((vswp = allocate_vfssw(type)) == NULL) {
3933                                         WUNLOCK_VFSSW();
3934                                         return (NULL);
3935                                 }
3936                         }
3937                         WUNLOCK_VFSSW();
3938                         RLOCK_VFSSW();
3939                 }
3940                 if (!VFS_INSTALLED(vswp)) {
3941                         RUNLOCK_VFSSW();
3942                         (void) modloadonly("fs", modname);
3943                 } else
3944                         RUNLOCK_VFSSW();
3945                 return (vswp);
3946         }
3947 
3948         /*
3949          * Try to load the filesystem.  Before calling modload(), we drop
3950          * our lock on the VFS switch table, and pick it up after the
3951          * module is loaded.  However, there is a potential race:  the
3952          * module could be unloaded after the call to modload() completes
3953          * but before we pick up the lock and drive on.  Therefore,
3954          * we keep reloading the module until we've loaded the module
3955          * _and_ we have the lock on the VFS switch table.
3956          */
3957         while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3958                 RUNLOCK_VFSSW();
3959                 if (modload("fs", modname) == -1)
3960                         return (NULL);
3961                 RLOCK_VFSSW();
3962                 if (vswp == NULL)
3963                         if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3964                                 break;
3965         }
3966         RUNLOCK_VFSSW();
3967 
3968         return (vswp);
3969 }
3970 
3971 /*
3972  * Find a vfssw entry given a file system type name.
3973  */
3974 struct vfssw *
3975 vfs_getvfsswbyname(const char *type)
3976 {
3977         struct vfssw *vswp;
3978 
3979         ASSERT(VFSSW_LOCKED());
3980         if (type == NULL || *type == '\0')
3981                 return (NULL);
3982 
3983         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3984                 if (strcmp(type, vswp->vsw_name) == 0) {
3985                         vfs_refvfssw(vswp);
3986                         return (vswp);
3987                 }
3988         }
3989 
3990         return (NULL);
3991 }
3992 
3993 /*
3994  * Find a vfssw entry given a set of vfsops.
3995  */
3996 struct vfssw *
3997 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3998 {
3999         struct vfssw *vswp;
4000 
4001         RLOCK_VFSSW();
4002         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4003                 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
4004                         vfs_refvfssw(vswp);
4005                         RUNLOCK_VFSSW();
4006                         return (vswp);
4007                 }
4008         }
4009         RUNLOCK_VFSSW();
4010 
4011         return (NULL);
4012 }
4013 
4014 /*
4015  * Reference a vfssw entry.
4016  */
4017 void
4018 vfs_refvfssw(struct vfssw *vswp)
4019 {
4020 
4021         mutex_enter(&vswp->vsw_lock);
4022         vswp->vsw_count++;
4023         mutex_exit(&vswp->vsw_lock);
4024 }
4025 
4026 /*
4027  * Unreference a vfssw entry.
4028  */
4029 void
4030 vfs_unrefvfssw(struct vfssw *vswp)
4031 {
4032 
4033         mutex_enter(&vswp->vsw_lock);
4034         vswp->vsw_count--;
4035         mutex_exit(&vswp->vsw_lock);
4036 }
4037 
4038 int sync_timeout = 30;          /* timeout for syncing a page during panic */
4039 int sync_timeleft;              /* portion of sync_timeout remaining */
4040 
4041 static int sync_retries = 20;   /* number of retries when not making progress */
4042 static int sync_triesleft;      /* portion of sync_retries remaining */
4043 
4044 static pgcnt_t old_pgcnt, new_pgcnt;
4045 static int new_bufcnt, old_bufcnt;
4046 
4047 /*
4048  * Sync all of the mounted filesystems, and then wait for the actual i/o to
4049  * complete.  We wait by counting the number of dirty pages and buffers,
4050  * pushing them out using bio_busy() and page_busy(), and then counting again.
4051  * This routine is used during both the uadmin A_SHUTDOWN code as well as
4052  * the SYNC phase of the panic code (see comments in panic.c).  It should only
4053  * be used after some higher-level mechanism has quiesced the system so that
4054  * new writes are not being initiated while we are waiting for completion.
4055  *
4056  * To ensure finite running time, our algorithm uses two timeout mechanisms:
4057  * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
4058  * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
4059  * Together these ensure that syncing completes if our i/o paths are stuck.
4060  * The counters are declared above so they can be found easily in the debugger.
4061  *
4062  * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
4063  * vfs_syncprogress() subroutine whenever we make progress through the lists of
4064  * pages and buffers.  It is decremented and expired by the deadman() cyclic.
4065  * When vfs_syncall() decides it is done, we disable the deadman() counter by
4066  * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
4067  * deadlocking or hanging inside of a broken filesystem or driver routine.
4068  *
4069  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4070  * sync_retries consecutive calls to bio_busy() and page_busy() without
4071  * decreasing either the number of dirty buffers or dirty pages below the
4072  * lowest count we have seen so far, we give up and return from vfs_syncall().
4073  *
4074  * Each loop iteration ends with a call to delay() one second to allow time for
4075  * i/o completion and to permit the user time to read our progress messages.
4076  */
4077 void
4078 vfs_syncall(void)
4079 {
4080         if (rootdir == NULL && !modrootloaded)
4081                 return; /* panic during boot - no filesystems yet */
4082 
4083         printf("syncing file systems...");
4084         vfs_syncprogress();
4085         sync();
4086 
4087         vfs_syncprogress();
4088         sync_triesleft = sync_retries;
4089 
4090         old_bufcnt = new_bufcnt = INT_MAX;
4091         old_pgcnt = new_pgcnt = ULONG_MAX;
4092 
4093         while (sync_triesleft > 0) {
4094                 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4095                 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4096 
4097                 new_bufcnt = bio_busy(B_TRUE);
4098                 new_pgcnt = page_busy(B_TRUE);
4099                 vfs_syncprogress();
4100 
4101                 if (new_bufcnt == 0 && new_pgcnt == 0)
4102                         break;
4103 
4104                 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4105                         sync_triesleft = sync_retries;
4106                 else
4107                         sync_triesleft--;
4108 
4109                 if (new_bufcnt)
4110                         printf(" [%d]", new_bufcnt);
4111                 if (new_pgcnt)
4112                         printf(" %lu", new_pgcnt);
4113 
4114                 delay(hz);
4115         }
4116 
4117         if (new_bufcnt != 0 || new_pgcnt != 0)
4118                 printf(" done (not all i/o completed)\n");
4119         else
4120                 printf(" done\n");
4121 
4122         sync_timeleft = 0;
4123         delay(hz);
4124 }
4125 
4126 /*
4127  * If we are in the middle of the sync phase of panic, reset sync_timeleft to
4128  * sync_timeout to indicate that we are making progress and the deadman()
4129  * omnipresent cyclic should not yet time us out.  Note that it is safe to
4130  * store to sync_timeleft here since the deadman() is firing at high-level
4131  * on top of us.  If we are racing with the deadman(), either the deadman()
4132  * will decrement the old value and then we will reset it, or we will
4133  * reset it and then the deadman() will immediately decrement it.  In either
4134  * case, correct behavior results.
4135  */
4136 void
4137 vfs_syncprogress(void)
4138 {
4139         if (panicstr)
4140                 sync_timeleft = sync_timeout;
4141 }
4142 
4143 /*
4144  * Map VFS flags to statvfs flags.  These shouldn't really be separate
4145  * flags at all.
4146  */
4147 uint_t
4148 vf_to_stf(uint_t vf)
4149 {
4150         uint_t stf = 0;
4151 
4152         if (vf & VFS_RDONLY)
4153                 stf |= ST_RDONLY;
4154         if (vf & VFS_NOSETUID)
4155                 stf |= ST_NOSUID;
4156         if (vf & VFS_NOTRUNC)
4157                 stf |= ST_NOTRUNC;
4158 
4159         return (stf);
4160 }
4161 
4162 /*
4163  * Entries for (illegal) fstype 0.
4164  */
4165 /* ARGSUSED */
4166 int
4167 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4168 {
4169         cmn_err(CE_PANIC, "stray vfs operation");
4170         return (0);
4171 }
4172 
4173 /*
4174  * Entries for (illegal) fstype 0.
4175  */
4176 int
4177 vfsstray(void)
4178 {
4179         cmn_err(CE_PANIC, "stray vfs operation");
4180         return (0);
4181 }
4182 
/*
 * Operation that unconditionally fails with EIO.  It supports forced UFS
 * unmount and its interaction with LOFS, but could be used by any
 * filesystem.  See bug 1203132.
 */
int
vfs_EIO(void)
{
        const int err = EIO;

        return (err);
}
4193 
/*
 * Sync flavour of the always-EIO operation.  It is defined separately
 * because the compiler gets confused, and emits a warning, when ANSI and
 * old-style prototypes are mixed in the presence of a "short" argument.
 * None of the arguments are examined.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
        const int err = EIO;

        return (err);
}
4205 
/*
 * Dummy "EIO" file system: vfsinit() builds EIO_vfsops from a template
 * whose operations all return EIO, initializes EIO_vfs with those ops,
 * and marks it VFS_UNMOUNTED.
 */
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
4208 
4209 /*
4210  * Called from startup() to initialize all loaded vfs's
4211  */
4212 void
4213 vfsinit(void)
4214 {
4215         struct vfssw *vswp;
4216         int error;
4217         extern int vopstats_enabled;
4218         extern void vopstats_startup();
4219 
4220         static const fs_operation_def_t EIO_vfsops_template[] = {
4221                 VFSNAME_MOUNT,          { .error = vfs_EIO },
4222                 VFSNAME_UNMOUNT,        { .error = vfs_EIO },
4223                 VFSNAME_ROOT,           { .error = vfs_EIO },
4224                 VFSNAME_STATVFS,        { .error = vfs_EIO },
4225                 VFSNAME_SYNC,           { .vfs_sync = vfs_EIO_sync },
4226                 VFSNAME_VGET,           { .error = vfs_EIO },
4227                 VFSNAME_MOUNTROOT,      { .error = vfs_EIO },
4228                 VFSNAME_FREEVFS,        { .error = vfs_EIO },
4229                 VFSNAME_VNSTATE,        { .error = vfs_EIO },
4230                 NULL, NULL
4231         };
4232 
4233         static const fs_operation_def_t stray_vfsops_template[] = {
4234                 VFSNAME_MOUNT,          { .error = vfsstray },
4235                 VFSNAME_UNMOUNT,        { .error = vfsstray },
4236                 VFSNAME_ROOT,           { .error = vfsstray },
4237                 VFSNAME_STATVFS,        { .error = vfsstray },
4238                 VFSNAME_SYNC,           { .vfs_sync = vfsstray_sync },
4239                 VFSNAME_VGET,           { .error = vfsstray },
4240                 VFSNAME_MOUNTROOT,      { .error = vfsstray },
4241                 VFSNAME_FREEVFS,        { .error = vfsstray },
4242                 VFSNAME_VNSTATE,        { .error = vfsstray },
4243                 NULL, NULL
4244         };
4245 
4246         /* Create vfs cache */
4247         vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4248             sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4249 
4250         /* Initialize the vnode cache (file systems may use it during init). */
4251         vn_create_cache();
4252 
4253         /* Setup event monitor framework */
4254         fem_init();
4255 
4256         /* Initialize the dummy stray file system type. */
4257         error = vfs_setfsops(0, stray_vfsops_template, NULL);
4258 
4259         /* Initialize the dummy EIO file system. */
4260         error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4261         if (error != 0) {
4262                 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4263                 /* Shouldn't happen, but not bad enough to panic */
4264         }
4265 
4266         VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4267 
4268         /*
4269          * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4270          * on this vfs can immediately notice it's invalid.
4271          */
4272         EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4273 
4274         /*
4275          * Call the init routines of non-loadable filesystems only.
4276          * Filesystems which are loaded as separate modules will be
4277          * initialized by the module loading code instead.
4278          */
4279 
4280         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4281                 RLOCK_VFSSW();
4282                 if (vswp->vsw_init != NULL)
4283                         (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4284                 RUNLOCK_VFSSW();
4285         }
4286 
4287         vopstats_startup();
4288 
4289         if (vopstats_enabled) {
4290                 /* EIO_vfs can collect stats, but we don't retrieve them */
4291                 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4292                 EIO_vfs.vfs_fstypevsp = NULL;
4293                 EIO_vfs.vfs_vskap = NULL;
4294                 EIO_vfs.vfs_flag |= VFS_STATS;
4295         }
4296 
4297         xattr_init();
4298 
4299         reparse_point_init();
4300 }
4301 
4302 vfs_t *
4303 vfs_alloc(int kmflag)
4304 {
4305         vfs_t *vfsp;
4306 
4307         vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4308 
4309         /*
4310          * Do the simplest initialization here.
4311          * Everything else gets done in vfs_init()
4312          */
4313         bzero(vfsp, sizeof (vfs_t));
4314         return (vfsp);
4315 }
4316 
4317 void
4318 vfs_free(vfs_t *vfsp)
4319 {
4320         /*
4321          * One would be tempted to assert that "vfsp->vfs_count == 0".
4322          * The problem is that this gets called out of domount() with
4323          * a partially initialized vfs and a vfs_count of 1.  This is
4324          * also called from vfs_rele() with a vfs_count of 0.  We can't
4325          * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4326          * returned.  This is because VFS_MOUNT() fully initializes the
4327          * vfs structure and its associated data.  VFS_RELE() will call
4328          * VFS_FREEVFS() which may panic the system if the data structures
4329          * aren't fully initialized from a successful VFS_MOUNT()).
4330          */
4331 
4332         /* If FEM was in use, make sure everything gets cleaned up */
4333         if (vfsp->vfs_femhead) {
4334                 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4335                 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4336                 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4337                 vfsp->vfs_femhead = NULL;
4338         }
4339 
4340         if (vfsp->vfs_implp)
4341                 vfsimpl_teardown(vfsp);
4342         sema_destroy(&vfsp->vfs_reflock);
4343         kmem_cache_free(vfs_cache, vfsp);
4344 }
4345 
4346 /*
4347  * Increments the vfs reference count by one atomically.
4348  */
4349 void
4350 vfs_hold(vfs_t *vfsp)
4351 {
4352         atomic_inc_32(&vfsp->vfs_count);
4353         ASSERT(vfsp->vfs_count != 0);
4354 }
4355 
4356 /*
4357  * Decrements the vfs reference count by one atomically. When
4358  * vfs reference count becomes zero, it calls the file system
4359  * specific vfs_freevfs() to free up the resources.
4360  */
4361 void
4362 vfs_rele(vfs_t *vfsp)
4363 {
4364         ASSERT(vfsp->vfs_count != 0);
4365         if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4366                 VFS_FREEVFS(vfsp);
4367                 lofi_remove(vfsp);
4368                 if (vfsp->vfs_zone)
4369                         zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4370                             ZONE_REF_VFS);
4371                 vfs_freemnttab(vfsp);
4372                 vfs_free(vfsp);
4373         }
4374 }
4375 
4376 /*
4377  * Generic operations vector support.
4378  *
4379  * This is used to build operations vectors for both the vfs and vnode.
4380  * It's normally called only when a file system is loaded.
4381  *
4382  * There are many possible algorithms for this, including the following:
4383  *
4384  *   (1) scan the list of known operations; for each, see if the file system
4385  *       includes an entry for it, and fill it in as appropriate.
4386  *
4387  *   (2) set up defaults for all known operations.  scan the list of ops
4388  *       supplied by the file system; for each which is both supplied and
4389  *       known, fill it in.
4390  *
4391  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4392  *       in entries as we go.
4393  *
4394  * we choose (1) for simplicity, and because performance isn't critical here.
4395  * note that (2) could be sped up using a precomputed hash table on known ops.
4396  * (3) could be faster than either, but only if the lists were very large or
4397  * supplied in sorted order.
4398  *
4399  */
4400 
4401 int
4402 fs_build_vector(void *vector, int *unused_ops,
4403     const fs_operation_trans_def_t *translation,
4404     const fs_operation_def_t *operations)
4405 {
4406         int i, num_trans, num_ops, used;
4407 
4408         /*
4409          * Count the number of translations and the number of supplied
4410          * operations.
4411          */
4412 
4413         {
4414                 const fs_operation_trans_def_t *p;
4415 
4416                 for (num_trans = 0, p = translation;
4417                     p->name != NULL;
4418                     num_trans++, p++)
4419                         ;
4420         }
4421 
4422         {
4423                 const fs_operation_def_t *p;
4424 
4425                 for (num_ops = 0, p = operations;
4426                     p->name != NULL;
4427                     num_ops++, p++)
4428                         ;
4429         }
4430 
4431         /* Walk through each operation known to our caller.  There will be */
4432         /* one entry in the supplied "translation table" for each. */
4433 
4434         used = 0;
4435 
4436         for (i = 0; i < num_trans; i++) {
4437                 int j, found;
4438                 char *curname;
4439                 fs_generic_func_p result;
4440                 fs_generic_func_p *location;
4441 
4442                 curname = translation[i].name;
4443 
4444                 /* Look for a matching operation in the list supplied by the */
4445                 /* file system. */
4446 
4447                 found = 0;
4448 
4449                 for (j = 0; j < num_ops; j++) {
4450                         if (strcmp(operations[j].name, curname) == 0) {
4451                                 used++;
4452                                 found = 1;
4453                                 break;
4454                         }
4455                 }
4456 
4457                 /*
4458                  * If the file system is using a "placeholder" for default
4459                  * or error functions, grab the appropriate function out of
4460                  * the translation table.  If the file system didn't supply
4461                  * this operation at all, use the default function.
4462                  */
4463 
4464                 if (found) {
4465                         result = operations[j].func.fs_generic;
4466                         if (result == fs_default) {
4467                                 result = translation[i].defaultFunc;
4468                         } else if (result == fs_error) {
4469                                 result = translation[i].errorFunc;
4470                         } else if (result == NULL) {
4471                                 /* Null values are PROHIBITED */
4472                                 return (EINVAL);
4473                         }
4474                 } else {
4475                         result = translation[i].defaultFunc;
4476                 }
4477 
4478                 /* Now store the function into the operations vector. */
4479 
4480                 location = (fs_generic_func_p *)
4481                     (((char *)vector) + translation[i].offset);
4482 
4483                 *location = result;
4484         }
4485 
4486         *unused_ops = num_ops - used;
4487 
4488         return (0);
4489 }
4490 
4491 /* Placeholder functions, should never be called. */
4492 
4493 int
4494 fs_error(void)
4495 {
4496         cmn_err(CE_PANIC, "fs_error called");
4497         return (0);
4498 }
4499 
4500 int
4501 fs_default(void)
4502 {
4503         cmn_err(CE_PANIC, "fs_default called");
4504         return (0);
4505 }
4506 
4507 #ifdef __sparc
4508 
4509 /*
4510  * Part of the implementation of booting off a mirrored root
4511  * involves a change of dev_t for the root device.  To
4512  * accomplish this, first remove the existing hash table
4513  * entry for the root device, convert to the new dev_t,
4514  * then re-insert in the hash table at the head of the list.
4515  */
4516 void
4517 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4518 {
4519         vfs_list_lock();
4520 
4521         vfs_hash_remove(vfsp);
4522 
4523         vfsp->vfs_dev = ndev;
4524         vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4525 
4526         vfs_hash_add(vfsp, 1);
4527 
4528         vfs_list_unlock();
4529 }
4530 
4531 #else /* x86 NEWBOOT */
4532 
4533 #if defined(__x86)
4534 extern int hvmboot_rootconf();
4535 #endif /* __x86 */
4536 
4537 extern ib_boot_prop_t *iscsiboot_prop;
4538 
4539 int
4540 rootconf()
4541 {
4542         int error;
4543         struct vfssw *vsw;
4544         extern void pm_init();
4545         char *fstyp, *fsmod;
4546         int ret = -1;
4547 
4548         getrootfs(&fstyp, &fsmod);
4549 
4550 #if defined(__x86)
4551         /*
4552          * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4553          * which lives in /platform/i86hvm, and hence is only available when
4554          * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4555          * is not available then the modstub for this function will return 0.
4556          * If the hvm_bootstrap misc module is available it will be loaded
4557          * and hvmboot_rootconf() will be invoked.
4558          */
4559         if (error = hvmboot_rootconf())
4560                 return (error);
4561 #endif /* __x86 */
4562 
4563         if (error = clboot_rootconf())
4564                 return (error);
4565 
4566         if (modload("fs", fsmod) == -1)
4567                 panic("Cannot _init %s module", fsmod);
4568 
4569         RLOCK_VFSSW();
4570         vsw = vfs_getvfsswbyname(fstyp);
4571         RUNLOCK_VFSSW();
4572         if (vsw == NULL) {
4573                 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4574                 return (ENXIO);
4575         }
4576         VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4577         VFS_HOLD(rootvfs);
4578 
4579         /* always mount readonly first */
4580         rootvfs->vfs_flag |= VFS_RDONLY;
4581 
4582         pm_init();
4583 
4584         if (netboot && iscsiboot_prop) {
4585                 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4586                     " shouldn't happen in the same time");
4587                 return (EINVAL);
4588         }
4589 
4590         if (netboot || iscsiboot_prop) {
4591                 ret = strplumb();
4592                 if (ret != 0) {
4593                         cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4594                         return (EFAULT);
4595                 }
4596         }
4597 
4598         if ((ret == 0) && iscsiboot_prop) {
4599                 ret = modload("drv", "iscsi");
4600                 /* -1 indicates fail */
4601                 if (ret == -1) {
4602                         cmn_err(CE_WARN, "Failed to load iscsi module");
4603                         iscsi_boot_prop_free();
4604                         return (EINVAL);
4605                 } else {
4606                         if (!i_ddi_attach_pseudo_node("iscsi")) {
4607                                 cmn_err(CE_WARN,
4608                                     "Failed to attach iscsi driver");
4609                                 iscsi_boot_prop_free();
4610                                 return (ENODEV);
4611                         }
4612                 }
4613         }
4614 
4615         error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4616         vfs_unrefvfssw(vsw);
4617         rootdev = rootvfs->vfs_dev;
4618 
4619         if (error)
4620                 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4621                     rootfs.bo_name, fstyp);
4622         else
4623                 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4624                     rootfs.bo_name, fstyp);
4625         return (error);
4626 }
4627 
4628 /*
4629  * XXX this is called by nfs only and should probably be removed
4630  * If booted with ASKNAME, prompt on the console for a filesystem
4631  * name and return it.
4632  */
4633 void
4634 getfsname(char *askfor, char *name, size_t namelen)
4635 {
4636         if (boothowto & RB_ASKNAME) {
4637                 printf("%s name: ", askfor);
4638                 console_gets(name, namelen);
4639         }
4640 }
4641 
4642 /*
4643  * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4644  * property.
4645  *
4646  * Filesystem types starting with the prefix "nfs" are diskless clients;
4647  * init the root filename name (rootfs.bo_name), too.
4648  *
4649  * If we are booting via NFS we currently have these options:
4650  *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4651  *      nfs2 -  force NFS V2
4652  *      nfs3 -  force NFS V3
4653  *      nfs4 -  force NFS V4
4654  * Because we need to maintain backward compatibility with the naming
4655  * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4656  * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4657  * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4658  * This is only for root filesystems, all other uses such as cachefs
4659  * will expect that "nfs" == NFS V2.
4660  */
4661 static void
4662 getrootfs(char **fstypp, char **fsmodp)
4663 {
4664         extern char *strplumb_get_netdev_path(void);
4665         char *propstr = NULL;
4666 
4667         /*
4668          * Check fstype property; for diskless it should be one of "nfs",
4669          * "nfs2", "nfs3" or "nfs4".
4670          */
4671         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4672             DDI_PROP_DONTPASS, "fstype", &propstr)
4673             == DDI_SUCCESS) {
4674                 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4675                 ddi_prop_free(propstr);
4676 
4677         /*
4678          * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4679          * assume the type of this root filesystem is 'zfs'.
4680          */
4681         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4682             DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4683             == DDI_SUCCESS) {
4684                 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4685                 ddi_prop_free(propstr);
4686         }
4687 
4688         if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4689                 *fstypp = *fsmodp = rootfs.bo_fstype;
4690                 return;
4691         }
4692 
4693         ++netboot;
4694 
4695         if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4696                 (void) strcpy(rootfs.bo_fstype, "nfs");
4697         else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4698                 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4699 
4700         /*
4701          * check if path to network interface is specified in bootpath
4702          * or by a hypervisor domain configuration file.
4703          * XXPV - enable strlumb_get_netdev_path()
4704          */
4705         if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4706             "xpv-nfsroot")) {
4707                 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4708         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4709             DDI_PROP_DONTPASS, "bootpath", &propstr)
4710             == DDI_SUCCESS) {
4711                 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4712                 ddi_prop_free(propstr);
4713         } else {
4714                 /* attempt to determine netdev_path via boot_mac address */
4715                 netdev_path = strplumb_get_netdev_path();
4716                 if (netdev_path == NULL)
4717                         panic("cannot find boot network interface");
4718                 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4719         }
4720         *fstypp = rootfs.bo_fstype;
4721         *fsmodp = "nfs";
4722 }
4723 #endif
4724 
4725 /*
4726  * VFS feature routines
4727  */
4728 
/*
 * A feature value is split in two: the upper 32 bits select an entry in
 * vfs_featureset[], the lower 32 bits are the bit mask within that entry.
 */
#define VFTINDEX(f)     (((f) >> 32) & 0xFFFFFFFF)
#define VFTBITS(f)      ((f) & 0xFFFFFFFFLL)
4731 
4732 /* Register a feature in the vfs */
4733 void
4734 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4735 {
4736         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4737         if (vfsp->vfs_implp == NULL)
4738                 return;
4739 
4740         vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4741 }
4742 
4743 void
4744 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4745 {
4746         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4747         if (vfsp->vfs_implp == NULL)
4748                 return;
4749         vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4750 }
4751 
4752 /*
4753  * Query a vfs for a feature.
4754  * Returns 1 if feature is present, 0 if not
4755  */
4756 int
4757 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4758 {
4759         int     ret = 0;
4760 
4761         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4762         if (vfsp->vfs_implp == NULL)
4763                 return (ret);
4764 
4765         if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4766                 ret = 1;
4767 
4768         return (ret);
4769 }
4770 
4771 /*
4772  * Propagate feature set from one vfs to another
4773  */
4774 void
4775 vfs_propagate_features(vfs_t *from, vfs_t *to)
4776 {
4777         int i;
4778 
4779         if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4780                 return;
4781 
4782         for (i = 1; i <= to->vfs_featureset[0]; i++) {
4783                 to->vfs_featureset[i] = from->vfs_featureset[i];
4784         }
4785 }
4786 
4787 #define LOFINODE_PATH "/dev/lofi/%d"
4788 
4789 /*
4790  * Return the vnode for the lofi node if there's a lofi mount in place.
4791  * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4792  * failure.
4793  */
4794 int
4795 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4796 {
4797         char *path = NULL;
4798         int strsize;
4799         int err;
4800 
4801         if (vfsp->vfs_lofi_minor == 0) {
4802                 *vpp = NULL;
4803                 return (-1);
4804         }
4805 
4806         strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_minor);
4807         path = kmem_alloc(strsize + 1, KM_SLEEP);
4808         (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_minor);
4809 
4810         /*
4811          * We may be inside a zone, so we need to use the /dev path, but
4812          * it's created asynchronously, so we wait here.
4813          */
4814         for (;;) {
4815                 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4816 
4817                 if (err != ENOENT)
4818                         break;
4819 
4820                 if ((err = delay_sig(hz / 8)) == EINTR)
4821                         break;
4822         }
4823 
4824         if (err)
4825                 *vpp = NULL;
4826 
4827         kmem_free(path, strsize + 1);
4828         return (err);
4829 }