illumos-gate Wdiff usr/src/uts/common/fs/vfs.c

Print this page

7798 vfs_mountfs passes junk in flags to domount
Reviewed by: Alexander Pyhalov <alp@rsu.ru>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Juraj Lutter <juraj.lutter@erigones.com>
Reviewed by: Marcel Telka <marcel@telka.sk>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/vfs.c
          +++ new/usr/src/uts/common/fs/vfs.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]

↓ open down ↓

17 lines elided

↑ open up ↑

  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25   25   * Copyright 2016 Toomas Soome <tsoome@me.com>
  26   26   * Copyright (c) 2016 by Delphix. All rights reserved.
  27   27   * Copyright 2016 Nexenta Systems, Inc.
       28 + * Copyright 2017 RackTop Systems.
  28   29   */
  29   30  
  30   31  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  31   32  /*        All Rights Reserved   */
  32   33  
  33   34  /*
  34   35   * University Copyright- Copyright (c) 1982, 1986, 1988
  35   36   * The Regents of the University of California
  36   37   * All Rights Reserved
  37   38   *

  38   39   * University Acknowledgment- Portions of this document are derived from
  39   40   * software developed by the University of California, Berkeley, and its
  40   41   * contributors.
  41   42   */
  42   43  
  43   44  #include <sys/types.h>
  44   45  #include <sys/t_lock.h>
  45   46  #include <sys/param.h>
  46   47  #include <sys/errno.h>
  47   48  #include <sys/user.h>
  48   49  #include <sys/fstyp.h>
  49   50  #include <sys/kmem.h>
  50   51  #include <sys/systm.h>
  51   52  #include <sys/proc.h>
  52   53  #include <sys/mount.h>
  53   54  #include <sys/vfs.h>
  54   55  #include <sys/vfs_opreg.h>
  55   56  #include <sys/fem.h>
  56   57  #include <sys/mntent.h>
  57   58  #include <sys/stat.h>
  58   59  #include <sys/statvfs.h>
  59   60  #include <sys/statfs.h>
  60   61  #include <sys/cred.h>
  61   62  #include <sys/vnode.h>
  62   63  #include <sys/rwstlock.h>
  63   64  #include <sys/dnlc.h>
  64   65  #include <sys/file.h>
  65   66  #include <sys/time.h>
  66   67  #include <sys/atomic.h>
  67   68  #include <sys/cmn_err.h>
  68   69  #include <sys/buf.h>
  69   70  #include <sys/swap.h>
  70   71  #include <sys/debug.h>
  71   72  #include <sys/vnode.h>
  72   73  #include <sys/modctl.h>
  73   74  #include <sys/ddi.h>
  74   75  #include <sys/pathname.h>
  75   76  #include <sys/bootconf.h>
  76   77  #include <sys/dumphdr.h>
  77   78  #include <sys/dc_ki.h>
  78   79  #include <sys/poll.h>
  79   80  #include <sys/sunddi.h>
  80   81  #include <sys/sysmacros.h>
  81   82  #include <sys/zone.h>
  82   83  #include <sys/policy.h>
  83   84  #include <sys/ctfs.h>
  84   85  #include <sys/objfs.h>
  85   86  #include <sys/console.h>
  86   87  #include <sys/reboot.h>
  87   88  #include <sys/attr.h>
  88   89  #include <sys/zio.h>
  89   90  #include <sys/spa.h>
  90   91  #include <sys/lofi.h>
  91   92  #include <sys/bootprops.h>
  92   93  
  93   94  #include <vm/page.h>
  94   95  
  95   96  #include <fs/fs_subr.h>
  96   97  /* Private interfaces to create vopstats-related data structures */
  97   98  extern void             initialize_vopstats(vopstats_t *);
  98   99  extern vopstats_t       *get_fstype_vopstats(struct vfs *, struct vfssw *);
  99  100  extern vsk_anchor_t     *get_vskstat_anchor(struct vfs *);
 100  101  
 101  102  static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
 102  103  static void vfs_setmntopt_nolock(mntopts_t *, const char *,
 103  104      const char *, int, int);
 104  105  static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
 105  106  static void vfs_freemnttab(struct vfs *);
 106  107  static void vfs_freeopt(mntopt_t *);
 107  108  static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
 108  109  static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
 109  110  static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
 110  111  static void vfs_createopttbl_extend(mntopts_t *, const char *,
 111  112      const mntopts_t *);
 112  113  static char **vfs_copycancelopt_extend(char **const, int);
 113  114  static void vfs_freecancelopt(char **);
 114  115  static void getrootfs(char **, char **);
 115  116  static int getmacpath(dev_info_t *, void *);
 116  117  static void vfs_mnttabvp_setup(void);
 117  118  
 118  119  struct ipmnt {
                   /*
                    * Node of a singly linked list (chained through mip_next) that
                    * pairs a device number with a vfs pointer.  The list head and
                    * tail are the vfs_miplist and vfs_miplist_end statics.
                    * NOTE(review): presumably tracks in-progress mounts, going by
                    * the name -- confirm against the users of vfs_miplist.
                    */
 119  120          struct ipmnt    *mip_next;
 120  121          dev_t           mip_dev;
 121  122          struct vfs      *mip_vfsp;
 122  123  };
 123  124  
 124  125  static kmutex_t         vfs_miplist_mutex;
 125  126  static struct ipmnt     *vfs_miplist = NULL;
 126  127  static struct ipmnt     *vfs_miplist_end = NULL;
 127  128  
 128  129  static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
 129  130  
 130  131  /*
 131  132   * VFS global data.
 132  133   */
 133  134  vnode_t *rootdir;               /* pointer to root inode vnode. */
 134  135  vnode_t *devicesdir;            /* pointer to inode of devices root */
 135  136  vnode_t *devdir;                /* pointer to inode of dev root */
 136  137  
 137  138  char *server_rootpath;          /* root path for diskless clients */
 138  139  char *server_hostname;          /* hostname of diskless server */
 139  140  
 140  141  static struct vfs root;
 141  142  static struct vfs devices;
 142  143  static struct vfs dev;
 143  144  struct vfs *rootvfs = &root;    /* pointer to root vfs; head of VFS list. */
 144  145  rvfs_t *rvfs_list;              /* array of vfs ptrs for vfs hash list */
 145  146  int vfshsz = 512;               /* # of heads/locks in vfs hash arrays */
 146  147                                  /* must be power of 2!  */
 147  148  timespec_t vfs_mnttab_ctime;    /* mnttab created time */
 148  149  timespec_t vfs_mnttab_mtime;    /* mnttab last modified time */
 149  150  char *vfs_dummyfstype = "\0";
 150  151  struct pollhead vfs_pollhd;     /* for mnttab pollers */
 151  152  struct vnode *vfs_mntdummyvp;   /* to fake mnttab read/write for file events */
 152  153  int     mntfstype;              /* will be set once mnt fs is mounted */
 153  154  
 154  155  /*
 155  156   * Table for generic options recognized in the VFS layer and acted
 156  157   * on at this level before parsing file system specific options.
 157  158   * The nosuid option is stronger than any of the devices and setuid
 158  159   * options, so those are canceled when nosuid is seen.
 159  160   *
 160  161   * All options which are added here need to be added to the
 161  162   * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 162  163   */
 163  164  /*
 164  165   * VFS Mount options table
 165  166   */
 166  167  static char *ro_cancel[] = { MNTOPT_RW, NULL };
 167  168  static char *rw_cancel[] = { MNTOPT_RO, NULL };
 168  169  static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
           /*
            * Every cancel list is NULL-terminated.  nosuid is the only option
            * whose list holds more than its direct opposite: it also names both
            * members of the devices and setuid pairs.
            */
 169  170  static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
 170  171      MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
 171  172  static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
 172  173  static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
 173  174  static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
 174  175  static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
 175  176  static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
 176  177  static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
 177  178  static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
 178  179  static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 179  180  
 180  181  static const mntopt_t mntopts[] = {
 181  182  /*
 182  183   *      option name             cancel options          default arg     flags
 183  184   */
           /*
            * MNTOPT_REMOUNT is the only entry with a flag set (MO_NODISPLAY) and
            * the only one without a cancel list; every other entry has flags 0.
            * The trailing (void *)0 fills a fifth member on every entry.
            * NOTE(review): meaning of that fifth member is not visible here.
            */
 184  185          { MNTOPT_REMOUNT,       NULL,                   NULL,
 185  186                  MO_NODISPLAY, (void *)0 },
 186  187          { MNTOPT_RO,            ro_cancel,              NULL,           0,
 187  188                  (void *)0 },
 188  189          { MNTOPT_RW,            rw_cancel,              NULL,           0,
 189  190                  (void *)0 },
 190  191          { MNTOPT_SUID,          suid_cancel,            NULL,           0,
 191  192                  (void *)0 },
 192  193          { MNTOPT_NOSUID,        nosuid_cancel,          NULL,           0,
 193  194                  (void *)0 },
 194  195          { MNTOPT_DEVICES,       devices_cancel,         NULL,           0,
 195  196                  (void *)0 },
 196  197          { MNTOPT_NODEVICES,     nodevices_cancel,       NULL,           0,
 197  198                  (void *)0 },
 198  199          { MNTOPT_SETUID,        setuid_cancel,          NULL,           0,
 199  200                  (void *)0 },
 200  201          { MNTOPT_NOSETUID,      nosetuid_cancel,        NULL,           0,
 201  202                  (void *)0 },
 202  203          { MNTOPT_NBMAND,        nbmand_cancel,          NULL,           0,
 203  204                  (void *)0 },
 204  205          { MNTOPT_NONBMAND,      nonbmand_cancel,        NULL,           0,
 205  206                  (void *)0 },
 206  207          { MNTOPT_EXEC,          exec_cancel,            NULL,           0,
 207  208                  (void *)0 },
 208  209          { MNTOPT_NOEXEC,        noexec_cancel,          NULL,           0,
 209  210                  (void *)0 },
 210  211  };
 211  212  
 212  213  const mntopts_t vfs_mntopts = {
                   /*
                    * Exported descriptor for the generic option table: the entry
                    * count of mntopts[] followed by a pointer to its first entry.
                    * The cast drops the const qualifier of mntopts[].
                    */
 213  214          sizeof (mntopts) / sizeof (mntopt_t),
 214  215          (mntopt_t *)&mntopts[0]
 215  216  };
 216  217  
 217  218  /*
 218  219   * File system operation dispatch functions.
 219  220   */
 220  221  
 221  222  int
 222  223  fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 223  224  {
                   /*
                    * Dispatch to the vfs_mount entry of vfsp->vfs_op, passing every
                    * argument through and returning the callee's result unchanged.
                    */
 224  225          return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 225  226  }
 226  227  
 227  228  int
 228  229  fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 229  230  {
                   /*
                    * Dispatch to the vfs_unmount entry of vfsp->vfs_op and return
                    * its result unchanged.
                    */
 230  231          return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
 231  232  }
 232  233  
 233  234  int
 234  235  fsop_root(vfs_t *vfsp, vnode_t **vpp)
 235  236  {
                   /*
                    * Call the vfs_root entry of vfsp->vfs_op to obtain the root
                    * vnode in *vpp and return that call's result.  On success the
                    * returned vnode may additionally be given a path, see below.
                    */
 236  237          refstr_t *mntpt;
 237  238          int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
 238  239          /*
 239  240           * Make sure this root has a path.  With lofs, it is possible to have
 240  241           * a NULL mountpoint.
 241  242           */
 242  243          if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
                           /*
                            * Use the mount point string as the vnode's path.  The
                            * refstr obtained from vfs_getmntpoint() is released
                            * once the path has been set.
                            */
 243  244                  mntpt = vfs_getmntpoint(vfsp);
 244  245                  vn_setpath_str(*vpp, refstr_value(mntpt),
 245  246                      strlen(refstr_value(mntpt)));
 246  247                  refstr_rele(mntpt);
 247  248          }
 248  249  
 249  250          return (ret);
 250  251  }
 251  252  
 252  253  int
 253  254  fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 254  255  {
                   /*
                    * Dispatch to the vfs_statvfs entry of vfsp->vfs_op (note the
                    * entry is named statvfs, not statfs) and return its result.
                    */
 255  256          return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
 256  257  }
 257  258  
 258  259  int
 259  260  fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 260  261  {
                   /*
                    * Dispatch to the vfs_sync entry of vfsp->vfs_op and return its
                    * result unchanged.
                    */
 261  262          return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
 262  263  }
 263  264  
 264  265  int
 265  266  fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 266  267  {
                   /*
                    * Translate a file ID into a vnode.  A fid whose length equals
                    * XATTR_FIDSZ on a vfs with the VFSFT_SYSATTR_VIEWS feature is
                    * handed to xattr_dir_vget(); anything else goes to the vfs_vget
                    * entry of vfsp->vfs_op.  The chosen callee's result is returned.
                    */
 267  268          /*
 268  269           * In order to handle system attribute fids in a manner
 269  270           * transparent to the underlying fs, we embed the fid for
 270  271           * the sysattr parent object in the sysattr fid and tack on
 271  272           * some extra bytes that only the sysattr layer knows about.
 272  273           *
 273  274           * This guarantees that sysattr fids are larger than other fids
 274  275           * for this vfs. If the vfs supports the sysattr view interface
 275  276           * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 276  277           * collision with XATTR_FIDSZ.
 277  278           */
 278  279          if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 279  280              fidp->fid_len == XATTR_FIDSZ)
 280  281                  return (xattr_dir_vget(vfsp, vpp, fidp));
 281  282  
 282  283          return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
 283  284  }
 284  285  
 285  286  int
 286  287  fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 287  288  {
                   /*
                    * Dispatch to the vfs_mountroot entry of vfsp->vfs_op with the
                    * caller's reason code and return its result unchanged.
                    */
 288  289          return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
 289  290  }
 290  291  
 291  292  void
 292  293  fsop_freefs(vfs_t *vfsp)
 293  294  {
                   /*
                    * Dispatch to the vfs_freevfs entry of vfsp->vfs_op (the entry
                    * is named freevfs, not freefs).  Nothing is returned.
                    */
 294  295          (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
 295  296  }
 296  297  
 297  298  int
 298  299  fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 299  300  {
                   /*
                    * Dispatch to the vfs_vnstate entry of vfsp->vfs_op, passing the
                    * vnode and its new state, and return the callee's result.
                    */
 300  301          return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
 301  302  }
 302  303  
 303  304  int
 304  305  fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 305  306  {
                   /*
                    * Invoke the vfs_sync operation registered in vfssw[] for the
                    * given file system type index, passing a NULL vfs pointer.
                    * Returns that operation's result, or ENOTSUP when the vfssw
                    * slot is not both allocated and installed.  The index range is
                    * checked only by the ASSERT below.
                    */
 306  307          ASSERT((fstype >= 0) && (fstype < nfstype));
 307  308  
 308  309          if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
 309  310                  return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
 310  311          else
 311  312                  return (ENOTSUP);
 312  313  }
 313  314  
 314  315  /*
 315  316   * File system initialization.  vfs_setfsops() must be called from a file
 316  317   * system's init routine.
 317  318   */
 318  319  
 319  320  static int
 320  321  fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
 321  322      int *unused_ops)
 322  323  {
                   /*
                    * Fill the vfsops_t at 'actual' from the caller's template by
                    * handing both, plus the translation table below, to
                    * fs_build_vector(), whose result is returned.  'unused_ops' is
                    * passed through to fs_build_vector() as an output.
                    *
                    * Each table row names one operation, gives the offset of its
                    * slot in vfsops_t, and supplies two fallback functions.  The
                    * all-NULL row terminates the table.
                    * NOTE(review): the two function columns are presumably the
                    * "default" and "error" handlers -- confirm against the
                    * definition of fs_operation_trans_def_t.
                    */
 323  324          static const fs_operation_trans_def_t vfs_ops_table[] = {
 324  325                  VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
 325  326                          fs_nosys, fs_nosys,
 326  327  
 327  328                  VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
 328  329                          fs_nosys, fs_nosys,
 329  330  
 330  331                  VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
 331  332                          fs_nosys, fs_nosys,
 332  333  
 333  334                  VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
 334  335                          fs_nosys, fs_nosys,
 335  336  
 336  337                  VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
 337  338                          (fs_generic_func_p) fs_sync,
 338  339                          (fs_generic_func_p) fs_sync,    /* No errors allowed */
 339  340  
 340  341                  VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
 341  342                          fs_nosys, fs_nosys,
 342  343  
 343  344                  VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
 344  345                          fs_nosys, fs_nosys,
 345  346  
 346  347                  VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
 347  348                          (fs_generic_func_p)fs_freevfs,
 348  349                          (fs_generic_func_p)fs_freevfs,  /* Shouldn't fail */
 349  350  
 350  351                  VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
 351  352                          (fs_generic_func_p)fs_nosys,
 352  353                          (fs_generic_func_p)fs_nosys,
 353  354  
 354  355                  NULL, 0, NULL, NULL
 355  356          };
 356  357  
 357  358          return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
 358  359  }
 359  360  
 360  361  void
 361  362  zfs_boot_init(void)
 362  363  {
                   /*
                    * Call spa_boot_init() only when the root file system type
                    * recorded in rootfs.bo_fstype is exactly MNTTYPE_ZFS; do
                    * nothing for any other root file system type.
                    */
 363  364          if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 364  365                  spa_boot_init();
 365  366  }
 366  367  
 367  368  int
 368  369  vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
 369  370  {
                   /*
                    * Build the operations vector stored in vfssw[fstype] from the
                    * given template and mark that vfssw entry VSW_INSTALLED.
                    *
                    * Returns EINVAL if fstype is out of range or its vfssw slot is
                    * not allocated, the error from fs_copyfsops() if building the
                    * vector fails, and 0 on success.  On success, and only if
                    * 'actual' is non-NULL, *actual is set to point at the vector
                    * inside the vfssw table (nothing is allocated here).
                    */
 370  371          int error;
 371  372          int unused_ops;
 372  373  
 373  374          /*
 374  375           * Verify that fstype refers to a valid fs.  Note that
 375  376           * 0 is valid since it's used to set "stray" ops.
 376  377           */
 377  378          if ((fstype < 0) || (fstype >= nfstype))
 378  379                  return (EINVAL);
 379  380  
 380  381          if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 381  382                  return (EINVAL);
 382  383  
 383  384          /* Set up the operations vector. */
 384  385  
 385  386          error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
 386  387  
 387  388          if (error != 0)
 388  389                  return (error);
 389  390  
 390  391          vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 391  392  
 392  393          if (actual != NULL)
 393  394                  *actual = &vfssw[fstype].vsw_vfsops;
 394  395  
                   /* DEBUG builds warn about template entries that were not used. */
 395  396  #if DEBUG
 396  397          if (unused_ops != 0)
 397  398                  cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
 398  399                      "but not used", vfssw[fstype].vsw_name, unused_ops);
 399  400  #endif
 400  401  
 401  402          return (0);
 402  403  }
 403  404  
 404  405  int
 405  406  vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
 406  407  {
                   /*
                    * Allocate a standalone vfsops_t (KM_SLEEP) and fill it from
                    * the template.  On success returns 0 with *actual pointing at
                    * the new vector.  On failure the allocation is freed, *actual
                    * is set to NULL and the error from fs_copyfsops() is returned.
                    * Unlike vfs_setfsops(), 'actual' is dereferenced
                    * unconditionally and the unused_ops count is not reported.
                    */
 407  408          int error;
 408  409          int unused_ops;
 409  410  
 410  411          *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
 411  412  
 412  413          error = fs_copyfsops(template, *actual, &unused_ops);
 413  414          if (error != 0) {
 414  415                  kmem_free(*actual, sizeof (vfsops_t));
 415  416                  *actual = NULL;
 416  417                  return (error);
 417  418          }
 418  419  
 419  420          return (0);
 420  421  }
 421  422  
 422  423  /*
 423  424   * Free a vfsops structure created as a result of vfs_makefsops().
 424  425   * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
 425  426   * vfs_freevfsops_by_type().
 426  427   */
 427  428  void
 428  429  vfs_freevfsops(vfsops_t *vfsops)
 429  430  {
                   /*
                    * Release exactly sizeof (vfsops_t) bytes, matching the
                    * allocation size used by vfs_makefsops().
                    */
 430  431          kmem_free(vfsops, sizeof (vfsops_t));
 431  432  }
 432  433  
 433  434  /*
 434  435   * Since the vfsops structure is part of the vfssw table and wasn't
 435  436   * really allocated, we're not really freeing anything.  We keep
 436  437   * the name for consistency with vfs_freevfsops().  We do, however,
 437  438   * need to take care of a little bookkeeping.
 438  439   * NOTE: For a vfsops structure created by vfs_makefsops(), use
 439  440   * vfs_freevfsops() instead.
 440  441   */
 441  442  int
 442  443  vfs_freevfsops_by_type(int fstype)
 443  444  {
                   /*
                    * Clear VSW_INSTALLED on vfssw[fstype]; no memory is freed.
                    * Returns EINVAL if fstype is 0, negative or >= nfstype, or if
                    * the entry is not currently marked installed; 0 otherwise.
                    * The flag is tested and cleared under the vfssw write lock.
                    */
 444  445  
 445  446          /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 446  447          if ((fstype <= 0) || (fstype >= nfstype))
 447  448                  return (EINVAL);
 448  449  
 449  450          WLOCK_VFSSW();
 450  451          if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 451  452                  WUNLOCK_VFSSW();
 452  453                  return (EINVAL);
 453  454          }
 454  455  
 455  456          vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 456  457          WUNLOCK_VFSSW();
 457  458  
 458  459          return (0);
 459  460  }
 460  461  
 461  462  /* Support routines used to reference vfs_op */
 462  463  
 463  464  /* Set the operations vector for a vfs */
 464  465  void
 465  466  vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
 466  467  {
                   /*
                    * Install 'vfsops' as the operations vector of 'vfsp'.  Both
                    * pointers are only ASSERTed to be non-NULL.
                    */
 467  468          vfsops_t        *op;
 468  469  
 469  470          ASSERT(vfsp != NULL);
 470  471          ASSERT(vfsops != NULL);
 471  472  
 472  473          op = vfsp->vfs_op;
 473  474          membar_consumer();
                   /*
                    * Direct path: with no vfs_femhead attached, swap vfs_op from the
                    * value sampled above to the new vector with a compare-and-swap.
                    * If a femhead is present, or vfs_op changed since it was
                    * sampled, fall through to fsem_setvfsops() instead.
                    */
 474  475          if (vfsp->vfs_femhead == NULL &&
 475  476              atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
 476  477                  return;
 477  478          }
 478  479          fsem_setvfsops(vfsp, vfsops);
 479  480  }
 480  481  
 481  482  /* Retrieve the operations vector for a vfs */
 482  483  vfsops_t *
 483  484  vfs_getops(vfs_t *vfsp)
 484  485  {
                   /*
                    * Return the operations vector of 'vfsp'.  vfsp is only
                    * ASSERTed to be non-NULL.
                    */
 485  486          vfsops_t        *op;
 486  487  
 487  488          ASSERT(vfsp != NULL);
 488  489  
 489  490          op = vfsp->vfs_op;
 490  491          membar_consumer();
                   /*
                    * The sampled vfs_op is returned directly only when no
                    * vfs_femhead is attached and vfs_op still holds the sampled
                    * value; otherwise the answer comes from fsem_getvfsops().
                    */
 491  492          if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
 492  493                  return (op);
 493  494          } else {
 494  495                  return (fsem_getvfsops(vfsp));
 495  496          }
 496  497  }
 497  498  
 498  499  /*
 499  500   * Returns non-zero (1) if the vfsops matches that of the vfs.
 500  501   * Returns zero (0) if not.
 501  502   */
 502  503  int
 503  504  vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
 504  505  {
                   /*
                    * Pointer comparison only: the vector reported by vfs_getops()
                    * must be the very same object as 'vfsops'.
                    */
 505  506          return (vfs_getops(vfsp) == vfsops);
 506  507  }
 507  508  
 508  509  /*
 509  510   * Returns non-zero (1) if the file system has installed a non-default,
 510  511   * non-error vfs_sync routine.  Returns zero (0) otherwise.
 511  512   */
 512  513  int
 513  514  vfs_can_sync(vfs_t *vfsp)
 514  515  {
                   /*
                    * fs_sync is the function fs_copyfsops() installs in the
                    * vfs_sync slot when a file system supplies none of its own, so
                    * any other pointer means a file-system-specific routine.
                    */
 515  516          /* vfs_sync() routine is not the default/error function */
 516  517          return (vfs_getops(vfsp)->vfs_sync != fs_sync);
 517  518  }
 518  519  
 519  520  /*
 520  521   * Initialize a vfs structure.
 521  522   */
 522  523  void
 523  524  vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
 524  525  {
                   /*
                    * Put 'vfsp' into its initial state: zero the count and lofi
                    * id, make the vfs list and zone list links point back at vfsp
                    * itself, initialize vfs_reflock with a count of 1, set up the
                    * vfs_impl_t via vfsimpl_setup(), then record the private data
                    * pointer and install the operations vector through
                    * vfs_setops().
                    */
 525  526          /* Other initialization has been moved to vfs_alloc() */
 526  527          vfsp->vfs_count = 0;
 527  528          vfsp->vfs_next = vfsp;
 528  529          vfsp->vfs_prev = vfsp;
 529  530          vfsp->vfs_zone_next = vfsp;
 530  531          vfsp->vfs_zone_prev = vfsp;
 531  532          vfsp->vfs_lofi_id = 0;
 532  533          sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 533  534          vfsimpl_setup(vfsp);
 534  535          vfsp->vfs_data = (data);
 535  536          vfs_setops((vfsp), (op));
 536  537  }
 537  538  
 538  539  /*
 539  540   * Allocate and initialize the vfs implementation private data
 540  541   * structure, vfs_impl_t.
 541  542   */
 542  543  void
 543  544  vfsimpl_setup(vfs_t *vfsp)
 544  545  {
                   /*
                    * Attach a freshly allocated vfs_impl_t to 'vfsp'.  Does nothing
                    * if vfs_implp is already set.
                    */
 545  546          int i;
 546  547  
 547  548          if (vfsp->vfs_implp != NULL) {
 548  549                  return;
 549  550          }
 550  551  
                   /*
                    * The allocation uses kmem_alloc(), not kmem_zalloc(), so the
                    * members used by this file are initialized by hand below.
                    */
 551  552          vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 552  553          /* Note that these are #define'd in vfs.h */
 553  554          vfsp->vfs_vskap = NULL;
 554  555          vfsp->vfs_fstypevsp = NULL;
 555  556  
                   /*
                    * Element 0 holds the number of feature words that follow
                    * (VFS_FEATURE_MAXSZ - 1); elements 1 and up are cleared.
                    */
 556  557          /* Set size of counted array, then zero the array */
 557  558          vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 558  559          for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 559  560                  vfsp->vfs_featureset[i] = 0;
 560  561          }
 561  562  }
 562  563  
 563  564  /*
 564  565   * Release the vfs_impl_t structure, if it exists. Some unbundled
 565  566   * filesystems may not use the newer version of vfs and thus
 566  567   * would not contain this implementation private data structure.
 567  568   */
 568  569  void
 569  570  vfsimpl_teardown(vfs_t *vfsp)
 570  571  {
                   /*
                    * Free the vfs_impl_t attached to 'vfsp', if any, and clear
                    * vfs_implp so that a later vfsimpl_setup() allocates afresh.
                    */
 571  572          vfs_impl_t      *vip = vfsp->vfs_implp;
 572  573  
 573  574          if (vip == NULL)
 574  575                  return;
 575  576  
 576  577          kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 577  578          vfsp->vfs_implp = NULL;
 578  579  }
 579  580  
 580  581  /*
 581  582   * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 582  583   * fstatvfs, and sysfs moved to common/syscall.
 583  584   */
 584  585  
 585  586  /*
 586  587   * Update every mounted file system.  We call the vfs_sync operation of
 587  588   * each file system type, passing it a NULL vfsp to indicate that all
 588  589   * mounted file systems of that type should be updated.
 589  590   */
 590  591  void
 591  592  vfs_sync(int flag)
 592  593  {
                   /*
                    * Walk vfssw[] from index 1 (slot 0 is skipped) and call the
                    * vfs_sync operation of every allocated, installed entry with a
                    * NULL vfs pointer and the caller's credentials.  Results of
                    * the individual sync calls are discarded.
                    */
 593  594          struct vfssw *vswp;
 594  595          RLOCK_VFSSW();
 595  596          for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 596  597                  if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
                                   /*
                                    * The vfssw read lock is dropped across the
                                    * sync call; the entry is referenced via
                                    * vfs_refvfssw() for that window and the lock
                                    * is re-taken before the walk continues.
                                    */
 597  598                          vfs_refvfssw(vswp);
 598  599                          RUNLOCK_VFSSW();
 599  600                          (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
 600  601                              CRED());
 601  602                          vfs_unrefvfssw(vswp);
 602  603                          RLOCK_VFSSW();
 603  604                  }
 604  605          }
 605  606          RUNLOCK_VFSSW();
 606  607  }
 607  608  
 608  609  void
 609  610  sync(void)
 610  611  {
                   /* Sync every installed file system type, passing flag 0. */
 611  612          vfs_sync(0);
 612  613  }
 613  614  
 614  615  /*
 615  616   * External routines.
 616  617   */
 617  618  
 618  619  krwlock_t vfssw_lock;   /* lock accesses to vfssw */
 619  620  
 620  621  /*
 621  622   * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 622  623   * but otherwise should be accessed only via vfs_list_lock() and
 623  624   * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 624  625   */
 625  626  static krwlock_t vfslist;
 626  627  
 627  628  /*
 628  629   * Mount devfs on /devices. This is done right after root is mounted
 629  630   * to provide device access support for the system
 630  631   */
 631  632  static void
 632  633  vfs_mountdevices(void)
 633  634  {
                   /*
                    * Load the devfs module, mount it on /devices using the static
                    * 'devices' vfs, record its resource and mount point names,
                    * cache its root vnode in 'devicesdir' and add the vfs to the
                    * vfs list.  Failure to load, look up, mount or get the root
                    * panics; failure to take either lock only logs a CE_NOTE and
                    * returns without adding the vfs to the list.
                    */
 634  635          struct vfssw *vsw;
 635  636          struct vnode *mvp;
 636  637          struct mounta mounta = {        /* fake mounta for devfs_mount() */
 637  638                  NULL,
 638  639                  NULL,
 639  640                  MS_SYSSPACE,
 640  641                  NULL,
 641  642                  NULL,
 642  643                  0,
 643  644                  NULL,
 644  645                  0
 645  646          };
 646  647  
 647  648          /*
 648  649           * _init devfs module to fill in the vfssw
 649  650           */
 650  651          if (modload("fs", "devfs") == -1)
 651  652                  panic("Cannot _init devfs module");
 652  653  
 653  654          /*
 654  655           * Hold vfs
 655  656           */
                   /*
                    * NOTE(review): the result of vfs_getvfsswbyname() is
                    * dereferenced without a NULL check; this relies on the
                    * modload() above having registered "devfs" -- confirm.
                    */
 656  657          RLOCK_VFSSW();
 657  658          vsw = vfs_getvfsswbyname("devfs");
 658  659          VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
 659  660          VFS_HOLD(&devices);
 660  661  
 661  662          /*
 662  663           * Locate mount point
 663  664           */
 664  665          if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 665  666                  panic("Cannot find /devices");
 666  667  
 667  668          /*
 668  669           * Perform the mount of /devices
 669  670           */
 670  671          if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
 671  672                  panic("Cannot mount /devices");
 672  673  
 673  674          RUNLOCK_VFSSW();
 674  675  
 675  676          /*
 676  677           * Set appropriate members and add to vfs list for mnttab display
 677  678           */
 678  679          vfs_setresource(&devices, "/devices", 0);
 679  680          vfs_setmntpoint(&devices, "/devices", 0);
 680  681  
 681  682          /*
 682  683           * Hold the root of /devices so it won't go away
 683  684           */
 684  685          if (VFS_ROOT(&devices, &devicesdir))
 685  686                  panic("vfs_mountdevices: not devices root");
 686  687  
                   /*
                    * NOTE(review): mvp from lookupname() is never VN_RELE'd in this
                    * function, on either the early-return or the normal path;
                    * confirm that keeping the mount point held is intended.
                    */
 687  688          if (vfs_lock(&devices) != 0) {
 688  689                  VN_RELE(devicesdir);
 689  690                  cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
 690  691                  return;
 691  692          }
 692  693  
 693  694          if (vn_vfswlock(mvp) != 0) {
 694  695                  vfs_unlock(&devices);
 695  696                  VN_RELE(devicesdir);
 696  697                  cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
 697  698                  return;
 698  699          }
 699  700  
                   /* Both locks are held only around vfs_add(). */
 700  701          vfs_add(mvp, &devices, 0);
 701  702          vn_vfsunlock(mvp);
 702  703          vfs_unlock(&devices);
 703  704          VN_RELE(devicesdir);
 704  705  }
 705  706  
 706  707  /*
 707  708   * Mount the first instance of /dev on root; it remains mounted.
 708  709   */
 709  710  static void
 710  711  vfs_mountdev1(void)
 711  712  {
                   /*
                    * Load the "dev" module, mount it on /dev using the static 'dev'
                    * vfs, record its resource and mount point names, cache its
                    * root vnode in 'devdir' and add the vfs to the vfs list.
                    * Failure to load, look up, mount or get the root is reported
                    * with CE_PANIC; failure to take either lock only logs a
                    * CE_NOTE and returns without adding the vfs to the list.
                    * Unlike the /devices mount, the fake mounta here also sets
                    * MS_OVERLAY.
                    */
 712  713          struct vfssw *vsw;
 713  714          struct vnode *mvp;
 714  715          struct mounta mounta = {        /* fake mounta for sdev_mount() */
 715  716                  NULL,
 716  717                  NULL,
 717  718                  MS_SYSSPACE | MS_OVERLAY,
 718  719                  NULL,
 719  720                  NULL,
 720  721                  0,
 721  722                  NULL,
 722  723                  0
 723  724          };
 724  725  
 725  726          /*
 726  727           * _init dev module to fill in the vfssw
 727  728           */
 728  729          if (modload("fs", "dev") == -1)
 729  730                  cmn_err(CE_PANIC, "Cannot _init dev module\n");
 730  731  
 731  732          /*
 732  733           * Hold vfs
 733  734           */
                   /*
                    * NOTE(review): the result of vfs_getvfsswbyname() is
                    * dereferenced without a NULL check; this relies on the
                    * modload() above having registered "dev" -- confirm.
                    */
 734  735          RLOCK_VFSSW();
 735  736          vsw = vfs_getvfsswbyname("dev");
 736  737          VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
 737  738          VFS_HOLD(&dev);
 738  739  
 739  740          /*
 740  741           * Locate mount point
 741  742           */
 742  743          if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 743  744                  cmn_err(CE_PANIC, "Cannot find /dev\n");
 744  745  
 745  746          /*
 746  747           * Perform the mount of /dev
 747  748           */
 748  749          if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
 749  750                  cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
 750  751  
 751  752          RUNLOCK_VFSSW();
 752  753  
 753  754          /*
 754  755           * Set appropriate members and add to vfs list for mnttab display
 755  756           */
 756  757          vfs_setresource(&dev, "/dev", 0);
 757  758          vfs_setmntpoint(&dev, "/dev", 0);
 758  759  
 759  760          /*
 760  761           * Hold the root of /dev so it won't go away
 761  762           */
 762  763          if (VFS_ROOT(&dev, &devdir))
 763  764                  cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
 764  765  
                   /*
                    * NOTE(review): mvp from lookupname() is never VN_RELE'd in this
                    * function, on either the early-return or the normal path;
                    * confirm that keeping the mount point held is intended.
                    */
 765  766          if (vfs_lock(&dev) != 0) {
 766  767                  VN_RELE(devdir);
 767  768                  cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
 768  769                  return;
 769  770          }
 770  771  
 771  772          if (vn_vfswlock(mvp) != 0) {
 772  773                  vfs_unlock(&dev);
 773  774                  VN_RELE(devdir);
 774  775                  cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
 775  776                  return;
 776  777          }
 777  778  
                   /* Both locks are held only around vfs_add(). */
 778  779          vfs_add(mvp, &dev, 0);
 779  780          vn_vfsunlock(mvp);
 780  781          vfs_unlock(&dev);
 781  782          VN_RELE(devdir);
 782  783  }
 783  784

↓ open down ↓

746 lines elided

↑ open up ↑

 784  785  /*
 785  786   * Mount required filesystem. This is done right after root is mounted.
 786  787   */
 787  788  static void
 788  789  vfs_mountfs(char *module, char *spec, char *path)
 789  790  {
 790  791          struct vnode *mvp;
 791  792          struct mounta mounta;
 792  793          vfs_t *vfsp;
 793  794  
      795 +        bzero(&mounta, sizeof (mounta));
 794  796          mounta.flags = MS_SYSSPACE | MS_DATA;
 795  797          mounta.fstype = module;
 796  798          mounta.spec = spec;
 797  799          mounta.dir = path;
 798  800          if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 799  801                  cmn_err(CE_WARN, "Cannot find %s", path);
 800  802                  return;
 801  803          }
 802  804          if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 803  805                  cmn_err(CE_WARN, "Cannot mount %s", path);

 804  806          else
 805  807                  VFS_RELE(vfsp);
 806  808          VN_RELE(mvp);
 807  809  }
 808  810  
 809  811  /*
 810  812   * vfs_mountroot is called by main() to mount the root filesystem.
 811  813   */
 812  814  void
 813  815  vfs_mountroot(void)
 814  816  {
 815  817          struct vnode    *rvp = NULL;
 816  818          char            *path;
 817  819          size_t          plen;
 818  820          struct vfssw    *vswp;
 819  821          proc_t          *p;
 820  822  
 821  823          rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 822  824          rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
 823  825  
 824  826          /*
 825  827           * Alloc the vfs hash bucket array and locks
 826  828           */
 827  829          rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
 828  830  
 829  831          /*
 830  832           * Call machine-dependent routine "rootconf" to choose a root
 831  833           * file system type.
 832  834           */
 833  835          if (rootconf())
 834  836                  panic("vfs_mountroot: cannot mount root");
 835  837          /*
 836  838           * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
 837  839           * to point to it.  These are used by lookuppn() so that it
 838  840           * knows where to start from ('/' or '.').
 839  841           */
 840  842          vfs_setmntpoint(rootvfs, "/", 0);
 841  843          if (VFS_ROOT(rootvfs, &rootdir))
 842  844                  panic("vfs_mountroot: no root vnode");
 843  845  
 844  846          /*
 845  847           * At this point, the process tree consists of p0 and possibly some
 846  848           * direct children of p0.  (i.e. there are no grandchildren)
 847  849           *
 848  850           * Walk through them all, setting their current directory.
 849  851           */
 850  852          mutex_enter(&pidlock);
 851  853          for (p = practive; p != NULL; p = p->p_next) {
 852  854                  ASSERT(p == &p0 || p->p_parent == &p0);
 853  855  
 854  856                  PTOU(p)->u_cdir = rootdir;
 855  857                  VN_HOLD(PTOU(p)->u_cdir);
 856  858                  PTOU(p)->u_rdir = NULL;
 857  859          }
 858  860          mutex_exit(&pidlock);
 859  861  
 860  862          /*
 861  863           * Setup the global zone's rootvp, now that it exists.
 862  864           */
 863  865          global_zone->zone_rootvp = rootdir;
 864  866          VN_HOLD(global_zone->zone_rootvp);
 865  867  
 866  868          /*
 867  869           * Notify the module code that it can begin using the
 868  870           * root filesystem instead of the boot program's services.
 869  871           */
 870  872          modrootloaded = 1;
 871  873  
 872  874          /*
 873  875           * Special handling for a ZFS root file system.
 874  876           */
 875  877          zfs_boot_init();
 876  878  
 877  879          /*
 878  880           * Set up mnttab information for root
 879  881           */
 880  882          vfs_setresource(rootvfs, rootfs.bo_name, 0);
 881  883  
 882  884          /*
 883  885           * Notify cluster software that the root filesystem is available.
 884  886           */
 885  887          clboot_mountroot();
 886  888  
 887  889          /* Now that we're all done with the root FS, set up its vopstats */
 888  890          if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
 889  891                  /* Set flag for statistics collection */
 890  892                  if (vswp->vsw_flag & VSW_STATS) {
 891  893                          initialize_vopstats(&rootvfs->vfs_vopstats);
 892  894                          rootvfs->vfs_flag |= VFS_STATS;
 893  895                          rootvfs->vfs_fstypevsp =
 894  896                              get_fstype_vopstats(rootvfs, vswp);
 895  897                          rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
 896  898                  }
 897  899                  vfs_unrefvfssw(vswp);
 898  900          }
 899  901  
 900  902          /*
 901  903           * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
 902  904           * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
 903  905           */
 904  906          vfs_mountdevices();
 905  907          vfs_mountdev1();
 906  908  
 907  909          vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
 908  910          vfs_mountfs("proc", "/proc", "/proc");
 909  911          vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
 910  912          vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
 911  913          vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
 912  914          vfs_mountfs("bootfs", "bootfs", "/system/boot");
 913  915  
 914  916          if (getzoneid() == GLOBAL_ZONEID) {
 915  917                  vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
 916  918          }
 917  919  
 918  920          if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
 919  921                  /*
 920  922                   * Look up the root device via devfs so that a dv_node is
 921  923                   * created for it. The vnode is never VN_RELE()ed.
 922  924                   * We allocate more than MAXPATHLEN so that the
 923  925                   * buffer passed to i_ddi_prompath_to_devfspath() is
 924  926                   * exactly MAXPATHLEN (the function expects a buffer
 925  927                   * of that length).
 926  928                   */
 927  929                  plen = strlen("/devices");
 928  930                  path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
 929  931                  (void) strcpy(path, "/devices");
 930  932  
 931  933                  if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
 932  934                      != DDI_SUCCESS ||
 933  935                      lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
 934  936  
 935  937                          /* NUL terminate in case "path" has garbage */
 936  938                          path[plen + MAXPATHLEN - 1] = '\0';
 937  939  #ifdef  DEBUG
 938  940                          cmn_err(CE_WARN, "!Cannot lookup root device: %s",
 939  941                              path);
 940  942  #endif
 941  943                  }
 942  944                  kmem_free(path, plen + MAXPATHLEN);
 943  945          }
 944  946  
 945  947          vfs_mnttabvp_setup();
 946  948  }
 947  949  
 948  950  /*
 949  951   * Check to see if our "block device" is actually a file.  If so,
 950  952   * automatically add a lofi device, and keep track of this fact.
 951  953   */
 952  954  static int
 953  955  lofi_add(const char *fsname, struct vfs *vfsp,
 954  956      mntopts_t *mntopts, struct mounta *uap)
 955  957  {
 956  958          int fromspace = (uap->flags & MS_SYSSPACE) ?
 957  959              UIO_SYSSPACE : UIO_USERSPACE;
 958  960          struct lofi_ioctl *li = NULL;
 959  961          struct vnode *vp = NULL;
 960  962          struct pathname pn = { NULL };
 961  963          ldi_ident_t ldi_id;
 962  964          ldi_handle_t ldi_hdl;
 963  965          vfssw_t *vfssw;
 964  966          int id;
 965  967          int err = 0;
 966  968  
 967  969          if ((vfssw = vfs_getvfssw(fsname)) == NULL)
 968  970                  return (0);
 969  971  
 970  972          if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
 971  973                  vfs_unrefvfssw(vfssw);
 972  974                  return (0);
 973  975          }
 974  976  
 975  977          vfs_unrefvfssw(vfssw);
 976  978          vfssw = NULL;
 977  979  
 978  980          if (pn_get(uap->spec, fromspace, &pn) != 0)
 979  981                  return (0);
 980  982  
 981  983          if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
 982  984                  goto out;
 983  985  
 984  986          if (vp->v_type != VREG)
 985  987                  goto out;
 986  988  
 987  989          /* OK, this is a lofi mount. */
 988  990  
 989  991          if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
 990  992              vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
 991  993              vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
 992  994              vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
 993  995                  err = EINVAL;
 994  996                  goto out;
 995  997          }
 996  998  
 997  999          ldi_id = ldi_ident_from_anon();
 998 1000          li = kmem_zalloc(sizeof (*li), KM_SLEEP);
 999 1001          (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1000 1002  
1001 1003          err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1002 1004              &ldi_hdl, ldi_id);
1003 1005  
1004 1006          if (err)
1005 1007                  goto out2;
1006 1008  
1007 1009          err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1008 1010              FREAD | FWRITE | FKIOCTL, kcred, &id);
1009 1011  
1010 1012          (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1011 1013  
1012 1014          if (!err)
1013 1015                  vfsp->vfs_lofi_id = id;
1014 1016  
1015 1017  out2:
1016 1018          ldi_ident_release(ldi_id);
1017 1019  out:
1018 1020          if (li != NULL)
1019 1021                  kmem_free(li, sizeof (*li));
1020 1022          if (vp != NULL)
1021 1023                  VN_RELE(vp);
1022 1024          pn_free(&pn);
1023 1025          return (err);
1024 1026  }
1025 1027  
1026 1028  static void
1027 1029  lofi_remove(struct vfs *vfsp)
1028 1030  {
1029 1031          struct lofi_ioctl *li = NULL;
1030 1032          ldi_ident_t ldi_id;
1031 1033          ldi_handle_t ldi_hdl;
1032 1034          int err;
1033 1035  
1034 1036          if (vfsp->vfs_lofi_id == 0)
1035 1037                  return;
1036 1038  
1037 1039          ldi_id = ldi_ident_from_anon();
1038 1040  
1039 1041          li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1040 1042          li->li_id = vfsp->vfs_lofi_id;
1041 1043          li->li_cleanup = B_TRUE;
1042 1044  
1043 1045          err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1044 1046              &ldi_hdl, ldi_id);
1045 1047  
1046 1048          if (err)
1047 1049                  goto out;
1048 1050  
1049 1051          err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1050 1052              FREAD | FWRITE | FKIOCTL, kcred, NULL);
1051 1053  
1052 1054          (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1053 1055  
1054 1056          if (!err)
1055 1057                  vfsp->vfs_lofi_id = 0;
1056 1058  
1057 1059  out:
1058 1060          ldi_ident_release(ldi_id);
1059 1061          if (li != NULL)
1060 1062                  kmem_free(li, sizeof (*li));
1061 1063  }
1062 1064  
1063 1065  /*
1064 1066   * Common mount code.  Called from the system call entry point, from autofs,
1065 1067   * nfsv4 trigger mounts, and from pxfs.
1066 1068   *
1067 1069   * Takes the effective file system type, mount arguments, the mount point
1068 1070   * vnode, flags specifying whether the mount is a remount and whether it
1069 1071   * should be entered into the vfs list, and credentials.  Fills in its vfspp
1070 1072   * parameter with the mounted file system instance's vfs.
1071 1073   *
1072 1074   * Note that the effective file system type is specified as a string.  It may
1073 1075   * be null, in which case it's determined from the mount arguments, and may
1074 1076   * differ from the type specified in the mount arguments; this is a hook to
1075 1077   * allow interposition when instantiating file system instances.
1076 1078   *
1077 1079   * The caller is responsible for releasing its own hold on the mount point
1078 1080   * vp (this routine does its own hold when necessary).
1079 1081   * Also note that for remounts, the mount point vp should be the vnode for
1080 1082   * the root of the file system rather than the vnode that the file system
1081 1083   * is mounted on top of.
1082 1084   */
1083 1085  int
1084 1086  domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1085 1087      struct vfs **vfspp)
1086 1088  {
1087 1089          struct vfssw    *vswp;
1088 1090          vfsops_t        *vfsops;
1089 1091          struct vfs      *vfsp;
1090 1092          struct vnode    *bvp;
1091 1093          dev_t           bdev = 0;
1092 1094          mntopts_t       mnt_mntopts;
1093 1095          int             error = 0;
1094 1096          int             copyout_error = 0;
1095 1097          int             ovflags;
1096 1098          char            *opts = uap->optptr;
1097 1099          char            *inargs = opts;
1098 1100          int             optlen = uap->optlen;
1099 1101          int             remount;
1100 1102          int             rdonly;
1101 1103          int             nbmand = 0;
1102 1104          int             delmip = 0;
1103 1105          int             addmip = 0;
1104 1106          int             splice = ((uap->flags & MS_NOSPLICE) == 0);
1105 1107          int             fromspace = (uap->flags & MS_SYSSPACE) ?
1106 1108              UIO_SYSSPACE : UIO_USERSPACE;
1107 1109          char            *resource = NULL, *mountpt = NULL;
1108 1110          refstr_t        *oldresource, *oldmntpt;
1109 1111          struct pathname pn, rpn;
1110 1112          vsk_anchor_t    *vskap;
1111 1113          char fstname[FSTYPSZ];
1112 1114          zone_t          *zone;
1113 1115  
1114 1116          /*
1115 1117           * The v_flag value for the mount point vp is permanently set
1116 1118           * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1117 1119           * for mount point locking.
1118 1120           */
1119 1121          mutex_enter(&vp->v_lock);
1120 1122          vp->v_flag |= VVFSLOCK;
1121 1123          mutex_exit(&vp->v_lock);
1122 1124  
1123 1125          mnt_mntopts.mo_count = 0;
1124 1126          /*
1125 1127           * Find the ops vector to use to invoke the file system-specific mount
1126 1128           * method.  If the fsname argument is non-NULL, use it directly.
1127 1129           * Otherwise, dig the file system type information out of the mount
1128 1130           * arguments.
1129 1131           *
1130 1132           * A side effect is to hold the vfssw entry.
1131 1133           *
1132 1134           * Mount arguments can be specified in several ways, which are
1133 1135           * distinguished by flag bit settings.  The preferred way is to set
1134 1136           * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1135 1137           * type supplied as a character string and the last two arguments
1136 1138           * being a pointer to a character buffer and the size of the buffer.
1137 1139           * On entry, the buffer holds a null terminated list of options; on
1138 1140           * return, the string is the list of options the file system
1139 1141           * recognized. If MS_DATA is set arguments five and six point to a
1140 1142           * block of binary data which the file system interprets.
1141 1143           * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1142 1144           * consistently with these conventions.  To handle them, we check to
1143 1145           * see whether the pointer to the file system name has a numeric value
1144 1146           * less than 256.  If so, we treat it as an index.
1145 1147           */
1146 1148          if (fsname != NULL) {
1147 1149                  if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1148 1150                          return (EINVAL);
1149 1151                  }
1150 1152          } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1151 1153                  size_t n;
1152 1154                  uint_t fstype;
1153 1155  
1154 1156                  fsname = fstname;
1155 1157  
1156 1158                  if ((fstype = (uintptr_t)uap->fstype) < 256) {
1157 1159                          RLOCK_VFSSW();
1158 1160                          if (fstype == 0 || fstype >= nfstype ||
1159 1161                              !ALLOCATED_VFSSW(&vfssw[fstype])) {
1160 1162                                  RUNLOCK_VFSSW();
1161 1163                                  return (EINVAL);
1162 1164                          }
1163 1165                          (void) strcpy(fsname, vfssw[fstype].vsw_name);
1164 1166                          RUNLOCK_VFSSW();
1165 1167                          if ((vswp = vfs_getvfssw(fsname)) == NULL)
1166 1168                                  return (EINVAL);
1167 1169                  } else {
1168 1170                          /*
1169 1171                           * Handle either kernel or user address space.
1170 1172                           */
1171 1173                          if (uap->flags & MS_SYSSPACE) {
1172 1174                                  error = copystr(uap->fstype, fsname,
1173 1175                                      FSTYPSZ, &n);
1174 1176                          } else {
1175 1177                                  error = copyinstr(uap->fstype, fsname,
1176 1178                                      FSTYPSZ, &n);
1177 1179                          }
1178 1180                          if (error) {
1179 1181                                  if (error == ENAMETOOLONG)
1180 1182                                          return (EINVAL);
1181 1183                                  return (error);
1182 1184                          }
1183 1185                          if ((vswp = vfs_getvfssw(fsname)) == NULL)
1184 1186                                  return (EINVAL);
1185 1187                  }
1186 1188          } else {
1187 1189                  if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1188 1190                          return (EINVAL);
1189 1191                  fsname = vswp->vsw_name;
1190 1192          }
1191 1193          if (!VFS_INSTALLED(vswp))
1192 1194                  return (EINVAL);
1193 1195  
1194 1196          if ((error = secpolicy_fs_allowed_mount(fsname)) != 0)  {
1195 1197                  vfs_unrefvfssw(vswp);
1196 1198                  return (error);
1197 1199          }
1198 1200  
1199 1201          vfsops = &vswp->vsw_vfsops;
1200 1202  
1201 1203          vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1202 1204          /*
1203 1205           * Fetch mount options and parse them for generic vfs options
1204 1206           */
1205 1207          if (uap->flags & MS_OPTIONSTR) {
1206 1208                  /*
1207 1209                   * Limit the buffer size
1208 1210                   */
1209 1211                  if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1210 1212                          error = EINVAL;
1211 1213                          goto errout;
1212 1214                  }
1213 1215                  if ((uap->flags & MS_SYSSPACE) == 0) {
1214 1216                          inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1215 1217                          inargs[0] = '\0';
1216 1218                          if (optlen) {
1217 1219                                  error = copyinstr(opts, inargs, (size_t)optlen,
1218 1220                                      NULL);
1219 1221                                  if (error) {
1220 1222                                          goto errout;
1221 1223                                  }
1222 1224                          }
1223 1225                  }
1224 1226                  vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1225 1227          }
1226 1228          /*
1227 1229           * Flag bits override the options string.
1228 1230           */
1229 1231          if (uap->flags & MS_REMOUNT)
1230 1232                  vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1231 1233          if (uap->flags & MS_RDONLY)
1232 1234                  vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1233 1235          if (uap->flags & MS_NOSUID)
1234 1236                  vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1235 1237  
1236 1238          /*
1237 1239           * Check if this is a remount; must be set in the option string and
1238 1240           * the file system must support a remount option.
1239 1241           */
1240 1242          if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1241 1243              MNTOPT_REMOUNT, NULL)) {
1242 1244                  if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1243 1245                          error = ENOTSUP;
1244 1246                          goto errout;
1245 1247                  }
1246 1248                  uap->flags |= MS_REMOUNT;
1247 1249          }
1248 1250  
1249 1251          /*
1250 1252           * uap->flags and vfs_optionisset() should agree.
1251 1253           */
1252 1254          if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1253 1255                  uap->flags |= MS_RDONLY;
1254 1256          }
1255 1257          if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1256 1258                  uap->flags |= MS_NOSUID;
1257 1259          }
1258 1260          nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1259 1261          ASSERT(splice || !remount);
1260 1262          /*
1261 1263           * If we are splicing the fs into the namespace,
1262 1264           * perform mount point checks.
1263 1265           *
1264 1266           * We want to resolve the path for the mount point to eliminate
1265 1267           * '.' and ".." and symlinks in mount points; we can't do the
1266 1268           * same for the resource string, since it would turn
1267 1269           * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1268 1270           * this before grabbing vn_vfswlock(), because otherwise we
1269 1271           * would deadlock with lookuppn().
1270 1272           */
1271 1273          if (splice) {
1272 1274                  ASSERT(vp->v_count > 0);
1273 1275  
1274 1276                  /*
1275 1277                   * Pick up mount point and device from appropriate space.
1276 1278                   */
1277 1279                  if (pn_get(uap->spec, fromspace, &pn) == 0) {
1278 1280                          resource = kmem_alloc(pn.pn_pathlen + 1,
1279 1281                              KM_SLEEP);
1280 1282                          (void) strcpy(resource, pn.pn_path);
1281 1283                          pn_free(&pn);
1282 1284                  }
1283 1285                  /*
1284 1286                   * Do a lookupname prior to taking the
1285 1287                   * writelock. Mark this as completed if
1286 1288                   * successful for later cleanup and addition to
1287 1289                   * the mount in progress table.
1288 1290                   */
1289 1291                  if ((uap->flags & MS_GLOBAL) == 0 &&
1290 1292                      lookupname(uap->spec, fromspace,
1291 1293                      FOLLOW, NULL, &bvp) == 0) {
1292 1294                          addmip = 1;
1293 1295                  }
1294 1296  
1295 1297                  if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1296 1298                          pathname_t *pnp;
1297 1299  
1298 1300                          if (*pn.pn_path != '/') {
1299 1301                                  error = EINVAL;
1300 1302                                  pn_free(&pn);
1301 1303                                  goto errout;
1302 1304                          }
1303 1305                          pn_alloc(&rpn);
1304 1306                          /*
1305 1307                           * Kludge to prevent autofs from deadlocking with
1306 1308                           * itself when it calls domount().
1307 1309                           *
1308 1310                           * If autofs is calling, it is because it is doing
1309 1311                           * (autofs) mounts in the process of an NFS mount.  A
1310 1312                           * lookuppn() here would cause us to block waiting for
1311 1313                           * said NFS mount to complete, which can't since this
1312 1314                           * is the thread that was supposed to be doing it.
1313 1315                           */
1314 1316                          if (fromspace == UIO_USERSPACE) {
1315 1317                                  if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1316 1318                                      NULL)) == 0) {
1317 1319                                          pnp = &rpn;
1318 1320                                  } else {
1319 1321                                          /*
1320 1322                                           * The file disappeared or otherwise
1321 1323                                           * became inaccessible since we opened
1322 1324                                           * it; might as well fail the mount
1323 1325                                           * since the mount point is no longer
1324 1326                                           * accessible.
1325 1327                                           */
1326 1328                                          pn_free(&rpn);
1327 1329                                          pn_free(&pn);
1328 1330                                          goto errout;
1329 1331                                  }
1330 1332                          } else {
1331 1333                                  pnp = &pn;
1332 1334                          }
1333 1335                          mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1334 1336                          (void) strcpy(mountpt, pnp->pn_path);
1335 1337  
1336 1338                          /*
1337 1339                           * If the addition of the zone's rootpath
1338 1340                           * would push us over a total path length
1339 1341                           * of MAXPATHLEN, we fail the mount with
1340 1342                           * ENAMETOOLONG, which is what we would have
1341 1343                           * gotten if we were trying to perform the same
1342 1344                           * mount in the global zone.
1343 1345                           *
1344 1346                           * strlen() doesn't count the trailing
1345 1347                           * '\0', but zone_rootpathlen counts both a
1346 1348                           * trailing '/' and the terminating '\0'.
1347 1349                           */
1348 1350                          if ((curproc->p_zone->zone_rootpathlen - 1 +
1349 1351                              strlen(mountpt)) > MAXPATHLEN ||
1350 1352                              (resource != NULL &&
1351 1353                              (curproc->p_zone->zone_rootpathlen - 1 +
1352 1354                              strlen(resource)) > MAXPATHLEN)) {
1353 1355                                  error = ENAMETOOLONG;
1354 1356                          }
1355 1357  
1356 1358                          pn_free(&rpn);
1357 1359                          pn_free(&pn);
1358 1360                  }
1359 1361  
1360 1362                  if (error)
1361 1363                          goto errout;
1362 1364  
1363 1365                  /*
1364 1366                   * Prevent path name resolution from proceeding past
1365 1367                   * the mount point.
1366 1368                   */
1367 1369                  if (vn_vfswlock(vp) != 0) {
1368 1370                          error = EBUSY;
1369 1371                          goto errout;
1370 1372                  }
1371 1373  
1372 1374                  /*
1373 1375                   * Verify that it's legitimate to establish a mount on
1374 1376                   * the prospective mount point.
1375 1377                   */
1376 1378                  if (vn_mountedvfs(vp) != NULL) {
1377 1379                          /*
1378 1380                           * The mount point lock was obtained after some
1379 1381                           * other thread raced through and established a mount.
1380 1382                           */
1381 1383                          vn_vfsunlock(vp);
1382 1384                          error = EBUSY;
1383 1385                          goto errout;
1384 1386                  }
1385 1387                  if (vp->v_flag & VNOMOUNT) {
1386 1388                          vn_vfsunlock(vp);
1387 1389                          error = EINVAL;
1388 1390                          goto errout;
1389 1391                  }
1390 1392          }
1391 1393          if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1392 1394                  uap->dataptr = NULL;
1393 1395                  uap->datalen = 0;
1394 1396          }
1395 1397  
1396 1398          /*
1397 1399           * If this is a remount, we don't want to create a new VFS.
1398 1400           * Instead, we pass the existing one with a remount flag.
1399 1401           */
1400 1402          if (remount) {
1401 1403                  /*
1402 1404                   * Confirm that the mount point is the root vnode of the
1403 1405                   * file system that is being remounted.
1404 1406                   * This can happen if the user specifies a different
1405 1407                   * mount point directory pathname in the (re)mount command.
1406 1408                   *
1407 1409                   * Code below can only be reached if splice is true, so it's
1408 1410                   * safe to do vn_vfsunlock() here.
1409 1411                   */
1410 1412                  if ((vp->v_flag & VROOT) == 0) {
1411 1413                          vn_vfsunlock(vp);
1412 1414                          error = ENOENT;
1413 1415                          goto errout;
1414 1416                  }
1415 1417                  /*
1416 1418                   * Disallow making file systems read-only unless file system
1417 1419                   * explicitly allows it in its vfssw.  Ignore other flags.
1418 1420                   */
1419 1421                  if (rdonly && vn_is_readonly(vp) == 0 &&
1420 1422                      (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1421 1423                          vn_vfsunlock(vp);
1422 1424                          error = EINVAL;
1423 1425                          goto errout;
1424 1426                  }
1425 1427                  /*
1426 1428                   * Disallow changing the NBMAND disposition of the file
1427 1429                   * system on remounts.
1428 1430                   */
1429 1431                  if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1430 1432                      (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1431 1433                          vn_vfsunlock(vp);
1432 1434                          error = EINVAL;
1433 1435                          goto errout;
1434 1436                  }
1435 1437                  vfsp = vp->v_vfsp;
1436 1438                  ovflags = vfsp->vfs_flag;
1437 1439                  vfsp->vfs_flag |= VFS_REMOUNT;
1438 1440                  vfsp->vfs_flag &= ~VFS_RDONLY;
1439 1441          } else {
1440 1442                  vfsp = vfs_alloc(KM_SLEEP);
1441 1443                  VFS_INIT(vfsp, vfsops, NULL);
1442 1444          }
1443 1445  
1444 1446          VFS_HOLD(vfsp);
1445 1447  
1446 1448          if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1447 1449                  if (!remount) {
1448 1450                          if (splice)
1449 1451                                  vn_vfsunlock(vp);
1450 1452                          vfs_free(vfsp);
1451 1453                  } else {
1452 1454                          vn_vfsunlock(vp);
1453 1455                          VFS_RELE(vfsp);
1454 1456                  }
1455 1457                  goto errout;
1456 1458          }
1457 1459  
1458 1460          /*
1459 1461           * PRIV_SYS_MOUNT doesn't mean you can become root.
1460 1462           */
1461 1463          if (vfsp->vfs_lofi_id != 0) {
1462 1464                  uap->flags |= MS_NOSUID;
1463 1465                  vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1464 1466          }
1465 1467  
1466 1468          /*
1467 1469           * The vfs_reflock is not used anymore; the code below explicitly
1468 1470           * holds it, preventing others from accessing it directly.
1469 1471           */
1470 1472          if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1471 1473              !(vfsp->vfs_flag & VFS_REMOUNT))
1472 1474                  cmn_err(CE_WARN,
1473 1475                      "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1474 1476  
1475 1477          /*
1476 1478           * Lock the vfs. If this is a remount we want to avoid spurious umount
1477 1479           * failures that happen as a side-effect of fsflush() and other mount
1478 1480           * and unmount operations that might be going on simultaneously and
1479 1481           * may have locked the vfs currently. To not return EBUSY immediately,
1480 1482           * we use vfs_lock_wait() instead of vfs_lock() for the remount case.
1481 1483           */
1482 1484          if (!remount) {
1483 1485                  if (error = vfs_lock(vfsp)) {
1484 1486                          vfsp->vfs_flag = ovflags;
1485 1487  
1486 1488                          lofi_remove(vfsp);
1487 1489  
1488 1490                          if (splice)
1489 1491                                  vn_vfsunlock(vp);
1490 1492                          vfs_free(vfsp);
1491 1493                          goto errout;
1492 1494                  }
1493 1495          } else {
1494 1496                  vfs_lock_wait(vfsp);
1495 1497          }
1496 1498  
1497 1499          /*
1498 1500           * Add device to mount in progress table, global mounts require special
1499 1501           * handling. It is possible that we have already done the lookupname
1500 1502           * on a spliced, non-global fs. If so, we don't want to do it again
1501 1503           * since we cannot do a lookupname after taking the
1502 1504           * wlock above. This case is for a non-spliced, non-global filesystem.
1503 1505           */
1504 1506          if (!addmip) {
1505 1507                  if ((uap->flags & MS_GLOBAL) == 0 &&
1506 1508                      lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1507 1509                          addmip = 1;
1508 1510                  }
1509 1511          }
1510 1512  
1511 1513          if (addmip) {
1512 1514                  vnode_t *lvp = NULL;
1513 1515  
1514 1516                  error = vfs_get_lofi(vfsp, &lvp);
1515 1517                  if (error > 0) {
1516 1518                          lofi_remove(vfsp);
1517 1519  
1518 1520                          if (splice)
1519 1521                                  vn_vfsunlock(vp);
1520 1522                          vfs_unlock(vfsp);
1521 1523  
1522 1524                          if (remount) {
1523 1525                                  VFS_RELE(vfsp);
1524 1526                          } else {
1525 1527                                  vfs_free(vfsp);
1526 1528                          }
1527 1529  
1528 1530                          goto errout;
1529 1531                  } else if (error == -1) {
1530 1532                          bdev = bvp->v_rdev;
1531 1533                          VN_RELE(bvp);
1532 1534                  } else {
1533 1535                          bdev = lvp->v_rdev;
1534 1536                          VN_RELE(lvp);
1535 1537                          VN_RELE(bvp);
1536 1538                  }
1537 1539  
1538 1540                  vfs_addmip(bdev, vfsp);
1539 1541                  addmip = 0;
1540 1542                  delmip = 1;
1541 1543          }
1542 1544          /*
1543 1545           * Invalidate cached entry for the mount point.
1544 1546           */
1545 1547          if (splice)
1546 1548                  dnlc_purge_vp(vp);
1547 1549  
1548 1550          /*
1549 1551           * If we have an option string but the filesystem doesn't supply a
1550 1552           * prototype options table, create a table with the global
1551 1553           * options and sufficient room to accept all the options in the
1552 1554           * string.  Then parse the passed in option string
1553 1555           * accepting all the options in the string.  This gives us an
1554 1556           * option table with all the proper cancel properties for the
1555 1557           * global options.
1556 1558           *
1557 1559           * Filesystems that supply a prototype options table are handled
1558 1560           * earlier in this function.
1559 1561           */
1560 1562          if (uap->flags & MS_OPTIONSTR) {
1561 1563                  if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1562 1564                          mntopts_t tmp_mntopts;
1563 1565  
1564 1566                          tmp_mntopts.mo_count = 0;
1565 1567                          vfs_createopttbl_extend(&tmp_mntopts, inargs,
1566 1568                              &mnt_mntopts);
1567 1569                          vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1568 1570                          vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1569 1571                          vfs_freeopttbl(&tmp_mntopts);
1570 1572                  }
1571 1573          }
1572 1574  
1573 1575          /*
1574 1576           * Serialize with zone state transitions.
1575 1577           * See vfs_list_add; zone mounted into is:
1576 1578           *      zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1577 1579           * not the zone doing the mount (curproc->p_zone), but if we're already
1578 1580           * inside a NGZ, then we know what zone we are.
1579 1581           */
1580 1582          if (INGLOBALZONE(curproc)) {
1581 1583                  zone = zone_find_by_path(mountpt);
1582 1584                  ASSERT(zone != NULL);
1583 1585          } else {
1584 1586                  zone = curproc->p_zone;
1585 1587                  /*
1586 1588                   * zone_find_by_path does a hold, so do one here too so that
1587 1589                   * we can do a zone_rele after mount_completed.
1588 1590                   */
1589 1591                  zone_hold(zone);
1590 1592          }
1591 1593          mount_in_progress(zone);
1592 1594          /*
1593 1595           * Instantiate (or reinstantiate) the file system.  If appropriate,
1594 1596           * splice it into the file system name space.
1595 1597           *
1596 1598           * We want VFS_MOUNT() to be able to override the vfs_resource
1597 1599           * string if necessary (ie, mntfs), and also for a remount to
1598 1600           * change the same (necessary when remounting '/' during boot).
1599 1601           * So we set up vfs_mntpt and vfs_resource to what we think they
1600 1602           * should be, then hand off control to VFS_MOUNT() which can
1601 1603           * override this.
1602 1604           *
1603 1605           * For safety's sake, when changing vfs_resource or vfs_mntpt of
1604 1606           * a vfs which is on the vfs list (i.e. during a remount), we must
1605 1607           * never set those fields to NULL. Several bits of code make
1606 1608           * assumptions that the fields are always valid.
1607 1609           */
1608 1610          vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1609 1611          if (remount) {
1610 1612                  if ((oldresource = vfsp->vfs_resource) != NULL)
1611 1613                          refstr_hold(oldresource);
1612 1614                  if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1613 1615                          refstr_hold(oldmntpt);
1614 1616          }
1615 1617          vfs_setresource(vfsp, resource, 0);
1616 1618          vfs_setmntpoint(vfsp, mountpt, 0);
1617 1619  
1618 1620          /*
1619 1621           * going to mount on this vnode, so notify.
1620 1622           */
1621 1623          vnevent_mountedover(vp, NULL);
1622 1624          error = VFS_MOUNT(vfsp, vp, uap, credp);
1623 1625  
1624 1626          if (uap->flags & MS_RDONLY)
1625 1627                  vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1626 1628          if (uap->flags & MS_NOSUID)
1627 1629                  vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1628 1630          if (uap->flags & MS_GLOBAL)
1629 1631                  vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1630 1632  
1631 1633          if (error) {
1632 1634                  lofi_remove(vfsp);
1633 1635  
1634 1636                  if (remount) {
1635 1637                          /* put back pre-remount options */
1636 1638                          vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1637 1639                          vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1638 1640                              VFSSP_VERBATIM);
1639 1641                          if (oldmntpt)
1640 1642                                  refstr_rele(oldmntpt);
1641 1643                          vfs_setresource(vfsp, refstr_value(oldresource),
1642 1644                              VFSSP_VERBATIM);
1643 1645                          if (oldresource)
1644 1646                                  refstr_rele(oldresource);
1645 1647                          vfsp->vfs_flag = ovflags;
1646 1648                          vfs_unlock(vfsp);
1647 1649                          VFS_RELE(vfsp);
1648 1650                  } else {
1649 1651                          vfs_unlock(vfsp);
1650 1652                          vfs_freemnttab(vfsp);
1651 1653                          vfs_free(vfsp);
1652 1654                  }
1653 1655          } else {
1654 1656                  /*
1655 1657                   * Set the mount time to now
1656 1658                   */
1657 1659                  vfsp->vfs_mtime = ddi_get_time();
1658 1660                  if (remount) {
1659 1661                          vfsp->vfs_flag &= ~VFS_REMOUNT;
1660 1662                          if (oldresource)
1661 1663                                  refstr_rele(oldresource);
1662 1664                          if (oldmntpt)
1663 1665                                  refstr_rele(oldmntpt);
1664 1666                  } else if (splice) {
1665 1667                          /*
1666 1668                           * Link vfsp into the name space at the mount
1667 1669                           * point. Vfs_add() is responsible for
1668 1670                           * holding the mount point which will be
1669 1671                           * released when vfs_remove() is called.
1670 1672                           */
1671 1673                          vfs_add(vp, vfsp, uap->flags);
1672 1674                  } else {
1673 1675                          /*
1674 1676                           * Hold the reference to file system which is
1675 1677                           * not linked into the name space.
1676 1678                           */
1677 1679                          vfsp->vfs_zone = NULL;
1678 1680                          VFS_HOLD(vfsp);
1679 1681                          vfsp->vfs_vnodecovered = NULL;
1680 1682                  }
1681 1683                  /*
1682 1684                   * Set flags for global options encountered
1683 1685                   */
1684 1686                  if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1685 1687                          vfsp->vfs_flag |= VFS_RDONLY;
1686 1688                  else
1687 1689                          vfsp->vfs_flag &= ~VFS_RDONLY;
1688 1690                  if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1689 1691                          vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1690 1692                  } else {
1691 1693                          if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1692 1694                                  vfsp->vfs_flag |= VFS_NODEVICES;
1693 1695                          else
1694 1696                                  vfsp->vfs_flag &= ~VFS_NODEVICES;
1695 1697                          if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1696 1698                                  vfsp->vfs_flag |= VFS_NOSETUID;
1697 1699                          else
1698 1700                                  vfsp->vfs_flag &= ~VFS_NOSETUID;
1699 1701                  }
1700 1702                  if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1701 1703                          vfsp->vfs_flag |= VFS_NBMAND;
1702 1704                  else
1703 1705                          vfsp->vfs_flag &= ~VFS_NBMAND;
1704 1706  
1705 1707                  if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1706 1708                          vfsp->vfs_flag |= VFS_XATTR;
1707 1709                  else
1708 1710                          vfsp->vfs_flag &= ~VFS_XATTR;
1709 1711  
1710 1712                  if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1711 1713                          vfsp->vfs_flag |= VFS_NOEXEC;
1712 1714                  else
1713 1715                          vfsp->vfs_flag &= ~VFS_NOEXEC;
1714 1716  
1715 1717                  /*
1716 1718                   * Now construct the output option string of options
1717 1719                   * we recognized.
1718 1720                   */
1719 1721                  if (uap->flags & MS_OPTIONSTR) {
1720 1722                          vfs_list_read_lock();
1721 1723                          copyout_error = vfs_buildoptionstr(
1722 1724                              &vfsp->vfs_mntopts, inargs, optlen);
1723 1725                          vfs_list_unlock();
1724 1726                          if (copyout_error == 0 &&
1725 1727                              (uap->flags & MS_SYSSPACE) == 0) {
1726 1728                                  copyout_error = copyoutstr(inargs, opts,
1727 1729                                      optlen, NULL);
1728 1730                          }
1729 1731                  }
1730 1732  
1731 1733                  /*
1732 1734                   * If this isn't a remount, set up the vopstats before
1733 1735                   * anyone can touch this. We only allow spliced file
1734 1736                   * systems (file systems which are in the namespace) to
1735 1737                   * have the VFS_STATS flag set.
1736 1738                   * NOTE: PxFS mounts the underlying file system with
1737 1739                   * MS_NOSPLICE set and copies those vfs_flags to its private
1738 1740                   * vfs structure. As a result, PxFS should never have
1739 1741                   * the VFS_STATS flag or else we might access the vfs
1740 1742                   * statistics-related fields prior to them being
1741 1743                   * properly initialized.
1742 1744                   */
1743 1745                  if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1744 1746                          initialize_vopstats(&vfsp->vfs_vopstats);
1745 1747                          /*
1746 1748                           * We need to set vfs_vskap to NULL because there's
1747 1749                           * a chance it won't be set below.  This is checked
1748 1750                           * in teardown_vopstats() so we can't have garbage.
1749 1751                           */
1750 1752                          vfsp->vfs_vskap = NULL;
1751 1753                          vfsp->vfs_flag |= VFS_STATS;
1752 1754                          vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1753 1755                  }
1754 1756  
1755 1757                  if (vswp->vsw_flag & VSW_XID)
1756 1758                          vfsp->vfs_flag |= VFS_XID;
1757 1759  
1758 1760                  vfs_unlock(vfsp);
1759 1761          }
1760 1762          mount_completed(zone);
1761 1763          zone_rele(zone);
1762 1764          if (splice)
1763 1765                  vn_vfsunlock(vp);
1764 1766  
1765 1767          if ((error == 0) && (copyout_error == 0)) {
1766 1768                  if (!remount) {
1767 1769                          /*
1768 1770                           * Don't call get_vskstat_anchor() while holding
1769 1771                           * locks since it allocates memory and calls
1770 1772                           * VFS_STATVFS().  For NFS, the latter can generate
1771 1773                           * an over-the-wire call.
1772 1774                           */
1773 1775                          vskap = get_vskstat_anchor(vfsp);
1774 1776                          /* Only take the lock if we have something to do */
1775 1777                          if (vskap != NULL) {
1776 1778                                  vfs_lock_wait(vfsp);
1777 1779                                  if (vfsp->vfs_flag & VFS_STATS) {
1778 1780                                          vfsp->vfs_vskap = vskap;
1779 1781                                  }
1780 1782                                  vfs_unlock(vfsp);
1781 1783                          }
1782 1784                  }
1783 1785                  /* Return vfsp to caller. */
1784 1786                  *vfspp = vfsp;
1785 1787          }
1786 1788  errout:
1787 1789          vfs_freeopttbl(&mnt_mntopts);
1788 1790          if (resource != NULL)
1789 1791                  kmem_free(resource, strlen(resource) + 1);
1790 1792          if (mountpt != NULL)
1791 1793                  kmem_free(mountpt, strlen(mountpt) + 1);
1792 1794          /*
1793 1795           * It is possible we errored prior to adding to mount in progress
1794 1796           * table. Must free vnode we acquired with successful lookupname.
1795 1797           */
1796 1798          if (addmip)
1797 1799                  VN_RELE(bvp);
1798 1800          if (delmip)
1799 1801                  vfs_delmip(vfsp);
1800 1802          ASSERT(vswp != NULL);
1801 1803          vfs_unrefvfssw(vswp);
1802 1804          if (inargs != opts)
1803 1805                  kmem_free(inargs, MAX_MNTOPT_STR);
1804 1806          if (copyout_error) {
1805 1807                  lofi_remove(vfsp);
1806 1808                  VFS_RELE(vfsp);
1807 1809                  error = copyout_error;
1808 1810          }
1809 1811          return (error);
1810 1812  }
1811 1813  
1812 1814  static void
1813 1815  vfs_setpath(
1814 1816      struct vfs *vfsp,           /* vfs being updated */
1815 1817      refstr_t **refp,            /* Ref-count string to contain the new path */
1816 1818      const char *newpath,        /* Path to add to refp (above) */
1817 1819      uint32_t flag)              /* flag */
1818 1820  {
1819 1821          size_t len;
1820 1822          refstr_t *ref;
1821 1823          zone_t *zone = curproc->p_zone;
1822 1824          char *sp;
1823 1825          int have_list_lock = 0;
1824 1826  
1825 1827          ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1826 1828  
1827 1829          /*
1828 1830           * New path must be less than MAXPATHLEN because mntfs
1829 1831           * will only display up to MAXPATHLEN bytes. This is currently
1830 1832           * safe, because domount() uses pn_get(), and other callers
1831 1833           * similarly cap the size to fewer than MAXPATHLEN bytes.
1832 1834           */
1833 1835  
1834 1836          ASSERT(strlen(newpath) < MAXPATHLEN);
1835 1837  
1836 1838          /* mntfs requires consistency while vfs list lock is held */
1837 1839  
1838 1840          if (VFS_ON_LIST(vfsp)) {
1839 1841                  have_list_lock = 1;
1840 1842                  vfs_list_lock();
1841 1843          }
1842 1844  
1843 1845          if (*refp != NULL)
1844 1846                  refstr_rele(*refp);
1845 1847  
1846 1848          /*
1847 1849           * If we are in a non-global zone then we prefix the supplied path,
1848 1850           * newpath, with the zone's root path, with two exceptions. The first
1849 1851           * is where we have been explicitly directed to avoid doing so; this
1850 1852           * will be the case following a failed remount, where the path supplied
1851 1853           * will be a saved version which must now be restored. The second
1852 1854           * exception is where newpath is not a pathname but a descriptive name,
1853 1855           * e.g. "procfs".
1854 1856           */
1855 1857          if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1856 1858                  ref = refstr_alloc(newpath);
1857 1859                  goto out;
1858 1860          }
1859 1861  
1860 1862          /*
1861 1863           * Truncate the trailing '/' in the zoneroot, and merge
1862 1864           * in the zone's rootpath with the "newpath" (resource
1863 1865           * or mountpoint) passed in.
1864 1866           *
1865 1867           * The size of the required buffer is thus the size of
1866 1868           * the buffer required for the passed-in newpath
1867 1869           * (strlen(newpath) + 1), plus the size of the buffer
1868 1870           * required to hold zone_rootpath (zone_rootpathlen)
1869 1871           * minus one for one of the now-superfluous NUL
1870 1872           * terminations, minus one for the trailing '/'.
1871 1873           *
1872 1874           * That gives us:
1873 1875           *
1874 1876           * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1875 1877           *
1876 1878           * Which is what we have below.
1877 1879           */
1878 1880  
1879 1881          len = strlen(newpath) + zone->zone_rootpathlen - 1;
1880 1882          sp = kmem_alloc(len, KM_SLEEP);
1881 1883  
1882 1884          /*
1883 1885           * Copy everything including the trailing slash, which
1884 1886           * we then overwrite with the NUL character.
1885 1887           */
1886 1888  
1887 1889          (void) strcpy(sp, zone->zone_rootpath);
1888 1890          sp[zone->zone_rootpathlen - 2] = '\0';
1889 1891          (void) strcat(sp, newpath);
1890 1892  
1891 1893          ref = refstr_alloc(sp);
1892 1894          kmem_free(sp, len);
1893 1895  out:
1894 1896          *refp = ref;
1895 1897  
1896 1898          if (have_list_lock) {
1897 1899                  vfs_mnttab_modtimeupd();
1898 1900                  vfs_list_unlock();
1899 1901          }
1900 1902  }
1901 1903  
1902 1904  /*
1903 1905   * Record a mounted resource name in a vfs structure.
1904 1906   * If vfsp is already mounted, caller must hold the vfs lock.
1905 1907   */
1906 1908  void
1907 1909  vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1908 1910  {
1909 1911          if (resource == NULL || resource[0] == '\0')
1910 1912                  resource = VFS_NORESOURCE;
1911 1913          vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1912 1914  }
1913 1915  
1914 1916  /*
1915 1917   * Record a mount point name in a vfs structure.
1916 1918   * If vfsp is already mounted, caller must hold the vfs lock.
1917 1919   */
1918 1920  void
1919 1921  vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1920 1922  {
1921 1923          if (mntpt == NULL || mntpt[0] == '\0')
1922 1924                  mntpt = VFS_NOMNTPT;
1923 1925          vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1924 1926  }
1925 1927  
1926 1928  /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1927 1929  
1928 1930  refstr_t *
1929 1931  vfs_getresource(const struct vfs *vfsp)
1930 1932  {
1931 1933          refstr_t *resource;
1932 1934  
1933 1935          vfs_list_read_lock();
1934 1936          resource = vfsp->vfs_resource;
1935 1937          refstr_hold(resource);
1936 1938          vfs_list_unlock();
1937 1939  
1938 1940          return (resource);
1939 1941  }
1940 1942  
1941 1943  /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1942 1944  
1943 1945  refstr_t *
1944 1946  vfs_getmntpoint(const struct vfs *vfsp)
1945 1947  {
1946 1948          refstr_t *mntpt;
1947 1949  
1948 1950          vfs_list_read_lock();
1949 1951          mntpt = vfsp->vfs_mntpt;
1950 1952          refstr_hold(mntpt);
1951 1953          vfs_list_unlock();
1952 1954  
1953 1955          return (mntpt);
1954 1956  }
1955 1957  
1956 1958  /*
1957 1959   * Create an empty options table with enough empty slots to hold all
1958 1960   * The options in the options string passed as an argument.
1959 1961   * Potentially prepend another options table.
1960 1962   *
1961 1963   * Note: caller is responsible for locking the vfs list, if needed,
1962 1964   *       to protect mops.
1963 1965   */
1964 1966  static void
1965 1967  vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1966 1968      const mntopts_t *mtmpl)
1967 1969  {
1968 1970          const char *s = opts;
1969 1971          uint_t count;
1970 1972  
1971 1973          if (opts == NULL || *opts == '\0') {
1972 1974                  count = 0;
1973 1975          } else {
1974 1976                  count = 1;
1975 1977  
1976 1978                  /*
1977 1979                   * Count number of options in the string
1978 1980                   */
1979 1981                  for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1980 1982                          count++;
1981 1983                          s++;
1982 1984                  }
1983 1985          }
1984 1986          vfs_copyopttbl_extend(mtmpl, mops, count);
1985 1987  }
1986 1988  
/*
 * Create an empty options table with enough empty slots to hold all
 * the options in the options string passed as an argument.  No template
 * table is prepended: vfs_createopttbl_extend() is called with a NULL
 * template.
 *
 * This function is *not* for general use by filesystems.
 *
 * Note: caller is responsible for locking the vfs list, if needed,
 *       to protect mops.
 */
void
vfs_createopttbl(mntopts_t *mops, const char *opts)
{
        vfs_createopttbl_extend(mops, opts, NULL);
}
2001 2003  
2002 2004  
2003 2005  /*
2004 2006   * Swap two mount options tables
2005 2007   */
2006 2008  static void
2007 2009  vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2008 2010  {
2009 2011          uint_t tmpcnt;
2010 2012          mntopt_t *tmplist;
2011 2013  
2012 2014          tmpcnt = optbl2->mo_count;
2013 2015          tmplist = optbl2->mo_list;
2014 2016          optbl2->mo_count = optbl1->mo_count;
2015 2017          optbl2->mo_list = optbl1->mo_list;
2016 2018          optbl1->mo_count = tmpcnt;
2017 2019          optbl1->mo_list = tmplist;
2018 2020  }
2019 2021  
/*
 * Swap two mount options tables with the vfs list lock held, then call
 * vfs_mnttab_modtimeupd() before dropping the lock.
 */
static void
vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
{
        vfs_list_lock();
        vfs_swapopttbl_nolock(optbl1, optbl2);
        vfs_mnttab_modtimeupd();
        vfs_list_unlock();
}
2028 2030  
2029 2031  static char **
2030 2032  vfs_copycancelopt_extend(char **const moc, int extend)
2031 2033  {
2032 2034          int i = 0;
2033 2035          int j;
2034 2036          char **result;
2035 2037  
2036 2038          if (moc != NULL) {
2037 2039                  for (; moc[i] != NULL; i++)
2038 2040                          /* count number of options to cancel */;
2039 2041          }
2040 2042  
2041 2043          if (i + extend == 0)
2042 2044                  return (NULL);
2043 2045  
2044 2046          result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2045 2047  
2046 2048          for (j = 0; j < i; j++) {
2047 2049                  result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2048 2050                  (void) strcpy(result[j], moc[j]);
2049 2051          }
2050 2052          for (; j <= i + extend; j++)
2051 2053                  result[j] = NULL;
2052 2054  
2053 2055          return (result);
2054 2056  }
2055 2057  
2056 2058  static void
2057 2059  vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2058 2060  {
2059 2061          char *sp, *dp;
2060 2062  
2061 2063          d->mo_flags = s->mo_flags;
2062 2064          d->mo_data = s->mo_data;
2063 2065          sp = s->mo_name;
2064 2066          if (sp != NULL) {
2065 2067                  dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2066 2068                  (void) strcpy(dp, sp);
2067 2069                  d->mo_name = dp;
2068 2070          } else {
2069 2071                  d->mo_name = NULL; /* should never happen */
2070 2072          }
2071 2073  
2072 2074          d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2073 2075  
2074 2076          sp = s->mo_arg;
2075 2077          if (sp != NULL) {
2076 2078                  dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2077 2079                  (void) strcpy(dp, sp);
2078 2080                  d->mo_arg = dp;
2079 2081          } else {
2080 2082                  d->mo_arg = NULL;
2081 2083          }
2082 2084  }
2083 2085  
2084 2086  /*
2085 2087   * Copy a mount options table, possibly allocating some spare
2086 2088   * slots at the end.  It is permissible to copy_extend the NULL table.
2087 2089   */
2088 2090  static void
2089 2091  vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2090 2092  {
2091 2093          uint_t i, count;
2092 2094          mntopt_t *motbl;
2093 2095  
2094 2096          /*
2095 2097           * Clear out any existing stuff in the options table being initialized
2096 2098           */
2097 2099          vfs_freeopttbl(dmo);
2098 2100          count = (smo == NULL) ? 0 : smo->mo_count;
2099 2101          if ((count + extra) == 0)       /* nothing to do */
2100 2102                  return;
2101 2103          dmo->mo_count = count + extra;
2102 2104          motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2103 2105          dmo->mo_list = motbl;
2104 2106          for (i = 0; i < count; i++) {
2105 2107                  vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2106 2108          }
2107 2109          for (i = count; i < count + extra; i++) {
2108 2110                  motbl[i].mo_flags = MO_EMPTY;
2109 2111          }
2110 2112  }
2111 2113  
/*
 * Copy a mount options table.  No spare slots are added: this calls
 * vfs_copyopttbl_extend() with an extra count of 0.
 *
 * This function is *not* for general use by filesystems.
 *
 * Note: caller is responsible for locking the vfs list, if needed,
 *       to protect smo and dmo.
 */
void
vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
{
        vfs_copyopttbl_extend(smo, dmo, 0);
}
2125 2127  
2126 2128  static char **
2127 2129  vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2128 2130  {
2129 2131          int c1 = 0;
2130 2132          int c2 = 0;
2131 2133          char **result;
2132 2134          char **sp1, **sp2, **dp;
2133 2135  
2134 2136          /*
2135 2137           * First we count both lists of cancel options.
2136 2138           * If either is NULL or has no elements, we return a copy of
2137 2139           * the other.
2138 2140           */
2139 2141          if (mop1->mo_cancel != NULL) {
2140 2142                  for (; mop1->mo_cancel[c1] != NULL; c1++)
2141 2143                          /* count cancel options in mop1 */;
2142 2144          }
2143 2145  
2144 2146          if (c1 == 0)
2145 2147                  return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2146 2148  
2147 2149          if (mop2->mo_cancel != NULL) {
2148 2150                  for (; mop2->mo_cancel[c2] != NULL; c2++)
2149 2151                          /* count cancel options in mop2 */;
2150 2152          }
2151 2153  
2152 2154          result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2153 2155  
2154 2156          if (c2 == 0)
2155 2157                  return (result);
2156 2158  
2157 2159          /*
2158 2160           * When we get here, we've got two sets of cancel options;
2159 2161           * we need to merge the two sets.  We know that the result
2160 2162           * array has "c1+c2+1" entries and in the end we might shrink
2161 2163           * it.
2162 2164           * Result now has a copy of the c1 entries from mop1; we'll
2163 2165           * now lookup all the entries of mop2 in mop1 and copy it if
2164 2166           * it is unique.
2165 2167           * This operation is O(n^2) but it's only called once per
2166 2168           * filesystem per duplicate option.  This is a situation
2167 2169           * which doesn't arise with the filesystems in ON and
2168 2170           * n is generally 1.
2169 2171           */
2170 2172  
2171 2173          dp = &result[c1];
2172 2174          for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2173 2175                  for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2174 2176                          if (strcmp(*sp1, *sp2) == 0)
2175 2177                                  break;
2176 2178                  }
2177 2179                  if (*sp1 == NULL) {
2178 2180                          /*
2179 2181                           * Option *sp2 not found in mop1, so copy it.
2180 2182                           * The calls to vfs_copycancelopt_extend()
2181 2183                           * guarantee that there's enough room.
2182 2184                           */
2183 2185                          *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2184 2186                          (void) strcpy(*dp++, *sp2);
2185 2187                  }
2186 2188          }
2187 2189          if (dp != &result[c1+c2]) {
2188 2190                  size_t bytes = (dp - result + 1) * sizeof (char *);
2189 2191                  char **nres = kmem_alloc(bytes, KM_SLEEP);
2190 2192  
2191 2193                  bcopy(result, nres, bytes);
2192 2194                  kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2193 2195                  result = nres;
2194 2196          }
2195 2197          return (result);
2196 2198  }
2197 2199  
2198 2200  /*
2199 2201   * Merge two mount option tables (outer and inner) into one.  This is very
2200 2202   * similar to "merging" global variables and automatic variables in C.
2201 2203   *
2202 2204   * This isn't (and doesn't have to be) fast.
2203 2205   *
2204 2206   * This function is *not* for general use by filesystems.
2205 2207   *
2206 2208   * Note: caller is responsible for locking the vfs list, if needed,
2207 2209   *       to protect omo, imo & dmo.
2208 2210   */
2209 2211  void
2210 2212  vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2211 2213  {
2212 2214          uint_t i, count;
2213 2215          mntopt_t *mop, *motbl;
2214 2216          uint_t freeidx;
2215 2217  
2216 2218          /*
2217 2219           * First determine how much space we need to allocate.
2218 2220           */
2219 2221          count = omo->mo_count;
2220 2222          for (i = 0; i < imo->mo_count; i++) {
2221 2223                  if (imo->mo_list[i].mo_flags & MO_EMPTY)
2222 2224                          continue;
2223 2225                  if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2224 2226                          count++;
2225 2227          }
2226 2228          ASSERT(count >= omo->mo_count &&
2227 2229              count <= omo->mo_count + imo->mo_count);
2228 2230          motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2229 2231          for (i = 0; i < omo->mo_count; i++)
2230 2232                  vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2231 2233          freeidx = omo->mo_count;
2232 2234          for (i = 0; i < imo->mo_count; i++) {
2233 2235                  if (imo->mo_list[i].mo_flags & MO_EMPTY)
2234 2236                          continue;
2235 2237                  if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2236 2238                          char **newcanp;
2237 2239                          uint_t index = mop - omo->mo_list;
2238 2240  
2239 2241                          newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2240 2242  
2241 2243                          vfs_freeopt(&motbl[index]);
2242 2244                          vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2243 2245  
2244 2246                          vfs_freecancelopt(motbl[index].mo_cancel);
2245 2247                          motbl[index].mo_cancel = newcanp;
2246 2248                  } else {
2247 2249                          /*
2248 2250                           * If it's a new option, just copy it over to the first
2249 2251                           * free location.
2250 2252                           */
2251 2253                          vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2252 2254                  }
2253 2255          }
2254 2256          dmo->mo_count = count;
2255 2257          dmo->mo_list = motbl;
2256 2258  }
2257 2259  
2258 2260  /*
2259 2261   * Functions to set and clear mount options in a mount options table.
2260 2262   */
2261 2263  
2262 2264  /*
2263 2265   * Clear a mount option, if it exists.
2264 2266   *
2265 2267   * The update_mnttab arg indicates whether mops is part of a vfs that is on
2266 2268   * the vfs list.
2267 2269   */
2268 2270  static void
2269 2271  vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2270 2272  {
2271 2273          struct mntopt *mop;
2272 2274          uint_t i, count;
2273 2275  
2274 2276          ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2275 2277  
2276 2278          count = mops->mo_count;
2277 2279          for (i = 0; i < count; i++) {
2278 2280                  mop = &mops->mo_list[i];
2279 2281  
2280 2282                  if (mop->mo_flags & MO_EMPTY)
2281 2283                          continue;
2282 2284                  if (strcmp(opt, mop->mo_name))
2283 2285                          continue;
2284 2286                  mop->mo_flags &= ~MO_SET;
2285 2287                  if (mop->mo_arg != NULL) {
2286 2288                          kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2287 2289                  }
2288 2290                  mop->mo_arg = NULL;
2289 2291                  if (update_mnttab)
2290 2292                          vfs_mnttab_modtimeupd();
2291 2293                  break;
2292 2294          }
2293 2295  }
2294 2296  
2295 2297  void
2296 2298  vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2297 2299  {
2298 2300          int gotlock = 0;
2299 2301  
2300 2302          if (VFS_ON_LIST(vfsp)) {
2301 2303                  gotlock = 1;
2302 2304                  vfs_list_lock();
2303 2305          }
2304 2306          vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2305 2307          if (gotlock)
2306 2308                  vfs_list_unlock();
2307 2309  }
2308 2310  
2309 2311  
2310 2312  /*
2311 2313   * Set a mount option on.  If it's not found in the table, it's silently
2312 2314   * ignored.  If the option has MO_IGNORE set, it is still set unless the
2313 2315   * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2314 2316   * bits can be used to toggle the MO_NODISPLAY bit for the option.
2315 2317   * If the VFS_CREATEOPT flag bit is set then the first option slot with
2316 2318   * MO_EMPTY set is created as the option passed in.
2317 2319   *
2318 2320   * The update_mnttab arg indicates whether mops is part of a vfs that is on
2319 2321   * the vfs list.
2320 2322   */
2321 2323  static void
2322 2324  vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2323 2325      const char *arg, int flags, int update_mnttab)
2324 2326  {
2325 2327          mntopt_t *mop;
2326 2328          uint_t i, count;
2327 2329          char *sp;
2328 2330  
2329 2331          ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2330 2332  
2331 2333          if (flags & VFS_CREATEOPT) {
2332 2334                  if (vfs_hasopt(mops, opt) != NULL) {
2333 2335                          flags &= ~VFS_CREATEOPT;
2334 2336                  }
2335 2337          }
2336 2338          count = mops->mo_count;
2337 2339          for (i = 0; i < count; i++) {
2338 2340                  mop = &mops->mo_list[i];
2339 2341  
2340 2342                  if (mop->mo_flags & MO_EMPTY) {
2341 2343                          if ((flags & VFS_CREATEOPT) == 0)
2342 2344                                  continue;
2343 2345                          sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2344 2346                          (void) strcpy(sp, opt);
2345 2347                          mop->mo_name = sp;
2346 2348                          if (arg != NULL)
2347 2349                                  mop->mo_flags = MO_HASVALUE;
2348 2350                          else
2349 2351                                  mop->mo_flags = 0;
2350 2352                  } else if (strcmp(opt, mop->mo_name)) {
2351 2353                          continue;
2352 2354                  }
2353 2355                  if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2354 2356                          break;
2355 2357                  if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2356 2358                          sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2357 2359                          (void) strcpy(sp, arg);
2358 2360                  } else {
2359 2361                          sp = NULL;
2360 2362                  }
2361 2363                  if (mop->mo_arg != NULL)
2362 2364                          kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2363 2365                  mop->mo_arg = sp;
2364 2366                  if (flags & VFS_DISPLAY)
2365 2367                          mop->mo_flags &= ~MO_NODISPLAY;
2366 2368                  if (flags & VFS_NODISPLAY)
2367 2369                          mop->mo_flags |= MO_NODISPLAY;
2368 2370                  mop->mo_flags |= MO_SET;
2369 2371                  if (mop->mo_cancel != NULL) {
2370 2372                          char **cp;
2371 2373  
2372 2374                          for (cp = mop->mo_cancel; *cp != NULL; cp++)
2373 2375                                  vfs_clearmntopt_nolock(mops, *cp, 0);
2374 2376                  }
2375 2377                  if (update_mnttab)
2376 2378                          vfs_mnttab_modtimeupd();
2377 2379                  break;
2378 2380          }
2379 2381  }
2380 2382  
2381 2383  void
2382 2384  vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2383 2385  {
2384 2386          int gotlock = 0;
2385 2387  
2386 2388          if (VFS_ON_LIST(vfsp)) {
2387 2389                  gotlock = 1;
2388 2390                  vfs_list_lock();
2389 2391          }
2390 2392          vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2391 2393          if (gotlock)
2392 2394                  vfs_list_unlock();
2393 2395  }
2394 2396  
2395 2397  
2396 2398  /*
2397 2399   * Add a "tag" option to a mounted file system's options list.
2398 2400   *
2399 2401   * Note: caller is responsible for locking the vfs list, if needed,
2400 2402   *       to protect mops.
2401 2403   */
2402 2404  static mntopt_t *
2403 2405  vfs_addtag(mntopts_t *mops, const char *tag)
2404 2406  {
2405 2407          uint_t count;
2406 2408          mntopt_t *mop, *motbl;
2407 2409  
2408 2410          count = mops->mo_count + 1;
2409 2411          motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2410 2412          if (mops->mo_count) {
2411 2413                  size_t len = (count - 1) * sizeof (mntopt_t);
2412 2414  
2413 2415                  bcopy(mops->mo_list, motbl, len);
2414 2416                  kmem_free(mops->mo_list, len);
2415 2417          }
2416 2418          mops->mo_count = count;
2417 2419          mops->mo_list = motbl;
2418 2420          mop = &motbl[count - 1];
2419 2421          mop->mo_flags = MO_TAG;
2420 2422          mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2421 2423          (void) strcpy(mop->mo_name, tag);
2422 2424          return (mop);
2423 2425  }
2424 2426  
2425 2427  /*
2426 2428   * Allow users to set arbitrary "tags" in a vfs's mount options.
2427 2429   * Broader use within the kernel is discouraged.
2428 2430   */
2429 2431  int
2430 2432  vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2431 2433      cred_t *cr)
2432 2434  {
2433 2435          vfs_t *vfsp;
2434 2436          mntopts_t *mops;
2435 2437          mntopt_t *mop;
2436 2438          int found = 0;
2437 2439          dev_t dev = makedevice(major, minor);
2438 2440          int err = 0;
2439 2441          char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2440 2442  
2441 2443          /*
2442 2444           * Find the desired mounted file system
2443 2445           */
2444 2446          vfs_list_lock();
2445 2447          vfsp = rootvfs;
2446 2448          do {
2447 2449                  if (vfsp->vfs_dev == dev &&
2448 2450                      strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2449 2451                          found = 1;
2450 2452                          break;
2451 2453                  }
2452 2454                  vfsp = vfsp->vfs_next;
2453 2455          } while (vfsp != rootvfs);
2454 2456  
2455 2457          if (!found) {
2456 2458                  err = EINVAL;
2457 2459                  goto out;
2458 2460          }
2459 2461          err = secpolicy_fs_config(cr, vfsp);
2460 2462          if (err != 0)
2461 2463                  goto out;
2462 2464  
2463 2465          mops = &vfsp->vfs_mntopts;
2464 2466          /*
2465 2467           * Add tag if it doesn't already exist
2466 2468           */
2467 2469          if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2468 2470                  int len;
2469 2471  
2470 2472                  (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2471 2473                  len = strlen(buf);
2472 2474                  if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2473 2475                          err = ENAMETOOLONG;
2474 2476                          goto out;
2475 2477                  }
2476 2478                  mop = vfs_addtag(mops, tag);
2477 2479          }
2478 2480          if ((mop->mo_flags & MO_TAG) == 0) {
2479 2481                  err = EINVAL;
2480 2482                  goto out;
2481 2483          }
2482 2484          vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2483 2485  out:
2484 2486          vfs_list_unlock();
2485 2487          kmem_free(buf, MAX_MNTOPT_STR);
2486 2488          return (err);
2487 2489  }
2488 2490  
2489 2491  /*
2490 2492   * Allow users to remove arbitrary "tags" in a vfs's mount options.
2491 2493   * Broader use within the kernel is discouraged.
2492 2494   */
2493 2495  int
2494 2496  vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2495 2497      cred_t *cr)
2496 2498  {
2497 2499          vfs_t *vfsp;
2498 2500          mntopt_t *mop;
2499 2501          int found = 0;
2500 2502          dev_t dev = makedevice(major, minor);
2501 2503          int err = 0;
2502 2504  
2503 2505          /*
2504 2506           * Find the desired mounted file system
2505 2507           */
2506 2508          vfs_list_lock();
2507 2509          vfsp = rootvfs;
2508 2510          do {
2509 2511                  if (vfsp->vfs_dev == dev &&
2510 2512                      strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2511 2513                          found = 1;
2512 2514                          break;
2513 2515                  }
2514 2516                  vfsp = vfsp->vfs_next;
2515 2517          } while (vfsp != rootvfs);
2516 2518  
2517 2519          if (!found) {
2518 2520                  err = EINVAL;
2519 2521                  goto out;
2520 2522          }
2521 2523          err = secpolicy_fs_config(cr, vfsp);
2522 2524          if (err != 0)
2523 2525                  goto out;
2524 2526  
2525 2527          if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2526 2528                  err = EINVAL;
2527 2529                  goto out;
2528 2530          }
2529 2531          if ((mop->mo_flags & MO_TAG) == 0) {
2530 2532                  err = EINVAL;
2531 2533                  goto out;
2532 2534          }
2533 2535          vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2534 2536  out:
2535 2537          vfs_list_unlock();
2536 2538          return (err);
2537 2539  }
2538 2540  
2539 2541  /*
2540 2542   * Function to parse an option string and fill in a mount options table.
2541 2543   * Unknown options are silently ignored.  The input option string is modified
2542 2544   * by replacing separators with nulls.  If the create flag is set, options
2543 2545   * not found in the table are just added on the fly.  The table must have
2544 2546   * an option slot marked MO_EMPTY to add an option on the fly.
2545 2547   *
2546 2548   * This function is *not* for general use by filesystems.
2547 2549   *
2548 2550   * Note: caller is responsible for locking the vfs list, if needed,
2549 2551   *       to protect mops..
2550 2552   */
2551 2553  void
2552 2554  vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2553 2555  {
2554 2556          char *s = osp, *p, *nextop, *valp, *cp, *ep;
2555 2557          int setflg = VFS_NOFORCEOPT;
2556 2558  
2557 2559          if (osp == NULL)
2558 2560                  return;
2559 2561          while (*s != '\0') {
2560 2562                  p = strchr(s, ',');     /* find next option */
2561 2563                  if (p == NULL) {
2562 2564                          cp = NULL;
2563 2565                          p = s + strlen(s);
2564 2566                  } else {
2565 2567                          cp = p;         /* save location of comma */
2566 2568                          *p++ = '\0';    /* mark end and point to next option */
2567 2569                  }
2568 2570                  nextop = p;
2569 2571                  p = strchr(s, '=');     /* look for value */
2570 2572                  if (p == NULL) {
2571 2573                          valp = NULL;    /* no value supplied */
2572 2574                  } else {
2573 2575                          ep = p;         /* save location of equals */
2574 2576                          *p++ = '\0';    /* end option and point to value */
2575 2577                          valp = p;
2576 2578                  }
2577 2579                  /*
2578 2580                   * set option into options table
2579 2581                   */
2580 2582                  if (create)
2581 2583                          setflg |= VFS_CREATEOPT;
2582 2584                  vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2583 2585                  if (cp != NULL)
2584 2586                          *cp = ',';      /* restore the comma */
2585 2587                  if (valp != NULL)
2586 2588                          *ep = '=';      /* restore the equals */
2587 2589                  s = nextop;
2588 2590          }
2589 2591  }
2590 2592  
2591 2593  /*
2592 2594   * Function to inquire if an option exists in a mount options table.
2593 2595   * Returns a pointer to the option if it exists, else NULL.
2594 2596   *
2595 2597   * This function is *not* for general use by filesystems.
2596 2598   *
2597 2599   * Note: caller is responsible for locking the vfs list, if needed,
2598 2600   *       to protect mops.
2599 2601   */
2600 2602  struct mntopt *
2601 2603  vfs_hasopt(const mntopts_t *mops, const char *opt)
2602 2604  {
2603 2605          struct mntopt *mop;
2604 2606          uint_t i, count;
2605 2607  
2606 2608          count = mops->mo_count;
2607 2609          for (i = 0; i < count; i++) {
2608 2610                  mop = &mops->mo_list[i];
2609 2611  
2610 2612                  if (mop->mo_flags & MO_EMPTY)
2611 2613                          continue;
2612 2614                  if (strcmp(opt, mop->mo_name) == 0)
2613 2615                          return (mop);
2614 2616          }
2615 2617          return (NULL);
2616 2618  }
2617 2619  
2618 2620  /*
2619 2621   * Function to inquire if an option is set in a mount options table.
2620 2622   * Returns non-zero if set and fills in the arg pointer with a pointer to
2621 2623   * the argument string or NULL if there is no argument string.
2622 2624   */
2623 2625  static int
2624 2626  vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2625 2627  {
2626 2628          struct mntopt *mop;
2627 2629          uint_t i, count;
2628 2630  
2629 2631          count = mops->mo_count;
2630 2632          for (i = 0; i < count; i++) {
2631 2633                  mop = &mops->mo_list[i];
2632 2634  
2633 2635                  if (mop->mo_flags & MO_EMPTY)
2634 2636                          continue;
2635 2637                  if (strcmp(opt, mop->mo_name))
2636 2638                          continue;
2637 2639                  if ((mop->mo_flags & MO_SET) == 0)
2638 2640                          return (0);
2639 2641                  if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2640 2642                          *argp = mop->mo_arg;
2641 2643                  return (1);
2642 2644          }
2643 2645          return (0);
2644 2646  }
2645 2647  
2646 2648  
2647 2649  int
2648 2650  vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2649 2651  {
2650 2652          int ret;
2651 2653  
2652 2654          vfs_list_read_lock();
2653 2655          ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2654 2656          vfs_list_unlock();
2655 2657          return (ret);
2656 2658  }
2657 2659  
2658 2660  
2659 2661  /*
2660 2662   * Construct a comma separated string of the options set in the given
2661 2663   * mount table, return the string in the given buffer.  Return non-zero if
2662 2664   * the buffer would overflow.
2663 2665   *
2664 2666   * This function is *not* for general use by filesystems.
2665 2667   *
2666 2668   * Note: caller is responsible for locking the vfs list, if needed,
2667 2669   *       to protect mp.
2668 2670   */
2669 2671  int
2670 2672  vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2671 2673  {
2672 2674          char *cp;
2673 2675          uint_t i;
2674 2676  
2675 2677          buf[0] = '\0';
2676 2678          cp = buf;
2677 2679          for (i = 0; i < mp->mo_count; i++) {
2678 2680                  struct mntopt *mop;
2679 2681  
2680 2682                  mop = &mp->mo_list[i];
2681 2683                  if (mop->mo_flags & MO_SET) {
2682 2684                          int optlen, comma = 0;
2683 2685  
2684 2686                          if (buf[0] != '\0')
2685 2687                                  comma = 1;
2686 2688                          optlen = strlen(mop->mo_name);
2687 2689                          if (strlen(buf) + comma + optlen + 1 > len)
2688 2690                                  goto err;
2689 2691                          if (comma)
2690 2692                                  *cp++ = ',';
2691 2693                          (void) strcpy(cp, mop->mo_name);
2692 2694                          cp += optlen;
2693 2695                          /*
2694 2696                           * Append option value if there is one
2695 2697                           */
2696 2698                          if (mop->mo_arg != NULL) {
2697 2699                                  int arglen;
2698 2700  
2699 2701                                  arglen = strlen(mop->mo_arg);
2700 2702                                  if (strlen(buf) + arglen + 2 > len)
2701 2703                                          goto err;
2702 2704                                  *cp++ = '=';
2703 2705                                  (void) strcpy(cp, mop->mo_arg);
2704 2706                                  cp += arglen;
2705 2707                          }
2706 2708                  }
2707 2709          }
2708 2710          return (0);
2709 2711  err:
2710 2712          return (EOVERFLOW);
2711 2713  }
2712 2714  
/*
 * Free a NULL-terminated cancel option list: every string in it, then the
 * pointer array itself (including the slot holding the terminator).
 * A NULL list is accepted and ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
	size_t nent;

	if (moc == NULL)
		return;

	for (nent = 0; moc[nent] != NULL; nent++)
		kmem_free(moc[nent], strlen(moc[nent]) + 1);

	/* "+ 1" accounts for the terminating NULL entry. */
	kmem_free(moc, (nent + 1) * sizeof (char *));
}
2727 2729  
2728 2730  static void
2729 2731  vfs_freeopt(mntopt_t *mop)
2730 2732  {
2731 2733          if (mop->mo_name != NULL)
2732 2734                  kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2733 2735  
2734 2736          vfs_freecancelopt(mop->mo_cancel);
2735 2737  
2736 2738          if (mop->mo_arg != NULL)
2737 2739                  kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2738 2740  }
2739 2741  
2740 2742  /*
2741 2743   * Free a mount options table
2742 2744   *
2743 2745   * This function is *not* for general use by filesystems.
2744 2746   *
2745 2747   * Note: caller is responsible for locking the vfs list, if needed,
2746 2748   *       to protect mp.
2747 2749   */
2748 2750  void
2749 2751  vfs_freeopttbl(mntopts_t *mp)
2750 2752  {
2751 2753          uint_t i, count;
2752 2754  
2753 2755          count = mp->mo_count;
2754 2756          for (i = 0; i < count; i++) {
2755 2757                  vfs_freeopt(&mp->mo_list[i]);
2756 2758          }
2757 2759          if (count) {
2758 2760                  kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2759 2761                  mp->mo_count = 0;
2760 2762                  mp->mo_list = NULL;
2761 2763          }
2762 2764  }
2763 2765  
2764 2766  
/*
 * Read operation for the dummy mnttab vnode (installed as vop_read by
 * vfs_mnttabvp_setup()).  It ignores all of its arguments, transfers no
 * data and always returns 0.
 */
/* ARGSUSED */
static int
vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
    caller_context_t *ct)
{
	return (0);
}
2772 2774  
/*
 * Write operation for the dummy mnttab vnode (installed as vop_write by
 * vfs_mnttabvp_setup()).  It ignores all of its arguments, transfers no
 * data and always returns 0.
 */
/* ARGSUSED */
static int
vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
    caller_context_t *ct)
{
	return (0);
}
2780 2782  
2781 2783  /*
2782 2784   * The dummy vnode is currently used only by file events notification
2783 2785   * module which is just interested in the timestamps.
2784 2786   */
2785 2787  /* ARGSUSED */
2786 2788  static int
2787 2789  vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2788 2790      caller_context_t *ct)
2789 2791  {
2790 2792          bzero(vap, sizeof (vattr_t));
2791 2793          vap->va_type = VREG;
2792 2794          vap->va_nlink = 1;
2793 2795          vap->va_ctime = vfs_mnttab_ctime;
2794 2796          /*
2795 2797           * it is ok to just copy mtime as the time will be monotonically
2796 2798           * increasing.
2797 2799           */
2798 2800          vap->va_mtime = vfs_mnttab_mtime;
2799 2801          vap->va_atime = vap->va_mtime;
2800 2802          return (0);
2801 2803  }
2802 2804  
2803 2805  static void
2804 2806  vfs_mnttabvp_setup(void)
2805 2807  {
2806 2808          vnode_t *tvp;
2807 2809          vnodeops_t *vfs_mntdummyvnops;
2808 2810          const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2809 2811                  VOPNAME_READ,           { .vop_read = vfs_mntdummyread },
2810 2812                  VOPNAME_WRITE,          { .vop_write = vfs_mntdummywrite },
2811 2813                  VOPNAME_GETATTR,        { .vop_getattr = vfs_mntdummygetattr },
2812 2814                  VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
2813 2815                  NULL,                   NULL
2814 2816          };
2815 2817  
2816 2818          if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2817 2819              &vfs_mntdummyvnops) != 0) {
2818 2820                  cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2819 2821                  /* Shouldn't happen, but not bad enough to panic */
2820 2822                  return;
2821 2823          }
2822 2824  
2823 2825          /*
2824 2826           * A global dummy vnode is allocated to represent mntfs files.
2825 2827           * The mntfs file (/etc/mnttab) can be monitored for file events
2826 2828           * and receive an event when mnttab changes. Dummy VOP calls
2827 2829           * will be made on this vnode. The file events notification module
2828 2830           * intercepts this vnode and delivers relevant events.
2829 2831           */
2830 2832          tvp = vn_alloc(KM_SLEEP);
2831 2833          tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2832 2834          vn_setops(tvp, vfs_mntdummyvnops);
2833 2835          tvp->v_type = VREG;
2834 2836          /*
2835 2837           * The mnt dummy ops do not reference v_data.
2836 2838           * No other module intercepting this vnode should either.
2837 2839           * Just set it to point to itself.
2838 2840           */
2839 2841          tvp->v_data = (caddr_t)tvp;
2840 2842          tvp->v_vfsp = rootvfs;
2841 2843          vfs_mntdummyvp = tvp;
2842 2844  }
2843 2845  
2844 2846  /*
2845 2847   * performs fake read/write ops
2846 2848   */
2847 2849  static void
2848 2850  vfs_mnttab_rwop(int rw)
2849 2851  {
2850 2852          struct uio      uio;
2851 2853          struct iovec    iov;
2852 2854          char    buf[1];
2853 2855  
2854 2856          if (vfs_mntdummyvp == NULL)
2855 2857                  return;
2856 2858  
2857 2859          bzero(&uio, sizeof (uio));
2858 2860          bzero(&iov, sizeof (iov));
2859 2861          iov.iov_base = buf;
2860 2862          iov.iov_len = 0;
2861 2863          uio.uio_iov = &iov;
2862 2864          uio.uio_iovcnt = 1;
2863 2865          uio.uio_loffset = 0;
2864 2866          uio.uio_segflg = UIO_SYSSPACE;
2865 2867          uio.uio_resid = 0;
2866 2868          if (rw) {
2867 2869                  (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2868 2870          } else {
2869 2871                  (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2870 2872          }
2871 2873  }
2872 2874  
/*
 * Generate a write operation: issue a fake, zero-length write on the
 * dummy mnttab vnode via vfs_mnttab_rwop().
 */
void
vfs_mnttab_writeop(void)
{
	vfs_mnttab_rwop(1);
}
2881 2883  
/*
 * Generate a read operation: issue a fake, zero-length read on the
 * dummy mnttab vnode via vfs_mnttab_rwop().
 */
void
vfs_mnttab_readop(void)
{
	vfs_mnttab_rwop(0);
}
2890 2892  
/*
 * Free any mnttab information recorded in the vfs struct: the mount point
 * and resource strings and the mount options table.
 * The vfs must not be on the vfs list.
 */
static void
vfs_freemnttab(struct vfs *vfsp)
{
	ASSERT(!VFS_ON_LIST(vfsp));

	/*
	 * Free device and mount point information.  Each string is
	 * released only if present, and the pointer is cleared afterwards
	 * so that it is not released twice.
	 */
	if (vfsp->vfs_mntpt != NULL) {
		refstr_rele(vfsp->vfs_mntpt);
		vfsp->vfs_mntpt = NULL;
	}
	if (vfsp->vfs_resource != NULL) {
		refstr_rele(vfsp->vfs_resource);
		vfsp->vfs_resource = NULL;
	}
	/*
	 * Now free mount options information
	 */
	vfs_freeopttbl(&vfsp->vfs_mntopts);
}
2916 2918  
2917 2919  /*
2918 2920   * Return the last mnttab modification time
2919 2921   */
2920 2922  void
2921 2923  vfs_mnttab_modtime(timespec_t *ts)
2922 2924  {
2923 2925          ASSERT(RW_LOCK_HELD(&vfslist));
2924 2926          *ts = vfs_mnttab_mtime;
2925 2927  }
2926 2928  
2927 2929  /*
2928 2930   * See if mnttab is changed
2929 2931   */
2930 2932  void
2931 2933  vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2932 2934  {
2933 2935          int changed;
2934 2936  
2935 2937          *phpp = (struct pollhead *)NULL;
2936 2938  
2937 2939          /*
2938 2940           * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2939 2941           * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2940 2942           * to not grab the vfs list lock because tv_sec is monotonically
2941 2943           * increasing.
2942 2944           */
2943 2945  
2944 2946          changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2945 2947              (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2946 2948          if (!changed) {
2947 2949                  *phpp = &vfs_pollhd;
2948 2950          }
2949 2951  }
2950 2952  
2951 2953  /* Provide a unique and monotonically-increasing timestamp. */
2952 2954  void
2953 2955  vfs_mono_time(timespec_t *ts)
2954 2956  {
2955 2957          static volatile hrtime_t hrt;           /* The saved time. */
2956 2958          hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
2957 2959          timespec_t      newts;
2958 2960  
2959 2961          /*
2960 2962           * Try gethrestime() first, but be prepared to fabricate a sensible
2961 2963           * answer at the first sign of any trouble.
2962 2964           */
2963 2965          gethrestime(&newts);
2964 2966          newhrt = ts2hrt(&newts);
2965 2967          for (;;) {
2966 2968                  oldhrt = hrt;
2967 2969                  if (newhrt <= hrt)
2968 2970                          newhrt = hrt + 1;
2969 2971                  if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2970 2972                          break;
2971 2973          }
2972 2974          hrt2ts(newhrt, ts);
2973 2975  }
2974 2976  
2975 2977  /*
2976 2978   * Update the mnttab modification time and wake up any waiters for
2977 2979   * mnttab changes
2978 2980   */
2979 2981  void
2980 2982  vfs_mnttab_modtimeupd()
2981 2983  {
2982 2984          hrtime_t oldhrt, newhrt;
2983 2985  
2984 2986          ASSERT(RW_WRITE_HELD(&vfslist));
2985 2987          oldhrt = ts2hrt(&vfs_mnttab_mtime);
2986 2988          gethrestime(&vfs_mnttab_mtime);
2987 2989          newhrt = ts2hrt(&vfs_mnttab_mtime);
2988 2990          if (oldhrt == (hrtime_t)0)
2989 2991                  vfs_mnttab_ctime = vfs_mnttab_mtime;
2990 2992          /*
2991 2993           * Attempt to provide unique mtime (like uniqtime but not).
2992 2994           */
2993 2995          if (newhrt == oldhrt) {
2994 2996                  newhrt++;
2995 2997                  hrt2ts(newhrt, &vfs_mnttab_mtime);
2996 2998          }
2997 2999          pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2998 3000          vfs_mnttab_writeop();
2999 3001  }
3000 3002  
3001 3003  int
3002 3004  dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3003 3005  {
3004 3006          vnode_t *coveredvp;
3005 3007          int error;
3006 3008          extern void teardown_vopstats(vfs_t *);
3007 3009  
3008 3010          /*
3009 3011           * Get covered vnode. This will be NULL if the vfs is not linked
3010 3012           * into the file system name space (i.e., domount() with MNT_NOSPICE).
3011 3013           */
3012 3014          coveredvp = vfsp->vfs_vnodecovered;
3013 3015          ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3014 3016  
3015 3017          /*
3016 3018           * Purge all dnlc entries for this vfs.
3017 3019           */
3018 3020          (void) dnlc_purge_vfsp(vfsp, 0);
3019 3021  
3020 3022          /* For forcible umount, skip VFS_SYNC() since it may hang */
3021 3023          if ((flag & MS_FORCE) == 0)
3022 3024                  (void) VFS_SYNC(vfsp, 0, cr);
3023 3025  
3024 3026          /*
3025 3027           * Lock the vfs to maintain fs status quo during unmount.  This
3026 3028           * has to be done after the sync because ufs_update tries to acquire
3027 3029           * the vfs_reflock.
3028 3030           */
3029 3031          vfs_lock_wait(vfsp);
3030 3032  
3031 3033          if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3032 3034                  vfs_unlock(vfsp);
3033 3035                  if (coveredvp != NULL)
3034 3036                          vn_vfsunlock(coveredvp);
3035 3037          } else if (coveredvp != NULL) {
3036 3038                  teardown_vopstats(vfsp);
3037 3039                  /*
3038 3040                   * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3039 3041                   * when it frees vfsp so we do a VN_HOLD() so we can
3040 3042                   * continue to use coveredvp afterwards.
3041 3043                   */
3042 3044                  VN_HOLD(coveredvp);
3043 3045                  vfs_remove(vfsp);
3044 3046                  vn_vfsunlock(coveredvp);
3045 3047                  VN_RELE(coveredvp);
3046 3048          } else {
3047 3049                  teardown_vopstats(vfsp);
3048 3050                  /*
3049 3051                   * Release the reference to vfs that is not linked
3050 3052                   * into the name space.
3051 3053                   */
3052 3054                  vfs_unlock(vfsp);
3053 3055                  VFS_RELE(vfsp);
3054 3056          }
3055 3057          return (error);
3056 3058  }
3057 3059  
3058 3060  
3059 3061  /*
3060 3062   * Vfs_unmountall() is called by uadmin() to unmount all
3061 3063   * mounted file systems (except the root file system) during shutdown.
3062 3064   * It follows the existing locking protocol when traversing the vfs list
3063 3065   * to sync and unmount vfses. Even though there should be no
3064 3066   * other thread running while the system is shutting down, it is prudent
3065 3067   * to still follow the locking protocol.
3066 3068   */
3067 3069  void
3068 3070  vfs_unmountall(void)
3069 3071  {
3070 3072          struct vfs *vfsp;
3071 3073          struct vfs *prev_vfsp = NULL;
3072 3074          int error;
3073 3075  
3074 3076          /*
3075 3077           * Toss all dnlc entries now so that the per-vfs sync
3076 3078           * and unmount operations don't have to slog through
3077 3079           * a bunch of uninteresting vnodes over and over again.
3078 3080           */
3079 3081          dnlc_purge();
3080 3082  
3081 3083          vfs_list_lock();
3082 3084          for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3083 3085                  prev_vfsp = vfsp->vfs_prev;
3084 3086  
3085 3087                  if (vfs_lock(vfsp) != 0)
3086 3088                          continue;
3087 3089                  error = vn_vfswlock(vfsp->vfs_vnodecovered);
3088 3090                  vfs_unlock(vfsp);
3089 3091                  if (error)
3090 3092                          continue;
3091 3093  
3092 3094                  vfs_list_unlock();
3093 3095  
3094 3096                  (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3095 3097                  (void) dounmount(vfsp, 0, CRED());
3096 3098  
3097 3099                  /*
3098 3100                   * Since we dropped the vfslist lock above we must
3099 3101                   * verify that next_vfsp still exists, else start over.
3100 3102                   */
3101 3103                  vfs_list_lock();
3102 3104                  for (vfsp = rootvfs->vfs_prev;
3103 3105                      vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3104 3106                          if (vfsp == prev_vfsp)
3105 3107                                  break;
3106 3108                  if (vfsp == rootvfs && prev_vfsp != rootvfs)
3107 3109                          prev_vfsp = rootvfs->vfs_prev;
3108 3110          }
3109 3111          vfs_list_unlock();
3110 3112  }
3111 3113  
3112 3114  /*
3113 3115   * Called to add an entry to the end of the vfs mount in progress list
3114 3116   */
3115 3117  void
3116 3118  vfs_addmip(dev_t dev, struct vfs *vfsp)
3117 3119  {
3118 3120          struct ipmnt *mipp;
3119 3121  
3120 3122          mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3121 3123          mipp->mip_next = NULL;
3122 3124          mipp->mip_dev = dev;
3123 3125          mipp->mip_vfsp = vfsp;
3124 3126          mutex_enter(&vfs_miplist_mutex);
3125 3127          if (vfs_miplist_end != NULL)
3126 3128                  vfs_miplist_end->mip_next = mipp;
3127 3129          else
3128 3130                  vfs_miplist = mipp;
3129 3131          vfs_miplist_end = mipp;
3130 3132          mutex_exit(&vfs_miplist_mutex);
3131 3133  }
3132 3134  
3133 3135  /*
3134 3136   * Called to remove an entry from the mount in progress list
3135 3137   * Either because the mount completed or it failed.
3136 3138   */
3137 3139  void
3138 3140  vfs_delmip(struct vfs *vfsp)
3139 3141  {
3140 3142          struct ipmnt *mipp, *mipprev;
3141 3143  
3142 3144          mutex_enter(&vfs_miplist_mutex);
3143 3145          mipprev = NULL;
3144 3146          for (mipp = vfs_miplist;
3145 3147              mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3146 3148                  mipprev = mipp;
3147 3149          }
3148 3150          if (mipp == NULL)
3149 3151                  return; /* shouldn't happen */
3150 3152          if (mipp == vfs_miplist_end)
3151 3153                  vfs_miplist_end = mipprev;
3152 3154          if (mipprev == NULL)
3153 3155                  vfs_miplist = mipp->mip_next;
3154 3156          else
3155 3157                  mipprev->mip_next = mipp->mip_next;
3156 3158          mutex_exit(&vfs_miplist_mutex);
3157 3159          kmem_free(mipp, sizeof (struct ipmnt));
3158 3160  }
3159 3161  
3160 3162  /*
3161 3163   * vfs_add is called by a specific filesystem's mount routine to add
3162 3164   * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3163 3165   * The vfs should already have been locked by the caller.
3164 3166   *
3165 3167   * coveredvp is NULL if this is the root.
3166 3168   */
3167 3169  void
3168 3170  vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3169 3171  {
3170 3172          int newflag;
3171 3173  
3172 3174          ASSERT(vfs_lock_held(vfsp));
3173 3175          VFS_HOLD(vfsp);
3174 3176          newflag = vfsp->vfs_flag;
3175 3177          if (mflag & MS_RDONLY)
3176 3178                  newflag |= VFS_RDONLY;
3177 3179          else
3178 3180                  newflag &= ~VFS_RDONLY;
3179 3181          if (mflag & MS_NOSUID)
3180 3182                  newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3181 3183          else
3182 3184                  newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3183 3185          if (mflag & MS_NOMNTTAB)
3184 3186                  newflag |= VFS_NOMNTTAB;
3185 3187          else
3186 3188                  newflag &= ~VFS_NOMNTTAB;
3187 3189  
3188 3190          if (coveredvp != NULL) {
3189 3191                  ASSERT(vn_vfswlock_held(coveredvp));
3190 3192                  coveredvp->v_vfsmountedhere = vfsp;
3191 3193                  VN_HOLD(coveredvp);
3192 3194          }
3193 3195          vfsp->vfs_vnodecovered = coveredvp;
3194 3196          vfsp->vfs_flag = newflag;
3195 3197  
3196 3198          vfs_list_add(vfsp);
3197 3199  }
3198 3200  
3199 3201  /*
3200 3202   * Remove a vfs from the vfs list, null out the pointer from the
3201 3203   * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3202 3204   * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3203 3205   * reference to the vfs and to the covered vnode.
3204 3206   *
3205 3207   * Called from dounmount after it's confirmed with the file system
3206 3208   * that the unmount is legal.
3207 3209   */
3208 3210  void
3209 3211  vfs_remove(struct vfs *vfsp)
3210 3212  {
3211 3213          vnode_t *vp;
3212 3214  
3213 3215          ASSERT(vfs_lock_held(vfsp));
3214 3216  
3215 3217          /*
3216 3218           * Can't unmount root.  Should never happen because fs will
3217 3219           * be busy.
3218 3220           */
3219 3221          if (vfsp == rootvfs)
3220 3222                  panic("vfs_remove: unmounting root");
3221 3223  
3222 3224          vfs_list_remove(vfsp);
3223 3225  
3224 3226          /*
3225 3227           * Unhook from the file system name space.
3226 3228           */
3227 3229          vp = vfsp->vfs_vnodecovered;
3228 3230          ASSERT(vn_vfswlock_held(vp));
3229 3231          vp->v_vfsmountedhere = NULL;
3230 3232          vfsp->vfs_vnodecovered = NULL;
3231 3233          VN_RELE(vp);
3232 3234  
3233 3235          /*
3234 3236           * Release lock and wakeup anybody waiting.
3235 3237           */
3236 3238          vfs_unlock(vfsp);
3237 3239          VFS_RELE(vfsp);
3238 3240  }
3239 3241  
3240 3242  /*
3241 3243   * Lock a filesystem to prevent access to it while mounting,
3242 3244   * unmounting and syncing.  Return EBUSY immediately if lock
3243 3245   * can't be acquired.
3244 3246   */
3245 3247  int
3246 3248  vfs_lock(vfs_t *vfsp)
3247 3249  {
3248 3250          vn_vfslocks_entry_t *vpvfsentry;
3249 3251  
3250 3252          vpvfsentry = vn_vfslocks_getlock(vfsp);
3251 3253          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3252 3254                  return (0);
3253 3255  
3254 3256          vn_vfslocks_rele(vpvfsentry);
3255 3257          return (EBUSY);
3256 3258  }
3257 3259  
3258 3260  int
3259 3261  vfs_rlock(vfs_t *vfsp)
3260 3262  {
3261 3263          vn_vfslocks_entry_t *vpvfsentry;
3262 3264  
3263 3265          vpvfsentry = vn_vfslocks_getlock(vfsp);
3264 3266  
3265 3267          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3266 3268                  return (0);
3267 3269  
3268 3270          vn_vfslocks_rele(vpvfsentry);
3269 3271          return (EBUSY);
3270 3272  }
3271 3273  
3272 3274  void
3273 3275  vfs_lock_wait(vfs_t *vfsp)
3274 3276  {
3275 3277          vn_vfslocks_entry_t *vpvfsentry;
3276 3278  
3277 3279          vpvfsentry = vn_vfslocks_getlock(vfsp);
3278 3280          rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3279 3281  }
3280 3282  
3281 3283  void
3282 3284  vfs_rlock_wait(vfs_t *vfsp)
3283 3285  {
3284 3286          vn_vfslocks_entry_t *vpvfsentry;
3285 3287  
3286 3288          vpvfsentry = vn_vfslocks_getlock(vfsp);
3287 3289          rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3288 3290  }
3289 3291  
3290 3292  /*
3291 3293   * Unlock a locked filesystem.
3292 3294   */
3293 3295  void
3294 3296  vfs_unlock(vfs_t *vfsp)
3295 3297  {
3296 3298          vn_vfslocks_entry_t *vpvfsentry;
3297 3299  
3298 3300          /*
3299 3301           * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3300 3302           * And these changes should remain for the patch changes as it is.
3301 3303           */
3302 3304          if (panicstr)
3303 3305                  return;
3304 3306  
3305 3307          /*
3306 3308           * ve_refcount needs to be dropped twice here.
3307 3309           * 1. To release refernce after a call to vfs_locks_getlock()
3308 3310           * 2. To release the reference from the locking routines like
3309 3311           *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3310 3312           */
3311 3313  
3312 3314          vpvfsentry = vn_vfslocks_getlock(vfsp);
3313 3315          vn_vfslocks_rele(vpvfsentry);
3314 3316  
3315 3317          rwst_exit(&vpvfsentry->ve_lock);
3316 3318          vn_vfslocks_rele(vpvfsentry);
3317 3319  }
3318 3320  
3319 3321  /*
3320 3322   * Utility routine that allows a filesystem to construct its
3321 3323   * fsid in "the usual way" - by munging some underlying dev_t and
3322 3324   * the filesystem type number into the 64-bit fsid.  Note that
3323 3325   * this implicitly relies on dev_t persistence to make filesystem
3324 3326   * id's persistent.
3325 3327   *
3326 3328   * There's nothing to prevent an individual fs from constructing its
3327 3329   * fsid in a different way, and indeed they should.
3328 3330   *
3329 3331   * Since we want fsids to be 32-bit quantities (so that they can be
3330 3332   * exported identically by either 32-bit or 64-bit APIs, as well as
3331 3333   * the fact that fsid's are "known" to NFS), we compress the device
3332 3334   * number given down to 32-bits, and panic if that isn't possible.
3333 3335   */
3334 3336  void
3335 3337  vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3336 3338  {
3337 3339          if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3338 3340                  panic("device number too big for fsid!");
3339 3341          fsi->val[1] = val;
3340 3342  }
3341 3343  
3342 3344  int
3343 3345  vfs_lock_held(vfs_t *vfsp)
3344 3346  {
3345 3347          int held;
3346 3348          vn_vfslocks_entry_t *vpvfsentry;
3347 3349  
3348 3350          /*
3349 3351           * vfs_lock_held will mimic sema_held behaviour
3350 3352           * if panicstr is set. And these changes should remain
3351 3353           * for the patch changes as it is.
3352 3354           */
3353 3355          if (panicstr)
3354 3356                  return (1);
3355 3357  
3356 3358          vpvfsentry = vn_vfslocks_getlock(vfsp);
3357 3359          held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3358 3360  
3359 3361          vn_vfslocks_rele(vpvfsentry);
3360 3362          return (held);
3361 3363  }
3362 3364  
3363 3365  struct _kthread *
3364 3366  vfs_lock_owner(vfs_t *vfsp)
3365 3367  {
3366 3368          struct _kthread *owner;
3367 3369          vn_vfslocks_entry_t *vpvfsentry;
3368 3370  
3369 3371          /*
3370 3372           * vfs_wlock_held will mimic sema_held behaviour
3371 3373           * if panicstr is set. And these changes should remain
3372 3374           * for the patch changes as it is.
3373 3375           */
3374 3376          if (panicstr)
3375 3377                  return (NULL);
3376 3378  
3377 3379          vpvfsentry = vn_vfslocks_getlock(vfsp);
3378 3380          owner = rwst_owner(&vpvfsentry->ve_lock);
3379 3381  
3380 3382          vn_vfslocks_rele(vpvfsentry);
3381 3383          return (owner);
3382 3384  }
3383 3385  
3384 3386  /*
3385 3387   * vfs list locking.
3386 3388   *
3387 3389   * Rather than manipulate the vfslist lock directly, we abstract into lock
3388 3390   * and unlock routines to allow the locking implementation to be changed for
3389 3391   * clustering.
3390 3392   *
3391 3393   * Whenever the vfs list is modified through its hash links, the overall list
3392 3394   * lock must be obtained before locking the relevant hash bucket.  But to see
3393 3395   * whether a given vfs is on the list, it suffices to obtain the lock for the
3394 3396   * hash bucket without getting the overall list lock.  (See getvfs() below.)
3395 3397   */
3396 3398  
3397 3399  void
3398 3400  vfs_list_lock()
3399 3401  {
3400 3402          rw_enter(&vfslist, RW_WRITER);
3401 3403  }
3402 3404  
3403 3405  void
3404 3406  vfs_list_read_lock()
3405 3407  {
3406 3408          rw_enter(&vfslist, RW_READER);
3407 3409  }
3408 3410  
3409 3411  void
3410 3412  vfs_list_unlock()
3411 3413  {
3412 3414          rw_exit(&vfslist);
3413 3415  }
3414 3416  
3415 3417  /*
3416 3418   * Low level worker routines for adding entries to and removing entries from
3417 3419   * the vfs list.
3418 3420   */
3419 3421  
3420 3422  static void
3421 3423  vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3422 3424  {
3423 3425          int vhno;
3424 3426          struct vfs **hp;
3425 3427          dev_t dev;
3426 3428  
3427 3429          ASSERT(RW_WRITE_HELD(&vfslist));
3428 3430  
3429 3431          dev = expldev(vfsp->vfs_fsid.val[0]);
3430 3432          vhno = VFSHASH(getmajor(dev), getminor(dev));
3431 3433  
3432 3434          mutex_enter(&rvfs_list[vhno].rvfs_lock);
3433 3435  
3434 3436          /*
3435 3437           * Link into the hash table, inserting it at the end, so that LOFS
3436 3438           * with the same fsid as UFS (or other) file systems will not hide the
3437 3439           * UFS.
3438 3440           */
3439 3441          if (insert_at_head) {
3440 3442                  vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3441 3443                  rvfs_list[vhno].rvfs_head = vfsp;
3442 3444          } else {
3443 3445                  for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3444 3446                      hp = &(*hp)->vfs_hash)
3445 3447                          continue;
3446 3448                  /*
3447 3449                   * hp now contains the address of the pointer to update
3448 3450                   * to effect the insertion.
3449 3451                   */
3450 3452                  vfsp->vfs_hash = NULL;
3451 3453                  *hp = vfsp;
3452 3454          }
3453 3455  
3454 3456          rvfs_list[vhno].rvfs_len++;
3455 3457          mutex_exit(&rvfs_list[vhno].rvfs_lock);
3456 3458  }
3457 3459  
3458 3460  
3459 3461  static void
3460 3462  vfs_hash_remove(struct vfs *vfsp)
3461 3463  {
3462 3464          int vhno;
3463 3465          struct vfs *tvfsp;
3464 3466          dev_t dev;
3465 3467  
3466 3468          ASSERT(RW_WRITE_HELD(&vfslist));
3467 3469  
3468 3470          dev = expldev(vfsp->vfs_fsid.val[0]);
3469 3471          vhno = VFSHASH(getmajor(dev), getminor(dev));
3470 3472  
3471 3473          mutex_enter(&rvfs_list[vhno].rvfs_lock);
3472 3474  
3473 3475          /*
3474 3476           * Remove from hash.
3475 3477           */
3476 3478          if (rvfs_list[vhno].rvfs_head == vfsp) {
3477 3479                  rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3478 3480                  rvfs_list[vhno].rvfs_len--;
3479 3481                  goto foundit;
3480 3482          }
3481 3483          for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3482 3484              tvfsp = tvfsp->vfs_hash) {
3483 3485                  if (tvfsp->vfs_hash == vfsp) {
3484 3486                          tvfsp->vfs_hash = vfsp->vfs_hash;
3485 3487                          rvfs_list[vhno].rvfs_len--;
3486 3488                          goto foundit;
3487 3489                  }
3488 3490          }
3489 3491          cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3490 3492  
3491 3493  foundit:
3492 3494  
3493 3495          mutex_exit(&rvfs_list[vhno].rvfs_lock);
3494 3496  }
3495 3497  
3496 3498  
3497 3499  void
3498 3500  vfs_list_add(struct vfs *vfsp)
3499 3501  {
3500 3502          zone_t *zone;
3501 3503  
3502 3504          /*
3503 3505           * Typically, the vfs_t will have been created on behalf of the file
3504 3506           * system in vfs_init, where it will have been provided with a
3505 3507           * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3506 3508           * by an unbundled file system. We therefore check for such an example
3507 3509           * before stamping the vfs_t with its creation time for the benefit of
3508 3510           * mntfs.
3509 3511           */
3510 3512          if (vfsp->vfs_implp == NULL)
3511 3513                  vfsimpl_setup(vfsp);
3512 3514          vfs_mono_time(&vfsp->vfs_hrctime);
3513 3515  
3514 3516          /*
3515 3517           * The zone that owns the mount is the one that performed the mount.
3516 3518           * Note that this isn't necessarily the same as the zone mounted into.
3517 3519           * The corresponding zone_rele_ref() will be done when the vfs_t
3518 3520           * is being free'd.
3519 3521           */
3520 3522          vfsp->vfs_zone = curproc->p_zone;
3521 3523          zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3522 3524          zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3523 3525              ZONE_REF_VFS);
3524 3526  
3525 3527          /*
3526 3528           * Find the zone mounted into, and put this mount on its vfs list.
3527 3529           */
3528 3530          zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3529 3531          ASSERT(zone != NULL);
3530 3532          /*
3531 3533           * Special casing for the root vfs.  This structure is allocated
3532 3534           * statically and hooked onto rootvfs at link time.  During the
3533 3535           * vfs_mountroot call at system startup time, the root file system's
3534 3536           * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3535 3537           * as argument.  The code below must detect and handle this special
3536 3538           * case.  The only apparent justification for this special casing is
3537 3539           * to ensure that the root file system appears at the head of the
3538 3540           * list.
3539 3541           *
3540 3542           * XXX: I'm assuming that it's ok to do normal list locking when
3541 3543           *      adding the entry for the root file system (this used to be
3542 3544           *      done with no locks held).
3543 3545           */
3544 3546          vfs_list_lock();
3545 3547          /*
3546 3548           * Link into the vfs list proper.
3547 3549           */
3548 3550          if (vfsp == &root) {
3549 3551                  /*
3550 3552                   * Assert: This vfs is already on the list as its first entry.
3551 3553                   * Thus, there's nothing to do.
3552 3554                   */
3553 3555                  ASSERT(rootvfs == vfsp);
3554 3556                  /*
3555 3557                   * Add it to the head of the global zone's vfslist.
3556 3558                   */
3557 3559                  ASSERT(zone == global_zone);
3558 3560                  ASSERT(zone->zone_vfslist == NULL);
3559 3561                  zone->zone_vfslist = vfsp;
3560 3562          } else {
3561 3563                  /*
3562 3564                   * Link to end of list using vfs_prev (as rootvfs is now a
3563 3565                   * doubly linked circular list) so list is in mount order for
3564 3566                   * mnttab use.
3565 3567                   */
3566 3568                  rootvfs->vfs_prev->vfs_next = vfsp;
3567 3569                  vfsp->vfs_prev = rootvfs->vfs_prev;
3568 3570                  rootvfs->vfs_prev = vfsp;
3569 3571                  vfsp->vfs_next = rootvfs;
3570 3572  
3571 3573                  /*
3572 3574                   * Do it again for the zone-private list (which may be NULL).
3573 3575                   */
3574 3576                  if (zone->zone_vfslist == NULL) {
3575 3577                          ASSERT(zone != global_zone);
3576 3578                          zone->zone_vfslist = vfsp;
3577 3579                  } else {
3578 3580                          zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3579 3581                          vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3580 3582                          zone->zone_vfslist->vfs_zone_prev = vfsp;
3581 3583                          vfsp->vfs_zone_next = zone->zone_vfslist;
3582 3584                  }
3583 3585          }
3584 3586  
3585 3587          /*
3586 3588           * Link into the hash table, inserting it at the end, so that LOFS
3587 3589           * with the same fsid as UFS (or other) file systems will not hide
3588 3590           * the UFS.
3589 3591           */
3590 3592          vfs_hash_add(vfsp, 0);
3591 3593  
3592 3594          /*
3593 3595           * update the mnttab modification time
3594 3596           */
3595 3597          vfs_mnttab_modtimeupd();
3596 3598          vfs_list_unlock();
3597 3599          zone_rele(zone);
3598 3600  }
3599 3601  
3600 3602  void
3601 3603  vfs_list_remove(struct vfs *vfsp)
3602 3604  {
3603 3605          zone_t *zone;
3604 3606  
3605 3607          zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3606 3608          ASSERT(zone != NULL);
3607 3609          /*
3608 3610           * Callers are responsible for preventing attempts to unmount the
3609 3611           * root.
3610 3612           */
3611 3613          ASSERT(vfsp != rootvfs);
3612 3614  
3613 3615          vfs_list_lock();
3614 3616  
3615 3617          /*
3616 3618           * Remove from hash.
3617 3619           */
3618 3620          vfs_hash_remove(vfsp);
3619 3621  
3620 3622          /*
3621 3623           * Remove from vfs list.
3622 3624           */
3623 3625          vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3624 3626          vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3625 3627          vfsp->vfs_next = vfsp->vfs_prev = NULL;
3626 3628  
3627 3629          /*
3628 3630           * Remove from zone-specific vfs list.
3629 3631           */
3630 3632          if (zone->zone_vfslist == vfsp)
3631 3633                  zone->zone_vfslist = vfsp->vfs_zone_next;
3632 3634  
3633 3635          if (vfsp->vfs_zone_next == vfsp) {
3634 3636                  ASSERT(vfsp->vfs_zone_prev == vfsp);
3635 3637                  ASSERT(zone->zone_vfslist == vfsp);
3636 3638                  zone->zone_vfslist = NULL;
3637 3639          }
3638 3640  
3639 3641          vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3640 3642          vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3641 3643          vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3642 3644  
3643 3645          /*
3644 3646           * update the mnttab modification time
3645 3647           */
3646 3648          vfs_mnttab_modtimeupd();
3647 3649          vfs_list_unlock();
3648 3650          zone_rele(zone);
3649 3651  }
3650 3652  
3651 3653  struct vfs *
3652 3654  getvfs(fsid_t *fsid)
3653 3655  {
3654 3656          struct vfs *vfsp;
3655 3657          int val0 = fsid->val[0];
3656 3658          int val1 = fsid->val[1];
3657 3659          dev_t dev = expldev(val0);
3658 3660          int vhno = VFSHASH(getmajor(dev), getminor(dev));
3659 3661          kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3660 3662  
3661 3663          mutex_enter(hmp);
3662 3664          for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3663 3665                  if (vfsp->vfs_fsid.val[0] == val0 &&
3664 3666                      vfsp->vfs_fsid.val[1] == val1) {
3665 3667                          VFS_HOLD(vfsp);
3666 3668                          mutex_exit(hmp);
3667 3669                          return (vfsp);
3668 3670                  }
3669 3671          }
3670 3672          mutex_exit(hmp);
3671 3673          return (NULL);
3672 3674  }
3673 3675  
3674 3676  /*
3675 3677   * Search the vfs mount in progress list for a specified device/vfs entry.
3676 3678   * Returns 0 if the first entry in the list that the device matches has the
3677 3679   * given vfs pointer as well.  If the device matches but a different vfs
3678 3680   * pointer is encountered in the list before the given vfs pointer then
3679 3681   * a 1 is returned.
3680 3682   */
3681 3683  
3682 3684  int
3683 3685  vfs_devmounting(dev_t dev, struct vfs *vfsp)
3684 3686  {
3685 3687          int retval = 0;
3686 3688          struct ipmnt *mipp;
3687 3689  
3688 3690          mutex_enter(&vfs_miplist_mutex);
3689 3691          for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3690 3692                  if (mipp->mip_dev == dev) {
3691 3693                          if (mipp->mip_vfsp != vfsp)
3692 3694                                  retval = 1;
3693 3695                          break;
3694 3696                  }
3695 3697          }
3696 3698          mutex_exit(&vfs_miplist_mutex);
3697 3699          return (retval);
3698 3700  }
3699 3701  
3700 3702  /*
3701 3703   * Search the vfs list for a specified device.  Returns 1, if entry is found
3702 3704   * or 0 if no suitable entry is found.
3703 3705   */
3704 3706  
3705 3707  int
3706 3708  vfs_devismounted(dev_t dev)
3707 3709  {
3708 3710          struct vfs *vfsp;
3709 3711          int found;
3710 3712  
3711 3713          vfs_list_read_lock();
3712 3714          vfsp = rootvfs;
3713 3715          found = 0;
3714 3716          do {
3715 3717                  if (vfsp->vfs_dev == dev) {
3716 3718                          found = 1;
3717 3719                          break;
3718 3720                  }
3719 3721                  vfsp = vfsp->vfs_next;
3720 3722          } while (vfsp != rootvfs);
3721 3723  
3722 3724          vfs_list_unlock();
3723 3725          return (found);
3724 3726  }
3725 3727  
3726 3728  /*
3727 3729   * Search the vfs list for a specified device.  Returns a pointer to it
3728 3730   * or NULL if no suitable entry is found. The caller of this routine
3729 3731   * is responsible for releasing the returned vfs pointer.
3730 3732   */
3731 3733  struct vfs *
3732 3734  vfs_dev2vfsp(dev_t dev)
3733 3735  {
3734 3736          struct vfs *vfsp;
3735 3737          int found;
3736 3738  
3737 3739          vfs_list_read_lock();
3738 3740          vfsp = rootvfs;
3739 3741          found = 0;
3740 3742          do {
3741 3743                  /*
3742 3744                   * The following could be made more efficient by making
3743 3745                   * the entire loop use vfs_zone_next if the call is from
3744 3746                   * a zone.  The only callers, however, ustat(2) and
3745 3747                   * umount2(2), don't seem to justify the added
3746 3748                   * complexity at present.
3747 3749                   */
3748 3750                  if (vfsp->vfs_dev == dev &&
3749 3751                      ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3750 3752                      curproc->p_zone)) {
3751 3753                          VFS_HOLD(vfsp);
3752 3754                          found = 1;
3753 3755                          break;
3754 3756                  }
3755 3757                  vfsp = vfsp->vfs_next;
3756 3758          } while (vfsp != rootvfs);
3757 3759          vfs_list_unlock();
3758 3760          return (found ? vfsp: NULL);
3759 3761  }
3760 3762  
3761 3763  /*
3762 3764   * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3763 3765   * or NULL if no suitable entry is found. The caller of this routine
3764 3766   * is responsible for releasing the returned vfs pointer.
3765 3767   *
3766 3768   * Note that if multiple mntpoints match, the last one matching is
3767 3769   * returned in an attempt to return the "top" mount when overlay
3768 3770   * mounts are covering the same mount point.  This is accomplished by starting
3769 3771   * at the end of the list and working our way backwards, stopping at the first
3770 3772   * matching mount.
3771 3773   */
3772 3774  struct vfs *
3773 3775  vfs_mntpoint2vfsp(const char *mp)
3774 3776  {
3775 3777          struct vfs *vfsp;
3776 3778          struct vfs *retvfsp = NULL;
3777 3779          zone_t *zone = curproc->p_zone;
3778 3780          struct vfs *list;
3779 3781  
3780 3782          vfs_list_read_lock();
3781 3783          if (getzoneid() == GLOBAL_ZONEID) {
3782 3784                  /*
3783 3785                   * The global zone may see filesystems in any zone.
3784 3786                   */
3785 3787                  vfsp = rootvfs->vfs_prev;
3786 3788                  do {
3787 3789                          if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3788 3790                                  retvfsp = vfsp;
3789 3791                                  break;
3790 3792                          }
3791 3793                          vfsp = vfsp->vfs_prev;
3792 3794                  } while (vfsp != rootvfs->vfs_prev);
3793 3795          } else if ((list = zone->zone_vfslist) != NULL) {
3794 3796                  const char *mntpt;
3795 3797  
3796 3798                  vfsp = list->vfs_zone_prev;
3797 3799                  do {
3798 3800                          mntpt = refstr_value(vfsp->vfs_mntpt);
3799 3801                          mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3800 3802                          if (strcmp(mntpt, mp) == 0) {
3801 3803                                  retvfsp = vfsp;
3802 3804                                  break;
3803 3805                          }
3804 3806                          vfsp = vfsp->vfs_zone_prev;
3805 3807                  } while (vfsp != list->vfs_zone_prev);
3806 3808          }
3807 3809          if (retvfsp)
3808 3810                  VFS_HOLD(retvfsp);
3809 3811          vfs_list_unlock();
3810 3812          return (retvfsp);
3811 3813  }
3812 3814  
3813 3815  /*
3814 3816   * Search the vfs list for a specified vfsops.
3815 3817   * if vfs entry is found then return 1, else 0.
3816 3818   */
3817 3819  int
3818 3820  vfs_opsinuse(vfsops_t *ops)
3819 3821  {
3820 3822          struct vfs *vfsp;
3821 3823          int found;
3822 3824  
3823 3825          vfs_list_read_lock();
3824 3826          vfsp = rootvfs;
3825 3827          found = 0;
3826 3828          do {
3827 3829                  if (vfs_getops(vfsp) == ops) {
3828 3830                          found = 1;
3829 3831                          break;
3830 3832                  }
3831 3833                  vfsp = vfsp->vfs_next;
3832 3834          } while (vfsp != rootvfs);
3833 3835          vfs_list_unlock();
3834 3836          return (found);
3835 3837  }
3836 3838  
3837 3839  /*
3838 3840   * Allocate an entry in vfssw for a file system type
3839 3841   */
3840 3842  struct vfssw *
3841 3843  allocate_vfssw(const char *type)
3842 3844  {
3843 3845          struct vfssw *vswp;
3844 3846  
3845 3847          if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3846 3848                  /*
3847 3849                   * The vfssw table uses the empty string to identify an
3848 3850                   * available entry; we cannot add any type which has
3849 3851                   * a leading NUL. The string length is limited to
3850 3852                   * the size of the st_fstype array in struct stat.
3851 3853                   */
3852 3854                  return (NULL);
3853 3855          }
3854 3856  
3855 3857          ASSERT(VFSSW_WRITE_LOCKED());
3856 3858          for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3857 3859                  if (!ALLOCATED_VFSSW(vswp)) {
3858 3860                          vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3859 3861                          (void) strcpy(vswp->vsw_name, type);
3860 3862                          ASSERT(vswp->vsw_count == 0);
3861 3863                          vswp->vsw_count = 1;
3862 3864                          mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3863 3865                          return (vswp);
3864 3866                  }
3865 3867          return (NULL);
3866 3868  }
3867 3869  
/*
 * Translate a vfs type name into the name of the module implementing it.
 * "proc" maps to "procfs", "fd" maps to "fdfs", and any name beginning
 * with "nfs" maps to "nfs"; every other name is returned unchanged (the
 * very pointer that was passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
        if (strcmp(vfstype, "proc") == 0)
                return ("procfs");

        if (strcmp(vfstype, "fd") == 0)
                return ("fdfs");

        /* Prefix match: all "nfs*" type names share one module. */
        if (strncmp(vfstype, "nfs", 3) == 0)
                return ("nfs");

        return (vfstype);
}
3885 3887  
3886 3888  /*
3887 3889   * Find a vfssw entry given a file system type name.
3888 3890   * Try to autoload the filesystem if it's not found.
3889 3891   * If it's installed, return the vfssw locked to prevent unloading.
3890 3892   */
3891 3893  struct vfssw *
3892 3894  vfs_getvfssw(const char *type)
3893 3895  {
3894 3896          struct vfssw *vswp;
3895 3897          const char *modname;
3896 3898  
3897 3899          RLOCK_VFSSW();
3898 3900          vswp = vfs_getvfsswbyname(type);
3899 3901          modname = vfs_to_modname(type);
3900 3902  
3901 3903          if (rootdir == NULL) {
3902 3904                  /*
3903 3905                   * If we haven't yet loaded the root file system, then our
3904 3906                   * _init won't be called until later. Allocate vfssw entry,
3905 3907                   * because mod_installfs won't be called.
3906 3908                   */
3907 3909                  if (vswp == NULL) {
3908 3910                          RUNLOCK_VFSSW();
3909 3911                          WLOCK_VFSSW();
3910 3912                          if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3911 3913                                  if ((vswp = allocate_vfssw(type)) == NULL) {
3912 3914                                          WUNLOCK_VFSSW();
3913 3915                                          return (NULL);
3914 3916                                  }
3915 3917                          }
3916 3918                          WUNLOCK_VFSSW();
3917 3919                          RLOCK_VFSSW();
3918 3920                  }
3919 3921                  if (!VFS_INSTALLED(vswp)) {
3920 3922                          RUNLOCK_VFSSW();
3921 3923                          (void) modloadonly("fs", modname);
3922 3924                  } else
3923 3925                          RUNLOCK_VFSSW();
3924 3926                  return (vswp);
3925 3927          }
3926 3928  
3927 3929          /*
3928 3930           * Try to load the filesystem.  Before calling modload(), we drop
3929 3931           * our lock on the VFS switch table, and pick it up after the
3930 3932           * module is loaded.  However, there is a potential race:  the
3931 3933           * module could be unloaded after the call to modload() completes
3932 3934           * but before we pick up the lock and drive on.  Therefore,
3933 3935           * we keep reloading the module until we've loaded the module
3934 3936           * _and_ we have the lock on the VFS switch table.
3935 3937           */
3936 3938          while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3937 3939                  RUNLOCK_VFSSW();
3938 3940                  if (modload("fs", modname) == -1)
3939 3941                          return (NULL);
3940 3942                  RLOCK_VFSSW();
3941 3943                  if (vswp == NULL)
3942 3944                          if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3943 3945                                  break;
3944 3946          }
3945 3947          RUNLOCK_VFSSW();
3946 3948  
3947 3949          return (vswp);
3948 3950  }
3949 3951  
3950 3952  /*
3951 3953   * Find a vfssw entry given a file system type name.
3952 3954   */
3953 3955  struct vfssw *
3954 3956  vfs_getvfsswbyname(const char *type)
3955 3957  {
3956 3958          struct vfssw *vswp;
3957 3959  
3958 3960          ASSERT(VFSSW_LOCKED());
3959 3961          if (type == NULL || *type == '\0')
3960 3962                  return (NULL);
3961 3963  
3962 3964          for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3963 3965                  if (strcmp(type, vswp->vsw_name) == 0) {
3964 3966                          vfs_refvfssw(vswp);
3965 3967                          return (vswp);
3966 3968                  }
3967 3969          }
3968 3970  
3969 3971          return (NULL);
3970 3972  }
3971 3973  
3972 3974  /*
3973 3975   * Find a vfssw entry given a set of vfsops.
3974 3976   */
3975 3977  struct vfssw *
3976 3978  vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3977 3979  {
3978 3980          struct vfssw *vswp;
3979 3981  
3980 3982          RLOCK_VFSSW();
3981 3983          for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3982 3984                  if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3983 3985                          vfs_refvfssw(vswp);
3984 3986                          RUNLOCK_VFSSW();
3985 3987                          return (vswp);
3986 3988                  }
3987 3989          }
3988 3990          RUNLOCK_VFSSW();
3989 3991  
3990 3992          return (NULL);
3991 3993  }
3992 3994  
3993 3995  /*
3994 3996   * Reference a vfssw entry.
3995 3997   */
3996 3998  void
3997 3999  vfs_refvfssw(struct vfssw *vswp)
3998 4000  {
3999 4001  
4000 4002          mutex_enter(&vswp->vsw_lock);
4001 4003          vswp->vsw_count++;
4002 4004          mutex_exit(&vswp->vsw_lock);
4003 4005  }
4004 4006  
4005 4007  /*
4006 4008   * Unreference a vfssw entry.
4007 4009   */
4008 4010  void
4009 4011  vfs_unrefvfssw(struct vfssw *vswp)
4010 4012  {
4011 4013  
4012 4014          mutex_enter(&vswp->vsw_lock);
4013 4015          vswp->vsw_count--;
4014 4016          mutex_exit(&vswp->vsw_lock);
4015 4017  }
4016 4018  
4017 4019  static int sync_retries = 20;   /* number of retries when not making progress */
4018 4020  static int sync_triesleft;      /* portion of sync_retries remaining */
4019 4021  
4020 4022  static pgcnt_t old_pgcnt, new_pgcnt;
4021 4023  static int new_bufcnt, old_bufcnt;
4022 4024  
4023 4025  /*
4024 4026   * Sync all of the mounted filesystems, and then wait for the actual i/o to
4025 4027   * complete.  We wait by counting the number of dirty pages and buffers,
4026 4028   * pushing them out using bio_busy() and page_busy(), and then counting again.
4027 4029   * This routine is used during the uadmin A_SHUTDOWN code.  It should only
4028 4030   * be used after some higher-level mechanism has quiesced the system so that
4029 4031   * new writes are not being initiated while we are waiting for completion.
4030 4032   *
4031 4033   * To ensure finite running time, our algorithm uses sync_triesleft (a progress
4032 4034   * counter used by the vfs_syncall() loop below). It is declared above so
4033 4035   * it can be found easily in the debugger.
4034 4036   *
4035 4037   * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4036 4038   * sync_retries consecutive calls to bio_busy() and page_busy() without
4037 4039   * decreasing either the number of dirty buffers or dirty pages below the
4038 4040   * lowest count we have seen so far, we give up and return from vfs_syncall().
4039 4041   *
4040 4042   * Each loop iteration ends with a call to delay() one second to allow time for
4041 4043   * i/o completion and to permit the user time to read our progress messages.
4042 4044   */
4043 4045  void
4044 4046  vfs_syncall(void)
4045 4047  {
4046 4048          if (rootdir == NULL && !modrootloaded)
4047 4049                  return; /* no filesystems have been loaded yet */
4048 4050  
4049 4051          printf("syncing file systems...");
4050 4052          sync();
4051 4053  
4052 4054          sync_triesleft = sync_retries;
4053 4055  
4054 4056          old_bufcnt = new_bufcnt = INT_MAX;
4055 4057          old_pgcnt = new_pgcnt = ULONG_MAX;
4056 4058  
4057 4059          while (sync_triesleft > 0) {
4058 4060                  old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4059 4061                  old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4060 4062  
4061 4063                  new_bufcnt = bio_busy(B_TRUE);
4062 4064                  new_pgcnt = page_busy(B_TRUE);
4063 4065  
4064 4066                  if (new_bufcnt == 0 && new_pgcnt == 0)
4065 4067                          break;
4066 4068  
4067 4069                  if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4068 4070                          sync_triesleft = sync_retries;
4069 4071                  else
4070 4072                          sync_triesleft--;
4071 4073  
4072 4074                  if (new_bufcnt)
4073 4075                          printf(" [%d]", new_bufcnt);
4074 4076                  if (new_pgcnt)
4075 4077                          printf(" %lu", new_pgcnt);
4076 4078  
4077 4079                  delay(hz);
4078 4080          }
4079 4081  
4080 4082          if (new_bufcnt != 0 || new_pgcnt != 0)
4081 4083                  printf(" done (not all i/o completed)\n");
4082 4084          else
4083 4085                  printf(" done\n");
4084 4086  
4085 4087          delay(hz);
4086 4088  }
4087 4089  
4088 4090  /*
4089 4091   * Map VFS flags to statvfs flags.  These shouldn't really be separate
4090 4092   * flags at all.
4091 4093   */
4092 4094  uint_t
4093 4095  vf_to_stf(uint_t vf)
4094 4096  {
4095 4097          uint_t stf = 0;
4096 4098  
4097 4099          if (vf & VFS_RDONLY)
4098 4100                  stf |= ST_RDONLY;
4099 4101          if (vf & VFS_NOSETUID)
4100 4102                  stf |= ST_NOSUID;
4101 4103          if (vf & VFS_NOTRUNC)
4102 4104                  stf |= ST_NOTRUNC;
4103 4105  
4104 4106          return (stf);
4105 4107  }
4106 4108  
4107 4109  /*
4108 4110   * Entries for (illegal) fstype 0.
4109 4111   */
4110 4112  /* ARGSUSED */
4111 4113  int
4112 4114  vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4113 4115  {
4114 4116          cmn_err(CE_PANIC, "stray vfs operation");
4115 4117          return (0);
4116 4118  }
4117 4119  
4118 4120  /*
4119 4121   * Entries for (illegal) fstype 0.
4120 4122   */
4121 4123  int
4122 4124  vfsstray(void)
4123 4125  {
4124 4126          cmn_err(CE_PANIC, "stray vfs operation");
4125 4127          return (0);
4126 4128  }
4127 4129  
/*
 * Operation that unconditionally fails with EIO.  It exists to support
 * forced UFS unmount and its interaction with LOFS, but any filesystem
 * may use it.  See bug 1203132.
 */
int
vfs_EIO(void)
{
        const int err = EIO;

        return (err);
}
4138 4140  
/*
 * Sync flavour of vfs_EIO(): ignores its arguments and fails with EIO.
 * It has to be a separate function because the compiler warns when ANSI
 * and old-style prototypes are mixed for an operation that takes a
 * "short" argument.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
        const int err = EIO;

        return (err);
}
4150 4152  
4151 4153  vfs_t EIO_vfs;
4152 4154  vfsops_t *EIO_vfsops;
4153 4155  
4154 4156  /*
4155 4157   * Called from startup() to initialize all loaded vfs's
4156 4158   */
4157 4159  void
4158 4160  vfsinit(void)
4159 4161  {
4160 4162          struct vfssw *vswp;
4161 4163          int error;
4162 4164          extern int vopstats_enabled;
4163 4165          extern void vopstats_startup();
4164 4166  
4165 4167          static const fs_operation_def_t EIO_vfsops_template[] = {
4166 4168                  VFSNAME_MOUNT,          { .error = vfs_EIO },
4167 4169                  VFSNAME_UNMOUNT,        { .error = vfs_EIO },
4168 4170                  VFSNAME_ROOT,           { .error = vfs_EIO },
4169 4171                  VFSNAME_STATVFS,        { .error = vfs_EIO },
4170 4172                  VFSNAME_SYNC,           { .vfs_sync = vfs_EIO_sync },
4171 4173                  VFSNAME_VGET,           { .error = vfs_EIO },
4172 4174                  VFSNAME_MOUNTROOT,      { .error = vfs_EIO },
4173 4175                  VFSNAME_FREEVFS,        { .error = vfs_EIO },
4174 4176                  VFSNAME_VNSTATE,        { .error = vfs_EIO },
4175 4177                  NULL, NULL
4176 4178          };
4177 4179  
4178 4180          static const fs_operation_def_t stray_vfsops_template[] = {
4179 4181                  VFSNAME_MOUNT,          { .error = vfsstray },
4180 4182                  VFSNAME_UNMOUNT,        { .error = vfsstray },
4181 4183                  VFSNAME_ROOT,           { .error = vfsstray },
4182 4184                  VFSNAME_STATVFS,        { .error = vfsstray },
4183 4185                  VFSNAME_SYNC,           { .vfs_sync = vfsstray_sync },
4184 4186                  VFSNAME_VGET,           { .error = vfsstray },
4185 4187                  VFSNAME_MOUNTROOT,      { .error = vfsstray },
4186 4188                  VFSNAME_FREEVFS,        { .error = vfsstray },
4187 4189                  VFSNAME_VNSTATE,        { .error = vfsstray },
4188 4190                  NULL, NULL
4189 4191          };
4190 4192  
4191 4193          /* Create vfs cache */
4192 4194          vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4193 4195              sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4194 4196  
4195 4197          /* Initialize the vnode cache (file systems may use it during init). */
4196 4198          vn_create_cache();
4197 4199  
4198 4200          /* Setup event monitor framework */
4199 4201          fem_init();
4200 4202  
4201 4203          /* Initialize the dummy stray file system type. */
4202 4204          error = vfs_setfsops(0, stray_vfsops_template, NULL);
4203 4205  
4204 4206          /* Initialize the dummy EIO file system. */
4205 4207          error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4206 4208          if (error != 0) {
4207 4209                  cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4208 4210                  /* Shouldn't happen, but not bad enough to panic */
4209 4211          }
4210 4212  
4211 4213          VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4212 4214  
4213 4215          /*
4214 4216           * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4215 4217           * on this vfs can immediately notice it's invalid.
4216 4218           */
4217 4219          EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4218 4220  
4219 4221          /*
4220 4222           * Call the init routines of non-loadable filesystems only.
4221 4223           * Filesystems which are loaded as separate modules will be
4222 4224           * initialized by the module loading code instead.
4223 4225           */
4224 4226  
4225 4227          for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4226 4228                  RLOCK_VFSSW();
4227 4229                  if (vswp->vsw_init != NULL)
4228 4230                          (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4229 4231                  RUNLOCK_VFSSW();
4230 4232          }
4231 4233  
4232 4234          vopstats_startup();
4233 4235  
4234 4236          if (vopstats_enabled) {
4235 4237                  /* EIO_vfs can collect stats, but we don't retrieve them */
4236 4238                  initialize_vopstats(&EIO_vfs.vfs_vopstats);
4237 4239                  EIO_vfs.vfs_fstypevsp = NULL;
4238 4240                  EIO_vfs.vfs_vskap = NULL;
4239 4241                  EIO_vfs.vfs_flag |= VFS_STATS;
4240 4242          }
4241 4243  
4242 4244          xattr_init();
4243 4245  
4244 4246          reparse_point_init();
4245 4247  }
4246 4248  
4247 4249  vfs_t *
4248 4250  vfs_alloc(int kmflag)
4249 4251  {
4250 4252          vfs_t *vfsp;
4251 4253  
4252 4254          vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4253 4255  
4254 4256          /*
4255 4257           * Do the simplest initialization here.
4256 4258           * Everything else gets done in vfs_init()
4257 4259           */
4258 4260          bzero(vfsp, sizeof (vfs_t));
4259 4261          return (vfsp);
4260 4262  }
4261 4263  
4262 4264  void
4263 4265  vfs_free(vfs_t *vfsp)
4264 4266  {
4265 4267          /*
4266 4268           * One would be tempted to assert that "vfsp->vfs_count == 0".
4267 4269           * The problem is that this gets called out of domount() with
4268 4270           * a partially initialized vfs and a vfs_count of 1.  This is
4269 4271           * also called from vfs_rele() with a vfs_count of 0.  We can't
4270 4272           * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4271 4273           * returned.  This is because VFS_MOUNT() fully initializes the
4272 4274           * vfs structure and its associated data.  VFS_RELE() will call
4273 4275           * VFS_FREEVFS() which may panic the system if the data structures
4274 4276           * aren't fully initialized from a successful VFS_MOUNT()).
4275 4277           */
4276 4278  
4277 4279          /* If FEM was in use, make sure everything gets cleaned up */
4278 4280          if (vfsp->vfs_femhead) {
4279 4281                  ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4280 4282                  mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4281 4283                  kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4282 4284                  vfsp->vfs_femhead = NULL;
4283 4285          }
4284 4286  
4285 4287          if (vfsp->vfs_implp)
4286 4288                  vfsimpl_teardown(vfsp);
4287 4289          sema_destroy(&vfsp->vfs_reflock);
4288 4290          kmem_cache_free(vfs_cache, vfsp);
4289 4291  }
4290 4292  
4291 4293  /*
4292 4294   * Increments the vfs reference count by one atomically.
4293 4295   */
4294 4296  void
4295 4297  vfs_hold(vfs_t *vfsp)
4296 4298  {
4297 4299          atomic_inc_32(&vfsp->vfs_count);
4298 4300          ASSERT(vfsp->vfs_count != 0);
4299 4301  }
4300 4302  
4301 4303  /*
4302 4304   * Decrements the vfs reference count by one atomically. When
4303 4305   * vfs reference count becomes zero, it calls the file system
4304 4306   * specific vfs_freevfs() to free up the resources.
4305 4307   */
4306 4308  void
4307 4309  vfs_rele(vfs_t *vfsp)
4308 4310  {
4309 4311          ASSERT(vfsp->vfs_count != 0);
4310 4312          if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4311 4313                  VFS_FREEVFS(vfsp);
4312 4314                  lofi_remove(vfsp);
4313 4315                  if (vfsp->vfs_zone)
4314 4316                          zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4315 4317                              ZONE_REF_VFS);
4316 4318                  vfs_freemnttab(vfsp);
4317 4319                  vfs_free(vfsp);
4318 4320          }
4319 4321  }
4320 4322  
4321 4323  /*
4322 4324   * Generic operations vector support.
4323 4325   *
4324 4326   * This is used to build operations vectors for both the vfs and vnode.
4325 4327   * It's normally called only when a file system is loaded.
4326 4328   *
4327 4329   * There are many possible algorithms for this, including the following:
4328 4330   *
4329 4331   *   (1) scan the list of known operations; for each, see if the file system
4330 4332   *       includes an entry for it, and fill it in as appropriate.
4331 4333   *
4332 4334   *   (2) set up defaults for all known operations.  scan the list of ops
4333 4335   *       supplied by the file system; for each which is both supplied and
4334 4336   *       known, fill it in.
4335 4337   *
4336 4338   *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4337 4339   *       in entries as we go.
4338 4340   *
4339 4341   * we choose (1) for simplicity, and because performance isn't critical here.
4340 4342   * note that (2) could be sped up using a precomputed hash table on known ops.
4341 4343   * (3) could be faster than either, but only if the lists were very large or
4342 4344   * supplied in sorted order.
4343 4345   *
4344 4346   */
4345 4347  
4346 4348  int
4347 4349  fs_build_vector(void *vector, int *unused_ops,
4348 4350      const fs_operation_trans_def_t *translation,
4349 4351      const fs_operation_def_t *operations)
4350 4352  {
4351 4353          int i, num_trans, num_ops, used;
4352 4354  
4353 4355          /*
4354 4356           * Count the number of translations and the number of supplied
4355 4357           * operations.
4356 4358           */
4357 4359  
4358 4360          {
4359 4361                  const fs_operation_trans_def_t *p;
4360 4362  
4361 4363                  for (num_trans = 0, p = translation;
4362 4364                      p->name != NULL;
4363 4365                      num_trans++, p++)
4364 4366                          ;
4365 4367          }
4366 4368  
4367 4369          {
4368 4370                  const fs_operation_def_t *p;
4369 4371  
4370 4372                  for (num_ops = 0, p = operations;
4371 4373                      p->name != NULL;
4372 4374                      num_ops++, p++)
4373 4375                          ;
4374 4376          }
4375 4377  
4376 4378          /* Walk through each operation known to our caller.  There will be */
4377 4379          /* one entry in the supplied "translation table" for each. */
4378 4380  
4379 4381          used = 0;
4380 4382  
4381 4383          for (i = 0; i < num_trans; i++) {
4382 4384                  int j, found;
4383 4385                  char *curname;
4384 4386                  fs_generic_func_p result;
4385 4387                  fs_generic_func_p *location;
4386 4388  
4387 4389                  curname = translation[i].name;
4388 4390  
4389 4391                  /* Look for a matching operation in the list supplied by the */
4390 4392                  /* file system. */
4391 4393  
4392 4394                  found = 0;
4393 4395  
4394 4396                  for (j = 0; j < num_ops; j++) {
4395 4397                          if (strcmp(operations[j].name, curname) == 0) {
4396 4398                                  used++;
4397 4399                                  found = 1;
4398 4400                                  break;
4399 4401                          }
4400 4402                  }
4401 4403  
4402 4404                  /*
4403 4405                   * If the file system is using a "placeholder" for default
4404 4406                   * or error functions, grab the appropriate function out of
4405 4407                   * the translation table.  If the file system didn't supply
4406 4408                   * this operation at all, use the default function.
4407 4409                   */
4408 4410  
4409 4411                  if (found) {
4410 4412                          result = operations[j].func.fs_generic;
4411 4413                          if (result == fs_default) {
4412 4414                                  result = translation[i].defaultFunc;
4413 4415                          } else if (result == fs_error) {
4414 4416                                  result = translation[i].errorFunc;
4415 4417                          } else if (result == NULL) {
4416 4418                                  /* Null values are PROHIBITED */
4417 4419                                  return (EINVAL);
4418 4420                          }
4419 4421                  } else {
4420 4422                          result = translation[i].defaultFunc;
4421 4423                  }
4422 4424  
4423 4425                  /* Now store the function into the operations vector. */
4424 4426  
4425 4427                  location = (fs_generic_func_p *)
4426 4428                      (((char *)vector) + translation[i].offset);
4427 4429  
4428 4430                  *location = result;
4429 4431          }
4430 4432  
4431 4433          *unused_ops = num_ops - used;
4432 4434  
4433 4435          return (0);
4434 4436  }
4435 4437  
4436 4438  /* Placeholder functions, should never be called. */
4437 4439  
4438 4440  int
4439 4441  fs_error(void)
4440 4442  {
4441 4443          cmn_err(CE_PANIC, "fs_error called");
4442 4444          return (0);
4443 4445  }
4444 4446  
4445 4447  int
4446 4448  fs_default(void)
4447 4449  {
4448 4450          cmn_err(CE_PANIC, "fs_default called");
4449 4451          return (0);
4450 4452  }
4451 4453  
4452 4454  #ifdef __sparc
4453 4455  
4454 4456  /*
4455 4457   * Part of the implementation of booting off a mirrored root
4456 4458   * involves a change of dev_t for the root device.  To
4457 4459   * accomplish this, first remove the existing hash table
4458 4460   * entry for the root device, convert to the new dev_t,
4459 4461   * then re-insert in the hash table at the head of the list.
            *
            *      vfsp    - the vfs whose device is being changed
            *      ndev    - the new dev_t, stored in vfsp->vfs_dev
            *      fstype  - passed with ndev to vfs_make_fsid() to rebuild vfs_fsid
            *
            * The whole remove/update/re-insert sequence runs with the vfs list
            * lock held.
4460 4462   */
4461 4463  void
4462 4464  vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4463 4465  {
4464 4466          vfs_list_lock();
4465 4467  
                   /* Drop the entry that is hashed under the old dev_t. */
4466 4468          vfs_hash_remove(vfsp);
4467 4469  
                   /* Switch to the new device and regenerate the fsid from it. */
4468 4470          vfsp->vfs_dev = ndev;
4469 4471          vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4470 4472  
                   /* NOTE(review): the 1 presumably requests insertion at the head. */
4471 4473          vfs_hash_add(vfsp, 1);
4472 4474  
4473 4475          vfs_list_unlock();
4474 4476  }
4475 4477  
4476 4478  #else /* x86 NEWBOOT */
4477 4479  
4478 4480  #if defined(__x86)
4479 4481  extern int hvmboot_rootconf();
4480 4482  #endif /* __x86 */
4481 4483  
4482 4484  extern ib_boot_prop_t *iscsiboot_prop;
4483 4485  
           /*
            * Configure and mount the root file system.
            *
            * The sequence is:
            *   1. getrootfs() supplies the file system type and module names.
            *   2. hvmboot_rootconf() (x86 only) and clboot_rootconf() run; a non-zero
            *      result from either is returned to the caller unchanged.
            *   3. The file system module is loaded; failure to load it panics.
            *   4. The vfssw entry for the type is looked up (ENXIO if absent) and
            *      rootvfs is initialised, held and flagged read-only.
            *   5. For an NFS or iSCSI boot the network is plumbed with strplumb();
            *      for an iSCSI boot the iscsi driver is also loaded and attached.
            *   6. VFS_MOUNTROOT(rootvfs, ROOT_INIT) mounts the root and rootdev is
            *      set from rootvfs->vfs_dev.
            *
            * Returns 0 on success or an errno value on failure.
            *
            * NOTE(review): the error returns between the vfssw lookup and
            * VFS_MOUNTROOT() do not call vfs_unrefvfssw(vsw) the way the normal
            * path does -- confirm whether that reference is leaked on those paths.
            */
4484 4486  int
4485 4487  rootconf()
4486 4488  {
4487 4489          int error;
4488 4490          struct vfssw *vsw;
4489 4491          extern void pm_init();
4490 4492          char *fstyp, *fsmod;
                   /* -1 means "network not plumbed"; gates the iSCSI setup below. */
4491 4493          int ret = -1;
4492 4494  
4493 4495          getrootfs(&fstyp, &fsmod);
4494 4496  
4495 4497  #if defined(__x86)
4496 4498          /*
4497 4499           * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4498 4500           * which lives in /platform/i86hvm, and hence is only available when
4499 4501           * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4500 4502           * is not available then the modstub for this function will return 0.
4501 4503           * If the hvm_bootstrap misc module is available it will be loaded
4502 4504           * and hvmboot_rootconf() will be invoked.
4503 4505           */
4504 4506          if (error = hvmboot_rootconf())
4505 4507                  return (error);
4506 4508  #endif /* __x86 */
4507 4509  
4508 4510          if (error = clboot_rootconf())
4509 4511                  return (error);
4510 4512  
4511 4513          if (modload("fs", fsmod) == -1)
4512 4514                  panic("Cannot _init %s module", fsmod);
4513 4515  
                   /* The vfssw lock is only held across the lookup itself. */
4514 4516          RLOCK_VFSSW();
4515 4517          vsw = vfs_getvfsswbyname(fstyp);
4516 4518          RUNLOCK_VFSSW();
4517 4519          if (vsw == NULL) {
4518 4520                  cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4519 4521                  return (ENXIO);
4520 4522          }
4521 4523          VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4522 4524          VFS_HOLD(rootvfs);
4523 4525  
4524 4526          /* always mount readonly first */
4525 4527          rootvfs->vfs_flag |= VFS_RDONLY;
4526 4528  
4527 4529          pm_init();
4528 4530  
                   /* An NFS root and an iSCSI root are mutually exclusive. */
4529 4531          if (netboot && iscsiboot_prop) {
4530 4532                  cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4531 4533                      " shouldn't happen in the same time");
4532 4534                  return (EINVAL);
4533 4535          }
4534 4536  
                   /* Either kind of network boot needs the network plumbed first. */
4535 4537          if (netboot || iscsiboot_prop) {
4536 4538                  ret = strplumb();
4537 4539                  if (ret != 0) {
4538 4540                          cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4539 4541                          return (EFAULT);
4540 4542                  }
4541 4543          }
4542 4544  
                   /* ret is 0 only when strplumb() ran above and succeeded. */
4543 4545          if ((ret == 0) && iscsiboot_prop) {
4544 4546                  ret = modload("drv", "iscsi");
4545 4547                  /* -1 indicates fail */
4546 4548                  if (ret == -1) {
4547 4549                          cmn_err(CE_WARN, "Failed to load iscsi module");
4548 4550                          iscsi_boot_prop_free();
4549 4551                          return (EINVAL);
4550 4552                  } else {
4551 4553                          if (!i_ddi_attach_pseudo_node("iscsi")) {
4552 4554                                  cmn_err(CE_WARN,
4553 4555                                      "Failed to attach iscsi driver");
4554 4556                                  iscsi_boot_prop_free();
4555 4557                                  return (ENODEV);
4556 4558                          }
4557 4559                  }
4558 4560          }
4559 4561  
                   /*
                    * The vfssw reference is dropped and rootdev is recorded whether or
                    * not the mount succeeded; the result is only reported afterwards.
                    */
4560 4562          error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4561 4563          vfs_unrefvfssw(vsw);
4562 4564          rootdev = rootvfs->vfs_dev;
4563 4565  
4564 4566          if (error)
4565 4567                  cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4566 4568                      rootfs.bo_name, fstyp);
4567 4569          else
4568 4570                  cmn_err(CE_CONT, "?root on %s fstype %s\n",
4569 4571                      rootfs.bo_name, fstyp);
4570 4572          return (error);
4571 4573  }
4572 4574  
4573 4575  /*
4574 4576   * XXX this is called by nfs only and should probably be removed
4575 4577   * If booted with ASKNAME, prompt on the console for a filesystem
4576 4578   * name and return it.
            *
            *      askfor  - label printed in front of " name: " in the prompt
            *      name    - buffer that receives the operator's answer
            *      namelen - size of the name buffer, passed to console_gets()
            *
            * When RB_ASKNAME is not set in boothowto nothing is printed and the
            * name buffer is left exactly as the caller supplied it.
4577 4579   */
4578 4580  void
4579 4581  getfsname(char *askfor, char *name, size_t namelen)
4580 4582  {
4581 4583          if (boothowto & RB_ASKNAME) {
4582 4584                  printf("%s name: ", askfor);
4583 4585                  console_gets(name, namelen);
4584 4586          }
4585 4587  }
4586 4588  
4587 4589  /*
4588 4590   * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4589 4591   * property.
4590 4592   *
4591 4593   * Filesystem types starting with the prefix "nfs" are diskless clients;
4592 4594   * init the root file name (rootfs.bo_name), too.
4593 4595   *
4594 4596   * If we are booting via NFS we currently have these options:
4595 4597   *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4596 4598   *      nfs2 -  force NFS V2
4597 4599   *      nfs3 -  force NFS V3
4598 4600   *      nfs4 -  force NFS V4
4599 4601   * Because we need to maintain backward compatibility with the naming
4600 4602   * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4601 4603   * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4602 4604   * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4603 4605   * This is only for root filesystems, all other uses will expect
4604 4606   * that "nfs" == NFS V2.
            *
            * On return *fstypp points at rootfs.bo_fstype.  *fsmodp is the name of
            * the module to load: the same string for non-NFS roots, or "nfs" for
            * every NFS variant.  netboot is incremented for NFS roots.
            *
            * Property values are copied with snprintf() so that an over-long value
            * is truncated to a NUL-terminated string instead of leaving the
            * destination field unterminated.
4605 4607   */
4606 4608  static void
4607 4609  getrootfs(char **fstypp, char **fsmodp)
4608 4610  {
4609 4611          char *propstr = NULL;
4610 4612  
4611 4613          /*
4612 4614           * Check fstype property; for diskless it should be one of "nfs",
4613 4615           * "nfs2", "nfs3" or "nfs4".
4614 4616           */
4615 4617          if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4616 4618              DDI_PROP_DONTPASS, "fstype", &propstr)
4617 4619              == DDI_SUCCESS) {
4618 4620                  (void) snprintf(rootfs.bo_fstype, BO_MAXFSNAME, "%s", propstr);
4619 4621                  ddi_prop_free(propstr);
4620 4622  
4621 4623          /*
4622 4624           * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4623 4625           * assume the type of this root filesystem is 'zfs'.
4624 4626           */
4625 4627          } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4626 4628              DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4627 4629              == DDI_SUCCESS) {
4628 4630                  (void) snprintf(rootfs.bo_fstype, BO_MAXFSNAME, "%s", "zfs");
4629 4631                  ddi_prop_free(propstr);
4630 4632          }
4631 4633  
                   /* Anything that does not start with "nfs" is a local root. */
4632 4634          if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4633 4635                  *fstypp = *fsmodp = rootfs.bo_fstype;
4634 4636                  return;
4635 4637          }
4636 4638  
4637 4639          ++netboot;
4638 4640  
                   /* Apply the "nfs2" => "nfs" and "nfs" => "nfsdyn" renaming. */
4639 4641          if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4640 4642                  (void) strcpy(rootfs.bo_fstype, "nfs");
4641 4643          else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4642 4644                  (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4643 4645  
4644 4646          /*
4645 4647           * check if path to network interface is specified in bootpath
4646 4648           * or by a hypervisor domain configuration file.
4647 4649           * XXPV - enable strplumb_get_netdev_path()
4648 4650           */
4649 4651          if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4650 4652              "xpv-nfsroot")) {
4651 4653                  (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4652 4654          } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4653 4655              DDI_PROP_DONTPASS, "bootpath", &propstr)
4654 4656              == DDI_SUCCESS) {
4655 4657                  (void) snprintf(rootfs.bo_name, BO_MAXOBJNAME, "%s", propstr);
4656 4658                  ddi_prop_free(propstr);
4657 4659          } else {
4658 4660                  rootfs.bo_name[0] = '\0';
4659 4661          }
4660 4662          *fstypp = rootfs.bo_fstype;
4661 4663          *fsmodp = "nfs";
4662 4664  }
4663 4665  #endif
4664 4666  
4665 4667  /*
4666 4668   * VFS feature routines
            *
            * A vfs_feature_t value is split in two by the macros below: the upper
            * 32 bits (VFTINDEX) select a slot in vfs_featureset[] and the lower
            * 32 bits (VFTBITS) are the bit mask applied to that slot.
4667 4669   */
4668 4670  
4669 4671  #define VFTINDEX(feature)       (((feature) >> 32) & 0xFFFFFFFF)
4670 4672  #define VFTBITS(feature)        ((feature) & 0xFFFFFFFFLL)
4671 4673  
4672 4674  /* Register a feature in the vfs */
           /*
            * ORs the feature's bit mask into the vfs_featureset[] slot chosen by
            * the feature's index.  Does nothing when the vfs has no vfs_implp.
            */
4673 4675  void
4674 4676  vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4675 4677  {
4676 4678          /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4677 4679          if (vfsp->vfs_implp == NULL)
4678 4680                  return;
4679 4681  
4680 4682          vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4681 4683  }
4682 4684  
           /*
            * Unregister a feature from the vfs: clear the feature's bits in the
            * vfs_featureset[] slot chosen by the feature's index.  Does nothing
            * when the vfs has no vfs_implp.
            */
4683 4685  void
4684 4686  vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4685 4687  {
4686 4688          /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4687 4689          if (vfsp->vfs_implp == NULL)
4688 4690                  return;
                   /*
                    * VFTINDEX() is applied to the original value so the slot is
                    * unchanged; VFTBITS(~feature) is the inverted low-32-bit mask.
                    */
4689 4691          vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4690 4692  }
4691 4693  
4692 4694  /*
4693 4695   * Query a vfs for a feature.
4694 4696   * Returns 1 if feature is present, 0 if not
            *
            * A vfs with no vfs_implp is reported as not having the feature.  The
            * test is a bitwise AND, so any overlap between the feature's mask and
            * the stored bits counts as present.
4695 4697   */
4696 4698  int
4697 4699  vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4698 4700  {
4699 4701          int     ret = 0;
4700 4702  
4701 4703          /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4702 4704          if (vfsp->vfs_implp == NULL)
4703 4705                  return (ret);
4704 4706  
4705 4707          if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4706 4708                  ret = 1;
4707 4709  
4708 4710          return (ret);
4709 4711  }
4710 4712  
4711 4713  /*
4712 4714   * Propagate feature set from one vfs to another
            *
            * Copies vfs_featureset[1] through vfs_featureset[n] from 'from' to
            * 'to', where n is the value held in to->vfs_featureset[0]; slot 0
            * itself is not copied.  Nothing happens if either vfs has no
            * vfs_implp.
            * NOTE(review): slot 0 presumably holds the number of feature words,
            * and 'from' is assumed to have at least as many -- confirm.
4713 4715   */
4714 4716  void
4715 4717  vfs_propagate_features(vfs_t *from, vfs_t *to)
4716 4718  {
4717 4719          int i;
4718 4720  
4719 4721          if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4720 4722                  return;
4721 4723  
4722 4724          for (i = 1; i <= to->vfs_featureset[0]; i++) {
4723 4725                  to->vfs_featureset[i] = from->vfs_featureset[i];
4724 4726          }
4725 4727  }
4726 4728  
4727 4729  #define LOFINODE_PATH "/dev/lofi/%d"
4728 4730  
4729 4731  /*
4730 4732   * Return the vnode for the lofi node if there's a lofi mount in place.
4731 4733   * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4732 4734   * failure.
            *
            * The node is looked up by the path LOFINODE_PATH formatted with
            * vfsp->vfs_lofi_id; an id of 0 means there is no lofi node.  *vpp is
            * set to NULL on every return other than success.
4733 4735   */
4734 4736  int
4735 4737  vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4736 4738  {
4737 4739          char *path = NULL;
4738 4740          int strsize;
4739 4741          int err;
4740 4742  
4741 4743          if (vfsp->vfs_lofi_id == 0) {
4742 4744                  *vpp = NULL;
4743 4745                  return (-1);
4744 4746          }
4745 4747  
                   /*
                    * The first snprintf() call only measures the formatted length;
                    * the buffer is then allocated with room for the terminating NUL.
                    */
4746 4748          strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4747 4749          path = kmem_alloc(strsize + 1, KM_SLEEP);
4748 4750          (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4749 4751  
4750 4752          /*
4751 4753           * We may be inside a zone, so we need to use the /dev path, but
4752 4754           * it's created asynchronously, so we wait here.
4753 4755           */
4754 4756          for (;;) {
4755 4757                  err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4756 4758  
                           /* Success or any error other than ENOENT ends the wait. */
4757 4759                  if (err != ENOENT)
4758 4760                          break;
4759 4761  
                           /*
                            * Pause for hz / 8 ticks before retrying; if the wait
                            * returns EINTR, give up and return that instead.
                            */
4760 4762                  if ((err = delay_sig(hz / 8)) == EINTR)
4761 4763                          break;
4762 4764          }
4763 4765  
4764 4766          if (err)
4765 4767                  *vpp = NULL;
4766 4768  
                   /* The size passed here must match the kmem_alloc() above. */
4767 4769          kmem_free(path, strsize + 1);
4768 4770          return (err);
4769 4771  }

↓ open down ↓

3966 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX