illumos-gate Wdiff usr/src/uts/common/fs/zfs/zfs_vfsops.c

Print this page

3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/zfs_vfsops.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_vfsops.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  /* Portions Copyright 2010 Robert Milkowski */
  27   27  
  28   28  #include <sys/types.h>
  29   29  #include <sys/param.h>
  30   30  #include <sys/systm.h>
  31   31  #include <sys/sysmacros.h>
  32   32  #include <sys/kmem.h>
  33   33  #include <sys/pathname.h>
  34   34  #include <sys/vnode.h>
  35   35  #include <sys/vfs.h>
  36   36  #include <sys/vfs_opreg.h>
  37   37  #include <sys/mntent.h>
  38   38  #include <sys/mount.h>
  39   39  #include <sys/cmn_err.h>
  40   40  #include "fs/fs_subr.h"
  41   41  #include <sys/zfs_znode.h>
  42   42  #include <sys/zfs_dir.h>
  43   43  #include <sys/zil.h>
  44   44  #include <sys/fs/zfs.h>
  45   45  #include <sys/dmu.h>
  46   46  #include <sys/dsl_prop.h>
  47   47  #include <sys/dsl_dataset.h>
  48   48  #include <sys/dsl_deleg.h>
  49   49  #include <sys/spa.h>
  50   50  #include <sys/zap.h>
  51   51  #include <sys/sa.h>
  52   52  #include <sys/varargs.h>
  53   53  #include <sys/policy.h>
  54   54  #include <sys/atomic.h>
  55   55  #include <sys/mkdev.h>
  56   56  #include <sys/modctl.h>
  57   57  #include <sys/refstr.h>
  58   58  #include <sys/zfs_ioctl.h>
  59   59  #include <sys/zfs_ctldir.h>
  60   60  #include <sys/zfs_fuid.h>
  61   61  #include <sys/bootconf.h>
  62   62  #include <sys/sunddi.h>
  63   63  #include <sys/dnlc.h>
  64   64  #include <sys/dmu_objset.h>
  65   65  #include <sys/spa_boot.h>
  66   66  #include <sys/sa.h>
  67   67  #include "zfs_comutil.h"
  68   68  
/*
 * File-scope state.
 *
 * zfs_major and zfs_minor hold the components of the device number most
 * recently produced by zfs_create_unique_device(); that function updates
 * them while holding zfs_dev_mtx.
 */
int zfsfstype;
vfsops_t *zfs_vfsops = NULL;
static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t zfs_dev_mtx;

/* Consulted by zfs_sync() to skip suspended pools while shutting down. */
extern int sys_shutdown;

/* Forward declarations of the entry points named in zfs_vfsops_template. */
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);
  84   84  
  85   85  static const fs_operation_def_t zfs_vfsops_template[] = {
  86   86          VFSNAME_MOUNT,          { .vfs_mount = zfs_mount },
  87   87          VFSNAME_MOUNTROOT,      { .vfs_mountroot = zfs_mountroot },
  88   88          VFSNAME_UNMOUNT,        { .vfs_unmount = zfs_umount },
  89   89          VFSNAME_ROOT,           { .vfs_root = zfs_root },
  90   90          VFSNAME_STATVFS,        { .vfs_statvfs = zfs_statvfs },
  91   91          VFSNAME_SYNC,           { .vfs_sync = zfs_sync },
  92   92          VFSNAME_VGET,           { .vfs_vget = zfs_vget },
  93   93          VFSNAME_FREEVFS,        { .vfs_freevfs = zfs_freevfs },
  94   94          NULL,                   NULL
  95   95  };
  96   96  
  97   97  static const fs_operation_def_t zfs_vfsops_eio_template[] = {
  98   98          VFSNAME_FREEVFS,        { .vfs_freevfs =  zfs_freevfs },
  99   99          NULL,                   NULL
 100  100  };
 101  101  
/*
 * Count of active ZFS file systems.
 *
 * The count is needed so that the module is not unloaded after a
 * forced unmount (umount -f).
 */
static uint32_t zfs_active_fs_count = 0;
 108  108  
/*
 * NULL-terminated lists naming the opposite of each mount option; they
 * are referenced from the mntopts table entry for the option in the
 * array's name.
 */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
 113  113  
/*
 * Mount option table.  Each entry names an option and the cancel list
 * that goes with it (presumably the options it overrides -- confirm
 * against the mntopt_t definition).
 *
 * MO_DEFAULT is not used since the default value is determined
 * by the equivalent property.
 */
static mntopt_t mntopts[] = {
        { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
        { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
        { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
        { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

/* Wrapper recording the number of entries in mntopts and the table itself. */
static mntopts_t zfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        mntopts
};
 129  129  
 130  130  /*ARGSUSED*/
 131  131  int
 132  132  zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
 133  133  {
 134  134          /*
 135  135           * Data integrity is job one.  We don't want a compromised kernel
 136  136           * writing to the storage pool, so we never sync during panic.
 137  137           */
 138  138          if (panicstr)
 139  139                  return (0);
 140  140  
 141  141          /*
 142  142           * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
 143  143           * to sync metadata, which they would otherwise cache indefinitely.
 144  144           * Semantically, the only requirement is that the sync be initiated.
 145  145           * The DMU syncs out txgs frequently, so there's nothing to do.
 146  146           */
 147  147          if (flag & SYNC_ATTR)
 148  148                  return (0);
 149  149  
 150  150          if (vfsp != NULL) {
 151  151                  /*
 152  152                   * Sync a specific filesystem.
 153  153                   */
 154  154                  zfsvfs_t *zfsvfs = vfsp->vfs_data;
 155  155                  dsl_pool_t *dp;
 156  156  
 157  157                  ZFS_ENTER(zfsvfs);
 158  158                  dp = dmu_objset_pool(zfsvfs->z_os);
 159  159  
 160  160                  /*
 161  161                   * If the system is shutting down, then skip any
 162  162                   * filesystems which may exist on a suspended pool.
 163  163                   */
 164  164                  if (sys_shutdown && spa_suspended(dp->dp_spa)) {
 165  165                          ZFS_EXIT(zfsvfs);
 166  166                          return (0);
 167  167                  }
 168  168  
 169  169                  if (zfsvfs->z_log != NULL)
 170  170                          zil_commit(zfsvfs->z_log, 0);
 171  171  
 172  172                  ZFS_EXIT(zfsvfs);
 173  173          } else {
 174  174                  /*
 175  175                   * Sync all ZFS filesystems.  This is what happens when you
 176  176                   * run sync(1M).  Unlike other filesystems, ZFS honors the
 177  177                   * request by waiting for all pools to commit all dirty data.
 178  178                   */
 179  179                  spa_sync_allpools();
 180  180          }
 181  181  
 182  182          return (0);
 183  183  }
 184  184  
 185  185  static int
 186  186  zfs_create_unique_device(dev_t *dev)
 187  187  {
 188  188          major_t new_major;
 189  189  
 190  190          do {
 191  191                  ASSERT3U(zfs_minor, <=, MAXMIN32);
 192  192                  minor_t start = zfs_minor;
 193  193                  do {
 194  194                          mutex_enter(&zfs_dev_mtx);
 195  195                          if (zfs_minor >= MAXMIN32) {
 196  196                                  /*
 197  197                                   * If we're still using the real major
 198  198                                   * keep out of /dev/zfs and /dev/zvol minor
 199  199                                   * number space.  If we're using a getudev()'ed
 200  200                                   * major number, we can use all of its minors.
 201  201                                   */
 202  202                                  if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 203  203                                          zfs_minor = ZFS_MIN_MINOR;
 204  204                                  else
 205  205                                          zfs_minor = 0;
 206  206                          } else {
 207  207                                  zfs_minor++;
 208  208                          }
 209  209                          *dev = makedevice(zfs_major, zfs_minor);
 210  210                          mutex_exit(&zfs_dev_mtx);
 211  211                  } while (vfs_devismounted(*dev) && zfs_minor != start);
 212  212                  if (zfs_minor == start) {
 213  213                          /*
 214  214                           * We are using all ~262,000 minor numbers for the
 215  215                           * current major number.  Create a new major number.
 216  216                           */
 217  217                          if ((new_major = getudev()) == (major_t)-1) {
 218  218                                  cmn_err(CE_WARN,
 219  219                                      "zfs_mount: Can't get unique major "
 220  220                                      "device number.");
 221  221                                  return (-1);
 222  222                          }
 223  223                          mutex_enter(&zfs_dev_mtx);
 224  224                          zfs_major = new_major;
 225  225                          zfs_minor = 0;
 226  226  
 227  227                          mutex_exit(&zfs_dev_mtx);
 228  228                  } else {
 229  229                          break;
 230  230                  }
 231  231                  /* CONSTANTCONDITION */
 232  232          } while (1);
 233  233  
 234  234          return (0);
 235  235  }
 236  236  
 237  237  static void
 238  238  atime_changed_cb(void *arg, uint64_t newval)
 239  239  {
 240  240          zfsvfs_t *zfsvfs = arg;
 241  241  
 242  242          if (newval == TRUE) {
 243  243                  zfsvfs->z_atime = TRUE;
 244  244                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 245  245                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 246  246          } else {
 247  247                  zfsvfs->z_atime = FALSE;
 248  248                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 249  249                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 250  250          }
 251  251  }
 252  252  
 253  253  static void
 254  254  xattr_changed_cb(void *arg, uint64_t newval)
 255  255  {
 256  256          zfsvfs_t *zfsvfs = arg;
 257  257  
 258  258          if (newval == TRUE) {
 259  259                  /* XXX locking on vfs_flag? */
 260  260                  zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 261  261                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 262  262                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 263  263          } else {
 264  264                  /* XXX locking on vfs_flag? */
 265  265                  zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 266  266                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 267  267                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 268  268          }
 269  269  }
 270  270  
 271  271  static void
 272  272  blksz_changed_cb(void *arg, uint64_t newval)
 273  273  {
 274  274          zfsvfs_t *zfsvfs = arg;
 275  275  
 276  276          if (newval < SPA_MINBLOCKSIZE ||
 277  277              newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 278  278                  newval = SPA_MAXBLOCKSIZE;
 279  279  
 280  280          zfsvfs->z_max_blksz = newval;
 281  281          zfsvfs->z_vfs->vfs_bsize = newval;
 282  282  }
 283  283  
 284  284  static void
 285  285  readonly_changed_cb(void *arg, uint64_t newval)
 286  286  {
 287  287          zfsvfs_t *zfsvfs = arg;
 288  288  
 289  289          if (newval) {
 290  290                  /* XXX locking on vfs_flag? */
 291  291                  zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 292  292                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 293  293                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 294  294          } else {
 295  295                  /* XXX locking on vfs_flag? */
 296  296                  zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 297  297                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 298  298                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 299  299          }
 300  300  }
 301  301  
 302  302  static void
 303  303  devices_changed_cb(void *arg, uint64_t newval)
 304  304  {
 305  305          zfsvfs_t *zfsvfs = arg;
 306  306  
 307  307          if (newval == FALSE) {
 308  308                  zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
 309  309                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
 310  310                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
 311  311          } else {
 312  312                  zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
 313  313                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
 314  314                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
 315  315          }
 316  316  }
 317  317  
 318  318  static void
 319  319  setuid_changed_cb(void *arg, uint64_t newval)
 320  320  {
 321  321          zfsvfs_t *zfsvfs = arg;
 322  322  
 323  323          if (newval == FALSE) {
 324  324                  zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 325  325                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 326  326                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 327  327          } else {
 328  328                  zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 329  329                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 330  330                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 331  331          }
 332  332  }
 333  333  
 334  334  static void
 335  335  exec_changed_cb(void *arg, uint64_t newval)
 336  336  {
 337  337          zfsvfs_t *zfsvfs = arg;
 338  338  
 339  339          if (newval == FALSE) {
 340  340                  zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 341  341                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 342  342                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 343  343          } else {
 344  344                  zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 345  345                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 346  346                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 347  347          }
 348  348  }
 349  349  
 350  350  /*
 351  351   * The nbmand mount option can be changed at mount time.
 352  352   * We can't allow it to be toggled on live file systems or incorrect
 353  353   * behavior may be seen from cifs clients
 354  354   *
 355  355   * This property isn't registered via dsl_prop_register(), but this callback
 356  356   * will be called when a file system is first mounted
 357  357   */
 358  358  static void
 359  359  nbmand_changed_cb(void *arg, uint64_t newval)
 360  360  {
 361  361          zfsvfs_t *zfsvfs = arg;
 362  362          if (newval == FALSE) {
 363  363                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 364  364                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 365  365          } else {
 366  366                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 367  367                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 368  368          }
 369  369  }
 370  370  
 371  371  static void
 372  372  snapdir_changed_cb(void *arg, uint64_t newval)
 373  373  {
 374  374          zfsvfs_t *zfsvfs = arg;
 375  375  
 376  376          zfsvfs->z_show_ctldir = newval;
 377  377  }
 378  378  
 379  379  static void
 380  380  vscan_changed_cb(void *arg, uint64_t newval)
 381  381  {
 382  382          zfsvfs_t *zfsvfs = arg;
 383  383  
 384  384          zfsvfs->z_vscan = newval;
 385  385  }
 386  386  
 387  387  static void
 388  388  acl_mode_changed_cb(void *arg, uint64_t newval)
 389  389  {
 390  390          zfsvfs_t *zfsvfs = arg;
 391  391  
 392  392          zfsvfs->z_acl_mode = newval;
 393  393  }
 394  394  
 395  395  static void
 396  396  acl_inherit_changed_cb(void *arg, uint64_t newval)
 397  397  {
 398  398          zfsvfs_t *zfsvfs = arg;
 399  399  
 400  400          zfsvfs->z_acl_inherit = newval;
 401  401  }
 402  402  
 403  403  static int
 404  404  zfs_register_callbacks(vfs_t *vfsp)
 405  405  {
 406  406          struct dsl_dataset *ds = NULL;
 407  407          objset_t *os = NULL;
 408  408          zfsvfs_t *zfsvfs = NULL;
 409  409          uint64_t nbmand;
 410  410          int readonly, do_readonly = B_FALSE;
 411  411          int setuid, do_setuid = B_FALSE;
 412  412          int exec, do_exec = B_FALSE;
 413  413          int devices, do_devices = B_FALSE;
 414  414          int xattr, do_xattr = B_FALSE;
 415  415          int atime, do_atime = B_FALSE;
 416  416          int error = 0;
 417  417  
 418  418          ASSERT(vfsp);
 419  419          zfsvfs = vfsp->vfs_data;
 420  420          ASSERT(zfsvfs);
 421  421          os = zfsvfs->z_os;
 422  422  
 423  423          /*
 424  424           * The act of registering our callbacks will destroy any mount
 425  425           * options we may have.  In order to enable temporary overrides
 426  426           * of mount options, we stash away the current values and
 427  427           * restore them after we register the callbacks.
 428  428           */
 429  429          if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 430  430              !spa_writeable(dmu_objset_spa(os))) {
 431  431                  readonly = B_TRUE;
 432  432                  do_readonly = B_TRUE;
 433  433          } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 434  434                  readonly = B_FALSE;
 435  435                  do_readonly = B_TRUE;
 436  436          }
 437  437          if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 438  438                  devices = B_FALSE;
 439  439                  setuid = B_FALSE;
 440  440                  do_devices = B_TRUE;
 441  441                  do_setuid = B_TRUE;
 442  442          } else {
 443  443                  if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
 444  444                          devices = B_FALSE;
 445  445                          do_devices = B_TRUE;
 446  446                  } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
 447  447                          devices = B_TRUE;
 448  448                          do_devices = B_TRUE;
 449  449                  }
 450  450  
 451  451                  if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 452  452                          setuid = B_FALSE;
 453  453                          do_setuid = B_TRUE;
 454  454                  } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 455  455                          setuid = B_TRUE;
 456  456                          do_setuid = B_TRUE;
 457  457                  }
 458  458          }
 459  459          if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 460  460                  exec = B_FALSE;
 461  461                  do_exec = B_TRUE;
 462  462          } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 463  463                  exec = B_TRUE;
 464  464                  do_exec = B_TRUE;
 465  465          }
 466  466          if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 467  467                  xattr = B_FALSE;
 468  468                  do_xattr = B_TRUE;
 469  469          } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 470  470                  xattr = B_TRUE;
 471  471                  do_xattr = B_TRUE;
 472  472          }
 473  473          if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 474  474                  atime = B_FALSE;
 475  475                  do_atime = B_TRUE;
 476  476          } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 477  477                  atime = B_TRUE;
 478  478                  do_atime = B_TRUE;
 479  479          }
 480  480  
 481  481          /*
 482  482           * nbmand is a special property.  It can only be changed at
 483  483           * mount time.
 484  484           *
 485  485           * This is weird, but it is documented to only be changeable
 486  486           * at mount time.
 487  487           */
 488  488          if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 489  489                  nbmand = B_FALSE;
 490  490          } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 491  491                  nbmand = B_TRUE;
 492  492          } else {
 493  493                  char osname[MAXNAMELEN];
 494  494  
 495  495                  dmu_objset_name(os, osname);
 496  496                  if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
 497  497                      NULL)) {
 498  498                          return (error);
 499  499                  }
 500  500          }
 501  501  
 502  502          /*
 503  503           * Register property callbacks.
 504  504           *
 505  505           * It would probably be fine to just check for i/o error from
 506  506           * the first prop_register(), but I guess I like to go
 507  507           * overboard...
 508  508           */
 509  509          ds = dmu_objset_ds(os);
 510  510          error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 511  511          error = error ? error : dsl_prop_register(ds,
 512  512              "xattr", xattr_changed_cb, zfsvfs);
 513  513          error = error ? error : dsl_prop_register(ds,
 514  514              "recordsize", blksz_changed_cb, zfsvfs);
 515  515          error = error ? error : dsl_prop_register(ds,
 516  516              "readonly", readonly_changed_cb, zfsvfs);
 517  517          error = error ? error : dsl_prop_register(ds,
 518  518              "devices", devices_changed_cb, zfsvfs);
 519  519          error = error ? error : dsl_prop_register(ds,
 520  520              "setuid", setuid_changed_cb, zfsvfs);
 521  521          error = error ? error : dsl_prop_register(ds,
 522  522              "exec", exec_changed_cb, zfsvfs);
 523  523          error = error ? error : dsl_prop_register(ds,
 524  524              "snapdir", snapdir_changed_cb, zfsvfs);
 525  525          error = error ? error : dsl_prop_register(ds,
 526  526              "aclmode", acl_mode_changed_cb, zfsvfs);
 527  527          error = error ? error : dsl_prop_register(ds,
 528  528              "aclinherit", acl_inherit_changed_cb, zfsvfs);
 529  529          error = error ? error : dsl_prop_register(ds,
 530  530              "vscan", vscan_changed_cb, zfsvfs);
 531  531          if (error)
 532  532                  goto unregister;
 533  533  
 534  534          /*
 535  535           * Invoke our callbacks to restore temporary mount options.
 536  536           */
 537  537          if (do_readonly)
 538  538                  readonly_changed_cb(zfsvfs, readonly);
 539  539          if (do_setuid)
 540  540                  setuid_changed_cb(zfsvfs, setuid);
 541  541          if (do_exec)
 542  542                  exec_changed_cb(zfsvfs, exec);
 543  543          if (do_devices)
 544  544                  devices_changed_cb(zfsvfs, devices);
 545  545          if (do_xattr)
 546  546                  xattr_changed_cb(zfsvfs, xattr);
 547  547          if (do_atime)
 548  548                  atime_changed_cb(zfsvfs, atime);
 549  549  
 550  550          nbmand_changed_cb(zfsvfs, nbmand);
 551  551  
 552  552          return (0);
 553  553  
 554  554  unregister:
 555  555          /*
 556  556           * We may attempt to unregister some callbacks that are not
 557  557           * registered, but this is OK; it will simply return ENOMSG,
 558  558           * which we will ignore.
 559  559           */
 560  560          (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 561  561          (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 562  562          (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 563  563          (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 564  564          (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
 565  565          (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 566  566          (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 567  567          (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 568  568          (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 569  569          (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 570  570              zfsvfs);
 571  571          (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
 572  572          return (error);
 573  573  
 574  574  }
 575  575  
 576  576  static int
 577  577  zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
 578  578      uint64_t *userp, uint64_t *groupp)
 579  579  {
 580  580          znode_phys_t *znp = data;
 581  581          int error = 0;
 582  582  
 583  583          /*
 584  584           * Is it a valid type of object to track?
 585  585           */
 586  586          if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 587  587                  return (ENOENT);
 588  588  
 589  589          /*
 590  590           * If we have a NULL data pointer
 591  591           * then assume the id's aren't changing and
 592  592           * return EEXIST to the dmu to let it know to
 593  593           * use the same ids
 594  594           */
 595  595          if (data == NULL)
 596  596                  return (EEXIST);
 597  597  
 598  598          if (bonustype == DMU_OT_ZNODE) {
 599  599                  *userp = znp->zp_uid;
 600  600                  *groupp = znp->zp_gid;
 601  601          } else {
 602  602                  int hdrsize;
 603  603  
 604  604                  ASSERT(bonustype == DMU_OT_SA);
 605  605                  hdrsize = sa_hdrsize(data);
 606  606  
 607  607                  if (hdrsize != 0) {
 608  608                          *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
 609  609                              SA_UID_OFFSET));
 610  610                          *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
 611  611                              SA_GID_OFFSET));
 612  612                  } else {
 613  613                          /*
 614  614                           * This should only happen for newly created
 615  615                           * files that haven't had the znode data filled
 616  616                           * in yet.
 617  617                           */
 618  618                          *userp = 0;
 619  619                          *groupp = 0;
 620  620                  }
 621  621          }
 622  622          return (error);
 623  623  }
 624  624  
 625  625  static void
 626  626  fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
 627  627      char *domainbuf, int buflen, uid_t *ridp)
 628  628  {
 629  629          uint64_t fuid;
 630  630          const char *domain;
 631  631  
 632  632          fuid = strtonum(fuidstr, NULL);
 633  633  
 634  634          domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 635  635          if (domain)
 636  636                  (void) strlcpy(domainbuf, domain, buflen);
 637  637          else
 638  638                  domainbuf[0] = '\0';
 639  639          *ridp = FUID_RID(fuid);
 640  640  }
 641  641  
 642  642  static uint64_t
 643  643  zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 644  644  {
 645  645          switch (type) {
 646  646          case ZFS_PROP_USERUSED:
 647  647                  return (DMU_USERUSED_OBJECT);
 648  648          case ZFS_PROP_GROUPUSED:
 649  649                  return (DMU_GROUPUSED_OBJECT);
 650  650          case ZFS_PROP_USERQUOTA:
 651  651                  return (zfsvfs->z_userquota_obj);
 652  652          case ZFS_PROP_GROUPQUOTA:
 653  653                  return (zfsvfs->z_groupquota_obj);
 654  654          }
 655  655          return (0);
 656  656  }
 657  657  
 658  658  int
 659  659  zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 660  660      uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 661  661  {
 662  662          int error;
 663  663          zap_cursor_t zc;
 664  664          zap_attribute_t za;
 665  665          zfs_useracct_t *buf = vbuf;
 666  666          uint64_t obj;
 667  667  
 668  668          if (!dmu_objset_userspace_present(zfsvfs->z_os))
 669  669                  return (ENOTSUP);
 670  670  
 671  671          obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 672  672          if (obj == 0) {
 673  673                  *bufsizep = 0;
 674  674                  return (0);
 675  675          }
 676  676  
 677  677          for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 678  678              (error = zap_cursor_retrieve(&zc, &za)) == 0;
 679  679              zap_cursor_advance(&zc)) {
 680  680                  if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
 681  681                      *bufsizep)
 682  682                          break;
 683  683  
 684  684                  fuidstr_to_sid(zfsvfs, za.za_name,
 685  685                      buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 686  686  
 687  687                  buf->zu_space = za.za_first_integer;
 688  688                  buf++;
 689  689          }
 690  690          if (error == ENOENT)
 691  691                  error = 0;
 692  692  
 693  693          ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 694  694          *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 695  695          *cookiep = zap_cursor_serialize(&zc);
 696  696          zap_cursor_fini(&zc);
 697  697          return (error);
 698  698  }
 699  699  
 700  700  /*
 701  701   * buf must be big enough (eg, 32 bytes)
 702  702   */
 703  703  static int
 704  704  id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
 705  705      char *buf, boolean_t addok)
 706  706  {
 707  707          uint64_t fuid;
 708  708          int domainid = 0;
 709  709  
 710  710          if (domain && domain[0]) {
 711  711                  domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 712  712                  if (domainid == -1)
 713  713                          return (ENOENT);
 714  714          }
 715  715          fuid = FUID_ENCODE(domainid, rid);
 716  716          (void) sprintf(buf, "%llx", (longlong_t)fuid);
 717  717          return (0);
 718  718  }
 719  719  
 720  720  int
 721  721  zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 722  722      const char *domain, uint64_t rid, uint64_t *valp)
 723  723  {
 724  724          char buf[32];
 725  725          int err;
 726  726          uint64_t obj;
 727  727  
 728  728          *valp = 0;
 729  729  
 730  730          if (!dmu_objset_userspace_present(zfsvfs->z_os))
 731  731                  return (ENOTSUP);
 732  732  
 733  733          obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 734  734          if (obj == 0)
 735  735                  return (0);
 736  736  
 737  737          err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 738  738          if (err)
 739  739                  return (err);
 740  740  
 741  741          err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 742  742          if (err == ENOENT)
 743  743                  err = 0;
 744  744          return (err);
 745  745  }
 746  746  
 747  747  int
 748  748  zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 749  749      const char *domain, uint64_t rid, uint64_t quota)
 750  750  {
 751  751          char buf[32];
 752  752          int err;
 753  753          dmu_tx_t *tx;
 754  754          uint64_t *objp;
 755  755          boolean_t fuid_dirtied;
 756  756  
 757  757          if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 758  758                  return (EINVAL);
 759  759  
 760  760          if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 761  761                  return (ENOTSUP);
 762  762  
 763  763          objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 764  764              &zfsvfs->z_groupquota_obj;
 765  765  
 766  766          err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 767  767          if (err)
 768  768                  return (err);
 769  769          fuid_dirtied = zfsvfs->z_fuid_dirty;
 770  770  
 771  771          tx = dmu_tx_create(zfsvfs->z_os);
 772  772          dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 773  773          if (*objp == 0) {
 774  774                  dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 775  775                      zfs_userquota_prop_prefixes[type]);
 776  776          }
 777  777          if (fuid_dirtied)
 778  778                  zfs_fuid_txhold(zfsvfs, tx);
 779  779          err = dmu_tx_assign(tx, TXG_WAIT);
 780  780          if (err) {
 781  781                  dmu_tx_abort(tx);
 782  782                  return (err);
 783  783          }
 784  784  
 785  785          mutex_enter(&zfsvfs->z_lock);
 786  786          if (*objp == 0) {
 787  787                  *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 788  788                      DMU_OT_NONE, 0, tx);
 789  789                  VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 790  790                      zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 791  791          }
 792  792          mutex_exit(&zfsvfs->z_lock);
 793  793  
 794  794          if (quota == 0) {
 795  795                  err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 796  796                  if (err == ENOENT)
 797  797                          err = 0;
 798  798          } else {
 799  799                  err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 800  800          }
 801  801          ASSERT(err == 0);
 802  802          if (fuid_dirtied)
 803  803                  zfs_fuid_sync(zfsvfs, tx);
 804  804          dmu_tx_commit(tx);
 805  805          return (err);
 806  806  }
 807  807  
 808  808  boolean_t
 809  809  zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 810  810  {
 811  811          char buf[32];
 812  812          uint64_t used, quota, usedobj, quotaobj;
 813  813          int err;
 814  814  
 815  815          usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 816  816          quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 817  817  
 818  818          if (quotaobj == 0 || zfsvfs->z_replay)
 819  819                  return (B_FALSE);
 820  820  
 821  821          (void) sprintf(buf, "%llx", (longlong_t)fuid);
 822  822          err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 823  823          if (err != 0)
 824  824                  return (B_FALSE);
 825  825  
 826  826          err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 827  827          if (err != 0)
 828  828                  return (B_FALSE);
 829  829          return (used >= quota);
 830  830  }
 831  831  
 832  832  boolean_t
 833  833  zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 834  834  {
 835  835          uint64_t fuid;
 836  836          uint64_t quotaobj;
 837  837  
 838  838          quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 839  839  
 840  840          fuid = isgroup ? zp->z_gid : zp->z_uid;
 841  841  
 842  842          if (quotaobj == 0 || zfsvfs->z_replay)
 843  843                  return (B_FALSE);
 844  844  
 845  845          return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 846  846  }
 847  847  
 848  848  int
 849  849  zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 850  850  {
 851  851          objset_t *os;
 852  852          zfsvfs_t *zfsvfs;
 853  853          uint64_t zval;
 854  854          int i, error;
 855  855          uint64_t sa_obj;
 856  856  
 857  857          zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 858  858  
 859  859          /*
 860  860           * We claim to always be readonly so we can open snapshots;
 861  861           * other ZPL code will prevent us from writing to snapshots.
 862  862           */
 863  863          error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 864  864          if (error) {
 865  865                  kmem_free(zfsvfs, sizeof (zfsvfs_t));
 866  866                  return (error);
 867  867          }
 868  868  
 869  869          /*
 870  870           * Initialize the zfs-specific filesystem structure.
 871  871           * Should probably make this a kmem cache, shuffle fields,
 872  872           * and just bzero up to z_hold_mtx[].
 873  873           */
 874  874          zfsvfs->z_vfs = NULL;
 875  875          zfsvfs->z_parent = zfsvfs;
 876  876          zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 877  877          zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 878  878          zfsvfs->z_os = os;
 879  879  
 880  880          error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 881  881          if (error) {
 882  882                  goto out;
 883  883          } else if (zfsvfs->z_version >
 884  884              zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 885  885                  (void) printf("Can't mount a version %lld file system "
 886  886                      "on a version %lld pool\n. Pool must be upgraded to mount "
 887  887                      "this file system.", (u_longlong_t)zfsvfs->z_version,
 888  888                      (u_longlong_t)spa_version(dmu_objset_spa(os)));
 889  889                  error = ENOTSUP;
 890  890                  goto out;
 891  891          }
 892  892          if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
 893  893                  goto out;
 894  894          zfsvfs->z_norm = (int)zval;
 895  895  
 896  896          if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
 897  897                  goto out;
 898  898          zfsvfs->z_utf8 = (zval != 0);
 899  899  
 900  900          if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
 901  901                  goto out;
 902  902          zfsvfs->z_case = (uint_t)zval;
 903  903  
 904  904          /*
 905  905           * Fold case on file systems that are always or sometimes case
 906  906           * insensitive.
 907  907           */
 908  908          if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 909  909              zfsvfs->z_case == ZFS_CASE_MIXED)
 910  910                  zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 911  911  
 912  912          zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 913  913          zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 914  914  
 915  915          if (zfsvfs->z_use_sa) {
 916  916                  /* should either have both of these objects or none */
 917  917                  error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 918  918                      &sa_obj);
 919  919                  if (error)
 920  920                          return (error);
 921  921          } else {
 922  922                  /*
 923  923                   * Pre SA versions file systems should never touch
 924  924                   * either the attribute registration or layout objects.
 925  925                   */
 926  926                  sa_obj = 0;
 927  927          }
 928  928  
 929  929          error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 930  930              &zfsvfs->z_attr_table);
 931  931          if (error)
 932  932                  goto out;
 933  933  
 934  934          if (zfsvfs->z_version >= ZPL_VERSION_SA)
 935  935                  sa_register_update_callback(os, zfs_sa_upgrade);
 936  936  
 937  937          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 938  938              &zfsvfs->z_root);
 939  939          if (error)
 940  940                  goto out;
 941  941          ASSERT(zfsvfs->z_root != 0);
 942  942  
 943  943          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 944  944              &zfsvfs->z_unlinkedobj);
 945  945          if (error)
 946  946                  goto out;
 947  947  
 948  948          error = zap_lookup(os, MASTER_NODE_OBJ,
 949  949              zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 950  950              8, 1, &zfsvfs->z_userquota_obj);
 951  951          if (error && error != ENOENT)
 952  952                  goto out;
 953  953  
 954  954          error = zap_lookup(os, MASTER_NODE_OBJ,
 955  955              zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 956  956              8, 1, &zfsvfs->z_groupquota_obj);
 957  957          if (error && error != ENOENT)
 958  958                  goto out;
 959  959  
 960  960          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 961  961              &zfsvfs->z_fuid_obj);
 962  962          if (error && error != ENOENT)
 963  963                  goto out;
 964  964  
 965  965          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 966  966              &zfsvfs->z_shares_dir);
 967  967          if (error && error != ENOENT)
 968  968                  goto out;
 969  969  
 970  970          mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 971  971          mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 972  972          list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 973  973              offsetof(znode_t, z_link_node));
 974  974          rrw_init(&zfsvfs->z_teardown_lock);
 975  975          rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 976  976          rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 977  977          for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 978  978                  mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 979  979  
 980  980          *zfvp = zfsvfs;
 981  981          return (0);
 982  982  
 983  983  out:
 984  984          dmu_objset_disown(os, zfsvfs);
 985  985          *zfvp = NULL;
 986  986          kmem_free(zfsvfs, sizeof (zfsvfs_t));
 987  987          return (error);
 988  988  }
 989  989  
 990  990  static int
 991  991  zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 992  992  {
 993  993          int error;
 994  994  
 995  995          error = zfs_register_callbacks(zfsvfs->z_vfs);
 996  996          if (error)
 997  997                  return (error);
 998  998  
 999  999          /*
1000 1000           * Set the objset user_ptr to track its zfsvfs.
1001 1001           */
1002 1002          mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1003 1003          dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1004 1004          mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1005 1005  
1006 1006          zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1007 1007  
1008 1008          /*
1009 1009           * If we are not mounting (ie: online recv), then we don't
1010 1010           * have to worry about replaying the log as we blocked all
1011 1011           * operations out since we closed the ZIL.
1012 1012           */
1013 1013          if (mounting) {
1014 1014                  boolean_t readonly;
1015 1015  
1016 1016                  /*
1017 1017                   * During replay we remove the read only flag to
1018 1018                   * allow replays to succeed.
1019 1019                   */
1020 1020                  readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1021 1021                  if (readonly != 0)
1022 1022                          zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1023 1023                  else
1024 1024                          zfs_unlinked_drain(zfsvfs);
1025 1025  
1026 1026                  /*
1027 1027                   * Parse and replay the intent log.
1028 1028                   *
1029 1029                   * Because of ziltest, this must be done after
1030 1030                   * zfs_unlinked_drain().  (Further note: ziltest
1031 1031                   * doesn't use readonly mounts, where
1032 1032                   * zfs_unlinked_drain() isn't called.)  This is because
1033 1033                   * ziltest causes spa_sync() to think it's committed,
1034 1034                   * but actually it is not, so the intent log contains
1035 1035                   * many txg's worth of changes.
1036 1036                   *
1037 1037                   * In particular, if object N is in the unlinked set in
1038 1038                   * the last txg to actually sync, then it could be
1039 1039                   * actually freed in a later txg and then reallocated
1040 1040                   * in a yet later txg.  This would write a "create
1041 1041                   * object N" record to the intent log.  Normally, this
1042 1042                   * would be fine because the spa_sync() would have
1043 1043                   * written out the fact that object N is free, before
1044 1044                   * we could write the "create object N" intent log
1045 1045                   * record.
1046 1046                   *
1047 1047                   * But when we are in ziltest mode, we advance the "open
1048 1048                   * txg" without actually spa_sync()-ing the changes to
1049 1049                   * disk.  So we would see that object N is still
1050 1050                   * allocated and in the unlinked set, and there is an
1051 1051                   * intent log record saying to allocate it.
1052 1052                   */
1053 1053                  if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1054 1054                          if (zil_replay_disable) {
1055 1055                                  zil_destroy(zfsvfs->z_log, B_FALSE);
1056 1056                          } else {
1057 1057                                  zfsvfs->z_replay = B_TRUE;
1058 1058                                  zil_replay(zfsvfs->z_os, zfsvfs,
1059 1059                                      zfs_replay_vector);
1060 1060                                  zfsvfs->z_replay = B_FALSE;
1061 1061                          }
1062 1062                  }
1063 1063                  zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1064 1064          }
1065 1065  
1066 1066          return (0);
1067 1067  }
1068 1068  
1069 1069  void
1070 1070  zfsvfs_free(zfsvfs_t *zfsvfs)
1071 1071  {
1072 1072          int i;
1073 1073          extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1074 1074  
1075 1075          /*
1076 1076           * This is a barrier to prevent the filesystem from going away in
1077 1077           * zfs_znode_move() until we can safely ensure that the filesystem is
1078 1078           * not unmounted. We consider the filesystem valid before the barrier
1079 1079           * and invalid after the barrier.
1080 1080           */
1081 1081          rw_enter(&zfsvfs_lock, RW_READER);
1082 1082          rw_exit(&zfsvfs_lock);
1083 1083  
1084 1084          zfs_fuid_destroy(zfsvfs);
1085 1085  
1086 1086          mutex_destroy(&zfsvfs->z_znodes_lock);
1087 1087          mutex_destroy(&zfsvfs->z_lock);
1088 1088          list_destroy(&zfsvfs->z_all_znodes);
1089 1089          rrw_destroy(&zfsvfs->z_teardown_lock);
1090 1090          rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1091 1091          rw_destroy(&zfsvfs->z_fuid_lock);
1092 1092          for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1093 1093                  mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1094 1094          kmem_free(zfsvfs, sizeof (zfsvfs_t));
1095 1095  }
1096 1096  
1097 1097  static void
1098 1098  zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1099 1099  {
1100 1100          zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1101 1101          if (zfsvfs->z_vfs) {
1102 1102                  if (zfsvfs->z_use_fuids) {
1103 1103                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1104 1104                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1105 1105                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1106 1106                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1107 1107                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1108 1108                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1109 1109                  } else {
1110 1110                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1111 1111                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1112 1112                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1113 1113                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1114 1114                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1115 1115                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1116 1116                  }
1117 1117          }
1118 1118          zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1119 1119  }
1120 1120  
1121 1121  static int
1122 1122  zfs_domount(vfs_t *vfsp, char *osname)
1123 1123  {
1124 1124          dev_t mount_dev;
1125 1125          uint64_t recordsize, fsid_guid;
1126 1126          int error = 0;
1127 1127          zfsvfs_t *zfsvfs;
1128 1128  
1129 1129          ASSERT(vfsp);
1130 1130          ASSERT(osname);
1131 1131  
1132 1132          error = zfsvfs_create(osname, &zfsvfs);
1133 1133          if (error)
1134 1134                  return (error);
1135 1135          zfsvfs->z_vfs = vfsp;
1136 1136  
1137 1137          /* Initialize the generic filesystem structure. */
1138 1138          vfsp->vfs_bcount = 0;
1139 1139          vfsp->vfs_data = NULL;
1140 1140  
1141 1141          if (zfs_create_unique_device(&mount_dev) == -1) {
1142 1142                  error = ENODEV;
1143 1143                  goto out;
1144 1144          }
1145 1145          ASSERT(vfs_devismounted(mount_dev) == 0);
1146 1146  
1147 1147          if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1148 1148              NULL))
1149 1149                  goto out;
1150 1150  
1151 1151          vfsp->vfs_dev = mount_dev;
1152 1152          vfsp->vfs_fstype = zfsfstype;
1153 1153          vfsp->vfs_bsize = recordsize;
1154 1154          vfsp->vfs_flag |= VFS_NOTRUNC;
1155 1155          vfsp->vfs_data = zfsvfs;
1156 1156  
1157 1157          /*
1158 1158           * The fsid is 64 bits, composed of an 8-bit fs type, which
1159 1159           * separates our fsid from any other filesystem types, and a
1160 1160           * 56-bit objset unique ID.  The objset unique ID is unique to
1161 1161           * all objsets open on this system, provided by unique_create().
1162 1162           * The 8-bit fs type must be put in the low bits of fsid[1]
1163 1163           * because that's where other Solaris filesystems put it.
1164 1164           */
1165 1165          fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1166 1166          ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1167 1167          vfsp->vfs_fsid.val[0] = fsid_guid;
1168 1168          vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1169 1169              zfsfstype & 0xFF;
1170 1170  
1171 1171          /*
1172 1172           * Set features for file system.
1173 1173           */
1174 1174          zfs_set_fuid_feature(zfsvfs);
1175 1175          if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1176 1176                  vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1177 1177                  vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1178 1178                  vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1179 1179          } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1180 1180                  vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1181 1181                  vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1182 1182          }
1183 1183          vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1184 1184  
1185 1185          if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1186 1186                  uint64_t pval;
1187 1187  
1188 1188                  atime_changed_cb(zfsvfs, B_FALSE);
1189 1189                  readonly_changed_cb(zfsvfs, B_TRUE);
1190 1190                  if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1191 1191                          goto out;
1192 1192                  xattr_changed_cb(zfsvfs, pval);
1193 1193                  zfsvfs->z_issnap = B_TRUE;
1194 1194                  zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1195 1195  
1196 1196                  mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1197 1197                  dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1198 1198                  mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1199 1199          } else {
1200 1200                  error = zfsvfs_setup(zfsvfs, B_TRUE);
1201 1201          }
1202 1202  
1203 1203          if (!zfsvfs->z_issnap)
1204 1204                  zfsctl_create(zfsvfs);
1205 1205  out:
1206 1206          if (error) {
1207 1207                  dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1208 1208                  zfsvfs_free(zfsvfs);
1209 1209          } else {
1210 1210                  atomic_add_32(&zfs_active_fs_count, 1);
1211 1211          }
1212 1212  
1213 1213          return (error);
1214 1214  }
1215 1215  
1216 1216  void
1217 1217  zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1218 1218  {
1219 1219          objset_t *os = zfsvfs->z_os;
1220 1220          struct dsl_dataset *ds;
1221 1221  
1222 1222          /*
1223 1223           * Unregister properties.
1224 1224           */
1225 1225          if (!dmu_objset_is_snapshot(os)) {
1226 1226                  ds = dmu_objset_ds(os);
1227 1227                  VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
1228 1228                      zfsvfs) == 0);
1229 1229  
1230 1230                  VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
1231 1231                      zfsvfs) == 0);
1232 1232  
1233 1233                  VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
1234 1234                      zfsvfs) == 0);
1235 1235  
1236 1236                  VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
1237 1237                      zfsvfs) == 0);
1238 1238  
1239 1239                  VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
1240 1240                      zfsvfs) == 0);
1241 1241  
1242 1242                  VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
1243 1243                      zfsvfs) == 0);
1244 1244  
1245 1245                  VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
1246 1246                      zfsvfs) == 0);
1247 1247  
1248 1248                  VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
1249 1249                      zfsvfs) == 0);
1250 1250  
1251 1251                  VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
1252 1252                      zfsvfs) == 0);
1253 1253  
1254 1254                  VERIFY(dsl_prop_unregister(ds, "aclinherit",
1255 1255                      acl_inherit_changed_cb, zfsvfs) == 0);
1256 1256  
1257 1257                  VERIFY(dsl_prop_unregister(ds, "vscan",
1258 1258                      vscan_changed_cb, zfsvfs) == 0);
1259 1259          }
1260 1260  }
1261 1261  
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns 0 and stores the value in *objnum on success.  Returns EINVAL,
 * leaving *objnum untouched, if the string contains a non-digit or if
 * the value does not fit in 64 bits (previously such values silently
 * wrapped around).  An empty string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
        const uint64_t max = ~(uint64_t)0;
        uint64_t num = 0;

        for (; *str != '\0'; str++) {
                uint64_t digit;

                if (*str < '0' || *str > '9')
                        return (EINVAL);
                digit = (uint64_t)(*str - '0');

                /* num * 10 + digit must not exceed the 64-bit maximum. */
                if (num > (max - digit) / 10)
                        return (EINVAL);

                num = num * 10 + digit;
        }

        *objnum = num;
        return (0);
}
1280 1280  
/*
 * The boot path passed from the boot loader is in the form of
 * "rootpool-name/root-filesystem-object-number".  Convert this
 * string to a dataset name: "rootpool-name/root-filesystem-name".
 *
 * bpath is copied to outpath unchanged when it has no '/' (pool name
 * only) or when the part after the first '/' is not a decimal number
 * (already a dataset name).  An empty bpath or one starting with '/'
 * yields EINVAL.  bpath is modified temporarily and restored before
 * returning.  outpath must be large enough to hold the result; no
 * length is passed in.
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
        char *sep;
        uint64_t dsobj;
        int error;

        if (bpath[0] == '\0' || bpath[0] == '/')
                return (EINVAL);

        (void) strcpy(outpath, bpath);

        /* if no '/', just return the pool name */
        sep = strchr(bpath, '/');
        if (sep == NULL)
                return (0);

        /* if not a number, just return the root dataset name */
        if (str_to_uint64(sep + 1, &dsobj) != 0)
                return (0);

        /* Cut bpath down to the pool name for the lookup, then restore. */
        *sep = '\0';
        error = dsl_dsobj_to_dsname(bpath, dsobj, outpath);
        *sep = '/';

        return (error);
}
1316 1316  
1317 1317  /*
1318 1318   * zfs_check_global_label:
1319 1319   *      Check that the hex label string is appropriate for the dataset
1320 1320   *      being mounted into the global_zone proper.
1321 1321   *
1322 1322   *      Return an error if the hex label string is not default or
1323 1323   *      admin_low/admin_high.  For admin_low labels, the corresponding
1324 1324   *      dataset must be readonly.
1325 1325   */
1326 1326  int
1327 1327  zfs_check_global_label(const char *dsname, const char *hexsl)
1328 1328  {
1329 1329          if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1330 1330                  return (0);
1331 1331          if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1332 1332                  return (0);
1333 1333          if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1334 1334                  /* must be readonly */
1335 1335                  uint64_t rdonly;
1336 1336  
1337 1337                  if (dsl_prop_get_integer(dsname,
1338 1338                      zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1339 1339                          return (EACCES);
1340 1340                  return (rdonly ? 0 : EACCES);
1341 1341          }
1342 1342          return (EACCES);
1343 1343  }
1344 1344  
1345 1345  /*
1346 1346   * zfs_mount_label_policy:
1347 1347   *      Determine whether the mount is allowed according to MAC check.
1348 1348   *      by comparing (where appropriate) label of the dataset against
1349 1349   *      the label of the zone being mounted into.  If the dataset has
1350 1350   *      no label, create one.
1351 1351   *
1352 1352   *      Returns:
1353 1353   *               0 :    access allowed
1354 1354   *              >0 :    error code, such as EACCES
1355 1355   */
1356 1356  static int
1357 1357  zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1358 1358  {
1359 1359          int             error, retv;
1360 1360          zone_t          *mntzone = NULL;
1361 1361          ts_label_t      *mnt_tsl;
1362 1362          bslabel_t       *mnt_sl;
1363 1363          bslabel_t       ds_sl;
1364 1364          char            ds_hexsl[MAXNAMELEN];
1365 1365  
1366 1366          retv = EACCES;                          /* assume the worst */
1367 1367  
1368 1368          /*
1369 1369           * Start by getting the dataset label if it exists.
1370 1370           */
1371 1371          error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1372 1372              1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1373 1373          if (error)
1374 1374                  return (EACCES);
1375 1375  
1376 1376          /*
1377 1377           * If labeling is NOT enabled, then disallow the mount of datasets
1378 1378           * which have a non-default label already.  No other label checks
1379 1379           * are needed.
1380 1380           */
1381 1381          if (!is_system_labeled()) {
1382 1382                  if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1383 1383                          return (0);
1384 1384                  return (EACCES);
1385 1385          }
1386 1386  
1387 1387          /*
1388 1388           * Get the label of the mountpoint.  If mounting into the global
1389 1389           * zone (i.e. mountpoint is not within an active zone and the
1390 1390           * zoned property is off), the label must be default or
1391 1391           * admin_low/admin_high only; no other checks are needed.
1392 1392           */
1393 1393          mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1394 1394          if (mntzone->zone_id == GLOBAL_ZONEID) {
1395 1395                  uint64_t zoned;
1396 1396  
1397 1397                  zone_rele(mntzone);
1398 1398  
1399 1399                  if (dsl_prop_get_integer(osname,
1400 1400                      zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1401 1401                          return (EACCES);
1402 1402                  if (!zoned)
1403 1403                          return (zfs_check_global_label(osname, ds_hexsl));
1404 1404                  else
1405 1405                          /*
1406 1406                           * This is the case of a zone dataset being mounted
1407 1407                           * initially, before the zone has been fully created;
1408 1408                           * allow this mount into global zone.
1409 1409                           */
1410 1410                          return (0);
1411 1411          }
1412 1412  
1413 1413          mnt_tsl = mntzone->zone_slabel;
1414 1414          ASSERT(mnt_tsl != NULL);
1415 1415          label_hold(mnt_tsl);
1416 1416          mnt_sl = label2bslabel(mnt_tsl);
1417 1417  
1418 1418          if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1419 1419                  /*
1420 1420                   * The dataset doesn't have a real label, so fabricate one.
1421 1421                   */
1422 1422                  char *str = NULL;
1423 1423  
1424 1424                  if (l_to_str_internal(mnt_sl, &str) == 0 &&
1425 1425                      dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1426 1426                      ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
1427 1427                          retv = 0;
1428 1428                  if (str != NULL)
1429 1429                          kmem_free(str, strlen(str) + 1);
1430 1430          } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1431 1431                  /*
1432 1432                   * Now compare labels to complete the MAC check.  If the
1433 1433                   * labels are equal then allow access.  If the mountpoint
1434 1434                   * label dominates the dataset label, allow readonly access.
1435 1435                   * Otherwise, access is denied.
1436 1436                   */
1437 1437                  if (blequal(mnt_sl, &ds_sl))
1438 1438                          retv = 0;
1439 1439                  else if (bldominates(mnt_sl, &ds_sl)) {
1440 1440                          vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1441 1441                          retv = 0;
1442 1442                  }
1443 1443          }
1444 1444  
1445 1445          label_rele(mnt_tsl);
1446 1446          zone_rele(mntzone);
1447 1447          return (retv);
1448 1448  }
1449 1449  
1450 1450  static int
1451 1451  zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1452 1452  {
1453 1453          int error = 0;
1454 1454          static int zfsrootdone = 0;
1455 1455          zfsvfs_t *zfsvfs = NULL;
1456 1456          znode_t *zp = NULL;
1457 1457          vnode_t *vp = NULL;
1458 1458          char *zfs_bootfs;
1459 1459          char *zfs_devid;
1460 1460  
1461 1461          ASSERT(vfsp);
1462 1462  
1463 1463          /*
1464 1464           * The filesystem that we mount as root is defined in the
1465 1465           * boot property "zfs-bootfs" with a format of
1466 1466           * "poolname/root-dataset-objnum".
1467 1467           */
1468 1468          if (why == ROOT_INIT) {
1469 1469                  if (zfsrootdone++)
1470 1470                          return (EBUSY);
1471 1471                  /*
1472 1472                   * the process of doing a spa_load will require the
1473 1473                   * clock to be set before we could (for example) do
1474 1474                   * something better by looking at the timestamp on
1475 1475                   * an uberblock, so just set it to -1.
1476 1476                   */
1477 1477                  clkset(-1);
1478 1478  
1479 1479                  if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1480 1480                          cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1481 1481                              "bootfs name");
1482 1482                          return (EINVAL);
1483 1483                  }
1484 1484                  zfs_devid = spa_get_bootprop("diskdevid");
1485 1485                  error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1486 1486                  if (zfs_devid)
1487 1487                          spa_free_bootprop(zfs_devid);
1488 1488                  if (error) {
1489 1489                          spa_free_bootprop(zfs_bootfs);
1490 1490                          cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1491 1491                              error);
1492 1492                          return (error);
1493 1493                  }
1494 1494                  if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1495 1495                          spa_free_bootprop(zfs_bootfs);
1496 1496                          cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1497 1497                              error);
1498 1498                          return (error);
1499 1499                  }
1500 1500  
1501 1501                  spa_free_bootprop(zfs_bootfs);
1502 1502  
1503 1503                  if (error = vfs_lock(vfsp))
1504 1504                          return (error);
1505 1505  
1506 1506                  if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1507 1507                          cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1508 1508                          goto out;
1509 1509                  }
1510 1510  
1511 1511                  zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1512 1512                  ASSERT(zfsvfs);
1513 1513                  if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1514 1514                          cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1515 1515                          goto out;
1516 1516                  }
1517 1517  
1518 1518                  vp = ZTOV(zp);
1519 1519                  mutex_enter(&vp->v_lock);
1520 1520                  vp->v_flag |= VROOT;
1521 1521                  mutex_exit(&vp->v_lock);
1522 1522                  rootvp = vp;
1523 1523  
1524 1524                  /*
1525 1525                   * Leave rootvp held.  The root file system is never unmounted.
1526 1526                   */
1527 1527  
1528 1528                  vfs_add((struct vnode *)0, vfsp,
1529 1529                      (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1530 1530  out:
1531 1531                  vfs_unlock(vfsp);
1532 1532                  return (error);
1533 1533          } else if (why == ROOT_REMOUNT) {
1534 1534                  readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1535 1535                  vfsp->vfs_flag |= VFS_REMOUNT;
1536 1536  
1537 1537                  /* refresh mount options */
1538 1538                  zfs_unregister_callbacks(vfsp->vfs_data);
1539 1539                  return (zfs_register_callbacks(vfsp));
1540 1540  
1541 1541          } else if (why == ROOT_UNMOUNT) {
1542 1542                  zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1543 1543                  (void) zfs_sync(vfsp, 0, 0);
1544 1544                  return (0);
1545 1545          }
1546 1546  
1547 1547          /*
1548 1548           * if "why" is equal to anything else other than ROOT_INIT,
1549 1549           * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1550 1550           */
1551 1551          return (ENOTSUP);
1552 1552  }
1553 1553  
1554 1554  /*ARGSUSED*/
1555 1555  static int
1556 1556  zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1557 1557  {
1558 1558          char            *osname;
1559 1559          pathname_t      spn;
1560 1560          int             error = 0;
1561 1561          uio_seg_t       fromspace = (uap->flags & MS_SYSSPACE) ?
1562 1562              UIO_SYSSPACE : UIO_USERSPACE;
1563 1563          int             canwrite;
1564 1564  
1565 1565          if (mvp->v_type != VDIR)
1566 1566                  return (ENOTDIR);
1567 1567  
1568 1568          mutex_enter(&mvp->v_lock);
1569 1569          if ((uap->flags & MS_REMOUNT) == 0 &&
1570 1570              (uap->flags & MS_OVERLAY) == 0 &&
1571 1571              (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1572 1572                  mutex_exit(&mvp->v_lock);
1573 1573                  return (EBUSY);
1574 1574          }
1575 1575          mutex_exit(&mvp->v_lock);
1576 1576  
1577 1577          /*
1578 1578           * ZFS does not support passing unparsed data in via MS_DATA.
1579 1579           * Users should use the MS_OPTIONSTR interface; this means
1580 1580           * that all option parsing is already done and the options struct
1581 1581           * can be interrogated.
1582 1582           */
1583 1583          if ((uap->flags & MS_DATA) && uap->datalen > 0)
1584 1584                  return (EINVAL);
1585 1585  
1586 1586          /*
1587 1587           * Get the objset name (the "special" mount argument).
1588 1588           */
1589 1589          if (error = pn_get(uap->spec, fromspace, &spn))
1590 1590                  return (error);
1591 1591  
1592 1592          osname = spn.pn_path;
1593 1593  
1594 1594          /*
1595 1595           * Check for mount privilege?
1596 1596           *
1597 1597           * If we don't have privilege then see if
1598 1598           * we have local permission to allow it
1599 1599           */
1600 1600          error = secpolicy_fs_mount(cr, mvp, vfsp);
1601 1601          if (error) {
1602 1602                  if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
1603 1603                          vattr_t         vattr;
1604 1604  
1605 1605                          /*
1606 1606                           * Make sure user is the owner of the mount point
1607 1607                           * or has sufficient privileges.
1608 1608                           */
1609 1609  
1610 1610                          vattr.va_mask = AT_UID;
1611 1611  
1612 1612                          if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1613 1613                                  goto out;
1614 1614                          }
1615 1615  
1616 1616                          if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1617 1617                              VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1618 1618                                  goto out;
1619 1619                          }
1620 1620                          secpolicy_fs_mount_clearopts(cr, vfsp);
1621 1621                  } else {
1622 1622                          goto out;
1623 1623                  }
1624 1624          }
1625 1625  
1626 1626          /*
1627 1627           * Refuse to mount a filesystem if we are in a local zone and the
1628 1628           * dataset is not visible.
1629 1629           */
1630 1630          if (!INGLOBALZONE(curproc) &&
1631 1631              (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1632 1632                  error = EPERM;
1633 1633                  goto out;
1634 1634          }
1635 1635  
1636 1636          error = zfs_mount_label_policy(vfsp, osname);
1637 1637          if (error)
1638 1638                  goto out;
1639 1639  
1640 1640          /*
1641 1641           * When doing a remount, we simply refresh our temporary properties
1642 1642           * according to those options set in the current VFS options.
1643 1643           */
1644 1644          if (uap->flags & MS_REMOUNT) {
1645 1645                  /* refresh mount options */
1646 1646                  zfs_unregister_callbacks(vfsp->vfs_data);
1647 1647                  error = zfs_register_callbacks(vfsp);
1648 1648                  goto out;
1649 1649          }
1650 1650  
1651 1651          error = zfs_domount(vfsp, osname);
1652 1652  
1653 1653          /*
1654 1654           * Add an extra VFS_HOLD on our parent vfs so that it can't
1655 1655           * disappear due to a forced unmount.
1656 1656           */
1657 1657          if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1658 1658                  VFS_HOLD(mvp->v_vfsp);
1659 1659  
1660 1660  out:
1661 1661          pn_free(&spn);
1662 1662          return (error);
1663 1663  }
1664 1664  
1665 1665  static int
1666 1666  zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1667 1667  {
1668 1668          zfsvfs_t *zfsvfs = vfsp->vfs_data;
1669 1669          dev32_t d32;
1670 1670          uint64_t refdbytes, availbytes, usedobjs, availobjs;
1671 1671  
1672 1672          ZFS_ENTER(zfsvfs);
1673 1673  
1674 1674          dmu_objset_space(zfsvfs->z_os,
1675 1675              &refdbytes, &availbytes, &usedobjs, &availobjs);
1676 1676  
1677 1677          /*
1678 1678           * The underlying storage pool actually uses multiple block sizes.
1679 1679           * We report the fragsize as the smallest block size we support,
1680 1680           * and we report our blocksize as the filesystem's maximum blocksize.
1681 1681           */
1682 1682          statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1683 1683          statp->f_bsize = zfsvfs->z_max_blksz;
1684 1684  
1685 1685          /*
1686 1686           * The following report "total" blocks of various kinds in the
1687 1687           * file system, but reported in terms of f_frsize - the
1688 1688           * "fragment" size.
1689 1689           */
1690 1690  
1691 1691          statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1692 1692          statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1693 1693          statp->f_bavail = statp->f_bfree; /* no root reservation */
1694 1694  
1695 1695          /*
1696 1696           * statvfs() should really be called statufs(), because it assumes
1697 1697           * static metadata.  ZFS doesn't preallocate files, so the best
1698 1698           * we can do is report the max that could possibly fit in f_files,
1699 1699           * and that minus the number actually used in f_ffree.
1700 1700           * For f_ffree, report the smaller of the number of object available
1701 1701           * and the number of blocks (each object will take at least a block).
1702 1702           */
1703 1703          statp->f_ffree = MIN(availobjs, statp->f_bfree);
1704 1704          statp->f_favail = statp->f_ffree;       /* no "root reservation" */
1705 1705          statp->f_files = statp->f_ffree + usedobjs;
1706 1706  
1707 1707          (void) cmpldev(&d32, vfsp->vfs_dev);
1708 1708          statp->f_fsid = d32;
1709 1709  
1710 1710          /*
1711 1711           * We're a zfs filesystem.
1712 1712           */
1713 1713          (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1714 1714  
1715 1715          statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1716 1716  
1717 1717          statp->f_namemax = ZFS_MAXNAMELEN;
1718 1718  
1719 1719          /*
1720 1720           * We have all of 32 characters to stuff a string here.
1721 1721           * Is there anything useful we could/should provide?
1722 1722           */
1723 1723          bzero(statp->f_fstr, sizeof (statp->f_fstr));
1724 1724  
1725 1725          ZFS_EXIT(zfsvfs);
1726 1726          return (0);
1727 1727  }
1728 1728  
1729 1729  static int
1730 1730  zfs_root(vfs_t *vfsp, vnode_t **vpp)
1731 1731  {
1732 1732          zfsvfs_t *zfsvfs = vfsp->vfs_data;
1733 1733          znode_t *rootzp;
1734 1734          int error;
1735 1735  
1736 1736          ZFS_ENTER(zfsvfs);
1737 1737  
1738 1738          error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1739 1739          if (error == 0)
1740 1740                  *vpp = ZTOV(rootzp);
1741 1741  
1742 1742          ZFS_EXIT(zfsvfs);
1743 1743          return (error);
1744 1744  }
1745 1745  
/*
 * Teardown the zfsvfs::z_os.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 *
 * Returns 0 on success.  Returns EIO (with both locks dropped) when
 * 'unmounting' is FALSE and the filesystem has already been unmounted or
 * has no objset (z_os == NULL).
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
        znode_t *zp;

        rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

        if (!unmounting) {
                /*
                 * We purge the parent filesystem's vfsp as the parent
                 * filesystem and all of its snapshots have their vnode's
                 * v_vfsp set to the parent's filesystem's vfsp.  Note,
                 * 'z_parent' is self referential for non-snapshots.
                 */
                (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
        }

        /*
         * Close the zil. NB: Can't close the zil while zfs_inactive
         * threads are blocked as zil_close can call zfs_inactive.
         * This is why it happens before z_teardown_inactive_lock is
         * taken below.
         */
        if (zfsvfs->z_log) {
                zil_close(zfsvfs->z_log);
                zfsvfs->z_log = NULL;
        }

        rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

        /*
         * If we are not unmounting (ie: online recv) and someone already
         * unmounted this file system while we were doing the switcheroo,
         * or a reopen of z_os failed then just bail out now.
         */
        if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
                rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
                return (EIO);
        }

        /*
         * At this point there are no vops active, and any new vops will
         * fail with EIO since we have z_teardown_lock for writer (only
         * relevant for forced unmount).
         *
         * Release all holds on dbufs.
         */
        mutex_enter(&zfsvfs->z_znodes_lock);
        for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
            zp = list_next(&zfsvfs->z_all_znodes, zp))
                if (zp->z_sa_hdl) {
                        ASSERT(ZTOV(zp)->v_count > 0);
                        zfs_znode_dmu_fini(zp);
                }
        mutex_exit(&zfsvfs->z_znodes_lock);

        /*
         * If we are unmounting, set the unmounted flag and let new vops
         * unblock.  zfs_inactive will have the unmounted behavior, and all
         * other vops will fail with EIO.
         */
        if (unmounting) {
                zfsvfs->z_unmounted = B_TRUE;
                rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
        }

        /*
         * z_os will be NULL if there was an error in attempting to reopen
         * zfsvfs, so just return as the properties had already been
         * unregistered and cached data had been evicted before.
         * (Only reachable when unmounting; the non-unmounting case with a
         * NULL z_os already returned EIO above.)
         */
        if (zfsvfs->z_os == NULL)
                return (0);

        /*
         * Unregister properties.
         */
        zfs_unregister_callbacks(zfsvfs);

        /*
         * Evict cached data.  For a writable filesystem with dirty data,
         * wait for the pool to sync first.
         */
        if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
                if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
                        txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
        (void) dmu_objset_evict_dbufs(zfsvfs->z_os);

        return (0);
}
1841 1841  
1842 1842  /*ARGSUSED*/
1843 1843  static int
1844 1844  zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1845 1845  {
1846 1846          zfsvfs_t *zfsvfs = vfsp->vfs_data;
1847 1847          objset_t *os;
1848 1848          int ret;
1849 1849  
1850 1850          ret = secpolicy_fs_unmount(cr, vfsp);
1851 1851          if (ret) {
1852 1852                  if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1853 1853                      ZFS_DELEG_PERM_MOUNT, cr))
1854 1854                          return (ret);
1855 1855          }
1856 1856  
1857 1857          /*
1858 1858           * We purge the parent filesystem's vfsp as the parent filesystem
1859 1859           * and all of its snapshots have their vnode's v_vfsp set to the
1860 1860           * parent's filesystem's vfsp.  Note, 'z_parent' is self
1861 1861           * referential for non-snapshots.
1862 1862           */
1863 1863          (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1864 1864  
1865 1865          /*
1866 1866           * Unmount any snapshots mounted under .zfs before unmounting the
1867 1867           * dataset itself.
1868 1868           */
1869 1869          if (zfsvfs->z_ctldir != NULL &&
1870 1870              (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1871 1871                  return (ret);
1872 1872          }
1873 1873  
1874 1874          if (!(fflag & MS_FORCE)) {
1875 1875                  /*
1876 1876                   * Check the number of active vnodes in the file system.
1877 1877                   * Our count is maintained in the vfs structure, but the
1878 1878                   * number is off by 1 to indicate a hold on the vfs
1879 1879                   * structure itself.
1880 1880                   *
1881 1881                   * The '.zfs' directory maintains a reference of its
1882 1882                   * own, and any active references underneath are
1883 1883                   * reflected in the vnode count.
1884 1884                   */
1885 1885                  if (zfsvfs->z_ctldir == NULL) {
1886 1886                          if (vfsp->vfs_count > 1)
1887 1887                                  return (EBUSY);
1888 1888                  } else {
1889 1889                          if (vfsp->vfs_count > 2 ||
1890 1890                              zfsvfs->z_ctldir->v_count > 1)
1891 1891                                  return (EBUSY);
1892 1892                  }
1893 1893          }
1894 1894  
1895 1895          vfsp->vfs_flag |= VFS_UNMOUNTED;
1896 1896  
1897 1897          VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1898 1898          os = zfsvfs->z_os;
1899 1899  
1900 1900          /*
1901 1901           * z_os will be NULL if there was an error in
1902 1902           * attempting to reopen zfsvfs.
1903 1903           */
1904 1904          if (os != NULL) {
1905 1905                  /*
1906 1906                   * Unset the objset user_ptr.
1907 1907                   */
1908 1908                  mutex_enter(&os->os_user_ptr_lock);
1909 1909                  dmu_objset_set_user(os, NULL);
1910 1910                  mutex_exit(&os->os_user_ptr_lock);
1911 1911  
1912 1912                  /*
1913 1913                   * Finally release the objset
1914 1914                   */
1915 1915                  dmu_objset_disown(os, zfsvfs);
1916 1916          }
1917 1917  
1918 1918          /*
1919 1919           * We can now safely destroy the '.zfs' directory node.
1920 1920           */
1921 1921          if (zfsvfs->z_ctldir != NULL)
1922 1922                  zfsctl_destroy(zfsvfs);
1923 1923  
1924 1924          return (0);
1925 1925  }
1926 1926  
1927 1927  static int
1928 1928  zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1929 1929  {
1930 1930          zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1931 1931          znode_t         *zp;
1932 1932          uint64_t        object = 0;
1933 1933          uint64_t        fid_gen = 0;
1934 1934          uint64_t        gen_mask;
1935 1935          uint64_t        zp_gen;
1936 1936          int             i, err;
1937 1937  
1938 1938          *vpp = NULL;
1939 1939  
1940 1940          ZFS_ENTER(zfsvfs);
1941 1941  
1942 1942          if (fidp->fid_len == LONG_FID_LEN) {
1943 1943                  zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1944 1944                  uint64_t        objsetid = 0;
1945 1945                  uint64_t        setgen = 0;
1946 1946  
1947 1947                  for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1948 1948                          objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1949 1949  
1950 1950                  for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1951 1951                          setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1952 1952  
1953 1953                  ZFS_EXIT(zfsvfs);
1954 1954  
1955 1955                  err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1956 1956                  if (err)
1957 1957                          return (EINVAL);
1958 1958                  ZFS_ENTER(zfsvfs);
1959 1959          }
1960 1960  
1961 1961          if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1962 1962                  zfid_short_t    *zfid = (zfid_short_t *)fidp;
1963 1963  
1964 1964                  for (i = 0; i < sizeof (zfid->zf_object); i++)
1965 1965                          object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1966 1966  
1967 1967                  for (i = 0; i < sizeof (zfid->zf_gen); i++)
1968 1968                          fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1969 1969          } else {
1970 1970                  ZFS_EXIT(zfsvfs);
1971 1971                  return (EINVAL);
1972 1972          }
1973 1973  
1974 1974          /* A zero fid_gen means we are in the .zfs control directories */
1975 1975          if (fid_gen == 0 &&
1976 1976              (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1977 1977                  *vpp = zfsvfs->z_ctldir;
1978 1978                  ASSERT(*vpp != NULL);
1979 1979                  if (object == ZFSCTL_INO_SNAPDIR) {
1980 1980                          VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1981 1981                              0, NULL, NULL, NULL, NULL, NULL) == 0);
1982 1982                  } else {
1983 1983                          VN_HOLD(*vpp);
1984 1984                  }
1985 1985                  ZFS_EXIT(zfsvfs);
1986 1986                  return (0);
1987 1987          }
1988 1988  
1989 1989          gen_mask = -1ULL >> (64 - 8 * i);
1990 1990  
1991 1991          dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1992 1992          if (err = zfs_zget(zfsvfs, object, &zp)) {
1993 1993                  ZFS_EXIT(zfsvfs);
1994 1994                  return (err);
1995 1995          }
1996 1996          (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1997 1997              sizeof (uint64_t));
1998 1998          zp_gen = zp_gen & gen_mask;
1999 1999          if (zp_gen == 0)
2000 2000                  zp_gen = 1;
2001 2001          if (zp->z_unlinked || zp_gen != fid_gen) {
2002 2002                  dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2003 2003                  VN_RELE(ZTOV(zp));
2004 2004                  ZFS_EXIT(zfsvfs);
2005 2005                  return (EINVAL);
2006 2006          }
2007 2007  
2008 2008          *vpp = ZTOV(zp);
2009 2009          ZFS_EXIT(zfsvfs);
2010 2010          return (0);
2011 2011  }
2012 2012  
2013 2013  /*
2014 2014   * Block out VOPs and close zfsvfs_t::z_os
2015 2015   *
2016 2016   * Note, if successful, then we return with the 'z_teardown_lock' and
2017 2017   * 'z_teardown_inactive_lock' write held.
2018 2018   */
2019 2019  int
2020 2020  zfs_suspend_fs(zfsvfs_t *zfsvfs)
2021 2021  {
2022 2022          int error;
2023 2023  
2024 2024          if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2025 2025                  return (error);
2026 2026          dmu_objset_disown(zfsvfs->z_os, zfsvfs);
2027 2027  
2028 2028          return (0);
2029 2029  }
2030 2030  
/*
 * Reopen zfsvfs_t::z_os and release VOPs.
 *
 * The caller must hold 'z_teardown_lock' and 'z_teardown_inactive_lock'
 * as writer (asserted below); both are released before returning on
 * every path.  'osname' names the objset to re-own.
 *
 * Returns 0 on success.  On failure the error is returned after a forced
 * unmount of the filesystem has been attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
{
        int err;

        ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
        ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

        err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
            &zfsvfs->z_os);
        if (err) {
                /* Record that there is no objset to tear down later. */
                zfsvfs->z_os = NULL;
        } else {
                znode_t *zp;
                uint64_t sa_obj = 0;

                /*
                 * Make sure version hasn't changed
                 */

                err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
                    &zfsvfs->z_version);

                if (err)
                        goto bail;

                err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
                    ZFS_SA_ATTRS, 8, 1, &sa_obj);

                /*
                 * A failed lookup is only fatal once the filesystem is
                 * at ZPL_VERSION_SA or later; sa_obj was initialised
                 * to 0 above.
                 */
                if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
                        goto bail;

                if ((err = sa_setup(zfsvfs->z_os, sa_obj,
                    zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
                        goto bail;

                if (zfsvfs->z_version >= ZPL_VERSION_SA)
                        sa_register_update_callback(zfsvfs->z_os,
                            zfs_sa_upgrade);

                VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

                zfs_set_fuid_feature(zfsvfs);

                /*
                 * Attempt to re-establish all the active znodes with
                 * their dbufs.  If a zfs_rezget() fails, then we'll let
                 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
                 * when they try to use their znode.
                 */
                mutex_enter(&zfsvfs->z_znodes_lock);
                for (zp = list_head(&zfsvfs->z_all_znodes); zp;
                    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
                        (void) zfs_rezget(zp);
                }
                mutex_exit(&zfsvfs->z_znodes_lock);
        }

bail:
        /* release the VOPs */
        rw_exit(&zfsvfs->z_teardown_inactive_lock);
        rrw_exit(&zfsvfs->z_teardown_lock, FTAG);

        if (err) {
                /*
                 * Since we couldn't reopen zfsvfs::z_os, or
                 * setup the sa framework force unmount this file system.
                 * The unmount is only attempted if the covered vnode's
                 * vfs write lock can be taken.
                 */
                if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
                        (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
        }
        return (err);
}
2107 2107  
2108 2108  static void
2109 2109  zfs_freevfs(vfs_t *vfsp)
2110 2110  {
2111 2111          zfsvfs_t *zfsvfs = vfsp->vfs_data;
2112 2112  
2113 2113          /*
2114 2114           * If this is a snapshot, we have an extra VFS_HOLD on our parent
2115 2115           * from zfs_mount().  Release it here.  If we came through
2116 2116           * zfs_mountroot() instead, we didn't grab an extra hold, so
2117 2117           * skip the VFS_RELE for rootvfs.
2118 2118           */
2119 2119          if (zfsvfs->z_issnap && (vfsp != rootvfs))
2120 2120                  VFS_RELE(zfsvfs->z_parent->z_vfs);
2121 2121  
2122 2122          zfsvfs_free(zfsvfs);
2123 2123  
2124 2124          atomic_add_32(&zfs_active_fs_count, -1);
2125 2125  }
2126 2126  
2127 2127  /*
2128 2128   * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
2129 2129   * so we can't safely do any non-idempotent initialization here.
2130 2130   * Leave that to zfs_init() and zfs_fini(), which are called
2131 2131   * from the module's _init() and _fini() entry points.
2132 2132   */
2133 2133  /*ARGSUSED*/
2134 2134  static int
2135 2135  zfs_vfsinit(int fstype, char *name)
2136 2136  {
2137 2137          int error;
2138 2138  
2139 2139          zfsfstype = fstype;
2140 2140  
2141 2141          /*
2142 2142           * Setup vfsops and vnodeops tables.
2143 2143           */
2144 2144          error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
2145 2145          if (error != 0) {
2146 2146                  cmn_err(CE_WARN, "zfs: bad vfs ops template");
2147 2147          }
2148 2148  
2149 2149          error = zfs_create_op_tables();
2150 2150          if (error) {
2151 2151                  zfs_remove_op_tables();
2152 2152                  cmn_err(CE_WARN, "zfs: bad vnode ops template");
2153 2153                  (void) vfs_freevfsops_by_type(zfsfstype);
2154 2154                  return (error);
2155 2155          }
2156 2156  
2157 2157          mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
2158 2158  
2159 2159          /*
2160 2160           * Unique major number for all zfs mounts.
2161 2161           * If we run out of 32-bit minors, we'll getudev() another major.
2162 2162           */
2163 2163          zfs_major = ddi_name_to_major(ZFS_DRIVER);
2164 2164          zfs_minor = ZFS_MIN_MINOR;
2165 2165  
2166 2166          return (0);
2167 2167  }
2168 2168  
2169 2169  void
2170 2170  zfs_init(void)
2171 2171  {
2172 2172          /*
2173 2173           * Initialize .zfs directory structures
2174 2174           */
2175 2175          zfsctl_init();
2176 2176  
2177 2177          /*
2178 2178           * Initialize znode cache, vnode ops, etc...
2179 2179           */
2180 2180          zfs_znode_init();
2181 2181  
2182 2182          dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2183 2183  }
2184 2184  
/*
 * Counterpart of zfs_init(): calls the fini routines matching the
 * zfsctl_init() and zfs_znode_init() calls made there.
 */
void
zfs_fini(void)
{
        zfsctl_fini();

        zfs_znode_fini();
}
2191 2191  
2192 2192  int
2193 2193  zfs_busy(void)
2194 2194  {
2195 2195          return (zfs_active_fs_count != 0);
2196 2196  }
2197 2197  
2198 2198  int
2199 2199  zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2200 2200  {
2201 2201          int error;
2202 2202          objset_t *os = zfsvfs->z_os;
2203 2203          dmu_tx_t *tx;
2204 2204  
2205 2205          if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2206 2206                  return (EINVAL);
2207 2207  
2208 2208          if (newvers < zfsvfs->z_version)
2209 2209                  return (EINVAL);
2210 2210  
2211 2211          if (zfs_spa_version_map(newvers) >
2212 2212              spa_version(dmu_objset_spa(zfsvfs->z_os)))
2213 2213                  return (ENOTSUP);
2214 2214  
2215 2215          tx = dmu_tx_create(os);
2216 2216          dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2217 2217          if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2218 2218                  dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2219 2219                      ZFS_SA_ATTRS);
2220 2220                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2221 2221          }
2222 2222          error = dmu_tx_assign(tx, TXG_WAIT);
2223 2223          if (error) {
2224 2224                  dmu_tx_abort(tx);
2225 2225                  return (error);
2226 2226          }
2227 2227  
2228 2228          error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2229 2229              8, 1, &newvers, tx);
2230 2230  
2231 2231          if (error) {
2232 2232                  dmu_tx_commit(tx);
2233 2233                  return (error);
2234 2234          }
2235 2235

↓ open down ↓

2235 lines elided

↑ open up ↑

2236 2236          if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2237 2237                  uint64_t sa_obj;
2238 2238  
2239 2239                  ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2240 2240                      SPA_VERSION_SA);
2241 2241                  sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2242 2242                      DMU_OT_NONE, 0, tx);
2243 2243  
2244 2244                  error = zap_add(os, MASTER_NODE_OBJ,
2245 2245                      ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2246      -                ASSERT3U(error, ==, 0);
     2246 +                ASSERT0(error);
2247 2247  
2248 2248                  VERIFY(0 == sa_set_sa_object(os, sa_obj));
2249 2249                  sa_register_update_callback(os, zfs_sa_upgrade);
2250 2250          }
2251 2251  
2252 2252          spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2253 2253              "from %llu to %llu", zfsvfs->z_version, newvers);
2254 2254  
2255 2255          dmu_tx_commit(tx);
2256 2256

2257 2257          zfsvfs->z_version = newvers;
2258 2258  
2259 2259          zfs_set_fuid_feature(zfsvfs);
2260 2260  
2261 2261          return (0);
2262 2262  }
2263 2263  
2264 2264  /*
2265 2265   * Read a property stored within the master node.
2266 2266   */
2267 2267  int
2268 2268  zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2269 2269  {
2270 2270          const char *pname;
2271 2271          int error = ENOENT;
2272 2272  
2273 2273          /*
2274 2274           * Look up the file system's value for the property.  For the
2275 2275           * version property, we look up a slightly different string.
2276 2276           */
2277 2277          if (prop == ZFS_PROP_VERSION)
2278 2278                  pname = ZPL_VERSION_STR;
2279 2279          else
2280 2280                  pname = zfs_prop_to_name(prop);
2281 2281  
2282 2282          if (os != NULL)
2283 2283                  error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2284 2284  
2285 2285          if (error == ENOENT) {
2286 2286                  /* No value set, use the default value */
2287 2287                  switch (prop) {
2288 2288                  case ZFS_PROP_VERSION:
2289 2289                          *value = ZPL_VERSION;
2290 2290                          break;
2291 2291                  case ZFS_PROP_NORMALIZE:
2292 2292                  case ZFS_PROP_UTF8ONLY:
2293 2293                          *value = 0;
2294 2294                          break;
2295 2295                  case ZFS_PROP_CASE:
2296 2296                          *value = ZFS_CASE_SENSITIVE;
2297 2297                          break;
2298 2298                  default:
2299 2299                          return (error);
2300 2300                  }
2301 2301                  error = 0;
2302 2302          }
2303 2303          return (error);
2304 2304  }
2305 2305  
2306 2306  static vfsdef_t vfw = {
2307 2307          VFSDEF_VERSION,
2308 2308          MNTTYPE_ZFS,
2309 2309          zfs_vfsinit,
2310 2310          VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
2311 2311              VSW_XID|VSW_ZMOUNT,
2312 2312          &zfs_mntopts
2313 2313  };
2314 2314  
2315 2315  struct modlfs zfs_modlfs = {
2316 2316          &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
2317 2317  };

↓ open down ↓

61 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX