illumos-merge Wdiff usr/src/uts/common/fs/zfs/zfs_vfsops.c

Print this page

2882 implement libzfs_core
2883 changing "canmount" property to "on" should not always remount dataset
2900 "zfs snapshot" should be able to create multiple, arbitrary snapshots at once
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Chris Siden <christopher.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Bill Pijewski <wdp@joyent.com>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/zfs_vfsops.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_vfsops.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *

↓ open down ↓

12 lines elided

↑ open up ↑

  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright (c) 2012 by Delphix. All rights reserved.
  23   24   */
  24   25  
  25   26  /* Portions Copyright 2010 Robert Milkowski */
  26   27  
  27   28  #include <sys/types.h>
  28   29  #include <sys/param.h>
  29   30  #include <sys/systm.h>
  30   31  #include <sys/sysmacros.h>
  31   32  #include <sys/kmem.h>
  32   33  #include <sys/pathname.h>

  33   34  #include <sys/vnode.h>
  34   35  #include <sys/vfs.h>
  35   36  #include <sys/vfs_opreg.h>
  36   37  #include <sys/mntent.h>
  37   38  #include <sys/mount.h>
  38   39  #include <sys/cmn_err.h>
  39   40  #include "fs/fs_subr.h"
  40   41  #include <sys/zfs_znode.h>
  41   42  #include <sys/zfs_dir.h>
  42   43  #include <sys/zil.h>
  43   44  #include <sys/fs/zfs.h>
  44   45  #include <sys/dmu.h>
  45   46  #include <sys/dsl_prop.h>
  46   47  #include <sys/dsl_dataset.h>
  47   48  #include <sys/dsl_deleg.h>
  48   49  #include <sys/spa.h>
  49   50  #include <sys/zap.h>
  50   51  #include <sys/sa.h>
  51   52  #include <sys/varargs.h>
  52   53  #include <sys/policy.h>
  53   54  #include <sys/atomic.h>
  54   55  #include <sys/mkdev.h>
  55   56  #include <sys/modctl.h>
  56   57  #include <sys/refstr.h>
  57   58  #include <sys/zfs_ioctl.h>
  58   59  #include <sys/zfs_ctldir.h>
  59   60  #include <sys/zfs_fuid.h>
  60   61  #include <sys/bootconf.h>
  61   62  #include <sys/sunddi.h>
  62   63  #include <sys/dnlc.h>
  63   64  #include <sys/dmu_objset.h>
  64   65  #include <sys/spa_boot.h>
  65   66  #include <sys/sa.h>
  66   67  #include "zfs_comutil.h"
   67   68  
            /*
             * File-scope state and operation tables for the ZFS VFS layer.
             * NOTE(review): zfsfstype and zfs_vfsops are not assigned in the part of
             * the file visible here; presumably they are filled in at module init.
             */
   68   69  int zfsfstype;
   69   70  vfsops_t *zfs_vfsops = NULL;
            /*
             * Major/minor pair that zfs_create_unique_device() combines with
             * makedevice().  The writes made by that function happen while holding
             * zfs_dev_mtx.
             */
   70   71  static major_t zfs_major;
   71   72  static minor_t zfs_minor;
   72   73  static kmutex_t zfs_dev_mtx;
   73   74  
            /* Defined elsewhere; consulted by zfs_sync(). */
   74   75  extern int sys_shutdown;
   75   76  
            /* Forward declarations for the entry points named in zfs_vfsops_template. */
   76   77  static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
   77   78  static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
   78   79  static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
   79   80  static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
   80   81  static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
   81   82  static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
   82   83  static void zfs_freevfs(vfs_t *vfsp);
   83   84  
            /*
             * Table pairing each VFS operation name with this file's implementation.
             * The list ends with a NULL/NULL pair.
             */
   84   85  static const fs_operation_def_t zfs_vfsops_template[] = {
   85   86          VFSNAME_MOUNT,          { .vfs_mount = zfs_mount },
   86   87          VFSNAME_MOUNTROOT,      { .vfs_mountroot = zfs_mountroot },
   87   88          VFSNAME_UNMOUNT,        { .vfs_unmount = zfs_umount },
   88   89          VFSNAME_ROOT,           { .vfs_root = zfs_root },
   89   90          VFSNAME_STATVFS,        { .vfs_statvfs = zfs_statvfs },
   90   91          VFSNAME_SYNC,           { .vfs_sync = zfs_sync },
   91   92          VFSNAME_VGET,           { .vfs_vget = zfs_vget },
   92   93          VFSNAME_FREEVFS,        { .vfs_freevfs = zfs_freevfs },
   93   94          NULL,                   NULL
   94   95  };
   95   96  
            /*
             * Reduced table that supplies only the freevfs operation.
             * NOTE(review): going by the name, presumably installed on a vfs whose
             * other operations should fail with EIO -- confirm where it is used.
             */
   96   97  static const fs_operation_def_t zfs_vfsops_eio_template[] = {
   97   98          VFSNAME_FREEVFS,        { .vfs_freevfs =  zfs_freevfs },
   98   99          NULL,                   NULL
   99  100  };
  100  101  
  101  102  /*
  102  103   * We need to keep a count of active fs's.
  103  104   * This is necessary to prevent our module
  104  105   * from being unloaded after a umount -f
  105  106   */
  106  107  static uint32_t zfs_active_fs_count = 0;
  107  108  
            /*
             * NULL-terminated lists naming the opposite option for each entry of
             * mntopts[] below.  Each list is passed as the second field of the
             * matching mntopt_t (presumably the options that entry cancels --
             * confirm against the mntopt_t definition).
             */
  108  109  static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
  109  110  static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
  110  111  static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
  111  112  static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
  112  113  
  113  114  /*
  114  115   * MO_DEFAULT is not used since the default value is determined
  115  116   * by the equivalent property.
  116  117   */
  117  118  static mntopt_t mntopts[] = {
  118  119          { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
  119  120          { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
  120  121          { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
  121  122          { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
  122  123  };
  123  124  
            /* The option table wrapped with its element count. */
  124  125  static mntopts_t zfs_mntopts = {
  125  126          sizeof (mntopts) / sizeof (mntopt_t),
  126  127          mntopts
  127  128  };
  128  129  
            /*
             * VFS sync entry point (installed as .vfs_sync in zfs_vfsops_template).
             *
             *      vfsp    filesystem to sync, or NULL to sync all pools
             *      flag    sync flags; a request with SYNC_ATTR set returns at once
             *      cr      not used by this function
             *
             * Every explicit return in this function is 0.  ZFS_ENTER() is a macro
             * defined elsewhere and may have an exit path of its own -- confirm.
             */
  129  130  /*ARGSUSED*/
  130  131  int
  131  132  zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
  132  133  {
  133  134          /*
  134  135           * Data integrity is job one.  We don't want a compromised kernel
  135  136           * writing to the storage pool, so we never sync during panic.
  136  137           */
  137  138          if (panicstr)
  138  139                  return (0);
  139  140  
  140  141          /*
  141  142           * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
  142  143           * to sync metadata, which they would otherwise cache indefinitely.
  143  144           * Semantically, the only requirement is that the sync be initiated.
  144  145           * The DMU syncs out txgs frequently, so there's nothing to do.
  145  146           */
  146  147          if (flag & SYNC_ATTR)
  147  148                  return (0);
  148  149  
  149  150          if (vfsp != NULL) {
  150  151                  /*
  151  152                   * Sync a specific filesystem.
  152  153                   */
  153  154                  zfsvfs_t *zfsvfs = vfsp->vfs_data;
  154  155                  dsl_pool_t *dp;
  155  156  
  156  157                  ZFS_ENTER(zfsvfs);
  157  158                  dp = dmu_objset_pool(zfsvfs->z_os);
  158  159  
  159  160                  /*
  160  161                   * If the system is shutting down, then skip any
  161  162                   * filesystems which may exist on a suspended pool.
  162  163                   */
  163  164                  if (sys_shutdown && spa_suspended(dp->dp_spa)) {
  164  165                          ZFS_EXIT(zfsvfs);
  165  166                          return (0);
  166  167                  }
  167  168  
                        /* Only commit when this filesystem has an intent log. */
  168  169                  if (zfsvfs->z_log != NULL)
  169  170                          zil_commit(zfsvfs->z_log, 0);
  170  171  
  171  172                  ZFS_EXIT(zfsvfs);
  172  173          } else {
  173  174                  /*
  174  175                   * Sync all ZFS filesystems.  This is what happens when you
  175  176                   * run sync(1M).  Unlike other filesystems, ZFS honors the
  176  177                   * request by waiting for all pools to commit all dirty data.
  177  178                   */
  178  179                  spa_sync_allpools();
  179  180          }
  180  181  
  181  182          return (0);
  182  183  }
  183  184  
            /*
             * Choose a device number that vfs_devismounted() does not report as in
             * use and store it in *dev.
             *
             * The inner loop steps zfs_minor forward (wrapping after MAXMIN32) until
             * an unused device is found or the search arrives back at the minor it
             * started from.  In the latter case a fresh major is requested from
             * getudev() and the search restarts at minor 0.
             *
             * Returns 0 on success, or -1 if getudev() cannot supply a new major.
             *
             * NOTE(review): zfs_minor is read for `start' and in both loop
             * conditions without zfs_dev_mtx held; confirm this is safe against
             * concurrent callers.
             */
  184  185  static int
  185  186  zfs_create_unique_device(dev_t *dev)
  186  187  {
  187  188          major_t new_major;
  188  189  
  189  190          do {
  190  191                  ASSERT3U(zfs_minor, <=, MAXMIN32);
                        /* Minor at which this pass began; used to detect a full lap. */
  191  192                  minor_t start = zfs_minor;
  192  193                  do {
  193  194                          mutex_enter(&zfs_dev_mtx);
  194  195                          if (zfs_minor >= MAXMIN32) {
  195  196                                  /*
  196  197                                   * If we're still using the real major
  197  198                                   * keep out of /dev/zfs and /dev/zvol minor
  198  199                                   * number space.  If we're using a getudev()'ed
  199  200                                   * major number, we can use all of its minors.
  200  201                                   */
  201  202                                  if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
  202  203                                          zfs_minor = ZFS_MIN_MINOR;
  203  204                                  else
  204  205                                          zfs_minor = 0;
  205  206                          } else {
  206  207                                  zfs_minor++;
  207  208                          }
  208  209                          *dev = makedevice(zfs_major, zfs_minor);
  209  210                          mutex_exit(&zfs_dev_mtx);
  210  211                  } while (vfs_devismounted(*dev) && zfs_minor != start);
  211  212                  if (zfs_minor == start) {
  212  213                          /*
  213  214                           * We are using all ~262,000 minor numbers for the
  214  215                           * current major number.  Create a new major number.
  215  216                           */
  216  217                          if ((new_major = getudev()) == (major_t)-1) {
  217  218                                  cmn_err(CE_WARN,
  218  219                                      "zfs_mount: Can't get unique major "
  219  220                                      "device number.");
  220  221                                  return (-1);
  221  222                          }
  222  223                          mutex_enter(&zfs_dev_mtx);
  223  224                          zfs_major = new_major;
  224  225                          zfs_minor = 0;
  225  226  
  226  227                          mutex_exit(&zfs_dev_mtx);
  227  228                  } else {
                                /* *dev now holds a device number that is not mounted. */
  228  229                          break;
  229  230                  }
  230  231                  /* CONSTANTCONDITION */
  231  232          } while (1);
  232  233  
  233  234          return (0);
  234  235  }
  235  236  
            /*
             * Callback for the "atime" property (registered in
             * zfs_register_callbacks()).  arg is the zfsvfs_t of the mount.
             *
             * A newval equal to TRUE sets z_atime and switches the vfs to the atime
             * mount option; any other value clears z_atime and switches it to
             * noatime.
             */
  236  237  static void
  237  238  atime_changed_cb(void *arg, uint64_t newval)
  238  239  {
  239  240          zfsvfs_t *zfsvfs = arg;
  240  241  
  241  242          if (newval == TRUE) {
  242  243                  zfsvfs->z_atime = TRUE;
  243  244                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
  244  245                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
  245  246          } else {
  246  247                  zfsvfs->z_atime = FALSE;
  247  248                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
  248  249                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
  249  250          }
  250  251  }
  251  252  
            /*
             * Callback for the "xattr" property (registered in
             * zfs_register_callbacks()).  arg is the zfsvfs_t of the mount.
             *
             * A newval equal to TRUE sets VFS_XATTR in vfs_flag and selects the
             * xattr mount option; any other value clears the flag and selects
             * noxattr.  vfs_flag is modified without taking a lock here.
             */
  252  253  static void
  253  254  xattr_changed_cb(void *arg, uint64_t newval)
  254  255  {
  255  256          zfsvfs_t *zfsvfs = arg;
  256  257  
  257  258          if (newval == TRUE) {
  258  259                  /* XXX locking on vfs_flag? */
  259  260                  zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
  260  261                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
  261  262                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
  262  263          } else {
  263  264                  /* XXX locking on vfs_flag? */
  264  265                  zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
  265  266                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
  266  267                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
  267  268          }
  268  269  }
  269  270  
            /*
             * Callback for the "recordsize" property (registered in
             * zfs_register_callbacks()).  arg is the zfsvfs_t of the mount.
             *
             * Stores the new block size in z_max_blksz and in the vfs's vfs_bsize.
             */
  270  271  static void
  271  272  blksz_changed_cb(void *arg, uint64_t newval)
  272  273  {
  273  274          zfsvfs_t *zfsvfs = arg;
  274  275  
                /*
                 * A value outside [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] or one that
                 * is not a power of two is replaced by SPA_MAXBLOCKSIZE.
                 */
  275  276          if (newval < SPA_MINBLOCKSIZE ||
  276  277              newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
  277  278                  newval = SPA_MAXBLOCKSIZE;
  278  279  
  279  280          zfsvfs->z_max_blksz = newval;
  280  281          zfsvfs->z_vfs->vfs_bsize = newval;
  281  282  }
  282  283  
            /*
             * Callback for the "readonly" property (registered in
             * zfs_register_callbacks(), which also calls it directly to re-apply an
             * explicit ro/rw mount option).  arg is the zfsvfs_t of the mount.
             *
             * Any non-zero newval sets VFS_RDONLY and selects the ro mount option;
             * zero clears the flag and selects rw.
             */
  283  284  static void
  284  285  readonly_changed_cb(void *arg, uint64_t newval)
  285  286  {
  286  287          zfsvfs_t *zfsvfs = arg;
  287  288  
  288  289          if (newval) {
  289  290                  /* XXX locking on vfs_flag? */
  290  291                  zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
  291  292                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
  292  293                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
  293  294          } else {
  294  295                  /* XXX locking on vfs_flag? */
  295  296                  zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
  296  297                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
  297  298                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
  298  299          }
  299  300  }
  300  301  
            /*
             * Callback for the "devices" property (registered in
             * zfs_register_callbacks()).  arg is the zfsvfs_t of the mount.
             *
             * Note the inverted sense: a newval of FALSE sets VFS_NODEVICES and
             * selects the nodevices mount option; any other value clears the flag
             * and selects devices.
             */
  301  302  static void
  302  303  devices_changed_cb(void *arg, uint64_t newval)
  303  304  {
  304  305          zfsvfs_t *zfsvfs = arg;
  305  306  
  306  307          if (newval == FALSE) {
  307  308                  zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
  308  309                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
  309  310                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
  310  311          } else {
  311  312                  zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
  312  313                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
  313  314                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
  314  315          }
  315  316  }
  316  317  
            /*
             * Callback for the "setuid" property (registered in
             * zfs_register_callbacks()).  arg is the zfsvfs_t of the mount.
             *
             * A newval of FALSE sets VFS_NOSETUID and selects the nosetuid mount
             * option; any other value clears the flag and selects setuid.
             */
  317  318  static void
  318  319  setuid_changed_cb(void *arg, uint64_t newval)
  319  320  {
  320  321          zfsvfs_t *zfsvfs = arg;
  321  322  
  322  323          if (newval == FALSE) {
  323  324                  zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
  324  325                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
  325  326                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
  326  327          } else {
  327  328                  zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
  328  329                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
  329  330                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
  330  331          }
  331  332  }
  332  333  
            /*
             * Callback for the "exec" property (registered in
             * zfs_register_callbacks()).  arg is the zfsvfs_t of the mount.
             *
             * A newval of FALSE sets VFS_NOEXEC and selects the noexec mount option;
             * any other value clears the flag and selects exec.
             */
  333  334  static void
  334  335  exec_changed_cb(void *arg, uint64_t newval)
  335  336  {
  336  337          zfsvfs_t *zfsvfs = arg;
  337  338  
  338  339          if (newval == FALSE) {
  339  340                  zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
  340  341                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
  341  342                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
  342  343          } else {
  343  344                  zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
  344  345                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
  345  346                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
  346  347          }
  347  348  }
  348  349  
  349  350  /*
  350  351   * The nbmand mount option can be changed at mount time.
  351  352   * We can't allow it to be toggled on live file systems or incorrect
  352  353   * behavior may be seen from cifs clients
  353  354   *
  354  355   * This property isn't registered via dsl_prop_register(), but this callback
  355  356   * will be called when a file system is first mounted
             *
             * arg is the zfsvfs_t of the mount.  A newval of FALSE selects the
             * nonbmand mount option; any other value selects nbmand.  Only the mount
             * options are changed; no vfs_flag bit is touched here.
  356  357   */
  357  358  static void
  358  359  nbmand_changed_cb(void *arg, uint64_t newval)
  359  360  {
  360  361          zfsvfs_t *zfsvfs = arg;
  361  362          if (newval == FALSE) {
  362  363                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
  363  364                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
  364  365          } else {
  365  366                  vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
  366  367                  vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
  367  368          }
  368  369  }
  369  370  
            /*
             * Callback for the "snapdir" property (registered in
             * zfs_register_callbacks()): stores newval unchanged in z_show_ctldir.
             * arg is the zfsvfs_t of the mount.
             */
  370  371  static void
  371  372  snapdir_changed_cb(void *arg, uint64_t newval)
  372  373  {
  373  374          zfsvfs_t *zfsvfs = arg;
  374  375  
  375  376          zfsvfs->z_show_ctldir = newval;
  376  377  }
  377  378  
            /*
             * Callback for the "vscan" property (registered in
             * zfs_register_callbacks()): stores newval unchanged in z_vscan.
             * arg is the zfsvfs_t of the mount.
             */
  378  379  static void
  379  380  vscan_changed_cb(void *arg, uint64_t newval)
  380  381  {
  381  382          zfsvfs_t *zfsvfs = arg;
  382  383  
  383  384          zfsvfs->z_vscan = newval;
  384  385  }
  385  386  
            /*
             * Callback for the "aclmode" property (registered in
             * zfs_register_callbacks()): stores newval unchanged in z_acl_mode.
             * arg is the zfsvfs_t of the mount.
             */
  386  387  static void
  387  388  acl_mode_changed_cb(void *arg, uint64_t newval)
  388  389  {
  389  390          zfsvfs_t *zfsvfs = arg;
  390  391  
  391  392          zfsvfs->z_acl_mode = newval;
  392  393  }
  393  394  
            /*
             * Callback for the "aclinherit" property (registered in
             * zfs_register_callbacks()): stores newval unchanged in z_acl_inherit.
             * arg is the zfsvfs_t of the mount.
             */
  394  395  static void
  395  396  acl_inherit_changed_cb(void *arg, uint64_t newval)
  396  397  {
  397  398          zfsvfs_t *zfsvfs = arg;
  398  399  
  399  400          zfsvfs->z_acl_inherit = newval;
  400  401  }
  401  402  
            /*
             * Register the property-change callbacks for the dataset behind vfsp and
             * then re-apply any mount options that were set explicitly on the vfs.
             *
             * The function works in three phases:
             *   1. Record which mount options are currently set on vfsp (the do_*
             *      flags say whether the paired value variable was assigned).
             *   2. Register one callback per property with dsl_prop_register().
             *   3. Call the callbacks directly for every option recorded in phase 1,
             *      and always call nbmand_changed_cb().
             *
             * Returns 0 on success.  Returns the error from dsl_prop_get_integer()
             * if the nbmand lookup fails (nothing has been registered at that
             * point), or the first error from dsl_prop_register() after
             * unregistering all callbacks.
             */
  402  403  static int
  403  404  zfs_register_callbacks(vfs_t *vfsp)
  404  405  {
  405  406          struct dsl_dataset *ds = NULL;
  406  407          objset_t *os = NULL;
  407  408          zfsvfs_t *zfsvfs = NULL;
  408  409          uint64_t nbmand;
                /*
                 * Each value below is assigned only on the paths that also set its
                 * do_* flag, and is read only when that flag is set.
                 */
  409  410          int readonly, do_readonly = B_FALSE;
  410  411          int setuid, do_setuid = B_FALSE;
  411  412          int exec, do_exec = B_FALSE;
  412  413          int devices, do_devices = B_FALSE;
  413  414          int xattr, do_xattr = B_FALSE;
  414  415          int atime, do_atime = B_FALSE;
  415  416          int error = 0;
  416  417  
  417  418          ASSERT(vfsp);
  418  419          zfsvfs = vfsp->vfs_data;
  419  420          ASSERT(zfsvfs);
  420  421          os = zfsvfs->z_os;
  421  422  
  422  423          /*
  423  424           * The act of registering our callbacks will destroy any mount
  424  425           * options we may have.  In order to enable temporary overrides
  425  426           * of mount options, we stash away the current values and
  426  427           * restore them after we register the callbacks.
  427  428           */
                /* A pool that is not writeable forces read-only regardless of options. */
  428  429          if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
  429  430              !spa_writeable(dmu_objset_spa(os))) {
  430  431                  readonly = B_TRUE;
  431  432                  do_readonly = B_TRUE;
  432  433          } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
  433  434                  readonly = B_FALSE;
  434  435                  do_readonly = B_TRUE;
  435  436          }
                /* nosuid turns off both devices and setuid; otherwise check each. */
  436  437          if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
  437  438                  devices = B_FALSE;
  438  439                  setuid = B_FALSE;
  439  440                  do_devices = B_TRUE;
  440  441                  do_setuid = B_TRUE;
  441  442          } else {
  442  443                  if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
  443  444                          devices = B_FALSE;
  444  445                          do_devices = B_TRUE;
  445  446                  } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
  446  447                          devices = B_TRUE;
  447  448                          do_devices = B_TRUE;
  448  449                  }
  449  450  
  450  451                  if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
  451  452                          setuid = B_FALSE;
  452  453                          do_setuid = B_TRUE;
  453  454                  } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
  454  455                          setuid = B_TRUE;
  455  456                          do_setuid = B_TRUE;
  456  457                  }
  457  458          }
  458  459          if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
  459  460                  exec = B_FALSE;
  460  461                  do_exec = B_TRUE;
  461  462          } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
  462  463                  exec = B_TRUE;
  463  464                  do_exec = B_TRUE;
  464  465          }
  465  466          if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
  466  467                  xattr = B_FALSE;
  467  468                  do_xattr = B_TRUE;
  468  469          } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
  469  470                  xattr = B_TRUE;
  470  471                  do_xattr = B_TRUE;
  471  472          }
  472  473          if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
  473  474                  atime = B_FALSE;
  474  475                  do_atime = B_TRUE;
  475  476          } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
  476  477                  atime = B_TRUE;
  477  478                  do_atime = B_TRUE;
  478  479          }
  479  480  
  480  481          /*
  481  482           * nbmand is a special property.  It can only be changed at
  482  483           * mount time.
  483  484           *
  484  485           * This is weird, but it is documented to only be changeable
  485  486           * at mount time.
  486  487           */
  487  488          if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
  488  489                  nbmand = B_FALSE;
  489  490          } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
  490  491                  nbmand = B_TRUE;
  491  492          } else {
                        /* No explicit option: read the dataset's nbmand property. */
  492  493                  char osname[MAXNAMELEN];
  493  494  
  494  495                  dmu_objset_name(os, osname);
  495  496                  if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
  496  497                      NULL)) {
                                /* Nothing registered yet, so no cleanup is needed. */
  497  498                          return (error);
  498  499                  }
  499  500          }
  500  501  
  501  502          /*
  502  503           * Register property callbacks.
  503  504           *
  504  505           * It would probably be fine to just check for i/o error from
  505  506           * the first prop_register(), but I guess I like to go
  506  507           * overboard...
  507  508           */
                /*
                 * The "error ? error : ..." chain stops registering after the first
                 * failure and keeps that first error code.
                 */
  508  509          ds = dmu_objset_ds(os);
  509  510          error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
  510  511          error = error ? error : dsl_prop_register(ds,
  511  512              "xattr", xattr_changed_cb, zfsvfs);
  512  513          error = error ? error : dsl_prop_register(ds,
  513  514              "recordsize", blksz_changed_cb, zfsvfs);
  514  515          error = error ? error : dsl_prop_register(ds,
  515  516              "readonly", readonly_changed_cb, zfsvfs);
  516  517          error = error ? error : dsl_prop_register(ds,
  517  518              "devices", devices_changed_cb, zfsvfs);
  518  519          error = error ? error : dsl_prop_register(ds,
  519  520              "setuid", setuid_changed_cb, zfsvfs);
  520  521          error = error ? error : dsl_prop_register(ds,
  521  522              "exec", exec_changed_cb, zfsvfs);
  522  523          error = error ? error : dsl_prop_register(ds,
  523  524              "snapdir", snapdir_changed_cb, zfsvfs);
  524  525          error = error ? error : dsl_prop_register(ds,
  525  526              "aclmode", acl_mode_changed_cb, zfsvfs);
  526  527          error = error ? error : dsl_prop_register(ds,
  527  528              "aclinherit", acl_inherit_changed_cb, zfsvfs);
  528  529          error = error ? error : dsl_prop_register(ds,
  529  530              "vscan", vscan_changed_cb, zfsvfs);
  530  531          if (error)
  531  532                  goto unregister;
  532  533  
  533  534          /*
  534  535           * Invoke our callbacks to restore temporary mount options.
  535  536           */
  536  537          if (do_readonly)
  537  538                  readonly_changed_cb(zfsvfs, readonly);
  538  539          if (do_setuid)
  539  540                  setuid_changed_cb(zfsvfs, setuid);
  540  541          if (do_exec)
  541  542                  exec_changed_cb(zfsvfs, exec);
  542  543          if (do_devices)
  543  544                  devices_changed_cb(zfsvfs, devices);
  544  545          if (do_xattr)
  545  546                  xattr_changed_cb(zfsvfs, xattr);
  546  547          if (do_atime)
  547  548                  atime_changed_cb(zfsvfs, atime);
  548  549  
                /* nbmand is never registered above, so it is always applied by hand. */
  549  550          nbmand_changed_cb(zfsvfs, nbmand);
  550  551  
  551  552          return (0);
  552  553  
  553  554  unregister:
  554  555          /*
  555  556           * We may attempt to unregister some callbacks that are not
  556  557           * registered, but this is OK; it will simply return ENOMSG,
  557  558           * which we will ignore.
  558  559           */
  559  560          (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
  560  561          (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
  561  562          (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
  562  563          (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
  563  564          (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
  564  565          (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
  565  566          (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
  566  567          (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
  567  568          (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
  568  569          (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
  569  570              zfsvfs);
  570  571          (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
  571  572          return (error);
  572  573  
  573  574  }
  574  575  
            /*
             * Extract the owning user and group ids from an object's bonus data.
             * NOTE(review): presumably registered with the DMU as its space
             * accounting callback; the registration is not visible here.
             *
             *      bonustype       type of the bonus buffer; only DMU_OT_ZNODE and
             *                      DMU_OT_SA are handled
             *      data            bonus buffer contents, may be NULL
             *      userp, groupp   receive the uid and gid on a 0 return
             *
             * Returns ENOENT for any other bonus type, EEXIST when data is NULL
             * (the outputs are left untouched in both cases), and 0 otherwise.
             */
  575  576  static int
  576  577  zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
  577  578      uint64_t *userp, uint64_t *groupp)
  578  579  {
                /* Only dereferenced on the DMU_OT_ZNODE path, after the NULL check. */
  579  580          znode_phys_t *znp = data;
  580  581          int error = 0;
  581  582  
  582  583          /*
  583  584           * Is it a valid type of object to track?
  584  585           */
  585  586          if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
  586  587                  return (ENOENT);
  587  588  
  588  589          /*
  589  590           * If we have a NULL data pointer
  590  591           * then assume the id's aren't changing and
  591  592           * return EEXIST to the dmu to let it know to
  592  593           * use the same ids
  593  594           */
  594  595          if (data == NULL)
  595  596                  return (EEXIST);
  596  597  
  597  598          if (bonustype == DMU_OT_ZNODE) {
  598  599                  *userp = znp->zp_uid;
  599  600                  *groupp = znp->zp_gid;
  600  601          } else {
  601  602                  int hdrsize;
  602  603  
  603  604                  ASSERT(bonustype == DMU_OT_SA);
  604  605                  hdrsize = sa_hdrsize(data);
  605  606  
  606  607                  if (hdrsize != 0) {
                                /*
                                 * The ids are read as 64-bit values located
                                 * SA_UID_OFFSET / SA_GID_OFFSET bytes past the
                                 * SA header.
                                 */
  607  608                          *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
  608  609                              SA_UID_OFFSET));
  609  610                          *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
  610  611                              SA_GID_OFFSET));
  611  612                  } else {
  612  613                          /*
  613  614                           * This should only happen for newly created
  614  615                           * files that haven't had the znode data filled
  615  616                           * in yet.
  616  617                           */
  617  618                          *userp = 0;
  618  619                          *groupp = 0;
  619  620                  }
  620  621          }
                /* error is never changed after initialization, so this returns 0. */
  621  622          return (error);
  622  623  }
  623  624  
            /*
             * Split a FUID given as a string into its domain name and RID.
             *
             *      fuidstr         FUID in string form, parsed with strtonum()
             *      domainbuf       receives the domain name, or an empty string when
             *                      zfs_fuid_find_by_idx() finds no domain
             *      buflen          size of domainbuf, passed to strlcpy()
             *      ridp            receives FUID_RID() of the parsed value
             */
  624  625  static void
  625  626  fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
  626  627      char *domainbuf, int buflen, uid_t *ridp)
  627  628  {
  628  629          uint64_t fuid;
  629  630          const char *domain;
  630  631  
  631  632          fuid = strtonum(fuidstr, NULL);
  632  633  
  633  634          domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
  634  635          if (domain)
  635  636                  (void) strlcpy(domainbuf, domain, buflen);
  636  637          else
  637  638                  domainbuf[0] = '\0';
  638  639          *ridp = FUID_RID(fuid);
  639  640  }
  640  641  
            /*
             * Map a user/group quota property to the number of the object that
             * stores it.
             *
             * The two "used" properties map to fixed DMU object ids; the two "quota"
             * properties map to the object numbers kept in the zfsvfs_t.  Any other
             * type yields 0.  The callers visible in this file treat a 0 result as
             * "no object".
             */
  641  642  static uint64_t
  642  643  zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
  643  644  {
  644  645          switch (type) {
  645  646          case ZFS_PROP_USERUSED:
  646  647                  return (DMU_USERUSED_OBJECT);
  647  648          case ZFS_PROP_GROUPUSED:
  648  649                  return (DMU_GROUPUSED_OBJECT);
  649  650          case ZFS_PROP_USERQUOTA:
  650  651                  return (zfsvfs->z_userquota_obj);
  651  652          case ZFS_PROP_GROUPQUOTA:
  652  653                  return (zfsvfs->z_groupquota_obj);
  653  654          }
  654  655          return (0);
  655  656  }
  656  657  
            /*
             * Copy entries for the given quota/usage property into a caller-supplied
             * array of zfs_useracct_t records.
             *
             *      type            which property's object to walk
             *      cookiep         in: serialized cursor position to resume from;
             *                      out: position after the last entry copied
             *      vbuf            output buffer, filled with zfs_useracct_t records
             *      bufsizep        in: size of vbuf in bytes;
             *                      out: number of bytes actually written
             *
             * Returns ENOTSUP when the objset has no userspace accounting (the
             * in/out arguments are left untouched).  Returns 0 with *bufsizep set to
             * 0 when the property has no backing object.  Otherwise returns 0, or
             * the error from zap_cursor_retrieve() unless that error is ENOENT,
             * which is reported as success.
             */
  657  658  int
  658  659  zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
  659  660      uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
  660  661  {
  661  662          int error;
  662  663          zap_cursor_t zc;
  663  664          zap_attribute_t za;
  664  665          zfs_useracct_t *buf = vbuf;
  665  666          uint64_t obj;
  666  667  
  667  668          if (!dmu_objset_userspace_present(zfsvfs->z_os))
  668  669                  return (ENOTSUP);
  669  670  
  670  671          obj = zfs_userquota_prop_to_obj(zfsvfs, type);
  671  672          if (obj == 0) {
  672  673                  *bufsizep = 0;
  673  674                  return (0);
  674  675          }
  675  676  
  676  677          for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
  677  678              (error = zap_cursor_retrieve(&zc, &za)) == 0;
  678  679              zap_cursor_advance(&zc)) {
                        /* Stop before writing a record that would not fit in vbuf. */
  679  680                  if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
  680  681                      *bufsizep)
  681  682                          break;
  682  683  
                        /* The entry name is the FUID string; its value is the space. */
  683  684                  fuidstr_to_sid(zfsvfs, za.za_name,
  684  685                      buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
  685  686  
  686  687                  buf->zu_space = za.za_first_integer;
  687  688                  buf++;
  688  689          }
                /* ENOENT from the cursor is not reported as a failure. */
  689  690          if (error == ENOENT)
  690  691                  error = 0;
  691  692  
  692  693          ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
  693  694          *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
  694  695          *cookiep = zap_cursor_serialize(&zc);
  695  696          zap_cursor_fini(&zc);
  696  697          return (error);
  697  698  }
  698  699  
  699  700  /*
             * Format the FUID for (domain, rid) as a hexadecimal string in buf.
             *
             * A NULL or empty domain uses domain index 0.  Otherwise the index comes
             * from zfs_fuid_find_by_domain(); addok is passed straight through to it
             * (presumably it permits adding an unknown domain -- confirm).
             *
             * Returns ENOENT if the domain lookup yields -1, otherwise 0.
             *
  700  701   * buf must be big enough (eg, 32 bytes)
  701  702   */
  702  703  static int
  703  704  id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
  704  705      char *buf, boolean_t addok)
  705  706  {
  706  707          uint64_t fuid;
  707  708          int domainid = 0;
  708  709  
  709  710          if (domain && domain[0]) {
  710  711                  domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
  711  712                  if (domainid == -1)
  712  713                          return (ENOENT);
  713  714          }
  714  715          fuid = FUID_ENCODE(domainid, rid);
                /* No length is passed, so the caller's buffer size is trusted. */
  715  716          (void) sprintf(buf, "%llx", (longlong_t)fuid);
  716  717          return (0);
  717  718  }
 718  719  
 719  720  int
 720  721  zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 721  722      const char *domain, uint64_t rid, uint64_t *valp)
 722  723  {
 723  724          char buf[32];
 724  725          int err;
 725  726          uint64_t obj;
 726  727  
 727  728          *valp = 0;
 728  729  
 729  730          if (!dmu_objset_userspace_present(zfsvfs->z_os))
 730  731                  return (ENOTSUP);
 731  732  
 732  733          obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 733  734          if (obj == 0)
 734  735                  return (0);
 735  736  
 736  737          err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 737  738          if (err)
 738  739                  return (err);
 739  740  
 740  741          err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 741  742          if (err == ENOENT)
 742  743                  err = 0;
 743  744          return (err);
 744  745  }
 745  746  
           /*
            * Set the quota of the given type for the user or group identified by
            * domain/rid, or remove it when quota is 0.
            *
            * Returns EINVAL unless type is ZFS_PROP_USERQUOTA or ZFS_PROP_GROUPQUOTA,
            * ENOTSUP if the file system version predates ZPL_VERSION_USERSPACE, and
            * otherwise the error from id_to_fuidstr(), dmu_tx_assign() or the ZAP
            * update/remove.  The quota ZAP object is created on first use and
            * recorded in the master node under zfs_userquota_prop_prefixes[type].
            */
 746  747  int
 747  748  zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 748  749      const char *domain, uint64_t rid, uint64_t quota)
 749  750  {
 750  751          char buf[32];
 751  752          int err;
 752  753          dmu_tx_t *tx;
 753  754          uint64_t *objp;
 754  755          boolean_t fuid_dirtied;
 755  756  
 756  757          if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 757  758                  return (EINVAL);
 758  759  
 759  760          if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 760  761                  return (ENOTSUP);
 761  762  
                   /* objp points at the cached object number for this quota type. */
 762  763          objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 763  764              &zfsvfs->z_groupquota_obj;
 764  765  
                   /*
                    * addok is B_TRUE here, unlike the lookup-only callers; sample
                    * z_fuid_dirty afterwards so the tx below can hold and sync the
                    * FUID state if this call dirtied it.
                    */
 765  766          err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 766  767          if (err)
 767  768                  return (err);
 768  769          fuid_dirtied = zfsvfs->z_fuid_dirty;
 769  770  
 770  771          tx = dmu_tx_create(zfsvfs->z_os);
 771  772          dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 772  773          if (*objp == 0) {
                           /* The new object must also be linked from the master node. */
 773  774                  dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 774  775                      zfs_userquota_prop_prefixes[type]);
 775  776          }
 776  777          if (fuid_dirtied)
 777  778                  zfs_fuid_txhold(zfsvfs, tx);
 778  779          err = dmu_tx_assign(tx, TXG_WAIT);
 779  780          if (err) {
 780  781                  dmu_tx_abort(tx);
 781  782                  return (err);
 782  783          }
 783  784  
                   /* Re-check *objp under z_lock before creating the quota object. */
 784  785          mutex_enter(&zfsvfs->z_lock);
 785  786          if (*objp == 0) {
 786  787                  *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 787  788                      DMU_OT_NONE, 0, tx);
 788  789                  VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 789  790                      zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 790  791          }
 791  792          mutex_exit(&zfsvfs->z_lock);
 792  793  
 793  794          if (quota == 0) {
                           /* A quota of 0 means "no quota"; a missing entry is fine. */
 794  795                  err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 795  796                  if (err == ENOENT)
 796  797                          err = 0;
 797  798          } else {
 798  799                  err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 799  800          }
 800  801          ASSERT(err == 0);
 801  802          if (fuid_dirtied)
 802  803                  zfs_fuid_sync(zfsvfs, tx);
 803  804          dmu_tx_commit(tx);
 804  805          return (err);
 805  806  }
 806  807  
 807  808  boolean_t
 808  809  zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 809  810  {
 810  811          char buf[32];
 811  812          uint64_t used, quota, usedobj, quotaobj;
 812  813          int err;
 813  814  
 814  815          usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 815  816          quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 816  817  
 817  818          if (quotaobj == 0 || zfsvfs->z_replay)
 818  819                  return (B_FALSE);
 819  820  
 820  821          (void) sprintf(buf, "%llx", (longlong_t)fuid);
 821  822          err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 822  823          if (err != 0)
 823  824                  return (B_FALSE);
 824  825  
 825  826          err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 826  827          if (err != 0)
 827  828                  return (B_FALSE);
 828  829          return (used >= quota);
 829  830  }
 830  831  
 831  832  boolean_t
 832  833  zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 833  834  {
 834  835          uint64_t fuid;
 835  836          uint64_t quotaobj;
 836  837  
 837  838          quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 838  839  
 839  840          fuid = isgroup ? zp->z_gid : zp->z_uid;
 840  841  
 841  842          if (quotaobj == 0 || zfsvfs->z_replay)
 842  843                  return (B_FALSE);
 843  844  
 844  845          return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 845  846  }
 846  847  
 847  848  int
 848  849  zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 849  850  {
 850  851          objset_t *os;
 851  852          zfsvfs_t *zfsvfs;
 852  853          uint64_t zval;
 853  854          int i, error;
 854  855          uint64_t sa_obj;
 855  856  
 856  857          zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 857  858  
 858  859          /*
 859  860           * We claim to always be readonly so we can open snapshots;
 860  861           * other ZPL code will prevent us from writing to snapshots.
 861  862           */
 862  863          error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 863  864          if (error) {
 864  865                  kmem_free(zfsvfs, sizeof (zfsvfs_t));
 865  866                  return (error);
 866  867          }
 867  868  
 868  869          /*
 869  870           * Initialize the zfs-specific filesystem structure.
 870  871           * Should probably make this a kmem cache, shuffle fields,
 871  872           * and just bzero up to z_hold_mtx[].
 872  873           */
 873  874          zfsvfs->z_vfs = NULL;
 874  875          zfsvfs->z_parent = zfsvfs;
 875  876          zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 876  877          zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 877  878          zfsvfs->z_os = os;
 878  879  
 879  880          error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 880  881          if (error) {
 881  882                  goto out;
 882  883          } else if (zfsvfs->z_version >
 883  884              zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 884  885                  (void) printf("Can't mount a version %lld file system "
 885  886                      "on a version %lld pool\n. Pool must be upgraded to mount "
 886  887                      "this file system.", (u_longlong_t)zfsvfs->z_version,
 887  888                      (u_longlong_t)spa_version(dmu_objset_spa(os)));
 888  889                  error = ENOTSUP;
 889  890                  goto out;
 890  891          }
 891  892          if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
 892  893                  goto out;
 893  894          zfsvfs->z_norm = (int)zval;
 894  895  
 895  896          if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
 896  897                  goto out;
 897  898          zfsvfs->z_utf8 = (zval != 0);
 898  899  
 899  900          if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
 900  901                  goto out;
 901  902          zfsvfs->z_case = (uint_t)zval;
 902  903  
 903  904          /*
 904  905           * Fold case on file systems that are always or sometimes case
 905  906           * insensitive.
 906  907           */
 907  908          if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 908  909              zfsvfs->z_case == ZFS_CASE_MIXED)
 909  910                  zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 910  911  
 911  912          zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 912  913          zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 913  914  
 914  915          if (zfsvfs->z_use_sa) {
 915  916                  /* should either have both of these objects or none */
 916  917                  error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 917  918                      &sa_obj);
 918  919                  if (error)
 919  920                          return (error);
 920  921          } else {
 921  922                  /*
 922  923                   * Pre SA versions file systems should never touch
 923  924                   * either the attribute registration or layout objects.
 924  925                   */
 925  926                  sa_obj = 0;
 926  927          }
 927  928  
 928  929          error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 929  930              &zfsvfs->z_attr_table);
 930  931          if (error)
 931  932                  goto out;
 932  933  
 933  934          if (zfsvfs->z_version >= ZPL_VERSION_SA)
 934  935                  sa_register_update_callback(os, zfs_sa_upgrade);
 935  936  
 936  937          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 937  938              &zfsvfs->z_root);
 938  939          if (error)
 939  940                  goto out;
 940  941          ASSERT(zfsvfs->z_root != 0);
 941  942  
 942  943          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 943  944              &zfsvfs->z_unlinkedobj);
 944  945          if (error)
 945  946                  goto out;
 946  947  
 947  948          error = zap_lookup(os, MASTER_NODE_OBJ,
 948  949              zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 949  950              8, 1, &zfsvfs->z_userquota_obj);
 950  951          if (error && error != ENOENT)
 951  952                  goto out;
 952  953  
 953  954          error = zap_lookup(os, MASTER_NODE_OBJ,
 954  955              zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 955  956              8, 1, &zfsvfs->z_groupquota_obj);
 956  957          if (error && error != ENOENT)
 957  958                  goto out;
 958  959  
 959  960          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 960  961              &zfsvfs->z_fuid_obj);
 961  962          if (error && error != ENOENT)
 962  963                  goto out;
 963  964  
 964  965          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 965  966              &zfsvfs->z_shares_dir);
 966  967          if (error && error != ENOENT)
 967  968                  goto out;
 968  969  
 969  970          mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 970  971          mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 971  972          list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 972  973              offsetof(znode_t, z_link_node));
 973  974          rrw_init(&zfsvfs->z_teardown_lock);
 974  975          rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 975  976          rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 976  977          for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 977  978                  mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 978  979  
 979  980          *zfvp = zfsvfs;
 980  981          return (0);
 981  982  
 982  983  out:
 983  984          dmu_objset_disown(os, zfsvfs);
 984  985          *zfvp = NULL;
 985  986          kmem_free(zfsvfs, sizeof (zfsvfs_t));
 986  987          return (error);
 987  988  }
 988  989  
           /*
            * Finish bringing a zfsvfs online: register the property callbacks,
            * point the objset's user pointer at the zfsvfs and open the intent log.
            *
            * When mounting is true, the unlinked set is drained (unless the vfs is
            * mounted readonly) and, if the pool is writeable, the intent log is
            * either replayed or destroyed (when zil_replay_disable is set).
            *
            * Returns the error from zfs_register_callbacks(), otherwise 0.
            */
 989  990  static int
 990  991  zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 991  992  {
 992  993          int error;
 993  994  
 994  995          error = zfs_register_callbacks(zfsvfs->z_vfs);
 995  996          if (error)
 996  997                  return (error);
 997  998  
 998  999          /*
 999 1000           * Set the objset user_ptr to track its zfsvfs.
1000 1001           */
1001 1002          mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1002 1003          dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1003 1004          mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1004 1005  
1005 1006          zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1006 1007  
1007 1008          /*
1008 1009           * If we are not mounting (ie: online recv), then we don't
1009 1010           * have to worry about replaying the log as we blocked all
1010 1011           * operations out since we closed the ZIL.
1011 1012           */
1012 1013          if (mounting) {
                           /* Holds the VFS_RDONLY bit itself (not 0/1) for the restore below. */
1013 1014                  boolean_t readonly;
1014 1015  
1015 1016                  /*
1016 1017                   * During replay we remove the read only flag to
1017 1018                   * allow replays to succeed.
1018 1019                   */
1019 1020                  readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1020 1021                  if (readonly != 0)
1021 1022                          zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1022 1023                  else
1023 1024                          zfs_unlinked_drain(zfsvfs);
1024 1025  
1025 1026                  /*
1026 1027                   * Parse and replay the intent log.
1027 1028                   *
1028 1029                   * Because of ziltest, this must be done after
1029 1030                   * zfs_unlinked_drain().  (Further note: ziltest
1030 1031                   * doesn't use readonly mounts, where
1031 1032                   * zfs_unlinked_drain() isn't called.)  This is because
1032 1033                   * ziltest causes spa_sync() to think it's committed,
1033 1034                   * but actually it is not, so the intent log contains
1034 1035                   * many txg's worth of changes.
1035 1036                   *
1036 1037                   * In particular, if object N is in the unlinked set in
1037 1038                   * the last txg to actually sync, then it could be
1038 1039                   * actually freed in a later txg and then reallocated
1039 1040                   * in a yet later txg.  This would write a "create
1040 1041                   * object N" record to the intent log.  Normally, this
1041 1042                   * would be fine because the spa_sync() would have
1042 1043                   * written out the fact that object N is free, before
1043 1044                   * we could write the "create object N" intent log
1044 1045                   * record.
1045 1046                   *
1046 1047                   * But when we are in ziltest mode, we advance the "open
1047 1048                   * txg" without actually spa_sync()-ing the changes to
1048 1049                   * disk.  So we would see that object N is still
1049 1050                   * allocated and in the unlinked set, and there is an
1050 1051                   * intent log record saying to allocate it.
1051 1052                   */
1052 1053                  if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1053 1054                          if (zil_replay_disable) {
1054 1055                                  zil_destroy(zfsvfs->z_log, B_FALSE);
1055 1056                          } else {
                                           /*
                                            * z_replay is set only for the duration of
                                            * the replay; the quota checks in this file
                                            * return B_FALSE while it is set.
                                            */
1056 1057                                  zfsvfs->z_replay = B_TRUE;
1057 1058                                  zil_replay(zfsvfs->z_os, zfsvfs,
1058 1059                                      zfs_replay_vector);
1059 1060                                  zfsvfs->z_replay = B_FALSE;
1060 1061                          }
1061 1062                  }
1062 1063                  zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1063 1064          }
1064 1065  
1065 1066          return (0);
1066 1067  }
1067 1068  
1068 1069  void
1069 1070  zfsvfs_free(zfsvfs_t *zfsvfs)
1070 1071  {
1071 1072          int i;
1072 1073          extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1073 1074  
1074 1075          /*
1075 1076           * This is a barrier to prevent the filesystem from going away in
1076 1077           * zfs_znode_move() until we can safely ensure that the filesystem is
1077 1078           * not unmounted. We consider the filesystem valid before the barrier
1078 1079           * and invalid after the barrier.
1079 1080           */
1080 1081          rw_enter(&zfsvfs_lock, RW_READER);
1081 1082          rw_exit(&zfsvfs_lock);
1082 1083  
1083 1084          zfs_fuid_destroy(zfsvfs);
1084 1085  
1085 1086          mutex_destroy(&zfsvfs->z_znodes_lock);
1086 1087          mutex_destroy(&zfsvfs->z_lock);
1087 1088          list_destroy(&zfsvfs->z_all_znodes);
1088 1089          rrw_destroy(&zfsvfs->z_teardown_lock);
1089 1090          rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1090 1091          rw_destroy(&zfsvfs->z_fuid_lock);
1091 1092          for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1092 1093                  mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1093 1094          kmem_free(zfsvfs, sizeof (zfsvfs_t));
1094 1095  }
1095 1096  
1096 1097  static void
1097 1098  zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1098 1099  {
1099 1100          zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1100 1101          if (zfsvfs->z_vfs) {
1101 1102                  if (zfsvfs->z_use_fuids) {
1102 1103                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1103 1104                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1104 1105                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1105 1106                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1106 1107                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1107 1108                          vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1108 1109                  } else {
1109 1110                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1110 1111                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1111 1112                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1112 1113                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1113 1114                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1114 1115                          vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1115 1116                  }
1116 1117          }
1117 1118          zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1118 1119  }
1119 1120  
           /*
            * Mount the objset named osname on vfsp.
            *
            * Creates the zfsvfs, fills in the generic vfs fields (device, fstype,
            * block size, fsid), sets the VFS feature flags, and then either applies
            * the fixed snapshot settings or runs zfsvfs_setup() for a regular file
            * system.  On success the active file system count is incremented and 0
            * is returned; on failure the objset is disowned, the zfsvfs is freed and
            * the error is returned.
            */
1120 1121  static int
1121 1122  zfs_domount(vfs_t *vfsp, char *osname)
1122 1123  {
1123 1124          dev_t mount_dev;
1124 1125          uint64_t recordsize, fsid_guid;
1125 1126          int error = 0;
1126 1127          zfsvfs_t *zfsvfs;
1127 1128  
1128 1129          ASSERT(vfsp);
1129 1130          ASSERT(osname);
1130 1131  
                   /* zfsvfs_create() cleans up after itself, so just return on error. */
1131 1132          error = zfsvfs_create(osname, &zfsvfs);
1132 1133          if (error)
1133 1134                  return (error);
1134 1135          zfsvfs->z_vfs = vfsp;
1135 1136  
1136 1137          /* Initialize the generic filesystem structure. */
1137 1138          vfsp->vfs_bcount = 0;
1138 1139          vfsp->vfs_data = NULL;
1139 1140  
1140 1141          if (zfs_create_unique_device(&mount_dev) == -1) {
1141 1142                  error = ENODEV;
1142 1143                  goto out;
1143 1144          }
1144 1145          ASSERT(vfs_devismounted(mount_dev) == 0);
1145 1146  
1146 1147          if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1147 1148              NULL))
1148 1149                  goto out;
1149 1150  
1150 1151          vfsp->vfs_dev = mount_dev;
1151 1152          vfsp->vfs_fstype = zfsfstype;
1152 1153          vfsp->vfs_bsize = recordsize;
1153 1154          vfsp->vfs_flag |= VFS_NOTRUNC;
                   /*
                    * NOTE(review): vfs_data is not reset on the error path below, so
                    * it appears to be left pointing at the freed zfsvfs if a later
                    * step fails -- confirm callers never read it after an error.
                    */
1154 1155          vfsp->vfs_data = zfsvfs;
1155 1156  
1156 1157          /*
1157 1158           * The fsid is 64 bits, composed of an 8-bit fs type, which
1158 1159           * separates our fsid from any other filesystem types, and a
1159 1160           * 56-bit objset unique ID.  The objset unique ID is unique to
1160 1161           * all objsets open on this system, provided by unique_create().
1161 1162           * The 8-bit fs type must be put in the low bits of fsid[1]
1162 1163           * because that's where other Solaris filesystems put it.
1163 1164           */
1164 1165          fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1165 1166          ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1166 1167          vfsp->vfs_fsid.val[0] = fsid_guid;
                   /* '&' binds tighter than '|': this ORs in (zfsfstype & 0xFF). */
1167 1168          vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1168 1169              zfsfstype & 0xFF;
1169 1170  
1170 1171          /*
1171 1172           * Set features for file system.
1172 1173           */
1173 1174          zfs_set_fuid_feature(zfsvfs);
1174 1175          if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1175 1176                  vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1176 1177                  vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1177 1178                  vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1178 1179          } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1179 1180                  vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1180 1181                  vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1181 1182          }
1182 1183          vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1183 1184  
1184 1185          if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1185 1186                  uint64_t pval;
1186 1187  
                           /*
                            * Snapshots do not go through zfsvfs_setup(): atime is
                            * forced off, readonly forced on, and only "xattr" is
                            * read from the dataset properties.
                            */
1187 1188                  atime_changed_cb(zfsvfs, B_FALSE);
1188 1189                  readonly_changed_cb(zfsvfs, B_TRUE);
1189 1190                  if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1190 1191                          goto out;
1191 1192                  xattr_changed_cb(zfsvfs, pval);
1192 1193                  zfsvfs->z_issnap = B_TRUE;
1193 1194                  zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1194 1195  
1195 1196                  mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1196 1197                  dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1197 1198                  mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1198 1199          } else {
1199 1200                  error = zfsvfs_setup(zfsvfs, B_TRUE);
1200 1201          }
1201 1202  
                   /*
                    * NOTE(review): this runs even when zfsvfs_setup() above returned
                    * an error -- confirm whatever zfsctl_create() sets up is released
                    * by the error path below.
                    */
1202 1203          if (!zfsvfs->z_issnap)
1203 1204                  zfsctl_create(zfsvfs);
1204 1205  out:
1205 1206          if (error) {
1206 1207                  dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1207 1208                  zfsvfs_free(zfsvfs);
1208 1209          } else {
1209 1210                  atomic_add_32(&zfs_active_fs_count, 1);
1210 1211          }
1211 1212  
1212 1213          return (error);
1213 1214  }
1214 1215  
1215 1216  void
1216 1217  zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1217 1218  {
1218 1219          objset_t *os = zfsvfs->z_os;
1219 1220          struct dsl_dataset *ds;
1220 1221  
1221 1222          /*
1222 1223           * Unregister properties.
1223 1224           */
1224 1225          if (!dmu_objset_is_snapshot(os)) {
1225 1226                  ds = dmu_objset_ds(os);
1226 1227                  VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
1227 1228                      zfsvfs) == 0);
1228 1229  
1229 1230                  VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
1230 1231                      zfsvfs) == 0);
1231 1232  
1232 1233                  VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
1233 1234                      zfsvfs) == 0);
1234 1235  
1235 1236                  VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
1236 1237                      zfsvfs) == 0);
1237 1238  
1238 1239                  VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
1239 1240                      zfsvfs) == 0);
1240 1241  
1241 1242                  VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
1242 1243                      zfsvfs) == 0);
1243 1244  
1244 1245                  VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
1245 1246                      zfsvfs) == 0);
1246 1247  
1247 1248                  VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
1248 1249                      zfsvfs) == 0);
1249 1250  
1250 1251                  VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
1251 1252                      zfsvfs) == 0);
1252 1253  
1253 1254                  VERIFY(dsl_prop_unregister(ds, "aclinherit",
1254 1255                      acl_inherit_changed_cb, zfsvfs) == 0);
1255 1256  
1256 1257                  VERIFY(dsl_prop_unregister(ds, "vscan",
1257 1258                      vscan_changed_cb, zfsvfs) == 0);
1258 1259          }
1259 1260  }
1260 1261  
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns 0 and stores the value in *objnum on success.  Returns EINVAL,
 * leaving *objnum untouched, if the string contains a non-digit character
 * or if its value does not fit in a uint64_t.  An empty string converts
 * to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
        uint64_t num = 0;

        for (; *str != '\0'; str++) {
                uint64_t digit;

                if (*str < '0' || *str > '9')
                        return (EINVAL);
                digit = (uint64_t)(*str - '0');

                /*
                 * num * 10 + digit would wrap past UINT64_MAX; treat the
                 * string as not being a valid number rather than silently
                 * returning a truncated value.
                 */
                if (num > (UINT64_MAX - digit) / 10)
                        return (EINVAL);

                num = num * 10 + digit;
        }

        *objnum = num;
        return (0);
}
1279 1280  
/*
 * The boot path passed from the boot loader is in the form of
 * "rootpool-name/root-filesystem-object-number".  Convert this
 * string to a dataset name: "rootpool-name/root-filesystem-name".
 *
 * bpath is copied to outpath first, so outpath already holds the answer
 * when bpath has no '/' (a bare pool name) or when the part after the first
 * '/' is not a decimal number (already a dataset name).  bpath is modified
 * temporarily while the pool name is passed to dsl_dsobj_to_dsname() and is
 * restored before returning.
 *
 * Returns EINVAL if bpath is empty or begins with '/', otherwise 0 or the
 * error from dsl_dsobj_to_dsname().
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
        char *sep;
        uint64_t dsobj;
        int rc;

        if (bpath[0] == '\0' || bpath[0] == '/')
                return (EINVAL);

        (void) strcpy(outpath, bpath);

        /* if no '/', just return the pool name */
        sep = strchr(bpath, '/');
        if (sep == NULL)
                return (0);

        /* if not a number, just return the root dataset name */
        if (str_to_uint64(sep + 1, &dsobj) != 0)
                return (0);

        /* Terminate bpath at the pool name for the lookup, then restore. */
        *sep = '\0';
        rc = dsl_dsobj_to_dsname(bpath, dsobj, outpath);
        *sep = '/';

        return (rc);
}
1315 1316  
1316 1317  /*
1317 1318   * zfs_check_global_label:
1318 1319   *      Check that the hex label string is appropriate for the dataset
1319 1320   *      being mounted into the global_zone proper.
1320 1321   *
1321 1322   *      Return an error if the hex label string is not default or
1322 1323   *      admin_low/admin_high.  For admin_low labels, the corresponding
1323 1324   *      dataset must be readonly.
1324 1325   */
1325 1326  int
1326 1327  zfs_check_global_label(const char *dsname, const char *hexsl)
1327 1328  {
1328 1329          if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1329 1330                  return (0);
1330 1331          if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1331 1332                  return (0);
1332 1333          if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1333 1334                  /* must be readonly */
1334 1335                  uint64_t rdonly;
1335 1336  
1336 1337                  if (dsl_prop_get_integer(dsname,
1337 1338                      zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1338 1339                          return (EACCES);
1339 1340                  return (rdonly ? 0 : EACCES);
1340 1341          }
1341 1342          return (EACCES);
1342 1343  }
1343 1344  
1344 1345  /*
1345 1346   * zfs_mount_label_policy:
1346 1347   *      Determine whether the mount is allowed according to MAC check.
1347 1348   *      by comparing (where appropriate) label of the dataset against
1348 1349   *      the label of the zone being mounted into.  If the dataset has
1349 1350   *      no label, create one.
1350 1351   *
1351 1352   *      Returns:
1352 1353   *               0 :    access allowed
1353 1354   *              >0 :    error code, such as EACCES
1354 1355   */
1355 1356  static int
1356 1357  zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1357 1358  {
1358 1359          int             error, retv;
1359 1360          zone_t          *mntzone = NULL;
1360 1361          ts_label_t      *mnt_tsl;
1361 1362          bslabel_t       *mnt_sl;
1362 1363          bslabel_t       ds_sl;
1363 1364          char            ds_hexsl[MAXNAMELEN];
1364 1365  
1365 1366          retv = EACCES;                          /* assume the worst */
1366 1367  
1367 1368          /*
1368 1369           * Start by getting the dataset label if it exists.
1369 1370           */
1370 1371          error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1371 1372              1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1372 1373          if (error)
1373 1374                  return (EACCES);
1374 1375  
1375 1376          /*
1376 1377           * If labeling is NOT enabled, then disallow the mount of datasets
1377 1378           * which have a non-default label already.  No other label checks
1378 1379           * are needed.
1379 1380           */
1380 1381          if (!is_system_labeled()) {
1381 1382                  if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1382 1383                          return (0);
1383 1384                  return (EACCES);
1384 1385          }
1385 1386  
1386 1387          /*
1387 1388           * Get the label of the mountpoint.  If mounting into the global
1388 1389           * zone (i.e. mountpoint is not within an active zone and the
1389 1390           * zoned property is off), the label must be default or
1390 1391           * admin_low/admin_high only; no other checks are needed.
1391 1392           */
                   /* mntzone is released with zone_rele() on every path below. */
1392 1393          mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1393 1394          if (mntzone->zone_id == GLOBAL_ZONEID) {
1394 1395                  uint64_t zoned;
1395 1396  
1396 1397                  zone_rele(mntzone);
1397 1398  
1398 1399                  if (dsl_prop_get_integer(osname,
1399 1400                      zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1400 1401                          return (EACCES);
1401 1402                  if (!zoned)
1402 1403                          return (zfs_check_global_label(osname, ds_hexsl));
1403 1404                  else
1404 1405                          /*
1405 1406                           * This is the case of a zone dataset being mounted
1406 1407                           * initially, before the zone has been fully created;
1407 1408                           * allow this mount into global zone.
1408 1409                           */
1409 1410                          return (0);
1410 1411          }
1411 1412  
                   /* Hold the zone's label while it is compared; dropped at the end. */
1412 1413          mnt_tsl = mntzone->zone_slabel;
1413 1414          ASSERT(mnt_tsl != NULL);
1414 1415          label_hold(mnt_tsl);
1415 1416          mnt_sl = label2bslabel(mnt_tsl);
1416 1417  
1417 1418          if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1418 1419                  /*
1419 1420                   * The dataset doesn't have a real label, so fabricate one.
1420 1421                   */
1421 1422                  char *str = NULL;
1422 1423  
                           /*
                            * The dataset takes the mount zone's label; access is
                            * allowed only if both the conversion and the property
                            * set succeed.
                            */
1423 1424                  if (l_to_str_internal(mnt_sl, &str) == 0 &&
1424 1425                      dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1425 1426                      ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
1426 1427                          retv = 0;
                           /* str is freed here with its string length plus the NUL. */
1427 1428                  if (str != NULL)
1428 1429                          kmem_free(str, strlen(str) + 1);
1429 1430          } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1430 1431                  /*
1431 1432                   * Now compare labels to complete the MAC check.  If the
1432 1433                   * labels are equal then allow access.  If the mountpoint
1433 1434                   * label dominates the dataset label, allow readonly access.
1434 1435                   * Otherwise, access is denied.
1435 1436                   */
1436 1437                  if (blequal(mnt_sl, &ds_sl))
1437 1438                          retv = 0;
1438 1439                  else if (bldominates(mnt_sl, &ds_sl)) {
1439 1440                          vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1440 1441                          retv = 0;
1441 1442                  }
1442 1443          }
                   /* A dataset label that fails to parse leaves retv at EACCES. */
1443 1444  
1444 1445          label_rele(mnt_tsl);
1445 1446          zone_rele(mntzone);
1446 1447          return (retv);
1447 1448  }
1448 1449  
1449 1450  static int
1450 1451  zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1451 1452  {
1452 1453          int error = 0;
1453 1454          static int zfsrootdone = 0;
1454 1455          zfsvfs_t *zfsvfs = NULL;
1455 1456          znode_t *zp = NULL;
1456 1457          vnode_t *vp = NULL;
1457 1458          char *zfs_bootfs;
1458 1459          char *zfs_devid;
1459 1460  
1460 1461          ASSERT(vfsp);
1461 1462  
1462 1463          /*
1463 1464           * The filesystem that we mount as root is defined in the
1464 1465           * boot property "zfs-bootfs" with a format of
1465 1466           * "poolname/root-dataset-objnum".
1466 1467           */
1467 1468          if (why == ROOT_INIT) {
1468 1469                  if (zfsrootdone++)
1469 1470                          return (EBUSY);
1470 1471                  /*
1471 1472                   * the process of doing a spa_load will require the
1472 1473                   * clock to be set before we could (for example) do
1473 1474                   * something better by looking at the timestamp on
1474 1475                   * an uberblock, so just set it to -1.
1475 1476                   */
1476 1477                  clkset(-1);
1477 1478  
1478 1479                  if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1479 1480                          cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1480 1481                              "bootfs name");
1481 1482                          return (EINVAL);
1482 1483                  }
1483 1484                  zfs_devid = spa_get_bootprop("diskdevid");
1484 1485                  error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1485 1486                  if (zfs_devid)
1486 1487                          spa_free_bootprop(zfs_devid);
1487 1488                  if (error) {
1488 1489                          spa_free_bootprop(zfs_bootfs);
1489 1490                          cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1490 1491                              error);
1491 1492                          return (error);
1492 1493                  }
1493 1494                  if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1494 1495                          spa_free_bootprop(zfs_bootfs);
1495 1496                          cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1496 1497                              error);
1497 1498                          return (error);
1498 1499                  }
1499 1500  
1500 1501                  spa_free_bootprop(zfs_bootfs);
1501 1502  
1502 1503                  if (error = vfs_lock(vfsp))
1503 1504                          return (error);
1504 1505  
1505 1506                  if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1506 1507                          cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1507 1508                          goto out;
1508 1509                  }
1509 1510  
1510 1511                  zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1511 1512                  ASSERT(zfsvfs);
1512 1513                  if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1513 1514                          cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1514 1515                          goto out;
1515 1516                  }
1516 1517  
1517 1518                  vp = ZTOV(zp);
1518 1519                  mutex_enter(&vp->v_lock);
1519 1520                  vp->v_flag |= VROOT;
1520 1521                  mutex_exit(&vp->v_lock);
1521 1522                  rootvp = vp;
1522 1523  
1523 1524                  /*
1524 1525                   * Leave rootvp held.  The root file system is never unmounted.
1525 1526                   */
1526 1527  
1527 1528                  vfs_add((struct vnode *)0, vfsp,
1528 1529                      (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1529 1530  out:
1530 1531                  vfs_unlock(vfsp);
1531 1532                  return (error);
1532 1533          } else if (why == ROOT_REMOUNT) {
1533 1534                  readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1534 1535                  vfsp->vfs_flag |= VFS_REMOUNT;
1535 1536  
1536 1537                  /* refresh mount options */
1537 1538                  zfs_unregister_callbacks(vfsp->vfs_data);
1538 1539                  return (zfs_register_callbacks(vfsp));
1539 1540  
1540 1541          } else if (why == ROOT_UNMOUNT) {
1541 1542                  zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1542 1543                  (void) zfs_sync(vfsp, 0, 0);
1543 1544                  return (0);
1544 1545          }
1545 1546  
1546 1547          /*
1547 1548           * if "why" is equal to anything else other than ROOT_INIT,
1548 1549           * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1549 1550           */
1550 1551          return (ENOTSUP);
1551 1552  }
1552 1553  
1553 1554  /*ARGSUSED*/
1554 1555  static int
1555 1556  zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1556 1557  {
1557 1558          char            *osname;
1558 1559          pathname_t      spn;
1559 1560          int             error = 0;
1560 1561          uio_seg_t       fromspace = (uap->flags & MS_SYSSPACE) ?
1561 1562              UIO_SYSSPACE : UIO_USERSPACE;
1562 1563          int             canwrite;
1563 1564  
1564 1565          if (mvp->v_type != VDIR)
1565 1566                  return (ENOTDIR);
1566 1567  
1567 1568          mutex_enter(&mvp->v_lock);
1568 1569          if ((uap->flags & MS_REMOUNT) == 0 &&
1569 1570              (uap->flags & MS_OVERLAY) == 0 &&
1570 1571              (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1571 1572                  mutex_exit(&mvp->v_lock);
1572 1573                  return (EBUSY);
1573 1574          }
1574 1575          mutex_exit(&mvp->v_lock);
1575 1576  
1576 1577          /*
1577 1578           * ZFS does not support passing unparsed data in via MS_DATA.
1578 1579           * Users should use the MS_OPTIONSTR interface; this means
1579 1580           * that all option parsing is already done and the options struct
1580 1581           * can be interrogated.
1581 1582           */
1582 1583          if ((uap->flags & MS_DATA) && uap->datalen > 0)
1583 1584                  return (EINVAL);
1584 1585  
1585 1586          /*
1586 1587           * Get the objset name (the "special" mount argument).
1587 1588           */
1588 1589          if (error = pn_get(uap->spec, fromspace, &spn))
1589 1590                  return (error);
1590 1591  
1591 1592          osname = spn.pn_path;
1592 1593  
1593 1594          /*
1594 1595           * Check for mount privilege?
1595 1596           *
1596 1597           * If we don't have privilege then see if
1597 1598           * we have local permission to allow it
1598 1599           */
1599 1600          error = secpolicy_fs_mount(cr, mvp, vfsp);
1600 1601          if (error) {
1601 1602                  if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
1602 1603                          vattr_t         vattr;
1603 1604  
1604 1605                          /*
1605 1606                           * Make sure user is the owner of the mount point
1606 1607                           * or has sufficient privileges.
1607 1608                           */
1608 1609  
1609 1610                          vattr.va_mask = AT_UID;
1610 1611  
1611 1612                          if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1612 1613                                  goto out;
1613 1614                          }
1614 1615  
1615 1616                          if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1616 1617                              VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1617 1618                                  goto out;
1618 1619                          }
1619 1620                          secpolicy_fs_mount_clearopts(cr, vfsp);
1620 1621                  } else {
1621 1622                          goto out;
1622 1623                  }
1623 1624          }
1624 1625  
1625 1626          /*
1626 1627           * Refuse to mount a filesystem if we are in a local zone and the
1627 1628           * dataset is not visible.
1628 1629           */
1629 1630          if (!INGLOBALZONE(curproc) &&
1630 1631              (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1631 1632                  error = EPERM;
1632 1633                  goto out;
1633 1634          }
1634 1635  
1635 1636          error = zfs_mount_label_policy(vfsp, osname);
1636 1637          if (error)
1637 1638                  goto out;
1638 1639  
1639 1640          /*
1640 1641           * When doing a remount, we simply refresh our temporary properties
1641 1642           * according to those options set in the current VFS options.
1642 1643           */
1643 1644          if (uap->flags & MS_REMOUNT) {
1644 1645                  /* refresh mount options */
1645 1646                  zfs_unregister_callbacks(vfsp->vfs_data);
1646 1647                  error = zfs_register_callbacks(vfsp);
1647 1648                  goto out;
1648 1649          }
1649 1650  
1650 1651          error = zfs_domount(vfsp, osname);
1651 1652  
1652 1653          /*
1653 1654           * Add an extra VFS_HOLD on our parent vfs so that it can't
1654 1655           * disappear due to a forced unmount.
1655 1656           */
1656 1657          if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1657 1658                  VFS_HOLD(mvp->v_vfsp);
1658 1659  
1659 1660  out:
1660 1661          pn_free(&spn);
1661 1662          return (error);
1662 1663  }
1663 1664  
1664 1665  static int
1665 1666  zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1666 1667  {
1667 1668          zfsvfs_t *zfsvfs = vfsp->vfs_data;
1668 1669          dev32_t d32;
1669 1670          uint64_t refdbytes, availbytes, usedobjs, availobjs;
1670 1671  
1671 1672          ZFS_ENTER(zfsvfs);
1672 1673  
1673 1674          dmu_objset_space(zfsvfs->z_os,
1674 1675              &refdbytes, &availbytes, &usedobjs, &availobjs);
1675 1676  
1676 1677          /*
1677 1678           * The underlying storage pool actually uses multiple block sizes.
1678 1679           * We report the fragsize as the smallest block size we support,
1679 1680           * and we report our blocksize as the filesystem's maximum blocksize.
1680 1681           */
1681 1682          statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1682 1683          statp->f_bsize = zfsvfs->z_max_blksz;
1683 1684  
1684 1685          /*
1685 1686           * The following report "total" blocks of various kinds in the
1686 1687           * file system, but reported in terms of f_frsize - the
1687 1688           * "fragment" size.
1688 1689           */
1689 1690  
1690 1691          statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1691 1692          statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1692 1693          statp->f_bavail = statp->f_bfree; /* no root reservation */
1693 1694  
1694 1695          /*
1695 1696           * statvfs() should really be called statufs(), because it assumes
1696 1697           * static metadata.  ZFS doesn't preallocate files, so the best
1697 1698           * we can do is report the max that could possibly fit in f_files,
1698 1699           * and that minus the number actually used in f_ffree.
1699 1700           * For f_ffree, report the smaller of the number of objects available
1700 1701           * and the number of blocks (each object will take at least a block).
1701 1702           */
1702 1703          statp->f_ffree = MIN(availobjs, statp->f_bfree);
1703 1704          statp->f_favail = statp->f_ffree;       /* no "root reservation" */
1704 1705          statp->f_files = statp->f_ffree + usedobjs;
1705 1706  
1706 1707          (void) cmpldev(&d32, vfsp->vfs_dev);
1707 1708          statp->f_fsid = d32;
1708 1709  
1709 1710          /*
1710 1711           * We're a zfs filesystem.
1711 1712           */
1712 1713          (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1713 1714  
1714 1715          statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1715 1716  
1716 1717          statp->f_namemax = ZFS_MAXNAMELEN;
1717 1718  
1718 1719          /*
1719 1720           * We have all of 32 characters to stuff a string here.
1720 1721           * Is there anything useful we could/should provide?
1721 1722           */
1722 1723          bzero(statp->f_fstr, sizeof (statp->f_fstr));
1723 1724  
1724 1725          ZFS_EXIT(zfsvfs);
1725 1726          return (0);
1726 1727  }
1727 1728  
1728 1729  static int
1729 1730  zfs_root(vfs_t *vfsp, vnode_t **vpp)
1730 1731  {
1731 1732          zfsvfs_t *zfsvfs = vfsp->vfs_data;
1732 1733          znode_t *rootzp;
1733 1734          int error;
1734 1735  
1735 1736          ZFS_ENTER(zfsvfs);
1736 1737  
1737 1738          error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1738 1739          if (error == 0)
1739 1740                  *vpp = ZTOV(rootzp);
1740 1741  
1741 1742          ZFS_EXIT(zfsvfs);
1742 1743          return (error);
1743 1744  }
1744 1745  
1745 1746  /*
1746 1747   * Teardown the zfsvfs::z_os.
1747 1748   *
1748 1749   * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
1749 1750   * and 'z_teardown_inactive_lock' held.
1750 1751   */
1751 1752  static int
1752 1753  zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1753 1754  {
1754 1755          znode_t *zp;
1755 1756  
1756 1757          rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1757 1758  
1758 1759          if (!unmounting) {
1759 1760                  /*
1760 1761                   * We purge the parent filesystem's vfsp as the parent
1761 1762                   * filesystem and all of its snapshots have their vnode's
1762 1763                   * v_vfsp set to the parent's filesystem's vfsp.  Note,
1763 1764                   * 'z_parent' is self referential for non-snapshots.
1764 1765                   */
1765 1766                  (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1766 1767          }
1767 1768  
1768 1769          /*
1769 1770           * Close the zil. NB: Can't close the zil while zfs_inactive
1770 1771           * threads are blocked as zil_close can call zfs_inactive.
1771 1772           */
1772 1773          if (zfsvfs->z_log) {
1773 1774                  zil_close(zfsvfs->z_log);
1774 1775                  zfsvfs->z_log = NULL;
1775 1776          }
1776 1777  
1777 1778          rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1778 1779  
1779 1780          /*
1780 1781           * If we are not unmounting (ie: online recv) and someone already
1781 1782           * unmounted this file system while we were doing the switcheroo,
1782 1783           * or a reopen of z_os failed then just bail out now.
1783 1784           */
1784 1785          if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1785 1786                  rw_exit(&zfsvfs->z_teardown_inactive_lock);
1786 1787                  rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1787 1788                  return (EIO);
1788 1789          }
1789 1790  
1790 1791          /*
1791 1792           * At this point there are no vops active, and any new vops will
1792 1793           * fail with EIO since we have z_teardown_lock for writer (only
1793 1794           * relevant for forced unmount).
1794 1795           *
1795 1796           * Release all holds on dbufs.
1796 1797           */
1797 1798          mutex_enter(&zfsvfs->z_znodes_lock);
1798 1799          for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1799 1800              zp = list_next(&zfsvfs->z_all_znodes, zp))
1800 1801                  if (zp->z_sa_hdl) {
1801 1802                          ASSERT(ZTOV(zp)->v_count > 0);
1802 1803                          zfs_znode_dmu_fini(zp);
1803 1804                  }
1804 1805          mutex_exit(&zfsvfs->z_znodes_lock);
1805 1806  
1806 1807          /*
1807 1808           * If we are unmounting, set the unmounted flag and let new vops
1808 1809           * unblock.  zfs_inactive will have the unmounted behavior, and all
1809 1810           * other vops will fail with EIO.
1810 1811           */
1811 1812          if (unmounting) {
1812 1813                  zfsvfs->z_unmounted = B_TRUE;
1813 1814                  rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1814 1815                  rw_exit(&zfsvfs->z_teardown_inactive_lock);
1815 1816          }
1816 1817  
1817 1818          /*
1818 1819           * z_os will be NULL if there was an error in attempting to reopen
1819 1820           * zfsvfs, so just return as the properties had already been
1820 1821           * unregistered and cached data had been evicted before.
1821 1822           */
1822 1823          if (zfsvfs->z_os == NULL)
1823 1824                  return (0);
1824 1825  
1825 1826          /*
1826 1827           * Unregister properties.
1827 1828           */
1828 1829          zfs_unregister_callbacks(zfsvfs);
1829 1830  
1830 1831          /*
1831 1832           * Evict cached data
1832 1833           */
1833 1834          if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
1834 1835                  if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1835 1836                          txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1836 1837          (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1837 1838  
1838 1839          return (0);
1839 1840  }
1840 1841  
1841 1842  /*ARGSUSED*/
1842 1843  static int
1843 1844  zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1844 1845  {
1845 1846          zfsvfs_t *zfsvfs = vfsp->vfs_data;
1846 1847          objset_t *os;
1847 1848          int ret;
1848 1849  
1849 1850          ret = secpolicy_fs_unmount(cr, vfsp);
1850 1851          if (ret) {
1851 1852                  if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1852 1853                      ZFS_DELEG_PERM_MOUNT, cr))
1853 1854                          return (ret);
1854 1855          }
1855 1856  
1856 1857          /*
1857 1858           * We purge the parent filesystem's vfsp as the parent filesystem
1858 1859           * and all of its snapshots have their vnode's v_vfsp set to the
1859 1860           * parent's filesystem's vfsp.  Note, 'z_parent' is self
1860 1861           * referential for non-snapshots.
1861 1862           */
1862 1863          (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1863 1864  
1864 1865          /*
1865 1866           * Unmount any snapshots mounted under .zfs before unmounting the
1866 1867           * dataset itself.
1867 1868           */
1868 1869          if (zfsvfs->z_ctldir != NULL &&
1869 1870              (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1870 1871                  return (ret);
1871 1872          }
1872 1873  
1873 1874          if (!(fflag & MS_FORCE)) {
1874 1875                  /*
1875 1876                   * Check the number of active vnodes in the file system.
1876 1877                   * Our count is maintained in the vfs structure, but the
1877 1878                   * number is off by 1 to indicate a hold on the vfs
1878 1879                   * structure itself.
1879 1880                   *
1880 1881                   * The '.zfs' directory maintains a reference of its
1881 1882                   * own, and any active references underneath are
1882 1883                   * reflected in the vnode count.
1883 1884                   */
1884 1885                  if (zfsvfs->z_ctldir == NULL) {
1885 1886                          if (vfsp->vfs_count > 1)
1886 1887                                  return (EBUSY);
1887 1888                  } else {
1888 1889                          if (vfsp->vfs_count > 2 ||
1889 1890                              zfsvfs->z_ctldir->v_count > 1)
1890 1891                                  return (EBUSY);
1891 1892                  }
1892 1893          }
1893 1894  
1894 1895          vfsp->vfs_flag |= VFS_UNMOUNTED;
1895 1896  
1896 1897          VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1897 1898          os = zfsvfs->z_os;
1898 1899  
1899 1900          /*
1900 1901           * z_os will be NULL if there was an error in
1901 1902           * attempting to reopen zfsvfs.
1902 1903           */
1903 1904          if (os != NULL) {
1904 1905                  /*
1905 1906                   * Unset the objset user_ptr.
1906 1907                   */
1907 1908                  mutex_enter(&os->os_user_ptr_lock);
1908 1909                  dmu_objset_set_user(os, NULL);
1909 1910                  mutex_exit(&os->os_user_ptr_lock);
1910 1911  
1911 1912                  /*
1912 1913                   * Finally release the objset
1913 1914                   */
1914 1915                  dmu_objset_disown(os, zfsvfs);
1915 1916          }
1916 1917  
1917 1918          /*
1918 1919           * We can now safely destroy the '.zfs' directory node.
1919 1920           */
1920 1921          if (zfsvfs->z_ctldir != NULL)
1921 1922                  zfsctl_destroy(zfsvfs);
1922 1923  
1923 1924          return (0);
1924 1925  }
1925 1926  
1926 1927  static int
1927 1928  zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1928 1929  {
1929 1930          zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1930 1931          znode_t         *zp;
1931 1932          uint64_t        object = 0;
1932 1933          uint64_t        fid_gen = 0;
1933 1934          uint64_t        gen_mask;
1934 1935          uint64_t        zp_gen;
1935 1936          int             i, err;
1936 1937  
1937 1938          *vpp = NULL;
1938 1939  
1939 1940          ZFS_ENTER(zfsvfs);
1940 1941  
1941 1942          if (fidp->fid_len == LONG_FID_LEN) {
1942 1943                  zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1943 1944                  uint64_t        objsetid = 0;
1944 1945                  uint64_t        setgen = 0;
1945 1946  
1946 1947                  for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1947 1948                          objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1948 1949  
1949 1950                  for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1950 1951                          setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1951 1952  
1952 1953                  ZFS_EXIT(zfsvfs);
1953 1954  
1954 1955                  err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1955 1956                  if (err)
1956 1957                          return (EINVAL);
1957 1958                  ZFS_ENTER(zfsvfs);
1958 1959          }
1959 1960  
1960 1961          if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1961 1962                  zfid_short_t    *zfid = (zfid_short_t *)fidp;
1962 1963  
1963 1964                  for (i = 0; i < sizeof (zfid->zf_object); i++)
1964 1965                          object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1965 1966  
1966 1967                  for (i = 0; i < sizeof (zfid->zf_gen); i++)
1967 1968                          fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1968 1969          } else {
1969 1970                  ZFS_EXIT(zfsvfs);
1970 1971                  return (EINVAL);
1971 1972          }
1972 1973  
1973 1974          /* A zero fid_gen means we are in the .zfs control directories */
1974 1975          if (fid_gen == 0 &&
1975 1976              (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1976 1977                  *vpp = zfsvfs->z_ctldir;
1977 1978                  ASSERT(*vpp != NULL);
1978 1979                  if (object == ZFSCTL_INO_SNAPDIR) {
1979 1980                          VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1980 1981                              0, NULL, NULL, NULL, NULL, NULL) == 0);
1981 1982                  } else {
1982 1983                          VN_HOLD(*vpp);
1983 1984                  }
1984 1985                  ZFS_EXIT(zfsvfs);
1985 1986                  return (0);
1986 1987          }
1987 1988  
1988 1989          gen_mask = -1ULL >> (64 - 8 * i);
1989 1990  
1990 1991          dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1991 1992          if (err = zfs_zget(zfsvfs, object, &zp)) {
1992 1993                  ZFS_EXIT(zfsvfs);
1993 1994                  return (err);
1994 1995          }
1995 1996          (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1996 1997              sizeof (uint64_t));
1997 1998          zp_gen = zp_gen & gen_mask;
1998 1999          if (zp_gen == 0)
1999 2000                  zp_gen = 1;
2000 2001          if (zp->z_unlinked || zp_gen != fid_gen) {
2001 2002                  dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2002 2003                  VN_RELE(ZTOV(zp));
2003 2004                  ZFS_EXIT(zfsvfs);
2004 2005                  return (EINVAL);
2005 2006          }
2006 2007  
2007 2008          *vpp = ZTOV(zp);
2008 2009          ZFS_EXIT(zfsvfs);
2009 2010          return (0);
2010 2011  }
2011 2012  
2012 2013  /*
2013 2014   * Block out VOPs and close zfsvfs_t::z_os
2014 2015   *
2015 2016   * Note, if successful, then we return with the 'z_teardown_lock' and
2016 2017   * 'z_teardown_inactive_lock' write held.
2017 2018   */
2018 2019  int
2019 2020  zfs_suspend_fs(zfsvfs_t *zfsvfs)
2020 2021  {
2021 2022          int error;
2022 2023  
2023 2024          if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2024 2025                  return (error);
2025 2026          dmu_objset_disown(zfsvfs->z_os, zfsvfs);
2026 2027  
2027 2028          return (0);
2028 2029  }
2029 2030  
2030 2031  /*
2031 2032   * Reopen zfsvfs_t::z_os and release VOPs.
2032 2033   */
2033 2034  int
2034 2035  zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
2035 2036  {
2036 2037          int err;
2037 2038  
2038 2039          ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
2039 2040          ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2040 2041  
2041 2042          err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
2042 2043              &zfsvfs->z_os);
2043 2044          if (err) {
2044 2045                  zfsvfs->z_os = NULL;
2045 2046          } else {
2046 2047                  znode_t *zp;
2047 2048                  uint64_t sa_obj = 0;
2048 2049  
2049 2050                  /*
2050 2051                   * Make sure version hasn't changed
2051 2052                   */
2052 2053  
2053 2054                  err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
2054 2055                      &zfsvfs->z_version);
2055 2056  
2056 2057                  if (err)
2057 2058                          goto bail;
2058 2059  
2059 2060                  err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
2060 2061                      ZFS_SA_ATTRS, 8, 1, &sa_obj);
2061 2062  
2062 2063                  if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
2063 2064                          goto bail;
2064 2065  
2065 2066                  if ((err = sa_setup(zfsvfs->z_os, sa_obj,
2066 2067                      zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
2067 2068                          goto bail;
2068 2069  
2069 2070                  if (zfsvfs->z_version >= ZPL_VERSION_SA)
2070 2071                          sa_register_update_callback(zfsvfs->z_os,
2071 2072                              zfs_sa_upgrade);
2072 2073  
2073 2074                  VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2074 2075  
2075 2076                  zfs_set_fuid_feature(zfsvfs);
2076 2077  
2077 2078                  /*
2078 2079                   * Attempt to re-establish all the active znodes with
2079 2080                   * their dbufs.  If a zfs_rezget() fails, then we'll let
2080 2081                   * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2081 2082                   * when they try to use their znode.
2082 2083                   */
2083 2084                  mutex_enter(&zfsvfs->z_znodes_lock);
2084 2085                  for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2085 2086                      zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2086 2087                          (void) zfs_rezget(zp);
2087 2088                  }
2088 2089                  mutex_exit(&zfsvfs->z_znodes_lock);
2089 2090          }
2090 2091  
2091 2092  bail:
2092 2093          /* release the VOPs */
2093 2094          rw_exit(&zfsvfs->z_teardown_inactive_lock);
2094 2095          rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
2095 2096  
2096 2097          if (err) {
2097 2098                  /*
2098 2099                   * Since we couldn't reopen zfsvfs::z_os, or
2099 2100                   * set up the sa framework, force unmount this file system.
2100 2101                   */
2101 2102                  if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
2102 2103                          (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
2103 2104          }
2104 2105          return (err);
2105 2106  }
2106 2107  
2107 2108  static void
2108 2109  zfs_freevfs(vfs_t *vfsp)
2109 2110  {
2110 2111          zfsvfs_t *zfsvfs = vfsp->vfs_data;
2111 2112  
2112 2113          /*
2113 2114           * If this is a snapshot, we have an extra VFS_HOLD on our parent
2114 2115           * from zfs_mount().  Release it here.  If we came through
2115 2116           * zfs_mountroot() instead, we didn't grab an extra hold, so
2116 2117           * skip the VFS_RELE for rootvfs.
2117 2118           */
2118 2119          if (zfsvfs->z_issnap && (vfsp != rootvfs))
2119 2120                  VFS_RELE(zfsvfs->z_parent->z_vfs);
2120 2121  
2121 2122          zfsvfs_free(zfsvfs);
2122 2123  
2123 2124          atomic_add_32(&zfs_active_fs_count, -1);
2124 2125  }
2125 2126  
2126 2127  /*
2127 2128   * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
2128 2129   * so we can't safely do any non-idempotent initialization here.
2129 2130   * Leave that to zfs_init() and zfs_fini(), which are called
2130 2131   * from the module's _init() and _fini() entry points.
2131 2132   */
2132 2133  /*ARGSUSED*/
2133 2134  static int
2134 2135  zfs_vfsinit(int fstype, char *name)
2135 2136  {
2136 2137          int error;
2137 2138  
2138 2139          zfsfstype = fstype;
2139 2140  
2140 2141          /*
2141 2142           * Setup vfsops and vnodeops tables.
2142 2143           */
2143 2144          error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
2144 2145          if (error != 0) {
2145 2146                  cmn_err(CE_WARN, "zfs: bad vfs ops template");
2146 2147          }
2147 2148  
2148 2149          error = zfs_create_op_tables();
2149 2150          if (error) {
2150 2151                  zfs_remove_op_tables();
2151 2152                  cmn_err(CE_WARN, "zfs: bad vnode ops template");
2152 2153                  (void) vfs_freevfsops_by_type(zfsfstype);
2153 2154                  return (error);
2154 2155          }
2155 2156  
2156 2157          mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
2157 2158  
2158 2159          /*
2159 2160           * Unique major number for all zfs mounts.
2160 2161           * If we run out of 32-bit minors, we'll getudev() another major.
2161 2162           */
2162 2163          zfs_major = ddi_name_to_major(ZFS_DRIVER);
2163 2164          zfs_minor = ZFS_MIN_MINOR;
2164 2165  
2165 2166          return (0);
2166 2167  }
2167 2168  
2168 2169  void
2169 2170  zfs_init(void)
2170 2171  {
2171 2172          /*
2172 2173           * Initialize .zfs directory structures
2173 2174           */
2174 2175          zfsctl_init();
2175 2176  
2176 2177          /*
2177 2178           * Initialize znode cache, vnode ops, etc...
2178 2179           */
2179 2180          zfs_znode_init();
2180 2181  
2181 2182          dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2182 2183  }
2183 2184  
2184 2185  void
2185 2186  zfs_fini(void)
2186 2187  {
2187 2188          zfsctl_fini();
2188 2189          zfs_znode_fini();
2189 2190  }
2190 2191  
2191 2192  int
2192 2193  zfs_busy(void)
2193 2194  {
2194 2195          return (zfs_active_fs_count != 0);
2195 2196  }
2196 2197  
2197 2198  int
2198 2199  zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2199 2200  {
2200 2201          int error;
2201 2202          objset_t *os = zfsvfs->z_os;
2202 2203          dmu_tx_t *tx;
2203 2204  
2204 2205          if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2205 2206                  return (EINVAL);
2206 2207  
2207 2208          if (newvers < zfsvfs->z_version)
2208 2209                  return (EINVAL);
2209 2210  
2210 2211          if (zfs_spa_version_map(newvers) >
2211 2212              spa_version(dmu_objset_spa(zfsvfs->z_os)))
2212 2213                  return (ENOTSUP);
2213 2214  
2214 2215          tx = dmu_tx_create(os);
2215 2216          dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2216 2217          if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2217 2218                  dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2218 2219                      ZFS_SA_ATTRS);
2219 2220                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2220 2221          }
2221 2222          error = dmu_tx_assign(tx, TXG_WAIT);
2222 2223          if (error) {
2223 2224                  dmu_tx_abort(tx);
2224 2225                  return (error);
2225 2226          }
2226 2227  
2227 2228          error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2228 2229              8, 1, &newvers, tx);
2229 2230  
2230 2231          if (error) {
2231 2232                  dmu_tx_commit(tx);
2232 2233                  return (error);
2233 2234          }
2234 2235  
2235 2236          if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2236 2237                  uint64_t sa_obj;
2237 2238  
2238 2239                  ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2239 2240                      SPA_VERSION_SA);
2240 2241                  sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,

↓ open down ↓

2208 lines elided

↑ open up ↑

2241 2242                      DMU_OT_NONE, 0, tx);
2242 2243  
2243 2244                  error = zap_add(os, MASTER_NODE_OBJ,
2244 2245                      ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2245 2246                  ASSERT3U(error, ==, 0);
2246 2247  
2247 2248                  VERIFY(0 == sa_set_sa_object(os, sa_obj));
2248 2249                  sa_register_update_callback(os, zfs_sa_upgrade);
2249 2250          }
2250 2251  
2251      -        spa_history_log_internal(LOG_DS_UPGRADE,
2252      -            dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
2253      -            zfsvfs->z_version, newvers, dmu_objset_id(os));
     2252 +        spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
     2253 +            "from %llu to %llu", zfsvfs->z_version, newvers);
2254 2254  
2255 2255          dmu_tx_commit(tx);
2256 2256  
2257 2257          zfsvfs->z_version = newvers;
2258 2258  
2259 2259          zfs_set_fuid_feature(zfsvfs);
2260 2260  
2261 2261          return (0);
2262 2262  }
2263 2263

2264 2264  /*
2265 2265   * Read a property stored within the master node.
2266 2266   */
2267 2267  int
2268 2268  zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2269 2269  {
2270 2270          const char *pname;
2271 2271          int error = ENOENT;
2272 2272  
2273 2273          /*
2274 2274           * Look up the file system's value for the property.  For the
2275 2275           * version property, we look up a slightly different string.
2276 2276           */
2277 2277          if (prop == ZFS_PROP_VERSION)
2278 2278                  pname = ZPL_VERSION_STR;
2279 2279          else
2280 2280                  pname = zfs_prop_to_name(prop);
2281 2281  
2282 2282          if (os != NULL)
2283 2283                  error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2284 2284  
2285 2285          if (error == ENOENT) {
2286 2286                  /* No value set, use the default value */
2287 2287                  switch (prop) {
2288 2288                  case ZFS_PROP_VERSION:
2289 2289                          *value = ZPL_VERSION;
2290 2290                          break;
2291 2291                  case ZFS_PROP_NORMALIZE:
2292 2292                  case ZFS_PROP_UTF8ONLY:
2293 2293                          *value = 0;
2294 2294                          break;
2295 2295                  case ZFS_PROP_CASE:
2296 2296                          *value = ZFS_CASE_SENSITIVE;
2297 2297                          break;
2298 2298                  default:
2299 2299                          return (error);
2300 2300                  }
2301 2301                  error = 0;
2302 2302          }
2303 2303          return (error);
2304 2304  }
2305 2305  
2306 2306  static vfsdef_t vfw = {
2307 2307          VFSDEF_VERSION,
2308 2308          MNTTYPE_ZFS,
2309 2309          zfs_vfsinit,
2310 2310          VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
2311 2311              VSW_XID|VSW_ZMOUNT,
2312 2312          &zfs_mntopts
2313 2313  };
2314 2314  
2315 2315  struct modlfs zfs_modlfs = {
2316 2316          &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
2317 2317  };

↓ open down ↓

54 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX