1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  24  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  25  */
  26 
  27 /*
  28  * ZFS control directory (a.k.a. ".zfs")
  29  *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, these are the 'snapshot' and 'shares' directories, but this may
 * expand in the future.  The elements are built using the GFS primitives, as
 * the hierarchy does not actually exist on disk.
  34  *
  35  * For 'snapshot', we don't want to have all snapshots always mounted, because
  36  * this would take up a huge amount of space in /etc/mnttab.  We have three
  37  * types of objects:
  38  *
  39  *      ctldir ------> snapshotdir -------> snapshot
  40  *                                             |
  41  *                                             |
  42  *                                             V
  43  *                                         mounted fs
  44  *
  45  * The 'snapshot' node contains just enough information to lookup '..' and act
  46  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  47  * perform an automount of the underlying filesystem and return the
  48  * corresponding vnode.
  49  *
  50  * All mounts are handled automatically by the kernel, but unmounts are
  51  * (currently) handled from user land.  The main reason is that there is no
  52  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  53  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  54  * unmounts any snapshots within the snapshot directory.
  55  *
 * The '.zfs', '.zfs/snapshot', and all directories created under
 * '.zfs/snapshot' (i.e. '.zfs/snapshot/<snapname>') are all GFS nodes and
 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
 *
 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
 * (i.e. snapshots) are ZFS nodes and have their own unique vfs_t.
  62  * However, vnodes within these mounted on file systems have their v_vfsp
  63  * fields set to the head filesystem to make NFS happy (see
  64  * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
  65  * so that it cannot be freed until all snapshots have been unmounted.
  66  */
  67 
  68 #include <fs/fs_subr.h>
  69 #include <sys/zfs_ctldir.h>
  70 #include <sys/zfs_ioctl.h>
  71 #include <sys/zfs_vfsops.h>
  72 #include <sys/vfs_opreg.h>
  73 #include <sys/gfs.h>
  74 #include <sys/stat.h>
  75 #include <sys/dmu.h>
  76 #include <sys/dsl_destroy.h>
  77 #include <sys/dsl_deleg.h>
  78 #include <sys/mount.h>
  79 #include <sys/sunddi.h>
  80 
  81 #include "zfs_namecheck.h"
  82 
  83 typedef struct zfsctl_node {
  84         gfs_dir_t       zc_gfs_private;
  85         uint64_t        zc_id;
  86         timestruc_t     zc_cmtime;      /* ctime and mtime, always the same */
  87 } zfsctl_node_t;
  88 
  89 typedef struct zfsctl_snapdir {
  90         zfsctl_node_t   sd_node;
  91         kmutex_t        sd_lock;
  92         avl_tree_t      sd_snaps;
  93 } zfsctl_snapdir_t;
  94 
  95 typedef struct {
  96         char            *se_name;
  97         vnode_t         *se_root;
  98         avl_node_t      se_node;
  99 } zfs_snapentry_t;
 100 
 101 static int
 102 snapentry_compare(const void *a, const void *b)
 103 {
 104         const zfs_snapentry_t *sa = a;
 105         const zfs_snapentry_t *sb = b;
 106         int ret = strcmp(sa->se_name, sb->se_name);
 107 
 108         if (ret < 0)
 109                 return (-1);
 110         else if (ret > 0)
 111                 return (1);
 112         else
 113                 return (0);
 114 }
 115 
 116 vnodeops_t *zfsctl_ops_root;
 117 vnodeops_t *zfsctl_ops_snapdir;
 118 vnodeops_t *zfsctl_ops_snapshot;
 119 vnodeops_t *zfsctl_ops_shares;
 120 vnodeops_t *zfsctl_ops_shares_dir;
 121 
 122 static const fs_operation_def_t zfsctl_tops_root[];
 123 static const fs_operation_def_t zfsctl_tops_snapdir[];
 124 static const fs_operation_def_t zfsctl_tops_snapshot[];
 125 static const fs_operation_def_t zfsctl_tops_shares[];
 126 
 127 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
 128 static vnode_t *zfsctl_mknode_shares(vnode_t *);
 129 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
 130 static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
 131 
 132 static gfs_opsvec_t zfsctl_opsvec[] = {
 133         { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
 134         { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
 135         { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
 136         { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
 137         { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
 138         { NULL }
 139 };
 140 
 141 /*
 142  * Root directory elements.  We only have two entries
 143  * snapshot and shares.
 144  */
 145 static gfs_dirent_t zfsctl_root_entries[] = {
 146         { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
 147         { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
 148         { NULL }
 149 };
 150 
 151 /* include . and .. in the calculation */
 152 #define NROOT_ENTRIES   ((sizeof (zfsctl_root_entries) / \
 153     sizeof (gfs_dirent_t)) + 1)
 154 
 155 
 156 /*
 157  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 158  * directories.  This is called from the ZFS init routine, and initializes the
 159  * vnode ops vectors that we'll be using.
 160  */
 161 void
 162 zfsctl_init(void)
 163 {
 164         VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
 165 }
 166 
 167 void
 168 zfsctl_fini(void)
 169 {
 170         /*
 171          * Remove vfsctl vnode ops
 172          */
 173         if (zfsctl_ops_root)
 174                 vn_freevnodeops(zfsctl_ops_root);
 175         if (zfsctl_ops_snapdir)
 176                 vn_freevnodeops(zfsctl_ops_snapdir);
 177         if (zfsctl_ops_snapshot)
 178                 vn_freevnodeops(zfsctl_ops_snapshot);
 179         if (zfsctl_ops_shares)
 180                 vn_freevnodeops(zfsctl_ops_shares);
 181         if (zfsctl_ops_shares_dir)
 182                 vn_freevnodeops(zfsctl_ops_shares_dir);
 183 
 184         zfsctl_ops_root = NULL;
 185         zfsctl_ops_snapdir = NULL;
 186         zfsctl_ops_snapshot = NULL;
 187         zfsctl_ops_shares = NULL;
 188         zfsctl_ops_shares_dir = NULL;
 189 }
 190 
 191 boolean_t
 192 zfsctl_is_node(vnode_t *vp)
 193 {
 194         return (vn_matchops(vp, zfsctl_ops_root) ||
 195             vn_matchops(vp, zfsctl_ops_snapdir) ||
 196             vn_matchops(vp, zfsctl_ops_snapshot) ||
 197             vn_matchops(vp, zfsctl_ops_shares) ||
 198             vn_matchops(vp, zfsctl_ops_shares_dir));
 199 
 200 }
 201 
 202 /*
 203  * Return the inode number associated with the 'snapshot' or
 204  * 'shares' directory.
 205  */
 206 /* ARGSUSED */
 207 static ino64_t
 208 zfsctl_root_inode_cb(vnode_t *vp, int index)
 209 {
 210         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 211 
 212         ASSERT(index <= 2);
 213 
 214         if (index == 0)
 215                 return (ZFSCTL_INO_SNAPDIR);
 216 
 217         return (zfsvfs->z_shares_dir);
 218 }
 219 
 220 /*
 221  * Create the '.zfs' directory.  This directory is cached as part of the VFS
 222  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
 223  * therefore checks against a vfs_count of 2 instead of 1.  This reference
 224  * is removed when the ctldir is destroyed in the unmount.
 225  */
 226 void
 227 zfsctl_create(zfsvfs_t *zfsvfs)
 228 {
 229         vnode_t *vp, *rvp;
 230         zfsctl_node_t *zcp;
 231         uint64_t crtime[2];
 232 
 233         ASSERT(zfsvfs->z_ctldir == NULL);
 234 
 235         vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
 236             zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
 237             zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
 238         zcp = vp->v_data;
 239         zcp->zc_id = ZFSCTL_INO_ROOT;
 240 
 241         VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
 242         VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
 243             &crtime, sizeof (crtime)));
 244         ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
 245         VN_RELE(rvp);
 246 
 247         /*
 248          * We're only faking the fact that we have a root of a filesystem for
 249          * the sake of the GFS interfaces.  Undo the flag manipulation it did
 250          * for us.
 251          */
 252         vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
 253 
 254         zfsvfs->z_ctldir = vp;
 255 }
 256 
 257 /*
 258  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 259  * There might still be more references if we were force unmounted, but only
 260  * new zfs_inactive() calls can occur and they don't reference .zfs
 261  */
 262 void
 263 zfsctl_destroy(zfsvfs_t *zfsvfs)
 264 {
 265         VN_RELE(zfsvfs->z_ctldir);
 266         zfsvfs->z_ctldir = NULL;
 267 }
 268 
 269 /*
 270  * Given a root znode, retrieve the associated .zfs directory.
 271  * Add a hold to the vnode and return it.
 272  */
 273 vnode_t *
 274 zfsctl_root(znode_t *zp)
 275 {
 276         ASSERT(zfs_has_ctldir(zp));
 277         VN_HOLD(zp->z_zfsvfs->z_ctldir);
 278         return (zp->z_zfsvfs->z_ctldir);
 279 }
 280 
 281 /*
 282  * Common open routine.  Disallow any write access.
 283  */
 284 /* ARGSUSED */
 285 static int
 286 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
 287 {
 288         if (flags & FWRITE)
 289                 return (SET_ERROR(EACCES));
 290 
 291         return (0);
 292 }
 293 
 294 /*
 295  * Common close routine.  Nothing to do here.
 296  */
 297 /* ARGSUSED */
 298 static int
 299 zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
 300     cred_t *cr, caller_context_t *ct)
 301 {
 302         return (0);
 303 }
 304 
 305 /*
 306  * Common access routine.  Disallow writes.
 307  */
 308 /* ARGSUSED */
 309 static int
 310 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
 311     caller_context_t *ct)
 312 {
 313         if (flags & V_ACE_MASK) {
 314                 if (mode & ACE_ALL_WRITE_PERMS)
 315                         return (SET_ERROR(EACCES));
 316         } else {
 317                 if (mode & VWRITE)
 318                         return (SET_ERROR(EACCES));
 319         }
 320 
 321         return (0);
 322 }
 323 
 324 /*
 325  * Common getattr function.  Fill in basic information.
 326  */
 327 static void
 328 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 329 {
 330         timestruc_t     now;
 331 
 332         vap->va_uid = 0;
 333         vap->va_gid = 0;
 334         vap->va_rdev = 0;
 335         /*
 336          * We are a purely virtual object, so we have no
 337          * blocksize or allocated blocks.
 338          */
 339         vap->va_blksize = 0;
 340         vap->va_nblocks = 0;
 341         vap->va_seq = 0;
 342         vap->va_fsid = vp->v_vfsp->vfs_dev;
 343         vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
 344             S_IROTH | S_IXOTH;
 345         vap->va_type = VDIR;
 346         /*
 347          * We live in the now (for atime).
 348          */
 349         gethrestime(&now);
 350         vap->va_atime = now;
 351 }
 352 
 353 /*ARGSUSED*/
 354 static int
 355 zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 356 {
 357         zfsvfs_t        *zfsvfs = vp->v_vfsp->vfs_data;
 358         zfsctl_node_t   *zcp = vp->v_data;
 359         uint64_t        object = zcp->zc_id;
 360         zfid_short_t    *zfid;
 361         int             i;
 362 
 363         ZFS_ENTER(zfsvfs);
 364 
 365         if (fidp->fid_len < SHORT_FID_LEN) {
 366                 fidp->fid_len = SHORT_FID_LEN;
 367                 ZFS_EXIT(zfsvfs);
 368                 return (SET_ERROR(ENOSPC));
 369         }
 370 
 371         zfid = (zfid_short_t *)fidp;
 372 
 373         zfid->zf_len = SHORT_FID_LEN;
 374 
 375         for (i = 0; i < sizeof (zfid->zf_object); i++)
 376                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 377 
 378         /* .zfs znodes always have a generation number of 0 */
 379         for (i = 0; i < sizeof (zfid->zf_gen); i++)
 380                 zfid->zf_gen[i] = 0;
 381 
 382         ZFS_EXIT(zfsvfs);
 383         return (0);
 384 }
 385 
 386 
 387 /*ARGSUSED*/
 388 static int
 389 zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 390 {
 391         zfsvfs_t        *zfsvfs = vp->v_vfsp->vfs_data;
 392         znode_t         *dzp;
 393         int             error;
 394 
 395         ZFS_ENTER(zfsvfs);
 396 
 397         if (zfsvfs->z_shares_dir == 0) {
 398                 ZFS_EXIT(zfsvfs);
 399                 return (SET_ERROR(ENOTSUP));
 400         }
 401 
 402         if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
 403                 error = VOP_FID(ZTOV(dzp), fidp, ct);
 404                 VN_RELE(ZTOV(dzp));
 405         }
 406 
 407         ZFS_EXIT(zfsvfs);
 408         return (error);
 409 }
 410 /*
 411  * .zfs inode namespace
 412  *
 413  * We need to generate unique inode numbers for all files and directories
 414  * within the .zfs pseudo-filesystem.  We use the following scheme:
 415  *
 416  *      ENTRY                   ZFSCTL_INODE
 417  *      .zfs                    1
 418  *      .zfs/snapshot           2
 419  *      .zfs/snapshot/<snap>      objectid(snap)
 420  */
 421 
 422 #define ZFSCTL_INO_SNAP(id)     (id)
 423 
 424 /*
 425  * Get root directory attributes.
 426  */
 427 /* ARGSUSED */
 428 static int
 429 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 430     caller_context_t *ct)
 431 {
 432         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 433         zfsctl_node_t *zcp = vp->v_data;
 434 
 435         ZFS_ENTER(zfsvfs);
 436         vap->va_nodeid = ZFSCTL_INO_ROOT;
 437         vap->va_nlink = vap->va_size = NROOT_ENTRIES;
 438         vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
 439 
 440         zfsctl_common_getattr(vp, vap);
 441         ZFS_EXIT(zfsvfs);
 442 
 443         return (0);
 444 }
 445 
 446 /*
 447  * Special case the handling of "..".
 448  */
 449 /* ARGSUSED */
 450 int
 451 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
 452     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
 453     int *direntflags, pathname_t *realpnp)
 454 {
 455         zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 456         int err;
 457 
 458         /*
 459          * No extended attributes allowed under .zfs
 460          */
 461         if (flags & LOOKUP_XATTR)
 462                 return (SET_ERROR(EINVAL));
 463 
 464         ZFS_ENTER(zfsvfs);
 465 
 466         if (strcmp(nm, "..") == 0) {
 467                 err = VFS_ROOT(dvp->v_vfsp, vpp);
 468         } else {
 469                 err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
 470                     cr, ct, direntflags, realpnp);
 471         }
 472 
 473         ZFS_EXIT(zfsvfs);
 474 
 475         return (err);
 476 }
 477 
 478 static int
 479 zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
 480     caller_context_t *ct)
 481 {
 482         /*
 483          * We only care about ACL_ENABLED so that libsec can
 484          * display ACL correctly and not default to POSIX draft.
 485          */
 486         if (cmd == _PC_ACL_ENABLED) {
 487                 *valp = _ACL_ACE_ENABLED;
 488                 return (0);
 489         }
 490 
 491         return (fs_pathconf(vp, cmd, valp, cr, ct));
 492 }
 493 
 494 static const fs_operation_def_t zfsctl_tops_root[] = {
 495         { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
 496         { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
 497         { VOPNAME_IOCTL,        { .error = fs_inval }                   },
 498         { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_root_getattr }  },
 499         { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
 500         { VOPNAME_READDIR,      { .vop_readdir = gfs_vop_readdir }      },
 501         { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_root_lookup }    },
 502         { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
 503         { VOPNAME_INACTIVE,     { .vop_inactive = gfs_vop_inactive }    },
 504         { VOPNAME_PATHCONF,     { .vop_pathconf = zfsctl_pathconf }     },
 505         { VOPNAME_FID,          { .vop_fid = zfsctl_common_fid  }       },
 506         { NULL }
 507 };
 508 
 509 /*
 510  * Gets the full dataset name that corresponds to the given snapshot name
 511  * Example:
 512  *      zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
 513  */
 514 static int
 515 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 516 {
 517         objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 518 
 519         if (zfs_component_namecheck(name, NULL, NULL) != 0)
 520                 return (SET_ERROR(EILSEQ));
 521         dmu_objset_name(os, zname);
 522         if (strlen(zname) + 1 + strlen(name) >= len)
 523                 return (SET_ERROR(ENAMETOOLONG));
 524         (void) strcat(zname, "@");
 525         (void) strcat(zname, name);
 526         return (0);
 527 }
 528 
 529 static int
 530 zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
 531 {
 532         vnode_t *svp = sep->se_root;
 533         int error;
 534 
 535         ASSERT(vn_ismntpt(svp));
 536 
 537         /* this will be dropped by dounmount() */
 538         if ((error = vn_vfswlock(svp)) != 0)
 539                 return (error);
 540 
 541         VN_HOLD(svp);
 542         error = dounmount(vn_mountedvfs(svp), fflags, cr);
 543         if (error) {
 544                 VN_RELE(svp);
 545                 return (error);
 546         }
 547 
 548         /*
 549          * We can't use VN_RELE(), as that will try to invoke
 550          * zfsctl_snapdir_inactive(), which would cause us to destroy
 551          * the sd_lock mutex held by our caller.
 552          */
 553         ASSERT(svp->v_count == 1);
 554         gfs_vop_inactive(svp, cr, NULL);
 555 
 556         kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 557         kmem_free(sep, sizeof (zfs_snapentry_t));
 558 
 559         return (0);
 560 }
 561 
 562 static void
 563 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
 564 {
 565         avl_index_t where;
 566         vfs_t *vfsp;
 567         refstr_t *pathref;
 568         char newpath[MAXNAMELEN];
 569         char *tail;
 570 
 571         ASSERT(MUTEX_HELD(&sdp->sd_lock));
 572         ASSERT(sep != NULL);
 573 
 574         vfsp = vn_mountedvfs(sep->se_root);
 575         ASSERT(vfsp != NULL);
 576 
 577         vfs_lock_wait(vfsp);
 578 
 579         /*
 580          * Change the name in the AVL tree.
 581          */
 582         avl_remove(&sdp->sd_snaps, sep);
 583         kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 584         sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 585         (void) strcpy(sep->se_name, nm);
 586         VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
 587         avl_insert(&sdp->sd_snaps, sep, where);
 588 
 589         /*
 590          * Change the current mountpoint info:
 591          *      - update the tail of the mntpoint path
 592          *      - update the tail of the resource path
 593          */
 594         pathref = vfs_getmntpoint(vfsp);
 595         (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 596         VERIFY((tail = strrchr(newpath, '/')) != NULL);
 597         *(tail+1) = '\0';
 598         ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 599         (void) strcat(newpath, nm);
 600         refstr_rele(pathref);
 601         vfs_setmntpoint(vfsp, newpath, 0);
 602 
 603         pathref = vfs_getresource(vfsp);
 604         (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 605         VERIFY((tail = strrchr(newpath, '@')) != NULL);
 606         *(tail+1) = '\0';
 607         ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 608         (void) strcat(newpath, nm);
 609         refstr_rele(pathref);
 610         vfs_setresource(vfsp, newpath, 0);
 611 
 612         vfs_unlock(vfsp);
 613 }
 614 
 615 /*ARGSUSED*/
 616 static int
 617 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
 618     cred_t *cr, caller_context_t *ct, int flags)
 619 {
 620         zfsctl_snapdir_t *sdp = sdvp->v_data;
 621         zfs_snapentry_t search, *sep;
 622         zfsvfs_t *zfsvfs;
 623         avl_index_t where;
 624         char from[ZFS_MAX_DATASET_NAME_LEN], to[ZFS_MAX_DATASET_NAME_LEN];
 625         char real[ZFS_MAX_DATASET_NAME_LEN], fsname[ZFS_MAX_DATASET_NAME_LEN];
 626         int err;
 627 
 628         zfsvfs = sdvp->v_vfsp->vfs_data;
 629         ZFS_ENTER(zfsvfs);
 630 
 631         if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 632                 err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
 633                     sizeof (real), NULL);
 634                 if (err == 0) {
 635                         snm = real;
 636                 } else if (err != ENOTSUP) {
 637                         ZFS_EXIT(zfsvfs);
 638                         return (err);
 639                 }
 640         }
 641 
 642         ZFS_EXIT(zfsvfs);
 643 
 644         dmu_objset_name(zfsvfs->z_os, fsname);
 645 
 646         err = zfsctl_snapshot_zname(sdvp, snm, sizeof (from), from);
 647         if (err == 0)
 648                 err = zfsctl_snapshot_zname(tdvp, tnm, sizeof (to), to);
 649         if (err == 0)
 650                 err = zfs_secpolicy_rename_perms(from, to, cr);
 651         if (err != 0)
 652                 return (err);
 653 
 654         /*
 655          * Cannot move snapshots out of the snapdir.
 656          */
 657         if (sdvp != tdvp)
 658                 return (SET_ERROR(EINVAL));
 659 
 660         if (strcmp(snm, tnm) == 0)
 661                 return (0);
 662 
 663         mutex_enter(&sdp->sd_lock);
 664 
 665         search.se_name = (char *)snm;
 666         if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
 667                 mutex_exit(&sdp->sd_lock);
 668                 return (SET_ERROR(ENOENT));
 669         }
 670 
 671         err = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
 672         if (err == 0)
 673                 zfsctl_rename_snap(sdp, sep, tnm);
 674 
 675         mutex_exit(&sdp->sd_lock);
 676 
 677         return (err);
 678 }
 679 
 680 /* ARGSUSED */
 681 static int
 682 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
 683     caller_context_t *ct, int flags)
 684 {
 685         zfsctl_snapdir_t *sdp = dvp->v_data;
 686         zfs_snapentry_t *sep;
 687         zfs_snapentry_t search;
 688         zfsvfs_t *zfsvfs;
 689         char snapname[ZFS_MAX_DATASET_NAME_LEN];
 690         char real[ZFS_MAX_DATASET_NAME_LEN];
 691         int err;
 692 
 693         zfsvfs = dvp->v_vfsp->vfs_data;
 694         ZFS_ENTER(zfsvfs);
 695 
 696         if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 697 
 698                 err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
 699                     sizeof (real), NULL);
 700                 if (err == 0) {
 701                         name = real;
 702                 } else if (err != ENOTSUP) {
 703                         ZFS_EXIT(zfsvfs);
 704                         return (err);
 705                 }
 706         }
 707 
 708         ZFS_EXIT(zfsvfs);
 709 
 710         err = zfsctl_snapshot_zname(dvp, name, sizeof (snapname), snapname);
 711         if (err == 0)
 712                 err = zfs_secpolicy_destroy_perms(snapname, cr);
 713         if (err != 0)
 714                 return (err);
 715 
 716         mutex_enter(&sdp->sd_lock);
 717 
 718         search.se_name = name;
 719         sep = avl_find(&sdp->sd_snaps, &search, NULL);
 720         if (sep) {
 721                 avl_remove(&sdp->sd_snaps, sep);
 722                 err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
 723                 if (err != 0)
 724                         avl_add(&sdp->sd_snaps, sep);
 725                 else
 726                         err = dsl_destroy_snapshot(snapname, B_FALSE);
 727         } else {
 728                 err = SET_ERROR(ENOENT);
 729         }
 730 
 731         mutex_exit(&sdp->sd_lock);
 732 
 733         return (err);
 734 }
 735 
 736 /*
 737  * This creates a snapshot under '.zfs/snapshot'.
 738  */
 739 /* ARGSUSED */
 740 static int
 741 zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
 742     cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
 743 {
 744         zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 745         char name[ZFS_MAX_DATASET_NAME_LEN];
 746         int err;
 747         static enum symfollow follow = NO_FOLLOW;
 748         static enum uio_seg seg = UIO_SYSSPACE;
 749 
 750         if (zfs_component_namecheck(dirname, NULL, NULL) != 0)
 751                 return (SET_ERROR(EILSEQ));
 752 
 753         dmu_objset_name(zfsvfs->z_os, name);
 754 
 755         *vpp = NULL;
 756 
 757         err = zfs_secpolicy_snapshot_perms(name, cr);
 758         if (err != 0)
 759                 return (err);
 760 
 761         if (err == 0) {
 762                 err = dmu_objset_snapshot_one(name, dirname);
 763                 if (err != 0)
 764                         return (err);
 765                 err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
 766         }
 767 
 768         return (err);
 769 }
 770 
/*
 * Lookup entry point for the 'snapshot' directory.  Try to open the
 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
 * Perform a mount of the associated dataset on top of the vnode.
 */
 776 /* ARGSUSED */
 777 static int
 778 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
 779     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
 780     int *direntflags, pathname_t *realpnp)
 781 {
 782         zfsctl_snapdir_t *sdp = dvp->v_data;
 783         objset_t *snap;
 784         char snapname[ZFS_MAX_DATASET_NAME_LEN];
 785         char real[ZFS_MAX_DATASET_NAME_LEN];
 786         char *mountpoint;
 787         zfs_snapentry_t *sep, search;
 788         struct mounta margs;
 789         vfs_t *vfsp;
 790         size_t mountpoint_len;
 791         avl_index_t where;
 792         zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 793         int err;
 794 
 795         /*
 796          * No extended attributes allowed under .zfs
 797          */
 798         if (flags & LOOKUP_XATTR)
 799                 return (SET_ERROR(EINVAL));
 800 
 801         ASSERT(dvp->v_type == VDIR);
 802 
 803         /*
 804          * If we get a recursive call, that means we got called
 805          * from the domount() code while it was trying to look up the
 806          * spec (which looks like a local path for zfs).  We need to
 807          * add some flag to domount() to tell it not to do this lookup.
 808          */
 809         if (MUTEX_HELD(&sdp->sd_lock))
 810                 return (SET_ERROR(ENOENT));
 811 
 812         ZFS_ENTER(zfsvfs);
 813 
 814         if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
 815                 ZFS_EXIT(zfsvfs);
 816                 return (0);
 817         }
 818 
 819         if (flags & FIGNORECASE) {
 820                 boolean_t conflict = B_FALSE;
 821 
 822                 err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
 823                     sizeof (real), &conflict);
 824                 if (err == 0) {
 825                         nm = real;
 826                 } else if (err != ENOTSUP) {
 827                         ZFS_EXIT(zfsvfs);
 828                         return (err);
 829                 }
 830                 if (realpnp)
 831                         (void) strlcpy(realpnp->pn_buf, nm,
 832                             realpnp->pn_bufsize);
 833                 if (conflict && direntflags)
 834                         *direntflags = ED_CASE_CONFLICT;
 835         }
 836 
 837         mutex_enter(&sdp->sd_lock);
 838         search.se_name = (char *)nm;
 839         if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
 840                 *vpp = sep->se_root;
 841                 VN_HOLD(*vpp);
 842                 err = traverse(vpp);
 843                 if (err != 0) {
 844                         VN_RELE(*vpp);
 845                         *vpp = NULL;
 846                 } else if (*vpp == sep->se_root) {
 847                         /*
 848                          * The snapshot was unmounted behind our backs,
 849                          * try to remount it.
 850                          */
 851                         goto domount;
 852                 } else {
 853                         /*
 854                          * VROOT was set during the traverse call.  We need
 855                          * to clear it since we're pretending to be part
 856                          * of our parent's vfs.
 857                          */
 858                         (*vpp)->v_flag &= ~VROOT;
 859                 }
 860                 mutex_exit(&sdp->sd_lock);
 861                 ZFS_EXIT(zfsvfs);
 862                 return (err);
 863         }
 864 
 865         /*
 866          * The requested snapshot is not currently mounted, look it up.
 867          */
 868         err = zfsctl_snapshot_zname(dvp, nm, sizeof (snapname), snapname);
 869         if (err != 0) {
 870                 mutex_exit(&sdp->sd_lock);
 871                 ZFS_EXIT(zfsvfs);
 872                 /*
 873                  * handle "ls *" or "?" in a graceful manner,
 874                  * forcing EILSEQ to ENOENT.
 875                  * Since shell ultimately passes "*" or "?" as name to lookup
 876                  */
 877                 return (err == EILSEQ ? ENOENT : err);
 878         }
 879         if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
 880                 mutex_exit(&sdp->sd_lock);
 881                 ZFS_EXIT(zfsvfs);
 882                 return (SET_ERROR(ENOENT));
 883         }
 884 
 885         sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 886         sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 887         (void) strcpy(sep->se_name, nm);
 888         *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
 889         avl_insert(&sdp->sd_snaps, sep, where);
 890 
 891         dmu_objset_rele(snap, FTAG);
 892 domount:
 893         mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
 894             strlen("/.zfs/snapshot/") + strlen(nm) + 1;
 895         mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 896         (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
 897             refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
 898 
 899         margs.spec = snapname;
 900         margs.dir = mountpoint;
 901         margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
 902         margs.fstype = "zfs";
 903         margs.dataptr = NULL;
 904         margs.datalen = 0;
 905         margs.optptr = NULL;
 906         margs.optlen = 0;
 907 
 908         err = domount("zfs", &margs, *vpp, kcred, &vfsp);
 909         kmem_free(mountpoint, mountpoint_len);
 910 
 911         if (err == 0) {
 912                 /*
 913                  * Return the mounted root rather than the covered mount point.
 914                  * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
 915                  * the ZFS vnode mounted on top of the GFS node.  This ZFS
 916                  * vnode is the root of the newly created vfsp.
 917                  */
 918                 VFS_RELE(vfsp);
 919                 err = traverse(vpp);
 920         }
 921 
 922         if (err == 0) {
 923                 /*
 924                  * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
 925                  *
 926                  * This is where we lie about our v_vfsp in order to
 927                  * make .zfs/snapshot/<snapname> accessible over NFS
 928                  * without requiring manual mounts of <snapname>.
 929                  */
 930                 ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
 931                 VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
 932                 (*vpp)->v_vfsp = zfsvfs->z_vfs;
 933                 (*vpp)->v_flag &= ~VROOT;
 934         }
 935         mutex_exit(&sdp->sd_lock);
 936         ZFS_EXIT(zfsvfs);
 937 
 938         /*
 939          * If we had an error, drop our hold on the vnode and
 940          * zfsctl_snapshot_inactive() will clean up.
 941          */
 942         if (err != 0) {
 943                 VN_RELE(*vpp);
 944                 *vpp = NULL;
 945         }
 946         return (err);
 947 }
 948 
 949 /* ARGSUSED */
 950 static int
 951 zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
 952     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
 953     int *direntflags, pathname_t *realpnp)
 954 {
 955         zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 956         znode_t *dzp;
 957         int error;
 958 
 959         ZFS_ENTER(zfsvfs);
 960 
 961         if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
 962                 ZFS_EXIT(zfsvfs);
 963                 return (0);
 964         }
 965 
 966         if (zfsvfs->z_shares_dir == 0) {
 967                 ZFS_EXIT(zfsvfs);
 968                 return (SET_ERROR(ENOTSUP));
 969         }
 970         if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
 971                 error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp,
 972                     flags, rdir, cr, ct, direntflags, realpnp);
 973                 VN_RELE(ZTOV(dzp));
 974         }
 975 
 976         ZFS_EXIT(zfsvfs);
 977 
 978         return (error);
 979 }
 980 
 981 /* ARGSUSED */
 982 static int
 983 zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 984     offset_t *offp, offset_t *nextp, void *data, int flags)
 985 {
 986         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 987         char snapname[ZFS_MAX_DATASET_NAME_LEN];
 988         uint64_t id, cookie;
 989         boolean_t case_conflict;
 990         int error;
 991 
 992         ZFS_ENTER(zfsvfs);
 993 
 994         cookie = *offp;
 995         dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
 996         error = dmu_snapshot_list_next(zfsvfs->z_os,
 997             sizeof (snapname), snapname, &id, &cookie, &case_conflict);
 998         dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
 999         if (error) {
1000                 ZFS_EXIT(zfsvfs);
1001                 if (error == ENOENT) {
1002                         *eofp = 1;
1003                         return (0);
1004                 }
1005                 return (error);
1006         }
1007 
1008         if (flags & V_RDDIR_ENTFLAGS) {
1009                 edirent_t *eodp = dp;
1010 
1011                 (void) strcpy(eodp->ed_name, snapname);
1012                 eodp->ed_ino = ZFSCTL_INO_SNAP(id);
1013                 eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
1014         } else {
1015                 struct dirent64 *odp = dp;
1016 
1017                 (void) strcpy(odp->d_name, snapname);
1018                 odp->d_ino = ZFSCTL_INO_SNAP(id);
1019         }
1020         *nextp = cookie;
1021 
1022         ZFS_EXIT(zfsvfs);
1023 
1024         return (0);
1025 }
1026 
1027 /* ARGSUSED */
1028 static int
1029 zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
1030     caller_context_t *ct, int flags)
1031 {
1032         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
1033         znode_t *dzp;
1034         int error;
1035 
1036         ZFS_ENTER(zfsvfs);
1037 
1038         if (zfsvfs->z_shares_dir == 0) {
1039                 ZFS_EXIT(zfsvfs);
1040                 return (SET_ERROR(ENOTSUP));
1041         }
1042         if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
1043                 error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags);
1044                 VN_RELE(ZTOV(dzp));
1045         } else {
1046                 *eofp = 1;
1047                 error = SET_ERROR(ENOENT);
1048         }
1049 
1050         ZFS_EXIT(zfsvfs);
1051         return (error);
1052 }
1053 
1054 /*
1055  * pvp is the '.zfs' directory (zfsctl_node_t).
1056  *
1057  * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
1058  *
1059  * This function is the callback to create a GFS vnode for '.zfs/snapshot'
1060  * when a lookup is performed on .zfs for "snapshot".
1061  */
1062 vnode_t *
1063 zfsctl_mknode_snapdir(vnode_t *pvp)
1064 {
1065         vnode_t *vp;
1066         zfsctl_snapdir_t *sdp;
1067 
1068         vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
1069             zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
1070             zfsctl_snapdir_readdir_cb, NULL);
1071         sdp = vp->v_data;
1072         sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
1073         sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
1074         mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
1075         avl_create(&sdp->sd_snaps, snapentry_compare,
1076             sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
1077         return (vp);
1078 }
1079 
1080 vnode_t *
1081 zfsctl_mknode_shares(vnode_t *pvp)
1082 {
1083         vnode_t *vp;
1084         zfsctl_node_t *sdp;
1085 
1086         vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
1087             zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
1088             NULL, NULL);
1089         sdp = vp->v_data;
1090         sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
1091         return (vp);
1092 
1093 }
1094 
1095 /* ARGSUSED */
1096 static int
1097 zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
1098     caller_context_t *ct)
1099 {
1100         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
1101         znode_t *dzp;
1102         int error;
1103 
1104         ZFS_ENTER(zfsvfs);
1105         if (zfsvfs->z_shares_dir == 0) {
1106                 ZFS_EXIT(zfsvfs);
1107                 return (SET_ERROR(ENOTSUP));
1108         }
1109         if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
1110                 error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct);
1111                 VN_RELE(ZTOV(dzp));
1112         }
1113         ZFS_EXIT(zfsvfs);
1114         return (error);
1115 
1116 
1117 }
1118 
1119 /* ARGSUSED */
1120 static int
1121 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
1122     caller_context_t *ct)
1123 {
1124         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
1125         zfsctl_snapdir_t *sdp = vp->v_data;
1126 
1127         ZFS_ENTER(zfsvfs);
1128         zfsctl_common_getattr(vp, vap);
1129         vap->va_nodeid = gfs_file_inode(vp);
1130         vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
1131         vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
1132         ZFS_EXIT(zfsvfs);
1133 
1134         return (0);
1135 }
1136 
1137 /* ARGSUSED */
1138 static void
1139 zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1140 {
1141         zfsctl_snapdir_t *sdp = vp->v_data;
1142         void *private;
1143 
1144         private = gfs_dir_inactive(vp);
1145         if (private != NULL) {
1146                 ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
1147                 mutex_destroy(&sdp->sd_lock);
1148                 avl_destroy(&sdp->sd_snaps);
1149                 kmem_free(private, sizeof (zfsctl_snapdir_t));
1150         }
1151 }
1152 
/*
 * Vnode operation definitions for the '.zfs/snapshot' directory.
 *
 * Each entry pairs a VOPNAME_* operation name with its handler; the
 * "{ NULL }" entry terminates the table.  Points worth noting:
 *   - ioctl is routed to fs_inval;
 *   - rename, rmdir and mkdir are routed to the zfsctl_snapdir_rename(),
 *     zfsctl_snapdir_remove() and zfsctl_snapdir_mkdir() handlers;
 *   - readdir uses the generic gfs_vop_readdir(); the per-entry callback
 *     (zfsctl_snapdir_readdir_cb) is supplied separately when the node is
 *     created in zfsctl_mknode_snapdir().
 *
 * NOTE(review): presumably this is the template from which
 * zfsctl_ops_snapdir is built; the registration is outside this section.
 */
static const fs_operation_def_t zfsctl_tops_snapdir[] = {
        { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
        { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
        { VOPNAME_IOCTL,        { .error = fs_inval }                   },
        { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_snapdir_getattr } },
        { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
        { VOPNAME_RENAME,       { .vop_rename = zfsctl_snapdir_rename } },
        { VOPNAME_RMDIR,        { .vop_rmdir = zfsctl_snapdir_remove }  },
        { VOPNAME_MKDIR,        { .vop_mkdir = zfsctl_snapdir_mkdir }   },
        { VOPNAME_READDIR,      { .vop_readdir = gfs_vop_readdir }      },
        { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_snapdir_lookup } },
        { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
        { VOPNAME_INACTIVE,     { .vop_inactive = zfsctl_snapdir_inactive } },
        { VOPNAME_FID,          { .vop_fid = zfsctl_common_fid }        },
        { NULL }
};
1169 
/*
 * Vnode operation definitions for the shares control directory.
 *
 * Each entry pairs a VOPNAME_* operation name with its handler; the
 * "{ NULL }" entry terminates the table.  getattr, readdir and lookup use
 * the zfsctl_shares_*() handlers, which forward to the znode identified by
 * z_shares_dir.  No rename, rmdir or mkdir handlers are listed here, and
 * inactive uses the generic gfs_vop_inactive().
 *
 * NOTE(review): presumably this is the template from which
 * zfsctl_ops_shares is built; the registration is outside this section.
 */
static const fs_operation_def_t zfsctl_tops_shares[] = {
        { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
        { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
        { VOPNAME_IOCTL,        { .error = fs_inval }                   },
        { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_shares_getattr } },
        { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
        { VOPNAME_READDIR,      { .vop_readdir = zfsctl_shares_readdir } },
        { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_shares_lookup }  },
        { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
        { VOPNAME_INACTIVE,     { .vop_inactive = gfs_vop_inactive } },
        { VOPNAME_FID,          { .vop_fid = zfsctl_shares_fid } },
        { NULL }
};
1183 
1184 /*
1185  * pvp is the GFS vnode '.zfs/snapshot'.
1186  *
1187  * This creates a GFS node under '.zfs/snapshot' representing each
1188  * snapshot.  This newly created GFS node is what we mount snapshot
1189  * vfs_t's ontop of.
1190  */
1191 static vnode_t *
1192 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
1193 {
1194         vnode_t *vp;
1195         zfsctl_node_t *zcp;
1196 
1197         vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
1198             zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
1199         zcp = vp->v_data;
1200         zcp->zc_id = objset;
1201 
1202         return (vp);
1203 }
1204 
1205 static void
1206 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1207 {
1208         zfsctl_snapdir_t *sdp;
1209         zfs_snapentry_t *sep, *next;
1210         vnode_t *dvp;
1211 
1212         VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
1213         sdp = dvp->v_data;
1214 
1215         mutex_enter(&sdp->sd_lock);
1216 
1217         mutex_enter(&vp->v_lock);
1218         if (vp->v_count > 1) {
1219                 vp->v_count--;
1220                 mutex_exit(&vp->v_lock);
1221                 mutex_exit(&sdp->sd_lock);
1222                 VN_RELE(dvp);
1223                 return;
1224         }
1225         mutex_exit(&vp->v_lock);
1226         ASSERT(!vn_ismntpt(vp));
1227 
1228         sep = avl_first(&sdp->sd_snaps);
1229         while (sep != NULL) {
1230                 next = AVL_NEXT(&sdp->sd_snaps, sep);
1231 
1232                 if (sep->se_root == vp) {
1233                         avl_remove(&sdp->sd_snaps, sep);
1234                         kmem_free(sep->se_name, strlen(sep->se_name) + 1);
1235                         kmem_free(sep, sizeof (zfs_snapentry_t));
1236                         break;
1237                 }
1238                 sep = next;
1239         }
1240         ASSERT(sep != NULL);
1241 
1242         mutex_exit(&sdp->sd_lock);
1243         VN_RELE(dvp);
1244 
1245         /*
1246          * Dispose of the vnode for the snapshot mount point.
1247          * This is safe to do because once this entry has been removed
1248          * from the AVL tree, it can't be found again, so cannot become
1249          * "active".  If we lookup the same name again we will end up
1250          * creating a new vnode.
1251          */
1252         gfs_vop_inactive(vp, cr, ct);
1253 }
1254 
1255 
/*
 * Vnode operation definitions for the per-snapshot mount-point nodes
 * created by zfsctl_snapshot_mknode().
 *
 * These VP's should never see the light of day.  They should always
 * be covered.  Accordingly the only operation listed is inactive
 * (zfsctl_snapshot_inactive); the "{ NULL, { NULL } }" entry terminates
 * the table.
 */
static const fs_operation_def_t zfsctl_tops_snapshot[] = {
        { VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive } },
        { NULL, { NULL } }
};
1264 
1265 int
1266 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
1267 {
1268         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1269         vnode_t *dvp, *vp;
1270         zfsctl_snapdir_t *sdp;
1271         zfsctl_node_t *zcp;
1272         zfs_snapentry_t *sep;
1273         int error;
1274 
1275         ASSERT(zfsvfs->z_ctldir != NULL);
1276         error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
1277             NULL, 0, NULL, kcred, NULL, NULL, NULL);
1278         if (error != 0)
1279                 return (error);
1280         sdp = dvp->v_data;
1281 
1282         mutex_enter(&sdp->sd_lock);
1283         sep = avl_first(&sdp->sd_snaps);
1284         while (sep != NULL) {
1285                 vp = sep->se_root;
1286                 zcp = vp->v_data;
1287                 if (zcp->zc_id == objsetid)
1288                         break;
1289 
1290                 sep = AVL_NEXT(&sdp->sd_snaps, sep);
1291         }
1292 
1293         if (sep != NULL) {
1294                 VN_HOLD(vp);
1295                 /*
1296                  * Return the mounted root rather than the covered mount point.
1297                  * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
1298                  * and returns the ZFS vnode mounted on top of the GFS node.
1299                  * This ZFS vnode is the root of the vfs for objset 'objsetid'.
1300                  */
1301                 error = traverse(&vp);
1302                 if (error == 0) {
1303                         if (vp == sep->se_root)
1304                                 error = SET_ERROR(EINVAL);
1305                         else
1306                                 *zfsvfsp = VTOZ(vp)->z_zfsvfs;
1307                 }
1308                 mutex_exit(&sdp->sd_lock);
1309                 VN_RELE(vp);
1310         } else {
1311                 error = SET_ERROR(EINVAL);
1312                 mutex_exit(&sdp->sd_lock);
1313         }
1314 
1315         VN_RELE(dvp);
1316 
1317         return (error);
1318 }
1319 
1320 /*
1321  * Unmount any snapshots for the given filesystem.  This is called from
1322  * zfs_umount() - if we have a ctldir, then go through and unmount all the
1323  * snapshots.
1324  */
1325 int
1326 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
1327 {
1328         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1329         vnode_t *dvp;
1330         zfsctl_snapdir_t *sdp;
1331         zfs_snapentry_t *sep, *next;
1332         int error;
1333 
1334         ASSERT(zfsvfs->z_ctldir != NULL);
1335         error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
1336             NULL, 0, NULL, cr, NULL, NULL, NULL);
1337         if (error != 0)
1338                 return (error);
1339         sdp = dvp->v_data;
1340 
1341         mutex_enter(&sdp->sd_lock);
1342 
1343         sep = avl_first(&sdp->sd_snaps);
1344         while (sep != NULL) {
1345                 next = AVL_NEXT(&sdp->sd_snaps, sep);
1346 
1347                 /*
1348                  * If this snapshot is not mounted, then it must
1349                  * have just been unmounted by somebody else, and
1350                  * will be cleaned up by zfsctl_snapdir_inactive().
1351                  */
1352                 if (vn_ismntpt(sep->se_root)) {
1353                         avl_remove(&sdp->sd_snaps, sep);
1354                         error = zfsctl_unmount_snap(sep, fflags, cr);
1355                         if (error) {
1356                                 avl_add(&sdp->sd_snaps, sep);
1357                                 break;
1358                         }
1359                 }
1360                 sep = next;
1361         }
1362 
1363         mutex_exit(&sdp->sd_lock);
1364         VN_RELE(dvp);
1365 
1366         return (error);
1367 }