1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2013, 2016 Joyent, Inc.  All rights reserved.
  25  */
  26 
  27 /* vnode ops for the /dev/zvol directory */
  28 
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/ddi.h>
  33 #include <sys/sunndi.h>
  34 #include <sys/sunldi.h>
  35 #include <fs/fs_subr.h>
  36 #include <sys/fs/dv_node.h>
  37 #include <sys/fs/sdev_impl.h>
  38 #include <sys/zfs_ioctl.h>
  39 #include <sys/policy.h>
  40 #include <sys/stat.h>
  41 #include <sys/vfs_opreg.h>
  42 
  43 struct vnodeops *devzvol_vnodeops;
  44 static major_t devzvol_major;
  45 static taskq_ent_t devzvol_zclist_task;
  46 
  47 static kmutex_t devzvol_mtx;
  48 /* Below are protected by devzvol_mtx */
  49 static boolean_t devzvol_isopen;
  50 static boolean_t devzvol_zclist_task_running = B_FALSE;
  51 static uint64_t devzvol_gen = 0;
  52 static uint64_t devzvol_zclist;
  53 static size_t devzvol_zclist_size;
  54 static ldi_ident_t devzvol_li;
  55 static ldi_handle_t devzvol_lh;
  56 
  57 /*
  58  * we need to use ddi_mod* since fs/dev gets loaded early on in
  59  * startup(), and linking fs/dev to fs/zfs would drag in a lot of
  60  * other stuff (like drv/random) before the rest of the system is
  61  * ready to go
  62  */
  63 ddi_modhandle_t zfs_mod;
  64 int (*szcm)(char *);
  65 int (*szn2m)(char *, minor_t *);
  66 
  67 int
  68 sdev_zvol_create_minor(char *dsname)
  69 {
  70         if (szcm == NULL)
  71                 return (-1);
  72         return ((*szcm)(dsname));
  73 }
  74 
  75 int
  76 sdev_zvol_name2minor(char *dsname, minor_t *minor)
  77 {
  78         if (szn2m == NULL)
  79                 return (-1);
  80         return ((*szn2m)(dsname, minor));
  81 }
  82 
  83 int
  84 devzvol_open_zfs()
  85 {
  86         int rc;
  87         dev_t dv;
  88 
  89         devzvol_li = ldi_ident_from_anon();
  90         if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
  91             &devzvol_lh, devzvol_li))
  92                 return (-1);
  93         if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
  94             KRTLD_MODE_FIRST, &rc)) == NULL)) {
  95                 return (rc);
  96         }
  97         ASSERT(szcm == NULL && szn2m == NULL);
  98         if ((szcm = (int (*)(char *))
  99             ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
 100                 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
 101                 return (rc);
 102         }
 103         if ((szn2m = (int(*)(char *, minor_t *))
 104             ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
 105                 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
 106                 return (rc);
 107         }
 108         if (ldi_get_dev(devzvol_lh, &dv))
 109                 return (-1);
 110         devzvol_major = getmajor(dv);
 111         return (0);
 112 }
 113 
 114 void
 115 devzvol_close_zfs()
 116 {
 117         szcm = NULL;
 118         szn2m = NULL;
 119         (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
 120         ldi_ident_release(devzvol_li);
 121         if (zfs_mod != NULL) {
 122                 (void) ddi_modclose(zfs_mod);
 123                 zfs_mod = NULL;
 124         }
 125 }
 126 
 127 int
 128 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
 129 {
 130         uint64_t cookie;
 131         int size = 8000;
 132         int unused;
 133         int rc;
 134 
 135         if (cmd != ZFS_IOC_POOL_CONFIGS)
 136                 mutex_enter(&devzvol_mtx);
 137         if (!devzvol_isopen) {
 138                 if ((rc = devzvol_open_zfs()) == 0) {
 139                         devzvol_isopen = B_TRUE;
 140                 } else {
 141                         if (cmd != ZFS_IOC_POOL_CONFIGS)
 142                                 mutex_exit(&devzvol_mtx);
 143                         return (ENXIO);
 144                 }
 145         }
 146         cookie = zc->zc_cookie;
 147 again:
 148         zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
 149             KM_SLEEP);
 150         zc->zc_nvlist_dst_size = size;
 151         rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
 152             &unused);
 153         if (rc == ENOMEM) {
 154                 int newsize;
 155                 newsize = zc->zc_nvlist_dst_size;
 156                 ASSERT(newsize > size);
 157                 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
 158                 size = newsize;
 159                 zc->zc_cookie = cookie;
 160                 goto again;
 161         }
 162         if (alloc_size == NULL)
 163                 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
 164         else
 165                 *alloc_size = size;
 166         if (cmd != ZFS_IOC_POOL_CONFIGS)
 167                 mutex_exit(&devzvol_mtx);
 168         return (rc);
 169 }
 170 
 171 /* figures out if the objset exists and returns its type */
 172 int
 173 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
 174 {
 175         boolean_t       ispool;
 176         zfs_cmd_t       *zc;
 177         int rc;
 178         nvlist_t        *nvl;
 179         size_t nvsz;
 180 
 181         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 182         (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
 183 
 184         nvl = fnvlist_alloc();
 185         fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
 186         zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
 187         zc->zc_nvlist_src_size = nvsz;
 188         fnvlist_free(nvl);
 189 
 190         ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
 191         rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
 192             ZFS_IOC_OBJSET_STATS, zc, NULL);
 193         if (type && rc == 0)
 194                 *type = (ispool) ? DMU_OST_ZFS :
 195                     zc->zc_objset_stats.dds_type;
 196         fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
 197         kmem_free(zc, sizeof (zfs_cmd_t));
 198         return (rc);
 199 }
 200 
 201 /*
 202  * Returns what the zfs dataset name should be, given the /dev/zvol
 203  * path and an optional name (can be NULL).
 204  *
 205  * Note that if the name param is NULL, then path must be an
 206  * actual dataset's directory and not one of the top-level
 207  * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
 208  * specific dataset.
 209  */
 210 char *
 211 devzvol_make_dsname(const char *path, const char *name)
 212 {
 213         char *dsname;
 214         const char *ptr;
 215         int dslen;
 216 
 217         if (strcmp(path, ZVOL_DIR) == 0)
 218                 return (NULL);
 219         if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
 220                 return (NULL);
 221         ptr = path + strlen(ZVOL_DIR);
 222         if (strncmp(ptr, "/dsk", 4) == 0)
 223                 ptr += strlen("/dsk");
 224         else if (strncmp(ptr, "/rdsk", 5) == 0)
 225                 ptr += strlen("/rdsk");
 226         else
 227                 return (NULL);
 228 
 229         if (*ptr == '/')
 230                 ptr++;
 231         else if (name == NULL)
 232                 return (NULL);
 233 
 234         dslen = strlen(ptr);
 235         if (dslen)
 236                 dslen++;                        /* plus null */
 237         if (name)
 238                 dslen += strlen(name) + 1;      /* plus slash */
 239         dsname = kmem_zalloc(dslen, KM_SLEEP);
 240         if (*ptr) {
 241                 (void) strlcpy(dsname, ptr, dslen);
 242                 if (name)
 243                         (void) strlcat(dsname, "/", dslen);
 244         }
 245         if (name)
 246                 (void) strlcat(dsname, name, dslen);
 247         return (dsname);
 248 }
 249 
 250 /*
 251  * check if the zvol's sdev_node is still valid, which means make
 252  * sure the zvol is still valid.  zvol minors aren't proactively
 253  * destroyed when the zvol is destroyed, so we use a validator to clean
 254  * these up (in other words, when such nodes are encountered during
 255  * subsequent lookup() and readdir() operations) so that only valid
 256  * nodes are returned.  The ordering between devname_lookup_func and
 257  * devzvol_validate is a little inefficient in the case of invalid
 258  * or stale nodes because devname_lookup_func calls
 259  * devzvol_create_{dir, link}, then the validator says it's invalid,
 260  * and then the node gets cleaned up.
 261  */
 262 int
 263 devzvol_validate(struct sdev_node *dv)
 264 {
 265         vnode_t *vn = SDEVTOV(dv);
 266         dmu_objset_type_t do_type;
 267         char *dsname;
 268         char *nm = dv->sdev_name;
 269         int rc;
 270 
 271         sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
 272         /*
 273          * validate only READY nodes; if someone is sitting on the
 274          * directory of a dataset that just got destroyed we could
 275          * get a zombie node which we just skip.
 276          */
 277         if (dv->sdev_state != SDEV_READY) {
 278                 sdcmn_err13(("skipping '%s'", nm));
 279                 return (SDEV_VTOR_SKIP);
 280         }
 281 
 282         if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
 283             (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
 284                 return (SDEV_VTOR_VALID);
 285         dsname = devzvol_make_dsname(dv->sdev_path, NULL);
 286         if (dsname == NULL)
 287                 return (SDEV_VTOR_INVALID);
 288 
 289         /*
 290          * Leave any nodes alone that have been explicitly created by
 291          * sdev profiles.
 292          */
 293         if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
 294                 kmem_free(dsname, strlen(dsname) + 1);
 295                 return (SDEV_VTOR_VALID);
 296         }
 297 
 298         rc = devzvol_objset_check(dsname, &do_type);
 299         sdcmn_err13(("  '%s' rc %d", dsname, rc));
 300         if (rc != 0) {
 301                 sdev_node_t *parent = dv->sdev_dotdot;
 302                 /*
 303                  * Explicitly passed-through zvols in our sdev profile can't
 304                  * be created as prof_* shadow nodes, because in the GZ they
 305                  * are symlinks, but in the NGZ they are actual device files.
 306                  *
 307                  * The objset_check will fail on these as they are outside
 308                  * any delegated dataset (zfs will not allow ioctl access to
 309                  * them from this zone). We still want them to work, though.
 310                  */
 311                 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
 312                     parent->sdev_origin != NULL &&
 313                     !(dv->sdev_flags & SDEV_GLOBAL) &&
 314                     (vn->v_type == VBLK || vn->v_type == VCHR) &&
 315                     prof_name_matched(nm, parent)) {
 316                         do_type = DMU_OST_ZVOL;
 317                 } else {
 318                         kmem_free(dsname, strlen(dsname) + 1);
 319                         return (SDEV_VTOR_INVALID);
 320                 }
 321         }
 322 
 323         sdcmn_err13(("  v_type %d do_type %d",
 324             vn->v_type, do_type));
 325         if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
 326             ((vn->v_type == VBLK || vn->v_type == VCHR) &&
 327             do_type != DMU_OST_ZVOL) ||
 328             (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
 329                 kmem_free(dsname, strlen(dsname) + 1);
 330                 return (SDEV_VTOR_STALE);
 331         }
 332         if (vn->v_type == VLNK) {
 333                 char *ptr, *link;
 334                 long val = 0;
 335                 minor_t lminor, ominor;
 336 
 337                 rc = sdev_getlink(vn, &link);
 338                 ASSERT(rc == 0);
 339 
 340                 ptr = strrchr(link, ':') + 1;
 341                 rc = ddi_strtol(ptr, NULL, 10, &val);
 342                 kmem_free(link, strlen(link) + 1);
 343                 ASSERT(rc == 0 && val != 0);
 344                 lminor = (minor_t)val;
 345                 if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
 346                     ominor != lminor) {
 347                         kmem_free(dsname, strlen(dsname) + 1);
 348                         return (SDEV_VTOR_STALE);
 349                 }
 350         }
 351         kmem_free(dsname, strlen(dsname) + 1);
 352         return (SDEV_VTOR_VALID);
 353 }
 354 
 355 /*
 356  * Taskq callback to update the devzvol_zclist.
 357  *
 358  * We need to defer this to the taskq to avoid it running with a user
 359  * context that might be associated with some non-global zone, and thus
 360  * not being able to list all of the pools on the entire system.
 361  */
 362 /*ARGSUSED*/
 363 static void
 364 devzvol_update_zclist_cb(void *arg)
 365 {
 366         zfs_cmd_t       *zc;
 367         int             rc;
 368         size_t          size;
 369 
 370         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 371         mutex_enter(&devzvol_mtx);
 372         zc->zc_cookie = devzvol_gen;
 373 
 374         rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
 375         switch (rc) {
 376                 case 0:
 377                         /* new generation */
 378                         ASSERT(devzvol_gen != zc->zc_cookie);
 379                         devzvol_gen = zc->zc_cookie;
 380                         if (devzvol_zclist)
 381                                 kmem_free((void *)(uintptr_t)devzvol_zclist,
 382                                     devzvol_zclist_size);
 383                         devzvol_zclist = zc->zc_nvlist_dst;
 384                         /* Keep the alloc'd size, not the nvlist size. */
 385                         devzvol_zclist_size = size;
 386                         break;
 387                 default:
 388                         /*
 389                          * Either there was no change in pool configuration
 390                          * since we last asked (rc == EEXIST) or we got a
 391                          * catastrophic error.
 392                          *
 393                          * Give up memory and exit.
 394                          */
 395                         kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
 396                             size);
 397                         break;
 398         }
 399 
 400         VERIFY(devzvol_zclist_task_running == B_TRUE);
 401         devzvol_zclist_task_running = B_FALSE;
 402         mutex_exit(&devzvol_mtx);
 403 
 404         kmem_free(zc, sizeof (zfs_cmd_t));
 405 }
 406 
 407 static void
 408 devzvol_update_zclist(void)
 409 {
 410         mutex_enter(&devzvol_mtx);
 411         if (devzvol_zclist_task_running == B_TRUE) {
 412                 mutex_exit(&devzvol_mtx);
 413                 goto wait;
 414         }
 415 
 416         devzvol_zclist_task_running = B_TRUE;
 417 
 418         taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
 419             &devzvol_zclist_task);
 420 
 421         mutex_exit(&devzvol_mtx);
 422 
 423 wait:
 424         taskq_wait(sdev_taskq);
 425 }
 426 
 427 /*
 428  * Creates sub-directories for each zpool as needed in response to a
 429  * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
 430  */
 431 void
 432 devzvol_create_pool_dirs(struct vnode *dvp)
 433 {
 434         nvlist_t *nv = NULL;
 435         nvpair_t *elem = NULL;
 436         int pools = 0;
 437         int rc;
 438 
 439         sdcmn_err13(("devzvol_create_pool_dirs"));
 440 
 441         devzvol_update_zclist();
 442 
 443         mutex_enter(&devzvol_mtx);
 444 
 445         rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
 446             devzvol_zclist_size, &nv, 0);
 447         if (rc) {
 448                 ASSERT(rc == 0);
 449                 kmem_free((void *)(uintptr_t)devzvol_zclist,
 450                     devzvol_zclist_size);
 451                 devzvol_gen = 0;
 452                 devzvol_zclist = NULL;
 453                 devzvol_zclist_size = 0;
 454                 goto out;
 455         }
 456         mutex_exit(&devzvol_mtx);
 457         while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 458                 struct vnode *vp;
 459                 ASSERT(dvp->v_count > 0);
 460                 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
 461                     NULL, kcred, NULL, 0, NULL);
 462                 /* should either work, or not be visible from a zone */
 463                 ASSERT(rc == 0 || rc == ENOENT);
 464                 if (rc == 0)
 465                         VN_RELE(vp);
 466                 pools++;
 467         }
 468         nvlist_free(nv);
 469         mutex_enter(&devzvol_mtx);
 470         if (devzvol_isopen && pools == 0) {
 471                 /* clean up so zfs can be unloaded */
 472                 devzvol_close_zfs();
 473                 devzvol_isopen = B_FALSE;
 474         }
 475 out:
 476         mutex_exit(&devzvol_mtx);
 477 }
 478 
 479 /*ARGSUSED3*/
 480 static int
 481 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
 482     cred_t *cred, void *whatever, char *whichever)
 483 {
 484         timestruc_t now;
 485         struct vattr *vap = (struct vattr *)arg;
 486 
 487         sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
 488             ddv->sdev_path, nm));
 489         ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
 490             strlen(ZVOL_DIR)) == 0);
 491         *vap = *sdev_getdefault_attr(VDIR);
 492         gethrestime(&now);
 493         vap->va_atime = now;
 494         vap->va_mtime = now;
 495         vap->va_ctime = now;
 496         return (0);
 497 }
 498 
 499 /*ARGSUSED3*/
 500 static int
 501 devzvol_create_link(struct sdev_node *ddv, char *nm,
 502     void **arg, cred_t *cred, void *whatever, char *whichever)
 503 {
 504         minor_t minor;
 505         char *pathname = (char *)*arg;
 506         int rc;
 507         char *dsname;
 508         char *x;
 509         char str[MAXNAMELEN];
 510         sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
 511             ddv->sdev_path, nm));
 512         dsname = devzvol_make_dsname(ddv->sdev_path, nm);
 513         rc = sdev_zvol_create_minor(dsname);
 514         if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
 515             sdev_zvol_name2minor(dsname, &minor)) {
 516                 sdcmn_err13(("devzvol_create_link %d", rc));
 517                 kmem_free(dsname, strlen(dsname) + 1);
 518                 return (-1);
 519         }
 520         kmem_free(dsname, strlen(dsname) + 1);
 521 
 522         /*
 523          * This is a valid zvol; create a symlink that points to the
 524          * minor which was created under /devices/pseudo/zfs@0
 525          */
 526         *pathname = '\0';
 527         for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
 528                 (void) strcat(pathname, "../");
 529         (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
 530         (void) strncat(pathname, str, MAXPATHLEN);
 531         if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
 532             strlen(ZVOL_FULL_RDEV_DIR)) == 0)
 533                 (void) strcat(pathname, ",raw");
 534         return (0);
 535 }
 536 
 537 /* Clean zvol sdev_nodes that are no longer valid.  */
 538 static void
 539 devzvol_prunedir(struct sdev_node *ddv)
 540 {
 541         struct sdev_node *dv;
 542 
 543         ASSERT(RW_READ_HELD(&ddv->sdev_contents));
 544 
 545         sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
 546         ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 547         if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
 548                 rw_exit(&ddv->sdev_contents);
 549                 rw_enter(&ddv->sdev_contents, RW_WRITER);
 550         }
 551 
 552         dv = SDEV_FIRST_ENTRY(ddv);
 553         while (dv) {
 554                 sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
 555 
 556                 switch (devzvol_validate(dv)) {
 557                 case SDEV_VTOR_VALID:
 558                 case SDEV_VTOR_SKIP:
 559                         dv = SDEV_NEXT_ENTRY(ddv, dv);
 560                         continue;
 561                 case SDEV_VTOR_INVALID:
 562                         sdcmn_err7(("prunedir: destroy invalid "
 563                             "node: %s\n", dv->sdev_name));
 564                         break;
 565                 }
 566 
 567                 if ((SDEVTOV(dv)->v_type == VDIR) &&
 568                     (sdev_cleandir(dv, NULL, 0) != 0)) {
 569                         dv = SDEV_NEXT_ENTRY(ddv, dv);
 570                         continue;
 571                 }
 572                 SDEV_HOLD(dv);
 573                 /* remove the cache node */
 574                 sdev_cache_update(ddv, &dv, dv->sdev_name,
 575                     SDEV_CACHE_DELETE);
 576                 SDEV_RELE(dv);
 577                 dv = SDEV_FIRST_ENTRY(ddv);
 578         }
 579         rw_downgrade(&ddv->sdev_contents);
 580 }
 581 
 582 /*
 583  * This function is used to create a dir or dev inside a zone's /dev when the
 584  * zone has a zvol that is dynamically created within the zone (i.e. inside
 585  * of a delegated dataset.  Since there is no /devices tree within a zone,
 586  * we create the chr/blk devices directly inside the zone's /dev instead of
 587  * making symlinks.
 588  */
 589 static int
 590 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
 591 {
 592         struct vattr vattr;
 593         timestruc_t now;
 594         enum vtype expected_type = VDIR;
 595         dmu_objset_type_t do_type;
 596         struct sdev_node *dv = NULL;
 597         int res;
 598         char *dsname;
 599 
 600         bzero(&vattr, sizeof (vattr));
 601         gethrestime(&now);
 602         vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
 603         vattr.va_uid = SDEV_UID_DEFAULT;
 604         vattr.va_gid = SDEV_GID_DEFAULT;
 605         vattr.va_type = VNON;
 606         vattr.va_atime = now;
 607         vattr.va_mtime = now;
 608         vattr.va_ctime = now;
 609 
 610         if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
 611                 return (ENOENT);
 612 
 613         if (devzvol_objset_check(dsname, &do_type) != 0) {
 614                 /*
 615                  * objset_check will succeed on any valid objset in the global
 616                  * zone, and any valid delegated dataset. It will fail, however,
 617                  * in non-global zones on explicitly whitelisted zvol devices
 618                  * that are outside any delegated dataset.
 619                  *
 620                  * The directories leading up to the zvol device itself will be
 621                  * created by prof for us in advance (and will always validate
 622                  * because of the matching check in devzvol_validate). The zvol
 623                  * device itself can't be created by prof though because in the
 624                  * GZ it's a symlink, and in the NGZ it is not. So, we create
 625                  * such zvol device files here.
 626                  */
 627                 if (!(parent->sdev_flags & SDEV_GLOBAL) &&
 628                     parent->sdev_origin != NULL &&
 629                     prof_name_matched(nm, parent)) {
 630                         do_type = DMU_OST_ZVOL;
 631                 } else {
 632                         kmem_free(dsname, strlen(dsname) + 1);
 633                         return (ENOENT);
 634                 }
 635         }
 636 
 637         if (do_type == DMU_OST_ZVOL)
 638                 expected_type = VBLK;
 639 
 640         if (expected_type == VDIR) {
 641                 vattr.va_type = VDIR;
 642                 vattr.va_mode = SDEV_DIRMODE_DEFAULT;
 643         } else {
 644                 minor_t minor;
 645                 dev_t devnum;
 646                 int rc;
 647 
 648                 rc = sdev_zvol_create_minor(dsname);
 649                 if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
 650                     sdev_zvol_name2minor(dsname, &minor)) {
 651                         kmem_free(dsname, strlen(dsname) + 1);
 652                         return (ENOENT);
 653                 }
 654 
 655                 devnum = makedevice(devzvol_major, minor);
 656                 vattr.va_rdev = devnum;
 657 
 658                 if (strstr(parent->sdev_path, "/rdsk/") != NULL)
 659                         vattr.va_type = VCHR;
 660                 else
 661                         vattr.va_type = VBLK;
 662                 vattr.va_mode = SDEV_DEVMODE_DEFAULT;
 663         }
 664         kmem_free(dsname, strlen(dsname) + 1);
 665 
 666         rw_enter(&parent->sdev_contents, RW_WRITER);
 667 
 668         res = sdev_mknode(parent, nm, &dv, &vattr,
 669             NULL, NULL, kcred, SDEV_READY);
 670         rw_exit(&parent->sdev_contents);
 671         if (res != 0)
 672                 return (ENOENT);
 673 
 674         SDEV_RELE(dv);
 675         return (0);
 676 }
 677 
 678 /*ARGSUSED*/
 679 static int
 680 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 681     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
 682     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
 683 {
 684         enum vtype expected_type = VDIR;
 685         struct sdev_node *parent = VTOSDEV(dvp);
 686         char *dsname;
 687         dmu_objset_type_t do_type;
 688         int error;
 689 
 690         sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
 691         *vpp = NULL;
 692         /* execute access is required to search the directory */
 693         if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
 694                 return (error);
 695 
 696         rw_enter(&parent->sdev_contents, RW_READER);
 697         if (!SDEV_IS_GLOBAL(parent)) {
 698                 int res;
 699 
 700                 rw_exit(&parent->sdev_contents);
 701 
 702                 /*
 703                  * If we're in the global zone and reach down into a non-global
 704                  * zone's /dev/zvol then this action could trigger the creation
 705                  * of all of the zvol devices for every zone into the non-global
 706                  * zone's /dev tree. This could be a big security hole. To
 707                  * prevent this, disallow the global zone from looking inside
 708                  * a non-global zones /dev/zvol. This behavior is similar to
 709                  * delegated datasets, which cannot be used by the global zone.
 710                  */
 711                 if (getzoneid() == GLOBAL_ZONEID)
 712                         return (EPERM);
 713 
 714                 res = prof_lookup(dvp, nm, vpp, cred);
 715 
 716                 /*
 717                  * We won't find a zvol that was dynamically created inside
 718                  * a NGZ, within a delegated dataset, in the zone's dev profile
 719                  * but prof_lookup will also find it via sdev_cache_lookup.
 720                  */
 721                 if (res == ENOENT) {
 722                         /*
 723                          * We have to create the sdev node for the dymamically
 724                          * created zvol.
 725                          */
 726                         if (devzvol_mk_ngz_node(parent, nm) != 0)
 727                                 return (ENOENT);
 728                         res = prof_lookup(dvp, nm, vpp, cred);
 729                 }
 730 
 731                 return (res);
 732         }
 733 
 734         /*
 735          * Don't let the global-zone style lookup succeed here when we're not
 736          * running in the global zone. This can happen because prof calls into
 737          * us (in prof_filldir) trying to create an explicitly passed-through
 738          * zvol device outside any delegated dataset.
 739          *
 740          * We have to stop this here or else we will create prof shadows of
 741          * the global zone symlink, which will make no sense at all in the
 742          * non-global zone (it has no /devices for the symlink to point at).
 743          *
 744          * These zvols will be created later (at access time) by mk_ngz_node
 745          * instead. The dirs leading up to them will be created by prof
 746          * internally.
 747          *
 748          * We have to return EPERM here, because ENOENT is given special
 749          * meaning by prof in this context.
 750          */
 751         if (getzoneid() != GLOBAL_ZONEID) {
 752                 rw_exit(&parent->sdev_contents);
 753                 return (EPERM);
 754         }
 755 
 756         dsname = devzvol_make_dsname(parent->sdev_path, nm);
 757         rw_exit(&parent->sdev_contents);
 758         sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
 759         if (dsname) {
 760                 error = devzvol_objset_check(dsname, &do_type);
 761                 if (error != 0) {
 762                         error = ENOENT;
 763                         goto out;
 764                 }
 765                 if (do_type == DMU_OST_ZVOL)
 766                         expected_type = VLNK;
 767         }
 768         /*
 769          * the callbacks expect:
 770          *
 771          * parent->sdev_path            nm
 772          * /dev/zvol                       {r}dsk
 773          * /dev/zvol/{r}dsk                <pool name>
 774          * /dev/zvol/{r}dsk/<dataset name> <last ds component>
 775          *
 776          * sdev_name is always last path component of sdev_path
 777          */
 778         if (expected_type == VDIR) {
 779                 error = devname_lookup_func(parent, nm, vpp, cred,
 780                     devzvol_create_dir, SDEV_VATTR);
 781         } else {
 782                 error = devname_lookup_func(parent, nm, vpp, cred,
 783                     devzvol_create_link, SDEV_VLINK);
 784         }
 785         sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
 786         ASSERT(error || ((*vpp)->v_type == expected_type));
 787 out:
 788         if (dsname)
 789                 kmem_free(dsname, strlen(dsname) + 1);
 790         sdcmn_err13(("devzvol_lookup %d", error));
 791         return (error);
 792 }
 793 
 794 /*
 795  * We allow create to find existing nodes
 796  *      - if the node doesn't exist - EROFS
 797  *      - creating an existing dir read-only succeeds, otherwise EISDIR
 798  *      - exclusive creates fail - EEXIST
 799  */
 800 /*ARGSUSED2*/
 801 static int
 802 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
 803     int mode, struct vnode **vpp, struct cred *cred, int flag,
 804     caller_context_t *ct, vsecattr_t *vsecp)
 805 {
 806         int error;
 807         struct vnode *vp;
 808 
 809         *vpp = NULL;
 810 
 811         error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
 812             NULL);
 813         if (error == 0) {
 814                 if (excl == EXCL)
 815                         error = EEXIST;
 816                 else if (vp->v_type == VDIR && (mode & VWRITE))
 817                         error = EISDIR;
 818                 else
 819                         error = VOP_ACCESS(vp, mode, 0, cred, ct);
 820 
 821                 if (error) {
 822                         VN_RELE(vp);
 823                 } else
 824                         *vpp = vp;
 825         } else if (error == ENOENT) {
 826                 error = EROFS;
 827         }
 828 
 829         return (error);
 830 }
 831 
 832 void sdev_iter_snapshots(struct vnode *dvp, char *name);
 833 
 834 void
 835 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
 836 {
 837         zfs_cmd_t       *zc;
 838         int rc;
 839 
 840         sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
 841         zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 842         (void) strcpy(zc->zc_name, name);
 843 
 844         while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
 845                 struct vnode *vpp;
 846                 char *ptr;
 847 
 848                 sdcmn_err13(("  name %s", zc->zc_name));
 849                 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
 850                         goto skip;
 851                 ptr = strrchr(zc->zc_name, '/') + 1;
 852                 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
 853                     kcred, NULL, NULL, NULL);
 854                 if (rc == 0) {
 855                         VN_RELE(vpp);
 856                 } else if (rc == ENOENT) {
 857                         goto skip;
 858                 } else {
 859                         /*
 860                          * EBUSY == problem with zvols's dmu holds?
 861                          * EPERM when in a NGZ and traversing up and out.
 862                          */
 863                         goto skip;
 864                 }
 865                 if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
 866                     zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
 867                         sdev_iter_snapshots(dvp, zc->zc_name);
 868 skip:
 869                 (void) strcpy(zc->zc_name, name);
 870         }
 871         kmem_free(zc, sizeof (zfs_cmd_t));
 872 }
 873 
 874 void
 875 sdev_iter_snapshots(struct vnode *dvp, char *name)
 876 {
 877         sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
 878 }
 879 
 880 /*ARGSUSED4*/
 881 static int
 882 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
 883     int *eofp, caller_context_t *ct_unused, int flags_unused)
 884 {
 885         struct sdev_node *sdvp = VTOSDEV(dvp);
 886         char *ptr;
 887 
 888         sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
 889             sdvp->sdev_name));
 890 
 891         if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
 892                 struct vnode *vp;
 893 
 894                 rw_exit(&sdvp->sdev_contents);
 895                 (void) devname_lookup_func(sdvp, "dsk", &vp, cred,
 896                     devzvol_create_dir, SDEV_VATTR);
 897                 VN_RELE(vp);
 898                 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
 899                     devzvol_create_dir, SDEV_VATTR);
 900                 VN_RELE(vp);
 901                 rw_enter(&sdvp->sdev_contents, RW_READER);
 902                 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 903         }
 904         if (uiop->uio_offset == 0)
 905                 devzvol_prunedir(sdvp);
 906         ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
 907         if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
 908                 rw_exit(&sdvp->sdev_contents);
 909                 devzvol_create_pool_dirs(dvp);
 910                 rw_enter(&sdvp->sdev_contents, RW_READER);
 911                 return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 912         }
 913 
 914         ptr = strchr(ptr + 1, '/');
 915         if (ptr == NULL)
 916                 return (ENOENT);
 917         ptr++;
 918         rw_exit(&sdvp->sdev_contents);
 919         sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
 920         rw_enter(&sdvp->sdev_contents, RW_READER);
 921         return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
 922 }
 923 
 924 const fs_operation_def_t devzvol_vnodeops_tbl[] = {
 925         VOPNAME_READDIR,        { .vop_readdir = devzvol_readdir },
 926         VOPNAME_LOOKUP,         { .vop_lookup = devzvol_lookup },
 927         VOPNAME_CREATE,         { .vop_create = devzvol_create },
 928         VOPNAME_RENAME,         { .error = fs_nosys },
 929         VOPNAME_MKDIR,          { .error = fs_nosys },
 930         VOPNAME_RMDIR,          { .error = fs_nosys },
 931         VOPNAME_REMOVE,         { .error = fs_nosys },
 932         VOPNAME_SYMLINK,        { .error = fs_nosys },
 933         NULL,                   NULL
 934 };