1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2013, 2016 Joyent, Inc. All rights reserved. 25 */ 26 27 /* vnode ops for the /dev/zvol directory */ 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/ddi.h> 33 #include <sys/sunndi.h> 34 #include <sys/sunldi.h> 35 #include <fs/fs_subr.h> 36 #include <sys/fs/dv_node.h> 37 #include <sys/fs/sdev_impl.h> 38 #include <sys/zfs_ioctl.h> 39 #include <sys/policy.h> 40 #include <sys/stat.h> 41 #include <sys/vfs_opreg.h> 42 43 struct vnodeops *devzvol_vnodeops; 44 static major_t devzvol_major; 45 static taskq_ent_t devzvol_zclist_task; 46 47 static kmutex_t devzvol_mtx; 48 /* Below are protected by devzvol_mtx */ 49 static boolean_t devzvol_isopen; 50 static boolean_t devzvol_zclist_task_running = B_FALSE; 51 static uint64_t devzvol_gen = 0; 52 static uint64_t devzvol_zclist; 53 static size_t devzvol_zclist_size; 54 static ldi_ident_t devzvol_li; 55 static ldi_handle_t devzvol_lh; 56 57 /* 58 * we need to use ddi_mod* since fs/dev gets loaded early on in 59 * startup(), and linking fs/dev to fs/zfs would drag in a lot of 60 * other stuff (like drv/random) before the rest of the system is 61 * ready to go 62 */ 63 ddi_modhandle_t zfs_mod; 64 int (*szcm)(char *); 65 int (*szn2m)(char *, minor_t *); 66 67 int 68 sdev_zvol_create_minor(char *dsname) 69 { 70 if (szcm == NULL) 71 return (-1); 72 return ((*szcm)(dsname)); 73 } 74 75 int 76 sdev_zvol_name2minor(char *dsname, minor_t *minor) 77 { 78 if (szn2m == NULL) 79 return (-1); 80 return ((*szn2m)(dsname, minor)); 81 } 82 83 int 84 devzvol_open_zfs() 85 { 86 int rc; 87 dev_t dv; 88 89 devzvol_li = ldi_ident_from_anon(); 90 if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred, 91 &devzvol_lh, devzvol_li)) 92 return (-1); 93 if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs", 94 KRTLD_MODE_FIRST, &rc)) == NULL)) { 95 return (rc); 96 } 97 ASSERT(szcm == NULL && szn2m == NULL); 98 if ((szcm = (int (*)(char *)) 99 ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) { 100 cmn_err(CE_WARN, "couldn't resolve zvol_create_minor"); 101 return (rc); 102 } 103 if ((szn2m = (int(*)(char *, minor_t *)) 104 ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) { 105 cmn_err(CE_WARN, "couldn't resolve zvol_name2minor"); 106 return (rc); 107 } 108 if (ldi_get_dev(devzvol_lh, &dv)) 109 return (-1); 110 devzvol_major = getmajor(dv); 111 return (0); 112 } 113 114 void 115 devzvol_close_zfs() 116 { 117 szcm = NULL; 118 szn2m = NULL; 119 (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred); 120 ldi_ident_release(devzvol_li); 121 if (zfs_mod != NULL) { 122 (void) ddi_modclose(zfs_mod); 123 zfs_mod = NULL; 124 } 125 } 126 127 int 128 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size) 129 { 130 uint64_t cookie; 131 int size = 8000; 132 int unused; 133 int rc; 134 135 if (cmd != ZFS_IOC_POOL_CONFIGS) 136 mutex_enter(&devzvol_mtx); 137 if (!devzvol_isopen) { 138 if ((rc = devzvol_open_zfs()) == 0) { 139 devzvol_isopen = B_TRUE; 140 } else { 141 if (cmd != ZFS_IOC_POOL_CONFIGS) 142 mutex_exit(&devzvol_mtx); 143 return (ENXIO); 144 } 145 } 146 cookie = zc->zc_cookie; 147 again: 148 zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size, 149 KM_SLEEP); 150 zc->zc_nvlist_dst_size = size; 151 rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred, 152 &unused); 153 if (rc == ENOMEM) { 154 int newsize; 155 newsize = zc->zc_nvlist_dst_size; 156 ASSERT(newsize > size); 157 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); 158 size = newsize; 159 zc->zc_cookie = cookie; 160 goto again; 161 } 162 if (alloc_size == NULL) 163 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); 164 else 165 *alloc_size = size; 166 if (cmd != ZFS_IOC_POOL_CONFIGS) 167 mutex_exit(&devzvol_mtx); 168 return (rc); 169 } 170 171 /* figures out if the objset exists and returns its type */ 172 int 173 devzvol_objset_check(char *dsname, dmu_objset_type_t *type) 174 { 175 boolean_t ispool; 176 zfs_cmd_t *zc; 177 int rc; 178 nvlist_t *nvl; 179 size_t nvsz; 180 181 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); 182 (void) strlcpy(zc->zc_name, dsname, MAXPATHLEN); 183 184 nvl = fnvlist_alloc(); 185 fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE); 186 zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz); 187 zc->zc_nvlist_src_size = nvsz; 188 fnvlist_free(nvl); 189 190 ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE; 191 rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS : 192 ZFS_IOC_OBJSET_STATS, zc, NULL); 193 if (type && rc == 0) 194 *type = (ispool) ? DMU_OST_ZFS : 195 zc->zc_objset_stats.dds_type; 196 fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz); 197 kmem_free(zc, sizeof (zfs_cmd_t)); 198 return (rc); 199 } 200 201 /* 202 * Returns what the zfs dataset name should be, given the /dev/zvol 203 * path and an optional name (can be NULL). 204 * 205 * Note that if the name param is NULL, then path must be an 206 * actual dataset's directory and not one of the top-level 207 * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a 208 * specific dataset. 209 */ 210 char * 211 devzvol_make_dsname(const char *path, const char *name) 212 { 213 char *dsname; 214 const char *ptr; 215 int dslen; 216 217 if (strcmp(path, ZVOL_DIR) == 0) 218 return (NULL); 219 if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)) 220 return (NULL); 221 ptr = path + strlen(ZVOL_DIR); 222 if (strncmp(ptr, "/dsk", 4) == 0) 223 ptr += strlen("/dsk"); 224 else if (strncmp(ptr, "/rdsk", 5) == 0) 225 ptr += strlen("/rdsk"); 226 else 227 return (NULL); 228 229 if (*ptr == '/') 230 ptr++; 231 else if (name == NULL) 232 return (NULL); 233 234 dslen = strlen(ptr); 235 if (dslen) 236 dslen++; /* plus null */ 237 if (name) 238 dslen += strlen(name) + 1; /* plus slash */ 239 dsname = kmem_zalloc(dslen, KM_SLEEP); 240 if (*ptr) { 241 (void) strlcpy(dsname, ptr, dslen); 242 if (name) 243 (void) strlcat(dsname, "/", dslen); 244 } 245 if (name) 246 (void) strlcat(dsname, name, dslen); 247 return (dsname); 248 } 249 250 /* 251 * check if the zvol's sdev_node is still valid, which means make 252 * sure the zvol is still valid. zvol minors aren't proactively 253 * destroyed when the zvol is destroyed, so we use a validator to clean 254 * these up (in other words, when such nodes are encountered during 255 * subsequent lookup() and readdir() operations) so that only valid 256 * nodes are returned. The ordering between devname_lookup_func and 257 * devzvol_validate is a little inefficient in the case of invalid 258 * or stale nodes because devname_lookup_func calls 259 * devzvol_create_{dir, link}, then the validator says it's invalid, 260 * and then the node gets cleaned up. 261 */ 262 int 263 devzvol_validate(struct sdev_node *dv) 264 { 265 vnode_t *vn = SDEVTOV(dv); 266 dmu_objset_type_t do_type; 267 char *dsname; 268 char *nm = dv->sdev_name; 269 int rc; 270 271 sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm)); 272 /* 273 * validate only READY nodes; if someone is sitting on the 274 * directory of a dataset that just got destroyed we could 275 * get a zombie node which we just skip. 276 */ 277 if (dv->sdev_state != SDEV_READY) { 278 sdcmn_err13(("skipping '%s'", nm)); 279 return (SDEV_VTOR_SKIP); 280 } 281 282 if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) || 283 (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0)) 284 return (SDEV_VTOR_VALID); 285 dsname = devzvol_make_dsname(dv->sdev_path, NULL); 286 if (dsname == NULL) 287 return (SDEV_VTOR_INVALID); 288 289 /* 290 * Leave any nodes alone that have been explicitly created by 291 * sdev profiles. 292 */ 293 if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) { 294 kmem_free(dsname, strlen(dsname) + 1); 295 return (SDEV_VTOR_VALID); 296 } 297 298 rc = devzvol_objset_check(dsname, &do_type); 299 sdcmn_err13((" '%s' rc %d", dsname, rc)); 300 if (rc != 0) { 301 sdev_node_t *parent = dv->sdev_dotdot; 302 /* 303 * Explicitly passed-through zvols in our sdev profile can't 304 * be created as prof_* shadow nodes, because in the GZ they 305 * are symlinks, but in the NGZ they are actual device files. 306 * 307 * The objset_check will fail on these as they are outside 308 * any delegated dataset (zfs will not allow ioctl access to 309 * them from this zone). We still want them to work, though. 310 */ 311 if (!(parent->sdev_flags & SDEV_GLOBAL) && 312 parent->sdev_origin != NULL && 313 !(dv->sdev_flags & SDEV_GLOBAL) && 314 (vn->v_type == VBLK || vn->v_type == VCHR) && 315 prof_name_matched(nm, parent)) { 316 do_type = DMU_OST_ZVOL; 317 } else { 318 kmem_free(dsname, strlen(dsname) + 1); 319 return (SDEV_VTOR_INVALID); 320 } 321 } 322 323 sdcmn_err13((" v_type %d do_type %d", 324 vn->v_type, do_type)); 325 if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) || 326 ((vn->v_type == VBLK || vn->v_type == VCHR) && 327 do_type != DMU_OST_ZVOL) || 328 (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) { 329 kmem_free(dsname, strlen(dsname) + 1); 330 return (SDEV_VTOR_STALE); 331 } 332 if (vn->v_type == VLNK) { 333 char *ptr, *link; 334 long val = 0; 335 minor_t lminor, ominor; 336 337 rc = sdev_getlink(vn, &link); 338 ASSERT(rc == 0); 339 340 ptr = strrchr(link, ':') + 1; 341 rc = ddi_strtol(ptr, NULL, 10, &val); 342 kmem_free(link, strlen(link) + 1); 343 ASSERT(rc == 0 && val != 0); 344 lminor = (minor_t)val; 345 if (sdev_zvol_name2minor(dsname, &ominor) < 0 || 346 ominor != lminor) { 347 kmem_free(dsname, strlen(dsname) + 1); 348 return (SDEV_VTOR_STALE); 349 } 350 } 351 kmem_free(dsname, strlen(dsname) + 1); 352 return (SDEV_VTOR_VALID); 353 } 354 355 /* 356 * Taskq callback to update the devzvol_zclist. 357 * 358 * We need to defer this to the taskq to avoid it running with a user 359 * context that might be associated with some non-global zone, and thus 360 * not being able to list all of the pools on the entire system. 361 */ 362 /*ARGSUSED*/ 363 static void 364 devzvol_update_zclist_cb(void *arg) 365 { 366 zfs_cmd_t *zc; 367 int rc; 368 size_t size; 369 370 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); 371 mutex_enter(&devzvol_mtx); 372 zc->zc_cookie = devzvol_gen; 373 374 rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size); 375 switch (rc) { 376 case 0: 377 /* new generation */ 378 ASSERT(devzvol_gen != zc->zc_cookie); 379 devzvol_gen = zc->zc_cookie; 380 if (devzvol_zclist) 381 kmem_free((void *)(uintptr_t)devzvol_zclist, 382 devzvol_zclist_size); 383 devzvol_zclist = zc->zc_nvlist_dst; 384 /* Keep the alloc'd size, not the nvlist size. */ 385 devzvol_zclist_size = size; 386 break; 387 default: 388 /* 389 * Either there was no change in pool configuration 390 * since we last asked (rc == EEXIST) or we got a 391 * catastrophic error. 392 * 393 * Give up memory and exit. 394 */ 395 kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, 396 size); 397 break; 398 } 399 400 VERIFY(devzvol_zclist_task_running == B_TRUE); 401 devzvol_zclist_task_running = B_FALSE; 402 mutex_exit(&devzvol_mtx); 403 404 kmem_free(zc, sizeof (zfs_cmd_t)); 405 } 406 407 static void 408 devzvol_update_zclist(void) 409 { 410 mutex_enter(&devzvol_mtx); 411 if (devzvol_zclist_task_running == B_TRUE) { 412 mutex_exit(&devzvol_mtx); 413 goto wait; 414 } 415 416 devzvol_zclist_task_running = B_TRUE; 417 418 taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0, 419 &devzvol_zclist_task); 420 421 mutex_exit(&devzvol_mtx); 422 423 wait: 424 taskq_wait(sdev_taskq); 425 } 426 427 /* 428 * Creates sub-directories for each zpool as needed in response to a 429 * readdir on one of the /dev/zvol/{dsk,rdsk} directories. 430 */ 431 void 432 devzvol_create_pool_dirs(struct vnode *dvp) 433 { 434 nvlist_t *nv = NULL; 435 nvpair_t *elem = NULL; 436 int pools = 0; 437 int rc; 438 439 sdcmn_err13(("devzvol_create_pool_dirs")); 440 441 devzvol_update_zclist(); 442 443 mutex_enter(&devzvol_mtx); 444 445 rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist, 446 devzvol_zclist_size, &nv, 0); 447 if (rc) { 448 ASSERT(rc == 0); 449 kmem_free((void *)(uintptr_t)devzvol_zclist, 450 devzvol_zclist_size); 451 devzvol_gen = 0; 452 devzvol_zclist = NULL; 453 devzvol_zclist_size = 0; 454 goto out; 455 } 456 mutex_exit(&devzvol_mtx); 457 while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { 458 struct vnode *vp; 459 ASSERT(dvp->v_count > 0); 460 rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, 461 NULL, kcred, NULL, 0, NULL); 462 /* should either work, or not be visible from a zone */ 463 ASSERT(rc == 0 || rc == ENOENT); 464 if (rc == 0) 465 VN_RELE(vp); 466 pools++; 467 } 468 nvlist_free(nv); 469 mutex_enter(&devzvol_mtx); 470 if (devzvol_isopen && pools == 0) { 471 /* clean up so zfs can be unloaded */ 472 devzvol_close_zfs(); 473 devzvol_isopen = B_FALSE; 474 } 475 out: 476 mutex_exit(&devzvol_mtx); 477 } 478 479 /*ARGSUSED3*/ 480 static int 481 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg, 482 cred_t *cred, void *whatever, char *whichever) 483 { 484 timestruc_t now; 485 struct vattr *vap = (struct vattr *)arg; 486 487 sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name, 488 ddv->sdev_path, nm)); 489 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, 490 strlen(ZVOL_DIR)) == 0); 491 *vap = *sdev_getdefault_attr(VDIR); 492 gethrestime(&now); 493 vap->va_atime = now; 494 vap->va_mtime = now; 495 vap->va_ctime = now; 496 return (0); 497 } 498 499 /*ARGSUSED3*/ 500 static int 501 devzvol_create_link(struct sdev_node *ddv, char *nm, 502 void **arg, cred_t *cred, void *whatever, char *whichever) 503 { 504 minor_t minor; 505 char *pathname = (char *)*arg; 506 int rc; 507 char *dsname; 508 char *x; 509 char str[MAXNAMELEN]; 510 sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name, 511 ddv->sdev_path, nm)); 512 dsname = devzvol_make_dsname(ddv->sdev_path, nm); 513 rc = sdev_zvol_create_minor(dsname); 514 if ((rc != 0 && rc != EEXIST && rc != EBUSY) || 515 sdev_zvol_name2minor(dsname, &minor)) { 516 sdcmn_err13(("devzvol_create_link %d", rc)); 517 kmem_free(dsname, strlen(dsname) + 1); 518 return (-1); 519 } 520 kmem_free(dsname, strlen(dsname) + 1); 521 522 /* 523 * This is a valid zvol; create a symlink that points to the 524 * minor which was created under /devices/pseudo/zfs@0 525 */ 526 *pathname = '\0'; 527 for (x = ddv->sdev_path; x = strchr(x, '/'); x++) 528 (void) strcat(pathname, "../"); 529 (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor); 530 (void) strncat(pathname, str, MAXPATHLEN); 531 if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR, 532 strlen(ZVOL_FULL_RDEV_DIR)) == 0) 533 (void) strcat(pathname, ",raw"); 534 return (0); 535 } 536 537 /* Clean zvol sdev_nodes that are no longer valid. */ 538 static void 539 devzvol_prunedir(struct sdev_node *ddv) 540 { 541 struct sdev_node *dv; 542 543 ASSERT(RW_READ_HELD(&ddv->sdev_contents)); 544 545 sdcmn_err13(("prunedir '%s'", ddv->sdev_name)); 546 ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); 547 if (rw_tryupgrade(&ddv->sdev_contents) == 0) { 548 rw_exit(&ddv->sdev_contents); 549 rw_enter(&ddv->sdev_contents, RW_WRITER); 550 } 551 552 dv = SDEV_FIRST_ENTRY(ddv); 553 while (dv) { 554 sdcmn_err13(("sdev_name '%s'", dv->sdev_name)); 555 556 switch (devzvol_validate(dv)) { 557 case SDEV_VTOR_VALID: 558 case SDEV_VTOR_SKIP: 559 dv = SDEV_NEXT_ENTRY(ddv, dv); 560 continue; 561 case SDEV_VTOR_INVALID: 562 sdcmn_err7(("prunedir: destroy invalid " 563 "node: %s\n", dv->sdev_name)); 564 break; 565 } 566 567 if ((SDEVTOV(dv)->v_type == VDIR) && 568 (sdev_cleandir(dv, NULL, 0) != 0)) { 569 dv = SDEV_NEXT_ENTRY(ddv, dv); 570 continue; 571 } 572 SDEV_HOLD(dv); 573 /* remove the cache node */ 574 sdev_cache_update(ddv, &dv, dv->sdev_name, 575 SDEV_CACHE_DELETE); 576 SDEV_RELE(dv); 577 dv = SDEV_FIRST_ENTRY(ddv); 578 } 579 rw_downgrade(&ddv->sdev_contents); 580 } 581 582 /* 583 * This function is used to create a dir or dev inside a zone's /dev when the 584 * zone has a zvol that is dynamically created within the zone (i.e. inside 585 * of a delegated dataset. Since there is no /devices tree within a zone, 586 * we create the chr/blk devices directly inside the zone's /dev instead of 587 * making symlinks. 588 */ 589 static int 590 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm) 591 { 592 struct vattr vattr; 593 timestruc_t now; 594 enum vtype expected_type = VDIR; 595 dmu_objset_type_t do_type; 596 struct sdev_node *dv = NULL; 597 int res; 598 char *dsname; 599 600 bzero(&vattr, sizeof (vattr)); 601 gethrestime(&now); 602 vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID; 603 vattr.va_uid = SDEV_UID_DEFAULT; 604 vattr.va_gid = SDEV_GID_DEFAULT; 605 vattr.va_type = VNON; 606 vattr.va_atime = now; 607 vattr.va_mtime = now; 608 vattr.va_ctime = now; 609 610 if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL) 611 return (ENOENT); 612 613 if (devzvol_objset_check(dsname, &do_type) != 0) { 614 /* 615 * objset_check will succeed on any valid objset in the global 616 * zone, and any valid delegated dataset. It will fail, however, 617 * in non-global zones on explicitly whitelisted zvol devices 618 * that are outside any delegated dataset. 619 * 620 * The directories leading up to the zvol device itself will be 621 * created by prof for us in advance (and will always validate 622 * because of the matching check in devzvol_validate). The zvol 623 * device itself can't be created by prof though because in the 624 * GZ it's a symlink, and in the NGZ it is not. So, we create 625 * such zvol device files here. 626 */ 627 if (!(parent->sdev_flags & SDEV_GLOBAL) && 628 parent->sdev_origin != NULL && 629 prof_name_matched(nm, parent)) { 630 do_type = DMU_OST_ZVOL; 631 } else { 632 kmem_free(dsname, strlen(dsname) + 1); 633 return (ENOENT); 634 } 635 } 636 637 if (do_type == DMU_OST_ZVOL) 638 expected_type = VBLK; 639 640 if (expected_type == VDIR) { 641 vattr.va_type = VDIR; 642 vattr.va_mode = SDEV_DIRMODE_DEFAULT; 643 } else { 644 minor_t minor; 645 dev_t devnum; 646 int rc; 647 648 rc = sdev_zvol_create_minor(dsname); 649 if ((rc != 0 && rc != EEXIST && rc != EBUSY) || 650 sdev_zvol_name2minor(dsname, &minor)) { 651 kmem_free(dsname, strlen(dsname) + 1); 652 return (ENOENT); 653 } 654 655 devnum = makedevice(devzvol_major, minor); 656 vattr.va_rdev = devnum; 657 658 if (strstr(parent->sdev_path, "/rdsk/") != NULL) 659 vattr.va_type = VCHR; 660 else 661 vattr.va_type = VBLK; 662 vattr.va_mode = SDEV_DEVMODE_DEFAULT; 663 } 664 kmem_free(dsname, strlen(dsname) + 1); 665 666 rw_enter(&parent->sdev_contents, RW_WRITER); 667 668 res = sdev_mknode(parent, nm, &dv, &vattr, 669 NULL, NULL, kcred, SDEV_READY); 670 rw_exit(&parent->sdev_contents); 671 if (res != 0) 672 return (ENOENT); 673 674 SDEV_RELE(dv); 675 return (0); 676 } 677 678 /*ARGSUSED*/ 679 static int 680 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, 681 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, 682 caller_context_t *ct, int *direntflags, pathname_t *realpnp) 683 { 684 enum vtype expected_type = VDIR; 685 struct sdev_node *parent = VTOSDEV(dvp); 686 char *dsname; 687 dmu_objset_type_t do_type; 688 int error; 689 690 sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm)); 691 *vpp = NULL; 692 /* execute access is required to search the directory */ 693 if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) 694 return (error); 695 696 rw_enter(&parent->sdev_contents, RW_READER); 697 if (!SDEV_IS_GLOBAL(parent)) { 698 int res; 699 700 rw_exit(&parent->sdev_contents); 701 702 /* 703 * If we're in the global zone and reach down into a non-global 704 * zone's /dev/zvol then this action could trigger the creation 705 * of all of the zvol devices for every zone into the non-global 706 * zone's /dev tree. This could be a big security hole. To 707 * prevent this, disallow the global zone from looking inside 708 * a non-global zones /dev/zvol. This behavior is similar to 709 * delegated datasets, which cannot be used by the global zone. 710 */ 711 if (getzoneid() == GLOBAL_ZONEID) 712 return (EPERM); 713 714 res = prof_lookup(dvp, nm, vpp, cred); 715 716 /* 717 * We won't find a zvol that was dynamically created inside 718 * a NGZ, within a delegated dataset, in the zone's dev profile 719 * but prof_lookup will also find it via sdev_cache_lookup. 720 */ 721 if (res == ENOENT) { 722 /* 723 * We have to create the sdev node for the dymamically 724 * created zvol. 725 */ 726 if (devzvol_mk_ngz_node(parent, nm) != 0) 727 return (ENOENT); 728 res = prof_lookup(dvp, nm, vpp, cred); 729 } 730 731 return (res); 732 } 733 734 /* 735 * Don't let the global-zone style lookup succeed here when we're not 736 * running in the global zone. This can happen because prof calls into 737 * us (in prof_filldir) trying to create an explicitly passed-through 738 * zvol device outside any delegated dataset. 739 * 740 * We have to stop this here or else we will create prof shadows of 741 * the global zone symlink, which will make no sense at all in the 742 * non-global zone (it has no /devices for the symlink to point at). 743 * 744 * These zvols will be created later (at access time) by mk_ngz_node 745 * instead. The dirs leading up to them will be created by prof 746 * internally. 747 * 748 * We have to return EPERM here, because ENOENT is given special 749 * meaning by prof in this context. 750 */ 751 if (getzoneid() != GLOBAL_ZONEID) { 752 rw_exit(&parent->sdev_contents); 753 return (EPERM); 754 } 755 756 dsname = devzvol_make_dsname(parent->sdev_path, nm); 757 rw_exit(&parent->sdev_contents); 758 sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)")); 759 if (dsname) { 760 error = devzvol_objset_check(dsname, &do_type); 761 if (error != 0) { 762 error = ENOENT; 763 goto out; 764 } 765 if (do_type == DMU_OST_ZVOL) 766 expected_type = VLNK; 767 } 768 /* 769 * the callbacks expect: 770 * 771 * parent->sdev_path nm 772 * /dev/zvol {r}dsk 773 * /dev/zvol/{r}dsk <pool name> 774 * /dev/zvol/{r}dsk/<dataset name> <last ds component> 775 * 776 * sdev_name is always last path component of sdev_path 777 */ 778 if (expected_type == VDIR) { 779 error = devname_lookup_func(parent, nm, vpp, cred, 780 devzvol_create_dir, SDEV_VATTR); 781 } else { 782 error = devname_lookup_func(parent, nm, vpp, cred, 783 devzvol_create_link, SDEV_VLINK); 784 } 785 sdcmn_err13(("devzvol_lookup %d %d", expected_type, error)); 786 ASSERT(error || ((*vpp)->v_type == expected_type)); 787 out: 788 if (dsname) 789 kmem_free(dsname, strlen(dsname) + 1); 790 sdcmn_err13(("devzvol_lookup %d", error)); 791 return (error); 792 } 793 794 /* 795 * We allow create to find existing nodes 796 * - if the node doesn't exist - EROFS 797 * - creating an existing dir read-only succeeds, otherwise EISDIR 798 * - exclusive creates fail - EEXIST 799 */ 800 /*ARGSUSED2*/ 801 static int 802 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl, 803 int mode, struct vnode **vpp, struct cred *cred, int flag, 804 caller_context_t *ct, vsecattr_t *vsecp) 805 { 806 int error; 807 struct vnode *vp; 808 809 *vpp = NULL; 810 811 error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, 812 NULL); 813 if (error == 0) { 814 if (excl == EXCL) 815 error = EEXIST; 816 else if (vp->v_type == VDIR && (mode & VWRITE)) 817 error = EISDIR; 818 else 819 error = VOP_ACCESS(vp, mode, 0, cred, ct); 820 821 if (error) { 822 VN_RELE(vp); 823 } else 824 *vpp = vp; 825 } else if (error == ENOENT) { 826 error = EROFS; 827 } 828 829 return (error); 830 } 831 832 void sdev_iter_snapshots(struct vnode *dvp, char *name); 833 834 void 835 sdev_iter_datasets(struct vnode *dvp, int arg, char *name) 836 { 837 zfs_cmd_t *zc; 838 int rc; 839 840 sdcmn_err13(("iter name is '%s' (arg %x)", name, arg)); 841 zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); 842 (void) strcpy(zc->zc_name, name); 843 844 while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) { 845 struct vnode *vpp; 846 char *ptr; 847 848 sdcmn_err13((" name %s", zc->zc_name)); 849 if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%')) 850 goto skip; 851 ptr = strrchr(zc->zc_name, '/') + 1; 852 rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL, 853 kcred, NULL, NULL, NULL); 854 if (rc == 0) { 855 VN_RELE(vpp); 856 } else if (rc == ENOENT) { 857 goto skip; 858 } else { 859 /* 860 * EBUSY == problem with zvols's dmu holds? 861 * EPERM when in a NGZ and traversing up and out. 862 */ 863 goto skip; 864 } 865 if (arg == ZFS_IOC_DATASET_LIST_NEXT && 866 zc->zc_objset_stats.dds_type != DMU_OST_ZFS) 867 sdev_iter_snapshots(dvp, zc->zc_name); 868 skip: 869 (void) strcpy(zc->zc_name, name); 870 } 871 kmem_free(zc, sizeof (zfs_cmd_t)); 872 } 873 874 void 875 sdev_iter_snapshots(struct vnode *dvp, char *name) 876 { 877 sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name); 878 } 879 880 /*ARGSUSED4*/ 881 static int 882 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, 883 int *eofp, caller_context_t *ct_unused, int flags_unused) 884 { 885 struct sdev_node *sdvp = VTOSDEV(dvp); 886 char *ptr; 887 888 sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path, 889 sdvp->sdev_name)); 890 891 if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) { 892 struct vnode *vp; 893 894 rw_exit(&sdvp->sdev_contents); 895 (void) devname_lookup_func(sdvp, "dsk", &vp, cred, 896 devzvol_create_dir, SDEV_VATTR); 897 VN_RELE(vp); 898 (void) devname_lookup_func(sdvp, "rdsk", &vp, cred, 899 devzvol_create_dir, SDEV_VATTR); 900 VN_RELE(vp); 901 rw_enter(&sdvp->sdev_contents, RW_READER); 902 return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); 903 } 904 if (uiop->uio_offset == 0) 905 devzvol_prunedir(sdvp); 906 ptr = sdvp->sdev_path + strlen(ZVOL_DIR); 907 if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) { 908 rw_exit(&sdvp->sdev_contents); 909 devzvol_create_pool_dirs(dvp); 910 rw_enter(&sdvp->sdev_contents, RW_READER); 911 return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); 912 } 913 914 ptr = strchr(ptr + 1, '/'); 915 if (ptr == NULL) 916 return (ENOENT); 917 ptr++; 918 rw_exit(&sdvp->sdev_contents); 919 sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr); 920 rw_enter(&sdvp->sdev_contents, RW_READER); 921 return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); 922 } 923 924 const fs_operation_def_t devzvol_vnodeops_tbl[] = { 925 { VOPNAME_READDIR, { .vop_readdir = devzvol_readdir } }, 926 { VOPNAME_LOOKUP, { .vop_lookup = devzvol_lookup } }, 927 { VOPNAME_CREATE, { .vop_create = devzvol_create } }, 928 { VOPNAME_RENAME, { .error = fs_nosys } }, 929 { VOPNAME_MKDIR, { .error = fs_nosys } }, 930 { VOPNAME_RMDIR, { .error = fs_nosys } }, 931 { VOPNAME_REMOVE, { .error = fs_nosys } }, 932 { VOPNAME_SYMLINK, { .error = fs_nosys } }, 933 { NULL, { NULL } } 934 };