1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 23 */ 24 25 /* 26 * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and 27 * lofs(7FS) file systems. It is modeled on code from both of these file 28 * systems. 29 * 30 * The purpose is to create a high performance name space for files on which 31 * applications will compute. Given a large number of data files with various 32 * owners, we want to construct a view onto those files such that only a subset 33 * is visible to the applications and such that the view can be changed very 34 * quickly as compute progresses. Entries in the name space are not mounts and 35 * thus do not appear in the mnttab. Entries in the name space are allowed to 36 * refer to files on different backing file systems. Intermediate directories 37 * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes 38 * in the name space except for entries that refer to backing files ala lofs. 39 * 40 * The name space is managed via ioctls issued on the mounted file system and 41 * is mostly read-only for the compute applications. That is, applications 42 * cannot create new files in the name space. If a file is unlinked by an 43 * application, that only removes the file from the name space, the backing 44 * file remains in place. It is possible for applications to write-through to 45 * the backing files if the file system is mounted read-write. 46 * 47 * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, 48 * and HYPRLOFS_RM_ALL ioctls on the top-level mount. 49 * 50 * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and 51 * the name(s) for the file(s) in the name space. The name(s) may be path(s) 52 * which will be relative to the root of the mount and thus cannot begin with 53 * a /. If the name is a path, it does not have to correspond to any backing 54 * path. The intermediate directories will only exist in the name space. The 55 * entry(ies) will be added to the name space. 56 * 57 * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the 58 * name space which should be removed. The name(s) may be path(s) which will 59 * be relative to the root of the mount and thus cannot begin with a /. The 60 * named entry(ies) will be removed. 61 * 62 * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. 63 */ 64 65 #include <sys/types.h> 66 #include <sys/param.h> 67 #include <sys/sysmacros.h> 68 #include <sys/kmem.h> 69 #include <sys/time.h> 70 #include <sys/pathname.h> 71 #include <sys/vfs.h> 72 #include <sys/vfs_opreg.h> 73 #include <sys/vnode.h> 74 #include <sys/stat.h> 75 #include <sys/uio.h> 76 #include <sys/stat.h> 77 #include <sys/errno.h> 78 #include <sys/cmn_err.h> 79 #include <sys/cred.h> 80 #include <sys/statvfs.h> 81 #include <sys/mount.h> 82 #include <sys/debug.h> 83 #include <sys/systm.h> 84 #include <sys/mntent.h> 85 #include <fs/fs_subr.h> 86 #include <vm/page.h> 87 #include <vm/anon.h> 88 #include <sys/model.h> 89 #include <sys/policy.h> 90 91 #include <sys/fs/swapnode.h> 92 #include <sys/fs/hyprlofs_info.h> 93 94 static int hyprlofsfstype; 95 96 /* 97 * hyprlofs vfs operations. 98 */ 99 static int hyprlofsinit(int, char *); 100 static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); 101 static int hyprlofs_unmount(vfs_t *, int, cred_t *); 102 static int hyprlofs_root(vfs_t *, vnode_t **); 103 static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); 104 static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); 105 106 /* 107 * Loadable module wrapper 108 */ 109 #include <sys/modctl.h> 110 111 static mntopts_t hyprlofs_mntopts; 112 113 static vfsdef_t vfw = { 114 VFSDEF_VERSION, 115 "hyprlofs", 116 hyprlofsinit, 117 VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, 118 &hyprlofs_mntopts 119 }; 120 121 static mntopts_t hyprlofs_mntopts = { 122 0, NULL 123 }; 124 125 /* 126 * Module linkage information 127 */ 128 static struct modlfs modlfs = { 129 &mod_fsops, "filesystem for hyprlofs", &vfw 130 }; 131 132 static struct modlinkage modlinkage = { 133 MODREV_1, &modlfs, NULL 134 }; 135 136 int 137 _init() 138 { 139 return (mod_install(&modlinkage)); 140 } 141 142 int 143 _fini() 144 { 145 int error; 146 147 error = mod_remove(&modlinkage); 148 if (error) 149 return (error); 150 /* 151 * Tear down the operations vectors 152 */ 153 (void) vfs_freevfsops_by_type(hyprlofsfstype); 154 vn_freevnodeops(hyprlofs_vnodeops); 155 return (0); 156 } 157 158 int 159 _info(struct modinfo *modinfop) 160 { 161 return (mod_info(&modlinkage, modinfop)); 162 } 163 164 /* 165 * The following are patchable variables limiting the amount of system 166 * resources hyprlofs can use. 167 * 168 * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can 169 * use for it's data structures (e.g. hlnodes, directory entries). It is set 170 * as a percentage of physical memory which is determined when hyprlofs is 171 * first used in the system. 172 * 173 * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for 174 * the rest of the system. If the amount of free swap space in the system 175 * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon 176 * allocations will fail. 177 */ 178 size_t hyprlofs_maxkmem = 0; 179 size_t hyprlofs_minfree = 0; 180 size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ 181 182 static major_t hyprlofs_major; 183 static minor_t hyprlofs_minor; 184 static kmutex_t hyprlofs_minor_lock; 185 186 /* 187 * initialize global hyprlofs locks and hashes when loading hyprlofs module 188 */ 189 static int 190 hyprlofsinit(int fstype, char *name) 191 { 192 static const fs_operation_def_t hl_vfsops_template[] = { 193 VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, 194 VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, 195 VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, 196 VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, 197 VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, 198 NULL, NULL 199 }; 200 int error; 201 extern void hyprlofs_hash_init(); 202 203 hyprlofs_hash_init(); 204 hyprlofsfstype = fstype; 205 ASSERT(hyprlofsfstype != 0); 206 207 error = vfs_setfsops(fstype, hl_vfsops_template, NULL); 208 if (error != 0) { 209 cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); 210 return (error); 211 } 212 213 error = vn_make_ops(name, hyprlofs_vnodeops_template, 214 &hyprlofs_vnodeops); 215 if (error != 0) { 216 (void) vfs_freevfsops_by_type(fstype); 217 cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); 218 return (error); 219 } 220 221 /* 222 * hyprlofs_minfree is an absolute limit of swap space which still 223 * allows other processes to execute. Set it if its not patched. 224 */ 225 if (hyprlofs_minfree == 0) 226 hyprlofs_minfree = btopr(HYPRLOFSMINFREE); 227 228 /* 229 * The maximum amount of space hyprlofs can allocate is 230 * HYPRLOFSMAXPROCKMEM percent of kernel memory 231 */ 232 if (hyprlofs_maxkmem == 0) 233 hyprlofs_maxkmem = 234 MAX(PAGESIZE, kmem_maxavail() / HYPRLOFSMAXFRACKMEM); 235 236 if ((hyprlofs_major = getudev()) == (major_t)-1) { 237 cmn_err(CE_WARN, 238 "hyprlofsinit: Can't get unique device number."); 239 hyprlofs_major = 0; 240 } 241 mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 242 return (0); 243 } 244 245 static int 246 hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 247 { 248 hlfsmount_t *hm = NULL; 249 hlnode_t *hp; 250 struct pathname dpn; 251 int error; 252 vattr_t rattr; 253 int got_attrs; 254 255 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) 256 return (error); 257 if (secpolicy_hyprlofs_control(cr) != 0) 258 return (EPERM); 259 260 if (mvp->v_type != VDIR) 261 return (ENOTDIR); 262 263 if (uap->flags & MS_REMOUNT) 264 return (EBUSY); 265 266 mutex_enter(&mvp->v_lock); 267 if ((uap->flags & MS_OVERLAY) == 0 && 268 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 269 mutex_exit(&mvp->v_lock); 270 return (EBUSY); 271 } 272 mutex_exit(&mvp->v_lock); 273 274 /* Having the resource be anything but "swap" doesn't make sense. */ 275 vfs_setresource(vfsp, "swap", 0); 276 277 if ((error = pn_get(uap->dir, 278 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, 279 &dpn)) != 0) 280 goto out; 281 282 if ((hm = hyprlofs_memalloc(sizeof (hlfsmount_t), 0)) == NULL) { 283 pn_free(&dpn); 284 error = ENOMEM; 285 goto out; 286 } 287 288 /* Get an available minor device number for this mount */ 289 mutex_enter(&hyprlofs_minor_lock); 290 do { 291 hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; 292 hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); 293 } while (vfs_devismounted(hm->hlm_dev)); 294 mutex_exit(&hyprlofs_minor_lock); 295 296 /* 297 * Set but don't bother entering the mutex since hlfsmount is not on 298 * the mount list yet. 299 */ 300 mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); 301 302 hm->hlm_vfsp = vfsp; 303 304 vfsp->vfs_data = (caddr_t)hm; 305 vfsp->vfs_fstype = hyprlofsfstype; 306 vfsp->vfs_dev = hm->hlm_dev; 307 vfsp->vfs_bsize = PAGESIZE; 308 vfsp->vfs_flag |= VFS_NOTRUNC; 309 vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); 310 hm->hlm_mntpath = hyprlofs_memalloc(dpn.pn_pathlen + 1, HL_MUSTHAVE); 311 (void) strcpy(hm->hlm_mntpath, dpn.pn_path); 312 313 /* allocate and initialize root hlnode structure */ 314 bzero(&rattr, sizeof (vattr_t)); 315 rattr.va_mode = (mode_t)(S_IFDIR | 0777); 316 rattr.va_type = VDIR; 317 rattr.va_rdev = 0; 318 hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); 319 hyprlofs_node_init(hm, hp, &rattr, cr); 320 321 /* Get the mode, uid, and gid from the underlying mount point. */ 322 rattr.va_mask = AT_MODE|AT_UID|AT_GID; 323 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); 324 325 rw_enter(&hp->hln_rwlock, RW_WRITER); 326 HLNTOV(hp)->v_flag |= VROOT; 327 328 /* 329 * If the getattr succeeded, use its results, otherwise allow the 330 * previously set defaults to prevail. 331 */ 332 if (got_attrs == 0) { 333 hp->hln_mode = rattr.va_mode; 334 hp->hln_uid = rattr.va_uid; 335 hp->hln_gid = rattr.va_gid; 336 } 337 338 /* 339 * Initialize linked list of hlnodes so that the back pointer of the 340 * root hlnode always points to the last one on the list and the 341 * forward pointer of the last node is null 342 */ 343 hp->hln_back = hp; 344 hp->hln_forw = NULL; 345 hp->hln_nlink = 0; 346 hm->hlm_rootnode = hp; 347 348 hyprlofs_dirinit(hp, hp); 349 350 rw_exit(&hp->hln_rwlock); 351 352 pn_free(&dpn); 353 error = 0; 354 355 out: 356 return (error); 357 } 358 359 static int 360 hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) 361 { 362 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); 363 hlnode_t *hnp, *cancel; 364 vnode_t *vp; 365 int error; 366 367 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) 368 return (error); 369 if (secpolicy_hyprlofs_control(cr) != 0) 370 return (EPERM); 371 372 /* 373 * forced unmount is not supported by this file system 374 * and thus, ENOTSUP, is being returned. 375 */ 376 if (flag & MS_FORCE) 377 return (ENOTSUP); 378 379 mutex_enter(&hm->hlm_contents); 380 381 /* 382 * If there are no open files, only the root node should have a ref cnt. 383 * With hlm_contents held, nothing can be added or removed. There may 384 * be some dirty pages. To prevent fsflush from disrupting the unmount, 385 * put a hold on each node while scanning. If we find a previously 386 * referenced node, undo the holds we have placed and fail EBUSY. 387 */ 388 hnp = hm->hlm_rootnode; 389 if (HLNTOV(hnp)->v_count > 1) { 390 mutex_exit(&hm->hlm_contents); 391 return (EBUSY); 392 } 393 394 for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { 395 if ((vp = HLNTOV(hnp))->v_count > 0) { 396 cancel = hm->hlm_rootnode->hln_forw; 397 while (cancel != hnp) { 398 vp = HLNTOV(cancel); 399 ASSERT(vp->v_count > 0); 400 VN_RELE(vp); 401 cancel = cancel->hln_forw; 402 } 403 mutex_exit(&hm->hlm_contents); 404 return (EBUSY); 405 } 406 VN_HOLD(vp); 407 } 408 409 /* We can drop the mutex now because no one can find this mount */ 410 mutex_exit(&hm->hlm_contents); 411 412 /* 413 * Free all alloc'd memory associated with this FS. To do this, we go 414 * through the file list twice, once to remove all the dir entries, and 415 * then to remove all the files. 416 */ 417 418 /* Remove all directory entries */ 419 for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { 420 rw_enter(&hnp->hln_rwlock, RW_WRITER); 421 if (hnp->hln_type == VDIR) 422 hyprlofs_dirtrunc(hnp); 423 rw_exit(&hnp->hln_rwlock); 424 } 425 426 ASSERT(hm->hlm_rootnode); 427 428 /* 429 * All links are gone, v_count is keeping nodes in place. VN_RELE 430 * should make the node disappear, unless somebody is holding pages 431 * against it. Wait and retry until it disappears. 432 * 433 * We re-acquire the lock to prevent others who have a HOLD on a hlnode 434 * from blowing it away (in hyprlofs_inactive) while we're trying to 435 * get to it here. Once we have a HOLD on it we know it'll stick around. 436 */ 437 mutex_enter(&hm->hlm_contents); 438 439 /* Remove all the files (except the rootnode) backwards. */ 440 while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { 441 mutex_exit(&hm->hlm_contents); 442 /* Note we handled the link count in pass 2 above. */ 443 vp = HLNTOV(hnp); 444 VN_RELE(vp); 445 mutex_enter(&hm->hlm_contents); 446 /* 447 * It's still there after the RELE. Someone else like pageout 448 * has a hold on it so wait a bit and then try again. 449 */ 450 if (hnp == hm->hlm_rootnode->hln_back) { 451 VN_HOLD(vp); 452 mutex_exit(&hm->hlm_contents); 453 delay(hz / 4); 454 mutex_enter(&hm->hlm_contents); 455 } 456 } 457 mutex_exit(&hm->hlm_contents); 458 459 VN_RELE(HLNTOV(hm->hlm_rootnode)); 460 461 ASSERT(hm->hlm_mntpath); 462 463 hyprlofs_memfree(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); 464 465 mutex_destroy(&hm->hlm_contents); 466 hyprlofs_memfree(hm, sizeof (hlfsmount_t)); 467 468 return (0); 469 } 470 471 /* Return root hlnode for given vnode */ 472 static int 473 hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) 474 { 475 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); 476 hlnode_t *hp = hm->hlm_rootnode; 477 vnode_t *vp; 478 479 ASSERT(hp); 480 481 vp = HLNTOV(hp); 482 VN_HOLD(vp); 483 *vpp = vp; 484 return (0); 485 } 486 487 static int 488 hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) 489 { 490 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); 491 ulong_t blocks; 492 dev32_t d32; 493 zoneid_t eff_zid; 494 struct zone *zp; 495 496 /* 497 * The FS may have been mounted by the GZ on behalf of the NGZ. In 498 * that case, the hlfsmount zone_id will be the global zone. We want 499 * to show the swap cap inside the zone in this case, even though the 500 * FS was mounted by the GZ. 501 */ 502 if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) 503 zp = curproc->p_zone; 504 else 505 zp = hm->hlm_vfsp->vfs_zone; 506 507 if (zp == NULL) 508 eff_zid = GLOBAL_ZONEUNIQID; 509 else 510 eff_zid = zp->zone_id; 511 512 sbp->f_bsize = PAGESIZE; 513 sbp->f_frsize = PAGESIZE; 514 515 /* 516 * Find the amount of available physical and memory swap 517 */ 518 mutex_enter(&anoninfo_lock); 519 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 520 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; 521 mutex_exit(&anoninfo_lock); 522 523 if (blocks > hyprlofs_minfree) 524 sbp->f_bfree = blocks - hyprlofs_minfree; 525 else 526 sbp->f_bfree = 0; 527 528 sbp->f_bavail = sbp->f_bfree; 529 530 /* 531 * Total number of blocks is what's available plus what's been used 532 */ 533 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); 534 535 if (eff_zid != GLOBAL_ZONEUNIQID && 536 zp->zone_max_swap_ctl != UINT64_MAX) { 537 /* 538 * If the fs is used by a NGZ with a swap cap, then report the 539 * capped size. 540 */ 541 rctl_qty_t cap, used; 542 pgcnt_t pgcap, pgused; 543 544 mutex_enter(&zp->zone_mem_lock); 545 cap = zp->zone_max_swap_ctl; 546 used = zp->zone_max_swap; 547 mutex_exit(&zp->zone_mem_lock); 548 549 pgcap = btop(cap); 550 pgused = btop(used); 551 552 sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); 553 sbp->f_bavail = sbp->f_bfree; 554 sbp->f_blocks = MIN(pgcap, sbp->f_blocks); 555 } 556 557 /* 558 * This is fairly inaccurate since it doesn't take into account the 559 * names stored in the directory entries. 560 */ 561 if (hyprlofs_maxkmem > hyprlofs_kmemspace) 562 sbp->f_ffree = (hyprlofs_maxkmem - hyprlofs_kmemspace) / 563 (sizeof (hlnode_t) + sizeof (hldirent_t)); 564 else 565 sbp->f_ffree = 0; 566 567 sbp->f_files = hyprlofs_maxkmem / 568 (sizeof (hlnode_t) + sizeof (hldirent_t)); 569 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); 570 (void) cmpldev(&d32, vfsp->vfs_dev); 571 sbp->f_fsid = d32; 572 (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); 573 (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); 574 /* 575 * ensure null termination 576 */ 577 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; 578 sbp->f_flag = vf_to_stf(vfsp->vfs_flag); 579 sbp->f_namemax = MAXNAMELEN - 1; 580 return (0); 581 } 582 583 static int 584 hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) 585 { 586 hlfid_t *hfid; 587 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); 588 hlnode_t *hp = NULL; 589 590 hfid = (hlfid_t *)fidp; 591 *vpp = NULL; 592 593 mutex_enter(&hm->hlm_contents); 594 for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { 595 mutex_enter(&hp->hln_tlock); 596 if (hp->hln_nodeid == hfid->hlfid_ino) { 597 /* 598 * If the gen numbers don't match we know the file 599 * won't be found since only one hlnode can have this 600 * number at a time. 601 */ 602 if (hp->hln_gen != hfid->hlfid_gen || 603 hp->hln_nlink == 0) { 604 mutex_exit(&hp->hln_tlock); 605 mutex_exit(&hm->hlm_contents); 606 return (0); 607 } 608 *vpp = (vnode_t *)HLNTOV(hp); 609 610 VN_HOLD(*vpp); 611 612 if ((hp->hln_mode & S_ISVTX) && 613 !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { 614 mutex_enter(&(*vpp)->v_lock); 615 (*vpp)->v_flag |= VISSWAP; 616 mutex_exit(&(*vpp)->v_lock); 617 } 618 mutex_exit(&hp->hln_tlock); 619 mutex_exit(&hm->hlm_contents); 620 return (0); 621 } 622 mutex_exit(&hp->hln_tlock); 623 } 624 mutex_exit(&hm->hlm_contents); 625 return (0); 626 }