1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
23 */
24
25 /*
26 * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and
27 * lofs(7FS) file systems. It is modeled on code from both of these file
28 * systems.
29 *
30 * The purpose is to create a high performance name space for files on which
31 * applications will compute. Given a large number of data files with various
32 * owners, we want to construct a view onto those files such that only a subset
33 * is visible to the applications and such that the view can be changed very
34 * quickly as compute progresses. Entries in the name space are not mounts and
35 * thus do not appear in the mnttab. Entries in the name space are allowed to
36 * refer to files on different backing file systems. Intermediate directories
37 * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes
38 * in the name space except for entries that refer to backing files ala lofs.
39 *
40 * The name space is managed via ioctls issued on the mounted file system and
41 * is mostly read-only for the compute applications. That is, applications
42 * cannot create new files in the name space. If a file is unlinked by an
43 * application, that only removes the file from the name space, the backing
44 * file remains in place. It is possible for applications to write-through to
45 * the backing files if the file system is mounted read-write.
46 *
47 * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES,
48 * and HYPRLOFS_RM_ALL ioctls on the top-level mount.
49 *
50 * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and
51 * the name(s) for the file(s) in the name space. The name(s) may be path(s)
52 * which will be relative to the root of the mount and thus cannot begin with
53 * a /. If the name is a path, it does not have to correspond to any backing
54 * path. The intermediate directories will only exist in the name space. The
55 * entry(ies) will be added to the name space.
56 *
57 * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the
58 * name space which should be removed. The name(s) may be path(s) which will
59 * be relative to the root of the mount and thus cannot begin with a /. The
60 * named entry(ies) will be removed.
61 *
62 * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space.
63 */
64
65 #include <sys/types.h>
66 #include <sys/param.h>
67 #include <sys/sysmacros.h>
68 #include <sys/kmem.h>
69 #include <sys/time.h>
70 #include <sys/pathname.h>
71 #include <sys/vfs.h>
72 #include <sys/vfs_opreg.h>
73 #include <sys/vnode.h>
74 #include <sys/stat.h>
75 #include <sys/uio.h>
76 #include <sys/stat.h>
77 #include <sys/errno.h>
78 #include <sys/cmn_err.h>
79 #include <sys/cred.h>
80 #include <sys/statvfs.h>
81 #include <sys/mount.h>
82 #include <sys/debug.h>
83 #include <sys/systm.h>
84 #include <sys/mntent.h>
85 #include <fs/fs_subr.h>
86 #include <vm/page.h>
87 #include <vm/anon.h>
88 #include <sys/model.h>
89 #include <sys/policy.h>
90
91 #include <sys/fs/swapnode.h>
92 #include <sys/fs/hyprlofs_info.h>
93
/* File system type index for hyprlofs; recorded by hyprlofsinit(). */
static int hyprlofsfstype;

/*
 * hyprlofs vfs operations.
 *
 * hyprlofsinit() is the init routine named in the vfsdef_t below; the
 * remaining functions are installed through the vfs ops template that
 * hyprlofsinit() registers.
 */
static int hyprlofsinit(int, char *);
static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
static int hyprlofs_unmount(vfs_t *, int, cred_t *);
static int hyprlofs_root(vfs_t *, vnode_t **);
static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *);
static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *);
105
106 /*
107 * Loadable module wrapper
108 */
109 #include <sys/modctl.h>
110
/* Tentative definition so that vfw can take its address before it is defined */
static mntopts_t hyprlofs_mntopts;

/*
 * File system definition referenced by modlfs below. It names the file
 * system "hyprlofs", supplies hyprlofsinit() as its init routine and points
 * at the mount option table.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"hyprlofs",
	hyprlofsinit,
	VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
	&hyprlofs_mntopts
};

/*
 * Mount option table for hyprlofs.
 * NOTE(review): 0/NULL presumably means "no options defined" -- confirm
 * against the mntopts_t definition.
 */
static mntopts_t hyprlofs_mntopts = {
	0, NULL
};
124
/*
 * Module linkage information.
 *
 * modlfs ties the file system definition (vfw) to the mod_fsops module
 * operations; modlinkage is the structure passed to mod_install(),
 * mod_remove() and mod_info() by _init(), _fini() and _info().
 */
static struct modlfs modlfs = {
	&mod_fsops, "filesystem for hyprlofs", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, &modlfs, NULL
};
135
136 int
137 _init()
138 {
139 return (mod_install(&modlinkage));
140 }
141
142 int
143 _fini()
144 {
145 int error;
146
147 error = mod_remove(&modlinkage);
148 if (error)
149 return (error);
150 /*
151 * Tear down the operations vectors
152 */
153 (void) vfs_freevfsops_by_type(hyprlofsfstype);
154 vn_freevnodeops(hyprlofs_vnodeops);
155 return (0);
156 }
157
158 int
159 _info(struct modinfo *modinfop)
160 {
161 return (mod_info(&modlinkage, modinfop));
162 }
163
/*
 * The following are patchable variables limiting the amount of system
 * resources hyprlofs can use.
 *
 * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can
 * use for its data structures (e.g. hlnodes, directory entries). If it is
 * still 0 when hyprlofsinit() runs, it is set to a fraction
 * (1 / HYPRLOFSMAXFRACKMEM) of kmem_maxavail(), but never less than one page.
 *
 * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for
 * the rest of the system. If it is still 0 when hyprlofsinit() runs, it is
 * set to btopr(HYPRLOFSMINFREE). If the amount of free swap space in the
 * system (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon
 * allocations will fail.
 */
size_t hyprlofs_maxkmem = 0;
size_t hyprlofs_minfree = 0;
size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */

/*
 * Device numbering for mounts: hyprlofs_major is obtained once in
 * hyprlofsinit(); hyprlofs_minor is the last minor number handed out by
 * hyprlofs_mount() and is advanced under hyprlofs_minor_lock.
 */
static major_t hyprlofs_major;
static minor_t hyprlofs_minor;
static kmutex_t hyprlofs_minor_lock;
185
186 /*
187 * initialize global hyprlofs locks and hashes when loading hyprlofs module
188 */
189 static int
190 hyprlofsinit(int fstype, char *name)
191 {
192 static const fs_operation_def_t hl_vfsops_template[] = {
193 VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount },
194 VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount },
195 VFSNAME_ROOT, { .vfs_root = hyprlofs_root },
196 VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs },
197 VFSNAME_VGET, { .vfs_vget = hyprlofs_vget },
198 NULL, NULL
199 };
200 int error;
201 extern void hyprlofs_hash_init();
202
203 hyprlofs_hash_init();
204 hyprlofsfstype = fstype;
205 ASSERT(hyprlofsfstype != 0);
206
207 error = vfs_setfsops(fstype, hl_vfsops_template, NULL);
208 if (error != 0) {
209 cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template");
210 return (error);
211 }
212
213 error = vn_make_ops(name, hyprlofs_vnodeops_template,
214 &hyprlofs_vnodeops);
215 if (error != 0) {
216 (void) vfs_freevfsops_by_type(fstype);
217 cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template");
218 return (error);
219 }
220
221 /*
222 * hyprlofs_minfree is an absolute limit of swap space which still
223 * allows other processes to execute. Set it if its not patched.
224 */
225 if (hyprlofs_minfree == 0)
226 hyprlofs_minfree = btopr(HYPRLOFSMINFREE);
227
228 /*
229 * The maximum amount of space hyprlofs can allocate is
230 * HYPRLOFSMAXPROCKMEM percent of kernel memory
231 */
232 if (hyprlofs_maxkmem == 0)
233 hyprlofs_maxkmem =
234 MAX(PAGESIZE, kmem_maxavail() / HYPRLOFSMAXFRACKMEM);
235
236 if ((hyprlofs_major = getudev()) == (major_t)-1) {
237 cmn_err(CE_WARN,
238 "hyprlofsinit: Can't get unique device number.");
239 hyprlofs_major = 0;
240 }
241 mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
242 return (0);
243 }
244
245 static int
246 hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
247 {
248 hlfsmount_t *hm = NULL;
249 hlnode_t *hp;
250 struct pathname dpn;
251 int error;
252 vattr_t rattr;
253 int got_attrs;
254
255 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
256 return (error);
257 if (secpolicy_hyprlofs_control(cr) != 0)
258 return (EPERM);
259
260 if (mvp->v_type != VDIR)
261 return (ENOTDIR);
262
263 if (uap->flags & MS_REMOUNT)
264 return (EBUSY);
265
266 mutex_enter(&mvp->v_lock);
267 if ((uap->flags & MS_OVERLAY) == 0 &&
268 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
269 mutex_exit(&mvp->v_lock);
270 return (EBUSY);
271 }
272 mutex_exit(&mvp->v_lock);
273
274 /* Having the resource be anything but "swap" doesn't make sense. */
275 vfs_setresource(vfsp, "swap", 0);
276
277 if ((error = pn_get(uap->dir,
278 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE,
279 &dpn)) != 0)
280 goto out;
281
282 if ((hm = hyprlofs_memalloc(sizeof (hlfsmount_t), 0)) == NULL) {
283 pn_free(&dpn);
284 error = ENOMEM;
285 goto out;
286 }
287
288 /* Get an available minor device number for this mount */
289 mutex_enter(&hyprlofs_minor_lock);
290 do {
291 hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32;
292 hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor);
293 } while (vfs_devismounted(hm->hlm_dev));
294 mutex_exit(&hyprlofs_minor_lock);
295
296 /*
297 * Set but don't bother entering the mutex since hlfsmount is not on
298 * the mount list yet.
299 */
300 mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL);
301
302 hm->hlm_vfsp = vfsp;
303
304 vfsp->vfs_data = (caddr_t)hm;
305 vfsp->vfs_fstype = hyprlofsfstype;
306 vfsp->vfs_dev = hm->hlm_dev;
307 vfsp->vfs_bsize = PAGESIZE;
308 vfsp->vfs_flag |= VFS_NOTRUNC;
309 vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype);
310 hm->hlm_mntpath = hyprlofs_memalloc(dpn.pn_pathlen + 1, HL_MUSTHAVE);
311 (void) strcpy(hm->hlm_mntpath, dpn.pn_path);
312
313 /* allocate and initialize root hlnode structure */
314 bzero(&rattr, sizeof (vattr_t));
315 rattr.va_mode = (mode_t)(S_IFDIR | 0777);
316 rattr.va_type = VDIR;
317 rattr.va_rdev = 0;
318 hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE);
319 hyprlofs_node_init(hm, hp, &rattr, cr);
320
321 /* Get the mode, uid, and gid from the underlying mount point. */
322 rattr.va_mask = AT_MODE|AT_UID|AT_GID;
323 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
324
325 rw_enter(&hp->hln_rwlock, RW_WRITER);
326 HLNTOV(hp)->v_flag |= VROOT;
327
328 /*
329 * If the getattr succeeded, use its results, otherwise allow the
330 * previously set defaults to prevail.
331 */
332 if (got_attrs == 0) {
333 hp->hln_mode = rattr.va_mode;
334 hp->hln_uid = rattr.va_uid;
335 hp->hln_gid = rattr.va_gid;
336 }
337
338 /*
339 * Initialize linked list of hlnodes so that the back pointer of the
340 * root hlnode always points to the last one on the list and the
341 * forward pointer of the last node is null
342 */
343 hp->hln_back = hp;
344 hp->hln_forw = NULL;
345 hp->hln_nlink = 0;
346 hm->hlm_rootnode = hp;
347
348 hyprlofs_dirinit(hp, hp);
349
350 rw_exit(&hp->hln_rwlock);
351
352 pn_free(&dpn);
353 error = 0;
354
355 out:
356 return (error);
357 }
358
359 static int
360 hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
361 {
362 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
363 hlnode_t *hnp, *cancel;
364 vnode_t *vp;
365 int error;
366
367 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
368 return (error);
369 if (secpolicy_hyprlofs_control(cr) != 0)
370 return (EPERM);
371
372 /*
373 * forced unmount is not supported by this file system
374 * and thus, ENOTSUP, is being returned.
375 */
376 if (flag & MS_FORCE)
377 return (ENOTSUP);
378
379 mutex_enter(&hm->hlm_contents);
380
381 /*
382 * If there are no open files, only the root node should have a ref cnt.
383 * With hlm_contents held, nothing can be added or removed. There may
384 * be some dirty pages. To prevent fsflush from disrupting the unmount,
385 * put a hold on each node while scanning. If we find a previously
386 * referenced node, undo the holds we have placed and fail EBUSY.
387 */
388 hnp = hm->hlm_rootnode;
389 if (HLNTOV(hnp)->v_count > 1) {
390 mutex_exit(&hm->hlm_contents);
391 return (EBUSY);
392 }
393
394 for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) {
395 if ((vp = HLNTOV(hnp))->v_count > 0) {
396 cancel = hm->hlm_rootnode->hln_forw;
397 while (cancel != hnp) {
398 vp = HLNTOV(cancel);
399 ASSERT(vp->v_count > 0);
400 VN_RELE(vp);
401 cancel = cancel->hln_forw;
402 }
403 mutex_exit(&hm->hlm_contents);
404 return (EBUSY);
405 }
406 VN_HOLD(vp);
407 }
408
409 /* We can drop the mutex now because no one can find this mount */
410 mutex_exit(&hm->hlm_contents);
411
412 /*
413 * Free all alloc'd memory associated with this FS. To do this, we go
414 * through the file list twice, once to remove all the dir entries, and
415 * then to remove all the files.
416 */
417
418 /* Remove all directory entries */
419 for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) {
420 rw_enter(&hnp->hln_rwlock, RW_WRITER);
421 if (hnp->hln_type == VDIR)
422 hyprlofs_dirtrunc(hnp);
423 rw_exit(&hnp->hln_rwlock);
424 }
425
426 ASSERT(hm->hlm_rootnode);
427
428 /*
429 * All links are gone, v_count is keeping nodes in place. VN_RELE
430 * should make the node disappear, unless somebody is holding pages
431 * against it. Wait and retry until it disappears.
432 *
433 * We re-acquire the lock to prevent others who have a HOLD on a hlnode
434 * from blowing it away (in hyprlofs_inactive) while we're trying to
435 * get to it here. Once we have a HOLD on it we know it'll stick around.
436 */
437 mutex_enter(&hm->hlm_contents);
438
439 /* Remove all the files (except the rootnode) backwards. */
440 while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) {
441 mutex_exit(&hm->hlm_contents);
442 /* Note we handled the link count in pass 2 above. */
443 vp = HLNTOV(hnp);
444 VN_RELE(vp);
445 mutex_enter(&hm->hlm_contents);
446 /*
447 * It's still there after the RELE. Someone else like pageout
448 * has a hold on it so wait a bit and then try again.
449 */
450 if (hnp == hm->hlm_rootnode->hln_back) {
451 VN_HOLD(vp);
452 mutex_exit(&hm->hlm_contents);
453 delay(hz / 4);
454 mutex_enter(&hm->hlm_contents);
455 }
456 }
457 mutex_exit(&hm->hlm_contents);
458
459 VN_RELE(HLNTOV(hm->hlm_rootnode));
460
461 ASSERT(hm->hlm_mntpath);
462
463 hyprlofs_memfree(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1);
464
465 mutex_destroy(&hm->hlm_contents);
466 hyprlofs_memfree(hm, sizeof (hlfsmount_t));
467
468 return (0);
469 }
470
471 /* Return root hlnode for given vnode */
472 static int
473 hyprlofs_root(vfs_t *vfsp, vnode_t **vpp)
474 {
475 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
476 hlnode_t *hp = hm->hlm_rootnode;
477 vnode_t *vp;
478
479 ASSERT(hp);
480
481 vp = HLNTOV(hp);
482 VN_HOLD(vp);
483 *vpp = vp;
484 return (0);
485 }
486
487 static int
488 hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
489 {
490 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
491 ulong_t blocks;
492 dev32_t d32;
493 zoneid_t eff_zid;
494 struct zone *zp;
495
496 /*
497 * The FS may have been mounted by the GZ on behalf of the NGZ. In
498 * that case, the hlfsmount zone_id will be the global zone. We want
499 * to show the swap cap inside the zone in this case, even though the
500 * FS was mounted by the GZ.
501 */
502 if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
503 zp = curproc->p_zone;
504 else
505 zp = hm->hlm_vfsp->vfs_zone;
506
507 if (zp == NULL)
508 eff_zid = GLOBAL_ZONEUNIQID;
509 else
510 eff_zid = zp->zone_id;
511
512 sbp->f_bsize = PAGESIZE;
513 sbp->f_frsize = PAGESIZE;
514
515 /*
516 * Find the amount of available physical and memory swap
517 */
518 mutex_enter(&anoninfo_lock);
519 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
520 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
521 mutex_exit(&anoninfo_lock);
522
523 if (blocks > hyprlofs_minfree)
524 sbp->f_bfree = blocks - hyprlofs_minfree;
525 else
526 sbp->f_bfree = 0;
527
528 sbp->f_bavail = sbp->f_bfree;
529
530 /*
531 * Total number of blocks is what's available plus what's been used
532 */
533 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
534
535 if (eff_zid != GLOBAL_ZONEUNIQID &&
536 zp->zone_max_swap_ctl != UINT64_MAX) {
537 /*
538 * If the fs is used by a NGZ with a swap cap, then report the
539 * capped size.
540 */
541 rctl_qty_t cap, used;
542 pgcnt_t pgcap, pgused;
543
544 mutex_enter(&zp->zone_mem_lock);
545 cap = zp->zone_max_swap_ctl;
546 used = zp->zone_max_swap;
547 mutex_exit(&zp->zone_mem_lock);
548
549 pgcap = btop(cap);
550 pgused = btop(used);
551
552 sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
553 sbp->f_bavail = sbp->f_bfree;
554 sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
555 }
556
557 /*
558 * This is fairly inaccurate since it doesn't take into account the
559 * names stored in the directory entries.
560 */
561 if (hyprlofs_maxkmem > hyprlofs_kmemspace)
562 sbp->f_ffree = (hyprlofs_maxkmem - hyprlofs_kmemspace) /
563 (sizeof (hlnode_t) + sizeof (hldirent_t));
564 else
565 sbp->f_ffree = 0;
566
567 sbp->f_files = hyprlofs_maxkmem /
568 (sizeof (hlnode_t) + sizeof (hldirent_t));
569 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
570 (void) cmpldev(&d32, vfsp->vfs_dev);
571 sbp->f_fsid = d32;
572 (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name);
573 (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr));
574 /*
575 * ensure null termination
576 */
577 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
578 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
579 sbp->f_namemax = MAXNAMELEN - 1;
580 return (0);
581 }
582
583 static int
584 hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp)
585 {
586 hlfid_t *hfid;
587 hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
588 hlnode_t *hp = NULL;
589
590 hfid = (hlfid_t *)fidp;
591 *vpp = NULL;
592
593 mutex_enter(&hm->hlm_contents);
594 for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) {
595 mutex_enter(&hp->hln_tlock);
596 if (hp->hln_nodeid == hfid->hlfid_ino) {
597 /*
598 * If the gen numbers don't match we know the file
599 * won't be found since only one hlnode can have this
600 * number at a time.
601 */
602 if (hp->hln_gen != hfid->hlfid_gen ||
603 hp->hln_nlink == 0) {
604 mutex_exit(&hp->hln_tlock);
605 mutex_exit(&hm->hlm_contents);
606 return (0);
607 }
608 *vpp = (vnode_t *)HLNTOV(hp);
609
610 VN_HOLD(*vpp);
611
612 if ((hp->hln_mode & S_ISVTX) &&
613 !(hp->hln_mode & (S_IXUSR | S_IFDIR))) {
614 mutex_enter(&(*vpp)->v_lock);
615 (*vpp)->v_flag |= VISSWAP;
616 mutex_exit(&(*vpp)->v_lock);
617 }
618 mutex_exit(&hp->hln_tlock);
619 mutex_exit(&hm->hlm_contents);
620 return (0);
621 }
622 mutex_exit(&hp->hln_tlock);
623 }
624 mutex_exit(&hm->hlm_contents);
625 return (0);
626 }