Print this page
7798 vfs_mountfs passes junk in flags to domount
Reviewed by: Alexander Pyhalov <alp@rsu.ru>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Juraj Lutter <juraj.lutter@erigones.com>
Reviewed by: Marcel Telka <marcel@telka.sk>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/vfs.c
+++ new/usr/src/uts/common/fs/vfs.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
↓ open down ↓ |
17 lines elided |
↑ open up ↑ |
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 25 * Copyright 2016 Toomas Soome <tsoome@me.com>
26 26 * Copyright (c) 2016 by Delphix. All rights reserved.
27 27 * Copyright 2016 Nexenta Systems, Inc.
28 + * Copyright 2017 RackTop Systems.
28 29 */
29 30
30 31 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
31 32 /* All Rights Reserved */
32 33
33 34 /*
34 35 * University Copyright- Copyright (c) 1982, 1986, 1988
35 36 * The Regents of the University of California
36 37 * All Rights Reserved
37 38 *
38 39 * University Acknowledgment- Portions of this document are derived from
39 40 * software developed by the University of California, Berkeley, and its
40 41 * contributors.
41 42 */
42 43
43 44 #include <sys/types.h>
44 45 #include <sys/t_lock.h>
45 46 #include <sys/param.h>
46 47 #include <sys/errno.h>
47 48 #include <sys/user.h>
48 49 #include <sys/fstyp.h>
49 50 #include <sys/kmem.h>
50 51 #include <sys/systm.h>
51 52 #include <sys/proc.h>
52 53 #include <sys/mount.h>
53 54 #include <sys/vfs.h>
54 55 #include <sys/vfs_opreg.h>
55 56 #include <sys/fem.h>
56 57 #include <sys/mntent.h>
57 58 #include <sys/stat.h>
58 59 #include <sys/statvfs.h>
59 60 #include <sys/statfs.h>
60 61 #include <sys/cred.h>
61 62 #include <sys/vnode.h>
62 63 #include <sys/rwstlock.h>
63 64 #include <sys/dnlc.h>
64 65 #include <sys/file.h>
65 66 #include <sys/time.h>
66 67 #include <sys/atomic.h>
67 68 #include <sys/cmn_err.h>
68 69 #include <sys/buf.h>
69 70 #include <sys/swap.h>
70 71 #include <sys/debug.h>
71 72 #include <sys/vnode.h>
72 73 #include <sys/modctl.h>
73 74 #include <sys/ddi.h>
74 75 #include <sys/pathname.h>
75 76 #include <sys/bootconf.h>
76 77 #include <sys/dumphdr.h>
77 78 #include <sys/dc_ki.h>
78 79 #include <sys/poll.h>
79 80 #include <sys/sunddi.h>
80 81 #include <sys/sysmacros.h>
81 82 #include <sys/zone.h>
82 83 #include <sys/policy.h>
83 84 #include <sys/ctfs.h>
84 85 #include <sys/objfs.h>
85 86 #include <sys/console.h>
86 87 #include <sys/reboot.h>
87 88 #include <sys/attr.h>
88 89 #include <sys/zio.h>
89 90 #include <sys/spa.h>
90 91 #include <sys/lofi.h>
91 92 #include <sys/bootprops.h>
92 93
93 94 #include <vm/page.h>
94 95
95 96 #include <fs/fs_subr.h>
96 97 /* Private interfaces to create vopstats-related data structures */
97 98 extern void initialize_vopstats(vopstats_t *);
98 99 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *);
99 100 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *);
100 101
101 102 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
102 103 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
103 104 const char *, int, int);
104 105 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
105 106 static void vfs_freemnttab(struct vfs *);
106 107 static void vfs_freeopt(mntopt_t *);
107 108 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
108 109 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
109 110 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
110 111 static void vfs_createopttbl_extend(mntopts_t *, const char *,
111 112 const mntopts_t *);
112 113 static char **vfs_copycancelopt_extend(char **const, int);
113 114 static void vfs_freecancelopt(char **);
114 115 static void getrootfs(char **, char **);
115 116 static int getmacpath(dev_info_t *, void *);
116 117 static void vfs_mnttabvp_setup(void);
117 118
118 119 struct ipmnt {
119 120 struct ipmnt *mip_next;
120 121 dev_t mip_dev;
121 122 struct vfs *mip_vfsp;
122 123 };
123 124
124 125 static kmutex_t vfs_miplist_mutex;
125 126 static struct ipmnt *vfs_miplist = NULL;
126 127 static struct ipmnt *vfs_miplist_end = NULL;
127 128
128 129 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
129 130
130 131 /*
131 132 * VFS global data.
132 133 */
133 134 vnode_t *rootdir; /* pointer to root inode vnode. */
134 135 vnode_t *devicesdir; /* pointer to inode of devices root */
135 136 vnode_t *devdir; /* pointer to inode of dev root */
136 137
137 138 char *server_rootpath; /* root path for diskless clients */
138 139 char *server_hostname; /* hostname of diskless server */
139 140
140 141 static struct vfs root;
141 142 static struct vfs devices;
142 143 static struct vfs dev;
143 144 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */
144 145 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */
145 146 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */
146 147 /* must be power of 2! */
147 148 timespec_t vfs_mnttab_ctime; /* mnttab created time */
148 149 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */
149 150 char *vfs_dummyfstype = "\0";
150 151 struct pollhead vfs_pollhd; /* for mnttab pollers */
151 152 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */
152 153 int mntfstype; /* will be set once mnt fs is mounted */
153 154
154 155 /*
155 156 * Table for generic options recognized in the VFS layer and acted
156 157 * on at this level before parsing file system specific options.
157 158 * The nosuid option is stronger than any of the devices and setuid
158 159 * options, so those are canceled when nosuid is seen.
159 160 *
160 161 * All options which are added here need to be added to the
161 162 * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
162 163 */
163 164 /*
164 165 * VFS Mount options table
165 166 */
166 167 static char *ro_cancel[] = { MNTOPT_RW, NULL };
167 168 static char *rw_cancel[] = { MNTOPT_RO, NULL };
168 169 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
169 170 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
170 171 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
171 172 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
172 173 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
173 174 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
174 175 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
175 176 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
176 177 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
177 178 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
178 179 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
179 180
180 181 static const mntopt_t mntopts[] = {
181 182 /*
182 183 * option name cancel options default arg flags
183 184 */
184 185 { MNTOPT_REMOUNT, NULL, NULL,
185 186 MO_NODISPLAY, (void *)0 },
186 187 { MNTOPT_RO, ro_cancel, NULL, 0,
187 188 (void *)0 },
188 189 { MNTOPT_RW, rw_cancel, NULL, 0,
189 190 (void *)0 },
190 191 { MNTOPT_SUID, suid_cancel, NULL, 0,
191 192 (void *)0 },
192 193 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0,
193 194 (void *)0 },
194 195 { MNTOPT_DEVICES, devices_cancel, NULL, 0,
195 196 (void *)0 },
196 197 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0,
197 198 (void *)0 },
198 199 { MNTOPT_SETUID, setuid_cancel, NULL, 0,
199 200 (void *)0 },
200 201 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0,
201 202 (void *)0 },
202 203 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0,
203 204 (void *)0 },
204 205 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0,
205 206 (void *)0 },
206 207 { MNTOPT_EXEC, exec_cancel, NULL, 0,
207 208 (void *)0 },
208 209 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0,
209 210 (void *)0 },
210 211 };
211 212
212 213 const mntopts_t vfs_mntopts = {
213 214 sizeof (mntopts) / sizeof (mntopt_t),
214 215 (mntopt_t *)&mntopts[0]
215 216 };
216 217
217 218 /*
218 219 * File system operation dispatch functions.
219 220 */
220 221
221 222 int
222 223 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
223 224 {
224 225 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
225 226 }
226 227
227 228 int
228 229 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
229 230 {
230 231 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
231 232 }
232 233
233 234 int
234 235 fsop_root(vfs_t *vfsp, vnode_t **vpp)
235 236 {
236 237 refstr_t *mntpt;
237 238 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
238 239 /*
239 240 * Make sure this root has a path. With lofs, it is possible to have
240 241 * a NULL mountpoint.
241 242 */
242 243 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
243 244 mntpt = vfs_getmntpoint(vfsp);
244 245 vn_setpath_str(*vpp, refstr_value(mntpt),
245 246 strlen(refstr_value(mntpt)));
246 247 refstr_rele(mntpt);
247 248 }
248 249
249 250 return (ret);
250 251 }
251 252
252 253 int
253 254 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
254 255 {
255 256 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
256 257 }
257 258
258 259 int
259 260 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
260 261 {
261 262 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
262 263 }
263 264
264 265 int
265 266 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
266 267 {
267 268 /*
268 269 * In order to handle system attribute fids in a manner
269 270 * transparent to the underlying fs, we embed the fid for
270 271 * the sysattr parent object in the sysattr fid and tack on
271 272 * some extra bytes that only the sysattr layer knows about.
272 273 *
273 274 * This guarantees that sysattr fids are larger than other fids
274 275 * for this vfs. If the vfs supports the sysattr view interface
275 276 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
276 277 * collision with XATTR_FIDSZ.
277 278 */
278 279 if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
279 280 fidp->fid_len == XATTR_FIDSZ)
280 281 return (xattr_dir_vget(vfsp, vpp, fidp));
281 282
282 283 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
283 284 }
284 285
285 286 int
286 287 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
287 288 {
288 289 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
289 290 }
290 291
291 292 void
292 293 fsop_freefs(vfs_t *vfsp)
293 294 {
294 295 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
295 296 }
296 297
297 298 int
298 299 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
299 300 {
300 301 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
301 302 }
302 303
303 304 int
304 305 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
305 306 {
306 307 ASSERT((fstype >= 0) && (fstype < nfstype));
307 308
308 309 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
309 310 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
310 311 else
311 312 return (ENOTSUP);
312 313 }
313 314
314 315 /*
315 316 * File system initialization. vfs_setfsops() must be called from a file
316 317 * system's init routine.
317 318 */
318 319
319 320 static int
320 321 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
321 322 int *unused_ops)
322 323 {
323 324 static const fs_operation_trans_def_t vfs_ops_table[] = {
324 325 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
325 326 fs_nosys, fs_nosys,
326 327
327 328 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
328 329 fs_nosys, fs_nosys,
329 330
330 331 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
331 332 fs_nosys, fs_nosys,
332 333
333 334 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
334 335 fs_nosys, fs_nosys,
335 336
336 337 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
337 338 (fs_generic_func_p) fs_sync,
338 339 (fs_generic_func_p) fs_sync, /* No errors allowed */
339 340
340 341 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
341 342 fs_nosys, fs_nosys,
342 343
343 344 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
344 345 fs_nosys, fs_nosys,
345 346
346 347 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
347 348 (fs_generic_func_p)fs_freevfs,
348 349 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */
349 350
350 351 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
351 352 (fs_generic_func_p)fs_nosys,
352 353 (fs_generic_func_p)fs_nosys,
353 354
354 355 NULL, 0, NULL, NULL
355 356 };
356 357
357 358 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
358 359 }
359 360
360 361 void
361 362 zfs_boot_init(void)
362 363 {
363 364 if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
364 365 spa_boot_init();
365 366 }
366 367
367 368 int
368 369 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
369 370 {
370 371 int error;
371 372 int unused_ops;
372 373
373 374 /*
374 375 * Verify that fstype refers to a valid fs. Note that
375 376 * 0 is valid since it's used to set "stray" ops.
376 377 */
377 378 if ((fstype < 0) || (fstype >= nfstype))
378 379 return (EINVAL);
379 380
380 381 if (!ALLOCATED_VFSSW(&vfssw[fstype]))
381 382 return (EINVAL);
382 383
383 384 /* Set up the operations vector. */
384 385
385 386 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
386 387
387 388 if (error != 0)
388 389 return (error);
389 390
390 391 vfssw[fstype].vsw_flag |= VSW_INSTALLED;
391 392
392 393 if (actual != NULL)
393 394 *actual = &vfssw[fstype].vsw_vfsops;
394 395
395 396 #if DEBUG
396 397 if (unused_ops != 0)
397 398 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
398 399 "but not used", vfssw[fstype].vsw_name, unused_ops);
399 400 #endif
400 401
401 402 return (0);
402 403 }
403 404
404 405 int
405 406 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
406 407 {
407 408 int error;
408 409 int unused_ops;
409 410
410 411 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
411 412
412 413 error = fs_copyfsops(template, *actual, &unused_ops);
413 414 if (error != 0) {
414 415 kmem_free(*actual, sizeof (vfsops_t));
415 416 *actual = NULL;
416 417 return (error);
417 418 }
418 419
419 420 return (0);
420 421 }
421 422
422 423 /*
423 424 * Free a vfsops structure created as a result of vfs_makefsops().
424 425 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
425 426 * vfs_freevfsops_by_type().
426 427 */
427 428 void
428 429 vfs_freevfsops(vfsops_t *vfsops)
429 430 {
430 431 kmem_free(vfsops, sizeof (vfsops_t));
431 432 }
432 433
433 434 /*
434 435 * Since the vfsops structure is part of the vfssw table and wasn't
435 436 * really allocated, we're not really freeing anything. We keep
436 437 * the name for consistency with vfs_freevfsops(). We do, however,
437 438 * need to take care of a little bookkeeping.
438 439 * NOTE: For a vfsops structure created by vfs_setfsops(), use
439 440 * vfs_freevfsops_by_type().
440 441 */
441 442 int
442 443 vfs_freevfsops_by_type(int fstype)
443 444 {
444 445
445 446 /* Verify that fstype refers to a loaded fs (and not fsid 0). */
446 447 if ((fstype <= 0) || (fstype >= nfstype))
447 448 return (EINVAL);
448 449
449 450 WLOCK_VFSSW();
450 451 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
451 452 WUNLOCK_VFSSW();
452 453 return (EINVAL);
453 454 }
454 455
455 456 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
456 457 WUNLOCK_VFSSW();
457 458
458 459 return (0);
459 460 }
460 461
461 462 /* Support routines used to reference vfs_op */
462 463
463 464 /* Set the operations vector for a vfs */
464 465 void
465 466 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
466 467 {
467 468 vfsops_t *op;
468 469
469 470 ASSERT(vfsp != NULL);
470 471 ASSERT(vfsops != NULL);
471 472
472 473 op = vfsp->vfs_op;
473 474 membar_consumer();
474 475 if (vfsp->vfs_femhead == NULL &&
475 476 atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
476 477 return;
477 478 }
478 479 fsem_setvfsops(vfsp, vfsops);
479 480 }
480 481
481 482 /* Retrieve the operations vector for a vfs */
482 483 vfsops_t *
483 484 vfs_getops(vfs_t *vfsp)
484 485 {
485 486 vfsops_t *op;
486 487
487 488 ASSERT(vfsp != NULL);
488 489
489 490 op = vfsp->vfs_op;
490 491 membar_consumer();
491 492 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
492 493 return (op);
493 494 } else {
494 495 return (fsem_getvfsops(vfsp));
495 496 }
496 497 }
497 498
498 499 /*
499 500 * Returns non-zero (1) if the vfsops matches that of the vfs.
500 501 * Returns zero (0) if not.
501 502 */
502 503 int
503 504 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
504 505 {
505 506 return (vfs_getops(vfsp) == vfsops);
506 507 }
507 508
508 509 /*
509 510 * Returns non-zero (1) if the file system has installed a non-default,
510 511 * non-error vfs_sync routine. Returns zero (0) otherwise.
511 512 */
512 513 int
513 514 vfs_can_sync(vfs_t *vfsp)
514 515 {
515 516 /* vfs_sync() routine is not the default/error function */
516 517 return (vfs_getops(vfsp)->vfs_sync != fs_sync);
517 518 }
518 519
519 520 /*
520 521 * Initialize a vfs structure.
521 522 */
522 523 void
523 524 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
524 525 {
525 526 /* Other initialization has been moved to vfs_alloc() */
526 527 vfsp->vfs_count = 0;
527 528 vfsp->vfs_next = vfsp;
528 529 vfsp->vfs_prev = vfsp;
529 530 vfsp->vfs_zone_next = vfsp;
530 531 vfsp->vfs_zone_prev = vfsp;
531 532 vfsp->vfs_lofi_id = 0;
532 533 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
533 534 vfsimpl_setup(vfsp);
534 535 vfsp->vfs_data = (data);
535 536 vfs_setops((vfsp), (op));
536 537 }
537 538
538 539 /*
539 540 * Allocate and initialize the vfs implementation private data
540 541 * structure, vfs_impl_t.
541 542 */
542 543 void
543 544 vfsimpl_setup(vfs_t *vfsp)
544 545 {
545 546 int i;
546 547
547 548 if (vfsp->vfs_implp != NULL) {
548 549 return;
549 550 }
550 551
551 552 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
552 553 /* Note that these are #define'd in vfs.h */
553 554 vfsp->vfs_vskap = NULL;
554 555 vfsp->vfs_fstypevsp = NULL;
555 556
556 557 /* Set size of counted array, then zero the array */
557 558 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
558 559 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) {
559 560 vfsp->vfs_featureset[i] = 0;
560 561 }
561 562 }
562 563
563 564 /*
564 565 * Release the vfs_impl_t structure, if it exists. Some unbundled
565 566 * filesystems may not use the newer version of vfs and thus
566 567 * would not contain this implementation private data structure.
567 568 */
568 569 void
569 570 vfsimpl_teardown(vfs_t *vfsp)
570 571 {
571 572 vfs_impl_t *vip = vfsp->vfs_implp;
572 573
573 574 if (vip == NULL)
574 575 return;
575 576
576 577 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
577 578 vfsp->vfs_implp = NULL;
578 579 }
579 580
580 581 /*
581 582 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
582 583 * fstatvfs, and sysfs moved to common/syscall.
583 584 */
584 585
585 586 /*
586 587 * Update every mounted file system. We call the vfs_sync operation of
587 588 * each file system type, passing it a NULL vfsp to indicate that all
588 589 * mounted file systems of that type should be updated.
589 590 */
590 591 void
591 592 vfs_sync(int flag)
592 593 {
593 594 struct vfssw *vswp;
594 595 RLOCK_VFSSW();
595 596 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
596 597 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
597 598 vfs_refvfssw(vswp);
598 599 RUNLOCK_VFSSW();
599 600 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
600 601 CRED());
601 602 vfs_unrefvfssw(vswp);
602 603 RLOCK_VFSSW();
603 604 }
604 605 }
605 606 RUNLOCK_VFSSW();
606 607 }
607 608
/*
 * Convenience entry point: run vfs_sync() with a flag value of zero.
 */
void
sync(void)
{
	const int noflags = 0;

	vfs_sync(noflags);
}
613 614
614 615 /*
615 616 * External routines.
616 617 */
617 618
618 619 krwlock_t vfssw_lock; /* lock accesses to vfssw */
619 620
620 621 /*
621 622 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(),
622 623 * but otherwise should be accessed only via vfs_list_lock() and
623 624 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list.
624 625 */
625 626 static krwlock_t vfslist;
626 627
627 628 /*
628 629 * Mount devfs on /devices. This is done right after root is mounted
629 630 * to provide device access support for the system
630 631 */
631 632 static void
632 633 vfs_mountdevices(void)
633 634 {
634 635 struct vfssw *vsw;
635 636 struct vnode *mvp;
636 637 struct mounta mounta = { /* fake mounta for devfs_mount() */
637 638 NULL,
638 639 NULL,
639 640 MS_SYSSPACE,
640 641 NULL,
641 642 NULL,
642 643 0,
643 644 NULL,
644 645 0
645 646 };
646 647
647 648 /*
648 649 * _init devfs module to fill in the vfssw
649 650 */
650 651 if (modload("fs", "devfs") == -1)
651 652 panic("Cannot _init devfs module");
652 653
653 654 /*
654 655 * Hold vfs
655 656 */
656 657 RLOCK_VFSSW();
657 658 vsw = vfs_getvfsswbyname("devfs");
658 659 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
659 660 VFS_HOLD(&devices);
660 661
661 662 /*
662 663 * Locate mount point
663 664 */
664 665 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
665 666 panic("Cannot find /devices");
666 667
667 668 /*
668 669 * Perform the mount of /devices
669 670 */
670 671 if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
671 672 panic("Cannot mount /devices");
672 673
673 674 RUNLOCK_VFSSW();
674 675
675 676 /*
676 677 * Set appropriate members and add to vfs list for mnttab display
677 678 */
678 679 vfs_setresource(&devices, "/devices", 0);
679 680 vfs_setmntpoint(&devices, "/devices", 0);
680 681
681 682 /*
682 683 * Hold the root of /devices so it won't go away
683 684 */
684 685 if (VFS_ROOT(&devices, &devicesdir))
685 686 panic("vfs_mountdevices: not devices root");
686 687
687 688 if (vfs_lock(&devices) != 0) {
688 689 VN_RELE(devicesdir);
689 690 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
690 691 return;
691 692 }
692 693
693 694 if (vn_vfswlock(mvp) != 0) {
694 695 vfs_unlock(&devices);
695 696 VN_RELE(devicesdir);
696 697 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
697 698 return;
698 699 }
699 700
700 701 vfs_add(mvp, &devices, 0);
701 702 vn_vfsunlock(mvp);
702 703 vfs_unlock(&devices);
703 704 VN_RELE(devicesdir);
704 705 }
705 706
706 707 /*
707 708 * mount the first instance of /dev to root and remain mounted
708 709 */
709 710 static void
710 711 vfs_mountdev1(void)
711 712 {
712 713 struct vfssw *vsw;
713 714 struct vnode *mvp;
714 715 struct mounta mounta = { /* fake mounta for sdev_mount() */
715 716 NULL,
716 717 NULL,
717 718 MS_SYSSPACE | MS_OVERLAY,
718 719 NULL,
719 720 NULL,
720 721 0,
721 722 NULL,
722 723 0
723 724 };
724 725
725 726 /*
726 727 * _init dev module to fill in the vfssw
727 728 */
728 729 if (modload("fs", "dev") == -1)
729 730 cmn_err(CE_PANIC, "Cannot _init dev module\n");
730 731
731 732 /*
732 733 * Hold vfs
733 734 */
734 735 RLOCK_VFSSW();
735 736 vsw = vfs_getvfsswbyname("dev");
736 737 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
737 738 VFS_HOLD(&dev);
738 739
739 740 /*
740 741 * Locate mount point
741 742 */
742 743 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
743 744 cmn_err(CE_PANIC, "Cannot find /dev\n");
744 745
745 746 /*
746 747 * Perform the mount of /dev
747 748 */
748 749 if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
749 750 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
750 751
751 752 RUNLOCK_VFSSW();
752 753
753 754 /*
754 755 * Set appropriate members and add to vfs list for mnttab display
755 756 */
756 757 vfs_setresource(&dev, "/dev", 0);
757 758 vfs_setmntpoint(&dev, "/dev", 0);
758 759
759 760 /*
760 761 * Hold the root of /dev so it won't go away
761 762 */
762 763 if (VFS_ROOT(&dev, &devdir))
763 764 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
764 765
765 766 if (vfs_lock(&dev) != 0) {
766 767 VN_RELE(devdir);
767 768 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
768 769 return;
769 770 }
770 771
771 772 if (vn_vfswlock(mvp) != 0) {
772 773 vfs_unlock(&dev);
773 774 VN_RELE(devdir);
774 775 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
775 776 return;
776 777 }
777 778
778 779 vfs_add(mvp, &dev, 0);
779 780 vn_vfsunlock(mvp);
780 781 vfs_unlock(&dev);
781 782 VN_RELE(devdir);
782 783 }
783 784
↓ open down ↓ |
746 lines elided |
↑ open up ↑ |
784 785 /*
785 786 * Mount required filesystem. This is done right after root is mounted.
786 787 */
787 788 static void
788 789 vfs_mountfs(char *module, char *spec, char *path)
789 790 {
790 791 struct vnode *mvp;
791 792 struct mounta mounta;
792 793 vfs_t *vfsp;
793 794
795 + bzero(&mounta, sizeof (mounta));
794 796 mounta.flags = MS_SYSSPACE | MS_DATA;
795 797 mounta.fstype = module;
796 798 mounta.spec = spec;
797 799 mounta.dir = path;
798 800 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
799 801 cmn_err(CE_WARN, "Cannot find %s", path);
800 802 return;
801 803 }
802 804 if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
803 805 cmn_err(CE_WARN, "Cannot mount %s", path);
804 806 else
805 807 VFS_RELE(vfsp);
806 808 VN_RELE(mvp);
807 809 }
808 810
809 811 /*
810 812 * vfs_mountroot is called by main() to mount the root filesystem.
811 813 */
812 814 void
813 815 vfs_mountroot(void)
814 816 {
815 817 struct vnode *rvp = NULL;
816 818 char *path;
817 819 size_t plen;
818 820 struct vfssw *vswp;
819 821 proc_t *p;
820 822
821 823 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
822 824 rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
823 825
824 826 /*
825 827 * Alloc the vfs hash bucket array and locks
826 828 */
827 829 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
828 830
829 831 /*
830 832 * Call machine-dependent routine "rootconf" to choose a root
831 833 * file system type.
832 834 */
833 835 if (rootconf())
834 836 panic("vfs_mountroot: cannot mount root");
835 837 /*
836 838 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir
837 839 * to point to it. These are used by lookuppn() so that it
838 840 * knows where to start from ('/' or '.').
839 841 */
840 842 vfs_setmntpoint(rootvfs, "/", 0);
841 843 if (VFS_ROOT(rootvfs, &rootdir))
842 844 panic("vfs_mountroot: no root vnode");
843 845
844 846 /*
845 847 * At this point, the process tree consists of p0 and possibly some
846 848 * direct children of p0. (i.e. there are no grandchildren)
847 849 *
848 850 * Walk through them all, setting their current directory.
849 851 */
850 852 mutex_enter(&pidlock);
851 853 for (p = practive; p != NULL; p = p->p_next) {
852 854 ASSERT(p == &p0 || p->p_parent == &p0);
853 855
854 856 PTOU(p)->u_cdir = rootdir;
855 857 VN_HOLD(PTOU(p)->u_cdir);
856 858 PTOU(p)->u_rdir = NULL;
857 859 }
858 860 mutex_exit(&pidlock);
859 861
860 862 /*
861 863 * Setup the global zone's rootvp, now that it exists.
862 864 */
863 865 global_zone->zone_rootvp = rootdir;
864 866 VN_HOLD(global_zone->zone_rootvp);
865 867
866 868 /*
867 869 * Notify the module code that it can begin using the
868 870 * root filesystem instead of the boot program's services.
869 871 */
870 872 modrootloaded = 1;
871 873
872 874 /*
873 875 * Special handling for a ZFS root file system.
874 876 */
875 877 zfs_boot_init();
876 878
877 879 /*
878 880 * Set up mnttab information for root
879 881 */
880 882 vfs_setresource(rootvfs, rootfs.bo_name, 0);
881 883
882 884 /*
883 885 * Notify cluster software that the root filesystem is available.
884 886 */
885 887 clboot_mountroot();
886 888
887 889 /* Now that we're all done with the root FS, set up its vopstats */
888 890 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
889 891 /* Set flag for statistics collection */
890 892 if (vswp->vsw_flag & VSW_STATS) {
891 893 initialize_vopstats(&rootvfs->vfs_vopstats);
892 894 rootvfs->vfs_flag |= VFS_STATS;
893 895 rootvfs->vfs_fstypevsp =
894 896 get_fstype_vopstats(rootvfs, vswp);
895 897 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
896 898 }
897 899 vfs_unrefvfssw(vswp);
898 900 }
899 901
900 902 /*
901 903 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
902 904 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
903 905 */
904 906 vfs_mountdevices();
905 907 vfs_mountdev1();
906 908
907 909 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
908 910 vfs_mountfs("proc", "/proc", "/proc");
909 911 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
910 912 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
911 913 vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
912 914 vfs_mountfs("bootfs", "bootfs", "/system/boot");
913 915
914 916 if (getzoneid() == GLOBAL_ZONEID) {
915 917 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
916 918 }
917 919
918 920 if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
919 921 /*
920 922 * Look up the root device via devfs so that a dv_node is
921 923 * created for it. The vnode is never VN_RELE()ed.
922 924 * We allocate more than MAXPATHLEN so that the
923 925 * buffer passed to i_ddi_prompath_to_devfspath() is
924 926 * exactly MAXPATHLEN (the function expects a buffer
925 927 * of that length).
926 928 */
927 929 plen = strlen("/devices");
928 930 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
929 931 (void) strcpy(path, "/devices");
930 932
931 933 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
932 934 != DDI_SUCCESS ||
933 935 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
934 936
935 937 /* NUL terminate in case "path" has garbage */
936 938 path[plen + MAXPATHLEN - 1] = '\0';
937 939 #ifdef DEBUG
938 940 cmn_err(CE_WARN, "!Cannot lookup root device: %s",
939 941 path);
940 942 #endif
941 943 }
942 944 kmem_free(path, plen + MAXPATHLEN);
943 945 }
944 946
945 947 vfs_mnttabvp_setup();
946 948 }
947 949
948 950 /*
949 951 * Check to see if our "block device" is actually a file. If so,
950 952 * automatically add a lofi device, and keep track of this fact.
951 953 */
952 954 static int
953 955 lofi_add(const char *fsname, struct vfs *vfsp,
954 956 mntopts_t *mntopts, struct mounta *uap)
955 957 {
956 958 int fromspace = (uap->flags & MS_SYSSPACE) ?
957 959 UIO_SYSSPACE : UIO_USERSPACE;
958 960 struct lofi_ioctl *li = NULL;
959 961 struct vnode *vp = NULL;
960 962 struct pathname pn = { NULL };
961 963 ldi_ident_t ldi_id;
962 964 ldi_handle_t ldi_hdl;
963 965 vfssw_t *vfssw;
964 966 int id;
965 967 int err = 0;
966 968
967 969 if ((vfssw = vfs_getvfssw(fsname)) == NULL)
968 970 return (0);
969 971
970 972 if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
971 973 vfs_unrefvfssw(vfssw);
972 974 return (0);
973 975 }
974 976
975 977 vfs_unrefvfssw(vfssw);
976 978 vfssw = NULL;
977 979
978 980 if (pn_get(uap->spec, fromspace, &pn) != 0)
979 981 return (0);
980 982
981 983 if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
982 984 goto out;
983 985
984 986 if (vp->v_type != VREG)
985 987 goto out;
986 988
987 989 /* OK, this is a lofi mount. */
988 990
989 991 if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
990 992 vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
991 993 vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
992 994 vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
993 995 err = EINVAL;
994 996 goto out;
995 997 }
996 998
997 999 ldi_id = ldi_ident_from_anon();
998 1000 li = kmem_zalloc(sizeof (*li), KM_SLEEP);
999 1001 (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1000 1002
1001 1003 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1002 1004 &ldi_hdl, ldi_id);
1003 1005
1004 1006 if (err)
1005 1007 goto out2;
1006 1008
1007 1009 err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1008 1010 FREAD | FWRITE | FKIOCTL, kcred, &id);
1009 1011
1010 1012 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1011 1013
1012 1014 if (!err)
1013 1015 vfsp->vfs_lofi_id = id;
1014 1016
1015 1017 out2:
1016 1018 ldi_ident_release(ldi_id);
1017 1019 out:
1018 1020 if (li != NULL)
1019 1021 kmem_free(li, sizeof (*li));
1020 1022 if (vp != NULL)
1021 1023 VN_RELE(vp);
1022 1024 pn_free(&pn);
1023 1025 return (err);
1024 1026 }
1025 1027
1026 1028 static void
1027 1029 lofi_remove(struct vfs *vfsp)
1028 1030 {
1029 1031 struct lofi_ioctl *li = NULL;
1030 1032 ldi_ident_t ldi_id;
1031 1033 ldi_handle_t ldi_hdl;
1032 1034 int err;
1033 1035
1034 1036 if (vfsp->vfs_lofi_id == 0)
1035 1037 return;
1036 1038
1037 1039 ldi_id = ldi_ident_from_anon();
1038 1040
1039 1041 li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1040 1042 li->li_id = vfsp->vfs_lofi_id;
1041 1043 li->li_cleanup = B_TRUE;
1042 1044
1043 1045 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1044 1046 &ldi_hdl, ldi_id);
1045 1047
1046 1048 if (err)
1047 1049 goto out;
1048 1050
1049 1051 err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1050 1052 FREAD | FWRITE | FKIOCTL, kcred, NULL);
1051 1053
1052 1054 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1053 1055
1054 1056 if (!err)
1055 1057 vfsp->vfs_lofi_id = 0;
1056 1058
1057 1059 out:
1058 1060 ldi_ident_release(ldi_id);
1059 1061 if (li != NULL)
1060 1062 kmem_free(li, sizeof (*li));
1061 1063 }
1062 1064
1063 1065 /*
1064 1066 * Common mount code. Called from the system call entry point, from autofs,
1065 1067 * nfsv4 trigger mounts, and from pxfs.
1066 1068 *
1067 1069 * Takes the effective file system type, mount arguments, the mount point
1068 1070 * vnode, flags specifying whether the mount is a remount and whether it
1069 1071 * should be entered into the vfs list, and credentials. Fills in its vfspp
1070 1072 * parameter with the mounted file system instance's vfs.
1071 1073 *
1072 1074 * Note that the effective file system type is specified as a string. It may
1073 1075 * be null, in which case it's determined from the mount arguments, and may
1074 1076 * differ from the type specified in the mount arguments; this is a hook to
1075 1077 * allow interposition when instantiating file system instances.
1076 1078 *
1077 1079 * The caller is responsible for releasing its own hold on the mount point
1078 1080 * vp (this routine does its own hold when necessary).
1079 1081 * Also note that for remounts, the mount point vp should be the vnode for
1080 1082 * the root of the file system rather than the vnode that the file system
1081 1083 * is mounted on top of.
 *
 * Returns 0 on success, in which case *vfspp is set to the mounted vfs;
 * otherwise returns a non-zero errno value and *vfspp is left untouched.
 * Note that uap->flags may be updated here (MS_REMOUNT, MS_RDONLY and
 * MS_NOSUID are OR-ed in to agree with the parsed option string).
1082 1084 */
1083 1085 int
1084 1086 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1085 1087 struct vfs **vfspp)
1086 1088 {
1087 1089 struct vfssw *vswp;
1088 1090 vfsops_t *vfsops;
1089 1091 struct vfs *vfsp;
1090 1092 struct vnode *bvp;
1091 1093 dev_t bdev = 0;
1092 1094 mntopts_t mnt_mntopts;
1093 1095 int error = 0;
1094 1096 int copyout_error = 0;
1095 1097 int ovflags;
1096 1098 char *opts = uap->optptr;
1097 1099 char *inargs = opts;
1098 1100 int optlen = uap->optlen;
1099 1101 int remount;
1100 1102 int rdonly;
1101 1103 int nbmand = 0;
1102 1104 int delmip = 0;
1103 1105 int addmip = 0;
1104 1106 int splice = ((uap->flags & MS_NOSPLICE) == 0);
1105 1107 int fromspace = (uap->flags & MS_SYSSPACE) ?
1106 1108 UIO_SYSSPACE : UIO_USERSPACE;
1107 1109 char *resource = NULL, *mountpt = NULL;
1108 1110 refstr_t *oldresource, *oldmntpt;
1109 1111 struct pathname pn, rpn;
1110 1112 vsk_anchor_t *vskap;
1111 1113 char fstname[FSTYPSZ];
1112 1114 zone_t *zone;
1113 1115
1114 1116 /*
1115 1117 * The v_flag value for the mount point vp is permanently set
1116 1118 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1117 1119 * for mount point locking.
1118 1120 */
1119 1121 mutex_enter(&vp->v_lock);
1120 1122 vp->v_flag |= VVFSLOCK;
1121 1123 mutex_exit(&vp->v_lock);
1122 1124
1123 1125 mnt_mntopts.mo_count = 0;
1124 1126 /*
1125 1127 * Find the ops vector to use to invoke the file system-specific mount
1126 1128 * method. If the fsname argument is non-NULL, use it directly.
1127 1129 * Otherwise, dig the file system type information out of the mount
1128 1130 * arguments.
1129 1131 *
1130 1132 * A side effect is to hold the vfssw entry.
1131 1133 *
1132 1134 * Mount arguments can be specified in several ways, which are
1133 1135 * distinguished by flag bit settings. The preferred way is to set
1134 1136 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1135 1137 * type supplied as a character string and the last two arguments
1136 1138 * being a pointer to a character buffer and the size of the buffer.
1137 1139 * On entry, the buffer holds a null terminated list of options; on
1138 1140 * return, the string is the list of options the file system
1139 1141 * recognized. If MS_DATA is set arguments five and six point to a
1140 1142 * block of binary data which the file system interprets.
1141 1143 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1142 1144 * consistently with these conventions. To handle them, we check to
1143 1145 * see whether the pointer to the file system name has a numeric value
1144 1146 * less than 256. If so, we treat it as an index.
1145 1147 */
1146 1148 if (fsname != NULL) {
1147 1149 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1148 1150 return (EINVAL);
1149 1151 }
1150 1152 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1151 1153 size_t n;
1152 1154 uint_t fstype;
1153 1155
1154 1156 fsname = fstname;
1155 1157
1156 1158 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1157 1159 RLOCK_VFSSW();
1158 1160 if (fstype == 0 || fstype >= nfstype ||
1159 1161 !ALLOCATED_VFSSW(&vfssw[fstype])) {
1160 1162 RUNLOCK_VFSSW();
1161 1163 return (EINVAL);
1162 1164 }
1163 1165 (void) strcpy(fsname, vfssw[fstype].vsw_name);
1164 1166 RUNLOCK_VFSSW();
1165 1167 if ((vswp = vfs_getvfssw(fsname)) == NULL)
1166 1168 return (EINVAL);
1167 1169 } else {
1168 1170 /*
1169 1171 * Handle either kernel or user address space.
1170 1172 */
1171 1173 if (uap->flags & MS_SYSSPACE) {
1172 1174 error = copystr(uap->fstype, fsname,
1173 1175 FSTYPSZ, &n);
1174 1176 } else {
1175 1177 error = copyinstr(uap->fstype, fsname,
1176 1178 FSTYPSZ, &n);
1177 1179 }
1178 1180 if (error) {
1179 1181 if (error == ENAMETOOLONG)
1180 1182 return (EINVAL);
1181 1183 return (error);
1182 1184 }
1183 1185 if ((vswp = vfs_getvfssw(fsname)) == NULL)
1184 1186 return (EINVAL);
1185 1187 }
1186 1188 } else {
1187 1189 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1188 1190 return (EINVAL);
1189 1191 fsname = vswp->vsw_name;
1190 1192 }
/*
 * NOTE(review): per the comment above, vswp is held at this point, yet this
 * return does not call vfs_unrefvfssw(vswp) the way the secpolicy failure
 * path just below does. Confirm whether the hold can be leaked here.
 */
1191 1193 if (!VFS_INSTALLED(vswp))
1192 1194 return (EINVAL);
1193 1195
1194 1196 if ((error = secpolicy_fs_allowed_mount(fsname)) != 0) {
1195 1197 vfs_unrefvfssw(vswp);
1196 1198 return (error);
1197 1199 }
1198 1200
1199 1201 vfsops = &vswp->vsw_vfsops;
1200 1202
1201 1203 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1202 1204 /*
1203 1205 * Fetch mount options and parse them for generic vfs options
1204 1206 */
1205 1207 if (uap->flags & MS_OPTIONSTR) {
1206 1208 /*
1207 1209 * Limit the buffer size
1208 1210 */
1209 1211 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1210 1212 error = EINVAL;
1211 1213 goto errout;
1212 1214 }
1213 1215 if ((uap->flags & MS_SYSSPACE) == 0) {
1214 1216 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1215 1217 inargs[0] = '\0';
1216 1218 if (optlen) {
1217 1219 error = copyinstr(opts, inargs, (size_t)optlen,
1218 1220 NULL);
1219 1221 if (error) {
1220 1222 goto errout;
1221 1223 }
1222 1224 }
1223 1225 }
1224 1226 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1225 1227 }
1226 1228 /*
1227 1229 * Flag bits override the options string.
1228 1230 */
1229 1231 if (uap->flags & MS_REMOUNT)
1230 1232 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1231 1233 if (uap->flags & MS_RDONLY)
1232 1234 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1233 1235 if (uap->flags & MS_NOSUID)
1234 1236 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1235 1237
1236 1238 /*
1237 1239 * Check if this is a remount; must be set in the option string and
1238 1240 * the file system must support a remount option.
1239 1241 */
1240 1242 if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1241 1243 MNTOPT_REMOUNT, NULL)) {
1242 1244 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1243 1245 error = ENOTSUP;
1244 1246 goto errout;
1245 1247 }
1246 1248 uap->flags |= MS_REMOUNT;
1247 1249 }
1248 1250
1249 1251 /*
1250 1252 * uap->flags and vfs_optionisset() should agree.
1251 1253 */
1252 1254 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1253 1255 uap->flags |= MS_RDONLY;
1254 1256 }
1255 1257 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1256 1258 uap->flags |= MS_NOSUID;
1257 1259 }
1258 1260 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1259 1261 ASSERT(splice || !remount);
1260 1262 /*
1261 1263 * If we are splicing the fs into the namespace,
1262 1264 * perform mount point checks.
1263 1265 *
1264 1266 * We want to resolve the path for the mount point to eliminate
1265 1267 * '.' and ".." and symlinks in mount points; we can't do the
1266 1268 * same for the resource string, since it would turn
1267 1269 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do
1268 1270 * this before grabbing vn_vfswlock(), because otherwise we
1269 1271 * would deadlock with lookuppn().
1270 1272 */
1271 1273 if (splice) {
1272 1274 ASSERT(vp->v_count > 0);
1273 1275
1274 1276 /*
1275 1277 * Pick up mount point and device from appropriate space.
1276 1278 */
1277 1279 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1278 1280 resource = kmem_alloc(pn.pn_pathlen + 1,
1279 1281 KM_SLEEP);
1280 1282 (void) strcpy(resource, pn.pn_path);
1281 1283 pn_free(&pn);
1282 1284 }
1283 1285 /*
1284 1286 * Do a lookupname prior to taking the
1285 1287 * writelock. Mark this as completed if
1286 1288 * successful for later cleanup and addition to
1287 1289 * the mount in progress table.
1288 1290 */
1289 1291 if ((uap->flags & MS_GLOBAL) == 0 &&
1290 1292 lookupname(uap->spec, fromspace,
1291 1293 FOLLOW, NULL, &bvp) == 0) {
1292 1294 addmip = 1;
1293 1295 }
1294 1296
1295 1297 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1296 1298 pathname_t *pnp;
1297 1299
1298 1300 if (*pn.pn_path != '/') {
1299 1301 error = EINVAL;
1300 1302 pn_free(&pn);
1301 1303 goto errout;
1302 1304 }
1303 1305 pn_alloc(&rpn);
1304 1306 /*
1305 1307 * Kludge to prevent autofs from deadlocking with
1306 1308 * itself when it calls domount().
1307 1309 *
1308 1310 * If autofs is calling, it is because it is doing
1309 1311 * (autofs) mounts in the process of an NFS mount. A
1310 1312 * lookuppn() here would cause us to block waiting for
1311 1313 * said NFS mount to complete, which can't since this
1312 1314 * is the thread that was supposed to be doing it.
1313 1315 */
1314 1316 if (fromspace == UIO_USERSPACE) {
1315 1317 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1316 1318 NULL)) == 0) {
1317 1319 pnp = &rpn;
1318 1320 } else {
1319 1321 /*
1320 1322 * The file disappeared or otherwise
1321 1323 * became inaccessible since we opened
1322 1324 * it; might as well fail the mount
1323 1325 * since the mount point is no longer
1324 1326 * accessible.
1325 1327 */
1326 1328 pn_free(&rpn);
1327 1329 pn_free(&pn);
1328 1330 goto errout;
1329 1331 }
1330 1332 } else {
1331 1333 pnp = &pn;
1332 1334 }
1333 1335 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1334 1336 (void) strcpy(mountpt, pnp->pn_path);
1335 1337
1336 1338 /*
1337 1339 * If the addition of the zone's rootpath
1338 1340 * would push us over a total path length
1339 1341 * of MAXPATHLEN, we fail the mount with
1340 1342 * ENAMETOOLONG, which is what we would have
1341 1343 * gotten if we were trying to perform the same
1342 1344 * mount in the global zone.
1343 1345 *
1344 1346 * strlen() doesn't count the trailing
1345 1347 * '\0', but zone_rootpathlen counts both a
1346 1348 * trailing '/' and the terminating '\0'.
1347 1349 */
1348 1350 if ((curproc->p_zone->zone_rootpathlen - 1 +
1349 1351 strlen(mountpt)) > MAXPATHLEN ||
1350 1352 (resource != NULL &&
1351 1353 (curproc->p_zone->zone_rootpathlen - 1 +
1352 1354 strlen(resource)) > MAXPATHLEN)) {
1353 1355 error = ENAMETOOLONG;
1354 1356 }
1355 1357
1356 1358 pn_free(&rpn);
1357 1359 pn_free(&pn);
1358 1360 }
1359 1361
1360 1362 if (error)
1361 1363 goto errout;
1362 1364
1363 1365 /*
1364 1366 * Prevent path name resolution from proceeding past
1365 1367 * the mount point.
1366 1368 */
1367 1369 if (vn_vfswlock(vp) != 0) {
1368 1370 error = EBUSY;
1369 1371 goto errout;
1370 1372 }
1371 1373
1372 1374 /*
1373 1375 * Verify that it's legitimate to establish a mount on
1374 1376 * the prospective mount point.
1375 1377 */
1376 1378 if (vn_mountedvfs(vp) != NULL) {
1377 1379 /*
1378 1380 * The mount point lock was obtained after some
1379 1381 * other thread raced through and established a mount.
1380 1382 */
1381 1383 vn_vfsunlock(vp);
1382 1384 error = EBUSY;
1383 1385 goto errout;
1384 1386 }
1385 1387 if (vp->v_flag & VNOMOUNT) {
1386 1388 vn_vfsunlock(vp);
1387 1389 error = EINVAL;
1388 1390 goto errout;
1389 1391 }
1390 1392 }
1391 1393 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1392 1394 uap->dataptr = NULL;
1393 1395 uap->datalen = 0;
1394 1396 }
1395 1397
1396 1398 /*
1397 1399 * If this is a remount, we don't want to create a new VFS.
1398 1400 * Instead, we pass the existing one with a remount flag.
1399 1401 */
1400 1402 if (remount) {
1401 1403 /*
1402 1404 * Confirm that the mount point is the root vnode of the
1403 1405 * file system that is being remounted.
1404 1406 * This can happen if the user specifies a different
1405 1407 * mount point directory pathname in the (re)mount command.
1406 1408 *
1407 1409 * Code below can only be reached if splice is true, so it's
1408 1410 * safe to do vn_vfsunlock() here.
1409 1411 */
1410 1412 if ((vp->v_flag & VROOT) == 0) {
1411 1413 vn_vfsunlock(vp);
1412 1414 error = ENOENT;
1413 1415 goto errout;
1414 1416 }
1415 1417 /*
1416 1418 * Disallow making file systems read-only unless file system
1417 1419 * explicitly allows it in its vfssw. Ignore other flags.
1418 1420 */
1419 1421 if (rdonly && vn_is_readonly(vp) == 0 &&
1420 1422 (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1421 1423 vn_vfsunlock(vp);
1422 1424 error = EINVAL;
1423 1425 goto errout;
1424 1426 }
1425 1427 /*
1426 1428 * Disallow changing the NBMAND disposition of the file
1427 1429 * system on remounts.
1428 1430 */
1429 1431 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1430 1432 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1431 1433 vn_vfsunlock(vp);
1432 1434 error = EINVAL;
1433 1435 goto errout;
1434 1436 }
/* Save the pre-remount flags so a failed VFS_MOUNT() can restore them. */
1435 1437 vfsp = vp->v_vfsp;
1436 1438 ovflags = vfsp->vfs_flag;
1437 1439 vfsp->vfs_flag |= VFS_REMOUNT;
1438 1440 vfsp->vfs_flag &= ~VFS_RDONLY;
1439 1441 } else {
1440 1442 vfsp = vfs_alloc(KM_SLEEP);
1441 1443 VFS_INIT(vfsp, vfsops, NULL);
1442 1444 }
1443 1445
1444 1446 VFS_HOLD(vfsp);
1445 1447
1446 1448 if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1447 1449 if (!remount) {
1448 1450 if (splice)
1449 1451 vn_vfsunlock(vp);
1450 1452 vfs_free(vfsp);
1451 1453 } else {
/*
 * NOTE(review): in the remount case vfs_flag was already changed above
 * (VFS_REMOUNT set, VFS_RDONLY cleared) and this path does not put
 * ovflags back. Confirm whether the flags need restoring here.
 */
1452 1454 vn_vfsunlock(vp);
1453 1455 VFS_RELE(vfsp);
1454 1456 }
1455 1457 goto errout;
1456 1458 }
1457 1459
1458 1460 /*
1459 1461 * PRIV_SYS_MOUNT doesn't mean you can become root.
1460 1462 */
1461 1463 if (vfsp->vfs_lofi_id != 0) {
1462 1464 uap->flags |= MS_NOSUID;
1463 1465 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1464 1466 }
1465 1467
1466 1468 /*
1467 1469 * The vfs_reflock is not used anymore; the code below explicitly
1468 1470 * holds it, preventing others from accessing it directly.
1469 1471 */
1470 1472 if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1471 1473 !(vfsp->vfs_flag & VFS_REMOUNT))
1472 1474 cmn_err(CE_WARN,
1473 1475 "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1474 1476
1475 1477 /*
1476 1478 * Lock the vfs. If this is a remount we want to avoid spurious umount
1477 1479 * failures that happen as a side-effect of fsflush() and other mount
1478 1480 * and unmount operations that might be going on simultaneously and
1479 1481 * may have locked the vfs currently. To not return EBUSY immediately
1480 1482 * here we use vfs_lock_wait() instead of vfs_lock() for the remount case.
1481 1483 */
1482 1484 if (!remount) {
1483 1485 if (error = vfs_lock(vfsp)) {
/*
 * NOTE(review): ovflags is only assigned in the remount branch above, but
 * this is the !remount path, so an unassigned value is stored here (just
 * before vfs_free()). Confirm, and consider initializing ovflags.
 */
1484 1486 vfsp->vfs_flag = ovflags;
1485 1487
1486 1488 lofi_remove(vfsp);
1487 1489
1488 1490 if (splice)
1489 1491 vn_vfsunlock(vp);
1490 1492 vfs_free(vfsp);
1491 1493 goto errout;
1492 1494 }
1493 1495 } else {
1494 1496 vfs_lock_wait(vfsp);
1495 1497 }
1496 1498
1497 1499 /*
1498 1500 * Add device to mount in progress table, global mounts require special
1499 1501 * handling. It is possible that we have already done the lookupname
1500 1502 * on a spliced, non-global fs. If so, we don't want to do it again
1501 1503 * since we cannot do a lookupname after taking the
1502 1504 * wlock above. This case is for a non-spliced, non-global filesystem.
1503 1505 */
1504 1506 if (!addmip) {
1505 1507 if ((uap->flags & MS_GLOBAL) == 0 &&
1506 1508 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1507 1509 addmip = 1;
1508 1510 }
1509 1511 }
1510 1512
1511 1513 if (addmip) {
1512 1514 vnode_t *lvp = NULL;
1513 1515
1514 1516 error = vfs_get_lofi(vfsp, &lvp);
1515 1517 if (error > 0) {
1516 1518 lofi_remove(vfsp);
1517 1519
1518 1520 if (splice)
1519 1521 vn_vfsunlock(vp);
1520 1522 vfs_unlock(vfsp);
1521 1523
1522 1524 if (remount) {
/*
 * NOTE(review): as with the lofi_add() failure path, vfs_flag is not
 * restored from ovflags before this release. Confirm.
 */
1523 1525 VFS_RELE(vfsp);
1524 1526 } else {
1525 1527 vfs_free(vfsp);
1526 1528 }
1527 1529
1528 1530 goto errout;
1529 1531 } else if (error == -1) {
1530 1532 bdev = bvp->v_rdev;
1531 1533 VN_RELE(bvp);
1532 1534 } else {
1533 1535 bdev = lvp->v_rdev;
1534 1536 VN_RELE(lvp);
1535 1537 VN_RELE(bvp);
1536 1538 }
1537 1539
/* bvp has been released above; from here on errout must only delmip. */
1538 1540 vfs_addmip(bdev, vfsp);
1539 1541 addmip = 0;
1540 1542 delmip = 1;
1541 1543 }
1542 1544 /*
1543 1545 * Invalidate cached entry for the mount point.
1544 1546 */
1545 1547 if (splice)
1546 1548 dnlc_purge_vp(vp);
1547 1549
1548 1550 /*
1549 1551 * If we have an option string but the filesystem doesn't supply a
1550 1552 * prototype options table, create a table with the global
1551 1553 * options and sufficient room to accept all the options in the
1552 1554 * string. Then parse the passed in option string
1553 1555 * accepting all the options in the string. This gives us an
1554 1556 * option table with all the proper cancel properties for the
1555 1557 * global options.
1556 1558 *
1557 1559 * Filesystems that supply a prototype options table are handled
1558 1560 * earlier in this function.
1559 1561 */
1560 1562 if (uap->flags & MS_OPTIONSTR) {
1561 1563 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1562 1564 mntopts_t tmp_mntopts;
1563 1565
1564 1566 tmp_mntopts.mo_count = 0;
1565 1567 vfs_createopttbl_extend(&tmp_mntopts, inargs,
1566 1568 &mnt_mntopts);
1567 1569 vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1568 1570 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1569 1571 vfs_freeopttbl(&tmp_mntopts);
1570 1572 }
1571 1573 }
1572 1574
1573 1575 /*
1574 1576 * Serialize with zone state transitions.
1575 1577 * See vfs_list_add; zone mounted into is:
1576 1578 * zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1577 1579 * not the zone doing the mount (curproc->p_zone), but if we're already
1578 1580 * inside a NGZ, then we know what zone we are.
1579 1581 */
1580 1582 if (INGLOBALZONE(curproc)) {
1581 1583 zone = zone_find_by_path(mountpt);
1582 1584 ASSERT(zone != NULL);
1583 1585 } else {
1584 1586 zone = curproc->p_zone;
1585 1587 /*
1586 1588 * zone_find_by_path does a hold, so do one here too so that
1587 1589 * we can do a zone_rele after mount_completed.
1588 1590 */
1589 1591 zone_hold(zone);
1590 1592 }
1591 1593 mount_in_progress(zone);
1592 1594 /*
1593 1595 * Instantiate (or reinstantiate) the file system. If appropriate,
1594 1596 * splice it into the file system name space.
1595 1597 *
1596 1598 * We want VFS_MOUNT() to be able to override the vfs_resource
1597 1599 * string if necessary (ie, mntfs), and also for a remount to
1598 1600 * change the same (necessary when remounting '/' during boot).
1599 1601 * So we set up vfs_mntpt and vfs_resource to what we think they
1600 1602 * should be, then hand off control to VFS_MOUNT() which can
1601 1603 * override this.
1602 1604 *
1603 1605 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1604 1606 * a vfs which is on the vfs list (i.e. during a remount), we must
1605 1607 * never set those fields to NULL. Several bits of code make
1606 1608 * assumptions that the fields are always valid.
1607 1609 */
1608 1610 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1609 1611 if (remount) {
1610 1612 if ((oldresource = vfsp->vfs_resource) != NULL)
1611 1613 refstr_hold(oldresource);
1612 1614 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1613 1615 refstr_hold(oldmntpt);
1614 1616 }
1615 1617 vfs_setresource(vfsp, resource, 0);
1616 1618 vfs_setmntpoint(vfsp, mountpt, 0);
1617 1619
1618 1620 /*
1619 1621 * going to mount on this vnode, so notify.
1620 1622 */
1621 1623 vnevent_mountedover(vp, NULL);
1622 1624 error = VFS_MOUNT(vfsp, vp, uap, credp);
1623 1625
1624 1626 if (uap->flags & MS_RDONLY)
1625 1627 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1626 1628 if (uap->flags & MS_NOSUID)
1627 1629 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1628 1630 if (uap->flags & MS_GLOBAL)
1629 1631 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1630 1632
1631 1633 if (error) {
1632 1634 lofi_remove(vfsp);
1633 1635
1634 1636 if (remount) {
1635 1637 /* put back pre-remount options */
1636 1638 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1637 1639 vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1638 1640 VFSSP_VERBATIM);
1639 1641 if (oldmntpt)
1640 1642 refstr_rele(oldmntpt);
1641 1643 vfs_setresource(vfsp, refstr_value(oldresource),
1642 1644 VFSSP_VERBATIM);
1643 1645 if (oldresource)
1644 1646 refstr_rele(oldresource);
1645 1647 vfsp->vfs_flag = ovflags;
1646 1648 vfs_unlock(vfsp);
1647 1649 VFS_RELE(vfsp);
1648 1650 } else {
1649 1651 vfs_unlock(vfsp);
1650 1652 vfs_freemnttab(vfsp);
1651 1653 vfs_free(vfsp);
1652 1654 }
1653 1655 } else {
1654 1656 /*
1655 1657 * Set the mount time to now
1656 1658 */
1657 1659 vfsp->vfs_mtime = ddi_get_time();
1658 1660 if (remount) {
1659 1661 vfsp->vfs_flag &= ~VFS_REMOUNT;
1660 1662 if (oldresource)
1661 1663 refstr_rele(oldresource);
1662 1664 if (oldmntpt)
1663 1665 refstr_rele(oldmntpt);
1664 1666 } else if (splice) {
1665 1667 /*
1666 1668 * Link vfsp into the name space at the mount
1667 1669 * point. Vfs_add() is responsible for
1668 1670 * holding the mount point which will be
1669 1671 * released when vfs_remove() is called.
1670 1672 */
1671 1673 vfs_add(vp, vfsp, uap->flags);
1672 1674 } else {
1673 1675 /*
1674 1676 * Hold the reference to file system which is
1675 1677 * not linked into the name space.
1676 1678 */
1677 1679 vfsp->vfs_zone = NULL;
1678 1680 VFS_HOLD(vfsp);
1679 1681 vfsp->vfs_vnodecovered = NULL;
1680 1682 }
1681 1683 /*
1682 1684 * Set flags for global options encountered
1683 1685 */
1684 1686 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1685 1687 vfsp->vfs_flag |= VFS_RDONLY;
1686 1688 else
1687 1689 vfsp->vfs_flag &= ~VFS_RDONLY;
1688 1690 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1689 1691 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1690 1692 } else {
1691 1693 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1692 1694 vfsp->vfs_flag |= VFS_NODEVICES;
1693 1695 else
1694 1696 vfsp->vfs_flag &= ~VFS_NODEVICES;
1695 1697 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1696 1698 vfsp->vfs_flag |= VFS_NOSETUID;
1697 1699 else
1698 1700 vfsp->vfs_flag &= ~VFS_NOSETUID;
1699 1701 }
1700 1702 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1701 1703 vfsp->vfs_flag |= VFS_NBMAND;
1702 1704 else
1703 1705 vfsp->vfs_flag &= ~VFS_NBMAND;
1704 1706
1705 1707 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1706 1708 vfsp->vfs_flag |= VFS_XATTR;
1707 1709 else
1708 1710 vfsp->vfs_flag &= ~VFS_XATTR;
1709 1711
1710 1712 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1711 1713 vfsp->vfs_flag |= VFS_NOEXEC;
1712 1714 else
1713 1715 vfsp->vfs_flag &= ~VFS_NOEXEC;
1714 1716
1715 1717 /*
1716 1718 * Now construct the output option string of options
1717 1719 * we recognized.
1718 1720 */
1719 1721 if (uap->flags & MS_OPTIONSTR) {
1720 1722 vfs_list_read_lock();
1721 1723 copyout_error = vfs_buildoptionstr(
1722 1724 &vfsp->vfs_mntopts, inargs, optlen);
1723 1725 vfs_list_unlock();
1724 1726 if (copyout_error == 0 &&
1725 1727 (uap->flags & MS_SYSSPACE) == 0) {
1726 1728 copyout_error = copyoutstr(inargs, opts,
1727 1729 optlen, NULL);
1728 1730 }
1729 1731 }
1730 1732
1731 1733 /*
1732 1734 * If this isn't a remount, set up the vopstats before
1733 1735 * anyone can touch this. We only allow spliced file
1734 1736 * systems (file systems which are in the namespace) to
1735 1737 * have the VFS_STATS flag set.
1736 1738 * NOTE: PxFS mounts the underlying file system with
1737 1739 * MS_NOSPLICE set and copies those vfs_flags to its private
1738 1740 * vfs structure. As a result, PxFS should never have
1739 1741 * the VFS_STATS flag or else we might access the vfs
1740 1742 * statistics-related fields prior to them being
1741 1743 * properly initialized.
1742 1744 */
1743 1745 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1744 1746 initialize_vopstats(&vfsp->vfs_vopstats);
1745 1747 /*
1746 1748 * We need to set vfs_vskap to NULL because there's
1747 1749 * a chance it won't be set below. This is checked
1748 1750 * in teardown_vopstats() so we can't have garbage.
1749 1751 */
1750 1752 vfsp->vfs_vskap = NULL;
1751 1753 vfsp->vfs_flag |= VFS_STATS;
1752 1754 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1753 1755 }
1754 1756
1755 1757 if (vswp->vsw_flag & VSW_XID)
1756 1758 vfsp->vfs_flag |= VFS_XID;
1757 1759
1758 1760 vfs_unlock(vfsp);
1759 1761 }
1760 1762 mount_completed(zone);
1761 1763 zone_rele(zone);
1762 1764 if (splice)
1763 1765 vn_vfsunlock(vp);
1764 1766
1765 1767 if ((error == 0) && (copyout_error == 0)) {
1766 1768 if (!remount) {
1767 1769 /*
1768 1770 * Don't call get_vskstat_anchor() while holding
1769 1771 * locks since it allocates memory and calls
1770 1772 * VFS_STATVFS(). For NFS, the latter can generate
1771 1773 * an over-the-wire call.
1772 1774 */
1773 1775 vskap = get_vskstat_anchor(vfsp);
1774 1776 /* Only take the lock if we have something to do */
1775 1777 if (vskap != NULL) {
1776 1778 vfs_lock_wait(vfsp);
1777 1779 if (vfsp->vfs_flag & VFS_STATS) {
1778 1780 vfsp->vfs_vskap = vskap;
1779 1781 }
1780 1782 vfs_unlock(vfsp);
1781 1783 }
1782 1784 }
1783 1785 /* Return vfsp to caller. */
1784 1786 *vfspp = vfsp;
1785 1787 }
/*
 * Common exit: release everything acquired above. Early failures jump
 * here directly; the success path falls through.
 */
1786 1788 errout:
1787 1789 vfs_freeopttbl(&mnt_mntopts);
1788 1790 if (resource != NULL)
1789 1791 kmem_free(resource, strlen(resource) + 1);
1790 1792 if (mountpt != NULL)
1791 1793 kmem_free(mountpt, strlen(mountpt) + 1);
1792 1794 /*
1793 1795 * It is possible we errored prior to adding to mount in progress
1794 1796 * table. Must free vnode we acquired with successful lookupname.
1795 1797 */
1796 1798 if (addmip)
1797 1799 VN_RELE(bvp);
1798 1800 if (delmip)
1799 1801 vfs_delmip(vfsp);
1800 1802 ASSERT(vswp != NULL);
1801 1803 vfs_unrefvfssw(vswp);
/* inargs differs from opts only when it was kmem_alloc'ed above. */
1802 1804 if (inargs != opts)
1803 1805 kmem_free(inargs, MAX_MNTOPT_STR);
/* The mount itself succeeded but the option string could not be returned. */
1804 1806 if (copyout_error) {
1805 1807 lofi_remove(vfsp);
1806 1808 VFS_RELE(vfsp);
1807 1809 error = copyout_error;
1808 1810 }
1809 1811 return (error);
1810 1812 }
1811 1813
1812 1814 static void
1813 1815 vfs_setpath(
1814 1816 struct vfs *vfsp, /* vfs being updated */
1815 1817 refstr_t **refp, /* Ref-count string to contain the new path */
1816 1818 const char *newpath, /* Path to add to refp (above) */
1817 1819 uint32_t flag) /* flag */
1818 1820 {
1819 1821 size_t len;
1820 1822 refstr_t *ref;
1821 1823 zone_t *zone = curproc->p_zone;
1822 1824 char *sp;
1823 1825 int have_list_lock = 0;
1824 1826
1825 1827 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1826 1828
1827 1829 /*
1828 1830 * New path must be less than MAXPATHLEN because mntfs
1829 1831 * will only display up to MAXPATHLEN bytes. This is currently
1830 1832 * safe, because domount() uses pn_get(), and other callers
1831 1833 * similarly cap the size to fewer than MAXPATHLEN bytes.
1832 1834 */
1833 1835
1834 1836 ASSERT(strlen(newpath) < MAXPATHLEN);
1835 1837
1836 1838 /* mntfs requires consistency while vfs list lock is held */
1837 1839
1838 1840 if (VFS_ON_LIST(vfsp)) {
1839 1841 have_list_lock = 1;
1840 1842 vfs_list_lock();
1841 1843 }
1842 1844
1843 1845 if (*refp != NULL)
1844 1846 refstr_rele(*refp);
1845 1847
1846 1848 /*
1847 1849 * If we are in a non-global zone then we prefix the supplied path,
1848 1850 * newpath, with the zone's root path, with two exceptions. The first
1849 1851 * is where we have been explicitly directed to avoid doing so; this
1850 1852 * will be the case following a failed remount, where the path supplied
1851 1853 * will be a saved version which must now be restored. The second
1852 1854 * exception is where newpath is not a pathname but a descriptive name,
1853 1855 * e.g. "procfs".
1854 1856 */
1855 1857 if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1856 1858 ref = refstr_alloc(newpath);
1857 1859 goto out;
1858 1860 }
1859 1861
1860 1862 /*
1861 1863 * Truncate the trailing '/' in the zoneroot, and merge
1862 1864 * in the zone's rootpath with the "newpath" (resource
1863 1865 * or mountpoint) passed in.
1864 1866 *
1865 1867 * The size of the required buffer is thus the size of
1866 1868 * the buffer required for the passed-in newpath
1867 1869 * (strlen(newpath) + 1), plus the size of the buffer
1868 1870 * required to hold zone_rootpath (zone_rootpathlen)
1869 1871 * minus one for one of the now-superfluous NUL
1870 1872 * terminations, minus one for the trailing '/'.
1871 1873 *
1872 1874 * That gives us:
1873 1875 *
1874 1876 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1875 1877 *
1876 1878 * Which is what we have below.
1877 1879 */
1878 1880
1879 1881 len = strlen(newpath) + zone->zone_rootpathlen - 1;
1880 1882 sp = kmem_alloc(len, KM_SLEEP);
1881 1883
1882 1884 /*
1883 1885 * Copy everything including the trailing slash, which
1884 1886 * we then overwrite with the NUL character.
1885 1887 */
1886 1888
1887 1889 (void) strcpy(sp, zone->zone_rootpath);
1888 1890 sp[zone->zone_rootpathlen - 2] = '\0';
1889 1891 (void) strcat(sp, newpath);
1890 1892
1891 1893 ref = refstr_alloc(sp);
1892 1894 kmem_free(sp, len);
1893 1895 out:
1894 1896 *refp = ref;
1895 1897
1896 1898 if (have_list_lock) {
1897 1899 vfs_mnttab_modtimeupd();
1898 1900 vfs_list_unlock();
1899 1901 }
1900 1902 }
1901 1903
1902 1904 /*
1903 1905 * Record a mounted resource name in a vfs structure.
1904 1906 * If vfsp is already mounted, caller must hold the vfs lock.
1905 1907 */
1906 1908 void
1907 1909 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1908 1910 {
1909 1911 if (resource == NULL || resource[0] == '\0')
1910 1912 resource = VFS_NORESOURCE;
1911 1913 vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1912 1914 }
1913 1915
1914 1916 /*
1915 1917 * Record a mount point name in a vfs structure.
1916 1918 * If vfsp is already mounted, caller must hold the vfs lock.
1917 1919 */
1918 1920 void
1919 1921 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1920 1922 {
1921 1923 if (mntpt == NULL || mntpt[0] == '\0')
1922 1924 mntpt = VFS_NOMNTPT;
1923 1925 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1924 1926 }
1925 1927
1926 1928 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1927 1929
1928 1930 refstr_t *
1929 1931 vfs_getresource(const struct vfs *vfsp)
1930 1932 {
1931 1933 refstr_t *resource;
1932 1934
1933 1935 vfs_list_read_lock();
1934 1936 resource = vfsp->vfs_resource;
1935 1937 refstr_hold(resource);
1936 1938 vfs_list_unlock();
1937 1939
1938 1940 return (resource);
1939 1941 }
1940 1942
1941 1943 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1942 1944
1943 1945 refstr_t *
1944 1946 vfs_getmntpoint(const struct vfs *vfsp)
1945 1947 {
1946 1948 refstr_t *mntpt;
1947 1949
1948 1950 vfs_list_read_lock();
1949 1951 mntpt = vfsp->vfs_mntpt;
1950 1952 refstr_hold(mntpt);
1951 1953 vfs_list_unlock();
1952 1954
1953 1955 return (mntpt);
1954 1956 }
1955 1957
1956 1958 /*
1957 1959 * Create an empty options table with enough empty slots to hold all
1958 1960 * The options in the options string passed as an argument.
1959 1961 * Potentially prepend another options table.
1960 1962 *
1961 1963 * Note: caller is responsible for locking the vfs list, if needed,
1962 1964 * to protect mops.
1963 1965 */
1964 1966 static void
1965 1967 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1966 1968 const mntopts_t *mtmpl)
1967 1969 {
1968 1970 const char *s = opts;
1969 1971 uint_t count;
1970 1972
1971 1973 if (opts == NULL || *opts == '\0') {
1972 1974 count = 0;
1973 1975 } else {
1974 1976 count = 1;
1975 1977
1976 1978 /*
1977 1979 * Count number of options in the string
1978 1980 */
1979 1981 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1980 1982 count++;
1981 1983 s++;
1982 1984 }
1983 1985 }
1984 1986 vfs_copyopttbl_extend(mtmpl, mops, count);
1985 1987 }
1986 1988
1987 1989 /*
1988 1990 * Create an empty options table with enough empty slots to hold all
1989 1991 * The options in the options string passed as an argument.
1990 1992 *
1991 1993 * This function is *not* for general use by filesystems.
1992 1994 *
1993 1995 * Note: caller is responsible for locking the vfs list, if needed,
1994 1996 * to protect mops.
1995 1997 */
1996 1998 void
1997 1999 vfs_createopttbl(mntopts_t *mops, const char *opts)
1998 2000 {
1999 2001 vfs_createopttbl_extend(mops, opts, NULL);
2000 2002 }
2001 2003
2002 2004
2003 2005 /*
2004 2006 * Swap two mount options tables
2005 2007 */
2006 2008 static void
2007 2009 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2008 2010 {
2009 2011 uint_t tmpcnt;
2010 2012 mntopt_t *tmplist;
2011 2013
2012 2014 tmpcnt = optbl2->mo_count;
2013 2015 tmplist = optbl2->mo_list;
2014 2016 optbl2->mo_count = optbl1->mo_count;
2015 2017 optbl2->mo_list = optbl1->mo_list;
2016 2018 optbl1->mo_count = tmpcnt;
2017 2019 optbl1->mo_list = tmplist;
2018 2020 }
2019 2021
2020 2022 static void
2021 2023 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2022 2024 {
2023 2025 vfs_list_lock();
2024 2026 vfs_swapopttbl_nolock(optbl1, optbl2);
2025 2027 vfs_mnttab_modtimeupd();
2026 2028 vfs_list_unlock();
2027 2029 }
2028 2030
2029 2031 static char **
2030 2032 vfs_copycancelopt_extend(char **const moc, int extend)
2031 2033 {
2032 2034 int i = 0;
2033 2035 int j;
2034 2036 char **result;
2035 2037
2036 2038 if (moc != NULL) {
2037 2039 for (; moc[i] != NULL; i++)
2038 2040 /* count number of options to cancel */;
2039 2041 }
2040 2042
2041 2043 if (i + extend == 0)
2042 2044 return (NULL);
2043 2045
2044 2046 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2045 2047
2046 2048 for (j = 0; j < i; j++) {
2047 2049 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2048 2050 (void) strcpy(result[j], moc[j]);
2049 2051 }
2050 2052 for (; j <= i + extend; j++)
2051 2053 result[j] = NULL;
2052 2054
2053 2055 return (result);
2054 2056 }
2055 2057
2056 2058 static void
2057 2059 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2058 2060 {
2059 2061 char *sp, *dp;
2060 2062
2061 2063 d->mo_flags = s->mo_flags;
2062 2064 d->mo_data = s->mo_data;
2063 2065 sp = s->mo_name;
2064 2066 if (sp != NULL) {
2065 2067 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2066 2068 (void) strcpy(dp, sp);
2067 2069 d->mo_name = dp;
2068 2070 } else {
2069 2071 d->mo_name = NULL; /* should never happen */
2070 2072 }
2071 2073
2072 2074 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2073 2075
2074 2076 sp = s->mo_arg;
2075 2077 if (sp != NULL) {
2076 2078 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2077 2079 (void) strcpy(dp, sp);
2078 2080 d->mo_arg = dp;
2079 2081 } else {
2080 2082 d->mo_arg = NULL;
2081 2083 }
2082 2084 }
2083 2085
2084 2086 /*
2085 2087 * Copy a mount options table, possibly allocating some spare
2086 2088 * slots at the end. It is permissible to copy_extend the NULL table.
2087 2089 */
2088 2090 static void
2089 2091 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2090 2092 {
2091 2093 uint_t i, count;
2092 2094 mntopt_t *motbl;
2093 2095
2094 2096 /*
2095 2097 * Clear out any existing stuff in the options table being initialized
2096 2098 */
2097 2099 vfs_freeopttbl(dmo);
2098 2100 count = (smo == NULL) ? 0 : smo->mo_count;
2099 2101 if ((count + extra) == 0) /* nothing to do */
2100 2102 return;
2101 2103 dmo->mo_count = count + extra;
2102 2104 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2103 2105 dmo->mo_list = motbl;
2104 2106 for (i = 0; i < count; i++) {
2105 2107 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2106 2108 }
2107 2109 for (i = count; i < count + extra; i++) {
2108 2110 motbl[i].mo_flags = MO_EMPTY;
2109 2111 }
2110 2112 }
2111 2113
2112 2114 /*
2113 2115 * Copy a mount options table.
2114 2116 *
2115 2117 * This function is *not* for general use by filesystems.
2116 2118 *
2117 2119 * Note: caller is responsible for locking the vfs list, if needed,
2118 2120 * to protect smo and dmo.
2119 2121 */
2120 2122 void
2121 2123 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2122 2124 {
2123 2125 vfs_copyopttbl_extend(smo, dmo, 0);
2124 2126 }
2125 2127
2126 2128 static char **
2127 2129 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2128 2130 {
2129 2131 int c1 = 0;
2130 2132 int c2 = 0;
2131 2133 char **result;
2132 2134 char **sp1, **sp2, **dp;
2133 2135
2134 2136 /*
2135 2137 * First we count both lists of cancel options.
2136 2138 * If either is NULL or has no elements, we return a copy of
2137 2139 * the other.
2138 2140 */
2139 2141 if (mop1->mo_cancel != NULL) {
2140 2142 for (; mop1->mo_cancel[c1] != NULL; c1++)
2141 2143 /* count cancel options in mop1 */;
2142 2144 }
2143 2145
2144 2146 if (c1 == 0)
2145 2147 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2146 2148
2147 2149 if (mop2->mo_cancel != NULL) {
2148 2150 for (; mop2->mo_cancel[c2] != NULL; c2++)
2149 2151 /* count cancel options in mop2 */;
2150 2152 }
2151 2153
2152 2154 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2153 2155
2154 2156 if (c2 == 0)
2155 2157 return (result);
2156 2158
2157 2159 /*
2158 2160 * When we get here, we've got two sets of cancel options;
2159 2161 * we need to merge the two sets. We know that the result
2160 2162 * array has "c1+c2+1" entries and in the end we might shrink
2161 2163 * it.
2162 2164 * Result now has a copy of the c1 entries from mop1; we'll
2163 2165 * now lookup all the entries of mop2 in mop1 and copy it if
2164 2166 * it is unique.
2165 2167 * This operation is O(n^2) but it's only called once per
2166 2168 * filesystem per duplicate option. This is a situation
2167 2169 * which doesn't arise with the filesystems in ON and
2168 2170 * n is generally 1.
2169 2171 */
2170 2172
2171 2173 dp = &result[c1];
2172 2174 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2173 2175 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2174 2176 if (strcmp(*sp1, *sp2) == 0)
2175 2177 break;
2176 2178 }
2177 2179 if (*sp1 == NULL) {
2178 2180 /*
2179 2181 * Option *sp2 not found in mop1, so copy it.
2180 2182 * The calls to vfs_copycancelopt_extend()
2181 2183 * guarantee that there's enough room.
2182 2184 */
2183 2185 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2184 2186 (void) strcpy(*dp++, *sp2);
2185 2187 }
2186 2188 }
2187 2189 if (dp != &result[c1+c2]) {
2188 2190 size_t bytes = (dp - result + 1) * sizeof (char *);
2189 2191 char **nres = kmem_alloc(bytes, KM_SLEEP);
2190 2192
2191 2193 bcopy(result, nres, bytes);
2192 2194 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2193 2195 result = nres;
2194 2196 }
2195 2197 return (result);
2196 2198 }
2197 2199
2198 2200 /*
2199 2201 * Merge two mount option tables (outer and inner) into one. This is very
2200 2202 * similar to "merging" global variables and automatic variables in C.
2201 2203 *
2202 2204 * This isn't (and doesn't have to be) fast.
2203 2205 *
2204 2206 * This function is *not* for general use by filesystems.
2205 2207 *
2206 2208 * Note: caller is responsible for locking the vfs list, if needed,
2207 2209 * to protect omo, imo & dmo.
2208 2210 */
2209 2211 void
2210 2212 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2211 2213 {
2212 2214 uint_t i, count;
2213 2215 mntopt_t *mop, *motbl;
2214 2216 uint_t freeidx;
2215 2217
2216 2218 /*
2217 2219 * First determine how much space we need to allocate.
2218 2220 */
2219 2221 count = omo->mo_count;
2220 2222 for (i = 0; i < imo->mo_count; i++) {
2221 2223 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2222 2224 continue;
2223 2225 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2224 2226 count++;
2225 2227 }
2226 2228 ASSERT(count >= omo->mo_count &&
2227 2229 count <= omo->mo_count + imo->mo_count);
2228 2230 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2229 2231 for (i = 0; i < omo->mo_count; i++)
2230 2232 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2231 2233 freeidx = omo->mo_count;
2232 2234 for (i = 0; i < imo->mo_count; i++) {
2233 2235 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2234 2236 continue;
2235 2237 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2236 2238 char **newcanp;
2237 2239 uint_t index = mop - omo->mo_list;
2238 2240
2239 2241 newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2240 2242
2241 2243 vfs_freeopt(&motbl[index]);
2242 2244 vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2243 2245
2244 2246 vfs_freecancelopt(motbl[index].mo_cancel);
2245 2247 motbl[index].mo_cancel = newcanp;
2246 2248 } else {
2247 2249 /*
2248 2250 * If it's a new option, just copy it over to the first
2249 2251 * free location.
2250 2252 */
2251 2253 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2252 2254 }
2253 2255 }
2254 2256 dmo->mo_count = count;
2255 2257 dmo->mo_list = motbl;
2256 2258 }
2257 2259
2258 2260 /*
2259 2261 * Functions to set and clear mount options in a mount options table.
2260 2262 */
2261 2263
2262 2264 /*
2263 2265 * Clear a mount option, if it exists.
2264 2266 *
2265 2267 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2266 2268 * the vfs list.
2267 2269 */
2268 2270 static void
2269 2271 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2270 2272 {
2271 2273 struct mntopt *mop;
2272 2274 uint_t i, count;
2273 2275
2274 2276 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2275 2277
2276 2278 count = mops->mo_count;
2277 2279 for (i = 0; i < count; i++) {
2278 2280 mop = &mops->mo_list[i];
2279 2281
2280 2282 if (mop->mo_flags & MO_EMPTY)
2281 2283 continue;
2282 2284 if (strcmp(opt, mop->mo_name))
2283 2285 continue;
2284 2286 mop->mo_flags &= ~MO_SET;
2285 2287 if (mop->mo_arg != NULL) {
2286 2288 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2287 2289 }
2288 2290 mop->mo_arg = NULL;
2289 2291 if (update_mnttab)
2290 2292 vfs_mnttab_modtimeupd();
2291 2293 break;
2292 2294 }
2293 2295 }
2294 2296
2295 2297 void
2296 2298 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2297 2299 {
2298 2300 int gotlock = 0;
2299 2301
2300 2302 if (VFS_ON_LIST(vfsp)) {
2301 2303 gotlock = 1;
2302 2304 vfs_list_lock();
2303 2305 }
2304 2306 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2305 2307 if (gotlock)
2306 2308 vfs_list_unlock();
2307 2309 }
2308 2310
2309 2311
2310 2312 /*
2311 2313 * Set a mount option on. If it's not found in the table, it's silently
2312 2314 * ignored. If the option has MO_IGNORE set, it is still set unless the
2313 2315 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag
2314 2316 * bits can be used to toggle the MO_NODISPLAY bit for the option.
2315 2317 * If the VFS_CREATEOPT flag bit is set then the first option slot with
2316 2318 * MO_EMPTY set is created as the option passed in.
2317 2319 *
2318 2320 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2319 2321 * the vfs list.
2320 2322 */
2321 2323 static void
2322 2324 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2323 2325 const char *arg, int flags, int update_mnttab)
2324 2326 {
2325 2327 mntopt_t *mop;
2326 2328 uint_t i, count;
2327 2329 char *sp;
2328 2330
2329 2331 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2330 2332
2331 2333 if (flags & VFS_CREATEOPT) {
2332 2334 if (vfs_hasopt(mops, opt) != NULL) {
2333 2335 flags &= ~VFS_CREATEOPT;
2334 2336 }
2335 2337 }
2336 2338 count = mops->mo_count;
2337 2339 for (i = 0; i < count; i++) {
2338 2340 mop = &mops->mo_list[i];
2339 2341
2340 2342 if (mop->mo_flags & MO_EMPTY) {
2341 2343 if ((flags & VFS_CREATEOPT) == 0)
2342 2344 continue;
2343 2345 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2344 2346 (void) strcpy(sp, opt);
2345 2347 mop->mo_name = sp;
2346 2348 if (arg != NULL)
2347 2349 mop->mo_flags = MO_HASVALUE;
2348 2350 else
2349 2351 mop->mo_flags = 0;
2350 2352 } else if (strcmp(opt, mop->mo_name)) {
2351 2353 continue;
2352 2354 }
2353 2355 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2354 2356 break;
2355 2357 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2356 2358 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2357 2359 (void) strcpy(sp, arg);
2358 2360 } else {
2359 2361 sp = NULL;
2360 2362 }
2361 2363 if (mop->mo_arg != NULL)
2362 2364 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2363 2365 mop->mo_arg = sp;
2364 2366 if (flags & VFS_DISPLAY)
2365 2367 mop->mo_flags &= ~MO_NODISPLAY;
2366 2368 if (flags & VFS_NODISPLAY)
2367 2369 mop->mo_flags |= MO_NODISPLAY;
2368 2370 mop->mo_flags |= MO_SET;
2369 2371 if (mop->mo_cancel != NULL) {
2370 2372 char **cp;
2371 2373
2372 2374 for (cp = mop->mo_cancel; *cp != NULL; cp++)
2373 2375 vfs_clearmntopt_nolock(mops, *cp, 0);
2374 2376 }
2375 2377 if (update_mnttab)
2376 2378 vfs_mnttab_modtimeupd();
2377 2379 break;
2378 2380 }
2379 2381 }
2380 2382
2381 2383 void
2382 2384 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2383 2385 {
2384 2386 int gotlock = 0;
2385 2387
2386 2388 if (VFS_ON_LIST(vfsp)) {
2387 2389 gotlock = 1;
2388 2390 vfs_list_lock();
2389 2391 }
2390 2392 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2391 2393 if (gotlock)
2392 2394 vfs_list_unlock();
2393 2395 }
2394 2396
2395 2397
2396 2398 /*
2397 2399 * Add a "tag" option to a mounted file system's options list.
2398 2400 *
2399 2401 * Note: caller is responsible for locking the vfs list, if needed,
2400 2402 * to protect mops.
2401 2403 */
2402 2404 static mntopt_t *
2403 2405 vfs_addtag(mntopts_t *mops, const char *tag)
2404 2406 {
2405 2407 uint_t count;
2406 2408 mntopt_t *mop, *motbl;
2407 2409
2408 2410 count = mops->mo_count + 1;
2409 2411 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2410 2412 if (mops->mo_count) {
2411 2413 size_t len = (count - 1) * sizeof (mntopt_t);
2412 2414
2413 2415 bcopy(mops->mo_list, motbl, len);
2414 2416 kmem_free(mops->mo_list, len);
2415 2417 }
2416 2418 mops->mo_count = count;
2417 2419 mops->mo_list = motbl;
2418 2420 mop = &motbl[count - 1];
2419 2421 mop->mo_flags = MO_TAG;
2420 2422 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2421 2423 (void) strcpy(mop->mo_name, tag);
2422 2424 return (mop);
2423 2425 }
2424 2426
2425 2427 /*
2426 2428 * Allow users to set arbitrary "tags" in a vfs's mount options.
2427 2429 * Broader use within the kernel is discouraged.
2428 2430 */
2429 2431 int
2430 2432 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2431 2433 cred_t *cr)
2432 2434 {
2433 2435 vfs_t *vfsp;
2434 2436 mntopts_t *mops;
2435 2437 mntopt_t *mop;
2436 2438 int found = 0;
2437 2439 dev_t dev = makedevice(major, minor);
2438 2440 int err = 0;
2439 2441 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2440 2442
2441 2443 /*
2442 2444 * Find the desired mounted file system
2443 2445 */
2444 2446 vfs_list_lock();
2445 2447 vfsp = rootvfs;
2446 2448 do {
2447 2449 if (vfsp->vfs_dev == dev &&
2448 2450 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2449 2451 found = 1;
2450 2452 break;
2451 2453 }
2452 2454 vfsp = vfsp->vfs_next;
2453 2455 } while (vfsp != rootvfs);
2454 2456
2455 2457 if (!found) {
2456 2458 err = EINVAL;
2457 2459 goto out;
2458 2460 }
2459 2461 err = secpolicy_fs_config(cr, vfsp);
2460 2462 if (err != 0)
2461 2463 goto out;
2462 2464
2463 2465 mops = &vfsp->vfs_mntopts;
2464 2466 /*
2465 2467 * Add tag if it doesn't already exist
2466 2468 */
2467 2469 if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2468 2470 int len;
2469 2471
2470 2472 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2471 2473 len = strlen(buf);
2472 2474 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2473 2475 err = ENAMETOOLONG;
2474 2476 goto out;
2475 2477 }
2476 2478 mop = vfs_addtag(mops, tag);
2477 2479 }
2478 2480 if ((mop->mo_flags & MO_TAG) == 0) {
2479 2481 err = EINVAL;
2480 2482 goto out;
2481 2483 }
2482 2484 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2483 2485 out:
2484 2486 vfs_list_unlock();
2485 2487 kmem_free(buf, MAX_MNTOPT_STR);
2486 2488 return (err);
2487 2489 }
2488 2490
2489 2491 /*
2490 2492 * Allow users to remove arbitrary "tags" in a vfs's mount options.
2491 2493 * Broader use within the kernel is discouraged.
2492 2494 */
2493 2495 int
2494 2496 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2495 2497 cred_t *cr)
2496 2498 {
2497 2499 vfs_t *vfsp;
2498 2500 mntopt_t *mop;
2499 2501 int found = 0;
2500 2502 dev_t dev = makedevice(major, minor);
2501 2503 int err = 0;
2502 2504
2503 2505 /*
2504 2506 * Find the desired mounted file system
2505 2507 */
2506 2508 vfs_list_lock();
2507 2509 vfsp = rootvfs;
2508 2510 do {
2509 2511 if (vfsp->vfs_dev == dev &&
2510 2512 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2511 2513 found = 1;
2512 2514 break;
2513 2515 }
2514 2516 vfsp = vfsp->vfs_next;
2515 2517 } while (vfsp != rootvfs);
2516 2518
2517 2519 if (!found) {
2518 2520 err = EINVAL;
2519 2521 goto out;
2520 2522 }
2521 2523 err = secpolicy_fs_config(cr, vfsp);
2522 2524 if (err != 0)
2523 2525 goto out;
2524 2526
2525 2527 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2526 2528 err = EINVAL;
2527 2529 goto out;
2528 2530 }
2529 2531 if ((mop->mo_flags & MO_TAG) == 0) {
2530 2532 err = EINVAL;
2531 2533 goto out;
2532 2534 }
2533 2535 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2534 2536 out:
2535 2537 vfs_list_unlock();
2536 2538 return (err);
2537 2539 }
2538 2540
2539 2541 /*
2540 2542 * Function to parse an option string and fill in a mount options table.
2541 2543 * Unknown options are silently ignored. The input option string is modified
2542 2544 * by replacing separators with nulls. If the create flag is set, options
2543 2545 * not found in the table are just added on the fly. The table must have
2544 2546 * an option slot marked MO_EMPTY to add an option on the fly.
2545 2547 *
2546 2548 * This function is *not* for general use by filesystems.
2547 2549 *
2548 2550 * Note: caller is responsible for locking the vfs list, if needed,
2549 2551 * to protect mops..
2550 2552 */
2551 2553 void
2552 2554 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2553 2555 {
2554 2556 char *s = osp, *p, *nextop, *valp, *cp, *ep;
2555 2557 int setflg = VFS_NOFORCEOPT;
2556 2558
2557 2559 if (osp == NULL)
2558 2560 return;
2559 2561 while (*s != '\0') {
2560 2562 p = strchr(s, ','); /* find next option */
2561 2563 if (p == NULL) {
2562 2564 cp = NULL;
2563 2565 p = s + strlen(s);
2564 2566 } else {
2565 2567 cp = p; /* save location of comma */
2566 2568 *p++ = '\0'; /* mark end and point to next option */
2567 2569 }
2568 2570 nextop = p;
2569 2571 p = strchr(s, '='); /* look for value */
2570 2572 if (p == NULL) {
2571 2573 valp = NULL; /* no value supplied */
2572 2574 } else {
2573 2575 ep = p; /* save location of equals */
2574 2576 *p++ = '\0'; /* end option and point to value */
2575 2577 valp = p;
2576 2578 }
2577 2579 /*
2578 2580 * set option into options table
2579 2581 */
2580 2582 if (create)
2581 2583 setflg |= VFS_CREATEOPT;
2582 2584 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2583 2585 if (cp != NULL)
2584 2586 *cp = ','; /* restore the comma */
2585 2587 if (valp != NULL)
2586 2588 *ep = '='; /* restore the equals */
2587 2589 s = nextop;
2588 2590 }
2589 2591 }
2590 2592
2591 2593 /*
2592 2594 * Function to inquire if an option exists in a mount options table.
2593 2595 * Returns a pointer to the option if it exists, else NULL.
2594 2596 *
2595 2597 * This function is *not* for general use by filesystems.
2596 2598 *
2597 2599 * Note: caller is responsible for locking the vfs list, if needed,
2598 2600 * to protect mops.
2599 2601 */
2600 2602 struct mntopt *
2601 2603 vfs_hasopt(const mntopts_t *mops, const char *opt)
2602 2604 {
2603 2605 struct mntopt *mop;
2604 2606 uint_t i, count;
2605 2607
2606 2608 count = mops->mo_count;
2607 2609 for (i = 0; i < count; i++) {
2608 2610 mop = &mops->mo_list[i];
2609 2611
2610 2612 if (mop->mo_flags & MO_EMPTY)
2611 2613 continue;
2612 2614 if (strcmp(opt, mop->mo_name) == 0)
2613 2615 return (mop);
2614 2616 }
2615 2617 return (NULL);
2616 2618 }
2617 2619
2618 2620 /*
2619 2621 * Function to inquire if an option is set in a mount options table.
2620 2622 * Returns non-zero if set and fills in the arg pointer with a pointer to
2621 2623 * the argument string or NULL if there is no argument string.
2622 2624 */
2623 2625 static int
2624 2626 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2625 2627 {
2626 2628 struct mntopt *mop;
2627 2629 uint_t i, count;
2628 2630
2629 2631 count = mops->mo_count;
2630 2632 for (i = 0; i < count; i++) {
2631 2633 mop = &mops->mo_list[i];
2632 2634
2633 2635 if (mop->mo_flags & MO_EMPTY)
2634 2636 continue;
2635 2637 if (strcmp(opt, mop->mo_name))
2636 2638 continue;
2637 2639 if ((mop->mo_flags & MO_SET) == 0)
2638 2640 return (0);
2639 2641 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2640 2642 *argp = mop->mo_arg;
2641 2643 return (1);
2642 2644 }
2643 2645 return (0);
2644 2646 }
2645 2647
2646 2648
2647 2649 int
2648 2650 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2649 2651 {
2650 2652 int ret;
2651 2653
2652 2654 vfs_list_read_lock();
2653 2655 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2654 2656 vfs_list_unlock();
2655 2657 return (ret);
2656 2658 }
2657 2659
2658 2660
2659 2661 /*
2660 2662 * Construct a comma separated string of the options set in the given
2661 2663 * mount table, return the string in the given buffer. Return non-zero if
2662 2664 * the buffer would overflow.
2663 2665 *
2664 2666 * This function is *not* for general use by filesystems.
2665 2667 *
2666 2668 * Note: caller is responsible for locking the vfs list, if needed,
2667 2669 * to protect mp.
2668 2670 */
2669 2671 int
2670 2672 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2671 2673 {
2672 2674 char *cp;
2673 2675 uint_t i;
2674 2676
2675 2677 buf[0] = '\0';
2676 2678 cp = buf;
2677 2679 for (i = 0; i < mp->mo_count; i++) {
2678 2680 struct mntopt *mop;
2679 2681
2680 2682 mop = &mp->mo_list[i];
2681 2683 if (mop->mo_flags & MO_SET) {
2682 2684 int optlen, comma = 0;
2683 2685
2684 2686 if (buf[0] != '\0')
2685 2687 comma = 1;
2686 2688 optlen = strlen(mop->mo_name);
2687 2689 if (strlen(buf) + comma + optlen + 1 > len)
2688 2690 goto err;
2689 2691 if (comma)
2690 2692 *cp++ = ',';
2691 2693 (void) strcpy(cp, mop->mo_name);
2692 2694 cp += optlen;
2693 2695 /*
2694 2696 * Append option value if there is one
2695 2697 */
2696 2698 if (mop->mo_arg != NULL) {
2697 2699 int arglen;
2698 2700
2699 2701 arglen = strlen(mop->mo_arg);
2700 2702 if (strlen(buf) + arglen + 2 > len)
2701 2703 goto err;
2702 2704 *cp++ = '=';
2703 2705 (void) strcpy(cp, mop->mo_arg);
2704 2706 cp += arglen;
2705 2707 }
2706 2708 }
2707 2709 }
2708 2710 return (0);
2709 2711 err:
2710 2712 return (EOVERFLOW);
2711 2713 }
2712 2714
/*
 * Free a NULL-terminated array of cancel-option strings: each string,
 * then the array itself (sized as the number of strings plus the
 * terminator).  A NULL array is ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
	int n;

	if (moc == NULL)
		return;

	for (n = 0; moc[n] != NULL; n++)
		kmem_free(moc[n], strlen(moc[n]) + 1);

	kmem_free(moc, (n + 1) * sizeof (char *));
}
2727 2729
2728 2730 static void
2729 2731 vfs_freeopt(mntopt_t *mop)
2730 2732 {
2731 2733 if (mop->mo_name != NULL)
2732 2734 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2733 2735
2734 2736 vfs_freecancelopt(mop->mo_cancel);
2735 2737
2736 2738 if (mop->mo_arg != NULL)
2737 2739 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2738 2740 }
2739 2741
2740 2742 /*
2741 2743 * Free a mount options table
2742 2744 *
2743 2745 * This function is *not* for general use by filesystems.
2744 2746 *
2745 2747 * Note: caller is responsible for locking the vfs list, if needed,
2746 2748 * to protect mp.
2747 2749 */
2748 2750 void
2749 2751 vfs_freeopttbl(mntopts_t *mp)
2750 2752 {
2751 2753 uint_t i, count;
2752 2754
2753 2755 count = mp->mo_count;
2754 2756 for (i = 0; i < count; i++) {
2755 2757 vfs_freeopt(&mp->mo_list[i]);
2756 2758 }
2757 2759 if (count) {
2758 2760 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2759 2761 mp->mo_count = 0;
2760 2762 mp->mo_list = NULL;
2761 2763 }
2762 2764 }
2763 2765
2764 2766
2765 2767 /* ARGSUSED */
2766 2768 static int
2767 2769 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2768 2770 caller_context_t *ct)
2769 2771 {
2770 2772 return (0);
2771 2773 }
2772 2774
2773 2775 /* ARGSUSED */
2774 2776 static int
2775 2777 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2776 2778 caller_context_t *ct)
2777 2779 {
2778 2780 return (0);
2779 2781 }
2780 2782
2781 2783 /*
2782 2784 * The dummy vnode is currently used only by file events notification
2783 2785 * module which is just interested in the timestamps.
2784 2786 */
2785 2787 /* ARGSUSED */
2786 2788 static int
2787 2789 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2788 2790 caller_context_t *ct)
2789 2791 {
2790 2792 bzero(vap, sizeof (vattr_t));
2791 2793 vap->va_type = VREG;
2792 2794 vap->va_nlink = 1;
2793 2795 vap->va_ctime = vfs_mnttab_ctime;
2794 2796 /*
2795 2797 * it is ok to just copy mtime as the time will be monotonically
2796 2798 * increasing.
2797 2799 */
2798 2800 vap->va_mtime = vfs_mnttab_mtime;
2799 2801 vap->va_atime = vap->va_mtime;
2800 2802 return (0);
2801 2803 }
2802 2804
2803 2805 static void
2804 2806 vfs_mnttabvp_setup(void)
2805 2807 {
2806 2808 vnode_t *tvp;
2807 2809 vnodeops_t *vfs_mntdummyvnops;
2808 2810 const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2809 2811 VOPNAME_READ, { .vop_read = vfs_mntdummyread },
2810 2812 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite },
2811 2813 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr },
2812 2814 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
2813 2815 NULL, NULL
2814 2816 };
2815 2817
2816 2818 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2817 2819 &vfs_mntdummyvnops) != 0) {
2818 2820 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2819 2821 /* Shouldn't happen, but not bad enough to panic */
2820 2822 return;
2821 2823 }
2822 2824
2823 2825 /*
2824 2826 * A global dummy vnode is allocated to represent mntfs files.
2825 2827 * The mntfs file (/etc/mnttab) can be monitored for file events
2826 2828 * and receive an event when mnttab changes. Dummy VOP calls
2827 2829 * will be made on this vnode. The file events notification module
2828 2830 * intercepts this vnode and delivers relevant events.
2829 2831 */
2830 2832 tvp = vn_alloc(KM_SLEEP);
2831 2833 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2832 2834 vn_setops(tvp, vfs_mntdummyvnops);
2833 2835 tvp->v_type = VREG;
2834 2836 /*
2835 2837 * The mnt dummy ops do not reference v_data.
2836 2838 * No other module intercepting this vnode should either.
2837 2839 * Just set it to point to itself.
2838 2840 */
2839 2841 tvp->v_data = (caddr_t)tvp;
2840 2842 tvp->v_vfsp = rootvfs;
2841 2843 vfs_mntdummyvp = tvp;
2842 2844 }
2843 2845
2844 2846 /*
2845 2847 * performs fake read/write ops
2846 2848 */
2847 2849 static void
2848 2850 vfs_mnttab_rwop(int rw)
2849 2851 {
2850 2852 struct uio uio;
2851 2853 struct iovec iov;
2852 2854 char buf[1];
2853 2855
2854 2856 if (vfs_mntdummyvp == NULL)
2855 2857 return;
2856 2858
2857 2859 bzero(&uio, sizeof (uio));
2858 2860 bzero(&iov, sizeof (iov));
2859 2861 iov.iov_base = buf;
2860 2862 iov.iov_len = 0;
2861 2863 uio.uio_iov = &iov;
2862 2864 uio.uio_iovcnt = 1;
2863 2865 uio.uio_loffset = 0;
2864 2866 uio.uio_segflg = UIO_SYSSPACE;
2865 2867 uio.uio_resid = 0;
2866 2868 if (rw) {
2867 2869 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2868 2870 } else {
2869 2871 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2870 2872 }
2871 2873 }
2872 2874
/*
 * Generate a (fake) write operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_writeop(void)
{
	const int do_write = 1;

	vfs_mnttab_rwop(do_write);
}
2881 2883
/*
 * Generate a (fake) read operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_readop(void)
{
	const int do_write = 0;

	vfs_mnttab_rwop(do_write);
}
2890 2892
2891 2893 /*
2892 2894 * Free any mnttab information recorded in the vfs struct.
2893 2895 * The vfs must not be on the vfs list.
2894 2896 */
2895 2897 static void
2896 2898 vfs_freemnttab(struct vfs *vfsp)
2897 2899 {
2898 2900 ASSERT(!VFS_ON_LIST(vfsp));
2899 2901
2900 2902 /*
2901 2903 * Free device and mount point information
2902 2904 */
2903 2905 if (vfsp->vfs_mntpt != NULL) {
2904 2906 refstr_rele(vfsp->vfs_mntpt);
2905 2907 vfsp->vfs_mntpt = NULL;
2906 2908 }
2907 2909 if (vfsp->vfs_resource != NULL) {
2908 2910 refstr_rele(vfsp->vfs_resource);
2909 2911 vfsp->vfs_resource = NULL;
2910 2912 }
2911 2913 /*
2912 2914 * Now free mount options information
2913 2915 */
2914 2916 vfs_freeopttbl(&vfsp->vfs_mntopts);
2915 2917 }
2916 2918
2917 2919 /*
2918 2920 * Return the last mnttab modification time
2919 2921 */
2920 2922 void
2921 2923 vfs_mnttab_modtime(timespec_t *ts)
2922 2924 {
2923 2925 ASSERT(RW_LOCK_HELD(&vfslist));
2924 2926 *ts = vfs_mnttab_mtime;
2925 2927 }
2926 2928
2927 2929 /*
2928 2930 * See if mnttab is changed
2929 2931 */
2930 2932 void
2931 2933 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2932 2934 {
2933 2935 int changed;
2934 2936
2935 2937 *phpp = (struct pollhead *)NULL;
2936 2938
2937 2939 /*
2938 2940 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2939 2941 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2940 2942 * to not grab the vfs list lock because tv_sec is monotonically
2941 2943 * increasing.
2942 2944 */
2943 2945
2944 2946 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2945 2947 (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2946 2948 if (!changed) {
2947 2949 *phpp = &vfs_pollhd;
2948 2950 }
2949 2951 }
2950 2952
2951 2953 /* Provide a unique and monotonically-increasing timestamp. */
2952 2954 void
2953 2955 vfs_mono_time(timespec_t *ts)
2954 2956 {
2955 2957 static volatile hrtime_t hrt; /* The saved time. */
2956 2958 hrtime_t newhrt, oldhrt; /* For effecting the CAS. */
2957 2959 timespec_t newts;
2958 2960
2959 2961 /*
2960 2962 * Try gethrestime() first, but be prepared to fabricate a sensible
2961 2963 * answer at the first sign of any trouble.
2962 2964 */
2963 2965 gethrestime(&newts);
2964 2966 newhrt = ts2hrt(&newts);
2965 2967 for (;;) {
2966 2968 oldhrt = hrt;
2967 2969 if (newhrt <= hrt)
2968 2970 newhrt = hrt + 1;
2969 2971 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2970 2972 break;
2971 2973 }
2972 2974 hrt2ts(newhrt, ts);
2973 2975 }
2974 2976
2975 2977 /*
2976 2978 * Update the mnttab modification time and wake up any waiters for
2977 2979 * mnttab changes
2978 2980 */
2979 2981 void
2980 2982 vfs_mnttab_modtimeupd()
2981 2983 {
2982 2984 hrtime_t oldhrt, newhrt;
2983 2985
2984 2986 ASSERT(RW_WRITE_HELD(&vfslist));
2985 2987 oldhrt = ts2hrt(&vfs_mnttab_mtime);
2986 2988 gethrestime(&vfs_mnttab_mtime);
2987 2989 newhrt = ts2hrt(&vfs_mnttab_mtime);
2988 2990 if (oldhrt == (hrtime_t)0)
2989 2991 vfs_mnttab_ctime = vfs_mnttab_mtime;
2990 2992 /*
2991 2993 * Attempt to provide unique mtime (like uniqtime but not).
2992 2994 */
2993 2995 if (newhrt == oldhrt) {
2994 2996 newhrt++;
2995 2997 hrt2ts(newhrt, &vfs_mnttab_mtime);
2996 2998 }
2997 2999 pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2998 3000 vfs_mnttab_writeop();
2999 3001 }
3000 3002
3001 3003 int
3002 3004 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3003 3005 {
3004 3006 vnode_t *coveredvp;
3005 3007 int error;
3006 3008 extern void teardown_vopstats(vfs_t *);
3007 3009
3008 3010 /*
3009 3011 * Get covered vnode. This will be NULL if the vfs is not linked
3010 3012 * into the file system name space (i.e., domount() with MNT_NOSPICE).
3011 3013 */
3012 3014 coveredvp = vfsp->vfs_vnodecovered;
3013 3015 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3014 3016
3015 3017 /*
3016 3018 * Purge all dnlc entries for this vfs.
3017 3019 */
3018 3020 (void) dnlc_purge_vfsp(vfsp, 0);
3019 3021
3020 3022 /* For forcible umount, skip VFS_SYNC() since it may hang */
3021 3023 if ((flag & MS_FORCE) == 0)
3022 3024 (void) VFS_SYNC(vfsp, 0, cr);
3023 3025
3024 3026 /*
3025 3027 * Lock the vfs to maintain fs status quo during unmount. This
3026 3028 * has to be done after the sync because ufs_update tries to acquire
3027 3029 * the vfs_reflock.
3028 3030 */
3029 3031 vfs_lock_wait(vfsp);
3030 3032
3031 3033 if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3032 3034 vfs_unlock(vfsp);
3033 3035 if (coveredvp != NULL)
3034 3036 vn_vfsunlock(coveredvp);
3035 3037 } else if (coveredvp != NULL) {
3036 3038 teardown_vopstats(vfsp);
3037 3039 /*
3038 3040 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3039 3041 * when it frees vfsp so we do a VN_HOLD() so we can
3040 3042 * continue to use coveredvp afterwards.
3041 3043 */
3042 3044 VN_HOLD(coveredvp);
3043 3045 vfs_remove(vfsp);
3044 3046 vn_vfsunlock(coveredvp);
3045 3047 VN_RELE(coveredvp);
3046 3048 } else {
3047 3049 teardown_vopstats(vfsp);
3048 3050 /*
3049 3051 * Release the reference to vfs that is not linked
3050 3052 * into the name space.
3051 3053 */
3052 3054 vfs_unlock(vfsp);
3053 3055 VFS_RELE(vfsp);
3054 3056 }
3055 3057 return (error);
3056 3058 }
3057 3059
3058 3060
3059 3061 /*
3060 3062 * Vfs_unmountall() is called by uadmin() to unmount all
3061 3063 * mounted file systems (except the root file system) during shutdown.
3062 3064 * It follows the existing locking protocol when traversing the vfs list
3063 3065 * to sync and unmount vfses. Even though there should be no
3064 3066 * other thread running while the system is shutting down, it is prudent
3065 3067 * to still follow the locking protocol.
3066 3068 */
3067 3069 void
3068 3070 vfs_unmountall(void)
3069 3071 {
3070 3072 struct vfs *vfsp;
3071 3073 struct vfs *prev_vfsp = NULL;
3072 3074 int error;
3073 3075
3074 3076 /*
3075 3077 * Toss all dnlc entries now so that the per-vfs sync
3076 3078 * and unmount operations don't have to slog through
3077 3079 * a bunch of uninteresting vnodes over and over again.
3078 3080 */
3079 3081 dnlc_purge();
3080 3082
3081 3083 vfs_list_lock();
3082 3084 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3083 3085 prev_vfsp = vfsp->vfs_prev;
3084 3086
3085 3087 if (vfs_lock(vfsp) != 0)
3086 3088 continue;
3087 3089 error = vn_vfswlock(vfsp->vfs_vnodecovered);
3088 3090 vfs_unlock(vfsp);
3089 3091 if (error)
3090 3092 continue;
3091 3093
3092 3094 vfs_list_unlock();
3093 3095
3094 3096 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3095 3097 (void) dounmount(vfsp, 0, CRED());
3096 3098
3097 3099 /*
3098 3100 * Since we dropped the vfslist lock above we must
3099 3101 * verify that next_vfsp still exists, else start over.
3100 3102 */
3101 3103 vfs_list_lock();
3102 3104 for (vfsp = rootvfs->vfs_prev;
3103 3105 vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3104 3106 if (vfsp == prev_vfsp)
3105 3107 break;
3106 3108 if (vfsp == rootvfs && prev_vfsp != rootvfs)
3107 3109 prev_vfsp = rootvfs->vfs_prev;
3108 3110 }
3109 3111 vfs_list_unlock();
3110 3112 }
3111 3113
3112 3114 /*
3113 3115 * Called to add an entry to the end of the vfs mount in progress list
3114 3116 */
3115 3117 void
3116 3118 vfs_addmip(dev_t dev, struct vfs *vfsp)
3117 3119 {
3118 3120 struct ipmnt *mipp;
3119 3121
3120 3122 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3121 3123 mipp->mip_next = NULL;
3122 3124 mipp->mip_dev = dev;
3123 3125 mipp->mip_vfsp = vfsp;
3124 3126 mutex_enter(&vfs_miplist_mutex);
3125 3127 if (vfs_miplist_end != NULL)
3126 3128 vfs_miplist_end->mip_next = mipp;
3127 3129 else
3128 3130 vfs_miplist = mipp;
3129 3131 vfs_miplist_end = mipp;
3130 3132 mutex_exit(&vfs_miplist_mutex);
3131 3133 }
3132 3134
3133 3135 /*
3134 3136 * Called to remove an entry from the mount in progress list
3135 3137 * Either because the mount completed or it failed.
3136 3138 */
3137 3139 void
3138 3140 vfs_delmip(struct vfs *vfsp)
3139 3141 {
3140 3142 struct ipmnt *mipp, *mipprev;
3141 3143
3142 3144 mutex_enter(&vfs_miplist_mutex);
3143 3145 mipprev = NULL;
3144 3146 for (mipp = vfs_miplist;
3145 3147 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3146 3148 mipprev = mipp;
3147 3149 }
3148 3150 if (mipp == NULL)
3149 3151 return; /* shouldn't happen */
3150 3152 if (mipp == vfs_miplist_end)
3151 3153 vfs_miplist_end = mipprev;
3152 3154 if (mipprev == NULL)
3153 3155 vfs_miplist = mipp->mip_next;
3154 3156 else
3155 3157 mipprev->mip_next = mipp->mip_next;
3156 3158 mutex_exit(&vfs_miplist_mutex);
3157 3159 kmem_free(mipp, sizeof (struct ipmnt));
3158 3160 }
3159 3161
3160 3162 /*
3161 3163 * vfs_add is called by a specific filesystem's mount routine to add
3162 3164 * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3163 3165 * The vfs should already have been locked by the caller.
3164 3166 *
3165 3167 * coveredvp is NULL if this is the root.
3166 3168 */
3167 3169 void
3168 3170 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3169 3171 {
3170 3172 int newflag;
3171 3173
3172 3174 ASSERT(vfs_lock_held(vfsp));
3173 3175 VFS_HOLD(vfsp);
3174 3176 newflag = vfsp->vfs_flag;
3175 3177 if (mflag & MS_RDONLY)
3176 3178 newflag |= VFS_RDONLY;
3177 3179 else
3178 3180 newflag &= ~VFS_RDONLY;
3179 3181 if (mflag & MS_NOSUID)
3180 3182 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3181 3183 else
3182 3184 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3183 3185 if (mflag & MS_NOMNTTAB)
3184 3186 newflag |= VFS_NOMNTTAB;
3185 3187 else
3186 3188 newflag &= ~VFS_NOMNTTAB;
3187 3189
3188 3190 if (coveredvp != NULL) {
3189 3191 ASSERT(vn_vfswlock_held(coveredvp));
3190 3192 coveredvp->v_vfsmountedhere = vfsp;
3191 3193 VN_HOLD(coveredvp);
3192 3194 }
3193 3195 vfsp->vfs_vnodecovered = coveredvp;
3194 3196 vfsp->vfs_flag = newflag;
3195 3197
3196 3198 vfs_list_add(vfsp);
3197 3199 }
3198 3200
3199 3201 /*
3200 3202 * Remove a vfs from the vfs list, null out the pointer from the
3201 3203 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3202 3204 * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3203 3205 * reference to the vfs and to the covered vnode.
3204 3206 *
3205 3207 * Called from dounmount after it's confirmed with the file system
3206 3208 * that the unmount is legal.
3207 3209 */
3208 3210 void
3209 3211 vfs_remove(struct vfs *vfsp)
3210 3212 {
3211 3213 vnode_t *vp;
3212 3214
3213 3215 ASSERT(vfs_lock_held(vfsp));
3214 3216
3215 3217 /*
3216 3218 * Can't unmount root. Should never happen because fs will
3217 3219 * be busy.
3218 3220 */
3219 3221 if (vfsp == rootvfs)
3220 3222 panic("vfs_remove: unmounting root");
3221 3223
3222 3224 vfs_list_remove(vfsp);
3223 3225
3224 3226 /*
3225 3227 * Unhook from the file system name space.
3226 3228 */
3227 3229 vp = vfsp->vfs_vnodecovered;
3228 3230 ASSERT(vn_vfswlock_held(vp));
3229 3231 vp->v_vfsmountedhere = NULL;
3230 3232 vfsp->vfs_vnodecovered = NULL;
3231 3233 VN_RELE(vp);
3232 3234
3233 3235 /*
3234 3236 * Release lock and wakeup anybody waiting.
3235 3237 */
3236 3238 vfs_unlock(vfsp);
3237 3239 VFS_RELE(vfsp);
3238 3240 }
3239 3241
3240 3242 /*
3241 3243 * Lock a filesystem to prevent access to it while mounting,
3242 3244 * unmounting and syncing. Return EBUSY immediately if lock
3243 3245 * can't be acquired.
3244 3246 */
3245 3247 int
3246 3248 vfs_lock(vfs_t *vfsp)
3247 3249 {
3248 3250 vn_vfslocks_entry_t *vpvfsentry;
3249 3251
3250 3252 vpvfsentry = vn_vfslocks_getlock(vfsp);
3251 3253 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3252 3254 return (0);
3253 3255
3254 3256 vn_vfslocks_rele(vpvfsentry);
3255 3257 return (EBUSY);
3256 3258 }
3257 3259
3258 3260 int
3259 3261 vfs_rlock(vfs_t *vfsp)
3260 3262 {
3261 3263 vn_vfslocks_entry_t *vpvfsentry;
3262 3264
3263 3265 vpvfsentry = vn_vfslocks_getlock(vfsp);
3264 3266
3265 3267 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3266 3268 return (0);
3267 3269
3268 3270 vn_vfslocks_rele(vpvfsentry);
3269 3271 return (EBUSY);
3270 3272 }
3271 3273
3272 3274 void
3273 3275 vfs_lock_wait(vfs_t *vfsp)
3274 3276 {
3275 3277 vn_vfslocks_entry_t *vpvfsentry;
3276 3278
3277 3279 vpvfsentry = vn_vfslocks_getlock(vfsp);
3278 3280 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3279 3281 }
3280 3282
3281 3283 void
3282 3284 vfs_rlock_wait(vfs_t *vfsp)
3283 3285 {
3284 3286 vn_vfslocks_entry_t *vpvfsentry;
3285 3287
3286 3288 vpvfsentry = vn_vfslocks_getlock(vfsp);
3287 3289 rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3288 3290 }
3289 3291
3290 3292 /*
3291 3293 * Unlock a locked filesystem.
3292 3294 */
3293 3295 void
3294 3296 vfs_unlock(vfs_t *vfsp)
3295 3297 {
3296 3298 vn_vfslocks_entry_t *vpvfsentry;
3297 3299
3298 3300 /*
3299 3301 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3300 3302 * And these changes should remain for the patch changes as it is.
3301 3303 */
3302 3304 if (panicstr)
3303 3305 return;
3304 3306
3305 3307 /*
3306 3308 * ve_refcount needs to be dropped twice here.
3307 3309 * 1. To release refernce after a call to vfs_locks_getlock()
3308 3310 * 2. To release the reference from the locking routines like
3309 3311 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3310 3312 */
3311 3313
3312 3314 vpvfsentry = vn_vfslocks_getlock(vfsp);
3313 3315 vn_vfslocks_rele(vpvfsentry);
3314 3316
3315 3317 rwst_exit(&vpvfsentry->ve_lock);
3316 3318 vn_vfslocks_rele(vpvfsentry);
3317 3319 }
3318 3320
3319 3321 /*
3320 3322 * Utility routine that allows a filesystem to construct its
3321 3323 * fsid in "the usual way" - by munging some underlying dev_t and
3322 3324 * the filesystem type number into the 64-bit fsid. Note that
3323 3325 * this implicitly relies on dev_t persistence to make filesystem
3324 3326 * id's persistent.
3325 3327 *
3326 3328 * There's nothing to prevent an individual fs from constructing its
3327 3329 * fsid in a different way, and indeed they should.
3328 3330 *
3329 3331 * Since we want fsids to be 32-bit quantities (so that they can be
3330 3332 * exported identically by either 32-bit or 64-bit APIs, as well as
3331 3333 * the fact that fsid's are "known" to NFS), we compress the device
3332 3334 * number given down to 32-bits, and panic if that isn't possible.
3333 3335 */
3334 3336 void
3335 3337 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3336 3338 {
3337 3339 if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3338 3340 panic("device number too big for fsid!");
3339 3341 fsi->val[1] = val;
3340 3342 }
3341 3343
3342 3344 int
3343 3345 vfs_lock_held(vfs_t *vfsp)
3344 3346 {
3345 3347 int held;
3346 3348 vn_vfslocks_entry_t *vpvfsentry;
3347 3349
3348 3350 /*
3349 3351 * vfs_lock_held will mimic sema_held behaviour
3350 3352 * if panicstr is set. And these changes should remain
3351 3353 * for the patch changes as it is.
3352 3354 */
3353 3355 if (panicstr)
3354 3356 return (1);
3355 3357
3356 3358 vpvfsentry = vn_vfslocks_getlock(vfsp);
3357 3359 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3358 3360
3359 3361 vn_vfslocks_rele(vpvfsentry);
3360 3362 return (held);
3361 3363 }
3362 3364
3363 3365 struct _kthread *
3364 3366 vfs_lock_owner(vfs_t *vfsp)
3365 3367 {
3366 3368 struct _kthread *owner;
3367 3369 vn_vfslocks_entry_t *vpvfsentry;
3368 3370
3369 3371 /*
3370 3372 * vfs_wlock_held will mimic sema_held behaviour
3371 3373 * if panicstr is set. And these changes should remain
3372 3374 * for the patch changes as it is.
3373 3375 */
3374 3376 if (panicstr)
3375 3377 return (NULL);
3376 3378
3377 3379 vpvfsentry = vn_vfslocks_getlock(vfsp);
3378 3380 owner = rwst_owner(&vpvfsentry->ve_lock);
3379 3381
3380 3382 vn_vfslocks_rele(vpvfsentry);
3381 3383 return (owner);
3382 3384 }
3383 3385
3384 3386 /*
3385 3387 * vfs list locking.
3386 3388 *
3387 3389 * Rather than manipulate the vfslist lock directly, we abstract into lock
3388 3390 * and unlock routines to allow the locking implementation to be changed for
3389 3391 * clustering.
3390 3392 *
3391 3393 * Whenever the vfs list is modified through its hash links, the overall list
3392 3394 * lock must be obtained before locking the relevant hash bucket. But to see
3393 3395 * whether a given vfs is on the list, it suffices to obtain the lock for the
3394 3396 * hash bucket without getting the overall list lock. (See getvfs() below.)
3395 3397 */
3396 3398
3397 3399 void
3398 3400 vfs_list_lock()
3399 3401 {
3400 3402 rw_enter(&vfslist, RW_WRITER);
3401 3403 }
3402 3404
3403 3405 void
3404 3406 vfs_list_read_lock()
3405 3407 {
3406 3408 rw_enter(&vfslist, RW_READER);
3407 3409 }
3408 3410
3409 3411 void
3410 3412 vfs_list_unlock()
3411 3413 {
3412 3414 rw_exit(&vfslist);
3413 3415 }
3414 3416
3415 3417 /*
3416 3418 * Low level worker routines for adding entries to and removing entries from
3417 3419 * the vfs list.
3418 3420 */
3419 3421
3420 3422 static void
3421 3423 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3422 3424 {
3423 3425 int vhno;
3424 3426 struct vfs **hp;
3425 3427 dev_t dev;
3426 3428
3427 3429 ASSERT(RW_WRITE_HELD(&vfslist));
3428 3430
3429 3431 dev = expldev(vfsp->vfs_fsid.val[0]);
3430 3432 vhno = VFSHASH(getmajor(dev), getminor(dev));
3431 3433
3432 3434 mutex_enter(&rvfs_list[vhno].rvfs_lock);
3433 3435
3434 3436 /*
3435 3437 * Link into the hash table, inserting it at the end, so that LOFS
3436 3438 * with the same fsid as UFS (or other) file systems will not hide the
3437 3439 * UFS.
3438 3440 */
3439 3441 if (insert_at_head) {
3440 3442 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3441 3443 rvfs_list[vhno].rvfs_head = vfsp;
3442 3444 } else {
3443 3445 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3444 3446 hp = &(*hp)->vfs_hash)
3445 3447 continue;
3446 3448 /*
3447 3449 * hp now contains the address of the pointer to update
3448 3450 * to effect the insertion.
3449 3451 */
3450 3452 vfsp->vfs_hash = NULL;
3451 3453 *hp = vfsp;
3452 3454 }
3453 3455
3454 3456 rvfs_list[vhno].rvfs_len++;
3455 3457 mutex_exit(&rvfs_list[vhno].rvfs_lock);
3456 3458 }
3457 3459
3458 3460
3459 3461 static void
3460 3462 vfs_hash_remove(struct vfs *vfsp)
3461 3463 {
3462 3464 int vhno;
3463 3465 struct vfs *tvfsp;
3464 3466 dev_t dev;
3465 3467
3466 3468 ASSERT(RW_WRITE_HELD(&vfslist));
3467 3469
3468 3470 dev = expldev(vfsp->vfs_fsid.val[0]);
3469 3471 vhno = VFSHASH(getmajor(dev), getminor(dev));
3470 3472
3471 3473 mutex_enter(&rvfs_list[vhno].rvfs_lock);
3472 3474
3473 3475 /*
3474 3476 * Remove from hash.
3475 3477 */
3476 3478 if (rvfs_list[vhno].rvfs_head == vfsp) {
3477 3479 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3478 3480 rvfs_list[vhno].rvfs_len--;
3479 3481 goto foundit;
3480 3482 }
3481 3483 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3482 3484 tvfsp = tvfsp->vfs_hash) {
3483 3485 if (tvfsp->vfs_hash == vfsp) {
3484 3486 tvfsp->vfs_hash = vfsp->vfs_hash;
3485 3487 rvfs_list[vhno].rvfs_len--;
3486 3488 goto foundit;
3487 3489 }
3488 3490 }
3489 3491 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3490 3492
3491 3493 foundit:
3492 3494
3493 3495 mutex_exit(&rvfs_list[vhno].rvfs_lock);
3494 3496 }
3495 3497
3496 3498
3497 3499 void
3498 3500 vfs_list_add(struct vfs *vfsp)
3499 3501 {
3500 3502 zone_t *zone;
3501 3503
3502 3504 /*
3503 3505 * Typically, the vfs_t will have been created on behalf of the file
3504 3506 * system in vfs_init, where it will have been provided with a
3505 3507 * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3506 3508 * by an unbundled file system. We therefore check for such an example
3507 3509 * before stamping the vfs_t with its creation time for the benefit of
3508 3510 * mntfs.
3509 3511 */
3510 3512 if (vfsp->vfs_implp == NULL)
3511 3513 vfsimpl_setup(vfsp);
3512 3514 vfs_mono_time(&vfsp->vfs_hrctime);
3513 3515
3514 3516 /*
3515 3517 * The zone that owns the mount is the one that performed the mount.
3516 3518 * Note that this isn't necessarily the same as the zone mounted into.
3517 3519 * The corresponding zone_rele_ref() will be done when the vfs_t
3518 3520 * is being free'd.
3519 3521 */
3520 3522 vfsp->vfs_zone = curproc->p_zone;
3521 3523 zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3522 3524 zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3523 3525 ZONE_REF_VFS);
3524 3526
3525 3527 /*
3526 3528 * Find the zone mounted into, and put this mount on its vfs list.
3527 3529 */
3528 3530 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3529 3531 ASSERT(zone != NULL);
3530 3532 /*
3531 3533 * Special casing for the root vfs. This structure is allocated
3532 3534 * statically and hooked onto rootvfs at link time. During the
3533 3535 * vfs_mountroot call at system startup time, the root file system's
3534 3536 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3535 3537 * as argument. The code below must detect and handle this special
3536 3538 * case. The only apparent justification for this special casing is
3537 3539 * to ensure that the root file system appears at the head of the
3538 3540 * list.
3539 3541 *
3540 3542 * XXX: I'm assuming that it's ok to do normal list locking when
3541 3543 * adding the entry for the root file system (this used to be
3542 3544 * done with no locks held).
3543 3545 */
3544 3546 vfs_list_lock();
3545 3547 /*
3546 3548 * Link into the vfs list proper.
3547 3549 */
3548 3550 if (vfsp == &root) {
3549 3551 /*
3550 3552 * Assert: This vfs is already on the list as its first entry.
3551 3553 * Thus, there's nothing to do.
3552 3554 */
3553 3555 ASSERT(rootvfs == vfsp);
3554 3556 /*
3555 3557 * Add it to the head of the global zone's vfslist.
3556 3558 */
3557 3559 ASSERT(zone == global_zone);
3558 3560 ASSERT(zone->zone_vfslist == NULL);
3559 3561 zone->zone_vfslist = vfsp;
3560 3562 } else {
3561 3563 /*
3562 3564 * Link to end of list using vfs_prev (as rootvfs is now a
3563 3565 * doubly linked circular list) so list is in mount order for
3564 3566 * mnttab use.
3565 3567 */
3566 3568 rootvfs->vfs_prev->vfs_next = vfsp;
3567 3569 vfsp->vfs_prev = rootvfs->vfs_prev;
3568 3570 rootvfs->vfs_prev = vfsp;
3569 3571 vfsp->vfs_next = rootvfs;
3570 3572
3571 3573 /*
3572 3574 * Do it again for the zone-private list (which may be NULL).
3573 3575 */
3574 3576 if (zone->zone_vfslist == NULL) {
3575 3577 ASSERT(zone != global_zone);
3576 3578 zone->zone_vfslist = vfsp;
3577 3579 } else {
3578 3580 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3579 3581 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3580 3582 zone->zone_vfslist->vfs_zone_prev = vfsp;
3581 3583 vfsp->vfs_zone_next = zone->zone_vfslist;
3582 3584 }
3583 3585 }
3584 3586
3585 3587 /*
3586 3588 * Link into the hash table, inserting it at the end, so that LOFS
3587 3589 * with the same fsid as UFS (or other) file systems will not hide
3588 3590 * the UFS.
3589 3591 */
3590 3592 vfs_hash_add(vfsp, 0);
3591 3593
3592 3594 /*
3593 3595 * update the mnttab modification time
3594 3596 */
3595 3597 vfs_mnttab_modtimeupd();
3596 3598 vfs_list_unlock();
3597 3599 zone_rele(zone);
3598 3600 }
3599 3601
3600 3602 void
3601 3603 vfs_list_remove(struct vfs *vfsp)
3602 3604 {
3603 3605 zone_t *zone;
3604 3606
3605 3607 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3606 3608 ASSERT(zone != NULL);
3607 3609 /*
3608 3610 * Callers are responsible for preventing attempts to unmount the
3609 3611 * root.
3610 3612 */
3611 3613 ASSERT(vfsp != rootvfs);
3612 3614
3613 3615 vfs_list_lock();
3614 3616
3615 3617 /*
3616 3618 * Remove from hash.
3617 3619 */
3618 3620 vfs_hash_remove(vfsp);
3619 3621
3620 3622 /*
3621 3623 * Remove from vfs list.
3622 3624 */
3623 3625 vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3624 3626 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3625 3627 vfsp->vfs_next = vfsp->vfs_prev = NULL;
3626 3628
3627 3629 /*
3628 3630 * Remove from zone-specific vfs list.
3629 3631 */
3630 3632 if (zone->zone_vfslist == vfsp)
3631 3633 zone->zone_vfslist = vfsp->vfs_zone_next;
3632 3634
3633 3635 if (vfsp->vfs_zone_next == vfsp) {
3634 3636 ASSERT(vfsp->vfs_zone_prev == vfsp);
3635 3637 ASSERT(zone->zone_vfslist == vfsp);
3636 3638 zone->zone_vfslist = NULL;
3637 3639 }
3638 3640
3639 3641 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3640 3642 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3641 3643 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3642 3644
3643 3645 /*
3644 3646 * update the mnttab modification time
3645 3647 */
3646 3648 vfs_mnttab_modtimeupd();
3647 3649 vfs_list_unlock();
3648 3650 zone_rele(zone);
3649 3651 }
3650 3652
3651 3653 struct vfs *
3652 3654 getvfs(fsid_t *fsid)
3653 3655 {
3654 3656 struct vfs *vfsp;
3655 3657 int val0 = fsid->val[0];
3656 3658 int val1 = fsid->val[1];
3657 3659 dev_t dev = expldev(val0);
3658 3660 int vhno = VFSHASH(getmajor(dev), getminor(dev));
3659 3661 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3660 3662
3661 3663 mutex_enter(hmp);
3662 3664 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3663 3665 if (vfsp->vfs_fsid.val[0] == val0 &&
3664 3666 vfsp->vfs_fsid.val[1] == val1) {
3665 3667 VFS_HOLD(vfsp);
3666 3668 mutex_exit(hmp);
3667 3669 return (vfsp);
3668 3670 }
3669 3671 }
3670 3672 mutex_exit(hmp);
3671 3673 return (NULL);
3672 3674 }
3673 3675
3674 3676 /*
3675 3677 * Search the vfs mount in progress list for a specified device/vfs entry.
3676 3678 * Returns 0 if the first entry in the list that the device matches has the
3677 3679 * given vfs pointer as well. If the device matches but a different vfs
3678 3680 * pointer is encountered in the list before the given vfs pointer then
3679 3681 * a 1 is returned.
3680 3682 */
3681 3683
3682 3684 int
3683 3685 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3684 3686 {
3685 3687 int retval = 0;
3686 3688 struct ipmnt *mipp;
3687 3689
3688 3690 mutex_enter(&vfs_miplist_mutex);
3689 3691 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3690 3692 if (mipp->mip_dev == dev) {
3691 3693 if (mipp->mip_vfsp != vfsp)
3692 3694 retval = 1;
3693 3695 break;
3694 3696 }
3695 3697 }
3696 3698 mutex_exit(&vfs_miplist_mutex);
3697 3699 return (retval);
3698 3700 }
3699 3701
3700 3702 /*
3701 3703 * Search the vfs list for a specified device. Returns 1, if entry is found
3702 3704 * or 0 if no suitable entry is found.
3703 3705 */
3704 3706
3705 3707 int
3706 3708 vfs_devismounted(dev_t dev)
3707 3709 {
3708 3710 struct vfs *vfsp;
3709 3711 int found;
3710 3712
3711 3713 vfs_list_read_lock();
3712 3714 vfsp = rootvfs;
3713 3715 found = 0;
3714 3716 do {
3715 3717 if (vfsp->vfs_dev == dev) {
3716 3718 found = 1;
3717 3719 break;
3718 3720 }
3719 3721 vfsp = vfsp->vfs_next;
3720 3722 } while (vfsp != rootvfs);
3721 3723
3722 3724 vfs_list_unlock();
3723 3725 return (found);
3724 3726 }
3725 3727
3726 3728 /*
3727 3729 * Search the vfs list for a specified device. Returns a pointer to it
3728 3730 * or NULL if no suitable entry is found. The caller of this routine
3729 3731 * is responsible for releasing the returned vfs pointer.
3730 3732 */
3731 3733 struct vfs *
3732 3734 vfs_dev2vfsp(dev_t dev)
3733 3735 {
3734 3736 struct vfs *vfsp;
3735 3737 int found;
3736 3738
3737 3739 vfs_list_read_lock();
3738 3740 vfsp = rootvfs;
3739 3741 found = 0;
3740 3742 do {
3741 3743 /*
3742 3744 * The following could be made more efficient by making
3743 3745 * the entire loop use vfs_zone_next if the call is from
3744 3746 * a zone. The only callers, however, ustat(2) and
3745 3747 * umount2(2), don't seem to justify the added
3746 3748 * complexity at present.
3747 3749 */
3748 3750 if (vfsp->vfs_dev == dev &&
3749 3751 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3750 3752 curproc->p_zone)) {
3751 3753 VFS_HOLD(vfsp);
3752 3754 found = 1;
3753 3755 break;
3754 3756 }
3755 3757 vfsp = vfsp->vfs_next;
3756 3758 } while (vfsp != rootvfs);
3757 3759 vfs_list_unlock();
3758 3760 return (found ? vfsp: NULL);
3759 3761 }
3760 3762
3761 3763 /*
3762 3764 * Search the vfs list for a specified mntpoint. Returns a pointer to it
3763 3765 * or NULL if no suitable entry is found. The caller of this routine
3764 3766 * is responsible for releasing the returned vfs pointer.
3765 3767 *
3766 3768 * Note that if multiple mntpoints match, the last one matching is
3767 3769 * returned in an attempt to return the "top" mount when overlay
3768 3770 * mounts are covering the same mount point. This is accomplished by starting
3769 3771 * at the end of the list and working our way backwards, stopping at the first
3770 3772 * matching mount.
3771 3773 */
3772 3774 struct vfs *
3773 3775 vfs_mntpoint2vfsp(const char *mp)
3774 3776 {
3775 3777 struct vfs *vfsp;
3776 3778 struct vfs *retvfsp = NULL;
3777 3779 zone_t *zone = curproc->p_zone;
3778 3780 struct vfs *list;
3779 3781
3780 3782 vfs_list_read_lock();
3781 3783 if (getzoneid() == GLOBAL_ZONEID) {
3782 3784 /*
3783 3785 * The global zone may see filesystems in any zone.
3784 3786 */
3785 3787 vfsp = rootvfs->vfs_prev;
3786 3788 do {
3787 3789 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3788 3790 retvfsp = vfsp;
3789 3791 break;
3790 3792 }
3791 3793 vfsp = vfsp->vfs_prev;
3792 3794 } while (vfsp != rootvfs->vfs_prev);
3793 3795 } else if ((list = zone->zone_vfslist) != NULL) {
3794 3796 const char *mntpt;
3795 3797
3796 3798 vfsp = list->vfs_zone_prev;
3797 3799 do {
3798 3800 mntpt = refstr_value(vfsp->vfs_mntpt);
3799 3801 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3800 3802 if (strcmp(mntpt, mp) == 0) {
3801 3803 retvfsp = vfsp;
3802 3804 break;
3803 3805 }
3804 3806 vfsp = vfsp->vfs_zone_prev;
3805 3807 } while (vfsp != list->vfs_zone_prev);
3806 3808 }
3807 3809 if (retvfsp)
3808 3810 VFS_HOLD(retvfsp);
3809 3811 vfs_list_unlock();
3810 3812 return (retvfsp);
3811 3813 }
3812 3814
3813 3815 /*
3814 3816 * Search the vfs list for a specified vfsops.
3815 3817 * if vfs entry is found then return 1, else 0.
3816 3818 */
3817 3819 int
3818 3820 vfs_opsinuse(vfsops_t *ops)
3819 3821 {
3820 3822 struct vfs *vfsp;
3821 3823 int found;
3822 3824
3823 3825 vfs_list_read_lock();
3824 3826 vfsp = rootvfs;
3825 3827 found = 0;
3826 3828 do {
3827 3829 if (vfs_getops(vfsp) == ops) {
3828 3830 found = 1;
3829 3831 break;
3830 3832 }
3831 3833 vfsp = vfsp->vfs_next;
3832 3834 } while (vfsp != rootvfs);
3833 3835 vfs_list_unlock();
3834 3836 return (found);
3835 3837 }
3836 3838
3837 3839 /*
3838 3840 * Allocate an entry in vfssw for a file system type
3839 3841 */
3840 3842 struct vfssw *
3841 3843 allocate_vfssw(const char *type)
3842 3844 {
3843 3845 struct vfssw *vswp;
3844 3846
3845 3847 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3846 3848 /*
3847 3849 * The vfssw table uses the empty string to identify an
3848 3850 * available entry; we cannot add any type which has
3849 3851 * a leading NUL. The string length is limited to
3850 3852 * the size of the st_fstype array in struct stat.
3851 3853 */
3852 3854 return (NULL);
3853 3855 }
3854 3856
3855 3857 ASSERT(VFSSW_WRITE_LOCKED());
3856 3858 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3857 3859 if (!ALLOCATED_VFSSW(vswp)) {
3858 3860 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3859 3861 (void) strcpy(vswp->vsw_name, type);
3860 3862 ASSERT(vswp->vsw_count == 0);
3861 3863 vswp->vsw_count = 1;
3862 3864 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3863 3865 return (vswp);
3864 3866 }
3865 3867 return (NULL);
3866 3868 }
3867 3869
/*
 * Impose an additional layer of translation between vfstype names and
 * module names in the filesystem:
 *
 *	"proc"			-> "procfs"
 *	"fd"			-> "fdfs"
 *	anything starting "nfs"	-> "nfs"
 *
 * Any other name is returned unchanged (the same pointer that was
 * passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3885 3887
3886 3888 /*
3887 3889 * Find a vfssw entry given a file system type name.
3888 3890 * Try to autoload the filesystem if it's not found.
3889 3891 * If it's installed, return the vfssw locked to prevent unloading.
3890 3892 */
3891 3893 struct vfssw *
3892 3894 vfs_getvfssw(const char *type)
3893 3895 {
3894 3896 struct vfssw *vswp;
3895 3897 const char *modname;
3896 3898
3897 3899 RLOCK_VFSSW();
3898 3900 vswp = vfs_getvfsswbyname(type);
3899 3901 modname = vfs_to_modname(type);
3900 3902
3901 3903 if (rootdir == NULL) {
3902 3904 /*
3903 3905 * If we haven't yet loaded the root file system, then our
3904 3906 * _init won't be called until later. Allocate vfssw entry,
3905 3907 * because mod_installfs won't be called.
3906 3908 */
3907 3909 if (vswp == NULL) {
3908 3910 RUNLOCK_VFSSW();
3909 3911 WLOCK_VFSSW();
3910 3912 if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3911 3913 if ((vswp = allocate_vfssw(type)) == NULL) {
3912 3914 WUNLOCK_VFSSW();
3913 3915 return (NULL);
3914 3916 }
3915 3917 }
3916 3918 WUNLOCK_VFSSW();
3917 3919 RLOCK_VFSSW();
3918 3920 }
3919 3921 if (!VFS_INSTALLED(vswp)) {
3920 3922 RUNLOCK_VFSSW();
3921 3923 (void) modloadonly("fs", modname);
3922 3924 } else
3923 3925 RUNLOCK_VFSSW();
3924 3926 return (vswp);
3925 3927 }
3926 3928
3927 3929 /*
3928 3930 * Try to load the filesystem. Before calling modload(), we drop
3929 3931 * our lock on the VFS switch table, and pick it up after the
3930 3932 * module is loaded. However, there is a potential race: the
3931 3933 * module could be unloaded after the call to modload() completes
3932 3934 * but before we pick up the lock and drive on. Therefore,
3933 3935 * we keep reloading the module until we've loaded the module
3934 3936 * _and_ we have the lock on the VFS switch table.
3935 3937 */
3936 3938 while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3937 3939 RUNLOCK_VFSSW();
3938 3940 if (modload("fs", modname) == -1)
3939 3941 return (NULL);
3940 3942 RLOCK_VFSSW();
3941 3943 if (vswp == NULL)
3942 3944 if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3943 3945 break;
3944 3946 }
3945 3947 RUNLOCK_VFSSW();
3946 3948
3947 3949 return (vswp);
3948 3950 }
3949 3951
3950 3952 /*
3951 3953 * Find a vfssw entry given a file system type name.
3952 3954 */
3953 3955 struct vfssw *
3954 3956 vfs_getvfsswbyname(const char *type)
3955 3957 {
3956 3958 struct vfssw *vswp;
3957 3959
3958 3960 ASSERT(VFSSW_LOCKED());
3959 3961 if (type == NULL || *type == '\0')
3960 3962 return (NULL);
3961 3963
3962 3964 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3963 3965 if (strcmp(type, vswp->vsw_name) == 0) {
3964 3966 vfs_refvfssw(vswp);
3965 3967 return (vswp);
3966 3968 }
3967 3969 }
3968 3970
3969 3971 return (NULL);
3970 3972 }
3971 3973
3972 3974 /*
3973 3975 * Find a vfssw entry given a set of vfsops.
3974 3976 */
3975 3977 struct vfssw *
3976 3978 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3977 3979 {
3978 3980 struct vfssw *vswp;
3979 3981
3980 3982 RLOCK_VFSSW();
3981 3983 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3982 3984 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3983 3985 vfs_refvfssw(vswp);
3984 3986 RUNLOCK_VFSSW();
3985 3987 return (vswp);
3986 3988 }
3987 3989 }
3988 3990 RUNLOCK_VFSSW();
3989 3991
3990 3992 return (NULL);
3991 3993 }
3992 3994
3993 3995 /*
3994 3996 * Reference a vfssw entry.
3995 3997 */
3996 3998 void
3997 3999 vfs_refvfssw(struct vfssw *vswp)
3998 4000 {
3999 4001
4000 4002 mutex_enter(&vswp->vsw_lock);
4001 4003 vswp->vsw_count++;
4002 4004 mutex_exit(&vswp->vsw_lock);
4003 4005 }
4004 4006
4005 4007 /*
4006 4008 * Unreference a vfssw entry.
4007 4009 */
4008 4010 void
4009 4011 vfs_unrefvfssw(struct vfssw *vswp)
4010 4012 {
4011 4013
4012 4014 mutex_enter(&vswp->vsw_lock);
4013 4015 vswp->vsw_count--;
4014 4016 mutex_exit(&vswp->vsw_lock);
4015 4017 }
4016 4018
4017 4019 static int sync_retries = 20; /* number of retries when not making progress */
4018 4020 static int sync_triesleft; /* portion of sync_retries remaining */
4019 4021
4020 4022 static pgcnt_t old_pgcnt, new_pgcnt;
4021 4023 static int new_bufcnt, old_bufcnt;
4022 4024
4023 4025 /*
4024 4026 * Sync all of the mounted filesystems, and then wait for the actual i/o to
4025 4027 * complete. We wait by counting the number of dirty pages and buffers,
4026 4028 * pushing them out using bio_busy() and page_busy(), and then counting again.
4027 4029 * This routine is used during the uadmin A_SHUTDOWN code. It should only
4028 4030 * be used after some higher-level mechanism has quiesced the system so that
4029 4031 * new writes are not being initiated while we are waiting for completion.
4030 4032 *
4031 4033 * To ensure finite running time, our algorithm uses sync_triesleft (a progress
4032 4034 * counter used by the vfs_syncall() loop below). It is declared above so
4033 4035 * it can be found easily in the debugger.
4034 4036 *
4035 4037 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make
4036 4038 * sync_retries consecutive calls to bio_busy() and page_busy() without
4037 4039 * decreasing either the number of dirty buffers or dirty pages below the
4038 4040 * lowest count we have seen so far, we give up and return from vfs_syncall().
4039 4041 *
4040 4042 * Each loop iteration ends with a call to delay() one second to allow time for
4041 4043 * i/o completion and to permit the user time to read our progress messages.
4042 4044 */
4043 4045 void
4044 4046 vfs_syncall(void)
4045 4047 {
4046 4048 if (rootdir == NULL && !modrootloaded)
4047 4049 return; /* no filesystems have been loaded yet */
4048 4050
4049 4051 printf("syncing file systems...");
4050 4052 sync();
4051 4053
4052 4054 sync_triesleft = sync_retries;
4053 4055
4054 4056 old_bufcnt = new_bufcnt = INT_MAX;
4055 4057 old_pgcnt = new_pgcnt = ULONG_MAX;
4056 4058
4057 4059 while (sync_triesleft > 0) {
4058 4060 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4059 4061 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4060 4062
4061 4063 new_bufcnt = bio_busy(B_TRUE);
4062 4064 new_pgcnt = page_busy(B_TRUE);
4063 4065
4064 4066 if (new_bufcnt == 0 && new_pgcnt == 0)
4065 4067 break;
4066 4068
4067 4069 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4068 4070 sync_triesleft = sync_retries;
4069 4071 else
4070 4072 sync_triesleft--;
4071 4073
4072 4074 if (new_bufcnt)
4073 4075 printf(" [%d]", new_bufcnt);
4074 4076 if (new_pgcnt)
4075 4077 printf(" %lu", new_pgcnt);
4076 4078
4077 4079 delay(hz);
4078 4080 }
4079 4081
4080 4082 if (new_bufcnt != 0 || new_pgcnt != 0)
4081 4083 printf(" done (not all i/o completed)\n");
4082 4084 else
4083 4085 printf(" done\n");
4084 4086
4085 4087 delay(hz);
4086 4088 }
4087 4089
4088 4090 /*
4089 4091 * Map VFS flags to statvfs flags. These shouldn't really be separate
4090 4092 * flags at all.
4091 4093 */
4092 4094 uint_t
4093 4095 vf_to_stf(uint_t vf)
4094 4096 {
4095 4097 uint_t stf = 0;
4096 4098
4097 4099 if (vf & VFS_RDONLY)
4098 4100 stf |= ST_RDONLY;
4099 4101 if (vf & VFS_NOSETUID)
4100 4102 stf |= ST_NOSUID;
4101 4103 if (vf & VFS_NOTRUNC)
4102 4104 stf |= ST_NOTRUNC;
4103 4105
4104 4106 return (stf);
4105 4107 }
4106 4108
4107 4109 /*
4108 4110 * Entries for (illegal) fstype 0.
4109 4111 */
4110 4112 /* ARGSUSED */
4111 4113 int
4112 4114 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4113 4115 {
4114 4116 cmn_err(CE_PANIC, "stray vfs operation");
4115 4117 return (0);
4116 4118 }
4117 4119
4118 4120 /*
4119 4121 * Entries for (illegal) fstype 0.
4120 4122 */
4121 4123 int
4122 4124 vfsstray(void)
4123 4125 {
4124 4126 cmn_err(CE_PANIC, "stray vfs operation");
4125 4127 return (0);
4126 4128 }
4127 4129
/*
 * Operation that unconditionally fails with EIO.
 *
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS.  Could be used by any filesystem.
 * See bug 1203132.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
4138 4140
/*
 * Sync flavor of vfs_EIO(): ignores its arguments and fails with EIO.
 *
 * We've gotta define the op for sync separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when
 * a "short" argument is present and spits out a warning.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
4150 4152
4151 4153 vfs_t EIO_vfs;
4152 4154 vfsops_t *EIO_vfsops;
4153 4155
4154 4156 /*
4155 4157 * Called from startup() to initialize all loaded vfs's
4156 4158 */
4157 4159 void
4158 4160 vfsinit(void)
4159 4161 {
4160 4162 struct vfssw *vswp;
4161 4163 int error;
4162 4164 extern int vopstats_enabled;
4163 4165 extern void vopstats_startup();
4164 4166
4165 4167 static const fs_operation_def_t EIO_vfsops_template[] = {
4166 4168 VFSNAME_MOUNT, { .error = vfs_EIO },
4167 4169 VFSNAME_UNMOUNT, { .error = vfs_EIO },
4168 4170 VFSNAME_ROOT, { .error = vfs_EIO },
4169 4171 VFSNAME_STATVFS, { .error = vfs_EIO },
4170 4172 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync },
4171 4173 VFSNAME_VGET, { .error = vfs_EIO },
4172 4174 VFSNAME_MOUNTROOT, { .error = vfs_EIO },
4173 4175 VFSNAME_FREEVFS, { .error = vfs_EIO },
4174 4176 VFSNAME_VNSTATE, { .error = vfs_EIO },
4175 4177 NULL, NULL
4176 4178 };
4177 4179
4178 4180 static const fs_operation_def_t stray_vfsops_template[] = {
4179 4181 VFSNAME_MOUNT, { .error = vfsstray },
4180 4182 VFSNAME_UNMOUNT, { .error = vfsstray },
4181 4183 VFSNAME_ROOT, { .error = vfsstray },
4182 4184 VFSNAME_STATVFS, { .error = vfsstray },
4183 4185 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync },
4184 4186 VFSNAME_VGET, { .error = vfsstray },
4185 4187 VFSNAME_MOUNTROOT, { .error = vfsstray },
4186 4188 VFSNAME_FREEVFS, { .error = vfsstray },
4187 4189 VFSNAME_VNSTATE, { .error = vfsstray },
4188 4190 NULL, NULL
4189 4191 };
4190 4192
4191 4193 /* Create vfs cache */
4192 4194 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4193 4195 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4194 4196
4195 4197 /* Initialize the vnode cache (file systems may use it during init). */
4196 4198 vn_create_cache();
4197 4199
4198 4200 /* Setup event monitor framework */
4199 4201 fem_init();
4200 4202
4201 4203 /* Initialize the dummy stray file system type. */
4202 4204 error = vfs_setfsops(0, stray_vfsops_template, NULL);
4203 4205
4204 4206 /* Initialize the dummy EIO file system. */
4205 4207 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4206 4208 if (error != 0) {
4207 4209 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4208 4210 /* Shouldn't happen, but not bad enough to panic */
4209 4211 }
4210 4212
4211 4213 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4212 4214
4213 4215 /*
4214 4216 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4215 4217 * on this vfs can immediately notice it's invalid.
4216 4218 */
4217 4219 EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4218 4220
4219 4221 /*
4220 4222 * Call the init routines of non-loadable filesystems only.
4221 4223 * Filesystems which are loaded as separate modules will be
4222 4224 * initialized by the module loading code instead.
4223 4225 */
4224 4226
4225 4227 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4226 4228 RLOCK_VFSSW();
4227 4229 if (vswp->vsw_init != NULL)
4228 4230 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4229 4231 RUNLOCK_VFSSW();
4230 4232 }
4231 4233
4232 4234 vopstats_startup();
4233 4235
4234 4236 if (vopstats_enabled) {
4235 4237 /* EIO_vfs can collect stats, but we don't retrieve them */
4236 4238 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4237 4239 EIO_vfs.vfs_fstypevsp = NULL;
4238 4240 EIO_vfs.vfs_vskap = NULL;
4239 4241 EIO_vfs.vfs_flag |= VFS_STATS;
4240 4242 }
4241 4243
4242 4244 xattr_init();
4243 4245
4244 4246 reparse_point_init();
4245 4247 }
4246 4248
4247 4249 vfs_t *
4248 4250 vfs_alloc(int kmflag)
4249 4251 {
4250 4252 vfs_t *vfsp;
4251 4253
4252 4254 vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4253 4255
4254 4256 /*
4255 4257 * Do the simplest initialization here.
4256 4258 * Everything else gets done in vfs_init()
4257 4259 */
4258 4260 bzero(vfsp, sizeof (vfs_t));
4259 4261 return (vfsp);
4260 4262 }
4261 4263
4262 4264 void
4263 4265 vfs_free(vfs_t *vfsp)
4264 4266 {
4265 4267 /*
4266 4268 * One would be tempted to assert that "vfsp->vfs_count == 0".
4267 4269 * The problem is that this gets called out of domount() with
4268 4270 * a partially initialized vfs and a vfs_count of 1. This is
4269 4271 * also called from vfs_rele() with a vfs_count of 0. We can't
4270 4272 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4271 4273 * returned. This is because VFS_MOUNT() fully initializes the
4272 4274 * vfs structure and its associated data. VFS_RELE() will call
4273 4275 * VFS_FREEVFS() which may panic the system if the data structures
4274 4276 * aren't fully initialized from a successful VFS_MOUNT()).
4275 4277 */
4276 4278
4277 4279 /* If FEM was in use, make sure everything gets cleaned up */
4278 4280 if (vfsp->vfs_femhead) {
4279 4281 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4280 4282 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4281 4283 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4282 4284 vfsp->vfs_femhead = NULL;
4283 4285 }
4284 4286
4285 4287 if (vfsp->vfs_implp)
4286 4288 vfsimpl_teardown(vfsp);
4287 4289 sema_destroy(&vfsp->vfs_reflock);
4288 4290 kmem_cache_free(vfs_cache, vfsp);
4289 4291 }
4290 4292
4291 4293 /*
4292 4294 * Increments the vfs reference count by one atomically.
4293 4295 */
4294 4296 void
4295 4297 vfs_hold(vfs_t *vfsp)
4296 4298 {
4297 4299 atomic_inc_32(&vfsp->vfs_count);
4298 4300 ASSERT(vfsp->vfs_count != 0);
4299 4301 }
4300 4302
4301 4303 /*
4302 4304 * Decrements the vfs reference count by one atomically. When
4303 4305 * vfs reference count becomes zero, it calls the file system
4304 4306 * specific vfs_freevfs() to free up the resources.
4305 4307 */
4306 4308 void
4307 4309 vfs_rele(vfs_t *vfsp)
4308 4310 {
4309 4311 ASSERT(vfsp->vfs_count != 0);
4310 4312 if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4311 4313 VFS_FREEVFS(vfsp);
4312 4314 lofi_remove(vfsp);
4313 4315 if (vfsp->vfs_zone)
4314 4316 zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4315 4317 ZONE_REF_VFS);
4316 4318 vfs_freemnttab(vfsp);
4317 4319 vfs_free(vfsp);
4318 4320 }
4319 4321 }
4320 4322
4321 4323 /*
4322 4324 * Generic operations vector support.
4323 4325 *
4324 4326 * This is used to build operations vectors for both the vfs and vnode.
4325 4327 * It's normally called only when a file system is loaded.
4326 4328 *
4327 4329 * There are many possible algorithms for this, including the following:
4328 4330 *
4329 4331 * (1) scan the list of known operations; for each, see if the file system
4330 4332 * includes an entry for it, and fill it in as appropriate.
4331 4333 *
4332 4334 * (2) set up defaults for all known operations. scan the list of ops
4333 4335 * supplied by the file system; for each which is both supplied and
4334 4336 * known, fill it in.
4335 4337 *
4336 4338 * (3) sort the lists of known ops & supplied ops; scan the list, filling
4337 4339 * in entries as we go.
4338 4340 *
4339 4341 * we choose (1) for simplicity, and because performance isn't critical here.
4340 4342 * note that (2) could be sped up using a precomputed hash table on known ops.
4341 4343 * (3) could be faster than either, but only if the lists were very large or
4342 4344 * supplied in sorted order.
4343 4345 *
4344 4346 */
4345 4347
4346 4348 int
4347 4349 fs_build_vector(void *vector, int *unused_ops,
4348 4350 const fs_operation_trans_def_t *translation,
4349 4351 const fs_operation_def_t *operations)
4350 4352 {
4351 4353 int i, num_trans, num_ops, used;
4352 4354
4353 4355 /*
4354 4356 * Count the number of translations and the number of supplied
4355 4357 * operations.
4356 4358 */
4357 4359
4358 4360 {
4359 4361 const fs_operation_trans_def_t *p;
4360 4362
4361 4363 for (num_trans = 0, p = translation;
4362 4364 p->name != NULL;
4363 4365 num_trans++, p++)
4364 4366 ;
4365 4367 }
4366 4368
4367 4369 {
4368 4370 const fs_operation_def_t *p;
4369 4371
4370 4372 for (num_ops = 0, p = operations;
4371 4373 p->name != NULL;
4372 4374 num_ops++, p++)
4373 4375 ;
4374 4376 }
4375 4377
4376 4378 /* Walk through each operation known to our caller. There will be */
4377 4379 /* one entry in the supplied "translation table" for each. */
4378 4380
4379 4381 used = 0;
4380 4382
4381 4383 for (i = 0; i < num_trans; i++) {
4382 4384 int j, found;
4383 4385 char *curname;
4384 4386 fs_generic_func_p result;
4385 4387 fs_generic_func_p *location;
4386 4388
4387 4389 curname = translation[i].name;
4388 4390
4389 4391 /* Look for a matching operation in the list supplied by the */
4390 4392 /* file system. */
4391 4393
4392 4394 found = 0;
4393 4395
4394 4396 for (j = 0; j < num_ops; j++) {
4395 4397 if (strcmp(operations[j].name, curname) == 0) {
4396 4398 used++;
4397 4399 found = 1;
4398 4400 break;
4399 4401 }
4400 4402 }
4401 4403
4402 4404 /*
4403 4405 * If the file system is using a "placeholder" for default
4404 4406 * or error functions, grab the appropriate function out of
4405 4407 * the translation table. If the file system didn't supply
4406 4408 * this operation at all, use the default function.
4407 4409 */
4408 4410
4409 4411 if (found) {
4410 4412 result = operations[j].func.fs_generic;
4411 4413 if (result == fs_default) {
4412 4414 result = translation[i].defaultFunc;
4413 4415 } else if (result == fs_error) {
4414 4416 result = translation[i].errorFunc;
4415 4417 } else if (result == NULL) {
4416 4418 /* Null values are PROHIBITED */
4417 4419 return (EINVAL);
4418 4420 }
4419 4421 } else {
4420 4422 result = translation[i].defaultFunc;
4421 4423 }
4422 4424
4423 4425 /* Now store the function into the operations vector. */
4424 4426
4425 4427 location = (fs_generic_func_p *)
4426 4428 (((char *)vector) + translation[i].offset);
4427 4429
4428 4430 *location = result;
4429 4431 }
4430 4432
4431 4433 *unused_ops = num_ops - used;
4432 4434
4433 4435 return (0);
4434 4436 }
4435 4437
4436 4438 /* Placeholder functions, should never be called. */
4437 4439
4438 4440 int
4439 4441 fs_error(void)
4440 4442 {
4441 4443 cmn_err(CE_PANIC, "fs_error called");
4442 4444 return (0);
4443 4445 }
4444 4446
4445 4447 int
4446 4448 fs_default(void)
4447 4449 {
4448 4450 cmn_err(CE_PANIC, "fs_default called");
4449 4451 return (0);
4450 4452 }
4451 4453
4452 4454 #ifdef __sparc
4453 4455
4454 4456 /*
4455 4457 * Part of the implementation of booting off a mirrored root
4456 4458 * involves a change of dev_t for the root device. To
4457 4459 * accomplish this, first remove the existing hash table
4458 4460 * entry for the root device, convert to the new dev_t,
4459 4461 * then re-insert in the hash table at the head of the list.
4460 4462 */
4461 4463 void
4462 4464 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4463 4465 {
4464 4466 vfs_list_lock();
4465 4467
4466 4468 vfs_hash_remove(vfsp);
4467 4469
4468 4470 vfsp->vfs_dev = ndev;
4469 4471 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4470 4472
4471 4473 vfs_hash_add(vfsp, 1);
4472 4474
4473 4475 vfs_list_unlock();
4474 4476 }
4475 4477
4476 4478 #else /* x86 NEWBOOT */
4477 4479
4478 4480 #if defined(__x86)
4479 4481 extern int hvmboot_rootconf();
4480 4482 #endif /* __x86 */
4481 4483
4482 4484 extern ib_boot_prop_t *iscsiboot_prop;
4483 4485
4484 4486 int
4485 4487 rootconf()
4486 4488 {
4487 4489 int error;
4488 4490 struct vfssw *vsw;
4489 4491 extern void pm_init();
4490 4492 char *fstyp, *fsmod;
4491 4493 int ret = -1;
4492 4494
4493 4495 getrootfs(&fstyp, &fsmod);
4494 4496
4495 4497 #if defined(__x86)
4496 4498 /*
4497 4499 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4498 4500 * which lives in /platform/i86hvm, and hence is only available when
4499 4501 * booted in an x86 hvm environment. If the hvm_bootstrap misc module
4500 4502 * is not available then the modstub for this function will return 0.
4501 4503 * If the hvm_bootstrap misc module is available it will be loaded
4502 4504 * and hvmboot_rootconf() will be invoked.
4503 4505 */
4504 4506 if (error = hvmboot_rootconf())
4505 4507 return (error);
4506 4508 #endif /* __x86 */
4507 4509
4508 4510 if (error = clboot_rootconf())
4509 4511 return (error);
4510 4512
4511 4513 if (modload("fs", fsmod) == -1)
4512 4514 panic("Cannot _init %s module", fsmod);
4513 4515
4514 4516 RLOCK_VFSSW();
4515 4517 vsw = vfs_getvfsswbyname(fstyp);
4516 4518 RUNLOCK_VFSSW();
4517 4519 if (vsw == NULL) {
4518 4520 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4519 4521 return (ENXIO);
4520 4522 }
4521 4523 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4522 4524 VFS_HOLD(rootvfs);
4523 4525
4524 4526 /* always mount readonly first */
4525 4527 rootvfs->vfs_flag |= VFS_RDONLY;
4526 4528
4527 4529 pm_init();
4528 4530
4529 4531 if (netboot && iscsiboot_prop) {
4530 4532 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4531 4533 " shouldn't happen in the same time");
4532 4534 return (EINVAL);
4533 4535 }
4534 4536
4535 4537 if (netboot || iscsiboot_prop) {
4536 4538 ret = strplumb();
4537 4539 if (ret != 0) {
4538 4540 cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4539 4541 return (EFAULT);
4540 4542 }
4541 4543 }
4542 4544
4543 4545 if ((ret == 0) && iscsiboot_prop) {
4544 4546 ret = modload("drv", "iscsi");
4545 4547 /* -1 indicates fail */
4546 4548 if (ret == -1) {
4547 4549 cmn_err(CE_WARN, "Failed to load iscsi module");
4548 4550 iscsi_boot_prop_free();
4549 4551 return (EINVAL);
4550 4552 } else {
4551 4553 if (!i_ddi_attach_pseudo_node("iscsi")) {
4552 4554 cmn_err(CE_WARN,
4553 4555 "Failed to attach iscsi driver");
4554 4556 iscsi_boot_prop_free();
4555 4557 return (ENODEV);
4556 4558 }
4557 4559 }
4558 4560 }
4559 4561
4560 4562 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4561 4563 vfs_unrefvfssw(vsw);
4562 4564 rootdev = rootvfs->vfs_dev;
4563 4565
4564 4566 if (error)
4565 4567 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4566 4568 rootfs.bo_name, fstyp);
4567 4569 else
4568 4570 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4569 4571 rootfs.bo_name, fstyp);
4570 4572 return (error);
4571 4573 }
4572 4574
4573 4575 /*
4574 4576 * XXX this is called by nfs only and should probably be removed
4575 4577 * If booted with ASKNAME, prompt on the console for a filesystem
4576 4578 * name and return it.
4577 4579 */
4578 4580 void
4579 4581 getfsname(char *askfor, char *name, size_t namelen)
4580 4582 {
4581 4583 if (boothowto & RB_ASKNAME) {
4582 4584 printf("%s name: ", askfor);
4583 4585 console_gets(name, namelen);
4584 4586 }
4585 4587 }
4586 4588
4587 4589 /*
4588 4590 * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4589 4591 * property.
4590 4592 *
4591 4593 * Filesystem types starting with the prefix "nfs" are diskless clients;
4592 4594 * init the root filename name (rootfs.bo_name), too.
4593 4595 *
4594 4596 * If we are booting via NFS we currently have these options:
4595 4597 * nfs - dynamically choose NFS V2, V3, or V4 (default)
4596 4598 * nfs2 - force NFS V2
4597 4599 * nfs3 - force NFS V3
4598 4600 * nfs4 - force NFS V4
4599 4601 * Because we need to maintain backward compatibility with the naming
4600 4602 * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4601 4603 * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs". The dynamic
4602 4604 * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4603 4605 * This is only for root filesystems, all other uses will expect
4604 4606 * that "nfs" == NFS V2.
4605 4607 */
4606 4608 static void
4607 4609 getrootfs(char **fstypp, char **fsmodp)
4608 4610 {
4609 4611 char *propstr = NULL;
4610 4612
4611 4613 /*
4612 4614 * Check fstype property; for diskless it should be one of "nfs",
4613 4615 * "nfs2", "nfs3" or "nfs4".
4614 4616 */
4615 4617 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4616 4618 DDI_PROP_DONTPASS, "fstype", &propstr)
4617 4619 == DDI_SUCCESS) {
4618 4620 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4619 4621 ddi_prop_free(propstr);
4620 4622
4621 4623 /*
4622 4624 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4623 4625 * assume the type of this root filesystem is 'zfs'.
4624 4626 */
4625 4627 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4626 4628 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4627 4629 == DDI_SUCCESS) {
4628 4630 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4629 4631 ddi_prop_free(propstr);
4630 4632 }
4631 4633
4632 4634 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4633 4635 *fstypp = *fsmodp = rootfs.bo_fstype;
4634 4636 return;
4635 4637 }
4636 4638
4637 4639 ++netboot;
4638 4640
4639 4641 if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4640 4642 (void) strcpy(rootfs.bo_fstype, "nfs");
4641 4643 else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4642 4644 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4643 4645
4644 4646 /*
4645 4647 * check if path to network interface is specified in bootpath
4646 4648 * or by a hypervisor domain configuration file.
4647 4649 * XXPV - enable strlumb_get_netdev_path()
4648 4650 */
4649 4651 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4650 4652 "xpv-nfsroot")) {
4651 4653 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4652 4654 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4653 4655 DDI_PROP_DONTPASS, "bootpath", &propstr)
4654 4656 == DDI_SUCCESS) {
4655 4657 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4656 4658 ddi_prop_free(propstr);
4657 4659 } else {
4658 4660 rootfs.bo_name[0] = '\0';
4659 4661 }
4660 4662 *fstypp = rootfs.bo_fstype;
4661 4663 *fsmodp = "nfs";
4662 4664 }
4663 4665 #endif
4664 4666
4665 4667 /*
4666 4668 * VFS feature routines
4667 4669 */
4668 4670
4669 4671 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF)
4670 4672 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL)
4671 4673
4672 4674 /* Register a feature in the vfs */
4673 4675 void
4674 4676 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4675 4677 {
4676 4678 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4677 4679 if (vfsp->vfs_implp == NULL)
4678 4680 return;
4679 4681
4680 4682 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4681 4683 }
4682 4684
4683 4685 void
4684 4686 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4685 4687 {
4686 4688 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4687 4689 if (vfsp->vfs_implp == NULL)
4688 4690 return;
4689 4691 vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4690 4692 }
4691 4693
4692 4694 /*
4693 4695 * Query a vfs for a feature.
4694 4696 * Returns 1 if feature is present, 0 if not
4695 4697 */
4696 4698 int
4697 4699 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4698 4700 {
4699 4701 int ret = 0;
4700 4702
4701 4703 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4702 4704 if (vfsp->vfs_implp == NULL)
4703 4705 return (ret);
4704 4706
4705 4707 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4706 4708 ret = 1;
4707 4709
4708 4710 return (ret);
4709 4711 }
4710 4712
4711 4713 /*
4712 4714 * Propagate feature set from one vfs to another
4713 4715 */
4714 4716 void
4715 4717 vfs_propagate_features(vfs_t *from, vfs_t *to)
4716 4718 {
4717 4719 int i;
4718 4720
4719 4721 if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4720 4722 return;
4721 4723
4722 4724 for (i = 1; i <= to->vfs_featureset[0]; i++) {
4723 4725 to->vfs_featureset[i] = from->vfs_featureset[i];
4724 4726 }
4725 4727 }
4726 4728
4727 4729 #define LOFINODE_PATH "/dev/lofi/%d"
4728 4730
4729 4731 /*
4730 4732 * Return the vnode for the lofi node if there's a lofi mount in place.
4731 4733 * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4732 4734 * failure.
4733 4735 */
4734 4736 int
4735 4737 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4736 4738 {
4737 4739 char *path = NULL;
4738 4740 int strsize;
4739 4741 int err;
4740 4742
4741 4743 if (vfsp->vfs_lofi_id == 0) {
4742 4744 *vpp = NULL;
4743 4745 return (-1);
4744 4746 }
4745 4747
4746 4748 strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4747 4749 path = kmem_alloc(strsize + 1, KM_SLEEP);
4748 4750 (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4749 4751
4750 4752 /*
4751 4753 * We may be inside a zone, so we need to use the /dev path, but
4752 4754 * it's created asynchronously, so we wait here.
4753 4755 */
4754 4756 for (;;) {
4755 4757 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4756 4758
4757 4759 if (err != ENOENT)
4758 4760 break;
4759 4761
4760 4762 if ((err = delay_sig(hz / 8)) == EINTR)
4761 4763 break;
4762 4764 }
4763 4765
4764 4766 if (err)
4765 4767 *vpp = NULL;
4766 4768
4767 4769 kmem_free(path, strsize + 1);
4768 4770 return (err);
4769 4771 }
↓ open down ↓ |
3966 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX