fsh, fsd, libfsd, fsdadm
--- old/usr/src/uts/common/fs/vnode.c
+++ new/usr/src/uts/common/fs/vnode.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 #include <sys/types.h>
40 40 #include <sys/param.h>
41 41 #include <sys/t_lock.h>
42 42 #include <sys/errno.h>
43 43 #include <sys/cred.h>
44 44 #include <sys/user.h>
45 45 #include <sys/uio.h>
46 46 #include <sys/file.h>
47 47 #include <sys/pathname.h>
48 48 #include <sys/vfs.h>
49 49 #include <sys/vfs_opreg.h>
50 50 #include <sys/vnode.h>
51 51 #include <sys/rwstlock.h>
52 52 #include <sys/fem.h>
53 53 #include <sys/stat.h>
54 54 #include <sys/mode.h>
55 55 #include <sys/conf.h>
56 56 #include <sys/sysmacros.h>
57 57 #include <sys/cmn_err.h>
58 58 #include <sys/systm.h>
59 59 #include <sys/kmem.h>
60 60 #include <sys/debug.h>
61 61 #include <c2/audit.h>
62 62 #include <sys/acl.h>
63 63 #include <sys/nbmlock.h>
64 64 #include <sys/fcntl.h>
65 65 #include <fs/fs_subr.h>
66 66 #include <sys/taskq.h>
67 67 #include <fs/fs_reparse.h>
68 +#include <sys/fsh_impl.h>
68 69
69 70 /* Determine if this vnode is a file that is read-only */
70 71 #define ISROFILE(vp) \
71 72 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
72 73 (vp)->v_type != VFIFO && vn_is_readonly(vp))
73 74
74 75 /* Tunable via /etc/system; used only by admin/install */
75 76 int nfs_global_client_only;
76 77
77 78 /*
78 79 * Array of vopstats_t for per-FS-type vopstats. This array has the same
79 80 * number of entries as and parallel to the vfssw table. (Arguably, it could
80 81 * be part of the vfssw table.) Once it's initialized, it's accessed using
81 82 * the same fstype index that is used to index into the vfssw table.
82 83 */
83 84 vopstats_t **vopstats_fstype;
84 85
85 86 /* vopstats initialization template used for fast initialization via bcopy() */
86 87 static vopstats_t *vs_templatep;
87 88
88 89 /* Kmem cache handle for vsk_anchor_t allocations */
89 90 kmem_cache_t *vsk_anchor_cache;
90 91
91 92 /* file events cleanup routine */
92 93 extern void free_fopdata(vnode_t *);
93 94
94 95 /*
95 96 * Root of AVL tree for the kstats associated with vopstats. Lock protects
96 97 * updates to vskstat_tree.
97 98 */
98 99 avl_tree_t vskstat_tree;
99 100 kmutex_t vskstat_tree_lock;
100 101
101 102 /* Global variable which enables/disables the vopstats collection */
102 103 int vopstats_enabled = 1;
103 104
104 105 /*
105 106 * forward declarations for internal vnode specific data (vsd)
106 107 */
107 108 static void *vsd_realloc(void *, size_t, size_t);
108 109
109 110 /*
110 111 * forward declarations for reparse point functions
111 112 */
112 113 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
113 114
114 115 /*
115 116 * VSD -- VNODE SPECIFIC DATA
116 117 * The v_data pointer is typically used by a file system to store a
117 118 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
118 119 * However, there are times when additional project private data needs
119 120 * to be stored separately from the data (node) pointed to by v_data.
120 121 * This additional data could be stored by the file system itself or
121 122 * by a completely different kernel entity. VSD provides a way for
122 123 * callers to obtain a key and store a pointer to private data associated
123 124 * with a vnode.
124 125 *
125 126 * Callers are responsible for protecting the vsd by holding v_vsd_lock
126 127 * for calls to vsd_set() and vsd_get().
127 128 */
128 129
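/*
 * Illustrative sketch (editorial; not part of this change): a minimal
 * VSD consumer.  my_vsd_key, my_init() and my_attach() are hypothetical
 * module-defined names; only vsd_create(), vsd_set() and v_vsd_lock
 * come from the interfaces described above.
 */
static uint_t my_vsd_key;

static void
my_init(void)
{
	/* Obtain a key once, e.g. at module load; no destructor here. */
	vsd_create(&my_vsd_key, NULL);
}

static void
my_attach(vnode_t *vp, void *datap)
{
	/* Callers protect VSD with v_vsd_lock, per the comment above. */
	mutex_enter(&vp->v_vsd_lock);
	(void) vsd_set(vp, my_vsd_key, datap);
	mutex_exit(&vp->v_vsd_lock);
}
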
129 130 /*
130 131 * vsd_lock protects:
131 132 * vsd_nkeys - creation and deletion of vsd keys
132 133 * vsd_list - insertion and deletion of vsd_node in the vsd_list
133 134 * vsd_destructor - adding and removing destructors to the list
134 135 */
135 136 static kmutex_t vsd_lock;
136 137 static uint_t vsd_nkeys; /* size of destructor array */
137 138 /* list of vsd_node's */
138 139 static list_t *vsd_list = NULL;
139 140 /* per-key destructor funcs */
140 141 static void (**vsd_destructor)(void *);
141 142
142 143 /*
143 144 * The following is the common set of actions needed to update the
144 145 * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
145 146 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
146 147 * recording of the bytes transferred. Since the code is similar
147 148 * but small, it is nearly duplicated. Consequently, any changes
148 149 * to one may need to be reflected in the other.
149 150 * Rundown of the variables:
150 151 * vp - Pointer to the vnode
151 152 * counter - Partial name structure member to update in vopstats for counts
152 153 * bytecounter - Partial name structure member to update in vopstats for bytes
153 154 * bytesval - Value to update in vopstats for bytes
154 155 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
155 156 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
156 157 */
157 158
158 159 #define VOPSTATS_UPDATE(vp, counter) { \
159 160 vfs_t *vfsp = (vp)->v_vfsp; \
160 161 if (vfsp && vfsp->vfs_implp && \
161 162 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
162 163 vopstats_t *vsp = &vfsp->vfs_vopstats; \
163 164 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
164 165 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
165 166 size_t, uint64_t *); \
166 167 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
167 168 (*stataddr)++; \
168 169 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
169 170 vsp->n##counter.value.ui64++; \
170 171 } \
171 172 } \
172 173 }
173 174
174 175 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
175 176 vfs_t *vfsp = (vp)->v_vfsp; \
176 177 if (vfsp && vfsp->vfs_implp && \
177 178 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
178 179 vopstats_t *vsp = &vfsp->vfs_vopstats; \
179 180 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
180 181 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
181 182 size_t, uint64_t *); \
182 183 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
183 184 (*stataddr)++; \
184 185 vsp->bytecounter.value.ui64 += bytesval; \
185 186 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
186 187 vsp->n##counter.value.ui64++; \
187 188 vsp->bytecounter.value.ui64 += bytesval; \
188 189 } \
189 190 } \
190 191 }
191 192
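/*
 * Illustrative expansion (editorial sketch): the fop_*() wrappers later
 * in this file use these macros around the underlying VOP call, along
 * the lines of the following read wrapper (my_fop_read() is a
 * hypothetical name; details abbreviated):
 */
static int
my_fop_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	int err;
	ssize_t resid_start = uiop->uio_resid;

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	VOPSTATS_UPDATE_IO(vp, read,
	    read_bytes, (resid_start - uiop->uio_resid));
	return (err);
}
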
192 193 /*
193 194 * If the filesystem does not support XIDs, map the credential.
194 195 * If the vfsp is NULL, perhaps we should also map?
195 196 */
196 197 #define VOPXID_MAP_CR(vp, cr) { \
197 198 vfs_t *vfsp = (vp)->v_vfsp; \
198 199 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
199 200 cr = crgetmapped(cr); \
200 201 }
201 202
202 203 /*
203 204 * Convert stat(2) formats to vnode types and vice versa. (Knows about
204 205 * numerical order of S_IFMT and vnode types.)
205 206 */
206 207 enum vtype iftovt_tab[] = {
207 208 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
208 209 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
209 210 };
210 211
211 212 ushort_t vttoif_tab[] = {
212 213 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
213 214 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
214 215 };
215 216
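/*
 * Illustrative expansion (editorial note): these tables back the
 * IFTOVT()/VTTOIF() macros in <sys/mode.h>, roughly:
 *
 *	type = iftovt_tab[((mode) & S_IFMT) >> 12];
 *	mode = vttoif_tab[(int)(type)];
 */
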
216 217 /*
217 218 * The system vnode cache.
218 219 */
219 220
220 221 kmem_cache_t *vn_cache;
221 222
222 223
223 224 /*
224 225 * Vnode operations vector.
225 226 */
226 227
227 228 static const fs_operation_trans_def_t vn_ops_table[] = {
228 229 VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
229 230 fs_nosys, fs_nosys,
230 231
231 232 VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
232 233 fs_nosys, fs_nosys,
233 234
234 235 VOPNAME_READ, offsetof(struct vnodeops, vop_read),
235 236 fs_nosys, fs_nosys,
236 237
237 238 VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
238 239 fs_nosys, fs_nosys,
239 240
240 241 VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
241 242 fs_nosys, fs_nosys,
242 243
243 244 VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
244 245 fs_setfl, fs_nosys,
245 246
246 247 VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
247 248 fs_nosys, fs_nosys,
248 249
249 250 VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
250 251 fs_nosys, fs_nosys,
251 252
252 253 VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
253 254 fs_nosys, fs_nosys,
254 255
255 256 VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
256 257 fs_nosys, fs_nosys,
257 258
258 259 VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
259 260 fs_nosys, fs_nosys,
260 261
261 262 VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
262 263 fs_nosys, fs_nosys,
263 264
264 265 VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
265 266 fs_nosys, fs_nosys,
266 267
267 268 VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
268 269 fs_nosys, fs_nosys,
269 270
270 271 VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
271 272 fs_nosys, fs_nosys,
272 273
273 274 VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
274 275 fs_nosys, fs_nosys,
275 276
276 277 VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
277 278 fs_nosys, fs_nosys,
278 279
279 280 VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
280 281 fs_nosys, fs_nosys,
281 282
282 283 VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
283 284 fs_nosys, fs_nosys,
284 285
285 286 VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
286 287 fs_nosys, fs_nosys,
287 288
288 289 VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
289 290 fs_nosys, fs_nosys,
290 291
291 292 VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
292 293 fs_nosys, fs_nosys,
293 294
294 295 VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
295 296 fs_rwlock, fs_rwlock,
296 297
297 298 VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
298 299 (fs_generic_func_p) fs_rwunlock,
299 300 (fs_generic_func_p) fs_rwunlock, /* no errors allowed */
300 301
301 302 VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
302 303 fs_nosys, fs_nosys,
303 304
304 305 VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
305 306 fs_cmp, fs_cmp, /* no errors allowed */
306 307
307 308 VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
308 309 fs_frlock, fs_nosys,
309 310
310 311 VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
311 312 fs_nosys, fs_nosys,
312 313
313 314 VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
314 315 fs_nosys, fs_nosys,
315 316
316 317 VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
317 318 fs_nosys, fs_nosys,
318 319
319 320 VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
320 321 fs_nosys, fs_nosys,
321 322
322 323 VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
323 324 (fs_generic_func_p) fs_nosys_map,
324 325 (fs_generic_func_p) fs_nosys_map,
325 326
326 327 VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
327 328 (fs_generic_func_p) fs_nosys_addmap,
328 329 (fs_generic_func_p) fs_nosys_addmap,
329 330
330 331 VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
331 332 fs_nosys, fs_nosys,
332 333
333 334 VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
334 335 (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
335 336
336 337 VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
337 338 fs_nosys, fs_nosys,
338 339
339 340 VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
340 341 fs_pathconf, fs_nosys,
341 342
342 343 VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
343 344 fs_nosys, fs_nosys,
344 345
345 346 VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
346 347 fs_nosys, fs_nosys,
347 348
348 349 VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
349 350 (fs_generic_func_p) fs_dispose,
350 351 (fs_generic_func_p) fs_nodispose,
351 352
352 353 VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
353 354 fs_nosys, fs_nosys,
354 355
355 356 VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
356 357 fs_fab_acl, fs_nosys,
357 358
358 359 VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
359 360 fs_shrlock, fs_nosys,
360 361
361 362 VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
362 363 (fs_generic_func_p) fs_vnevent_nosupport,
363 364 (fs_generic_func_p) fs_vnevent_nosupport,
364 365
365 366 VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
366 367 fs_nosys, fs_nosys,
367 368
368 369 VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
369 370 fs_nosys, fs_nosys,
370 371
371 372 NULL, 0, NULL, NULL
372 373 };
373 374
374 375 /* Extensible attribute (xva) routines. */
375 376
376 377 /*
377 378 * Zero out the structure, set the size of the requested/returned bitmaps,
378 379 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
379 380 * to the returned attributes array.
380 381 */
381 382 void
382 383 xva_init(xvattr_t *xvap)
383 384 {
384 385 bzero(xvap, sizeof (xvattr_t));
385 386 xvap->xva_mapsize = XVA_MAPSIZE;
386 387 xvap->xva_magic = XVA_MAGIC;
387 388 xvap->xva_vattr.va_mask = AT_XVATTR;
388 389 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
389 390 }
390 391
391 392 /*
392 393 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
393 394 * structure. Otherwise, returns NULL.
394 395 */
395 396 xoptattr_t *
396 397 xva_getxoptattr(xvattr_t *xvap)
397 398 {
398 399 xoptattr_t *xoap = NULL;
399 400 if (xvap->xva_vattr.va_mask & AT_XVATTR)
400 401 xoap = &xvap->xva_xoptattrs;
401 402 return (xoap);
402 403 }
403 404
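/*
 * Illustrative caller sketch (editorial): requesting a single extensible
 * attribute.  my_get_reparse() is a hypothetical helper; XVA_SET_REQ()
 * and XAT_REPARSE come from <sys/vnode.h>.
 */
static int
my_get_reparse(vnode_t *vp, boolean_t *reparsep)
{
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;

	xva_init(&xva);			/* sets AT_XVATTR in va_mask */
	XVA_SET_REQ(&xva, XAT_REPARSE);	/* ask for one optional attr */
	if ((err = VOP_GETATTR(vp, &xva.xva_vattr, 0, CRED(), NULL)) != 0)
		return (err);
	if ((xoap = xva_getxoptattr(&xva)) != NULL)
		*reparsep = xoap->xoa_reparse;
	return (0);
}
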
404 405 /*
405 406 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
406 407 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
407 408 * kstat name.
408 409 */
409 410 static int
410 411 vska_compar(const void *n1, const void *n2)
411 412 {
412 413 int ret;
413 414 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
414 415 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
415 416
416 417 if (p1 < p2) {
417 418 ret = -1;
418 419 } else if (p1 > p2) {
419 420 ret = 1;
420 421 } else {
421 422 ret = 0;
422 423 }
423 424
424 425 return (ret);
425 426 }
426 427
427 428 /*
428 429 * Used to create a single template which will be bcopy()ed to each newly
429 430 * allocated vopstats structure in initialize_vopstats(), below.
430 431 */
431 432 static vopstats_t *
432 433 create_vopstats_template()
433 434 {
434 435 vopstats_t *vsp;
435 436
436 437 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
437 438 bzero(vsp, sizeof (*vsp)); /* Start fresh */
438 439
439 440 /* VOP_OPEN */
440 441 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
441 442 /* VOP_CLOSE */
442 443 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
443 444 /* VOP_READ I/O */
444 445 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
445 446 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
446 447 /* VOP_WRITE I/O */
447 448 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
448 449 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
449 450 /* VOP_IOCTL */
450 451 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
451 452 /* VOP_SETFL */
452 453 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
453 454 /* VOP_GETATTR */
454 455 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
455 456 /* VOP_SETATTR */
456 457 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
457 458 /* VOP_ACCESS */
458 459 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
459 460 /* VOP_LOOKUP */
460 461 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
461 462 /* VOP_CREATE */
462 463 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
463 464 /* VOP_REMOVE */
464 465 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
465 466 /* VOP_LINK */
466 467 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
467 468 /* VOP_RENAME */
468 469 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
469 470 /* VOP_MKDIR */
470 471 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
471 472 /* VOP_RMDIR */
472 473 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
473 474 /* VOP_READDIR I/O */
474 475 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
475 476 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
476 477 KSTAT_DATA_UINT64);
477 478 /* VOP_SYMLINK */
478 479 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
479 480 /* VOP_READLINK */
480 481 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
481 482 /* VOP_FSYNC */
482 483 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
483 484 /* VOP_INACTIVE */
484 485 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
485 486 /* VOP_FID */
486 487 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
487 488 /* VOP_RWLOCK */
488 489 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
489 490 /* VOP_RWUNLOCK */
490 491 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
491 492 /* VOP_SEEK */
492 493 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
493 494 /* VOP_CMP */
494 495 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
495 496 /* VOP_FRLOCK */
496 497 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
497 498 /* VOP_SPACE */
498 499 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
499 500 /* VOP_REALVP */
500 501 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
501 502 /* VOP_GETPAGE */
502 503 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
503 504 /* VOP_PUTPAGE */
504 505 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
505 506 /* VOP_MAP */
506 507 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
507 508 /* VOP_ADDMAP */
508 509 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
509 510 /* VOP_DELMAP */
510 511 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
511 512 /* VOP_POLL */
512 513 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
513 514 /* VOP_DUMP */
514 515 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
515 516 /* VOP_PATHCONF */
516 517 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
517 518 /* VOP_PAGEIO */
518 519 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
519 520 /* VOP_DUMPCTL */
520 521 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
521 522 /* VOP_DISPOSE */
522 523 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
523 524 /* VOP_SETSECATTR */
524 525 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
525 526 /* VOP_GETSECATTR */
526 527 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
527 528 /* VOP_SHRLOCK */
528 529 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
529 530 /* VOP_VNEVENT */
530 531 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
531 532 /* VOP_REQZCBUF */
532 533 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
533 534 /* VOP_RETZCBUF */
534 535 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
535 536
536 537 return (vsp);
537 538 }
538 539
539 540 /*
540 541 * Creates a kstat structure associated with a vopstats structure.
541 542 */
542 543 kstat_t *
543 544 new_vskstat(char *ksname, vopstats_t *vsp)
544 545 {
545 546 kstat_t *ksp;
546 547
547 548 if (!vopstats_enabled) {
548 549 return (NULL);
549 550 }
550 551
551 552 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
552 553 sizeof (vopstats_t)/sizeof (kstat_named_t),
553 554 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
554 555 if (ksp) {
555 556 ksp->ks_data = vsp;
556 557 kstat_install(ksp);
557 558 }
558 559
559 560 return (ksp);
560 561 }
561 562
562 563 /*
563 564 * Called from vfsinit() to initialize the support mechanisms for vopstats
564 565 */
565 566 void
566 567 vopstats_startup()
567 568 {
568 569 if (!vopstats_enabled)
569 570 return;
570 571
571 572 /*
572 573 * Creates the AVL tree which holds per-vfs vopstat anchors. This
573 574 * is necessary since we need to check if a kstat exists before we
574 575 * attempt to create it. Also, initialize its lock.
575 576 */
576 577 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
577 578 offsetof(vsk_anchor_t, vsk_node));
578 579 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
579 580
580 581 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
581 582 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
582 583 NULL, NULL, 0);
583 584
584 585 /*
585 586 * Set up the array of pointers for the vopstats-by-FS-type.
586 587 * The entries will be allocated/initialized as each file system
587 588 * goes through modload/mod_installfs.
588 589 */
589 590 vopstats_fstype = (vopstats_t **)kmem_zalloc(
590 591 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
591 592
592 593 /* Set up the global vopstats initialization template */
593 594 vs_templatep = create_vopstats_template();
594 595 }
595 596
596 597 /*
597 598 * We need to have all of the counters zeroed.
598 599 * The initialization of the vopstats_t includes on the order of
599 600 * 50 calls to kstat_named_init(). Rather than do that on every call,
600 601 * we do it once in a template (vs_templatep) then bcopy it over.
601 602 */
602 603 void
603 604 initialize_vopstats(vopstats_t *vsp)
604 605 {
605 606 if (vsp == NULL)
606 607 return;
607 608
608 609 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
609 610 }
610 611
611 612 /*
612 613 * If possible, determine which vopstats by fstype to use and
613 614 * return a pointer to the caller.
614 615 */
615 616 vopstats_t *
616 617 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
617 618 {
618 619 int fstype = 0; /* Index into vfssw[] */
619 620 vopstats_t *vsp = NULL;
620 621
621 622 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
622 623 !vopstats_enabled)
623 624 return (NULL);
624 625 /*
625 626 * Set up the fstype. We go to so much trouble because all versions
626 627 * of NFS use the same fstype in their vfs even though they have
627 628 * distinct entries in the vfssw[] table.
628 629 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
629 630 */
630 631 if (vswp) {
631 632 fstype = vswp - vfssw; /* Gets us the index */
632 633 } else {
633 634 fstype = vfsp->vfs_fstype;
634 635 }
635 636
636 637 /*
637 638 * Point to the per-fstype vopstats. The only valid values are
638 639 * positive values less than the number of vfssw[] table
639 640 * entries.
640 641 */
641 642 if (fstype > 0 && fstype < nfstype) {
642 643 vsp = vopstats_fstype[fstype];
643 644 }
644 645
645 646 return (vsp);
646 647 }
647 648
648 649 /*
649 650 * Generate a kstat name, create the kstat structure, and allocate a
650 651 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
651 652 * to the caller. This must only be called from a mount.
652 653 */
653 654 vsk_anchor_t *
654 655 get_vskstat_anchor(vfs_t *vfsp)
655 656 {
656 657 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
657 658 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
658 659 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
659 660 kstat_t *ksp; /* Ptr to new kstat */
660 661 avl_index_t where; /* Location in the AVL tree */
661 662
662 663 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
663 664 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
664 665 return (NULL);
665 666
666 667 /* Need to get the fsid to build a kstat name */
667 668 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
668 669 /* Create a name for our kstats based on fsid */
669 670 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
670 671 VOPSTATS_STR, statvfsbuf.f_fsid);
671 672
672 673 /* Allocate and initialize the vsk_anchor_t */
673 674 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
674 675 bzero(vskp, sizeof (*vskp));
675 676 vskp->vsk_fsid = statvfsbuf.f_fsid;
676 677
677 678 mutex_enter(&vskstat_tree_lock);
678 679 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
679 680 avl_insert(&vskstat_tree, vskp, where);
680 681 mutex_exit(&vskstat_tree_lock);
681 682
682 683 /*
683 684 * Now that we've got the anchor in the AVL
684 685 * tree, we can create the kstat.
685 686 */
686 687 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
687 688 if (ksp) {
688 689 vskp->vsk_ksp = ksp;
689 690 }
690 691 } else {
691 692 /* Oops, found one! Release memory and lock. */
692 693 mutex_exit(&vskstat_tree_lock);
693 694 kmem_cache_free(vsk_anchor_cache, vskp);
694 695 vskp = NULL;
695 696 }
696 697 }
697 698 return (vskp);
698 699 }
699 700
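/*
 * Editorial note: the kstat created here is published as
 * unix:0:vopstats_<fsid-in-hex> (VOPSTATS_STR plus f_fsid), so a
 * mount's per-fs stats can be read with, e.g.:
 *
 *	kstat -m unix -n 'vopstats_*'
 */
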
700 701 /*
701 702 * We're in the process of tearing down the vfs and need to cleanup
702 703 * the data structures associated with the vopstats. Must only be called
703 704 * from dounmount().
704 705 */
705 706 void
706 707 teardown_vopstats(vfs_t *vfsp)
707 708 {
708 709 vsk_anchor_t *vskap;
709 710 avl_index_t where;
710 711
711 712 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
712 713 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
713 714 return;
714 715
715 716 /* This is a safe check since VFS_STATS must be set (see above) */
716 717 if ((vskap = vfsp->vfs_vskap) == NULL)
717 718 return;
718 719
719 720 /* Whack the pointer right away */
720 721 vfsp->vfs_vskap = NULL;
721 722
722 723 /* Lock the tree, remove the node, and delete the kstat */
723 724 mutex_enter(&vskstat_tree_lock);
724 725 if (avl_find(&vskstat_tree, vskap, &where)) {
725 726 avl_remove(&vskstat_tree, vskap);
726 727 }
727 728
728 729 if (vskap->vsk_ksp) {
729 730 kstat_delete(vskap->vsk_ksp);
730 731 }
731 732 mutex_exit(&vskstat_tree_lock);
732 733
733 734 kmem_cache_free(vsk_anchor_cache, vskap);
734 735 }
735 736
736 737 /*
737 738 * Read or write a vnode. Called from kernel code.
738 739 */
739 740 int
740 741 vn_rdwr(
741 742 enum uio_rw rw,
742 743 struct vnode *vp,
743 744 caddr_t base,
744 745 ssize_t len,
745 746 offset_t offset,
746 747 enum uio_seg seg,
747 748 int ioflag,
748 749 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
749 750 cred_t *cr,
750 751 ssize_t *residp)
751 752 {
752 753 struct uio uio;
753 754 struct iovec iov;
754 755 int error;
755 756 int in_crit = 0;
756 757
757 758 if (rw == UIO_WRITE && ISROFILE(vp))
758 759 return (EROFS);
759 760
760 761 if (len < 0)
761 762 return (EIO);
762 763
763 764 VOPXID_MAP_CR(vp, cr);
764 765
765 766 iov.iov_base = base;
766 767 iov.iov_len = len;
767 768 uio.uio_iov = &iov;
768 769 uio.uio_iovcnt = 1;
769 770 uio.uio_loffset = offset;
770 771 uio.uio_segflg = (short)seg;
771 772 uio.uio_resid = len;
772 773 uio.uio_llimit = ulimit;
773 774
774 775 /*
775 776 * We have to enter the critical region before calling VOP_RWLOCK
776 777 * to avoid a deadlock with ufs.
777 778 */
778 779 if (nbl_need_check(vp)) {
779 780 int svmand;
780 781
781 782 nbl_start_crit(vp, RW_READER);
782 783 in_crit = 1;
783 784 error = nbl_svmand(vp, cr, &svmand);
784 785 if (error != 0)
785 786 goto done;
786 787 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
787 788 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
788 789 error = EACCES;
789 790 goto done;
790 791 }
791 792 }
792 793
793 794 (void) VOP_RWLOCK(vp,
794 795 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
795 796 if (rw == UIO_WRITE) {
796 797 uio.uio_fmode = FWRITE;
797 798 uio.uio_extflg = UIO_COPY_DEFAULT;
798 799 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
799 800 } else {
800 801 uio.uio_fmode = FREAD;
801 802 uio.uio_extflg = UIO_COPY_CACHED;
802 803 error = VOP_READ(vp, &uio, ioflag, cr, NULL);
803 804 }
804 805 VOP_RWUNLOCK(vp,
805 806 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
806 807 if (residp)
807 808 *residp = uio.uio_resid;
808 809 else if (uio.uio_resid)
809 810 error = EIO;
810 811
811 812 done:
812 813 if (in_crit)
813 814 nbl_end_crit(vp);
814 815 return (error);
815 816 }
816 817
817 818 /*
818 819 * Release a vnode. Call VOP_INACTIVE on last reference or
819 820 * decrement reference count.
820 821 *
821 822 * To avoid race conditions, the v_count is left at 1 for
822 823 * the call to VOP_INACTIVE. This prevents another thread
823 824 * from reclaiming and releasing the vnode *before* the
824 825 * VOP_INACTIVE routine has a chance to destroy the vnode.
825 826 * We can't have more than 1 thread calling VOP_INACTIVE
826 827 * on a vnode.
827 828 */
828 829 void
829 830 vn_rele(vnode_t *vp)
830 831 {
831 832 VERIFY(vp->v_count > 0);
832 833 mutex_enter(&vp->v_lock);
833 834 if (vp->v_count == 1) {
834 835 mutex_exit(&vp->v_lock);
835 836 VOP_INACTIVE(vp, CRED(), NULL);
836 837 return;
837 838 }
838 839 vp->v_count--;
839 840 mutex_exit(&vp->v_lock);
840 841 }
841 842
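/*
 * Illustrative pairing (editorial note): holds and releases are
 * symmetric; the last release is what drives VOP_INACTIVE():
 *
 *	VN_HOLD(vp);
 *	... use vp ...
 *	VN_RELE(vp);
 */
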
842 843 /*
843 844 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
844 845 * as a single reference, so v_count is not decremented until the last DNLC hold
845 846 * is released. This makes it possible to distinguish vnodes that are referenced
846 847 * only by the DNLC.
847 848 */
848 849 void
849 850 vn_rele_dnlc(vnode_t *vp)
850 851 {
851 852 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
852 853 mutex_enter(&vp->v_lock);
853 854 if (--vp->v_count_dnlc == 0) {
854 855 if (vp->v_count == 1) {
855 856 mutex_exit(&vp->v_lock);
856 857 VOP_INACTIVE(vp, CRED(), NULL);
857 858 return;
858 859 }
859 860 vp->v_count--;
860 861 }
861 862 mutex_exit(&vp->v_lock);
862 863 }
863 864
864 865 /*
865 866 * Like vn_rele() except that it clears v_stream under v_lock.
866 867 * This is used by sockfs when it dismantles the association between
867 868 * the sockfs node and the vnode in the underlying file system.
868 869 * v_lock has to be held to prevent a thread coming through the lookupname
869 870 * path from accessing a stream head that is going away.
870 871 */
871 872 void
872 873 vn_rele_stream(vnode_t *vp)
873 874 {
874 875 VERIFY(vp->v_count > 0);
875 876 mutex_enter(&vp->v_lock);
876 877 vp->v_stream = NULL;
877 878 if (vp->v_count == 1) {
878 879 mutex_exit(&vp->v_lock);
879 880 VOP_INACTIVE(vp, CRED(), NULL);
880 881 return;
881 882 }
882 883 vp->v_count--;
883 884 mutex_exit(&vp->v_lock);
884 885 }
885 886
886 887 static void
887 888 vn_rele_inactive(vnode_t *vp)
888 889 {
889 890 VOP_INACTIVE(vp, CRED(), NULL);
890 891 }
891 892
892 893 /*
893 894 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
894 895 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
895 896 * the file system as a result of releasing the vnode. Note, file systems
896 897 * already have to handle the race where the vnode count is incremented
897 898 * before the inactive routine is called and does its locking.
898 899 *
899 900 * Warning: Excessive use of this routine can lead to performance problems.
900 901 * This is because taskqs throttle back allocation if too many are created.
901 902 */
902 903 void
903 904 vn_rele_async(vnode_t *vp, taskq_t *taskq)
904 905 {
905 906 VERIFY(vp->v_count > 0);
906 907 mutex_enter(&vp->v_lock);
907 908 if (vp->v_count == 1) {
908 909 mutex_exit(&vp->v_lock);
909 910 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
910 911 vp, TQ_SLEEP) != NULL);
911 912 return;
912 913 }
913 914 vp->v_count--;
914 915 mutex_exit(&vp->v_lock);
915 916 }
916 917
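/*
 * Illustrative call (editorial sketch): defer a possibly re-entrant
 * release to a taskq, e.g. the global system_taskq from <sys/taskq.h>:
 *
 *	vn_rele_async(vp, system_taskq);
 */
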
917 918 int
918 919 vn_open(
919 920 char *pnamep,
920 921 enum uio_seg seg,
921 922 int filemode,
922 923 int createmode,
923 924 struct vnode **vpp,
924 925 enum create crwhy,
925 926 mode_t umask)
926 927 {
927 928 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
928 929 umask, NULL, -1));
929 930 }
930 931
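/*
 * Illustrative kernel usage (editorial sketch): open a file by path,
 * read from it with vn_rdwr() above, and release it.  my_read_file()
 * and "/my/path" are hypothetical.
 */
static int
my_read_file(caddr_t buf, ssize_t len, ssize_t *residp)
{
	vnode_t *vp;
	int err;

	if ((err = vn_open("/my/path", UIO_SYSSPACE, FREAD, 0, &vp,
	    0, 0)) != 0)
		return (err);
	err = vn_rdwr(UIO_READ, vp, buf, len, (offset_t)0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, CRED(), residp);
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
	VN_RELE(vp);
	return (err);
}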
931 932
932 933 /*
933 934 * Open/create a vnode.
934 935 * This may be called from the kernel; the only known use of user
935 936 * context is that the current user's credentials are used for
936 937 * permission checks. crwhy is defined iff filemode & FCREAT.
937 938 */
938 939 int
939 940 vn_openat(
940 941 char *pnamep,
941 942 enum uio_seg seg,
942 943 int filemode,
943 944 int createmode,
944 945 struct vnode **vpp,
945 946 enum create crwhy,
946 947 mode_t umask,
947 948 struct vnode *startvp,
948 949 int fd)
949 950 {
950 951 struct vnode *vp;
951 952 int mode;
952 953 int accessflags;
953 954 int error;
954 955 int in_crit = 0;
955 956 int open_done = 0;
956 957 int shrlock_done = 0;
957 958 struct vattr vattr;
958 959 enum symfollow follow;
959 960 int estale_retry = 0;
960 961 struct shrlock shr;
961 962 struct shr_locowner shr_own;
962 963
963 964 mode = 0;
964 965 accessflags = 0;
965 966 if (filemode & FREAD)
966 967 mode |= VREAD;
967 968 if (filemode & (FWRITE|FTRUNC))
968 969 mode |= VWRITE;
969 970 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
970 971 mode |= VEXEC;
971 972
972 973 /* symlink interpretation */
973 974 if (filemode & FNOFOLLOW)
974 975 follow = NO_FOLLOW;
975 976 else
976 977 follow = FOLLOW;
977 978
978 979 if (filemode & FAPPEND)
979 980 accessflags |= V_APPEND;
980 981
981 982 top:
982 983 if (filemode & FCREAT) {
983 984 enum vcexcl excl;
984 985
985 986 /*
986 987 * Wish to create a file.
987 988 */
988 989 vattr.va_type = VREG;
989 990 vattr.va_mode = createmode;
990 991 vattr.va_mask = AT_TYPE|AT_MODE;
991 992 if (filemode & FTRUNC) {
992 993 vattr.va_size = 0;
993 994 vattr.va_mask |= AT_SIZE;
994 995 }
995 996 if (filemode & FEXCL)
996 997 excl = EXCL;
997 998 else
998 999 excl = NONEXCL;
999 1000
1000 1001 if (error =
1001 1002 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1002 1003 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1003 1004 return (error);
1004 1005 } else {
1005 1006 /*
1006 1007 * Wish to open a file. Just look it up.
1007 1008 */
1008 1009 if (error = lookupnameat(pnamep, seg, follow,
1009 1010 NULLVPP, &vp, startvp)) {
1010 1011 if ((error == ESTALE) &&
1011 1012 fs_need_estale_retry(estale_retry++))
1012 1013 goto top;
1013 1014 return (error);
1014 1015 }
1015 1016
1016 1017 /*
1017 1018 * Get the attributes to check whether file is large.
1018 1019 * We do this only if the FOFFMAX flag is not set and
1019 1020 * only for regular files.
1020 1021 */
1021 1022
1022 1023 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1023 1024 vattr.va_mask = AT_SIZE;
1024 1025 if ((error = VOP_GETATTR(vp, &vattr, 0,
1025 1026 CRED(), NULL))) {
1026 1027 goto out;
1027 1028 }
1028 1029 if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1029 1030 /*
1030 1031 * Large File API - regular open fails
1031 1032 * if FOFFMAX flag is set in file mode
1032 1033 */
1033 1034 error = EOVERFLOW;
1034 1035 goto out;
1035 1036 }
1036 1037 }
1037 1038 /*
1038 1039 * Can't write directories, active texts, or
1039 1040 * read-only filesystems. Can't truncate files
1040 1041 * on which mandatory locking is in effect.
1041 1042 */
1042 1043 if (filemode & (FWRITE|FTRUNC)) {
1043 1044 /*
1044 1045 * Allow writable directory if VDIROPEN flag is set.
1045 1046 */
1046 1047 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1047 1048 error = EISDIR;
1048 1049 goto out;
1049 1050 }
1050 1051 if (ISROFILE(vp)) {
1051 1052 error = EROFS;
1052 1053 goto out;
1053 1054 }
1054 1055 /*
1055 1056 * Can't truncate files on which
1056 1057 * sysv mandatory locking is in effect.
1057 1058 */
1058 1059 if (filemode & FTRUNC) {
1059 1060 vnode_t *rvp;
1060 1061
1061 1062 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1062 1063 rvp = vp;
1063 1064 if (rvp->v_filocks != NULL) {
1064 1065 vattr.va_mask = AT_MODE;
1065 1066 if ((error = VOP_GETATTR(vp,
1066 1067 &vattr, 0, CRED(), NULL)) == 0 &&
1067 1068 MANDLOCK(vp, vattr.va_mode))
1068 1069 error = EAGAIN;
1069 1070 }
1070 1071 }
1071 1072 if (error)
1072 1073 goto out;
1073 1074 }
1074 1075 /*
1075 1076 * Check permissions.
1076 1077 */
1077 1078 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1078 1079 goto out;
1079 1080 /*
1080 1081 * Require FSEARCH to return a directory.
1081 1082 * Require FEXEC to return a regular file.
1082 1083 */
1083 1084 if ((filemode & FSEARCH) && vp->v_type != VDIR) {
1084 1085 error = ENOTDIR;
1085 1086 goto out;
1086 1087 }
1087 1088 if ((filemode & FEXEC) && vp->v_type != VREG) {
1088 1089 error = ENOEXEC; /* XXX: error code? */
1089 1090 goto out;
1090 1091 }
1091 1092 }
1092 1093
1093 1094 /*
1094 1095 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1095 1096 */
1096 1097 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1097 1098 error = ELOOP;
1098 1099 goto out;
1099 1100 }
1100 1101 if (filemode & FNOLINKS) {
1101 1102 vattr.va_mask = AT_NLINK;
1102 1103 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1103 1104 goto out;
1104 1105 }
1105 1106 if (vattr.va_nlink != 1) {
1106 1107 error = EMLINK;
1107 1108 goto out;
1108 1109 }
1109 1110 }
1110 1111
1111 1112 /*
1112 1113 * Opening a socket corresponding to the AF_UNIX pathname
1113 1114 * in the filesystem name space is not supported.
1114 1115 * However, VSOCK nodes in namefs are supported in order
1115 1116 * to make fattach work for sockets.
1116 1117 *
1117 1118 * XXX This uses VOP_REALVP to distinguish between
1118 1119 * an unopened namefs node (where VOP_REALVP returns a
1119 1120 * different VSOCK vnode) and a VSOCK created by vn_create
1120 1121 * in some file system (where VOP_REALVP would never return
1121 1122 * a different vnode).
1122 1123 */
1123 1124 if (vp->v_type == VSOCK) {
1124 1125 struct vnode *nvp;
1125 1126
1126 1127 error = VOP_REALVP(vp, &nvp, NULL);
1127 1128 if (error != 0 || nvp == NULL || nvp == vp ||
1128 1129 nvp->v_type != VSOCK) {
1129 1130 error = EOPNOTSUPP;
1130 1131 goto out;
1131 1132 }
1132 1133 }
1133 1134
1134 1135 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1135 1136 /* get share reservation */
1136 1137 shr.s_access = 0;
1137 1138 if (filemode & FWRITE)
1138 1139 shr.s_access |= F_WRACC;
1139 1140 if (filemode & FREAD)
1140 1141 shr.s_access |= F_RDACC;
1141 1142 shr.s_deny = 0;
1142 1143 shr.s_sysid = 0;
1143 1144 shr.s_pid = ttoproc(curthread)->p_pid;
1144 1145 shr_own.sl_pid = shr.s_pid;
1145 1146 shr_own.sl_id = fd;
1146 1147 shr.s_own_len = sizeof (shr_own);
1147 1148 shr.s_owner = (caddr_t)&shr_own;
1148 1149 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1149 1150 NULL);
1150 1151 if (error)
1151 1152 goto out;
1152 1153 shrlock_done = 1;
1153 1154
1154 1155 /* nbmand conflict check if truncating file */
1155 1156 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1156 1157 nbl_start_crit(vp, RW_READER);
1157 1158 in_crit = 1;
1158 1159
1159 1160 vattr.va_mask = AT_SIZE;
1160 1161 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1161 1162 goto out;
1162 1163 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1163 1164 NULL)) {
1164 1165 error = EACCES;
1165 1166 goto out;
1166 1167 }
1167 1168 }
1168 1169 }
1169 1170
1170 1171 /*
1171 1172 * Do opening protocol.
1172 1173 */
1173 1174 error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1174 1175 if (error)
1175 1176 goto out;
1176 1177 open_done = 1;
1177 1178
1178 1179 /*
1179 1180 * Truncate if required.
1180 1181 */
1181 1182 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1182 1183 vattr.va_size = 0;
1183 1184 vattr.va_mask = AT_SIZE;
1184 1185 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1185 1186 goto out;
1186 1187 }
1187 1188 out:
1188 1189 ASSERT(vp->v_count > 0);
1189 1190
1190 1191 if (in_crit) {
1191 1192 nbl_end_crit(vp);
1192 1193 in_crit = 0;
1193 1194 }
1194 1195 if (error) {
1195 1196 if (open_done) {
1196 1197 (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1197 1198 NULL);
1198 1199 open_done = 0;
1199 1200 shrlock_done = 0;
1200 1201 }
1201 1202 if (shrlock_done) {
1202 1203 (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1203 1204 NULL);
1204 1205 shrlock_done = 0;
1205 1206 }
1206 1207
1207 1208 /*
1208 1209 * The following clause was added to handle a problem
1209 1210 * with NFS consistency. It is possible that a lookup
1210 1211 * of the file to be opened succeeded, but the file
1211 1212 * itself doesn't actually exist on the server. This
1212 1213 * is chiefly due to the DNLC containing an entry for
1213 1214 * the file which has been removed on the server. In
1214 1215 * this case, we just start over. If there was some
1215 1216 * other cause for the ESTALE error, then the lookup
1216 1217 * of the file will fail and the error will be returned
1217 1218 * above instead of looping around from here.
1218 1219 */
1219 1220 VN_RELE(vp);
1220 1221 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1221 1222 goto top;
1222 1223 } else
1223 1224 *vpp = vp;
1224 1225 return (error);
1225 1226 }
1226 1227
1227 1228 /*
1228 1229 * The following two accessor functions are for the NFSv4 server. Since there
1229 1230 * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1230 1231 * vnode open counts correct when a client "upgrades" an open or does an
1231 1232 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1232 1233 * open mode (add or subtract read or write), but also change the share/deny
1233 1234 * modes. However, share reservations are not integrated with OPEN, yet, so
1234 1235 * we need to handle each separately. These functions are cleaner than having
1235 1236 * the NFS server manipulate the counts directly, however, nobody else should
1236 1237 * use these functions.
1237 1238 */
1238 1239 void
1239 1240 vn_open_upgrade(
1240 1241 vnode_t *vp,
1241 1242 int filemode)
1242 1243 {
1243 1244 ASSERT(vp->v_type == VREG);
1244 1245
1245 1246 if (filemode & FREAD)
1246 1247 atomic_add_32(&(vp->v_rdcnt), 1);
1247 1248 if (filemode & FWRITE)
1248 1249 atomic_add_32(&(vp->v_wrcnt), 1);
1249 1250
1250 1251 }
1251 1252
1252 1253 void
1253 1254 vn_open_downgrade(
1254 1255 vnode_t *vp,
1255 1256 int filemode)
1256 1257 {
1257 1258 ASSERT(vp->v_type == VREG);
1258 1259
1259 1260 if (filemode & FREAD) {
1260 1261 ASSERT(vp->v_rdcnt > 0);
1261 1262 atomic_add_32(&(vp->v_rdcnt), -1);
1262 1263 }
1263 1264 if (filemode & FWRITE) {
1264 1265 ASSERT(vp->v_wrcnt > 0);
1265 1266 atomic_add_32(&(vp->v_wrcnt), -1);
1266 1267 }
1267 1268
1268 1269 }
1269 1270
1270 1271 int
1271 1272 vn_create(
1272 1273 char *pnamep,
1273 1274 enum uio_seg seg,
1274 1275 struct vattr *vap,
1275 1276 enum vcexcl excl,
1276 1277 int mode,
1277 1278 struct vnode **vpp,
1278 1279 enum create why,
1279 1280 int flag,
1280 1281 mode_t umask)
1281 1282 {
1282 1283 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1283 1284 umask, NULL));
1284 1285 }
1285 1286
1286 1287 /*
1287 1288 * Create a vnode (makenode).
1288 1289 */
1289 1290 int
1290 1291 vn_createat(
1291 1292 char *pnamep,
1292 1293 enum uio_seg seg,
1293 1294 struct vattr *vap,
1294 1295 enum vcexcl excl,
1295 1296 int mode,
1296 1297 struct vnode **vpp,
1297 1298 enum create why,
1298 1299 int flag,
1299 1300 mode_t umask,
1300 1301 struct vnode *startvp)
1301 1302 {
1302 1303 struct vnode *dvp; /* ptr to parent dir vnode */
1303 1304 struct vnode *vp = NULL;
1304 1305 struct pathname pn;
1305 1306 int error;
1306 1307 int in_crit = 0;
1307 1308 struct vattr vattr;
1308 1309 enum symfollow follow;
1309 1310 int estale_retry = 0;
1310 1311 uint32_t auditing = AU_AUDITING();
1311 1312
1312 1313 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1313 1314
1314 1315 /* symlink interpretation */
1315 1316 if ((flag & FNOFOLLOW) || excl == EXCL)
1316 1317 follow = NO_FOLLOW;
1317 1318 else
1318 1319 follow = FOLLOW;
1319 1320 flag &= ~(FNOFOLLOW|FNOLINKS);
1320 1321
1321 1322 top:
1322 1323 /*
1323 1324 * Lookup directory.
1324 1325 * If new object is a file, call lower level to create it.
1325 1326 * Note that it is up to the lower level to enforce exclusive
1326 1327 * creation, if the file is already there.
1327 1328 * This allows the lower level to do whatever
1328 1329 * locking or protocol that is needed to prevent races.
1329 1330 * If the new object is directory call lower level to make
1330 1331 * the new directory, with "." and "..".
1331 1332 */
1332 1333 if (error = pn_get(pnamep, seg, &pn))
1333 1334 return (error);
1334 1335 if (auditing)
1335 1336 audit_vncreate_start();
1336 1337 dvp = NULL;
1337 1338 *vpp = NULL;
1338 1339 /*
1339 1340 * lookup will find the parent directory for the vnode.
1340 1341 * When it is done the pn holds the name of the entry
1341 1342 * in the directory.
1342 1343 * If this is a non-exclusive create we also find the node itself.
1343 1344 */
1344 1345 error = lookuppnat(&pn, NULL, follow, &dvp,
1345 1346 (excl == EXCL) ? NULLVPP : vpp, startvp);
1346 1347 if (error) {
1347 1348 pn_free(&pn);
1348 1349 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1349 1350 goto top;
1350 1351 if (why == CRMKDIR && error == EINVAL)
1351 1352 error = EEXIST; /* SVID */
1352 1353 return (error);
1353 1354 }
1354 1355
1355 1356 if (why != CRMKNOD)
1356 1357 vap->va_mode &= ~VSVTX;
1357 1358
1358 1359 /*
1359 1360 * If default ACLs are defined for the directory don't apply the
1360 1361 * umask if umask is passed.
1361 1362 */
1362 1363
1363 1364 if (umask) {
1364 1365
1365 1366 vsecattr_t vsec;
1366 1367
1367 1368 vsec.vsa_aclcnt = 0;
1368 1369 vsec.vsa_aclentp = NULL;
1369 1370 vsec.vsa_dfaclcnt = 0;
1370 1371 vsec.vsa_dfaclentp = NULL;
1371 1372 vsec.vsa_mask = VSA_DFACLCNT;
1372 1373 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1373 1374 /*
1374 1375 * If error is ENOSYS then treat it as no error
1375 1376 * Don't want to force all file systems to support
1376 1377 * aclent_t style of ACL's.
1377 1378 */
1378 1379 if (error == ENOSYS)
1379 1380 error = 0;
1380 1381 if (error) {
1381 1382 if (*vpp != NULL)
1382 1383 VN_RELE(*vpp);
1383 1384 goto out;
1384 1385 } else {
1385 1386 /*
1386 1387 * Apply the umask if no default ACLs.
1387 1388 */
1388 1389 if (vsec.vsa_dfaclcnt == 0)
1389 1390 vap->va_mode &= ~umask;
1390 1391
1391 1392 /*
1392 1393 * VOP_GETSECATTR() may have allocated memory for
1393 1394 * ACLs we didn't request, so double-check and
1394 1395 * free it if necessary.
1395 1396 */
1396 1397 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1397 1398 kmem_free((caddr_t)vsec.vsa_aclentp,
1398 1399 vsec.vsa_aclcnt * sizeof (aclent_t));
1399 1400 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1400 1401 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1401 1402 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1402 1403 }
1403 1404 }
1404 1405
1405 1406 /*
1406 1407 * In general we want to generate EROFS if the file system is
1407 1408 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1408 1409 * documents the open system call, and it says that O_CREAT has no
1409 1410 * effect if the file already exists. Bug 1119649 states
1410 1411 * that open(path, O_CREAT, ...) fails when attempting to open an
1411 1412 * existing file on a read only file system. Thus, the first part
1412 1413 * of the following if statement has 3 checks:
1413 1414 * if the file exists &&
1414 1415 * it is being open with write access &&
1415 1416 * the file system is read only
1416 1417 * then generate EROFS
1417 1418 */
1418 1419 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1419 1420 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1420 1421 if (*vpp)
1421 1422 VN_RELE(*vpp);
1422 1423 error = EROFS;
1423 1424 } else if (excl == NONEXCL && *vpp != NULL) {
1424 1425 vnode_t *rvp;
1425 1426
1426 1427 /*
1427 1428 * File already exists. If a mandatory lock has been
1428 1429 * applied, return error.
1429 1430 */
1430 1431 vp = *vpp;
1431 1432 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1432 1433 rvp = vp;
1433 1434 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1434 1435 nbl_start_crit(vp, RW_READER);
1435 1436 in_crit = 1;
1436 1437 }
1437 1438 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1438 1439 vattr.va_mask = AT_MODE|AT_SIZE;
1439 1440 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1440 1441 goto out;
1441 1442 }
1442 1443 if (MANDLOCK(vp, vattr.va_mode)) {
1443 1444 error = EAGAIN;
1444 1445 goto out;
1445 1446 }
1446 1447 /*
1447 1448 * File cannot be truncated if non-blocking mandatory
1448 1449 * locks are currently on the file.
1449 1450 */
1450 1451 if ((vap->va_mask & AT_SIZE) && in_crit) {
1451 1452 u_offset_t offset;
1452 1453 ssize_t length;
1453 1454
1454 1455 offset = vap->va_size > vattr.va_size ?
1455 1456 vattr.va_size : vap->va_size;
1456 1457 length = vap->va_size > vattr.va_size ?
1457 1458 vap->va_size - vattr.va_size :
1458 1459 vattr.va_size - vap->va_size;
1459 1460 if (nbl_conflict(vp, NBL_WRITE, offset,
1460 1461 length, 0, NULL)) {
1461 1462 error = EACCES;
1462 1463 goto out;
1463 1464 }
1464 1465 }
1465 1466 }
1466 1467
1467 1468 /*
1468 1469 * If the file is the root of a VFS, we've crossed a
1469 1470 * mount point and the "containing" directory that we
1470 1471 * acquired above (dvp) is irrelevant because it's in
1471 1472 * a different file system. We apply VOP_CREATE to the
1472 1473 * target itself instead of to the containing directory
1473 1474 * and supply a null path name to indicate (conventionally)
1474 1475 * the node itself as the "component" of interest.
1475 1476 *
1476 1477 * The intercession of the file system is necessary to
1477 1478 * ensure that the appropriate permission checks are
1478 1479 * done.
1479 1480 */
1480 1481 if (vp->v_flag & VROOT) {
1481 1482 ASSERT(why != CRMKDIR);
1482 1483 error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1483 1484 CRED(), flag, NULL, NULL);
1484 1485 /*
1485 1486 * If the create succeeded, it will have created
1486 1487 * a new reference to the vnode. Give up the
1487 1488 * original reference. The assertion should not
1488 1489 * get triggered because NBMAND locks only apply to
1489 1490 * VREG files. And if in_crit is non-zero for some
1490 1491 * reason, detect that here, rather than when we
1491 1492 * dereference a null vp.
1492 1493 */
1493 1494 ASSERT(in_crit == 0);
1494 1495 VN_RELE(vp);
1495 1496 vp = NULL;
1496 1497 goto out;
1497 1498 }
1498 1499
1499 1500 /*
1500 1501 * Large File API - non-large open (FOFFMAX flag not set)
1501 1502 * of regular file fails if the file size exceeds MAXOFF32_T.
1502 1503 */
1503 1504 if (why != CRMKDIR &&
1504 1505 !(flag & FOFFMAX) &&
1505 1506 (vp->v_type == VREG)) {
1506 1507 vattr.va_mask = AT_SIZE;
1507 1508 if ((error = VOP_GETATTR(vp, &vattr, 0,
1508 1509 CRED(), NULL))) {
1509 1510 goto out;
1510 1511 }
1511 1512 if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1512 1513 error = EOVERFLOW;
1513 1514 goto out;
1514 1515 }
1515 1516 }
1516 1517 }
1517 1518
1518 1519 if (error == 0) {
1519 1520 /*
1520 1521 * Call mkdir() if specified, otherwise create().
1521 1522 */
1522 1523 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1523 1524
1524 1525 if (why == CRMKDIR)
1525 1526 /*
1526 1527 * N.B., if vn_createat() ever requests
1527 1528 * case-insensitive behavior then it will need
1528 1529 * to be passed to VOP_MKDIR(). VOP_CREATE()
1529 1530 * will already get it via "flag"
1530 1531 */
1531 1532 error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1532 1533 NULL, 0, NULL);
1533 1534 else if (!must_be_dir)
1534 1535 error = VOP_CREATE(dvp, pn.pn_path, vap,
1535 1536 excl, mode, vpp, CRED(), flag, NULL, NULL);
1536 1537 else
1537 1538 error = ENOTDIR;
1538 1539 }
1539 1540
1540 1541 out:
1541 1542
1542 1543 if (auditing)
1543 1544 audit_vncreate_finish(*vpp, error);
1544 1545 if (in_crit) {
1545 1546 nbl_end_crit(vp);
1546 1547 in_crit = 0;
1547 1548 }
1548 1549 if (vp != NULL) {
1549 1550 VN_RELE(vp);
1550 1551 vp = NULL;
1551 1552 }
1552 1553 pn_free(&pn);
1553 1554 VN_RELE(dvp);
1554 1555 /*
1555 1556 * The following clause was added to handle a problem
1556 1557 * with NFS consistency. It is possible that a lookup
1557 1558 * of the file to be created succeeded, but the file
1558 1559 * itself doesn't actually exist on the server. This
1559 1560 * is chiefly due to the DNLC containing an entry for
1560 1561 * the file which has been removed on the server. In
1561 1562 * this case, we just start over. If there was some
1562 1563 * other cause for the ESTALE error, then the lookup
1563 1564 * of the file will fail and the error will be returned
1564 1565 * above instead of looping around from here.
1565 1566 */
1566 1567 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1567 1568 goto top;
1568 1569 return (error);
1569 1570 }
1570 1571
1571 1572 int
1572 1573 vn_link(char *from, char *to, enum uio_seg seg)
1573 1574 {
1574 1575 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1575 1576 }
1576 1577
1577 1578 int
1578 1579 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1579 1580 vnode_t *tstartvp, char *to, enum uio_seg seg)
1580 1581 {
1581 1582 struct vnode *fvp; /* from vnode ptr */
1582 1583 struct vnode *tdvp; /* to directory vnode ptr */
1583 1584 struct pathname pn;
1584 1585 int error;
1585 1586 struct vattr vattr;
1586 1587 dev_t fsid;
1587 1588 int estale_retry = 0;
1588 1589 uint32_t auditing = AU_AUDITING();
1589 1590
1590 1591 top:
1591 1592 fvp = tdvp = NULL;
1592 1593 if (error = pn_get(to, seg, &pn))
1593 1594 return (error);
1594 1595 if (auditing && fstartvp != NULL)
1595 1596 audit_setfsat_path(1);
1596 1597 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1597 1598 goto out;
1598 1599 if (auditing && tstartvp != NULL)
1599 1600 audit_setfsat_path(3);
1600 1601 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1601 1602 goto out;
1602 1603 /*
1603 1604 * Make sure both source vnode and target directory vnode are
1604 1605 * in the same vfs and that the target vfs is writable.
1605 1606 */
1606 1607 vattr.va_mask = AT_FSID;
1607 1608 if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1608 1609 goto out;
1609 1610 fsid = vattr.va_fsid;
1610 1611 vattr.va_mask = AT_FSID;
1611 1612 if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1612 1613 goto out;
1613 1614 if (fsid != vattr.va_fsid) {
1614 1615 error = EXDEV;
1615 1616 goto out;
1616 1617 }
1617 1618 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1618 1619 error = EROFS;
1619 1620 goto out;
1620 1621 }
1621 1622 /*
1622 1623 * Do the link.
1623 1624 */
1624 1625 (void) pn_fixslash(&pn);
1625 1626 error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1626 1627 out:
1627 1628 pn_free(&pn);
1628 1629 if (fvp)
1629 1630 VN_RELE(fvp);
1630 1631 if (tdvp)
1631 1632 VN_RELE(tdvp);
1632 1633 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1633 1634 goto top;
1634 1635 return (error);
1635 1636 }
1636 1637
1637 1638 int
1638 1639 vn_rename(char *from, char *to, enum uio_seg seg)
1639 1640 {
1640 1641 return (vn_renameat(NULL, from, NULL, to, seg));
1641 1642 }
1642 1643
1643 1644 int
1644 1645 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1645 1646 char *tname, enum uio_seg seg)
1646 1647 {
1647 1648 int error;
1648 1649 struct vattr vattr;
1649 1650 struct pathname fpn; /* from pathname */
1650 1651 struct pathname tpn; /* to pathname */
1651 1652 dev_t fsid;
1652 1653 int in_crit_src, in_crit_targ;
1653 1654 vnode_t *fromvp, *fvp;
1654 1655 vnode_t *tovp, *targvp;
1655 1656 int estale_retry = 0;
1656 1657 uint32_t auditing = AU_AUDITING();
1657 1658
1658 1659 top:
1659 1660 fvp = fromvp = tovp = targvp = NULL;
1660 1661 in_crit_src = in_crit_targ = 0;
1661 1662 /*
1662 1663 * Get to and from pathnames.
1663 1664 */
1664 1665 if (error = pn_get(fname, seg, &fpn))
1665 1666 return (error);
1666 1667 if (error = pn_get(tname, seg, &tpn)) {
1667 1668 pn_free(&fpn);
1668 1669 return (error);
1669 1670 }
1670 1671
1671 1672 /*
1672 1673 	 * First we need to resolve the correct directories.
1673 1674 	 * The passed-in directories may only be a starting point,
1674 1675 * but we need the real directories the file(s) live in.
1675 1676 * For example the fname may be something like usr/lib/sparc
1676 1677 * and we were passed in the / directory, but we need to
1677 1678 * use the lib directory for the rename.
1678 1679 */
1679 1680
1680 1681 if (auditing && fdvp != NULL)
1681 1682 audit_setfsat_path(1);
1682 1683 /*
1683 1684 * Lookup to and from directories.
1684 1685 */
1685 1686 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1686 1687 goto out;
1687 1688 }
1688 1689
1689 1690 /*
1690 1691 * Make sure there is an entry.
1691 1692 */
1692 1693 if (fvp == NULL) {
1693 1694 error = ENOENT;
1694 1695 goto out;
1695 1696 }
1696 1697
1697 1698 if (auditing && tdvp != NULL)
1698 1699 audit_setfsat_path(3);
1699 1700 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1700 1701 goto out;
1701 1702 }
1702 1703
1703 1704 /*
1704 1705 * Make sure both the from vnode directory and the to directory
1705 1706 * are in the same vfs and the to directory is writable.
1706 1707 * We check fsid's, not vfs pointers, so loopback fs works.
1707 1708 */
1708 1709 if (fromvp != tovp) {
1709 1710 vattr.va_mask = AT_FSID;
1710 1711 if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1711 1712 goto out;
1712 1713 fsid = vattr.va_fsid;
1713 1714 vattr.va_mask = AT_FSID;
1714 1715 if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1715 1716 goto out;
1716 1717 if (fsid != vattr.va_fsid) {
1717 1718 error = EXDEV;
1718 1719 goto out;
1719 1720 }
1720 1721 }
1721 1722
1722 1723 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1723 1724 error = EROFS;
1724 1725 goto out;
1725 1726 }
1726 1727
1727 1728 if (targvp && (fvp != targvp)) {
1728 1729 nbl_start_crit(targvp, RW_READER);
1729 1730 in_crit_targ = 1;
1730 1731 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1731 1732 error = EACCES;
1732 1733 goto out;
1733 1734 }
1734 1735 }
1735 1736
1736 1737 if (nbl_need_check(fvp)) {
1737 1738 nbl_start_crit(fvp, RW_READER);
1738 1739 in_crit_src = 1;
1739 1740 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1740 1741 error = EACCES;
1741 1742 goto out;
1742 1743 }
1743 1744 }
1744 1745
1745 1746 /*
1746 1747 * Do the rename.
1747 1748 */
1748 1749 (void) pn_fixslash(&tpn);
1749 1750 error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1750 1751 NULL, 0);
1751 1752
1752 1753 out:
1753 1754 pn_free(&fpn);
1754 1755 pn_free(&tpn);
1755 1756 if (in_crit_src)
1756 1757 nbl_end_crit(fvp);
1757 1758 if (in_crit_targ)
1758 1759 nbl_end_crit(targvp);
1759 1760 if (fromvp)
1760 1761 VN_RELE(fromvp);
1761 1762 if (tovp)
1762 1763 VN_RELE(tovp);
1763 1764 if (targvp)
1764 1765 VN_RELE(targvp);
1765 1766 if (fvp)
1766 1767 VN_RELE(fvp);
1767 1768 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1768 1769 goto top;
1769 1770 return (error);
1770 1771 }
1771 1772
1772 1773 /*
1773 1774 * Remove a file or directory.
1774 1775 */
1775 1776 int
1776 1777 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1777 1778 {
1778 1779 return (vn_removeat(NULL, fnamep, seg, dirflag));
1779 1780 }
1780 1781
1781 1782 int
1782 1783 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1783 1784 {
1784 1785 struct vnode *vp; /* entry vnode */
1785 1786 struct vnode *dvp; /* ptr to parent dir vnode */
1786 1787 struct vnode *coveredvp;
1787 1788 struct pathname pn; /* name of entry */
1788 1789 enum vtype vtype;
1789 1790 int error;
1790 1791 struct vfs *vfsp;
1791 1792 struct vfs *dvfsp; /* ptr to parent dir vfs */
1792 1793 int in_crit = 0;
1793 1794 int estale_retry = 0;
1794 1795
1795 1796 top:
1796 1797 if (error = pn_get(fnamep, seg, &pn))
1797 1798 return (error);
1798 1799 dvp = vp = NULL;
1799 1800 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1800 1801 pn_free(&pn);
1801 1802 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1802 1803 goto top;
1803 1804 return (error);
1804 1805 }
1805 1806
1806 1807 /*
1807 1808 * Make sure there is an entry.
1808 1809 */
1809 1810 if (vp == NULL) {
1810 1811 error = ENOENT;
1811 1812 goto out;
1812 1813 }
1813 1814
1814 1815 vfsp = vp->v_vfsp;
1815 1816 dvfsp = dvp->v_vfsp;
1816 1817
1817 1818 /*
1818 1819 * If the named file is the root of a mounted filesystem, fail,
1819 1820 * unless it's marked unlinkable. In that case, unmount the
1820 1821 * filesystem and proceed to unlink the covered vnode. (If the
1821 1822 * covered vnode is a directory, use rmdir instead of unlink,
1822 1823 * to avoid file system corruption.)
1823 1824 */
1824 1825 if (vp->v_flag & VROOT) {
1825 1826 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1826 1827 error = EBUSY;
1827 1828 goto out;
1828 1829 }
1829 1830
1830 1831 /*
1831 1832 * Namefs specific code starts here.
1832 1833 */
1833 1834
1834 1835 if (dirflag == RMDIRECTORY) {
1835 1836 /*
1836 1837 			 * The user called rmdir(2) on a file that
1837 1838 			 * has been namefs mounted on top of. Since
1838 1839 			 * namefs doesn't allow directories to
1839 1840 			 * be mounted on other files, we know
1840 1841 			 * vp is not of type VDIR, so fail the operation.
1841 1842 */
1842 1843 error = ENOTDIR;
1843 1844 goto out;
1844 1845 }
1845 1846
1846 1847 /*
1847 1848 * If VROOT is still set after grabbing vp->v_lock,
1848 1849 		 * no one has finished nm_unmount so far and coveredvp
1849 1850 * is valid.
1850 1851 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1851 1852 * vp->v_lock, any race window is eliminated.
1852 1853 */
1853 1854
1854 1855 mutex_enter(&vp->v_lock);
1855 1856 if ((vp->v_flag & VROOT) == 0) {
1856 1857 /* Someone beat us to the unmount */
1857 1858 mutex_exit(&vp->v_lock);
1858 1859 error = EBUSY;
1859 1860 goto out;
1860 1861 }
1861 1862 vfsp = vp->v_vfsp;
1862 1863 coveredvp = vfsp->vfs_vnodecovered;
1863 1864 ASSERT(coveredvp);
1864 1865 /*
1865 1866 * Note: Implementation of vn_vfswlock shows that ordering of
1866 1867 * v_lock / vn_vfswlock is not an issue here.
1867 1868 */
1868 1869 error = vn_vfswlock(coveredvp);
1869 1870 mutex_exit(&vp->v_lock);
1870 1871
1871 1872 if (error)
1872 1873 goto out;
1873 1874
1874 1875 VN_HOLD(coveredvp);
1875 1876 VN_RELE(vp);
1876 1877 error = dounmount(vfsp, 0, CRED());
1877 1878
1878 1879 /*
1879 1880 * Unmounted the namefs file system; now get
1880 1881 * the object it was mounted over.
1881 1882 */
1882 1883 vp = coveredvp;
1883 1884 /*
1884 1885 * If namefs was mounted over a directory, then
1885 1886 * we want to use rmdir() instead of unlink().
1886 1887 */
1887 1888 if (vp->v_type == VDIR)
1888 1889 dirflag = RMDIRECTORY;
1889 1890
1890 1891 if (error)
1891 1892 goto out;
1892 1893 }
1893 1894
1894 1895 /*
1895 1896 	 * Make sure the filesystem is writable.
1896 1897 * We check the parent directory's vfs in case this is an lofs vnode.
1897 1898 */
1898 1899 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1899 1900 error = EROFS;
1900 1901 goto out;
1901 1902 }
1902 1903
1903 1904 vtype = vp->v_type;
1904 1905
1905 1906 /*
1906 1907 * If there is the possibility of an nbmand share reservation, make
1907 1908 * sure it's okay to remove the file. Keep a reference to the
1908 1909 * vnode, so that we can exit the nbl critical region after
1909 1910 * calling VOP_REMOVE.
1910 1911 * If there is no possibility of an nbmand share reservation,
1911 1912 * release the vnode reference now. Filesystems like NFS may
1912 1913 * behave differently if there is an extra reference, so get rid of
1913 1914 * this one. Fortunately, we can't have nbmand mounts on NFS
1914 1915 * filesystems.
1915 1916 */
1916 1917 if (nbl_need_check(vp)) {
1917 1918 nbl_start_crit(vp, RW_READER);
1918 1919 in_crit = 1;
1919 1920 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1920 1921 error = EACCES;
1921 1922 goto out;
1922 1923 }
1923 1924 } else {
1924 1925 VN_RELE(vp);
1925 1926 vp = NULL;
1926 1927 }
1927 1928
1928 1929 if (dirflag == RMDIRECTORY) {
1929 1930 /*
1930 1931 * Caller is using rmdir(2), which can only be applied to
1931 1932 * directories.
1932 1933 */
1933 1934 if (vtype != VDIR) {
1934 1935 error = ENOTDIR;
1935 1936 } else {
1936 1937 vnode_t *cwd;
1937 1938 proc_t *pp = curproc;
1938 1939
1939 1940 mutex_enter(&pp->p_lock);
1940 1941 cwd = PTOU(pp)->u_cdir;
1941 1942 VN_HOLD(cwd);
1942 1943 mutex_exit(&pp->p_lock);
1943 1944 error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
1944 1945 NULL, 0);
1945 1946 VN_RELE(cwd);
1946 1947 }
1947 1948 } else {
1948 1949 /*
1949 1950 * Unlink(2) can be applied to anything.
1950 1951 */
1951 1952 error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
1952 1953 }
1953 1954
1954 1955 out:
1955 1956 pn_free(&pn);
1956 1957 if (in_crit) {
1957 1958 nbl_end_crit(vp);
1958 1959 in_crit = 0;
1959 1960 }
1960 1961 if (vp != NULL)
1961 1962 VN_RELE(vp);
1962 1963 if (dvp != NULL)
1963 1964 VN_RELE(dvp);
1964 1965 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1965 1966 goto top;
1966 1967 return (error);
1967 1968 }
1968 1969
1969 1970 /*
1970 1971 * Utility function to compare equality of vnodes.
1971 1972 * Compare the underlying real vnodes, if there are underlying vnodes.
1972 1973 * This is a more thorough comparison than the VN_CMP() macro provides.
1973 1974 */
1974 1975 int
1975 1976 vn_compare(vnode_t *vp1, vnode_t *vp2)
1976 1977 {
1977 1978 vnode_t *realvp;
1978 1979
1979 1980 if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1980 1981 vp1 = realvp;
1981 1982 if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1982 1983 vp2 = realvp;
1983 1984 return (VN_CMP(vp1, vp2));
1984 1985 }
1985 1986
1986 1987 /*
1987 1988 * The number of locks to hash into. This value must be a power
1988 1989 * of 2 minus 1 and should probably also be prime.
1989 1990 */
1990 1991 #define NUM_BUCKETS 1023
1991 1992
1992 1993 struct vn_vfslocks_bucket {
1993 1994 kmutex_t vb_lock;
1994 1995 vn_vfslocks_entry_t *vb_list;
1995 1996 char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
1996 1997 };
1997 1998
1998 1999 /*
1999 2000  * The total number of buckets will be NUM_BUCKETS + 1.
2000 2001 */
2001 2002
2002 2003 #pragma align 64(vn_vfslocks_buckets)
2003 2004 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
2004 2005
2005 2006 #define VN_VFSLOCKS_SHIFT 9
2006 2007
2007 2008 #define VN_VFSLOCKS_HASH(vfsvpptr) \
2008 2009 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
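
A note on the scheme above: the shift throws away the low-order bits of the pointer (kernel objects are aligned, so those bits carry almost no entropy) before masking with NUM_BUCKETS, and the pad field keeps each bucket on its own 64-byte cache line. A worked example, using a made-up vnode address:

	/*
	 * Hypothetical: for a vnode at 0xffffff0123456780,
	 *	(0xffffff0123456780 >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS
	 * is (0x7fffff8091a2b3 & 1023) == 0x2b3, i.e. bucket 691.
	 */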
2009 2010
2010 2011 /*
2011 2012  * vn_vfslocks_getlock() uses a hash scheme to find (or create)
2012 2013  * the rwstlock associated with the vfs/vnode pointer passed to it.
2013 2014  *
2014 2015  * vn_vfslocks_rele() releases a reference in the
2015 2016  * hash table, which allows the entry allocated by
2016 2017  * vn_vfslocks_getlock() to be freed at a later
2017 2018  * stage when the refcount drops to zero.
2018 2019 */
2019 2020
2020 2021 vn_vfslocks_entry_t *
2021 2022 vn_vfslocks_getlock(void *vfsvpptr)
2022 2023 {
2023 2024 struct vn_vfslocks_bucket *bp;
2024 2025 vn_vfslocks_entry_t *vep;
2025 2026 vn_vfslocks_entry_t *tvep;
2026 2027
2027 2028 ASSERT(vfsvpptr != NULL);
2028 2029 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2029 2030
2030 2031 mutex_enter(&bp->vb_lock);
2031 2032 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2032 2033 if (vep->ve_vpvfs == vfsvpptr) {
2033 2034 vep->ve_refcnt++;
2034 2035 mutex_exit(&bp->vb_lock);
2035 2036 return (vep);
2036 2037 }
2037 2038 }
2038 2039 mutex_exit(&bp->vb_lock);
2039 2040 vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2040 2041 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2041 2042 vep->ve_vpvfs = (char *)vfsvpptr;
2042 2043 vep->ve_refcnt = 1;
2043 2044 mutex_enter(&bp->vb_lock);
2044 2045 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2045 2046 if (tvep->ve_vpvfs == vfsvpptr) {
2046 2047 tvep->ve_refcnt++;
2047 2048 mutex_exit(&bp->vb_lock);
2048 2049
2049 2050 /*
2050 2051 			 * There is already an entry in the hash;
2051 2052 			 * destroy what we just allocated.
2052 2053 */
2053 2054 rwst_destroy(&vep->ve_lock);
2054 2055 kmem_free(vep, sizeof (*vep));
2055 2056 return (tvep);
2056 2057 }
2057 2058 }
2058 2059 vep->ve_next = bp->vb_list;
2059 2060 bp->vb_list = vep;
2060 2061 mutex_exit(&bp->vb_lock);
2061 2062 return (vep);
2062 2063 }
2063 2064
2064 2065 void
2065 2066 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2066 2067 {
2067 2068 struct vn_vfslocks_bucket *bp;
2068 2069 vn_vfslocks_entry_t *vep;
2069 2070 vn_vfslocks_entry_t *pvep;
2070 2071
2071 2072 ASSERT(vepent != NULL);
2072 2073 ASSERT(vepent->ve_vpvfs != NULL);
2073 2074
2074 2075 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2075 2076
2076 2077 mutex_enter(&bp->vb_lock);
2077 2078 vepent->ve_refcnt--;
2078 2079
2079 2080 if ((int32_t)vepent->ve_refcnt < 0)
2080 2081 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2081 2082
2082 2083 if (vepent->ve_refcnt == 0) {
2083 2084 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2084 2085 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2085 2086 if (bp->vb_list == vep)
2086 2087 bp->vb_list = vep->ve_next;
2087 2088 else {
2088 2089 /* LINTED */
2089 2090 pvep->ve_next = vep->ve_next;
2090 2091 }
2091 2092 mutex_exit(&bp->vb_lock);
2092 2093 rwst_destroy(&vep->ve_lock);
2093 2094 kmem_free(vep, sizeof (*vep));
2094 2095 return;
2095 2096 }
2096 2097 pvep = vep;
2097 2098 }
2098 2099 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2099 2100 }
2100 2101 mutex_exit(&bp->vb_lock);
2101 2102 }
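
A minimal sketch of the reference protocol the two routines above implement, assuming the caller just needs the embedded rwstlock (every vn_vfslocks_getlock() must be balanced by a vn_vfslocks_rele()):

	vn_vfslocks_entry_t *vep;

	vep = vn_vfslocks_getlock(vp);		/* find/create entry; refcnt++ */
	rwst_enter(&vep->ve_lock, RW_READER);	/* use the embedded lock */
	/* ... e.g. read v_vfsmountedhere ... */
	rwst_exit(&vep->ve_lock);
	vn_vfslocks_rele(vep);			/* refcnt--; freed at zero */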
2102 2103
2103 2104 /*
2104 2105 * vn_vfswlock_wait is used to implement a lock which is logically a writers
2105 2106 * lock protecting the v_vfsmountedhere field.
2106 2107  * vn_vfswlock_wait is similar to vn_vfswlock,
2107 2108  * except that it blocks to acquire the lock VVFSLOCK.
2108 2109  *
2109 2110  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2110 2111  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2111 2112  * need the non-blocking version of the writers lock, i.e. vn_vfswlock().
2112 2113 */
2113 2114 int
2114 2115 vn_vfswlock_wait(vnode_t *vp)
2115 2116 {
2116 2117 int retval;
2117 2118 vn_vfslocks_entry_t *vpvfsentry;
2118 2119 ASSERT(vp != NULL);
2119 2120
2120 2121 vpvfsentry = vn_vfslocks_getlock(vp);
2121 2122 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2122 2123
2123 2124 if (retval == EINTR) {
2124 2125 vn_vfslocks_rele(vpvfsentry);
2125 2126 return (EINTR);
2126 2127 }
2127 2128 return (retval);
2128 2129 }
2129 2130
2130 2131 int
2131 2132 vn_vfsrlock_wait(vnode_t *vp)
2132 2133 {
2133 2134 int retval;
2134 2135 vn_vfslocks_entry_t *vpvfsentry;
2135 2136 ASSERT(vp != NULL);
2136 2137
2137 2138 vpvfsentry = vn_vfslocks_getlock(vp);
2138 2139 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2139 2140
2140 2141 if (retval == EINTR) {
2141 2142 vn_vfslocks_rele(vpvfsentry);
2142 2143 return (EINTR);
2143 2144 }
2144 2145
2145 2146 return (retval);
2146 2147 }
2147 2148
2148 2149
2149 2150 /*
2150 2151 * vn_vfswlock is used to implement a lock which is logically a writers lock
2151 2152 * protecting the v_vfsmountedhere field.
2152 2153 */
2153 2154 int
2154 2155 vn_vfswlock(vnode_t *vp)
2155 2156 {
2156 2157 vn_vfslocks_entry_t *vpvfsentry;
2157 2158
2158 2159 /*
2159 2160 * If vp is NULL then somebody is trying to lock the covered vnode
2160 2161 * of /. (vfs_vnodecovered is NULL for /). This situation will
2161 2162 * only happen when unmounting /. Since that operation will fail
2162 2163 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2163 2164 */
2164 2165 if (vp == NULL)
2165 2166 return (EBUSY);
2166 2167
2167 2168 vpvfsentry = vn_vfslocks_getlock(vp);
2168 2169
2169 2170 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2170 2171 return (0);
2171 2172
2172 2173 vn_vfslocks_rele(vpvfsentry);
2173 2174 return (EBUSY);
2174 2175 }
2175 2176
2176 2177 int
2177 2178 vn_vfsrlock(vnode_t *vp)
2178 2179 {
2179 2180 vn_vfslocks_entry_t *vpvfsentry;
2180 2181
2181 2182 /*
2182 2183 * If vp is NULL then somebody is trying to lock the covered vnode
2183 2184 * of /. (vfs_vnodecovered is NULL for /). This situation will
2184 2185 * only happen when unmounting /. Since that operation will fail
2185 2186 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2186 2187 */
2187 2188 if (vp == NULL)
2188 2189 return (EBUSY);
2189 2190
2190 2191 vpvfsentry = vn_vfslocks_getlock(vp);
2191 2192
2192 2193 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2193 2194 return (0);
2194 2195
2195 2196 vn_vfslocks_rele(vpvfsentry);
2196 2197 return (EBUSY);
2197 2198 }
2198 2199
2199 2200 void
2200 2201 vn_vfsunlock(vnode_t *vp)
2201 2202 {
2202 2203 vn_vfslocks_entry_t *vpvfsentry;
2203 2204
2204 2205 /*
2205 2206 * ve_refcnt needs to be decremented twice.
2206 2207 	 * 1. To release the reference taken by this call to
2207 2208 	 *    vn_vfslocks_getlock().
2208 2209 	 * 2. To release the reference taken earlier by the locking
2209 2210 	 *    routine (vn_vfsrlock/vn_vfswlock, etc.).
2209 2210 */
2210 2211 vpvfsentry = vn_vfslocks_getlock(vp);
2211 2212 vn_vfslocks_rele(vpvfsentry);
2212 2213
2213 2214 rwst_exit(&vpvfsentry->ve_lock);
2214 2215 vn_vfslocks_rele(vpvfsentry);
2215 2216 }
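
Hence the asymmetry: vn_vfswlock()/vn_vfsrlock() leave one reference behind on success, and vn_vfsunlock() pays it back. A hedged sketch of the usual pairing, modeled on the unmount path in vn_removeat() above:

	if (vn_vfswlock(coveredvp) != 0)	/* takes entry hold + write lock */
		return (EBUSY);
	/* ... examine or change coveredvp->v_vfsmountedhere ... */
	vn_vfsunlock(coveredvp);		/* drops the lock and the hold */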
2216 2217
2217 2218 int
2218 2219 vn_vfswlock_held(vnode_t *vp)
2219 2220 {
2220 2221 int held;
2221 2222 vn_vfslocks_entry_t *vpvfsentry;
2222 2223
2223 2224 ASSERT(vp != NULL);
2224 2225
2225 2226 vpvfsentry = vn_vfslocks_getlock(vp);
2226 2227 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2227 2228
2228 2229 vn_vfslocks_rele(vpvfsentry);
2229 2230 return (held);
2230 2231 }
2231 2232
2232 2233
2233 2234 int
2234 2235 vn_make_ops(
2235 2236 const char *name, /* Name of file system */
2236 2237 const fs_operation_def_t *templ, /* Operation specification */
2237 2238 vnodeops_t **actual) /* Return the vnodeops */
2238 2239 {
2239 2240 int unused_ops;
2240 2241 int error;
2241 2242
2242 2243 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2243 2244
2244 2245 (*actual)->vnop_name = name;
2245 2246
2246 2247 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2247 2248 if (error) {
2248 2249 kmem_free(*actual, sizeof (vnodeops_t));
2249 2250 }
2250 2251
2251 2252 #if DEBUG
2252 2253 if (unused_ops != 0)
2253 2254 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2254 2255 "but not used", name, unused_ops);
2255 2256 #endif
2256 2257
2257 2258 return (error);
2258 2259 }
2259 2260
2260 2261 /*
2261 2262 * Free the vnodeops created as a result of vn_make_ops()
2262 2263 */
2263 2264 void
2264 2265 vn_freevnodeops(vnodeops_t *vnops)
2265 2266 {
2266 2267 kmem_free(vnops, sizeof (vnodeops_t));
2267 2268 }
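
For reference, the usual calling pattern (all "myfs" names below are hypothetical, modeled on existing filesystems): a template of fs_operation_def_t pairs, terminated by a NULL entry, is handed to vn_make_ops() at initialization time and released with vn_freevnodeops() at teardown.

	static vnodeops_t *myfs_vnodeops;

	static const fs_operation_def_t myfs_vnodeops_template[] = {
		VOPNAME_OPEN,		{ .vop_open = myfs_open },
		VOPNAME_CLOSE,		{ .vop_close = myfs_close },
		VOPNAME_READ,		{ .vop_read = myfs_read },
		VOPNAME_WRITE,		{ .vop_write = myfs_write },
		NULL,			NULL
	};

	/* in the filesystem's init path: */
	error = vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops);
	if (error != 0)
		cmn_err(CE_WARN, "myfs: bad vnode ops template");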
2268 2269
2269 2270 /*
2270 2271 * Vnode cache.
2271 2272 */
2272 2273
2273 2274 /* ARGSUSED */
2274 2275 static int
2275 2276 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2276 2277 {
2277 2278 struct vnode *vp;
2278 2279
2279 2280 vp = buf;
2280 2281
2281 2282 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2282 2283 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2283 2284 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2284 2285 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2285 2286 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2286 2287 vp->v_path = NULL;
2287 2288 vp->v_mpssdata = NULL;
2288 2289 vp->v_vsd = NULL;
2289 2290 vp->v_fopdata = NULL;
2290 2291
2291 2292 return (0);
2292 2293 }
2293 2294
2294 2295 /* ARGSUSED */
2295 2296 static void
2296 2297 vn_cache_destructor(void *buf, void *cdrarg)
2297 2298 {
2298 2299 struct vnode *vp;
2299 2300
2300 2301 vp = buf;
2301 2302
2302 2303 rw_destroy(&vp->v_nbllock);
2303 2304 cv_destroy(&vp->v_cv);
2304 2305 mutex_destroy(&vp->v_vsd_lock);
2305 2306 mutex_destroy(&vp->v_lock);
2306 2307 }
2307 2308
2308 2309 void
2309 2310 vn_create_cache(void)
2310 2311 {
2311 2312 /* LINTED */
2312 2313 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2313 2314 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2314 2315 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2315 2316 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2316 2317 NULL, 0);
2317 2318 }
2318 2319
2319 2320 void
2320 2321 vn_destroy_cache(void)
2321 2322 {
2322 2323 kmem_cache_destroy(vn_cache);
2323 2324 }
2324 2325
2325 2326 /*
2326 2327 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2327 2328 * cached by the file system and vnodes remain associated.
2328 2329 */
2329 2330 void
2330 2331 vn_recycle(vnode_t *vp)
2331 2332 {
2332 2333 ASSERT(vp->v_pages == NULL);
2333 2334
2334 2335 /*
2335 2336 * XXX - This really belongs in vn_reinit(), but we have some issues
2336 2337 * with the counts. Best to have it here for clean initialization.
2337 2338 */
2338 2339 vp->v_rdcnt = 0;
2339 2340 vp->v_wrcnt = 0;
2340 2341 vp->v_mmap_read = 0;
2341 2342 vp->v_mmap_write = 0;
2342 2343
2343 2344 /*
2344 2345 * If FEM was in use, make sure everything gets cleaned up
2345 2346 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2346 2347 * constructor.
2347 2348 */
2348 2349 if (vp->v_femhead) {
2349 2350 /* XXX - There should be a free_femhead() that does all this */
2350 2351 ASSERT(vp->v_femhead->femh_list == NULL);
2351 2352 mutex_destroy(&vp->v_femhead->femh_lock);
2352 2353 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2353 2354 vp->v_femhead = NULL;
2354 2355 }
2355 2356 if (vp->v_path) {
2356 2357 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2357 2358 vp->v_path = NULL;
2358 2359 }
2359 2360
2360 2361 if (vp->v_fopdata != NULL) {
2361 2362 free_fopdata(vp);
2362 2363 }
2363 2364 vp->v_mpssdata = NULL;
2364 2365 vsd_free(vp);
2365 2366 }
2366 2367
2367 2368 /*
2368 2369 * Used to reset the vnode fields including those that are directly accessible
2369 2370 * as well as those which require an accessor function.
2370 2371 *
2371 2372 * Does not initialize:
2372 2373 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2373 2374 * v_data (since FS-nodes and vnodes point to each other and should
2374 2375 * be updated simultaneously)
2375 2376 * v_op (in case someone needs to make a VOP call on this object)
2376 2377 */
2377 2378 void
2378 2379 vn_reinit(vnode_t *vp)
2379 2380 {
2380 2381 vp->v_count = 1;
2381 2382 vp->v_count_dnlc = 0;
2382 2383 vp->v_vfsp = NULL;
2383 2384 vp->v_stream = NULL;
2384 2385 vp->v_vfsmountedhere = NULL;
2385 2386 vp->v_flag = 0;
2386 2387 vp->v_type = VNON;
2387 2388 vp->v_rdev = NODEV;
2388 2389
2389 2390 vp->v_filocks = NULL;
2390 2391 vp->v_shrlocks = NULL;
2391 2392 vp->v_pages = NULL;
2392 2393
2393 2394 vp->v_locality = NULL;
2394 2395 vp->v_xattrdir = NULL;
2395 2396
2396 2397 /* Handles v_femhead, v_path, and the r/w/map counts */
2397 2398 vn_recycle(vp);
2398 2399 }
2399 2400
2400 2401 vnode_t *
2401 2402 vn_alloc(int kmflag)
2402 2403 {
2403 2404 vnode_t *vp;
2404 2405
2405 2406 vp = kmem_cache_alloc(vn_cache, kmflag);
2406 2407
2407 2408 if (vp != NULL) {
2408 2409 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2409 2410 vp->v_fopdata = NULL;
2410 2411 vn_reinit(vp);
2411 2412 }
2412 2413
2413 2414 return (vp);
2414 2415 }
2415 2416
2416 2417 void
2417 2418 vn_free(vnode_t *vp)
2418 2419 {
2419 2420 ASSERT(vp->v_shrlocks == NULL);
2420 2421 ASSERT(vp->v_filocks == NULL);
2421 2422
2422 2423 /*
2423 2424 * Some file systems call vn_free() with v_count of zero,
2424 2425 * some with v_count of 1. In any case, the value should
2425 2426 * never be anything else.
2426 2427 */
2427 2428 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2428 2429 ASSERT(vp->v_count_dnlc == 0);
2429 2430 if (vp->v_path != NULL) {
2430 2431 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2431 2432 vp->v_path = NULL;
2432 2433 }
2433 2434
2434 2435 /* If FEM was in use, make sure everything gets cleaned up */
2435 2436 if (vp->v_femhead) {
2436 2437 /* XXX - There should be a free_femhead() that does all this */
2437 2438 ASSERT(vp->v_femhead->femh_list == NULL);
2438 2439 mutex_destroy(&vp->v_femhead->femh_lock);
2439 2440 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2440 2441 vp->v_femhead = NULL;
2441 2442 }
2442 2443
2443 2444 if (vp->v_fopdata != NULL) {
2444 2445 free_fopdata(vp);
2445 2446 }
2446 2447 vp->v_mpssdata = NULL;
2447 2448 vsd_free(vp);
2448 2449 kmem_cache_free(vn_cache, vp);
2449 2450 }
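
Putting the lifecycle routines together, a hedged sketch of how a filesystem typically drives them (the FS-node type and field names are hypothetical):

	struct mynode *np;		/* hypothetical FS node */
	vnode_t *vp;

	vp = vn_alloc(KM_SLEEP);	/* constructed and vn_reinit()ed */
	vn_setops(vp, myfs_vnodeops);
	vp->v_data = np;		/* FS node and vnode point ... */
	np->n_vnode = vp;		/* ... at each other */

	/* ... later, once the last reference has been released ... */
	vp->v_data = NULL;
	vn_free(vp);			/* returns vp to vn_cache */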
2450 2451
2451 2452 /*
2452 2453  * Vnode status changes; we should define better states than 1, 0.
2453 2454 */
2454 2455 void
2455 2456 vn_reclaim(vnode_t *vp)
2456 2457 {
2457 2458 vfs_t *vfsp = vp->v_vfsp;
2458 2459
2459 2460 if (vfsp == NULL ||
2460 2461 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2461 2462 return;
2462 2463 }
2463 2464 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2464 2465 }
2465 2466
2466 2467 void
2467 2468 vn_idle(vnode_t *vp)
2468 2469 {
2469 2470 vfs_t *vfsp = vp->v_vfsp;
2470 2471
2471 2472 if (vfsp == NULL ||
2472 2473 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2473 2474 return;
2474 2475 }
2475 2476 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2476 2477 }
2477 2478 void
2478 2479 vn_exists(vnode_t *vp)
2479 2480 {
2480 2481 vfs_t *vfsp = vp->v_vfsp;
2481 2482
2482 2483 if (vfsp == NULL ||
2483 2484 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2484 2485 return;
2485 2486 }
2486 2487 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2487 2488 }
2488 2489
2489 2490 void
2490 2491 vn_invalid(vnode_t *vp)
2491 2492 {
2492 2493 vfs_t *vfsp = vp->v_vfsp;
2493 2494
2494 2495 if (vfsp == NULL ||
2495 2496 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2496 2497 return;
2497 2498 }
2498 2499 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2499 2500 }
2500 2501
2501 2502 /* Vnode event notification */
2502 2503
2503 2504 int
2504 2505 vnevent_support(vnode_t *vp, caller_context_t *ct)
2505 2506 {
2506 2507 if (vp == NULL)
2507 2508 return (EINVAL);
2508 2509
2509 2510 return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2510 2511 }
2511 2512
2512 2513 void
2513 2514 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2514 2515 {
2515 2516 if (vp == NULL || vp->v_femhead == NULL) {
2516 2517 return;
2517 2518 }
2518 2519 (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2519 2520 }
2520 2521
2521 2522 void
2522 2523 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2523 2524 caller_context_t *ct)
2524 2525 {
2525 2526 if (vp == NULL || vp->v_femhead == NULL) {
2526 2527 return;
2527 2528 }
2528 2529 (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2529 2530 }
2530 2531
2531 2532 void
2532 2533 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2533 2534 {
2534 2535 if (vp == NULL || vp->v_femhead == NULL) {
2535 2536 return;
2536 2537 }
2537 2538 (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2538 2539 }
2539 2540
2540 2541 void
2541 2542 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2542 2543 {
2543 2544 if (vp == NULL || vp->v_femhead == NULL) {
2544 2545 return;
2545 2546 }
2546 2547 (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2547 2548 }
2548 2549
2549 2550 void
2550 2551 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2551 2552 {
2552 2553 if (vp == NULL || vp->v_femhead == NULL) {
2553 2554 return;
2554 2555 }
2555 2556 (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2556 2557 }
2557 2558
2558 2559 void
2559 2560 vnevent_create(vnode_t *vp, caller_context_t *ct)
2560 2561 {
2561 2562 if (vp == NULL || vp->v_femhead == NULL) {
2562 2563 return;
2563 2564 }
2564 2565 (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2565 2566 }
2566 2567
2567 2568 void
2568 2569 vnevent_link(vnode_t *vp, caller_context_t *ct)
2569 2570 {
2570 2571 if (vp == NULL || vp->v_femhead == NULL) {
2571 2572 return;
2572 2573 }
2573 2574 (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2574 2575 }
2575 2576
2576 2577 void
2577 2578 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2578 2579 {
2579 2580 if (vp == NULL || vp->v_femhead == NULL) {
2580 2581 return;
2581 2582 }
2582 2583 (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2583 2584 }
2584 2585
2585 2586 /*
2586 2587 * Vnode accessors.
2587 2588 */
2588 2589
2589 2590 int
2590 2591 vn_is_readonly(vnode_t *vp)
2591 2592 {
2592 2593 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2593 2594 }
2594 2595
2595 2596 int
2596 2597 vn_has_flocks(vnode_t *vp)
2597 2598 {
2598 2599 return (vp->v_filocks != NULL);
2599 2600 }
2600 2601
2601 2602 int
2602 2603 vn_has_mandatory_locks(vnode_t *vp, int mode)
2603 2604 {
2604 2605 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2605 2606 }
2606 2607
2607 2608 int
2608 2609 vn_has_cached_data(vnode_t *vp)
2609 2610 {
2610 2611 return (vp->v_pages != NULL);
2611 2612 }
2612 2613
2613 2614 /*
2614 2615 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2615 2616 * zone_enter(2).
2616 2617 */
2617 2618 int
2618 2619 vn_can_change_zones(vnode_t *vp)
2619 2620 {
2620 2621 struct vfssw *vswp;
2621 2622 int allow = 1;
2622 2623 vnode_t *rvp;
2623 2624
2624 2625 if (nfs_global_client_only != 0)
2625 2626 return (1);
2626 2627
2627 2628 /*
2628 2629 * We always want to look at the underlying vnode if there is one.
2629 2630 */
2630 2631 if (VOP_REALVP(vp, &rvp, NULL) != 0)
2631 2632 rvp = vp;
2632 2633 /*
2633 2634 * Some pseudo filesystems (including doorfs) don't actually register
2634 2635 * their vfsops_t, so the following may return NULL; we happily let
2635 2636 * such vnodes switch zones.
2636 2637 */
2637 2638 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2638 2639 if (vswp != NULL) {
2639 2640 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2640 2641 allow = 0;
2641 2642 vfs_unrefvfssw(vswp);
2642 2643 }
2643 2644 return (allow);
2644 2645 }
2645 2646
2646 2647 /*
2647 2648 * Return nonzero if the vnode is a mount point, zero if not.
2648 2649 */
2649 2650 int
2650 2651 vn_ismntpt(vnode_t *vp)
2651 2652 {
2652 2653 return (vp->v_vfsmountedhere != NULL);
2653 2654 }
2654 2655
2655 2656 /* Retrieve the vfs (if any) mounted on this vnode */
2656 2657 vfs_t *
2657 2658 vn_mountedvfs(vnode_t *vp)
2658 2659 {
2659 2660 return (vp->v_vfsmountedhere);
2660 2661 }
2661 2662
2662 2663 /*
2663 2664 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2664 2665 */
2665 2666 int
2666 2667 vn_in_dnlc(vnode_t *vp)
2667 2668 {
2668 2669 return (vp->v_count_dnlc > 0);
2669 2670 }
2670 2671
2671 2672 /*
2672 2673 * vn_has_other_opens() checks whether a particular file is opened by more than
2673 2674 * just the caller and whether the open is for read and/or write.
2674 2675  * This routine is meant to be called after the caller has already called
2675 2676  * VOP_OPEN() and wishes to know whether it is the only one with the file
2676 2677  * open for the mode(s) specified.
2677 2678 *
2678 2679 * Vnode counts are only kept on regular files (v_type=VREG).
2679 2680 */
2680 2681 int
2681 2682 vn_has_other_opens(
2682 2683 vnode_t *vp,
2683 2684 v_mode_t mode)
2684 2685 {
2685 2686
2686 2687 ASSERT(vp != NULL);
2687 2688
2688 2689 switch (mode) {
2689 2690 case V_WRITE:
2690 2691 if (vp->v_wrcnt > 1)
2691 2692 return (V_TRUE);
2692 2693 break;
2693 2694 case V_RDORWR:
2694 2695 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2695 2696 return (V_TRUE);
2696 2697 break;
2697 2698 case V_RDANDWR:
2698 2699 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2699 2700 return (V_TRUE);
2700 2701 break;
2701 2702 case V_READ:
2702 2703 if (vp->v_rdcnt > 1)
2703 2704 return (V_TRUE);
2704 2705 break;
2705 2706 }
2706 2707
2707 2708 return (V_FALSE);
2708 2709 }
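
For example (a sketch of the NFS-delegation style check the comment alludes to), a caller that has just opened a regular file could verify it is the sole opener with:

	if (vn_has_other_opens(vp, V_RDORWR) == V_TRUE) {
		/* another open exists; don't grant a delegation */
	}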
2709 2710
2710 2711 /*
2711 2712 * vn_is_opened() checks whether a particular file is opened and
2712 2713 * whether the open is for read and/or write.
2713 2714 *
2714 2715 * Vnode counts are only kept on regular files (v_type=VREG).
2715 2716 */
2716 2717 int
2717 2718 vn_is_opened(
2718 2719 vnode_t *vp,
2719 2720 v_mode_t mode)
2720 2721 {
2721 2722
2722 2723 ASSERT(vp != NULL);
2723 2724
2724 2725 switch (mode) {
2725 2726 case V_WRITE:
2726 2727 if (vp->v_wrcnt)
2727 2728 return (V_TRUE);
2728 2729 break;
2729 2730 case V_RDANDWR:
2730 2731 if (vp->v_rdcnt && vp->v_wrcnt)
2731 2732 return (V_TRUE);
2732 2733 break;
2733 2734 case V_RDORWR:
2734 2735 if (vp->v_rdcnt || vp->v_wrcnt)
2735 2736 return (V_TRUE);
2736 2737 break;
2737 2738 case V_READ:
2738 2739 if (vp->v_rdcnt)
2739 2740 return (V_TRUE);
2740 2741 break;
2741 2742 }
2742 2743
2743 2744 return (V_FALSE);
2744 2745 }
2745 2746
2746 2747 /*
2747 2748 * vn_is_mapped() checks whether a particular file is mapped and whether
2748 2749 * the file is mapped read and/or write.
2749 2750 */
2750 2751 int
2751 2752 vn_is_mapped(
2752 2753 vnode_t *vp,
2753 2754 v_mode_t mode)
2754 2755 {
2755 2756
2756 2757 ASSERT(vp != NULL);
2757 2758
2758 2759 #if !defined(_LP64)
2759 2760 switch (mode) {
2760 2761 /*
2761 2762 		 * The atomic_add_64_nv() functions force atomicity in the
2762 2763 		 * case of 32-bit architectures, where the 64-bit values would
2763 2764 		 * otherwise require two fetches and the value of the field
2764 2765 		 * could change between the first fetch and the
2765 2766 		 * second.
2766 2767 */
2767 2768 case V_WRITE:
2768 2769 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2769 2770 return (V_TRUE);
2770 2771 break;
2771 2772 case V_RDANDWR:
2772 2773 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2773 2774 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2774 2775 return (V_TRUE);
2775 2776 break;
2776 2777 case V_RDORWR:
2777 2778 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2778 2779 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2779 2780 return (V_TRUE);
2780 2781 break;
2781 2782 case V_READ:
2782 2783 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2783 2784 return (V_TRUE);
2784 2785 break;
2785 2786 }
2786 2787 #else
2787 2788 switch (mode) {
2788 2789 case V_WRITE:
2789 2790 if (vp->v_mmap_write)
2790 2791 return (V_TRUE);
2791 2792 break;
2792 2793 case V_RDANDWR:
2793 2794 if (vp->v_mmap_read && vp->v_mmap_write)
2794 2795 return (V_TRUE);
2795 2796 break;
2796 2797 case V_RDORWR:
2797 2798 if (vp->v_mmap_read || vp->v_mmap_write)
2798 2799 return (V_TRUE);
2799 2800 break;
2800 2801 case V_READ:
2801 2802 if (vp->v_mmap_read)
2802 2803 return (V_TRUE);
2803 2804 break;
2804 2805 }
2805 2806 #endif
2806 2807
2807 2808 return (V_FALSE);
2808 2809 }
2809 2810
2810 2811 /*
2811 2812 * Set the operations vector for a vnode.
2812 2813 *
2813 2814 * FEM ensures that the v_femhead pointer is filled in before the
2814 2815 * v_op pointer is changed. This means that if the v_femhead pointer
2815 2816  * is NULL, and the v_op field hasn't changed since we checked
2816 2817  * the v_femhead pointer, then our update is OK - we are not racing with
2817 2818 * FEM.
2818 2819 */
2819 2820 void
2820 2821 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2821 2822 {
2822 2823 vnodeops_t *op;
2823 2824
2824 2825 ASSERT(vp != NULL);
2825 2826 ASSERT(vnodeops != NULL);
2826 2827
2827 2828 op = vp->v_op;
2828 2829 membar_consumer();
2829 2830 /*
2830 2831 * If vp->v_femhead == NULL, then we'll call casptr() to do the
2831 2832 * compare-and-swap on vp->v_op. If either fails, then FEM is
2832 2833 * in effect on the vnode and we need to have FEM deal with it.
2833 2834 */
2834 2835 if (vp->v_femhead != NULL || casptr(&vp->v_op, op, vnodeops) != op) {
2835 2836 fem_setvnops(vp, vnodeops);
2836 2837 }
2837 2838 }
2838 2839
2839 2840 /*
2840 2841 * Retrieve the operations vector for a vnode
2841 2842  * As with vn_setops() above, make sure we aren't racing with FEM.
2842 2843 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2843 2844 * make sense to the callers of this routine.
2844 2845 */
2845 2846 vnodeops_t *
2846 2847 vn_getops(vnode_t *vp)
2847 2848 {
2848 2849 vnodeops_t *op;
2849 2850
2850 2851 ASSERT(vp != NULL);
2851 2852
2852 2853 op = vp->v_op;
2853 2854 membar_consumer();
2854 2855 if (vp->v_femhead == NULL && op == vp->v_op) {
2855 2856 return (op);
2856 2857 } else {
2857 2858 return (fem_getvnops(vp));
2858 2859 }
2859 2860 }
2860 2861
2861 2862 /*
2862 2863 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2863 2864 * Returns zero (0) if not.
2864 2865 */
2865 2866 int
2866 2867 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2867 2868 {
2868 2869 return (vn_getops(vp) == vnodeops);
2869 2870 }
2870 2871
2871 2872 /*
2872 2873 * Returns non-zero (1) if the specified operation matches the
2873 2874  * corresponding operation of the vnode.
2874 2875 * Returns zero (0) if not.
2875 2876 */
2876 2877
2877 2878 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2878 2879
2879 2880 int
2880 2881 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2881 2882 {
2882 2883 const fs_operation_trans_def_t *otdp;
2883 2884 fs_generic_func_p *loc = NULL;
2884 2885 vnodeops_t *vop = vn_getops(vp);
2885 2886
2886 2887 ASSERT(vopname != NULL);
2887 2888
2888 2889 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2889 2890 if (MATCHNAME(otdp->name, vopname)) {
2890 2891 loc = (fs_generic_func_p *)
2891 2892 ((char *)(vop) + otdp->offset);
2892 2893 break;
2893 2894 }
2894 2895 }
2895 2896
2896 2897 return ((loc != NULL) && (*loc == funcp));
2897 2898 }
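
A typical, purely illustrative use: testing whether a vnode's entry for some operation is a known stub such as fs_nosys():

	if (vn_matchopval(vp, VOPNAME_READ, (fs_generic_func_p)fs_nosys))
		return (ENOTSUP);	/* read not implemented by this fs */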
2898 2899
2899 2900 /*
2900 2901 * fs_new_caller_id() needs to return a unique ID on a given local system.
2901 2902 * The IDs do not need to survive across reboots. These are primarily
2902 2903 * used so that (FEM) monitors can detect particular callers (such as
2903 2904  * the NFS server) of a given vnode/vfs operation.
2904 2905 */
2905 2906 u_longlong_t
2906 2907 fs_new_caller_id()
2907 2908 {
2908 2909 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2909 2910
2910 2911 return ((u_longlong_t)atomic_add_64_nv(&next_caller_id, 1));
2911 2912 }
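
A monitor would typically fetch its ID once and stamp the caller_context_t it passes to vnode operations; schematically (variable name hypothetical):

	static u_longlong_t my_caller_id;
	caller_context_t ct;

	if (my_caller_id == 0)
		my_caller_id = fs_new_caller_id();
	ct.cc_caller_id = my_caller_id;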
2912 2913
2913 2914 /*
2914 2915 * Given a starting vnode and a path, updates the path in the target vnode in
2915 2916 * a safe manner. If the vnode already has path information embedded, then the
2916 2917 * cached path is left untouched.
2917 2918 */
2918 2919
2919 2920 size_t max_vnode_path = 4 * MAXPATHLEN;
2920 2921
2921 2922 void
2922 2923 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2923 2924 const char *path, size_t plen)
2924 2925 {
2925 2926 char *rpath;
2926 2927 vnode_t *base;
2927 2928 size_t rpathlen, rpathalloc;
2928 2929 int doslash = 1;
2929 2930
2930 2931 if (*path == '/') {
2931 2932 base = rootvp;
2932 2933 path++;
2933 2934 plen--;
2934 2935 } else {
2935 2936 base = startvp;
2936 2937 }
2937 2938
2938 2939 /*
2939 2940 * We cannot grab base->v_lock while we hold vp->v_lock because of
2940 2941 * the potential for deadlock.
2941 2942 */
2942 2943 mutex_enter(&base->v_lock);
2943 2944 if (base->v_path == NULL) {
2944 2945 mutex_exit(&base->v_lock);
2945 2946 return;
2946 2947 }
2947 2948
2948 2949 rpathlen = strlen(base->v_path);
2949 2950 rpathalloc = rpathlen + plen + 1;
2950 2951 /* Avoid adding a slash if there's already one there */
2951 2952 if (base->v_path[rpathlen-1] == '/')
2952 2953 doslash = 0;
2953 2954 else
2954 2955 rpathalloc++;
2955 2956
2956 2957 /*
2957 2958 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
2958 2959 * so we must do this dance. If, by chance, something changes the path,
2959 2960 * just give up since there is no real harm.
2960 2961 */
2961 2962 mutex_exit(&base->v_lock);
2962 2963
2963 2964 /* Paths should stay within reason */
2964 2965 if (rpathalloc > max_vnode_path)
2965 2966 return;
2966 2967
2967 2968 rpath = kmem_alloc(rpathalloc, KM_SLEEP);
2968 2969
2969 2970 mutex_enter(&base->v_lock);
2970 2971 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
2971 2972 mutex_exit(&base->v_lock);
2972 2973 kmem_free(rpath, rpathalloc);
2973 2974 return;
2974 2975 }
2975 2976 bcopy(base->v_path, rpath, rpathlen);
2976 2977 mutex_exit(&base->v_lock);
2977 2978
2978 2979 if (doslash)
2979 2980 rpath[rpathlen++] = '/';
2980 2981 bcopy(path, rpath + rpathlen, plen);
2981 2982 rpath[rpathlen + plen] = '\0';
2982 2983
2983 2984 mutex_enter(&vp->v_lock);
2984 2985 if (vp->v_path != NULL) {
2985 2986 mutex_exit(&vp->v_lock);
2986 2987 kmem_free(rpath, rpathalloc);
2987 2988 } else {
2988 2989 vp->v_path = rpath;
2989 2990 mutex_exit(&vp->v_lock);
2990 2991 }
2991 2992 }
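
The drop-lock / allocate / revalidate dance above is a pattern worth noting in its own right; schematically (names hypothetical):

	mutex_enter(&obj->lock);
	len = strlen(obj->str);
	mutex_exit(&obj->lock);		/* no KM_SLEEP allocation under a lock */

	buf = kmem_alloc(len + 1, KM_SLEEP);

	mutex_enter(&obj->lock);
	if (obj->str == NULL || strlen(obj->str) != len) {
		/* changed underneath us; give up, no harm done */
		mutex_exit(&obj->lock);
		kmem_free(buf, len + 1);
		return;
	}
	bcopy(obj->str, buf, len + 1);
	mutex_exit(&obj->lock);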
2992 2993
2993 2994 /*
2994 2995 * Sets the path to the vnode to be the given string, regardless of current
2995 2996 * context. The string must be a complete path from rootdir. This is only used
2996 2997 * by fsop_root() for setting the path based on the mountpoint.
2997 2998 */
2998 2999 void
2999 3000 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3000 3001 {
3001 3002 char *buf = kmem_alloc(len + 1, KM_SLEEP);
3002 3003
3003 3004 mutex_enter(&vp->v_lock);
3004 3005 if (vp->v_path != NULL) {
3005 3006 mutex_exit(&vp->v_lock);
3006 3007 kmem_free(buf, len + 1);
3007 3008 return;
3008 3009 }
3009 3010
3010 3011 vp->v_path = buf;
3011 3012 bcopy(str, vp->v_path, len);
3012 3013 vp->v_path[len] = '\0';
3013 3014
3014 3015 mutex_exit(&vp->v_lock);
3015 3016 }
3016 3017
3017 3018 /*
3018 3019 * Called from within filesystem's vop_rename() to handle renames once the
3019 3020 * target vnode is available.
3020 3021 */
3021 3022 void
3022 3023 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3023 3024 {
3024 3025 char *tmp;
3025 3026
3026 3027 mutex_enter(&vp->v_lock);
3027 3028 tmp = vp->v_path;
3028 3029 vp->v_path = NULL;
3029 3030 mutex_exit(&vp->v_lock);
3030 3031 vn_setpath(rootdir, dvp, vp, nm, len);
3031 3032 if (tmp != NULL)
3032 3033 kmem_free(tmp, strlen(tmp) + 1);
3033 3034 }
3034 3035
3035 3036 /*
3036 3037 * Similar to vn_setpath_str(), this function sets the path of the destination
3037 3038  * vnode to be the same as that of the source vnode.
3038 3039 */
3039 3040 void
3040 3041 vn_copypath(struct vnode *src, struct vnode *dst)
3041 3042 {
3042 3043 char *buf;
3043 3044 int alloc;
3044 3045
3045 3046 mutex_enter(&src->v_lock);
3046 3047 if (src->v_path == NULL) {
3047 3048 mutex_exit(&src->v_lock);
3048 3049 return;
3049 3050 }
3050 3051 alloc = strlen(src->v_path) + 1;
3051 3052
3052 3053 /* avoid kmem_alloc() with lock held */
3053 3054 mutex_exit(&src->v_lock);
3054 3055 buf = kmem_alloc(alloc, KM_SLEEP);
3055 3056 mutex_enter(&src->v_lock);
3056 3057 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3057 3058 mutex_exit(&src->v_lock);
3058 3059 kmem_free(buf, alloc);
3059 3060 return;
3060 3061 }
3061 3062 bcopy(src->v_path, buf, alloc);
3062 3063 mutex_exit(&src->v_lock);
3063 3064
3064 3065 mutex_enter(&dst->v_lock);
3065 3066 if (dst->v_path != NULL) {
3066 3067 mutex_exit(&dst->v_lock);
3067 3068 kmem_free(buf, alloc);
3068 3069 return;
3069 3070 }
3070 3071 dst->v_path = buf;
3071 3072 mutex_exit(&dst->v_lock);
3072 3073 }
3073 3074
3074 3075 /*
3075 3076 * XXX Private interface for segvn routines that handle vnode
3076 3077 * large page segments.
3077 3078 *
3078 3079 * return 1 if vp's file system VOP_PAGEIO() implementation
3079 3080 * can be safely used instead of VOP_GETPAGE() for handling
3080 3081  * pagefaults against regular non-swap files. The VOP_PAGEIO()
3081 3082  * interface is considered safe here if its implementation
3082 3083  * is very close to the VOP_GETPAGE() implementation:
3083 3084  * e.g. it zeroes out the part of the page beyond EOF, doesn't
3084 3085  * panic if there are file holes but instead returns an error,
3085 3086  * and doesn't assume the file won't be changed by user writes, etc.
3086 3087 *
3087 3088 * return 0 otherwise.
3088 3089 *
3089 3090 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3090 3091 */
3091 3092 int
3092 3093 vn_vmpss_usepageio(vnode_t *vp)
3093 3094 {
3094 3095 vfs_t *vfsp = vp->v_vfsp;
3095 3096 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3096 3097 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3097 3098 char **fsok = pageio_ok_fss;
3098 3099
3099 3100 if (fsname == NULL) {
3100 3101 return (0);
3101 3102 }
3102 3103
3103 3104 for (; *fsok; fsok++) {
3104 3105 if (strcmp(*fsok, fsname) == 0) {
3105 3106 return (1);
3106 3107 }
3107 3108 }
3108 3109 return (0);
3109 3110 }
3110 3111
3111 3112 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3112 3113
3113 3114 int
3114 3115 fop_open(
3115 3116 vnode_t **vpp,
3116 3117 int mode,
3117 3118 cred_t *cr,
3118 3119 caller_context_t *ct)
3119 3120 {
3120 3121 int ret;
3121 3122 vnode_t *vp = *vpp;
3122 3123
3123 3124 VN_HOLD(vp);
3124 3125 /*
3125 3126 * Adding to the vnode counts before calling open
3126 3127 * avoids the need for a mutex. It circumvents a race
3127 3128 * condition where a query made on the vnode counts results in a
3128 3129 * false negative. The inquirer goes away believing the file is
3129 3130 * not open when there is an open on the file already under way.
3130 3131 *
3131 3132 * The counts are meant to prevent NFS from granting a delegation
3132 3133 * when it would be dangerous to do so.
3133 3134 *
3134 3135 	 * The vnode counts are only kept on regular files.
3135 3136 */
3136 3137 if ((*vpp)->v_type == VREG) {
3137 3138 if (mode & FREAD)
3138 3139 atomic_add_32(&((*vpp)->v_rdcnt), 1);
3139 3140 if (mode & FWRITE)
3140 3141 atomic_add_32(&((*vpp)->v_wrcnt), 1);
3141 3142 }
3142 3143
3143 3144 VOPXID_MAP_CR(vp, cr);
3144 3145
3145 3146 ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3146 3147
3147 3148 if (ret) {
3148 3149 /*
3149 3150 * Use the saved vp just in case the vnode ptr got trashed
3150 3151 * by the error.
3151 3152 */
3152 3153 VOPSTATS_UPDATE(vp, open);
3153 3154 if ((vp->v_type == VREG) && (mode & FREAD))
3154 3155 atomic_add_32(&(vp->v_rdcnt), -1);
3155 3156 if ((vp->v_type == VREG) && (mode & FWRITE))
3156 3157 atomic_add_32(&(vp->v_wrcnt), -1);
3157 3158 } else {
3158 3159 /*
3159 3160 * Some filesystems will return a different vnode,
3160 3161 * but the same path was still used to open it.
3161 3162 * So if we do change the vnode and need to
3162 3163 * copy over the path, do so here, rather than special
3163 3164 * casing each filesystem. Adjust the vnode counts to
3164 3165 * reflect the vnode switch.
3165 3166 */
3166 3167 VOPSTATS_UPDATE(*vpp, open);
3167 3168 if (*vpp != vp && *vpp != NULL) {
3168 3169 vn_copypath(vp, *vpp);
3169 3170 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3170 3171 atomic_add_32(&((*vpp)->v_rdcnt), 1);
3171 3172 if ((vp->v_type == VREG) && (mode & FREAD))
3172 3173 atomic_add_32(&(vp->v_rdcnt), -1);
3173 3174 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3174 3175 atomic_add_32(&((*vpp)->v_wrcnt), 1);
3175 3176 if ((vp->v_type == VREG) && (mode & FWRITE))
3176 3177 atomic_add_32(&(vp->v_wrcnt), -1);
3177 3178 }
3178 3179 }
3179 3180 VN_RELE(vp);
3180 3181 return (ret);
3181 3182 }
3182 3183
3183 3184 int
3184 3185 fop_close(
3185 3186 vnode_t *vp,
3186 3187 int flag,
3187 3188 int count,
3188 3189 offset_t offset,
3189 3190 cred_t *cr,
3190 3191 caller_context_t *ct)
3191 3192 {
3192 3193 int err;
3193 3194
3194 3195 VOPXID_MAP_CR(vp, cr);
3195 3196
3196 3197 err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3197 3198 VOPSTATS_UPDATE(vp, close);
3198 3199 /*
3199 3200 	 * Check the passed-in count to handle possible dups. Vnode counts
3200 3201 	 * are only kept on regular files.
3201 3202 */
3202 3203 if ((vp->v_type == VREG) && (count == 1)) {
3203 3204 if (flag & FREAD) {
3204 3205 ASSERT(vp->v_rdcnt > 0);
3205 3206 atomic_add_32(&(vp->v_rdcnt), -1);
3206 3207 }
3207 3208 if (flag & FWRITE) {
3208 3209 ASSERT(vp->v_wrcnt > 0);
3209 3210 atomic_add_32(&(vp->v_wrcnt), -1);
3210 3211 }
3211 3212 }
3212 3213 return (err);
3213 3214 }
3214 3215
3215 3216 int
3216 3217 fop_read(
3217 3218 vnode_t *vp,
3218 3219 uio_t *uiop,
3219 3220 int ioflag,
3220 3221 cred_t *cr,
3221 3222 caller_context_t *ct)
3222 3223 {
3223 3224 int err;
3224 3225 ssize_t resid_start = uiop->uio_resid;
3225 3226
3226 3227 VOPXID_MAP_CR(vp, cr);
3227 3228
3228 - err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3229 + err = fsh_read(vp, uiop, ioflag, cr, ct);
3229 3230 VOPSTATS_UPDATE_IO(vp, read,
3230 3231 read_bytes, (resid_start - uiop->uio_resid));
3231 3232 return (err);
3232 3233 }
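
The fsh_read() call above (and fsh_write() below) is the hook dispatch point this change introduces; fop_read() no longer invokes the vnode operation directly. The dispatcher itself lives in the fsh code, not in this file; conceptually it is expected to run any hooks installed on the file's vfs and then fall through to the underlying operation, roughly as follows (a sketch only, not the actual fsh implementation):

	/*
	 *	int
	 *	fsh_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
	 *	    caller_context_t *ct)
	 *	{
	 *		... run read hooks registered on vp->v_vfsp ...
	 *		return ((*(vp)->v_op->vop_read)(vp, uiop, ioflag,
	 *		    cr, ct));
	 *	}
	 */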
3233 3234
3234 3235 int
3235 3236 fop_write(
3236 3237 vnode_t *vp,
3237 3238 uio_t *uiop,
3238 3239 int ioflag,
3239 3240 cred_t *cr,
3240 3241 caller_context_t *ct)
3241 3242 {
3242 3243 int err;
3243 3244 ssize_t resid_start = uiop->uio_resid;
3244 3245
3245 3246 VOPXID_MAP_CR(vp, cr);
3246 3247
3247 - err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3248 + err = fsh_write(vp, uiop, ioflag, cr, ct);
3248 3249 VOPSTATS_UPDATE_IO(vp, write,
3249 3250 write_bytes, (resid_start - uiop->uio_resid));
3250 3251 return (err);
3251 3252 }
3252 3253
3253 3254 int
3254 3255 fop_ioctl(
3255 3256 vnode_t *vp,
3256 3257 int cmd,
3257 3258 intptr_t arg,
3258 3259 int flag,
3259 3260 cred_t *cr,
3260 3261 int *rvalp,
3261 3262 caller_context_t *ct)
3262 3263 {
3263 3264 int err;
3264 3265
3265 3266 VOPXID_MAP_CR(vp, cr);
3266 3267
3267 3268 err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3268 3269 VOPSTATS_UPDATE(vp, ioctl);
3269 3270 return (err);
3270 3271 }
3271 3272
3272 3273 int
3273 3274 fop_setfl(
3274 3275 vnode_t *vp,
3275 3276 int oflags,
3276 3277 int nflags,
3277 3278 cred_t *cr,
3278 3279 caller_context_t *ct)
3279 3280 {
3280 3281 int err;
3281 3282
3282 3283 VOPXID_MAP_CR(vp, cr);
3283 3284
3284 3285 err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3285 3286 VOPSTATS_UPDATE(vp, setfl);
3286 3287 return (err);
3287 3288 }
3288 3289
3289 3290 int
3290 3291 fop_getattr(
3291 3292 vnode_t *vp,
3292 3293 vattr_t *vap,
3293 3294 int flags,
3294 3295 cred_t *cr,
3295 3296 caller_context_t *ct)
3296 3297 {
3297 3298 int err;
3298 3299
3299 3300 VOPXID_MAP_CR(vp, cr);
3300 3301
3301 3302 /*
3302 3303 * If this file system doesn't understand the xvattr extensions
3303 3304 * then turn off the xvattr bit.
3304 3305 */
3305 3306 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3306 3307 vap->va_mask &= ~AT_XVATTR;
3307 3308 }
3308 3309
3309 3310 /*
3310 3311 * We're only allowed to skip the ACL check iff we used a 32 bit
3311 3312 * ACE mask with VOP_ACCESS() to determine permissions.
3312 3313 */
3313 3314 if ((flags & ATTR_NOACLCHECK) &&
3314 3315 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3315 3316 return (EINVAL);
3316 3317 }
3317 3318 err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3318 3319 VOPSTATS_UPDATE(vp, getattr);
3319 3320 return (err);
3320 3321 }
3321 3322
3322 3323 int
3323 3324 fop_setattr(
3324 3325 vnode_t *vp,
3325 3326 vattr_t *vap,
3326 3327 int flags,
3327 3328 cred_t *cr,
3328 3329 caller_context_t *ct)
3329 3330 {
3330 3331 int err;
3331 3332
3332 3333 VOPXID_MAP_CR(vp, cr);
3333 3334
3334 3335 /*
3335 3336 * If this file system doesn't understand the xvattr extensions
3336 3337 * then turn off the xvattr bit.
3337 3338 */
3338 3339 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3339 3340 vap->va_mask &= ~AT_XVATTR;
3340 3341 }
3341 3342
3342 3343 /*
3343 3344 * We're only allowed to skip the ACL check iff we used a 32 bit
3344 3345 * ACE mask with VOP_ACCESS() to determine permissions.
3345 3346 */
3346 3347 if ((flags & ATTR_NOACLCHECK) &&
3347 3348 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3348 3349 return (EINVAL);
3349 3350 }
3350 3351 err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3351 3352 VOPSTATS_UPDATE(vp, setattr);
3352 3353 return (err);
3353 3354 }
3354 3355
3355 3356 int
3356 3357 fop_access(
3357 3358 vnode_t *vp,
3358 3359 int mode,
3359 3360 int flags,
3360 3361 cred_t *cr,
3361 3362 caller_context_t *ct)
3362 3363 {
3363 3364 int err;
3364 3365
3365 3366 if ((flags & V_ACE_MASK) &&
3366 3367 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3367 3368 return (EINVAL);
3368 3369 }
3369 3370
3370 3371 VOPXID_MAP_CR(vp, cr);
3371 3372
3372 3373 err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3373 3374 VOPSTATS_UPDATE(vp, access);
3374 3375 return (err);
3375 3376 }
3376 3377
3377 3378 int
3378 3379 fop_lookup(
3379 3380 vnode_t *dvp,
3380 3381 char *nm,
3381 3382 vnode_t **vpp,
3382 3383 pathname_t *pnp,
3383 3384 int flags,
3384 3385 vnode_t *rdir,
3385 3386 cred_t *cr,
3386 3387 caller_context_t *ct,
3387 3388 int *deflags, /* Returned per-dirent flags */
3388 3389 pathname_t *ppnp) /* Returned case-preserved name in directory */
3389 3390 {
3390 3391 int ret;
3391 3392
3392 3393 /*
3393 3394 * If this file system doesn't support case-insensitive access
3394 3395 * and said access is requested, fail quickly. It is required
3395 3396 * that if the vfs supports case-insensitive lookup, it also
3396 3397 * supports extended dirent flags.
3397 3398 */
3398 3399 if (flags & FIGNORECASE &&
3399 3400 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3400 3401 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3401 3402 return (EINVAL);
3402 3403
3403 3404 VOPXID_MAP_CR(dvp, cr);
3404 3405
3405 3406 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3406 3407 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3407 3408 } else {
3408 3409 ret = (*(dvp)->v_op->vop_lookup)
3409 3410 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3410 3411 }
3411 3412 if (ret == 0 && *vpp) {
3412 3413 VOPSTATS_UPDATE(*vpp, lookup);
3413 3414 if ((*vpp)->v_path == NULL) {
3414 3415 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3415 3416 }
3416 3417 }
3417 3418
3418 3419 return (ret);
3419 3420 }
3420 3421
3421 3422 int
3422 3423 fop_create(
3423 3424 vnode_t *dvp,
3424 3425 char *name,
3425 3426 vattr_t *vap,
3426 3427 vcexcl_t excl,
3427 3428 int mode,
3428 3429 vnode_t **vpp,
3429 3430 cred_t *cr,
3430 3431 int flags,
3431 3432 caller_context_t *ct,
3432 3433 vsecattr_t *vsecp) /* ACL to set during create */
3433 3434 {
3434 3435 int ret;
3435 3436
3436 3437 if (vsecp != NULL &&
3437 3438 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3438 3439 return (EINVAL);
3439 3440 }
3440 3441 /*
3441 3442 * If this file system doesn't support case-insensitive access
3442 3443 * and said access is requested, fail quickly.
3443 3444 */
3444 3445 if (flags & FIGNORECASE &&
3445 3446 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3446 3447 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3447 3448 return (EINVAL);
3448 3449
3449 3450 VOPXID_MAP_CR(dvp, cr);
3450 3451
3451 3452 ret = (*(dvp)->v_op->vop_create)
3452 3453 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3453 3454 if (ret == 0 && *vpp) {
3454 3455 VOPSTATS_UPDATE(*vpp, create);
3455 3456 if ((*vpp)->v_path == NULL) {
3456 3457 vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3457 3458 }
3458 3459 }
3459 3460
3460 3461 return (ret);
3461 3462 }
3462 3463
3463 3464 int
3464 3465 fop_remove(
3465 3466 vnode_t *dvp,
3466 3467 char *nm,
3467 3468 cred_t *cr,
3468 3469 caller_context_t *ct,
3469 3470 int flags)
3470 3471 {
3471 3472 int err;
3472 3473
3473 3474 /*
3474 3475 * If this file system doesn't support case-insensitive access
3475 3476 * and said access is requested, fail quickly.
3476 3477 */
3477 3478 if (flags & FIGNORECASE &&
3478 3479 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3479 3480 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3480 3481 return (EINVAL);
3481 3482
3482 3483 VOPXID_MAP_CR(dvp, cr);
3483 3484
3484 3485 err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3485 3486 VOPSTATS_UPDATE(dvp, remove);
3486 3487 return (err);
3487 3488 }
3488 3489
3489 3490 int
3490 3491 fop_link(
3491 3492 vnode_t *tdvp,
3492 3493 vnode_t *svp,
3493 3494 char *tnm,
3494 3495 cred_t *cr,
3495 3496 caller_context_t *ct,
3496 3497 int flags)
3497 3498 {
3498 3499 int err;
3499 3500
3500 3501 /*
3501 3502 * If the target file system doesn't support case-insensitive access
3502 3503 * and said access is requested, fail quickly.
3503 3504 */
3504 3505 if (flags & FIGNORECASE &&
3505 3506 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3506 3507 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3507 3508 return (EINVAL);
3508 3509
3509 3510 VOPXID_MAP_CR(tdvp, cr);
3510 3511
3511 3512 err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3512 3513 VOPSTATS_UPDATE(tdvp, link);
3513 3514 return (err);
3514 3515 }
3515 3516
3516 3517 int
3517 3518 fop_rename(
3518 3519 vnode_t *sdvp,
3519 3520 char *snm,
3520 3521 vnode_t *tdvp,
3521 3522 char *tnm,
3522 3523 cred_t *cr,
3523 3524 caller_context_t *ct,
3524 3525 int flags)
3525 3526 {
3526 3527 int err;
3527 3528
3528 3529 /*
3529 3530 * If the file system involved does not support
3530 3531 * case-insensitive access and said access is requested, fail
3531 3532 * quickly.
3532 3533 */
3533 3534 if (flags & FIGNORECASE &&
3534 3535 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3535 3536 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3536 3537 return (EINVAL);
3537 3538
3538 3539 VOPXID_MAP_CR(tdvp, cr);
3539 3540
3540 3541 err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3541 3542 VOPSTATS_UPDATE(sdvp, rename);
3542 3543 return (err);
3543 3544 }
3544 3545
3545 3546 int
3546 3547 fop_mkdir(
3547 3548 vnode_t *dvp,
3548 3549 char *dirname,
3549 3550 vattr_t *vap,
3550 3551 vnode_t **vpp,
3551 3552 cred_t *cr,
3552 3553 caller_context_t *ct,
3553 3554 int flags,
3554 3555 vsecattr_t *vsecp) /* ACL to set during create */
3555 3556 {
3556 3557 int ret;
3557 3558
3558 3559 if (vsecp != NULL &&
3559 3560 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3560 3561 return (EINVAL);
3561 3562 }
3562 3563 /*
3563 3564 * If this file system doesn't support case-insensitive access
3564 3565 * and said access is requested, fail quickly.
3565 3566 */
3566 3567 if (flags & FIGNORECASE &&
3567 3568 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3568 3569 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3569 3570 return (EINVAL);
3570 3571
3571 3572 VOPXID_MAP_CR(dvp, cr);
3572 3573
3573 3574 ret = (*(dvp)->v_op->vop_mkdir)
3574 3575 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3575 3576 if (ret == 0 && *vpp) {
3576 3577 VOPSTATS_UPDATE(*vpp, mkdir);
3577 3578 if ((*vpp)->v_path == NULL) {
3578 3579 vn_setpath(rootdir, dvp, *vpp, dirname,
3579 3580 strlen(dirname));
3580 3581 }
3581 3582 }
3582 3583
3583 3584 return (ret);
3584 3585 }
3585 3586
3586 3587 int
3587 3588 fop_rmdir(
3588 3589 vnode_t *dvp,
3589 3590 char *nm,
3590 3591 vnode_t *cdir,
3591 3592 cred_t *cr,
3592 3593 caller_context_t *ct,
3593 3594 int flags)
3594 3595 {
3595 3596 int err;
3596 3597
3597 3598 /*
3598 3599 * If this file system doesn't support case-insensitive access
3599 3600 * and said access is requested, fail quickly.
3600 3601 */
3601 3602 if (flags & FIGNORECASE &&
3602 3603 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3603 3604 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3604 3605 return (EINVAL);
3605 3606
3606 3607 VOPXID_MAP_CR(dvp, cr);
3607 3608
3608 3609 err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3609 3610 VOPSTATS_UPDATE(dvp, rmdir);
3610 3611 return (err);
3611 3612 }
3612 3613
3613 3614 int
3614 3615 fop_readdir(
3615 3616 vnode_t *vp,
3616 3617 uio_t *uiop,
3617 3618 cred_t *cr,
3618 3619 int *eofp,
3619 3620 caller_context_t *ct,
3620 3621 int flags)
3621 3622 {
3622 3623 int err;
3623 3624 ssize_t resid_start = uiop->uio_resid;
3624 3625
3625 3626 /*
3626 3627 * If this file system doesn't support retrieving directory
3627 3628 * entry flags and said access is requested, fail quickly.
3628 3629 */
3629 3630 if (flags & V_RDDIR_ENTFLAGS &&
3630 3631 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3631 3632 return (EINVAL);
3632 3633
3633 3634 VOPXID_MAP_CR(vp, cr);
3634 3635
3635 3636 err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3636 3637 VOPSTATS_UPDATE_IO(vp, readdir,
3637 3638 readdir_bytes, (resid_start - uiop->uio_resid));
3638 3639 return (err);
3639 3640 }
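
/*
 * Editorial note: with V_RDDIR_ENTFLAGS the caller is asking for extended
 * directory entries (edirent_t from <sys/extdirent.h>, which carry
 * per-entry eflags) rather than plain dirent64_t records, which is why
 * the VFSFT_DIRENTFLAGS feature check above is required.
 */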
3640 3641
3641 3642 int
3642 3643 fop_symlink(
3643 3644 vnode_t *dvp,
3644 3645 char *linkname,
3645 3646 vattr_t *vap,
3646 3647 char *target,
3647 3648 cred_t *cr,
3648 3649 caller_context_t *ct,
3649 3650 int flags)
3650 3651 {
3651 3652 int err;
3652 3653 xvattr_t xvattr;
3653 3654
3654 3655 /*
3655 3656 * If this file system doesn't support case-insensitive access
3656 3657 * and said access is requested, fail quickly.
3657 3658 */
3658 3659 if (flags & FIGNORECASE &&
3659 3660 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3660 3661 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3661 3662 return (EINVAL);
3662 3663
3663 3664 VOPXID_MAP_CR(dvp, cr);
3664 3665
3665 3666 /* check for reparse point */
3666 3667 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3667 3668 (strncmp(target, FS_REPARSE_TAG_STR,
3668 3669 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3669 3670 if (!fs_reparse_mark(target, vap, &xvattr))
3670 3671 vap = (vattr_t *)&xvattr;
3671 3672 }
3672 3673
3673 3674 err = (*(dvp)->v_op->vop_symlink)
3674 3675 (dvp, linkname, vap, target, cr, ct, flags);
3675 3676 VOPSTATS_UPDATE(dvp, symlink);
3676 3677 return (err);
3677 3678 }
3678 3679
3679 3680 int
3680 3681 fop_readlink(
3681 3682 vnode_t *vp,
3682 3683 uio_t *uiop,
3683 3684 cred_t *cr,
3684 3685 caller_context_t *ct)
3685 3686 {
3686 3687 int err;
3687 3688
3688 3689 VOPXID_MAP_CR(vp, cr);
3689 3690
3690 3691 err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3691 3692 VOPSTATS_UPDATE(vp, readlink);
3692 3693 return (err);
3693 3694 }
3694 3695
3695 3696 int
3696 3697 fop_fsync(
3697 3698 vnode_t *vp,
3698 3699 int syncflag,
3699 3700 cred_t *cr,
3700 3701 caller_context_t *ct)
3701 3702 {
3702 3703 int err;
3703 3704
3704 3705 VOPXID_MAP_CR(vp, cr);
3705 3706
3706 3707 err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3707 3708 VOPSTATS_UPDATE(vp, fsync);
3708 3709 return (err);
3709 3710 }
3710 3711
3711 3712 void
3712 3713 fop_inactive(
3713 3714 vnode_t *vp,
3714 3715 cred_t *cr,
3715 3716 caller_context_t *ct)
3716 3717 {
3717 3718 /* Need to update stats before vop call since we may lose the vnode */
3718 3719 VOPSTATS_UPDATE(vp, inactive);
3719 3720
3720 3721 VOPXID_MAP_CR(vp, cr);
3721 3722
3722 3723 (*(vp)->v_op->vop_inactive)(vp, cr, ct);
3723 3724 }
3724 3725
3725 3726 int
3726 3727 fop_fid(
3727 3728 vnode_t *vp,
3728 3729 fid_t *fidp,
3729 3730 caller_context_t *ct)
3730 3731 {
3731 3732 int err;
3732 3733
3733 3734 err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3734 3735 VOPSTATS_UPDATE(vp, fid);
3735 3736 return (err);
3736 3737 }
3737 3738
3738 3739 int
3739 3740 fop_rwlock(
3740 3741 vnode_t *vp,
3741 3742 int write_lock,
3742 3743 caller_context_t *ct)
3743 3744 {
3744 3745 int ret;
3745 3746
3746 3747 ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3747 3748 VOPSTATS_UPDATE(vp, rwlock);
3748 3749 return (ret);
3749 3750 }
3750 3751
3751 3752 void
3752 3753 fop_rwunlock(
3753 3754 vnode_t *vp,
3754 3755 int write_lock,
3755 3756 caller_context_t *ct)
3756 3757 {
3757 3758 (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3758 3759 VOPSTATS_UPDATE(vp, rwunlock);
3759 3760 }
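
/*
 * Editorial sketch of the usual caller pattern for the rwlock wrappers,
 * using the standard VOP_* macros from <sys/vnode.h> (error handling
 * elided; uiop/ioflag/cr are assumed caller context):
 *
 *	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
 *	error = VOP_WRITE(vp, uiop, ioflag, cr, NULL);
 *	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
 */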
3760 3761
3761 3762 int
3762 3763 fop_seek(
3763 3764 vnode_t *vp,
3764 3765 offset_t ooff,
3765 3766 offset_t *noffp,
3766 3767 caller_context_t *ct)
3767 3768 {
3768 3769 int err;
3769 3770
3770 3771 err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
3771 3772 VOPSTATS_UPDATE(vp, seek);
3772 3773 return (err);
3773 3774 }
3774 3775
3775 3776 int
3776 3777 fop_cmp(
3777 3778 vnode_t *vp1,
3778 3779 vnode_t *vp2,
3779 3780 caller_context_t *ct)
3780 3781 {
3781 3782 int err;
3782 3783
3783 3784 err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
3784 3785 VOPSTATS_UPDATE(vp1, cmp);
3785 3786 return (err);
3786 3787 }
3787 3788
3788 3789 int
3789 3790 fop_frlock(
3790 3791 vnode_t *vp,
3791 3792 int cmd,
3792 3793 flock64_t *bfp,
3793 3794 int flag,
3794 3795 offset_t offset,
3795 3796 struct flk_callback *flk_cbp,
3796 3797 cred_t *cr,
3797 3798 caller_context_t *ct)
3798 3799 {
3799 3800 int err;
3800 3801
3801 3802 VOPXID_MAP_CR(vp, cr);
3802 3803
3803 3804 err = (*(vp)->v_op->vop_frlock)
3804 3805 (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3805 3806 VOPSTATS_UPDATE(vp, frlock);
3806 3807 return (err);
3807 3808 }
3808 3809
3809 3810 int
3810 3811 fop_space(
3811 3812 vnode_t *vp,
3812 3813 int cmd,
3813 3814 flock64_t *bfp,
3814 3815 int flag,
3815 3816 offset_t offset,
3816 3817 cred_t *cr,
3817 3818 caller_context_t *ct)
3818 3819 {
3819 3820 int err;
3820 3821
3821 3822 VOPXID_MAP_CR(vp, cr);
3822 3823
3823 3824 err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
3824 3825 VOPSTATS_UPDATE(vp, space);
3825 3826 return (err);
3826 3827 }
3827 3828
3828 3829 int
3829 3830 fop_realvp(
3830 3831 vnode_t *vp,
3831 3832 vnode_t **vpp,
3832 3833 caller_context_t *ct)
3833 3834 {
3834 3835 int err;
3835 3836
3836 3837 err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
3837 3838 VOPSTATS_UPDATE(vp, realvp);
3838 3839 return (err);
3839 3840 }
3840 3841
3841 3842 int
3842 3843 fop_getpage(
3843 3844 vnode_t *vp,
3844 3845 offset_t off,
3845 3846 size_t len,
3846 3847 uint_t *protp,
3847 3848 page_t **plarr,
3848 3849 size_t plsz,
3849 3850 struct seg *seg,
3850 3851 caddr_t addr,
3851 3852 enum seg_rw rw,
3852 3853 cred_t *cr,
3853 3854 caller_context_t *ct)
3854 3855 {
3855 3856 int err;
3856 3857
3857 3858 VOPXID_MAP_CR(vp, cr);
3858 3859
3859 3860 err = (*(vp)->v_op->vop_getpage)
3860 3861 (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
3861 3862 VOPSTATS_UPDATE(vp, getpage);
3862 3863 return (err);
3863 3864 }
3864 3865
3865 3866 int
3866 3867 fop_putpage(
3867 3868 vnode_t *vp,
3868 3869 offset_t off,
3869 3870 size_t len,
3870 3871 int flags,
3871 3872 cred_t *cr,
3872 3873 caller_context_t *ct)
3873 3874 {
3874 3875 int err;
3875 3876
3876 3877 VOPXID_MAP_CR(vp, cr);
3877 3878
3878 3879 err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
3879 3880 VOPSTATS_UPDATE(vp, putpage);
3880 3881 return (err);
3881 3882 }
3882 3883
3883 3884 int
3884 3885 fop_map(
3885 3886 vnode_t *vp,
3886 3887 offset_t off,
3887 3888 struct as *as,
3888 3889 caddr_t *addrp,
3889 3890 size_t len,
3890 3891 uchar_t prot,
3891 3892 uchar_t maxprot,
3892 3893 uint_t flags,
3893 3894 cred_t *cr,
3894 3895 caller_context_t *ct)
3895 3896 {
3896 3897 int err;
3897 3898
3898 3899 VOPXID_MAP_CR(vp, cr);
3899 3900
3900 3901 err = (*(vp)->v_op->vop_map)
3901 3902 (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
3902 3903 VOPSTATS_UPDATE(vp, map);
3903 3904 return (err);
3904 3905 }
3905 3906
3906 3907 int
3907 3908 fop_addmap(
3908 3909 vnode_t *vp,
3909 3910 offset_t off,
3910 3911 struct as *as,
3911 3912 caddr_t addr,
3912 3913 size_t len,
3913 3914 uchar_t prot,
3914 3915 uchar_t maxprot,
3915 3916 uint_t flags,
3916 3917 cred_t *cr,
3917 3918 caller_context_t *ct)
3918 3919 {
3919 3920 int error;
3920 3921 u_longlong_t delta;
3921 3922
3922 3923 VOPXID_MAP_CR(vp, cr);
3923 3924
3924 3925 error = (*(vp)->v_op->vop_addmap)
3925 3926 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
3926 3927
3927 3928 if ((!error) && (vp->v_type == VREG)) {
3928 3929 delta = (u_longlong_t)btopr(len);
3929 3930 /*
3930 3931 		 * If the file is mapped MAP_PRIVATE, it can't be written
3931 3932 		 * back even if it is open for write; account for it as read.
3932 3933 */
3933 3934 if (flags & MAP_PRIVATE) {
3934 3935 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3935 3936 (int64_t)delta);
3936 3937 } else {
3937 3938 /*
3938 3939 * atomic_add_64 forces the fetch of a 64 bit value to
3939 3940 * be atomic on 32 bit machines
3940 3941 */
3941 3942 if (maxprot & PROT_WRITE)
3942 3943 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3943 3944 (int64_t)delta);
3944 3945 if (maxprot & PROT_READ)
3945 3946 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3946 3947 (int64_t)delta);
3947 3948 if (maxprot & PROT_EXEC)
3948 3949 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3949 3950 (int64_t)delta);
3950 3951 }
3951 3952 }
3952 3953 VOPSTATS_UPDATE(vp, addmap);
3953 3954 return (error);
3954 3955 }
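
/*
 * Editorial note: the v_mmap_read/v_mmap_write page counts maintained
 * above (and decremented in fop_delmap() below) are what allow
 * vn_is_mapped() to answer mapping-conflict queries for non-blocking
 * mandatory locking (nbmand).
 */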
3955 3956
3956 3957 int
3957 3958 fop_delmap(
3958 3959 vnode_t *vp,
3959 3960 offset_t off,
3960 3961 struct as *as,
3961 3962 caddr_t addr,
3962 3963 size_t len,
3963 3964 uint_t prot,
3964 3965 uint_t maxprot,
3965 3966 uint_t flags,
3966 3967 cred_t *cr,
3967 3968 caller_context_t *ct)
3968 3969 {
3969 3970 int error;
3970 3971 u_longlong_t delta;
3971 3972
3972 3973 VOPXID_MAP_CR(vp, cr);
3973 3974
3974 3975 error = (*(vp)->v_op->vop_delmap)
3975 3976 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
3976 3977
3977 3978 /*
3978 3979 	 * NFS calls into delmap twice: the first call simply
3979 3980 	 * establishes a callback mechanism and returns EAGAIN, while
3980 3981 	 * the real work is done on the second invocation.  We have to
3981 3982 	 * detect this here and decrement the counts only on the
3982 3983 	 * second delmap request.
3983 3984 */
3984 3985 if ((error != EAGAIN) && (vp->v_type == VREG)) {
3985 3986
3986 3987 delta = (u_longlong_t)btopr(len);
3987 3988
3988 3989 if (flags & MAP_PRIVATE) {
3989 3990 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
3990 3991 (int64_t)(-delta));
3991 3992 } else {
3992 3993 /*
3993 3994 * atomic_add_64 forces the fetch of a 64 bit value
3994 3995 * to be atomic on 32 bit machines
3995 3996 */
3996 3997 if (maxprot & PROT_WRITE)
3997 3998 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
3998 3999 (int64_t)(-delta));
3999 4000 if (maxprot & PROT_READ)
4000 4001 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4001 4002 (int64_t)(-delta));
4002 4003 if (maxprot & PROT_EXEC)
4003 4004 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4004 4005 (int64_t)(-delta));
4005 4006 }
4006 4007 }
4007 4008 VOPSTATS_UPDATE(vp, delmap);
4008 4009 return (error);
4009 4010 }
4010 4011
4011 4012
4012 4013 int
4013 4014 fop_poll(
4014 4015 vnode_t *vp,
4015 4016 short events,
4016 4017 int anyyet,
4017 4018 short *reventsp,
4018 4019 struct pollhead **phpp,
4019 4020 caller_context_t *ct)
4020 4021 {
4021 4022 int err;
4022 4023
4023 4024 err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4024 4025 VOPSTATS_UPDATE(vp, poll);
4025 4026 return (err);
4026 4027 }
4027 4028
4028 4029 int
4029 4030 fop_dump(
4030 4031 vnode_t *vp,
4031 4032 caddr_t addr,
4032 4033 offset_t lbdn,
4033 4034 offset_t dblks,
4034 4035 caller_context_t *ct)
4035 4036 {
4036 4037 int err;
4037 4038
4038 4039 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4039 4040 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4040 4041 return (EIO);
4041 4042
4042 4043 err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4043 4044 VOPSTATS_UPDATE(vp, dump);
4044 4045 return (err);
4045 4046 }
4046 4047
4047 4048 int
4048 4049 fop_pathconf(
4049 4050 vnode_t *vp,
4050 4051 int cmd,
4051 4052 ulong_t *valp,
4052 4053 cred_t *cr,
4053 4054 caller_context_t *ct)
4054 4055 {
4055 4056 int err;
4056 4057
4057 4058 VOPXID_MAP_CR(vp, cr);
4058 4059
4059 4060 err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4060 4061 VOPSTATS_UPDATE(vp, pathconf);
4061 4062 return (err);
4062 4063 }
4063 4064
4064 4065 int
4065 4066 fop_pageio(
4066 4067 vnode_t *vp,
4067 4068 struct page *pp,
4068 4069 u_offset_t io_off,
4069 4070 size_t io_len,
4070 4071 int flags,
4071 4072 cred_t *cr,
4072 4073 caller_context_t *ct)
4073 4074 {
4074 4075 int err;
4075 4076
4076 4077 VOPXID_MAP_CR(vp, cr);
4077 4078
4078 4079 err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4079 4080 VOPSTATS_UPDATE(vp, pageio);
4080 4081 return (err);
4081 4082 }
4082 4083
4083 4084 int
4084 4085 fop_dumpctl(
4085 4086 vnode_t *vp,
4086 4087 int action,
4087 4088 offset_t *blkp,
4088 4089 caller_context_t *ct)
4089 4090 {
4090 4091 int err;
4091 4092 err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4092 4093 VOPSTATS_UPDATE(vp, dumpctl);
4093 4094 return (err);
4094 4095 }
4095 4096
4096 4097 void
4097 4098 fop_dispose(
4098 4099 vnode_t *vp,
4099 4100 page_t *pp,
4100 4101 int flag,
4101 4102 int dn,
4102 4103 cred_t *cr,
4103 4104 caller_context_t *ct)
4104 4105 {
4105 4106 /* Must do stats first since it's possible to lose the vnode */
4106 4107 VOPSTATS_UPDATE(vp, dispose);
4107 4108
4108 4109 VOPXID_MAP_CR(vp, cr);
4109 4110
4110 4111 (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4111 4112 }
4112 4113
4113 4114 int
4114 4115 fop_setsecattr(
4115 4116 vnode_t *vp,
4116 4117 vsecattr_t *vsap,
4117 4118 int flag,
4118 4119 cred_t *cr,
4119 4120 caller_context_t *ct)
4120 4121 {
4121 4122 int err;
4122 4123
4123 4124 VOPXID_MAP_CR(vp, cr);
4124 4125
4125 4126 /*
4126 4127 * We're only allowed to skip the ACL check iff we used a 32 bit
4127 4128 * ACE mask with VOP_ACCESS() to determine permissions.
4128 4129 */
4129 4130 if ((flag & ATTR_NOACLCHECK) &&
4130 4131 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4131 4132 return (EINVAL);
4132 4133 }
4133 4134 err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4134 4135 VOPSTATS_UPDATE(vp, setsecattr);
4135 4136 return (err);
4136 4137 }
4137 4138
4138 4139 int
4139 4140 fop_getsecattr(
4140 4141 vnode_t *vp,
4141 4142 vsecattr_t *vsap,
4142 4143 int flag,
4143 4144 cred_t *cr,
4144 4145 caller_context_t *ct)
4145 4146 {
4146 4147 int err;
4147 4148
4148 4149 /*
4149 4150 * We're only allowed to skip the ACL check iff we used a 32 bit
4150 4151 * ACE mask with VOP_ACCESS() to determine permissions.
4151 4152 */
4152 4153 if ((flag & ATTR_NOACLCHECK) &&
4153 4154 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4154 4155 return (EINVAL);
4155 4156 }
4156 4157
4157 4158 VOPXID_MAP_CR(vp, cr);
4158 4159
4159 4160 err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4160 4161 VOPSTATS_UPDATE(vp, getsecattr);
4161 4162 return (err);
4162 4163 }
4163 4164
4164 4165 int
4165 4166 fop_shrlock(
4166 4167 vnode_t *vp,
4167 4168 int cmd,
4168 4169 struct shrlock *shr,
4169 4170 int flag,
4170 4171 cred_t *cr,
4171 4172 caller_context_t *ct)
4172 4173 {
4173 4174 int err;
4174 4175
4175 4176 VOPXID_MAP_CR(vp, cr);
4176 4177
4177 4178 err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4178 4179 VOPSTATS_UPDATE(vp, shrlock);
4179 4180 return (err);
4180 4181 }
4181 4182
4182 4183 int
4183 4184 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4184 4185 caller_context_t *ct)
4185 4186 {
4186 4187 int err;
4187 4188
4188 4189 err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4189 4190 VOPSTATS_UPDATE(vp, vnevent);
4190 4191 return (err);
4191 4192 }
4192 4193
4193 4194 int
4194 4195 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4195 4196 caller_context_t *ct)
4196 4197 {
4197 4198 int err;
4198 4199
4199 4200 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4200 4201 return (ENOTSUP);
4201 4202 err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4202 4203 VOPSTATS_UPDATE(vp, reqzcbuf);
4203 4204 return (err);
4204 4205 }
4205 4206
4206 4207 int
4207 4208 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4208 4209 {
4209 4210 int err;
4210 4211
4211 4212 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4212 4213 return (ENOTSUP);
4213 4214 err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4214 4215 VOPSTATS_UPDATE(vp, retzcbuf);
4215 4216 return (err);
4216 4217 }
4217 4218
4218 4219 /*
4219 4220 * Default destructor
4220 4221 * Needed because NULL destructor means that the key is unused
4221 4222 */
4222 4223 /* ARGSUSED */
4223 4224 void
4224 4225 vsd_defaultdestructor(void *value)
4225 4226 {}
4226 4227
4227 4228 /*
4228 4229 * Create a key (index into per vnode array)
4229 4230 * Locks out vsd_create, vsd_destroy, and vsd_free
4230 4231 * May allocate memory with lock held
4231 4232 */
4232 4233 void
4233 4234 vsd_create(uint_t *keyp, void (*destructor)(void *))
4234 4235 {
4235 4236 int i;
4236 4237 uint_t nkeys;
4237 4238
4238 4239 /*
4239 4240 * if key is allocated, do nothing
4240 4241 */
4241 4242 mutex_enter(&vsd_lock);
4242 4243 if (*keyp) {
4243 4244 mutex_exit(&vsd_lock);
4244 4245 return;
4245 4246 }
4246 4247 /*
4247 4248 * find an unused key
4248 4249 */
4249 4250 if (destructor == NULL)
4250 4251 destructor = vsd_defaultdestructor;
4251 4252
4252 4253 for (i = 0; i < vsd_nkeys; ++i)
4253 4254 if (vsd_destructor[i] == NULL)
4254 4255 break;
4255 4256
4256 4257 /*
4257 4258 * if no unused keys, increase the size of the destructor array
4258 4259 */
4259 4260 if (i == vsd_nkeys) {
4260 4261 if ((nkeys = (vsd_nkeys << 1)) == 0)
4261 4262 nkeys = 1;
4262 4263 vsd_destructor =
4263 4264 (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4264 4265 (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4265 4266 (size_t)(nkeys * sizeof (void (*)(void *))));
4266 4267 vsd_nkeys = nkeys;
4267 4268 }
4268 4269
4269 4270 /*
4270 4271 * allocate the next available unused key
4271 4272 */
4272 4273 vsd_destructor[i] = destructor;
4273 4274 *keyp = i + 1;
4274 4275
4275 4276 /* create vsd_list, if it doesn't exist */
4276 4277 if (vsd_list == NULL) {
4277 4278 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4278 4279 list_create(vsd_list, sizeof (struct vsd_node),
4279 4280 offsetof(struct vsd_node, vs_nodes));
4280 4281 }
4281 4282
4282 4283 mutex_exit(&vsd_lock);
4283 4284 }
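
/*
 * Editorial sketch of typical VSD usage (the key, data, and destructor
 * names are illustrative); the interfaces deliberately mirror the
 * thread-specific data (TSD) API, and vsd_get()/vsd_set() require the
 * caller to hold v_vsd_lock:
 *
 *	static uint_t mykey;
 *
 *	vsd_create(&mykey, mydata_destructor);	(once, e.g. at module init)
 *
 *	mutex_enter(&vp->v_vsd_lock);
 *	(void) vsd_set(vp, mykey, mydata);
 *	mydata = vsd_get(vp, mykey);
 *	mutex_exit(&vp->v_vsd_lock);
 */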
4284 4285
4285 4286 /*
4286 4287 * Destroy a key
4287 4288 *
4288 4289 * Assumes that the caller is preventing vsd_set and vsd_get
4289 4290 * Locks out vsd_create, vsd_destroy, and vsd_free
4290 4291 * May free memory with lock held
4291 4292 */
4292 4293 void
4293 4294 vsd_destroy(uint_t *keyp)
4294 4295 {
4295 4296 uint_t key;
4296 4297 struct vsd_node *vsd;
4297 4298
4298 4299 /*
4299 4300 * protect the key namespace and our destructor lists
4300 4301 */
4301 4302 mutex_enter(&vsd_lock);
4302 4303 key = *keyp;
4303 4304 *keyp = 0;
4304 4305
4305 4306 ASSERT(key <= vsd_nkeys);
4306 4307
4307 4308 /*
4308 4309 * if the key is valid
4309 4310 */
4310 4311 if (key != 0) {
4311 4312 uint_t k = key - 1;
4312 4313 /*
4313 4314 * for every vnode with VSD, call key's destructor
4314 4315 */
4315 4316 for (vsd = list_head(vsd_list); vsd != NULL;
4316 4317 vsd = list_next(vsd_list, vsd)) {
4317 4318 /*
4318 4319 * no VSD for key in this vnode
4319 4320 */
4320 4321 if (key > vsd->vs_nkeys)
4321 4322 continue;
4322 4323 /*
4323 4324 * call destructor for key
4324 4325 */
4325 4326 if (vsd->vs_value[k] && vsd_destructor[k])
4326 4327 (*vsd_destructor[k])(vsd->vs_value[k]);
4327 4328 /*
4328 4329 * reset value for key
4329 4330 */
4330 4331 vsd->vs_value[k] = NULL;
4331 4332 }
4332 4333 /*
4333 4334 * actually free the key (NULL destructor == unused)
4334 4335 */
4335 4336 vsd_destructor[k] = NULL;
4336 4337 }
4337 4338
4338 4339 mutex_exit(&vsd_lock);
4339 4340 }
4340 4341
4341 4342 /*
4342 4343 * Quickly return the per vnode value that was stored with the specified key
4343 4344 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4344 4345 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4345 4346 */
4346 4347 void *
4347 4348 vsd_get(vnode_t *vp, uint_t key)
4348 4349 {
4349 4350 struct vsd_node *vsd;
4350 4351
4351 4352 ASSERT(vp != NULL);
4352 4353 ASSERT(mutex_owned(&vp->v_vsd_lock));
4353 4354
4354 4355 vsd = vp->v_vsd;
4355 4356
4356 4357 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4357 4358 return (vsd->vs_value[key - 1]);
4358 4359 return (NULL);
4359 4360 }
4360 4361
4361 4362 /*
4362 4363 * Set a per vnode value indexed with the specified key
4363 4364 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4364 4365 */
4365 4366 int
4366 4367 vsd_set(vnode_t *vp, uint_t key, void *value)
4367 4368 {
4368 4369 struct vsd_node *vsd;
4369 4370
4370 4371 ASSERT(vp != NULL);
4371 4372 ASSERT(mutex_owned(&vp->v_vsd_lock));
4372 4373
4373 4374 if (key == 0)
4374 4375 return (EINVAL);
4375 4376
4376 4377 vsd = vp->v_vsd;
4377 4378 if (vsd == NULL)
4378 4379 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4379 4380
4380 4381 /*
4381 4382 	 * If the vsd was just allocated, vs_nkeys will be 0, so the check
4382 4383 	 * below will fail and we will fall through to allocate space for
4383 4384 	 * the vs_value array.
4384 4385 * If the caller is replacing one value with another, then it is up
4385 4386 * to the caller to free/rele/destroy the previous value (if needed).
4386 4387 */
4387 4388 if (key <= vsd->vs_nkeys) {
4388 4389 vsd->vs_value[key - 1] = value;
4389 4390 return (0);
4390 4391 }
4391 4392
4392 4393 ASSERT(key <= vsd_nkeys);
4393 4394
4394 4395 if (vsd->vs_nkeys == 0) {
4395 4396 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4396 4397 /*
4397 4398 * Link onto list of all VSD nodes.
4398 4399 */
4399 4400 list_insert_head(vsd_list, vsd);
4400 4401 mutex_exit(&vsd_lock);
4401 4402 }
4402 4403
4403 4404 /*
4404 4405 * Allocate vnode local storage and set the value for key
4405 4406 */
4406 4407 vsd->vs_value = vsd_realloc(vsd->vs_value,
4407 4408 vsd->vs_nkeys * sizeof (void *),
4408 4409 key * sizeof (void *));
4409 4410 vsd->vs_nkeys = key;
4410 4411 vsd->vs_value[key - 1] = value;
4411 4412
4412 4413 return (0);
4413 4414 }
4414 4415
4415 4416 /*
4416 4417 * Called from vn_free() to run the destructor function for each vsd
4417 4418 * Locks out vsd_create and vsd_destroy
4418 4419 * Assumes that the destructor *DOES NOT* use vsd
4419 4420 */
4420 4421 void
4421 4422 vsd_free(vnode_t *vp)
4422 4423 {
4423 4424 int i;
4424 4425 struct vsd_node *vsd = vp->v_vsd;
4425 4426
4426 4427 if (vsd == NULL)
4427 4428 return;
4428 4429
4429 4430 if (vsd->vs_nkeys == 0) {
4430 4431 kmem_free(vsd, sizeof (*vsd));
4431 4432 vp->v_vsd = NULL;
4432 4433 return;
4433 4434 }
4434 4435
4435 4436 /*
4436 4437 * lock out vsd_create and vsd_destroy, call
4437 4438 * the destructor, and mark the value as destroyed.
4438 4439 */
4439 4440 mutex_enter(&vsd_lock);
4440 4441
4441 4442 for (i = 0; i < vsd->vs_nkeys; i++) {
4442 4443 if (vsd->vs_value[i] && vsd_destructor[i])
4443 4444 (*vsd_destructor[i])(vsd->vs_value[i]);
4444 4445 vsd->vs_value[i] = NULL;
4445 4446 }
4446 4447
4447 4448 /*
4448 4449 * remove from linked list of VSD nodes
4449 4450 */
4450 4451 list_remove(vsd_list, vsd);
4451 4452
4452 4453 mutex_exit(&vsd_lock);
4453 4454
4454 4455 /*
4455 4456 * free up the VSD
4456 4457 */
4457 4458 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4458 4459 kmem_free(vsd, sizeof (struct vsd_node));
4459 4460 vp->v_vsd = NULL;
4460 4461 }
4461 4462
4462 4463 /*
4463 4464 * realloc
4464 4465 */
4465 4466 static void *
4466 4467 vsd_realloc(void *old, size_t osize, size_t nsize)
4467 4468 {
4468 4469 void *new;
4469 4470
4470 4471 new = kmem_zalloc(nsize, KM_SLEEP);
4471 4472 if (old) {
4472 4473 bcopy(old, new, osize);
4473 4474 kmem_free(old, osize);
4474 4475 }
4475 4476 return (new);
4476 4477 }
4477 4478
4478 4479 /*
4479 4480  * Set up the extensible system attribute for creating a reparse point.
4480 4481  * The symlink data 'target' is validated for proper reparse-string
4481 4482  * format, and a check is also made to ensure that the symlink data
4482 4483  * does not point to an existing file.
4483 4484  *
4484 4485  * Return 0 if OK; otherwise -1.
4485 4486 */
4486 4487 static int
4487 4488 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4488 4489 {
4489 4490 xoptattr_t *xoap;
4490 4491
4491 4492 if ((!target) || (!vap) || (!xvattr))
4492 4493 return (-1);
4493 4494
4494 4495 /* validate reparse string */
4495 4496 if (reparse_validate((const char *)target))
4496 4497 return (-1);
4497 4498
4498 4499 xva_init(xvattr);
4499 4500 xvattr->xva_vattr = *vap;
4500 4501 xvattr->xva_vattr.va_mask |= AT_XVATTR;
4501 4502 xoap = xva_getxoptattr(xvattr);
4502 4503 ASSERT(xoap);
4503 4504 XVA_SET_REQ(xvattr, XAT_REPARSE);
4504 4505 xoap->xoa_reparse = 1;
4505 4506
4506 4507 return (0);
4507 4508 }
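
/*
 * Editorial note: FS_REPARSE_TAG_STR introduces reparse data of roughly
 * the form below (the service type and payload are illustrative only):
 *
 *	@{REPARSE@{svctype:service-specific-data}}
 *
 * reparse_validate() rejects symlink targets that merely resemble this
 * syntax but do not parse.
 */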
4508 4509
4509 4510 /*
4510 4511 * Function to check whether a symlink is a reparse point.
4511 4512 * Return B_TRUE if it is a reparse point, else return B_FALSE
4512 4513 */
4513 4514 boolean_t
4514 4515 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4515 4516 {
4516 4517 xvattr_t xvattr;
4517 4518 xoptattr_t *xoap;
4518 4519
4519 4520 if ((vp->v_type != VLNK) ||
4520 4521 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4521 4522 return (B_FALSE);
4522 4523
4523 4524 xva_init(&xvattr);
4524 4525 xoap = xva_getxoptattr(&xvattr);
4525 4526 ASSERT(xoap);
4526 4527 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4527 4528
4528 4529 if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4529 4530 return (B_FALSE);
4530 4531
4531 4532 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4532 4533 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4533 4534 return (B_FALSE);
4534 4535
4535 4536 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4536 4537 }
(1279 lines elided)