1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2018, Joyent, Inc. 25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 27 */ 28 29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 30 /* All Rights Reserved */ 31 32 /* 33 * University Copyright- Copyright (c) 1982, 1986, 1988 34 * The Regents of the University of California 35 * All Rights Reserved 36 * 37 * University Acknowledgment- Portions of this document are derived from 38 * software developed by the University of California, Berkeley, and its 39 * contributors. 
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/pathname.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/rwstlock.h>
#include <sys/fem.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/fcntl.h>
#include <fs/fs_subr.h>
#include <sys/taskq.h>
#include <fs/fs_reparse.h>
#include <sys/time.h>
#include <sys/sdt.h>

/*
 * Determine if this vnode is a file that is read-only.
 *
 * Character devices, block devices and FIFOs never qualify; for every other
 * vnode type the answer is whatever vn_is_readonly() reports.  Note that the
 * argument is evaluated more than once, so it must not have side effects.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))

/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/* Global used for empty/invalid v_path */
char *vn_vpath_empty = "";

/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);

/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	 /* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void		(**vsd_destructor)(void *);

/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 *
 * NOTE(review): nothing named vsanchor_fstype is defined in this part of the
 * file and the macros below never use an fstype index; the per-fstype
 * statistics are reached through vfsp->vfs_fstypevsp.  The rundown presumably
 * predates the vopstats_fstype[] array declared above -- confirm and update.
 *
 * What the macros do, in order:
 *   - nothing at all unless the vnode has a vfs with a non-NULL vfs_implp,
 *     VFS_STATS is set in vfs_flag, and the vnode type is not VBAD;
 *   - fire the __dtrace_probe___fsinfo_<counter> probe with the vnode, the
 *     byte count (0 for VOPSTATS_UPDATE) and the address of the per-vfs
 *     counter, before that counter is incremented;
 *   - bump the per-vfs counter (and byte counter for the _IO variant);
 *   - if vfs_fstypevsp is non-NULL, bump the same counter(s) there too.
 *
 * Usage hazards: each macro expands to a bare brace block (not a
 * do { } while (0)), declares locals named vfsp, vsp and stataddr inside
 * that block, and evaluates vp (and bytesval) more than once.  Arguments
 * must therefore be side-effect free and must not themselves refer to
 * identifiers with those names.
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}

/*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
/*
 * Usage notes for VOPXID_MAP_CR(): the macro assigns to its second argument,
 * so cr must be a modifiable lvalue in the caller.  It expands to a bare
 * brace block that declares a local named vfsp, and it leaves cr untouched
 * when the vnode has no vfs or when VFS_XID is set in vfs_flag.
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}

/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 *
 * iftovt_tab has 16 entries; presumably it is indexed by the S_IFMT bits of
 * a mode shifted down to the range 0..15 -- confirm against the macro that
 * consumes it.  vttoif_tab has 12 entries and is presumably indexed by
 * enum vtype -- confirm that the enum has not grown past the table.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};

/*
 * The system vnode cache.
 */

kmem_cache_t *vn_cache;


/*
 * Vnode operations vector.
 *
 * The initializer is flat (no inner braces): every entry consists of four
 * consecutive values -- the operation name, the offset of the matching
 * member inside struct vnodeops, and two function pointers.  The list is
 * terminated by an all-NULL/zero entry.
 *
 * NOTE(review): the two function pointers look like "implementation used
 * when a file system supplies none" followed by "implementation used when
 * the operation is explicitly marked as an error"; the "no errors allowed"
 * remarks sit on entries where both are the same non-failing routine.
 * Confirm against the definition of fs_operation_trans_def_t.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p) fs_rwunlock,
	    (fs_generic_func_p) fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p) fs_dispose,
	    (fs_generic_func_p) fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	NULL, 0, NULL, NULL
};

/* Extensible attribute (xva) routines. */

/*
 * Zero out the structure, set the size of the requested/returned bitmaps,
 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 * to the returned attributes array.
 *
 * The magic number (XVA_MAGIC) is stamped as well.  Any previous contents
 * of *xvap are lost, so call this before filling in requested attributes.
 */
void
xva_init(xvattr_t *xvap)
{
	bzero(xvap, sizeof (xvattr_t));
	xvap->xva_mapsize = XVA_MAPSIZE;
	xvap->xva_magic = XVA_MAGIC;
	xvap->xva_vattr.va_mask = AT_XVATTR;
	/* point at the first element of the embedded returned-attribute map */
	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
}

/*
 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 * structure.  Otherwise, returns NULL.
402 */ 403 xoptattr_t * 404 xva_getxoptattr(xvattr_t *xvap) 405 { 406 xoptattr_t *xoap = NULL; 407 if (xvap->xva_vattr.va_mask & AT_XVATTR) 408 xoap = &xvap->xva_xoptattrs; 409 return (xoap); 410 } 411 412 /* 413 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree. 414 * We use the f_fsid reported by VFS_STATVFS() since we use that for the 415 * kstat name. 416 */ 417 static int 418 vska_compar(const void *n1, const void *n2) 419 { 420 int ret; 421 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid; 422 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid; 423 424 if (p1 < p2) { 425 ret = -1; 426 } else if (p1 > p2) { 427 ret = 1; 428 } else { 429 ret = 0; 430 } 431 432 return (ret); 433 } 434 435 /* 436 * Used to create a single template which will be bcopy()ed to a newly 437 * allocated vsanchor_combo_t structure in new_vsanchor(), below. 438 */ 439 static vopstats_t * 440 create_vopstats_template() 441 { 442 vopstats_t *vsp; 443 444 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP); 445 bzero(vsp, sizeof (*vsp)); /* Start fresh */ 446 447 /* VOP_OPEN */ 448 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64); 449 /* VOP_CLOSE */ 450 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64); 451 /* VOP_READ I/O */ 452 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64); 453 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64); 454 /* VOP_WRITE I/O */ 455 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64); 456 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64); 457 /* VOP_IOCTL */ 458 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64); 459 /* VOP_SETFL */ 460 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64); 461 /* VOP_GETATTR */ 462 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64); 463 /* VOP_SETATTR */ 464 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64); 465 /* VOP_ACCESS */ 466 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64); 467 /* 
VOP_LOOKUP */ 468 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64); 469 /* VOP_CREATE */ 470 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64); 471 /* VOP_REMOVE */ 472 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64); 473 /* VOP_LINK */ 474 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64); 475 /* VOP_RENAME */ 476 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64); 477 /* VOP_MKDIR */ 478 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64); 479 /* VOP_RMDIR */ 480 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64); 481 /* VOP_READDIR I/O */ 482 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64); 483 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes", 484 KSTAT_DATA_UINT64); 485 /* VOP_SYMLINK */ 486 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64); 487 /* VOP_READLINK */ 488 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64); 489 /* VOP_FSYNC */ 490 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64); 491 /* VOP_INACTIVE */ 492 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64); 493 /* VOP_FID */ 494 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64); 495 /* VOP_RWLOCK */ 496 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64); 497 /* VOP_RWUNLOCK */ 498 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64); 499 /* VOP_SEEK */ 500 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64); 501 /* VOP_CMP */ 502 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64); 503 /* VOP_FRLOCK */ 504 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64); 505 /* VOP_SPACE */ 506 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64); 507 /* VOP_REALVP */ 508 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64); 509 /* VOP_GETPAGE */ 510 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64); 511 /* VOP_PUTPAGE */ 512 kstat_named_init(&vsp->nputpage, "nputpage", 
KSTAT_DATA_UINT64); 513 /* VOP_MAP */ 514 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64); 515 /* VOP_ADDMAP */ 516 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64); 517 /* VOP_DELMAP */ 518 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64); 519 /* VOP_POLL */ 520 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64); 521 /* VOP_DUMP */ 522 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64); 523 /* VOP_PATHCONF */ 524 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64); 525 /* VOP_PAGEIO */ 526 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64); 527 /* VOP_DUMPCTL */ 528 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64); 529 /* VOP_DISPOSE */ 530 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64); 531 /* VOP_SETSECATTR */ 532 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64); 533 /* VOP_GETSECATTR */ 534 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64); 535 /* VOP_SHRLOCK */ 536 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); 537 /* VOP_VNEVENT */ 538 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); 539 /* VOP_REQZCBUF */ 540 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); 541 /* VOP_RETZCBUF */ 542 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64); 543 544 return (vsp); 545 } 546 547 /* 548 * Creates a kstat structure associated with a vopstats structure. 
549 */ 550 kstat_t * 551 new_vskstat(char *ksname, vopstats_t *vsp) 552 { 553 kstat_t *ksp; 554 555 if (!vopstats_enabled) { 556 return (NULL); 557 } 558 559 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED, 560 sizeof (vopstats_t)/sizeof (kstat_named_t), 561 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); 562 if (ksp) { 563 ksp->ks_data = vsp; 564 kstat_install(ksp); 565 } 566 567 return (ksp); 568 } 569 570 /* 571 * Called from vfsinit() to initialize the support mechanisms for vopstats 572 */ 573 void 574 vopstats_startup() 575 { 576 if (!vopstats_enabled) 577 return; 578 579 /* 580 * Creates the AVL tree which holds per-vfs vopstat anchors. This 581 * is necessary since we need to check if a kstat exists before we 582 * attempt to create it. Also, initialize its lock. 583 */ 584 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t), 585 offsetof(vsk_anchor_t, vsk_node)); 586 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL); 587 588 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache", 589 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL, 590 NULL, NULL, 0); 591 592 /* 593 * Set up the array of pointers for the vopstats-by-FS-type. 594 * The entries will be allocated/initialized as each file system 595 * goes through modload/mod_installfs. 596 */ 597 vopstats_fstype = (vopstats_t **)kmem_zalloc( 598 (sizeof (vopstats_t *) * nfstype), KM_SLEEP); 599 600 /* Set up the global vopstats initialization template */ 601 vs_templatep = create_vopstats_template(); 602 } 603 604 /* 605 * We need to have the all of the counters zeroed. 606 * The initialization of the vopstats_t includes on the order of 607 * 50 calls to kstat_named_init(). Rather that do that on every call, 608 * we do it once in a template (vs_templatep) then bcopy it over. 
609 */ 610 void 611 initialize_vopstats(vopstats_t *vsp) 612 { 613 if (vsp == NULL) 614 return; 615 616 bcopy(vs_templatep, vsp, sizeof (vopstats_t)); 617 } 618 619 /* 620 * If possible, determine which vopstats by fstype to use and 621 * return a pointer to the caller. 622 */ 623 vopstats_t * 624 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp) 625 { 626 int fstype = 0; /* Index into vfssw[] */ 627 vopstats_t *vsp = NULL; 628 629 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || 630 !vopstats_enabled) 631 return (NULL); 632 /* 633 * Set up the fstype. We go to so much trouble because all versions 634 * of NFS use the same fstype in their vfs even though they have 635 * distinct entries in the vfssw[] table. 636 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry. 637 */ 638 if (vswp) { 639 fstype = vswp - vfssw; /* Gets us the index */ 640 } else { 641 fstype = vfsp->vfs_fstype; 642 } 643 644 /* 645 * Point to the per-fstype vopstats. The only valid values are 646 * non-zero positive values less than the number of vfssw[] table 647 * entries. 648 */ 649 if (fstype > 0 && fstype < nfstype) { 650 vsp = vopstats_fstype[fstype]; 651 } 652 653 return (vsp); 654 } 655 656 /* 657 * Generate a kstat name, create the kstat structure, and allocate a 658 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t 659 * to the caller. This must only be called from a mount. 
660 */ 661 vsk_anchor_t * 662 get_vskstat_anchor(vfs_t *vfsp) 663 { 664 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */ 665 statvfs64_t statvfsbuf; /* Needed to find f_fsid */ 666 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */ 667 kstat_t *ksp; /* Ptr to new kstat */ 668 avl_index_t where; /* Location in the AVL tree */ 669 670 if (vfsp == NULL || vfsp->vfs_implp == NULL || 671 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) 672 return (NULL); 673 674 /* Need to get the fsid to build a kstat name */ 675 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) { 676 /* Create a name for our kstats based on fsid */ 677 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx", 678 VOPSTATS_STR, statvfsbuf.f_fsid); 679 680 /* Allocate and initialize the vsk_anchor_t */ 681 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP); 682 bzero(vskp, sizeof (*vskp)); 683 vskp->vsk_fsid = statvfsbuf.f_fsid; 684 685 mutex_enter(&vskstat_tree_lock); 686 if (avl_find(&vskstat_tree, vskp, &where) == NULL) { 687 avl_insert(&vskstat_tree, vskp, where); 688 mutex_exit(&vskstat_tree_lock); 689 690 /* 691 * Now that we've got the anchor in the AVL 692 * tree, we can create the kstat. 693 */ 694 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats); 695 if (ksp) { 696 vskp->vsk_ksp = ksp; 697 } 698 } else { 699 /* Oops, found one! Release memory and lock. */ 700 mutex_exit(&vskstat_tree_lock); 701 kmem_cache_free(vsk_anchor_cache, vskp); 702 vskp = NULL; 703 } 704 } 705 return (vskp); 706 } 707 708 /* 709 * We're in the process of tearing down the vfs and need to cleanup 710 * the data structures associated with the vopstats. Must only be called 711 * from dounmount(). 
712 */ 713 void 714 teardown_vopstats(vfs_t *vfsp) 715 { 716 vsk_anchor_t *vskap; 717 avl_index_t where; 718 719 if (vfsp == NULL || vfsp->vfs_implp == NULL || 720 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) 721 return; 722 723 /* This is a safe check since VFS_STATS must be set (see above) */ 724 if ((vskap = vfsp->vfs_vskap) == NULL) 725 return; 726 727 /* Whack the pointer right away */ 728 vfsp->vfs_vskap = NULL; 729 730 /* Lock the tree, remove the node, and delete the kstat */ 731 mutex_enter(&vskstat_tree_lock); 732 if (avl_find(&vskstat_tree, vskap, &where)) { 733 avl_remove(&vskstat_tree, vskap); 734 } 735 736 if (vskap->vsk_ksp) { 737 kstat_delete(vskap->vsk_ksp); 738 } 739 mutex_exit(&vskstat_tree_lock); 740 741 kmem_cache_free(vsk_anchor_cache, vskap); 742 } 743 744 /* 745 * Read or write a vnode. Called from kernel code. 746 */ 747 int 748 vn_rdwr( 749 enum uio_rw rw, 750 struct vnode *vp, 751 caddr_t base, 752 ssize_t len, 753 offset_t offset, 754 enum uio_seg seg, 755 int ioflag, 756 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ 757 cred_t *cr, 758 ssize_t *residp) 759 { 760 struct uio uio; 761 struct iovec iov; 762 int error; 763 int in_crit = 0; 764 765 if (rw == UIO_WRITE && ISROFILE(vp)) 766 return (EROFS); 767 768 if (len < 0) 769 return (EIO); 770 771 VOPXID_MAP_CR(vp, cr); 772 773 iov.iov_base = base; 774 iov.iov_len = len; 775 uio.uio_iov = &iov; 776 uio.uio_iovcnt = 1; 777 uio.uio_loffset = offset; 778 uio.uio_segflg = (short)seg; 779 uio.uio_resid = len; 780 uio.uio_llimit = ulimit; 781 782 /* 783 * We have to enter the critical region before calling VOP_RWLOCK 784 * to avoid a deadlock with ufs. 785 */ 786 if (nbl_need_check(vp)) { 787 int svmand; 788 789 nbl_start_crit(vp, RW_READER); 790 in_crit = 1; 791 error = nbl_svmand(vp, cr, &svmand); 792 if (error != 0) 793 goto done; 794 if (nbl_conflict(vp, rw == UIO_WRITE ? 
NBL_WRITE : NBL_READ, 795 uio.uio_offset, uio.uio_resid, svmand, NULL)) { 796 error = EACCES; 797 goto done; 798 } 799 } 800 801 (void) VOP_RWLOCK(vp, 802 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); 803 if (rw == UIO_WRITE) { 804 uio.uio_fmode = FWRITE; 805 uio.uio_extflg = UIO_COPY_DEFAULT; 806 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL); 807 } else { 808 uio.uio_fmode = FREAD; 809 uio.uio_extflg = UIO_COPY_CACHED; 810 error = VOP_READ(vp, &uio, ioflag, cr, NULL); 811 } 812 VOP_RWUNLOCK(vp, 813 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); 814 if (residp) 815 *residp = uio.uio_resid; 816 else if (uio.uio_resid) 817 error = EIO; 818 819 done: 820 if (in_crit) 821 nbl_end_crit(vp); 822 return (error); 823 } 824 825 /* 826 * Release a vnode. Call VOP_INACTIVE on last reference or 827 * decrement reference count. 828 * 829 * To avoid race conditions, the v_count is left at 1 for 830 * the call to VOP_INACTIVE. This prevents another thread 831 * from reclaiming and releasing the vnode *before* the 832 * VOP_INACTIVE routine has a chance to destroy the vnode. 833 * We can't have more than 1 thread calling VOP_INACTIVE 834 * on a vnode. 835 */ 836 void 837 vn_rele(vnode_t *vp) 838 { 839 VERIFY(vp->v_count > 0); 840 mutex_enter(&vp->v_lock); 841 if (vp->v_count == 1) { 842 mutex_exit(&vp->v_lock); 843 VOP_INACTIVE(vp, CRED(), NULL); 844 return; 845 } 846 VN_RELE_LOCKED(vp); 847 mutex_exit(&vp->v_lock); 848 } 849 850 /* 851 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated 852 * as a single reference, so v_count is not decremented until the last DNLC hold 853 * is released. This makes it possible to distinguish vnodes that are referenced 854 * only by the DNLC. 
855 */ 856 void 857 vn_rele_dnlc(vnode_t *vp) 858 { 859 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); 860 mutex_enter(&vp->v_lock); 861 if (--vp->v_count_dnlc == 0) { 862 if (vp->v_count == 1) { 863 mutex_exit(&vp->v_lock); 864 VOP_INACTIVE(vp, CRED(), NULL); 865 return; 866 } 867 VN_RELE_LOCKED(vp); 868 } 869 mutex_exit(&vp->v_lock); 870 } 871 872 /* 873 * Like vn_rele() except that it clears v_stream under v_lock. 874 * This is used by sockfs when it dismantles the association between 875 * the sockfs node and the vnode in the underlying file system. 876 * v_lock has to be held to prevent a thread coming through the lookupname 877 * path from accessing a stream head that is going away. 878 */ 879 void 880 vn_rele_stream(vnode_t *vp) 881 { 882 VERIFY(vp->v_count > 0); 883 mutex_enter(&vp->v_lock); 884 vp->v_stream = NULL; 885 if (vp->v_count == 1) { 886 mutex_exit(&vp->v_lock); 887 VOP_INACTIVE(vp, CRED(), NULL); 888 return; 889 } 890 VN_RELE_LOCKED(vp); 891 mutex_exit(&vp->v_lock); 892 } 893 894 static void 895 vn_rele_inactive(vnode_t *vp) 896 { 897 VOP_INACTIVE(vp, CRED(), NULL); 898 } 899 900 /* 901 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it 902 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering 903 * the file system as a result of releasing the vnode. Note, file systems 904 * already have to handle the race where the vnode is incremented before the 905 * inactive routine is called and does its locking. 906 * 907 * Warning: Excessive use of this routine can lead to performance problems. 908 * This is because taskqs throttle back allocation if too many are created. 
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Ours is the only remaining hold.  Drop v_lock and hand the
		 * vnode to the taskq, which will call vn_rele_inactive() on
		 * it.  The VERIFY fires if taskq_dispatch() returns NULL.
		 */
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != NULL);
		return;
	}
	/* Other holds exist; release ours while v_lock is still held. */
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}

/*
 * Open/create a vnode by path name.
 * Convenience wrapper around vn_openat() that supplies a NULL starting
 * vnode and -1 as the file descriptor; all other arguments are passed
 * through unchanged.
 */
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}


/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 *
 * Arguments:
 *	pnamep, seg	path name and its uio segment; handed to
 *			lookupnameat() or vn_createat()
 *	filemode	F* open flags (FREAD, FWRITE, FCREAT, FTRUNC, ...)
 *	createmode	becomes va_mode of the new file when FCREAT is set
 *	vpp		receives the held, opened vnode on success
 *	crwhy, umask	passed through to vn_createat() when FCREAT is set
 *	startvp		starting vnode handed to the lookup/create routines
 *	fd		recorded as the owner id of the nbmand share
 *			reservation (shr_own.sl_id)
 *
 * Returns 0 with *vpp set, or an errno value.  On failure any open,
 * share reservation and hold taken here are undone before returning.
 * An ESTALE failure restarts the whole operation for as long as
 * fs_need_estale_retry() permits.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;

	/* Translate the F* open flags into V* access-check bits. */
	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT) {
		enum vcexcl excl;

		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		/*
		 * FTRUNC and FEXCL are conveyed via vattr and excl, so they
		 * are masked out of the flag argument.
		 */
		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
				/*
				 * Large File API - a regular open (FOFFMAX
				 * flag not set in file mode) fails when the
				 * file is larger than MAXOFF32_T.
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				/* Examine locks on the real vnode, if any. */
				if (VOP_REALVP(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = VOP_GETATTR(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			/* Either the getattr failed or EAGAIN was set. */
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
			goto out;
		/*
		 * Require FSEARCH to return a directory.
		 * Require FEXEC to return a regular file.
		 */
		if ((filemode & FSEARCH) && vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses VOP_REALVP to distinguish between
	 * an unopened namefs node (where VOP_REALVP returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where VOP_REALVP would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = VOP_REALVP(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		if (open_done) {
			/*
			 * Undo the open.  shrlock_done is cleared as well, so
			 * the explicit F_UNSHARE below is skipped on this
			 * path.
			 */
			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}

/*
 * The following two accessor functions are for the NFSv4 server.  Since there
 * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
 * vnode open counts correct when a client "upgrades" an open or does an
 * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
 * open mode (add or subtract read or write), but also change the share/deny
 * modes.  However, share reservations are not integrated with OPEN, yet, so
 * we need to handle each separately.
 * These functions are cleaner than having
 * the NFS server manipulate the counts directly, however, nobody else should
 * use these functions.
 */
/* Add one to v_rdcnt and/or v_wrcnt according to FREAD/FWRITE in filemode. */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);

}

/*
 * Subtract one from v_rdcnt and/or v_wrcnt according to FREAD/FWRITE in
 * filemode.  The affected count is asserted to be non-zero beforehand.
 */
void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}

}

/*
 * Create a vnode by path name.  Convenience wrapper around vn_createat()
 * that supplies a NULL starting vnode.
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}

/*
 * Create a vnode (makenode).
 *
 * Arguments:
 *	pnamep, seg	path name of the object and its uio segment
 *	vap		attributes for the new object; va_mask must include
 *			AT_TYPE and AT_MODE (asserted).  va_mode may be
 *			modified here (VSVTX stripped, umask applied).
 *	excl		EXCL or NONEXCL; EXCL also forces NO_FOLLOW lookup
 *	mode		V* access mode, passed to VOP_CREATE()
 *	vpp		receives the resulting vnode; set to NULL on entry
 *	why		CRMKDIR selects VOP_MKDIR(), otherwise VOP_CREATE();
 *			anything but CRMKNOD has VSVTX cleared from va_mode
 *	flag		open flags; FNOFOLLOW and FNOLINKS are consumed here
 *	umask		if non-zero, masked out of va_mode unless the parent
 *			directory reports default ACL entries
 *	startvp		starting vnode handed to lookuppnat()
 *
 * Returns 0 or an errno value.  An ESTALE failure restarts the operation
 * for as long as fs_need_estale_retry() permits.
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {

		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			/*
			 * vp is still NULL here, so the hold on an existing
			 * target must be dropped by hand before bailing out.
			 */
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * VOP_GETSECATTR() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *		it is being open with write access &&
	 *		the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (VOP_REALVP(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				u_offset_t offset;
				ssize_t length;

				/*
				 * The affected range runs from the smaller of
				 * the old and new sizes to the larger.
				 */
				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply VOP_CREATE to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to VOP_CREATE() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to VOP_MKDIR().  VOP_CREATE()
			 * will already get it via "flag"
			 */
			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = VOP_CREATE(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	/* Drop the hold on the pre-existing target found by the lookup. */
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Link "to" to the existing object "from".  Convenience wrapper around
 * vn_linkat() that supplies NULL starting vnodes and NO_FOLLOW.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}

/*
 * Link the name "to" (looked up starting at tstartvp) to the existing
 * object "from" (looked up starting at fstartvp, with symlink handling
 * given by "follow") via VOP_LINK().
 *
 * Returns 0 or an errno value: EXDEV if the source and the target
 * directory report different fsids, EROFS if the target directory's vfs
 * is read-only, otherwise whatever the lookups, VOP_GETATTR() or
 * VOP_LINK() return.  An ESTALE failure restarts the operation for as
 * long as fs_need_estale_retry() permits.
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Rename "from" to "to".  Convenience wrapper around vn_renameat() that
 * supplies NULL starting directories.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}

/*
 * Rename fname (looked up starting at fdvp) to tname (looked up starting
 * at tdvp) via VOP_RENAME().
 *
 * Returns 0 or an errno value: ENOENT if the source does not exist,
 * EXDEV if the two containing directories report different fsids, EROFS
 * if the target directory's vfs is read-only, EBUSY if the source is a
 * directory that is the root of a vfs, EACCES on an nbmand conflict,
 * otherwise whatever the lookups, VOP_GETATTR() or VOP_RENAME() return.
 * An ESTALE failure restarts the operation for as long as
 * fs_need_estale_retry() permits.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/* An existing, distinct target would be removed by the rename. */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}

/*
 * Remove the file or directory named by fnamep, looked up starting at
 * startvp.  dirflag == RMDIRECTORY selects VOP_RMDIR() (and requires the
 * target to be a directory); any other value selects VOP_REMOVE().
 *
 * Returns 0 or an errno value.  An ESTALE failure restarts the operation
 * for as long as fs_need_estale_retry() permits.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		/* Swap our hold from the namefs root to the covered vnode. */
		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Remember the type now; vp may be released before it is needed. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * Take a hold on the process's current directory
			 * under p_lock and pass it to VOP_RMDIR().
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Utility function to compare equality of vnodes.
 * Compare the underlying real vnodes, if there are underlying vnodes.
1987 * This is a more thorough comparison than the VN_CMP() macro provides. 1988 */ 1989 int 1990 vn_compare(vnode_t *vp1, vnode_t *vp2) 1991 { 1992 vnode_t *realvp; 1993 1994 if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0) 1995 vp1 = realvp; 1996 if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0) 1997 vp2 = realvp; 1998 return (VN_CMP(vp1, vp2)); 1999 } 2000 2001 /* 2002 * The number of locks to hash into. This value must be a power 2003 * of 2 minus 1 and should probably also be prime. 2004 */ 2005 #define NUM_BUCKETS 1023 2006 2007 struct vn_vfslocks_bucket { 2008 kmutex_t vb_lock; 2009 vn_vfslocks_entry_t *vb_list; 2010 char pad[64 - sizeof (kmutex_t) - sizeof (void *)]; 2011 }; 2012 2013 /* 2014 * Total number of buckets will be NUM_BUCKETS + 1 . 2015 */ 2016 2017 #pragma align 64(vn_vfslocks_buckets) 2018 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1]; 2019 2020 #define VN_VFSLOCKS_SHIFT 9 2021 2022 #define VN_VFSLOCKS_HASH(vfsvpptr) \ 2023 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS) 2024 2025 /* 2026 * vn_vfslocks_getlock() uses an HASH scheme to generate 2027 * rwstlock using vfs/vnode pointer passed to it. 2028 * 2029 * vn_vfslocks_rele() releases a reference in the 2030 * HASH table which allows the entry allocated by 2031 * vn_vfslocks_getlock() to be freed at a later 2032 * stage when the refcount drops to zero. 
2033 */ 2034 2035 vn_vfslocks_entry_t * 2036 vn_vfslocks_getlock(void *vfsvpptr) 2037 { 2038 struct vn_vfslocks_bucket *bp; 2039 vn_vfslocks_entry_t *vep; 2040 vn_vfslocks_entry_t *tvep; 2041 2042 ASSERT(vfsvpptr != NULL); 2043 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)]; 2044 2045 mutex_enter(&bp->vb_lock); 2046 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) { 2047 if (vep->ve_vpvfs == vfsvpptr) { 2048 vep->ve_refcnt++; 2049 mutex_exit(&bp->vb_lock); 2050 return (vep); 2051 } 2052 } 2053 mutex_exit(&bp->vb_lock); 2054 vep = kmem_alloc(sizeof (*vep), KM_SLEEP); 2055 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL); 2056 vep->ve_vpvfs = (char *)vfsvpptr; 2057 vep->ve_refcnt = 1; 2058 mutex_enter(&bp->vb_lock); 2059 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) { 2060 if (tvep->ve_vpvfs == vfsvpptr) { 2061 tvep->ve_refcnt++; 2062 mutex_exit(&bp->vb_lock); 2063 2064 /* 2065 * There is already an entry in the hash 2066 * destroy what we just allocated. 2067 */ 2068 rwst_destroy(&vep->ve_lock); 2069 kmem_free(vep, sizeof (*vep)); 2070 return (tvep); 2071 } 2072 } 2073 vep->ve_next = bp->vb_list; 2074 bp->vb_list = vep; 2075 mutex_exit(&bp->vb_lock); 2076 return (vep); 2077 } 2078 2079 void 2080 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent) 2081 { 2082 struct vn_vfslocks_bucket *bp; 2083 vn_vfslocks_entry_t *vep; 2084 vn_vfslocks_entry_t *pvep; 2085 2086 ASSERT(vepent != NULL); 2087 ASSERT(vepent->ve_vpvfs != NULL); 2088 2089 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)]; 2090 2091 mutex_enter(&bp->vb_lock); 2092 vepent->ve_refcnt--; 2093 2094 if ((int32_t)vepent->ve_refcnt < 0) 2095 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative"); 2096 2097 if (vepent->ve_refcnt == 0) { 2098 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) { 2099 if (vep->ve_vpvfs == vepent->ve_vpvfs) { 2100 if (bp->vb_list == vep) 2101 bp->vb_list = vep->ve_next; 2102 else { 2103 /* LINTED */ 2104 pvep->ve_next = vep->ve_next; 2105 } 
2106 mutex_exit(&bp->vb_lock); 2107 rwst_destroy(&vep->ve_lock); 2108 kmem_free(vep, sizeof (*vep)); 2109 return; 2110 } 2111 pvep = vep; 2112 } 2113 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found"); 2114 } 2115 mutex_exit(&bp->vb_lock); 2116 } 2117 2118 /* 2119 * vn_vfswlock_wait is used to implement a lock which is logically a writers 2120 * lock protecting the v_vfsmountedhere field. 2121 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock, 2122 * except that it blocks to acquire the lock VVFSLOCK. 2123 * 2124 * traverse() and routines re-implementing part of traverse (e.g. autofs) 2125 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on 2126 * need the non-blocking version of the writers lock i.e. vn_vfswlock 2127 */ 2128 int 2129 vn_vfswlock_wait(vnode_t *vp) 2130 { 2131 int retval; 2132 vn_vfslocks_entry_t *vpvfsentry; 2133 ASSERT(vp != NULL); 2134 2135 vpvfsentry = vn_vfslocks_getlock(vp); 2136 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER); 2137 2138 if (retval == EINTR) { 2139 vn_vfslocks_rele(vpvfsentry); 2140 return (EINTR); 2141 } 2142 return (retval); 2143 } 2144 2145 int 2146 vn_vfsrlock_wait(vnode_t *vp) 2147 { 2148 int retval; 2149 vn_vfslocks_entry_t *vpvfsentry; 2150 ASSERT(vp != NULL); 2151 2152 vpvfsentry = vn_vfslocks_getlock(vp); 2153 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER); 2154 2155 if (retval == EINTR) { 2156 vn_vfslocks_rele(vpvfsentry); 2157 return (EINTR); 2158 } 2159 2160 return (retval); 2161 } 2162 2163 2164 /* 2165 * vn_vfswlock is used to implement a lock which is logically a writers lock 2166 * protecting the v_vfsmountedhere field. 2167 */ 2168 int 2169 vn_vfswlock(vnode_t *vp) 2170 { 2171 vn_vfslocks_entry_t *vpvfsentry; 2172 2173 /* 2174 * If vp is NULL then somebody is trying to lock the covered vnode 2175 * of /. (vfs_vnodecovered is NULL for /). This situation will 2176 * only happen when unmounting /. 
Since that operation will fail 2177 * anyway, return EBUSY here instead of in VFS_UNMOUNT. 2178 */ 2179 if (vp == NULL) 2180 return (EBUSY); 2181 2182 vpvfsentry = vn_vfslocks_getlock(vp); 2183 2184 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 2185 return (0); 2186 2187 vn_vfslocks_rele(vpvfsentry); 2188 return (EBUSY); 2189 } 2190 2191 int 2192 vn_vfsrlock(vnode_t *vp) 2193 { 2194 vn_vfslocks_entry_t *vpvfsentry; 2195 2196 /* 2197 * If vp is NULL then somebody is trying to lock the covered vnode 2198 * of /. (vfs_vnodecovered is NULL for /). This situation will 2199 * only happen when unmounting /. Since that operation will fail 2200 * anyway, return EBUSY here instead of in VFS_UNMOUNT. 2201 */ 2202 if (vp == NULL) 2203 return (EBUSY); 2204 2205 vpvfsentry = vn_vfslocks_getlock(vp); 2206 2207 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 2208 return (0); 2209 2210 vn_vfslocks_rele(vpvfsentry); 2211 return (EBUSY); 2212 } 2213 2214 void 2215 vn_vfsunlock(vnode_t *vp) 2216 { 2217 vn_vfslocks_entry_t *vpvfsentry; 2218 2219 /* 2220 * ve_refcnt needs to be decremented twice. 2221 * 1. To release refernce after a call to vn_vfslocks_getlock() 2222 * 2. To release the reference from the locking routines like 2223 * vn_vfsrlock/vn_vfswlock etc,. 
2224 */ 2225 vpvfsentry = vn_vfslocks_getlock(vp); 2226 vn_vfslocks_rele(vpvfsentry); 2227 2228 rwst_exit(&vpvfsentry->ve_lock); 2229 vn_vfslocks_rele(vpvfsentry); 2230 } 2231 2232 int 2233 vn_vfswlock_held(vnode_t *vp) 2234 { 2235 int held; 2236 vn_vfslocks_entry_t *vpvfsentry; 2237 2238 ASSERT(vp != NULL); 2239 2240 vpvfsentry = vn_vfslocks_getlock(vp); 2241 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 2242 2243 vn_vfslocks_rele(vpvfsentry); 2244 return (held); 2245 } 2246 2247 2248 int 2249 vn_make_ops( 2250 const char *name, /* Name of file system */ 2251 const fs_operation_def_t *templ, /* Operation specification */ 2252 vnodeops_t **actual) /* Return the vnodeops */ 2253 { 2254 int unused_ops; 2255 int error; 2256 2257 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP); 2258 2259 (*actual)->vnop_name = name; 2260 2261 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ); 2262 if (error) { 2263 kmem_free(*actual, sizeof (vnodeops_t)); 2264 } 2265 2266 #if DEBUG 2267 if (unused_ops != 0) 2268 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied " 2269 "but not used", name, unused_ops); 2270 #endif 2271 2272 return (error); 2273 } 2274 2275 /* 2276 * Free the vnodeops created as a result of vn_make_ops() 2277 */ 2278 void 2279 vn_freevnodeops(vnodeops_t *vnops) 2280 { 2281 kmem_free(vnops, sizeof (vnodeops_t)); 2282 } 2283 2284 /* 2285 * Vnode cache. 
2286 */ 2287 2288 /* ARGSUSED */ 2289 static int 2290 vn_cache_constructor(void *buf, void *cdrarg, int kmflags) 2291 { 2292 struct vnode *vp; 2293 2294 vp = buf; 2295 2296 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); 2297 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL); 2298 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); 2299 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); 2300 vp->v_femhead = NULL; /* Must be done before vn_reinit() */ 2301 vp->v_path = vn_vpath_empty; 2302 vp->v_path_stamp = 0; 2303 vp->v_mpssdata = NULL; 2304 vp->v_vsd = NULL; 2305 vp->v_fopdata = NULL; 2306 2307 return (0); 2308 } 2309 2310 /* ARGSUSED */ 2311 static void 2312 vn_cache_destructor(void *buf, void *cdrarg) 2313 { 2314 struct vnode *vp; 2315 2316 vp = buf; 2317 2318 rw_destroy(&vp->v_nbllock); 2319 cv_destroy(&vp->v_cv); 2320 mutex_destroy(&vp->v_vsd_lock); 2321 mutex_destroy(&vp->v_lock); 2322 } 2323 2324 void 2325 vn_create_cache(void) 2326 { 2327 /* LINTED */ 2328 ASSERT((1 << VNODE_ALIGN_LOG2) == 2329 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN)); 2330 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode), 2331 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL, 2332 NULL, 0); 2333 } 2334 2335 void 2336 vn_destroy_cache(void) 2337 { 2338 kmem_cache_destroy(vn_cache); 2339 } 2340 2341 /* 2342 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are 2343 * cached by the file system and vnodes remain associated. 2344 */ 2345 void 2346 vn_recycle(vnode_t *vp) 2347 { 2348 ASSERT(vp->v_pages == NULL); 2349 VERIFY(vp->v_path != NULL); 2350 2351 /* 2352 * XXX - This really belongs in vn_reinit(), but we have some issues 2353 * with the counts. Best to have it here for clean initialization. 
2354 */ 2355 vp->v_rdcnt = 0; 2356 vp->v_wrcnt = 0; 2357 vp->v_mmap_read = 0; 2358 vp->v_mmap_write = 0; 2359 2360 /* 2361 * If FEM was in use, make sure everything gets cleaned up 2362 * NOTE: vp->v_femhead is initialized to NULL in the vnode 2363 * constructor. 2364 */ 2365 if (vp->v_femhead) { 2366 /* XXX - There should be a free_femhead() that does all this */ 2367 ASSERT(vp->v_femhead->femh_list == NULL); 2368 mutex_destroy(&vp->v_femhead->femh_lock); 2369 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); 2370 vp->v_femhead = NULL; 2371 } 2372 if (vp->v_path != vn_vpath_empty) { 2373 kmem_free(vp->v_path, strlen(vp->v_path) + 1); 2374 vp->v_path = vn_vpath_empty; 2375 } 2376 vp->v_path_stamp = 0; 2377 2378 if (vp->v_fopdata != NULL) { 2379 free_fopdata(vp); 2380 } 2381 vp->v_mpssdata = NULL; 2382 vsd_free(vp); 2383 } 2384 2385 /* 2386 * Used to reset the vnode fields including those that are directly accessible 2387 * as well as those which require an accessor function. 2388 * 2389 * Does not initialize: 2390 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv 2391 * v_data (since FS-nodes and vnodes point to each other and should 2392 * be updated simultaneously) 2393 * v_op (in case someone needs to make a VOP call on this object) 2394 */ 2395 void 2396 vn_reinit(vnode_t *vp) 2397 { 2398 vp->v_count = 1; 2399 vp->v_count_dnlc = 0; 2400 vp->v_vfsp = NULL; 2401 vp->v_stream = NULL; 2402 vp->v_vfsmountedhere = NULL; 2403 vp->v_flag = 0; 2404 vp->v_type = VNON; 2405 vp->v_rdev = NODEV; 2406 2407 vp->v_filocks = NULL; 2408 vp->v_shrlocks = NULL; 2409 vp->v_pages = NULL; 2410 2411 vp->v_locality = NULL; 2412 vp->v_xattrdir = NULL; 2413 2414 /* 2415 * In a few specific instances, vn_reinit() is used to initialize 2416 * locally defined vnode_t instances. Lacking the construction offered 2417 * by vn_alloc(), these vnodes require v_path initialization. 
2418 */ 2419 if (vp->v_path == NULL) { 2420 vp->v_path = vn_vpath_empty; 2421 } 2422 2423 /* Handles v_femhead, v_path, and the r/w/map counts */ 2424 vn_recycle(vp); 2425 } 2426 2427 vnode_t * 2428 vn_alloc(int kmflag) 2429 { 2430 vnode_t *vp; 2431 2432 vp = kmem_cache_alloc(vn_cache, kmflag); 2433 2434 if (vp != NULL) { 2435 vp->v_femhead = NULL; /* Must be done before vn_reinit() */ 2436 vp->v_fopdata = NULL; 2437 vn_reinit(vp); 2438 } 2439 2440 return (vp); 2441 } 2442 2443 void 2444 vn_free(vnode_t *vp) 2445 { 2446 ASSERT(vp->v_shrlocks == NULL); 2447 ASSERT(vp->v_filocks == NULL); 2448 2449 /* 2450 * Some file systems call vn_free() with v_count of zero, 2451 * some with v_count of 1. In any case, the value should 2452 * never be anything else. 2453 */ 2454 ASSERT((vp->v_count == 0) || (vp->v_count == 1)); 2455 ASSERT(vp->v_count_dnlc == 0); 2456 VERIFY(vp->v_path != NULL); 2457 if (vp->v_path != vn_vpath_empty) { 2458 kmem_free(vp->v_path, strlen(vp->v_path) + 1); 2459 vp->v_path = vn_vpath_empty; 2460 } 2461 2462 /* If FEM was in use, make sure everything gets cleaned up */ 2463 if (vp->v_femhead) { 2464 /* XXX - There should be a free_femhead() that does all this */ 2465 ASSERT(vp->v_femhead->femh_list == NULL); 2466 mutex_destroy(&vp->v_femhead->femh_lock); 2467 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); 2468 vp->v_femhead = NULL; 2469 } 2470 2471 if (vp->v_fopdata != NULL) { 2472 free_fopdata(vp); 2473 } 2474 vp->v_mpssdata = NULL; 2475 vsd_free(vp); 2476 kmem_cache_free(vn_cache, vp); 2477 } 2478 2479 /* 2480 * vnode status changes, should define better states than 1, 0. 
2481 */ 2482 void 2483 vn_reclaim(vnode_t *vp) 2484 { 2485 vfs_t *vfsp = vp->v_vfsp; 2486 2487 if (vfsp == NULL || 2488 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2489 return; 2490 } 2491 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED); 2492 } 2493 2494 void 2495 vn_idle(vnode_t *vp) 2496 { 2497 vfs_t *vfsp = vp->v_vfsp; 2498 2499 if (vfsp == NULL || 2500 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2501 return; 2502 } 2503 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED); 2504 } 2505 void 2506 vn_exists(vnode_t *vp) 2507 { 2508 vfs_t *vfsp = vp->v_vfsp; 2509 2510 if (vfsp == NULL || 2511 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2512 return; 2513 } 2514 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS); 2515 } 2516 2517 void 2518 vn_invalid(vnode_t *vp) 2519 { 2520 vfs_t *vfsp = vp->v_vfsp; 2521 2522 if (vfsp == NULL || 2523 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2524 return; 2525 } 2526 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED); 2527 } 2528 2529 /* Vnode event notification */ 2530 2531 int 2532 vnevent_support(vnode_t *vp, caller_context_t *ct) 2533 { 2534 if (vp == NULL) 2535 return (EINVAL); 2536 2537 return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct)); 2538 } 2539 2540 void 2541 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) 2542 { 2543 if (vp == NULL || vp->v_femhead == NULL) { 2544 return; 2545 } 2546 (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); 2547 } 2548 2549 void 2550 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, 2551 caller_context_t *ct) 2552 { 2553 if (vp == NULL || vp->v_femhead == NULL) { 2554 return; 2555 } 2556 (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct); 2557 } 2558 2559 void 2560 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) 2561 { 2562 if (vp == NULL || vp->v_femhead == NULL) { 2563 return; 2564 } 2565 (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); 2566 } 2567 2568 void 2569 vnevent_remove(vnode_t *vp, 
vnode_t *dvp, char *name, caller_context_t *ct) 2570 { 2571 if (vp == NULL || vp->v_femhead == NULL) { 2572 return; 2573 } 2574 (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct); 2575 } 2576 2577 void 2578 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) 2579 { 2580 if (vp == NULL || vp->v_femhead == NULL) { 2581 return; 2582 } 2583 (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct); 2584 } 2585 2586 void 2587 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name, 2588 caller_context_t *ct) 2589 { 2590 if (vp == NULL || vp->v_femhead == NULL) { 2591 return; 2592 } 2593 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct); 2594 } 2595 2596 void 2597 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, 2598 caller_context_t *ct) 2599 { 2600 if (vp == NULL || vp->v_femhead == NULL) { 2601 return; 2602 } 2603 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct); 2604 } 2605 2606 void 2607 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, 2608 caller_context_t *ct) 2609 { 2610 if (vp == NULL || vp->v_femhead == NULL) { 2611 return; 2612 } 2613 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct); 2614 } 2615 2616 void 2617 vnevent_create(vnode_t *vp, caller_context_t *ct) 2618 { 2619 if (vp == NULL || vp->v_femhead == NULL) { 2620 return; 2621 } 2622 (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct); 2623 } 2624 2625 void 2626 vnevent_link(vnode_t *vp, caller_context_t *ct) 2627 { 2628 if (vp == NULL || vp->v_femhead == NULL) { 2629 return; 2630 } 2631 (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct); 2632 } 2633 2634 void 2635 vnevent_mountedover(vnode_t *vp, caller_context_t *ct) 2636 { 2637 if (vp == NULL || vp->v_femhead == NULL) { 2638 return; 2639 } 2640 (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct); 2641 } 2642 2643 void 2644 vnevent_truncate(vnode_t *vp, caller_context_t *ct) 2645 { 2646 if (vp == NULL || vp->v_femhead == NULL) { 2647 return; 2648 } 2649 (void) VOP_VNEVENT(vp, 
VE_TRUNCATE, NULL, NULL, ct); 2650 } 2651 2652 /* 2653 * Vnode accessors. 2654 */ 2655 2656 int 2657 vn_is_readonly(vnode_t *vp) 2658 { 2659 return (vp->v_vfsp->vfs_flag & VFS_RDONLY); 2660 } 2661 2662 int 2663 vn_has_flocks(vnode_t *vp) 2664 { 2665 return (vp->v_filocks != NULL); 2666 } 2667 2668 int 2669 vn_has_mandatory_locks(vnode_t *vp, int mode) 2670 { 2671 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode))); 2672 } 2673 2674 int 2675 vn_has_cached_data(vnode_t *vp) 2676 { 2677 return (vp->v_pages != NULL); 2678 } 2679 2680 /* 2681 * Return 0 if the vnode in question shouldn't be permitted into a zone via 2682 * zone_enter(2). 2683 */ 2684 int 2685 vn_can_change_zones(vnode_t *vp) 2686 { 2687 struct vfssw *vswp; 2688 int allow = 1; 2689 vnode_t *rvp; 2690 2691 if (nfs_global_client_only != 0) 2692 return (1); 2693 2694 /* 2695 * We always want to look at the underlying vnode if there is one. 2696 */ 2697 if (VOP_REALVP(vp, &rvp, NULL) != 0) 2698 rvp = vp; 2699 /* 2700 * Some pseudo filesystems (including doorfs) don't actually register 2701 * their vfsops_t, so the following may return NULL; we happily let 2702 * such vnodes switch zones. 2703 */ 2704 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp)); 2705 if (vswp != NULL) { 2706 if (vswp->vsw_flag & VSW_NOTZONESAFE) 2707 allow = 0; 2708 vfs_unrefvfssw(vswp); 2709 } 2710 return (allow); 2711 } 2712 2713 /* 2714 * Return nonzero if the vnode is a mount point, zero if not. 2715 */ 2716 int 2717 vn_ismntpt(vnode_t *vp) 2718 { 2719 return (vp->v_vfsmountedhere != NULL); 2720 } 2721 2722 /* Retrieve the vfs (if any) mounted on this vnode */ 2723 vfs_t * 2724 vn_mountedvfs(vnode_t *vp) 2725 { 2726 return (vp->v_vfsmountedhere); 2727 } 2728 2729 /* 2730 * Return nonzero if the vnode is referenced by the dnlc, zero if not. 
2731 */ 2732 int 2733 vn_in_dnlc(vnode_t *vp) 2734 { 2735 return (vp->v_count_dnlc > 0); 2736 } 2737 2738 /* 2739 * vn_has_other_opens() checks whether a particular file is opened by more than 2740 * just the caller and whether the open is for read and/or write. 2741 * This routine is for calling after the caller has already called VOP_OPEN() 2742 * and the caller wishes to know if they are the only one with it open for 2743 * the mode(s) specified. 2744 * 2745 * Vnode counts are only kept on regular files (v_type=VREG). 2746 */ 2747 int 2748 vn_has_other_opens( 2749 vnode_t *vp, 2750 v_mode_t mode) 2751 { 2752 2753 ASSERT(vp != NULL); 2754 2755 switch (mode) { 2756 case V_WRITE: 2757 if (vp->v_wrcnt > 1) 2758 return (V_TRUE); 2759 break; 2760 case V_RDORWR: 2761 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1)) 2762 return (V_TRUE); 2763 break; 2764 case V_RDANDWR: 2765 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1)) 2766 return (V_TRUE); 2767 break; 2768 case V_READ: 2769 if (vp->v_rdcnt > 1) 2770 return (V_TRUE); 2771 break; 2772 } 2773 2774 return (V_FALSE); 2775 } 2776 2777 /* 2778 * vn_is_opened() checks whether a particular file is opened and 2779 * whether the open is for read and/or write. 2780 * 2781 * Vnode counts are only kept on regular files (v_type=VREG). 2782 */ 2783 int 2784 vn_is_opened( 2785 vnode_t *vp, 2786 v_mode_t mode) 2787 { 2788 2789 ASSERT(vp != NULL); 2790 2791 switch (mode) { 2792 case V_WRITE: 2793 if (vp->v_wrcnt) 2794 return (V_TRUE); 2795 break; 2796 case V_RDANDWR: 2797 if (vp->v_rdcnt && vp->v_wrcnt) 2798 return (V_TRUE); 2799 break; 2800 case V_RDORWR: 2801 if (vp->v_rdcnt || vp->v_wrcnt) 2802 return (V_TRUE); 2803 break; 2804 case V_READ: 2805 if (vp->v_rdcnt) 2806 return (V_TRUE); 2807 break; 2808 } 2809 2810 return (V_FALSE); 2811 } 2812 2813 /* 2814 * vn_is_mapped() checks whether a particular file is mapped and whether 2815 * the file is mapped read and/or write. 
2816 */ 2817 int 2818 vn_is_mapped( 2819 vnode_t *vp, 2820 v_mode_t mode) 2821 { 2822 2823 ASSERT(vp != NULL); 2824 2825 #if !defined(_LP64) 2826 switch (mode) { 2827 /* 2828 * The atomic_add_64_nv functions force atomicity in the 2829 * case of 32 bit architectures. Otherwise the 64 bit values 2830 * require two fetches. The value of the fields may be 2831 * (potentially) changed between the first fetch and the 2832 * second 2833 */ 2834 case V_WRITE: 2835 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0)) 2836 return (V_TRUE); 2837 break; 2838 case V_RDANDWR: 2839 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) && 2840 (atomic_add_64_nv((&(vp->v_mmap_write)), 0))) 2841 return (V_TRUE); 2842 break; 2843 case V_RDORWR: 2844 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) || 2845 (atomic_add_64_nv((&(vp->v_mmap_write)), 0))) 2846 return (V_TRUE); 2847 break; 2848 case V_READ: 2849 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0)) 2850 return (V_TRUE); 2851 break; 2852 } 2853 #else 2854 switch (mode) { 2855 case V_WRITE: 2856 if (vp->v_mmap_write) 2857 return (V_TRUE); 2858 break; 2859 case V_RDANDWR: 2860 if (vp->v_mmap_read && vp->v_mmap_write) 2861 return (V_TRUE); 2862 break; 2863 case V_RDORWR: 2864 if (vp->v_mmap_read || vp->v_mmap_write) 2865 return (V_TRUE); 2866 break; 2867 case V_READ: 2868 if (vp->v_mmap_read) 2869 return (V_TRUE); 2870 break; 2871 } 2872 #endif 2873 2874 return (V_FALSE); 2875 } 2876 2877 /* 2878 * Set the operations vector for a vnode. 2879 * 2880 * FEM ensures that the v_femhead pointer is filled in before the 2881 * v_op pointer is changed. This means that if the v_femhead pointer 2882 * is NULL, and the v_op field hasn't changed since before which checked 2883 * the v_femhead pointer; then our update is ok - we are not racing with 2884 * FEM. 
2885 */ 2886 void 2887 vn_setops(vnode_t *vp, vnodeops_t *vnodeops) 2888 { 2889 vnodeops_t *op; 2890 2891 ASSERT(vp != NULL); 2892 ASSERT(vnodeops != NULL); 2893 2894 op = vp->v_op; 2895 membar_consumer(); 2896 /* 2897 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do 2898 * the compare-and-swap on vp->v_op. If either fails, then FEM is 2899 * in effect on the vnode and we need to have FEM deal with it. 2900 */ 2901 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) != 2902 op) { 2903 fem_setvnops(vp, vnodeops); 2904 } 2905 } 2906 2907 /* 2908 * Retrieve the operations vector for a vnode 2909 * As with vn_setops(above); make sure we aren't racing with FEM. 2910 * FEM sets the v_op to a special, internal, vnodeops that wouldn't 2911 * make sense to the callers of this routine. 2912 */ 2913 vnodeops_t * 2914 vn_getops(vnode_t *vp) 2915 { 2916 vnodeops_t *op; 2917 2918 ASSERT(vp != NULL); 2919 2920 op = vp->v_op; 2921 membar_consumer(); 2922 if (vp->v_femhead == NULL && op == vp->v_op) { 2923 return (op); 2924 } else { 2925 return (fem_getvnops(vp)); 2926 } 2927 } 2928 2929 /* 2930 * Returns non-zero (1) if the vnodeops matches that of the vnode. 2931 * Returns zero (0) if not. 2932 */ 2933 int 2934 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops) 2935 { 2936 return (vn_getops(vp) == vnodeops); 2937 } 2938 2939 /* 2940 * Returns non-zero (1) if the specified operation matches the 2941 * corresponding operation for that the vnode. 2942 * Returns zero (0) if not. 
2943 */ 2944 2945 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0)) 2946 2947 int 2948 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp) 2949 { 2950 const fs_operation_trans_def_t *otdp; 2951 fs_generic_func_p *loc = NULL; 2952 vnodeops_t *vop = vn_getops(vp); 2953 2954 ASSERT(vopname != NULL); 2955 2956 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) { 2957 if (MATCHNAME(otdp->name, vopname)) { 2958 loc = (fs_generic_func_p *) 2959 ((char *)(vop) + otdp->offset); 2960 break; 2961 } 2962 } 2963 2964 return ((loc != NULL) && (*loc == funcp)); 2965 } 2966 2967 /* 2968 * fs_new_caller_id() needs to return a unique ID on a given local system. 2969 * The IDs do not need to survive across reboots. These are primarily 2970 * used so that (FEM) monitors can detect particular callers (such as 2971 * the NFS server) to a given vnode/vfs operation. 2972 */ 2973 u_longlong_t 2974 fs_new_caller_id() 2975 { 2976 static uint64_t next_caller_id = 0LL; /* First call returns 1 */ 2977 2978 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id)); 2979 } 2980 2981 /* 2982 * The value stored in v_path is relative to rootdir, located in the global 2983 * zone. Zones or chroot environments which reside deeper inside the VFS 2984 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of 2985 * what lies below their perceived root. In order to keep v_path usable for 2986 * these child environments, its allocations are allowed to exceed MAXPATHLEN. 2987 * 2988 * An upper bound of max_vnode_path is placed upon v_path allocations to 2989 * prevent the system from going too wild at the behest of pathological 2990 * behavior from the operator. 
2991 */ 2992 size_t max_vnode_path = 4 * MAXPATHLEN; 2993 2994 2995 void 2996 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp) 2997 { 2998 char *buf; 2999 3000 mutex_enter(&vp->v_lock); 3001 /* 3002 * If the snapshot of v_path_stamp passed in via compare_stamp does not 3003 * match the present value on the vnode, it indicates that subsequent 3004 * changes have occurred. The v_path value is not cleared in this case 3005 * since the new value may be valid. 3006 */ 3007 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) { 3008 mutex_exit(&vp->v_lock); 3009 return; 3010 } 3011 buf = vp->v_path; 3012 vp->v_path = vn_vpath_empty; 3013 vp->v_path_stamp = 0; 3014 mutex_exit(&vp->v_lock); 3015 if (buf != vn_vpath_empty) { 3016 kmem_free(buf, strlen(buf) + 1); 3017 } 3018 } 3019 3020 static void 3021 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len, 3022 boolean_t is_rename) 3023 { 3024 char *buf, *oldbuf; 3025 hrtime_t pstamp; 3026 size_t baselen, buflen = 0; 3027 3028 /* Handle the vn_setpath_str case. */ 3029 if (pvp == NULL) { 3030 if (len + 1 > max_vnode_path) { 3031 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp, 3032 vnode_t *, vp, char *, name, size_t, len + 1); 3033 return; 3034 } 3035 buf = kmem_alloc(len + 1, KM_SLEEP); 3036 bcopy(name, buf, len); 3037 buf[len] = '\0'; 3038 3039 mutex_enter(&vp->v_lock); 3040 oldbuf = vp->v_path; 3041 vp->v_path = buf; 3042 vp->v_path_stamp = gethrtime(); 3043 mutex_exit(&vp->v_lock); 3044 if (oldbuf != vn_vpath_empty) { 3045 kmem_free(oldbuf, strlen(oldbuf) + 1); 3046 } 3047 return; 3048 } 3049 3050 /* Take snapshot of parent dir */ 3051 mutex_enter(&pvp->v_lock); 3052 3053 if ((pvp->v_flag & VTRAVERSE) != 0) { 3054 /* 3055 * When the parent vnode has VTRAVERSE set in its flags, normal 3056 * assumptions about v_path calculation no longer apply. 
The 3057 * primary situation where this occurs is via the VFS tricks 3058 * which procfs plays in order to allow /proc/PID/(root|cwd) to 3059 * yield meaningful results. 3060 * 3061 * When this flag is set, v_path on the child must not be 3062 * updated since the calculated value is likely to be 3063 * incorrect, given the current context. 3064 */ 3065 mutex_exit(&pvp->v_lock); 3066 return; 3067 } 3068 3069 retrybuf: 3070 if (pvp->v_path == vn_vpath_empty) { 3071 /* 3072 * Without v_path from the parent directory, generating a child 3073 * path from the name is impossible. 3074 */ 3075 if (len > 0) { 3076 pstamp = pvp->v_path_stamp; 3077 mutex_exit(&pvp->v_lock); 3078 vn_clearpath(vp, pstamp); 3079 return; 3080 } 3081 3082 /* 3083 * The only feasible case here is where a NUL lookup is being 3084 * performed on rootdir prior to its v_path being populated. 3085 */ 3086 ASSERT(pvp->v_path_stamp == 0); 3087 baselen = 0; 3088 pstamp = 0; 3089 } else { 3090 pstamp = pvp->v_path_stamp; 3091 baselen = strlen(pvp->v_path); 3092 /* ignore a trailing slash if present */ 3093 if (pvp->v_path[baselen - 1] == '/') { 3094 /* This should only the be case for rootdir */ 3095 ASSERT(baselen == 1 && pvp == rootdir); 3096 baselen--; 3097 } 3098 } 3099 mutex_exit(&pvp->v_lock); 3100 3101 if (buflen != 0) { 3102 /* Free the existing (mis-sized) buffer in case of retry */ 3103 kmem_free(buf, buflen); 3104 } 3105 /* base, '/', name and trailing NUL */ 3106 buflen = baselen + len + 2; 3107 if (buflen > max_vnode_path) { 3108 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp, 3109 vnode_t *, vp, char *, name, size_t, buflen); 3110 return; 3111 } 3112 buf = kmem_alloc(buflen, KM_SLEEP); 3113 3114 mutex_enter(&pvp->v_lock); 3115 if (pvp->v_path_stamp != pstamp) { 3116 size_t vlen; 3117 3118 /* 3119 * Since v_path_stamp changed on the parent, it is likely that 3120 * v_path has been altered as well. 
If the length does not 3121 * exactly match what was previously measured, the buffer 3122 * allocation must be repeated for proper sizing. 3123 */ 3124 if (pvp->v_path == vn_vpath_empty) { 3125 /* Give up if parent lack v_path */ 3126 mutex_exit(&pvp->v_lock); 3127 kmem_free(buf, buflen); 3128 return; 3129 } 3130 vlen = strlen(pvp->v_path); 3131 if (pvp->v_path[vlen - 1] == '/') { 3132 vlen--; 3133 } 3134 if (vlen != baselen) { 3135 goto retrybuf; 3136 } 3137 } 3138 bcopy(pvp->v_path, buf, baselen); 3139 mutex_exit(&pvp->v_lock); 3140 3141 buf[baselen] = '/'; 3142 baselen++; 3143 bcopy(name, &buf[baselen], len + 1); 3144 3145 mutex_enter(&vp->v_lock); 3146 if (vp->v_path_stamp == 0) { 3147 /* never-visited vnode can inherit stamp from parent */ 3148 ASSERT(vp->v_path == vn_vpath_empty); 3149 vp->v_path_stamp = pstamp; 3150 vp->v_path = buf; 3151 mutex_exit(&vp->v_lock); 3152 } else if (vp->v_path_stamp < pstamp || is_rename) { 3153 /* 3154 * Install the updated path and stamp, ensuring that the v_path 3155 * pointer is valid at all times for dtrace. 3156 */ 3157 oldbuf = vp->v_path; 3158 vp->v_path = buf; 3159 vp->v_path_stamp = gethrtime(); 3160 mutex_exit(&vp->v_lock); 3161 kmem_free(oldbuf, strlen(oldbuf) + 1); 3162 } else { 3163 /* 3164 * If the timestamp matches or is greater, it means another 3165 * thread performed the update first while locks were dropped 3166 * here to make the allocation. We defer to the newer value. 3167 */ 3168 mutex_exit(&vp->v_lock); 3169 kmem_free(buf, buflen); 3170 } 3171 ASSERT(MUTEX_NOT_HELD(&vp->v_lock)); 3172 } 3173 3174 void 3175 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name) 3176 { 3177 size_t len; 3178 3179 /* 3180 * If the parent is older or empty, there's nothing further to do. 
3181 */ 3182 if (pvp->v_path == vn_vpath_empty || 3183 pvp->v_path_stamp <= vp->v_path_stamp) { 3184 return; 3185 } 3186 3187 /* 3188 * Given the lack of appropriate context, meaningful updates to v_path 3189 * cannot be made for during lookups for the '.' or '..' entries. 3190 */ 3191 len = strlen(name); 3192 if (len == 0 || (len == 1 && name[0] == '.') || 3193 (len == 2 && name[0] == '.' && name[1] == '.')) { 3194 return; 3195 } 3196 3197 vn_setpath_common(pvp, vp, name, len, B_FALSE); 3198 } 3199 3200 /* 3201 * Given a starting vnode and a path, updates the path in the target vnode in 3202 * a safe manner. If the vnode already has path information embedded, then the 3203 * cached path is left untouched. 3204 */ 3205 /* ARGSUSED */ 3206 void 3207 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name, 3208 size_t len) 3209 { 3210 vn_setpath_common(pvp, vp, name, len, B_FALSE); 3211 } 3212 3213 /* 3214 * Sets the path to the vnode to be the given string, regardless of current 3215 * context. The string must be a complete path from rootdir. This is only used 3216 * by fsop_root() for setting the path based on the mountpoint. 3217 */ 3218 void 3219 vn_setpath_str(vnode_t *vp, const char *str, size_t len) 3220 { 3221 vn_setpath_common(NULL, vp, str, len, B_FALSE); 3222 } 3223 3224 /* 3225 * Called from within filesystem's vop_rename() to handle renames once the 3226 * target vnode is available. 3227 */ 3228 void 3229 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len) 3230 { 3231 vn_setpath_common(pvp, vp, name, len, B_TRUE); 3232 } 3233 3234 /* 3235 * Similar to vn_setpath_str(), this function sets the path of the destination 3236 * vnode to the be the same as the source vnode. 
3237 */ 3238 void 3239 vn_copypath(struct vnode *src, struct vnode *dst) 3240 { 3241 char *buf; 3242 hrtime_t stamp; 3243 size_t buflen; 3244 3245 mutex_enter(&src->v_lock); 3246 if (src->v_path == vn_vpath_empty) { 3247 mutex_exit(&src->v_lock); 3248 return; 3249 } 3250 buflen = strlen(src->v_path) + 1; 3251 mutex_exit(&src->v_lock); 3252 3253 buf = kmem_alloc(buflen, KM_SLEEP); 3254 3255 mutex_enter(&src->v_lock); 3256 if (src->v_path == vn_vpath_empty || 3257 strlen(src->v_path) + 1 != buflen) { 3258 mutex_exit(&src->v_lock); 3259 kmem_free(buf, buflen); 3260 return; 3261 } 3262 bcopy(src->v_path, buf, buflen); 3263 stamp = src->v_path_stamp; 3264 mutex_exit(&src->v_lock); 3265 3266 mutex_enter(&dst->v_lock); 3267 if (dst->v_path != vn_vpath_empty) { 3268 mutex_exit(&dst->v_lock); 3269 kmem_free(buf, buflen); 3270 return; 3271 } 3272 dst->v_path = buf; 3273 dst->v_path_stamp = stamp; 3274 mutex_exit(&dst->v_lock); 3275 } 3276 3277 3278 /* 3279 * XXX Private interface for segvn routines that handle vnode 3280 * large page segments. 3281 * 3282 * return 1 if vp's file system VOP_PAGEIO() implementation 3283 * can be safely used instead of VOP_GETPAGE() for handling 3284 * pagefaults against regular non swap files. VOP_PAGEIO() 3285 * interface is considered safe here if its implementation 3286 * is very close to VOP_GETPAGE() implementation. 3287 * e.g. It zero's out the part of the page beyond EOF. Doesn't 3288 * panic if there're file holes but instead returns an error. 3289 * Doesn't assume file won't be changed by user writes, etc. 3290 * 3291 * return 0 otherwise. 3292 * 3293 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs. 
3294 */ 3295 int 3296 vn_vmpss_usepageio(vnode_t *vp) 3297 { 3298 vfs_t *vfsp = vp->v_vfsp; 3299 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name; 3300 char *pageio_ok_fss[] = {"ufs", "nfs", NULL}; 3301 char **fsok = pageio_ok_fss; 3302 3303 if (fsname == NULL) { 3304 return (0); 3305 } 3306 3307 for (; *fsok; fsok++) { 3308 if (strcmp(*fsok, fsname) == 0) { 3309 return (1); 3310 } 3311 } 3312 return (0); 3313 } 3314 3315 /* VOP_XXX() macros call the corresponding fop_xxx() function */ 3316 3317 int 3318 fop_open( 3319 vnode_t **vpp, 3320 int mode, 3321 cred_t *cr, 3322 caller_context_t *ct) 3323 { 3324 int ret; 3325 vnode_t *vp = *vpp; 3326 3327 VN_HOLD(vp); 3328 /* 3329 * Adding to the vnode counts before calling open 3330 * avoids the need for a mutex. It circumvents a race 3331 * condition where a query made on the vnode counts results in a 3332 * false negative. The inquirer goes away believing the file is 3333 * not open when there is an open on the file already under way. 3334 * 3335 * The counts are meant to prevent NFS from granting a delegation 3336 * when it would be dangerous to do so. 3337 * 3338 * The vnode counts are only kept on regular files 3339 */ 3340 if ((*vpp)->v_type == VREG) { 3341 if (mode & FREAD) 3342 atomic_inc_32(&(*vpp)->v_rdcnt); 3343 if (mode & FWRITE) 3344 atomic_inc_32(&(*vpp)->v_wrcnt); 3345 } 3346 3347 VOPXID_MAP_CR(vp, cr); 3348 3349 ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct); 3350 3351 if (ret) { 3352 /* 3353 * Use the saved vp just in case the vnode ptr got trashed 3354 * by the error. 3355 */ 3356 VOPSTATS_UPDATE(vp, open); 3357 if ((vp->v_type == VREG) && (mode & FREAD)) 3358 atomic_dec_32(&vp->v_rdcnt); 3359 if ((vp->v_type == VREG) && (mode & FWRITE)) 3360 atomic_dec_32(&vp->v_wrcnt); 3361 } else { 3362 /* 3363 * Some filesystems will return a different vnode, 3364 * but the same path was still used to open it. 
3365 * So if we do change the vnode and need to 3366 * copy over the path, do so here, rather than special 3367 * casing each filesystem. Adjust the vnode counts to 3368 * reflect the vnode switch. 3369 */ 3370 VOPSTATS_UPDATE(*vpp, open); 3371 if (*vpp != vp) { 3372 vn_copypath(vp, *vpp); 3373 if (((*vpp)->v_type == VREG) && (mode & FREAD)) 3374 atomic_inc_32(&(*vpp)->v_rdcnt); 3375 if ((vp->v_type == VREG) && (mode & FREAD)) 3376 atomic_dec_32(&vp->v_rdcnt); 3377 if (((*vpp)->v_type == VREG) && (mode & FWRITE)) 3378 atomic_inc_32(&(*vpp)->v_wrcnt); 3379 if ((vp->v_type == VREG) && (mode & FWRITE)) 3380 atomic_dec_32(&vp->v_wrcnt); 3381 } 3382 } 3383 VN_RELE(vp); 3384 return (ret); 3385 } 3386 3387 int 3388 fop_close( 3389 vnode_t *vp, 3390 int flag, 3391 int count, 3392 offset_t offset, 3393 cred_t *cr, 3394 caller_context_t *ct) 3395 { 3396 int err; 3397 3398 VOPXID_MAP_CR(vp, cr); 3399 3400 err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct); 3401 VOPSTATS_UPDATE(vp, close); 3402 /* 3403 * Check passed in count to handle possible dups. 
Vnode counts are only 3404 * kept on regular files 3405 */ 3406 if ((vp->v_type == VREG) && (count == 1)) { 3407 if (flag & FREAD) { 3408 ASSERT(vp->v_rdcnt > 0); 3409 atomic_dec_32(&vp->v_rdcnt); 3410 } 3411 if (flag & FWRITE) { 3412 ASSERT(vp->v_wrcnt > 0); 3413 atomic_dec_32(&vp->v_wrcnt); 3414 } 3415 } 3416 return (err); 3417 } 3418 3419 int 3420 fop_read( 3421 vnode_t *vp, 3422 uio_t *uiop, 3423 int ioflag, 3424 cred_t *cr, 3425 caller_context_t *ct) 3426 { 3427 int err; 3428 ssize_t resid_start = uiop->uio_resid; 3429 3430 VOPXID_MAP_CR(vp, cr); 3431 3432 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); 3433 VOPSTATS_UPDATE_IO(vp, read, 3434 read_bytes, (resid_start - uiop->uio_resid)); 3435 return (err); 3436 } 3437 3438 int 3439 fop_write( 3440 vnode_t *vp, 3441 uio_t *uiop, 3442 int ioflag, 3443 cred_t *cr, 3444 caller_context_t *ct) 3445 { 3446 int err; 3447 ssize_t resid_start = uiop->uio_resid; 3448 3449 VOPXID_MAP_CR(vp, cr); 3450 3451 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); 3452 VOPSTATS_UPDATE_IO(vp, write, 3453 write_bytes, (resid_start - uiop->uio_resid)); 3454 return (err); 3455 } 3456 3457 int 3458 fop_ioctl( 3459 vnode_t *vp, 3460 int cmd, 3461 intptr_t arg, 3462 int flag, 3463 cred_t *cr, 3464 int *rvalp, 3465 caller_context_t *ct) 3466 { 3467 int err; 3468 3469 VOPXID_MAP_CR(vp, cr); 3470 3471 err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct); 3472 VOPSTATS_UPDATE(vp, ioctl); 3473 return (err); 3474 } 3475 3476 int 3477 fop_setfl( 3478 vnode_t *vp, 3479 int oflags, 3480 int nflags, 3481 cred_t *cr, 3482 caller_context_t *ct) 3483 { 3484 int err; 3485 3486 VOPXID_MAP_CR(vp, cr); 3487 3488 err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct); 3489 VOPSTATS_UPDATE(vp, setfl); 3490 return (err); 3491 } 3492 3493 int 3494 fop_getattr( 3495 vnode_t *vp, 3496 vattr_t *vap, 3497 int flags, 3498 cred_t *cr, 3499 caller_context_t *ct) 3500 { 3501 int err; 3502 3503 VOPXID_MAP_CR(vp, cr); 3504 3505 /* 3506 
* If this file system doesn't understand the xvattr extensions 3507 * then turn off the xvattr bit. 3508 */ 3509 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) { 3510 vap->va_mask &= ~AT_XVATTR; 3511 } 3512 3513 /* 3514 * We're only allowed to skip the ACL check iff we used a 32 bit 3515 * ACE mask with VOP_ACCESS() to determine permissions. 3516 */ 3517 if ((flags & ATTR_NOACLCHECK) && 3518 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 3519 return (EINVAL); 3520 } 3521 err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct); 3522 VOPSTATS_UPDATE(vp, getattr); 3523 return (err); 3524 } 3525 3526 int 3527 fop_setattr( 3528 vnode_t *vp, 3529 vattr_t *vap, 3530 int flags, 3531 cred_t *cr, 3532 caller_context_t *ct) 3533 { 3534 int err; 3535 3536 VOPXID_MAP_CR(vp, cr); 3537 3538 /* 3539 * If this file system doesn't understand the xvattr extensions 3540 * then turn off the xvattr bit. 3541 */ 3542 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) { 3543 vap->va_mask &= ~AT_XVATTR; 3544 } 3545 3546 /* 3547 * We're only allowed to skip the ACL check iff we used a 32 bit 3548 * ACE mask with VOP_ACCESS() to determine permissions. 
3549 */ 3550 if ((flags & ATTR_NOACLCHECK) && 3551 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 3552 return (EINVAL); 3553 } 3554 err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct); 3555 VOPSTATS_UPDATE(vp, setattr); 3556 return (err); 3557 } 3558 3559 int 3560 fop_access( 3561 vnode_t *vp, 3562 int mode, 3563 int flags, 3564 cred_t *cr, 3565 caller_context_t *ct) 3566 { 3567 int err; 3568 3569 if ((flags & V_ACE_MASK) && 3570 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 3571 return (EINVAL); 3572 } 3573 3574 VOPXID_MAP_CR(vp, cr); 3575 3576 err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct); 3577 VOPSTATS_UPDATE(vp, access); 3578 return (err); 3579 } 3580 3581 int 3582 fop_lookup( 3583 vnode_t *dvp, 3584 char *nm, 3585 vnode_t **vpp, 3586 pathname_t *pnp, 3587 int flags, 3588 vnode_t *rdir, 3589 cred_t *cr, 3590 caller_context_t *ct, 3591 int *deflags, /* Returned per-dirent flags */ 3592 pathname_t *ppnp) /* Returned case-preserved name in directory */ 3593 { 3594 int ret; 3595 3596 /* 3597 * If this file system doesn't support case-insensitive access 3598 * and said access is requested, fail quickly. It is required 3599 * that if the vfs supports case-insensitive lookup, it also 3600 * supports extended dirent flags. 
3601 */ 3602 if (flags & FIGNORECASE && 3603 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3604 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3605 return (EINVAL); 3606 3607 VOPXID_MAP_CR(dvp, cr); 3608 3609 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) { 3610 ret = xattr_dir_lookup(dvp, vpp, flags, cr); 3611 } else { 3612 ret = (*(dvp)->v_op->vop_lookup) 3613 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp); 3614 } 3615 if (ret == 0 && *vpp) { 3616 VOPSTATS_UPDATE(*vpp, lookup); 3617 vn_updatepath(dvp, *vpp, nm); 3618 } 3619 3620 return (ret); 3621 } 3622 3623 int 3624 fop_create( 3625 vnode_t *dvp, 3626 char *name, 3627 vattr_t *vap, 3628 vcexcl_t excl, 3629 int mode, 3630 vnode_t **vpp, 3631 cred_t *cr, 3632 int flags, 3633 caller_context_t *ct, 3634 vsecattr_t *vsecp) /* ACL to set during create */ 3635 { 3636 int ret; 3637 3638 if (vsecp != NULL && 3639 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) { 3640 return (EINVAL); 3641 } 3642 /* 3643 * If this file system doesn't support case-insensitive access 3644 * and said access is requested, fail quickly. 3645 */ 3646 if (flags & FIGNORECASE && 3647 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3648 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3649 return (EINVAL); 3650 3651 VOPXID_MAP_CR(dvp, cr); 3652 3653 ret = (*(dvp)->v_op->vop_create) 3654 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); 3655 if (ret == 0 && *vpp) { 3656 VOPSTATS_UPDATE(*vpp, create); 3657 vn_updatepath(dvp, *vpp, name); 3658 } 3659 3660 return (ret); 3661 } 3662 3663 int 3664 fop_remove( 3665 vnode_t *dvp, 3666 char *nm, 3667 cred_t *cr, 3668 caller_context_t *ct, 3669 int flags) 3670 { 3671 int err; 3672 3673 /* 3674 * If this file system doesn't support case-insensitive access 3675 * and said access is requested, fail quickly. 
3676 */ 3677 if (flags & FIGNORECASE && 3678 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3679 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3680 return (EINVAL); 3681 3682 VOPXID_MAP_CR(dvp, cr); 3683 3684 err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags); 3685 VOPSTATS_UPDATE(dvp, remove); 3686 return (err); 3687 } 3688 3689 int 3690 fop_link( 3691 vnode_t *tdvp, 3692 vnode_t *svp, 3693 char *tnm, 3694 cred_t *cr, 3695 caller_context_t *ct, 3696 int flags) 3697 { 3698 int err; 3699 3700 /* 3701 * If the target file system doesn't support case-insensitive access 3702 * and said access is requested, fail quickly. 3703 */ 3704 if (flags & FIGNORECASE && 3705 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3706 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3707 return (EINVAL); 3708 3709 VOPXID_MAP_CR(tdvp, cr); 3710 3711 err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags); 3712 VOPSTATS_UPDATE(tdvp, link); 3713 return (err); 3714 } 3715 3716 int 3717 fop_rename( 3718 vnode_t *sdvp, 3719 char *snm, 3720 vnode_t *tdvp, 3721 char *tnm, 3722 cred_t *cr, 3723 caller_context_t *ct, 3724 int flags) 3725 { 3726 int err; 3727 3728 /* 3729 * If the file system involved does not support 3730 * case-insensitive access and said access is requested, fail 3731 * quickly. 
3732 */ 3733 if (flags & FIGNORECASE && 3734 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3735 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))) 3736 return (EINVAL); 3737 3738 VOPXID_MAP_CR(tdvp, cr); 3739 3740 err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags); 3741 VOPSTATS_UPDATE(sdvp, rename); 3742 return (err); 3743 } 3744 3745 int 3746 fop_mkdir( 3747 vnode_t *dvp, 3748 char *dirname, 3749 vattr_t *vap, 3750 vnode_t **vpp, 3751 cred_t *cr, 3752 caller_context_t *ct, 3753 int flags, 3754 vsecattr_t *vsecp) /* ACL to set during create */ 3755 { 3756 int ret; 3757 3758 if (vsecp != NULL && 3759 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) { 3760 return (EINVAL); 3761 } 3762 /* 3763 * If this file system doesn't support case-insensitive access 3764 * and said access is requested, fail quickly. 3765 */ 3766 if (flags & FIGNORECASE && 3767 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3768 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3769 return (EINVAL); 3770 3771 VOPXID_MAP_CR(dvp, cr); 3772 3773 ret = (*(dvp)->v_op->vop_mkdir) 3774 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); 3775 if (ret == 0 && *vpp) { 3776 VOPSTATS_UPDATE(*vpp, mkdir); 3777 vn_updatepath(dvp, *vpp, dirname); 3778 } 3779 3780 return (ret); 3781 } 3782 3783 int 3784 fop_rmdir( 3785 vnode_t *dvp, 3786 char *nm, 3787 vnode_t *cdir, 3788 cred_t *cr, 3789 caller_context_t *ct, 3790 int flags) 3791 { 3792 int err; 3793 3794 /* 3795 * If this file system doesn't support case-insensitive access 3796 * and said access is requested, fail quickly. 
3797 */ 3798 if (flags & FIGNORECASE && 3799 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3800 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3801 return (EINVAL); 3802 3803 VOPXID_MAP_CR(dvp, cr); 3804 3805 err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags); 3806 VOPSTATS_UPDATE(dvp, rmdir); 3807 return (err); 3808 } 3809 3810 int 3811 fop_readdir( 3812 vnode_t *vp, 3813 uio_t *uiop, 3814 cred_t *cr, 3815 int *eofp, 3816 caller_context_t *ct, 3817 int flags) 3818 { 3819 int err; 3820 ssize_t resid_start = uiop->uio_resid; 3821 3822 /* 3823 * If this file system doesn't support retrieving directory 3824 * entry flags and said access is requested, fail quickly. 3825 */ 3826 if (flags & V_RDDIR_ENTFLAGS && 3827 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0) 3828 return (EINVAL); 3829 3830 VOPXID_MAP_CR(vp, cr); 3831 3832 err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags); 3833 VOPSTATS_UPDATE_IO(vp, readdir, 3834 readdir_bytes, (resid_start - uiop->uio_resid)); 3835 return (err); 3836 } 3837 3838 int 3839 fop_symlink( 3840 vnode_t *dvp, 3841 char *linkname, 3842 vattr_t *vap, 3843 char *target, 3844 cred_t *cr, 3845 caller_context_t *ct, 3846 int flags) 3847 { 3848 int err; 3849 xvattr_t xvattr; 3850 3851 /* 3852 * If this file system doesn't support case-insensitive access 3853 * and said access is requested, fail quickly. 
3854 */ 3855 if (flags & FIGNORECASE && 3856 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3857 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3858 return (EINVAL); 3859 3860 VOPXID_MAP_CR(dvp, cr); 3861 3862 /* check for reparse point */ 3863 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) && 3864 (strncmp(target, FS_REPARSE_TAG_STR, 3865 strlen(FS_REPARSE_TAG_STR)) == 0)) { 3866 if (!fs_reparse_mark(target, vap, &xvattr)) 3867 vap = (vattr_t *)&xvattr; 3868 } 3869 3870 err = (*(dvp)->v_op->vop_symlink) 3871 (dvp, linkname, vap, target, cr, ct, flags); 3872 VOPSTATS_UPDATE(dvp, symlink); 3873 return (err); 3874 } 3875 3876 int 3877 fop_readlink( 3878 vnode_t *vp, 3879 uio_t *uiop, 3880 cred_t *cr, 3881 caller_context_t *ct) 3882 { 3883 int err; 3884 3885 VOPXID_MAP_CR(vp, cr); 3886 3887 err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct); 3888 VOPSTATS_UPDATE(vp, readlink); 3889 return (err); 3890 } 3891 3892 int 3893 fop_fsync( 3894 vnode_t *vp, 3895 int syncflag, 3896 cred_t *cr, 3897 caller_context_t *ct) 3898 { 3899 int err; 3900 3901 VOPXID_MAP_CR(vp, cr); 3902 3903 err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct); 3904 VOPSTATS_UPDATE(vp, fsync); 3905 return (err); 3906 } 3907 3908 void 3909 fop_inactive( 3910 vnode_t *vp, 3911 cred_t *cr, 3912 caller_context_t *ct) 3913 { 3914 /* Need to update stats before vop call since we may lose the vnode */ 3915 VOPSTATS_UPDATE(vp, inactive); 3916 3917 VOPXID_MAP_CR(vp, cr); 3918 3919 (*(vp)->v_op->vop_inactive)(vp, cr, ct); 3920 } 3921 3922 int 3923 fop_fid( 3924 vnode_t *vp, 3925 fid_t *fidp, 3926 caller_context_t *ct) 3927 { 3928 int err; 3929 3930 err = (*(vp)->v_op->vop_fid)(vp, fidp, ct); 3931 VOPSTATS_UPDATE(vp, fid); 3932 return (err); 3933 } 3934 3935 int 3936 fop_rwlock( 3937 vnode_t *vp, 3938 int write_lock, 3939 caller_context_t *ct) 3940 { 3941 int ret; 3942 3943 ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct)); 3944 VOPSTATS_UPDATE(vp, rwlock); 3945 return (ret); 
3946 } 3947 3948 void 3949 fop_rwunlock( 3950 vnode_t *vp, 3951 int write_lock, 3952 caller_context_t *ct) 3953 { 3954 (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct); 3955 VOPSTATS_UPDATE(vp, rwunlock); 3956 } 3957 3958 int 3959 fop_seek( 3960 vnode_t *vp, 3961 offset_t ooff, 3962 offset_t *noffp, 3963 caller_context_t *ct) 3964 { 3965 int err; 3966 3967 err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct); 3968 VOPSTATS_UPDATE(vp, seek); 3969 return (err); 3970 } 3971 3972 int 3973 fop_cmp( 3974 vnode_t *vp1, 3975 vnode_t *vp2, 3976 caller_context_t *ct) 3977 { 3978 int err; 3979 3980 err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct); 3981 VOPSTATS_UPDATE(vp1, cmp); 3982 return (err); 3983 } 3984 3985 int 3986 fop_frlock( 3987 vnode_t *vp, 3988 int cmd, 3989 flock64_t *bfp, 3990 int flag, 3991 offset_t offset, 3992 struct flk_callback *flk_cbp, 3993 cred_t *cr, 3994 caller_context_t *ct) 3995 { 3996 int err; 3997 3998 VOPXID_MAP_CR(vp, cr); 3999 4000 err = (*(vp)->v_op->vop_frlock) 4001 (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); 4002 VOPSTATS_UPDATE(vp, frlock); 4003 return (err); 4004 } 4005 4006 int 4007 fop_space( 4008 vnode_t *vp, 4009 int cmd, 4010 flock64_t *bfp, 4011 int flag, 4012 offset_t offset, 4013 cred_t *cr, 4014 caller_context_t *ct) 4015 { 4016 int err; 4017 4018 VOPXID_MAP_CR(vp, cr); 4019 4020 err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct); 4021 VOPSTATS_UPDATE(vp, space); 4022 return (err); 4023 } 4024 4025 int 4026 fop_realvp( 4027 vnode_t *vp, 4028 vnode_t **vpp, 4029 caller_context_t *ct) 4030 { 4031 int err; 4032 4033 err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct); 4034 VOPSTATS_UPDATE(vp, realvp); 4035 return (err); 4036 } 4037 4038 int 4039 fop_getpage( 4040 vnode_t *vp, 4041 offset_t off, 4042 size_t len, 4043 uint_t *protp, 4044 page_t **plarr, 4045 size_t plsz, 4046 struct seg *seg, 4047 caddr_t addr, 4048 enum seg_rw rw, 4049 cred_t *cr, 4050 caller_context_t *ct) 4051 { 4052 int err; 4053 4054 VOPXID_MAP_CR(vp, cr); 
4055 4056 err = (*(vp)->v_op->vop_getpage) 4057 (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct); 4058 VOPSTATS_UPDATE(vp, getpage); 4059 return (err); 4060 } 4061 4062 int 4063 fop_putpage( 4064 vnode_t *vp, 4065 offset_t off, 4066 size_t len, 4067 int flags, 4068 cred_t *cr, 4069 caller_context_t *ct) 4070 { 4071 int err; 4072 4073 VOPXID_MAP_CR(vp, cr); 4074 4075 err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct); 4076 VOPSTATS_UPDATE(vp, putpage); 4077 return (err); 4078 } 4079 4080 int 4081 fop_map( 4082 vnode_t *vp, 4083 offset_t off, 4084 struct as *as, 4085 caddr_t *addrp, 4086 size_t len, 4087 uchar_t prot, 4088 uchar_t maxprot, 4089 uint_t flags, 4090 cred_t *cr, 4091 caller_context_t *ct) 4092 { 4093 int err; 4094 4095 VOPXID_MAP_CR(vp, cr); 4096 4097 err = (*(vp)->v_op->vop_map) 4098 (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct); 4099 VOPSTATS_UPDATE(vp, map); 4100 return (err); 4101 } 4102 4103 int 4104 fop_addmap( 4105 vnode_t *vp, 4106 offset_t off, 4107 struct as *as, 4108 caddr_t addr, 4109 size_t len, 4110 uchar_t prot, 4111 uchar_t maxprot, 4112 uint_t flags, 4113 cred_t *cr, 4114 caller_context_t *ct) 4115 { 4116 int error; 4117 u_longlong_t delta; 4118 4119 VOPXID_MAP_CR(vp, cr); 4120 4121 error = (*(vp)->v_op->vop_addmap) 4122 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct); 4123 4124 if ((!error) && (vp->v_type == VREG)) { 4125 delta = (u_longlong_t)btopr(len); 4126 /* 4127 * If file is declared MAP_PRIVATE, it can't be written back 4128 * even if open for write. Handle as read. 
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 *
			 * Note that PROT_READ and PROT_EXEC are tested
			 * independently and both add to v_mmap_read, so a
			 * mapping with both bits in maxprot adds delta twice.
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	/* The operation is counted whether or not vop_addmap succeeded. */
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}

/*
 * Forward a delmap request to the vnode's vop_delmap entry point and, for
 * regular files, subtract btopr(len) from the v_mmap_read / v_mmap_write
 * counters using the same MAP_PRIVATE / maxprot rules by which they are
 * added.  The counters are left alone when vop_delmap returns EAGAIN.
 * Returns the vop_delmap result.
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
4180 */ 4181 if ((error != EAGAIN) && (vp->v_type == VREG)) { 4182 4183 delta = (u_longlong_t)btopr(len); 4184 4185 if (flags & MAP_PRIVATE) { 4186 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 4187 (int64_t)(-delta)); 4188 } else { 4189 /* 4190 * atomic_add_64 forces the fetch of a 64 bit value 4191 * to be atomic on 32 bit machines 4192 */ 4193 if (maxprot & PROT_WRITE) 4194 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)), 4195 (int64_t)(-delta)); 4196 if (maxprot & PROT_READ) 4197 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 4198 (int64_t)(-delta)); 4199 if (maxprot & PROT_EXEC) 4200 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 4201 (int64_t)(-delta)); 4202 } 4203 } 4204 VOPSTATS_UPDATE(vp, delmap); 4205 return (error); 4206 } 4207 4208 4209 int 4210 fop_poll( 4211 vnode_t *vp, 4212 short events, 4213 int anyyet, 4214 short *reventsp, 4215 struct pollhead **phpp, 4216 caller_context_t *ct) 4217 { 4218 int err; 4219 4220 err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct); 4221 VOPSTATS_UPDATE(vp, poll); 4222 return (err); 4223 } 4224 4225 int 4226 fop_dump( 4227 vnode_t *vp, 4228 caddr_t addr, 4229 offset_t lbdn, 4230 offset_t dblks, 4231 caller_context_t *ct) 4232 { 4233 int err; 4234 4235 /* ensure lbdn and dblks can be passed safely to bdev_dump */ 4236 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks)) 4237 return (EIO); 4238 4239 err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct); 4240 VOPSTATS_UPDATE(vp, dump); 4241 return (err); 4242 } 4243 4244 int 4245 fop_pathconf( 4246 vnode_t *vp, 4247 int cmd, 4248 ulong_t *valp, 4249 cred_t *cr, 4250 caller_context_t *ct) 4251 { 4252 int err; 4253 4254 VOPXID_MAP_CR(vp, cr); 4255 4256 err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct); 4257 VOPSTATS_UPDATE(vp, pathconf); 4258 return (err); 4259 } 4260 4261 int 4262 fop_pageio( 4263 vnode_t *vp, 4264 struct page *pp, 4265 u_offset_t io_off, 4266 size_t io_len, 4267 int flags, 4268 cred_t *cr, 4269 caller_context_t *ct) 4270 { 
4271 int err; 4272 4273 VOPXID_MAP_CR(vp, cr); 4274 4275 err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct); 4276 VOPSTATS_UPDATE(vp, pageio); 4277 return (err); 4278 } 4279 4280 int 4281 fop_dumpctl( 4282 vnode_t *vp, 4283 int action, 4284 offset_t *blkp, 4285 caller_context_t *ct) 4286 { 4287 int err; 4288 err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct); 4289 VOPSTATS_UPDATE(vp, dumpctl); 4290 return (err); 4291 } 4292 4293 void 4294 fop_dispose( 4295 vnode_t *vp, 4296 page_t *pp, 4297 int flag, 4298 int dn, 4299 cred_t *cr, 4300 caller_context_t *ct) 4301 { 4302 /* Must do stats first since it's possible to lose the vnode */ 4303 VOPSTATS_UPDATE(vp, dispose); 4304 4305 VOPXID_MAP_CR(vp, cr); 4306 4307 (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct); 4308 } 4309 4310 int 4311 fop_setsecattr( 4312 vnode_t *vp, 4313 vsecattr_t *vsap, 4314 int flag, 4315 cred_t *cr, 4316 caller_context_t *ct) 4317 { 4318 int err; 4319 4320 VOPXID_MAP_CR(vp, cr); 4321 4322 /* 4323 * We're only allowed to skip the ACL check iff we used a 32 bit 4324 * ACE mask with VOP_ACCESS() to determine permissions. 4325 */ 4326 if ((flag & ATTR_NOACLCHECK) && 4327 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 4328 return (EINVAL); 4329 } 4330 err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct); 4331 VOPSTATS_UPDATE(vp, setsecattr); 4332 return (err); 4333 } 4334 4335 int 4336 fop_getsecattr( 4337 vnode_t *vp, 4338 vsecattr_t *vsap, 4339 int flag, 4340 cred_t *cr, 4341 caller_context_t *ct) 4342 { 4343 int err; 4344 4345 /* 4346 * We're only allowed to skip the ACL check iff we used a 32 bit 4347 * ACE mask with VOP_ACCESS() to determine permissions. 
4348 */ 4349 if ((flag & ATTR_NOACLCHECK) && 4350 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 4351 return (EINVAL); 4352 } 4353 4354 VOPXID_MAP_CR(vp, cr); 4355 4356 err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct); 4357 VOPSTATS_UPDATE(vp, getsecattr); 4358 return (err); 4359 } 4360 4361 int 4362 fop_shrlock( 4363 vnode_t *vp, 4364 int cmd, 4365 struct shrlock *shr, 4366 int flag, 4367 cred_t *cr, 4368 caller_context_t *ct) 4369 { 4370 int err; 4371 4372 VOPXID_MAP_CR(vp, cr); 4373 4374 err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct); 4375 VOPSTATS_UPDATE(vp, shrlock); 4376 return (err); 4377 } 4378 4379 int 4380 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm, 4381 caller_context_t *ct) 4382 { 4383 int err; 4384 4385 err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct); 4386 VOPSTATS_UPDATE(vp, vnevent); 4387 return (err); 4388 } 4389 4390 int 4391 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr, 4392 caller_context_t *ct) 4393 { 4394 int err; 4395 4396 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) 4397 return (ENOTSUP); 4398 err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct); 4399 VOPSTATS_UPDATE(vp, reqzcbuf); 4400 return (err); 4401 } 4402 4403 int 4404 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct) 4405 { 4406 int err; 4407 4408 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) 4409 return (ENOTSUP); 4410 err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct); 4411 VOPSTATS_UPDATE(vp, retzcbuf); 4412 return (err); 4413 } 4414 4415 /* 4416 * Default destructor 4417 * Needed because NULL destructor means that the key is unused 4418 */ 4419 /* ARGSUSED */ 4420 void 4421 vsd_defaultdestructor(void *value) 4422 {} 4423 4424 /* 4425 * Create a key (index into per vnode array) 4426 * Locks out vsd_create, vsd_destroy, and vsd_free 4427 * May allocate memory with lock held 4428 */ 4429 void 4430 vsd_create(uint_t 
*keyp, void (*destructor)(void *)) 4431 { 4432 int i; 4433 uint_t nkeys; 4434 4435 /* 4436 * if key is allocated, do nothing 4437 */ 4438 mutex_enter(&vsd_lock); 4439 if (*keyp) { 4440 mutex_exit(&vsd_lock); 4441 return; 4442 } 4443 /* 4444 * find an unused key 4445 */ 4446 if (destructor == NULL) 4447 destructor = vsd_defaultdestructor; 4448 4449 for (i = 0; i < vsd_nkeys; ++i) 4450 if (vsd_destructor[i] == NULL) 4451 break; 4452 4453 /* 4454 * if no unused keys, increase the size of the destructor array 4455 */ 4456 if (i == vsd_nkeys) { 4457 if ((nkeys = (vsd_nkeys << 1)) == 0) 4458 nkeys = 1; 4459 vsd_destructor = 4460 (void (**)(void *))vsd_realloc((void *)vsd_destructor, 4461 (size_t)(vsd_nkeys * sizeof (void (*)(void *))), 4462 (size_t)(nkeys * sizeof (void (*)(void *)))); 4463 vsd_nkeys = nkeys; 4464 } 4465 4466 /* 4467 * allocate the next available unused key 4468 */ 4469 vsd_destructor[i] = destructor; 4470 *keyp = i + 1; 4471 4472 /* create vsd_list, if it doesn't exist */ 4473 if (vsd_list == NULL) { 4474 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 4475 list_create(vsd_list, sizeof (struct vsd_node), 4476 offsetof(struct vsd_node, vs_nodes)); 4477 } 4478 4479 mutex_exit(&vsd_lock); 4480 } 4481 4482 /* 4483 * Destroy a key 4484 * 4485 * Assumes that the caller is preventing vsd_set and vsd_get 4486 * Locks out vsd_create, vsd_destroy, and vsd_free 4487 * May free memory with lock held 4488 */ 4489 void 4490 vsd_destroy(uint_t *keyp) 4491 { 4492 uint_t key; 4493 struct vsd_node *vsd; 4494 4495 /* 4496 * protect the key namespace and our destructor lists 4497 */ 4498 mutex_enter(&vsd_lock); 4499 key = *keyp; 4500 *keyp = 0; 4501 4502 ASSERT(key <= vsd_nkeys); 4503 4504 /* 4505 * if the key is valid 4506 */ 4507 if (key != 0) { 4508 uint_t k = key - 1; 4509 /* 4510 * for every vnode with VSD, call key's destructor 4511 */ 4512 for (vsd = list_head(vsd_list); vsd != NULL; 4513 vsd = list_next(vsd_list, vsd)) { 4514 /* 4515 * no VSD for key in this 
vnode 4516 */ 4517 if (key > vsd->vs_nkeys) 4518 continue; 4519 /* 4520 * call destructor for key 4521 */ 4522 if (vsd->vs_value[k] && vsd_destructor[k]) 4523 (*vsd_destructor[k])(vsd->vs_value[k]); 4524 /* 4525 * reset value for key 4526 */ 4527 vsd->vs_value[k] = NULL; 4528 } 4529 /* 4530 * actually free the key (NULL destructor == unused) 4531 */ 4532 vsd_destructor[k] = NULL; 4533 } 4534 4535 mutex_exit(&vsd_lock); 4536 } 4537 4538 /* 4539 * Quickly return the per vnode value that was stored with the specified key 4540 * Assumes the caller is protecting key from vsd_create and vsd_destroy 4541 * Assumes the caller is holding v_vsd_lock to protect the vsd. 4542 */ 4543 void * 4544 vsd_get(vnode_t *vp, uint_t key) 4545 { 4546 struct vsd_node *vsd; 4547 4548 ASSERT(vp != NULL); 4549 ASSERT(mutex_owned(&vp->v_vsd_lock)); 4550 4551 vsd = vp->v_vsd; 4552 4553 if (key && vsd != NULL && key <= vsd->vs_nkeys) 4554 return (vsd->vs_value[key - 1]); 4555 return (NULL); 4556 } 4557 4558 /* 4559 * Set a per vnode value indexed with the specified key 4560 * Assumes the caller is holding v_vsd_lock to protect the vsd. 4561 */ 4562 int 4563 vsd_set(vnode_t *vp, uint_t key, void *value) 4564 { 4565 struct vsd_node *vsd; 4566 4567 ASSERT(vp != NULL); 4568 ASSERT(mutex_owned(&vp->v_vsd_lock)); 4569 4570 if (key == 0) 4571 return (EINVAL); 4572 4573 vsd = vp->v_vsd; 4574 if (vsd == NULL) 4575 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP); 4576 4577 /* 4578 * If the vsd was just allocated, vs_nkeys will be 0, so the following 4579 * code won't happen and we will continue down and allocate space for 4580 * the vs_value array. 4581 * If the caller is replacing one value with another, then it is up 4582 * to the caller to free/rele/destroy the previous value (if needed). 
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	/* Debug-only check that the key is within the global key table. */
	ASSERT(key <= vsd_nkeys);

	/*
	 * vs_nkeys == 0 means this node has no value array yet; only in
	 * that case is it linked onto vsd_list here.
	 */
	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 *
	 * The array is grown to exactly 'key' entries; vsd_realloc() is
	 * handed the old size so existing values are carried over.
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}

/*
 * Called from vn_free() to run the destructor function for each vsd
 *	Locks out vsd_create and vsd_destroy
 *	Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	/* No VSD was ever attached to this vnode. */
	if (vsd == NULL)
		return;

	/*
	 * A node with vs_nkeys == 0 is freed directly, without touching
	 * vsd_list or the vs_value array.
	 */
	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
4635 */ 4636 mutex_enter(&vsd_lock); 4637 4638 for (i = 0; i < vsd->vs_nkeys; i++) { 4639 if (vsd->vs_value[i] && vsd_destructor[i]) 4640 (*vsd_destructor[i])(vsd->vs_value[i]); 4641 vsd->vs_value[i] = NULL; 4642 } 4643 4644 /* 4645 * remove from linked list of VSD nodes 4646 */ 4647 list_remove(vsd_list, vsd); 4648 4649 mutex_exit(&vsd_lock); 4650 4651 /* 4652 * free up the VSD 4653 */ 4654 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *)); 4655 kmem_free(vsd, sizeof (struct vsd_node)); 4656 vp->v_vsd = NULL; 4657 } 4658 4659 /* 4660 * realloc 4661 */ 4662 static void * 4663 vsd_realloc(void *old, size_t osize, size_t nsize) 4664 { 4665 void *new; 4666 4667 new = kmem_zalloc(nsize, KM_SLEEP); 4668 if (old) { 4669 bcopy(old, new, osize); 4670 kmem_free(old, osize); 4671 } 4672 return (new); 4673 } 4674 4675 /* 4676 * Setup the extensible system attribute for creating a reparse point. 4677 * The symlink data 'target' is validated for proper format of a reparse 4678 * string and a check also made to make sure the symlink data does not 4679 * point to an existing file. 4680 * 4681 * return 0 if ok else -1. 4682 */ 4683 static int 4684 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr) 4685 { 4686 xoptattr_t *xoap; 4687 4688 if ((!target) || (!vap) || (!xvattr)) 4689 return (-1); 4690 4691 /* validate reparse string */ 4692 if (reparse_validate((const char *)target)) 4693 return (-1); 4694 4695 xva_init(xvattr); 4696 xvattr->xva_vattr = *vap; 4697 xvattr->xva_vattr.va_mask |= AT_XVATTR; 4698 xoap = xva_getxoptattr(xvattr); 4699 ASSERT(xoap); 4700 XVA_SET_REQ(xvattr, XAT_REPARSE); 4701 xoap->xoa_reparse = 1; 4702 4703 return (0); 4704 } 4705 4706 /* 4707 * Function to check whether a symlink is a reparse point. 
4708 * Return B_TRUE if it is a reparse point, else return B_FALSE 4709 */ 4710 boolean_t 4711 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4712 { 4713 xvattr_t xvattr; 4714 xoptattr_t *xoap; 4715 4716 if ((vp->v_type != VLNK) || 4717 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR))) 4718 return (B_FALSE); 4719 4720 xva_init(&xvattr); 4721 xoap = xva_getxoptattr(&xvattr); 4722 ASSERT(xoap); 4723 XVA_SET_REQ(&xvattr, XAT_REPARSE); 4724 4725 if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct)) 4726 return (B_FALSE); 4727 4728 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) || 4729 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE)))) 4730 return (B_FALSE); 4731 4732 return (xoap->xoa_reparse ? B_TRUE : B_FALSE); 4733 }