1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 27 */ 28 29 /* 30 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 
31 * All Rights Reserved 32 */ 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/cred.h> 38 #include <sys/time.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/vfs_opreg.h> 42 #include <sys/file.h> 43 #include <sys/filio.h> 44 #include <sys/uio.h> 45 #include <sys/buf.h> 46 #include <sys/mman.h> 47 #include <sys/pathname.h> 48 #include <sys/dirent.h> 49 #include <sys/debug.h> 50 #include <sys/vmsystm.h> 51 #include <sys/fcntl.h> 52 #include <sys/flock.h> 53 #include <sys/swap.h> 54 #include <sys/errno.h> 55 #include <sys/strsubr.h> 56 #include <sys/sysmacros.h> 57 #include <sys/kmem.h> 58 #include <sys/cmn_err.h> 59 #include <sys/pathconf.h> 60 #include <sys/utsname.h> 61 #include <sys/dnlc.h> 62 #include <sys/acl.h> 63 #include <sys/systeminfo.h> 64 #include <sys/policy.h> 65 #include <sys/sdt.h> 66 #include <sys/list.h> 67 #include <sys/stat.h> 68 #include <sys/zone.h> 69 70 #include <rpc/types.h> 71 #include <rpc/auth.h> 72 #include <rpc/clnt.h> 73 74 #include <nfs/nfs.h> 75 #include <nfs/nfs_clnt.h> 76 #include <nfs/nfs_acl.h> 77 #include <nfs/lm.h> 78 #include <nfs/nfs4.h> 79 #include <nfs/nfs4_kprot.h> 80 #include <nfs/rnode4.h> 81 #include <nfs/nfs4_clnt.h> 82 83 #include <vm/hat.h> 84 #include <vm/as.h> 85 #include <vm/page.h> 86 #include <vm/pvn.h> 87 #include <vm/seg.h> 88 #include <vm/seg_map.h> 89 #include <vm/seg_kpm.h> 90 #include <vm/seg_vn.h> 91 92 #include <fs/fs_subr.h> 93 94 #include <sys/ddi.h> 95 #include <sys/int_fmtio.h> 96 #include <sys/fs/autofs.h> 97 98 typedef struct { 99 nfs4_ga_res_t *di_garp; 100 cred_t *di_cred; 101 hrtime_t di_time_call; 102 } dirattr_info_t; 103 104 typedef enum nfs4_acl_op { 105 NFS4_ACL_GET, 106 NFS4_ACL_SET 107 } nfs4_acl_op_t; 108 109 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi); 110 111 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 112 char *, dirattr_info_t *); 113 114 static void nfs4close_otw(rnode4_t *, 
cred_t *, nfs4_open_owner_t *, 115 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 116 nfs4_error_t *, int *); 117 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 118 cred_t *); 119 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 120 stable_how4 *); 121 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 122 cred_t *, bool_t, struct uio *); 123 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 124 vsecattr_t *); 125 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 126 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 127 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 128 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 129 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 130 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 131 int, vnode_t **, cred_t *); 132 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 133 cred_t *, int, int, enum createmode4, int); 134 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 135 caller_context_t *); 136 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 137 vnode_t *, char *, cred_t *, nfsstat4 *); 138 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 139 vnode_t *, char *, cred_t *, nfsstat4 *); 140 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 141 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 142 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 143 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 144 page_t *[], size_t, struct seg *, caddr_t, 145 enum seg_rw, cred_t *); 146 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 147 cred_t *); 148 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 149 int, cred_t *); 150 static int 
nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 151 int, cred_t *); 152 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 153 static void nfs4_set_mod(vnode_t *); 154 static void nfs4_get_commit(vnode_t *); 155 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 156 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 157 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 158 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 159 cred_t *); 160 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 161 cred_t *); 162 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 163 hrtime_t, vnode_t *, cred_t *); 164 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 165 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 166 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 167 u_offset_t); 168 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 169 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 170 static cred_t *state_to_cred(nfs4_open_stream_t *); 171 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 172 static pid_t lo_to_pid(lock_owner4 *); 173 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 174 cred_t *, nfs4_lock_owner_t *); 175 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 176 nfs4_lock_owner_t *); 177 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 178 static void nfs4_delmap_callback(struct as *, void *, uint_t); 179 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 180 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 181 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 182 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 183 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 184 uid_t, gid_t, int); 185 186 /* 187 * Routines that implement the 
setting of v4 args for the misc. ops 188 */ 189 static void nfs4args_lock_free(nfs_argop4 *); 190 static void nfs4args_lockt_free(nfs_argop4 *); 191 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 192 int, rnode4_t *, cred_t *, bitmap4, int *, 193 nfs4_stateid_types_t *); 194 static void nfs4args_setattr_free(nfs_argop4 *); 195 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 196 bitmap4); 197 static void nfs4args_verify_free(nfs_argop4 *); 198 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 199 WRITE4args **, nfs4_stateid_types_t *); 200 201 /* 202 * These are the vnode ops functions that implement the vnode interface to 203 * the networked file system. See more comments below at nfs4_vnodeops. 204 */ 205 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *); 206 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *, 207 caller_context_t *); 208 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 209 caller_context_t *); 210 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 211 caller_context_t *); 212 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 213 caller_context_t *); 214 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 215 caller_context_t *); 216 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *); 217 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *, 218 caller_context_t *); 219 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *); 220 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 221 int, vnode_t **, cred_t *, int, caller_context_t *, 222 vsecattr_t *); 223 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *, 224 int); 225 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *, 226 caller_context_t *, int); 227 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 228 caller_context_t *, int); 
229 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, 230 cred_t *, caller_context_t *, int, vsecattr_t *); 231 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 232 caller_context_t *, int); 233 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 234 cred_t *, caller_context_t *, int); 235 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *, 236 caller_context_t *, int); 237 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 238 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 239 page_t *[], size_t, struct seg *, caddr_t, 240 enum seg_rw, cred_t *, caller_context_t *); 241 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 242 caller_context_t *); 243 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 244 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 245 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 246 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 247 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *); 248 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 249 struct flk_callback *, cred_t *, caller_context_t *); 250 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 251 cred_t *, caller_context_t *); 252 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 253 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 254 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 255 cred_t *, caller_context_t *); 256 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *, 257 caller_context_t *); 258 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 259 caller_context_t *); 260 /* 261 * These vnode ops are required to be called from outside this source file, 262 * e.g. by ephemeral mount stub vnode ops, and so may not be declared 263 * as static. 
264 */ 265 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 266 caller_context_t *); 267 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 268 int nfs4_lookup(vnode_t *, char *, vnode_t **, 269 struct pathname *, int, vnode_t *, cred_t *, 270 caller_context_t *, int *, pathname_t *); 271 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 272 int nfs4_rwlock(vnode_t *, int, caller_context_t *); 273 void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 274 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 275 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 276 caller_context_t *); 277 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 278 caller_context_t *); 279 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 280 caller_context_t *); 281 282 /* 283 * Used for nfs4_commit_vp() to indicate if we should 284 * wait on pending writes. 285 */ 286 #define NFS4_WRITE_NOWAIT 0 287 #define NFS4_WRITE_WAIT 1 288 289 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 290 291 /* 292 * Error flags used to pass information about certain special errors 293 * which need to be handled specially. 294 */ 295 #define NFS_EOF -98 296 #define NFS_VERF_MISMATCH -97 297 298 /* 299 * Flags used to differentiate between which operation drove the 300 * potential CLOSE OTW. 
(see nfs4_close_otw_if_necessary) 301 */ 302 #define NFS4_CLOSE_OP 0x1 303 #define NFS4_DELMAP_OP 0x2 304 #define NFS4_INACTIVE_OP 0x3 305 306 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 307 308 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 309 #define ALIGN64(x, ptr, sz) \ 310 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 311 if (x) { \ 312 x = sizeof (uint64_t) - (x); \ 313 sz -= (x); \ 314 ptr += (x); \ 315 } 316 317 #ifdef DEBUG 318 int nfs4_client_attr_debug = 0; 319 int nfs4_client_state_debug = 0; 320 int nfs4_client_shadow_debug = 0; 321 int nfs4_client_lock_debug = 0; 322 int nfs4_seqid_sync = 0; 323 int nfs4_client_map_debug = 0; 324 static int nfs4_pageio_debug = 0; 325 int nfs4_client_inactive_debug = 0; 326 int nfs4_client_recov_debug = 0; 327 int nfs4_client_failover_debug = 0; 328 int nfs4_client_call_debug = 0; 329 int nfs4_client_lookup_debug = 0; 330 int nfs4_client_zone_debug = 0; 331 int nfs4_lost_rqst_debug = 0; 332 int nfs4_rdattrerr_debug = 0; 333 int nfs4_open_stream_debug = 0; 334 335 int nfs4read_error_inject; 336 337 static int nfs4_create_misses = 0; 338 339 static int nfs4_readdir_cache_shorts = 0; 340 static int nfs4_readdir_readahead = 0; 341 342 static int nfs4_bio_do_stop = 0; 343 344 static int nfs4_lostpage = 0; /* number of times we lost original page */ 345 346 int nfs4_mmap_debug = 0; 347 348 static int nfs4_pathconf_cache_hits = 0; 349 static int nfs4_pathconf_cache_misses = 0; 350 351 int nfs4close_all_cnt; 352 int nfs4close_one_debug = 0; 353 int nfs4close_notw_debug = 0; 354 355 int denied_to_flk_debug = 0; 356 void *lockt_denied_debug; 357 358 #endif 359 360 /* 361 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 362 * or NFS4ERR_RESOURCE. 363 */ 364 static int confirm_retry_sec = 30; 365 366 static int nfs4_lookup_neg_cache = 1; 367 368 /* 369 * number of pages to read ahead 370 * optimized for 100 base-T. 
371 */ 372 static int nfs4_nra = 4; 373 374 static int nfs4_do_symlink_cache = 1; 375 376 static int nfs4_pathconf_disable_cache = 0; 377 378 /* 379 * These are the vnode ops routines which implement the vnode interface to 380 * the networked file system. These routines just take their parameters, 381 * make them look networkish by putting the right info into interface structs, 382 * and then calling the appropriate remote routine(s) to do the work. 383 * 384 * Note on directory name lookup cacheing: If we detect a stale fhandle, 385 * we purge the directory cache relative to that vnode. This way, the 386 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 387 * more details on rnode locking. 388 */ 389 390 struct vnodeops *nfs4_vnodeops; 391 392 const fs_operation_def_t nfs4_vnodeops_template[] = { 393 VOPNAME_OPEN, { .vop_open = nfs4_open }, 394 VOPNAME_CLOSE, { .vop_close = nfs4_close }, 395 VOPNAME_READ, { .vop_read = nfs4_read }, 396 VOPNAME_WRITE, { .vop_write = nfs4_write }, 397 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl }, 398 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr }, 399 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr }, 400 VOPNAME_ACCESS, { .vop_access = nfs4_access }, 401 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup }, 402 VOPNAME_CREATE, { .vop_create = nfs4_create }, 403 VOPNAME_REMOVE, { .vop_remove = nfs4_remove }, 404 VOPNAME_LINK, { .vop_link = nfs4_link }, 405 VOPNAME_RENAME, { .vop_rename = nfs4_rename }, 406 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir }, 407 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir }, 408 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir }, 409 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink }, 410 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink }, 411 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync }, 412 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 413 VOPNAME_FID, { .vop_fid = nfs4_fid }, 414 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 415 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 416 
VOPNAME_SEEK, { .vop_seek = nfs4_seek }, 417 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock }, 418 VOPNAME_SPACE, { .vop_space = nfs4_space }, 419 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 420 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage }, 421 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage }, 422 VOPNAME_MAP, { .vop_map = nfs4_map }, 423 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap }, 424 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap }, 425 /* no separate nfs4_dump */ 426 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 427 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 428 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio }, 429 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose }, 430 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr }, 431 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 432 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock }, 433 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 434 NULL, NULL 435 }; 436 437 /* 438 * The following are subroutines and definitions to set args or get res 439 * for the different nfsv4 ops 440 */ 441 442 void 443 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 444 { 445 int i; 446 447 for (i = 0; i < arglen; i++) { 448 if (argop[i].argop == OP_LOOKUP) { 449 kmem_free( 450 argop[i].nfs_argop4_u.oplookup. 451 objname.utf8string_val, 452 argop[i].nfs_argop4_u.oplookup. 
453 objname.utf8string_len); 454 } 455 } 456 } 457 458 static void 459 nfs4args_lock_free(nfs_argop4 *argop) 460 { 461 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 462 463 if (locker->new_lock_owner == TRUE) { 464 open_to_lock_owner4 *open_owner; 465 466 open_owner = &locker->locker4_u.open_owner; 467 if (open_owner->lock_owner.owner_val != NULL) { 468 kmem_free(open_owner->lock_owner.owner_val, 469 open_owner->lock_owner.owner_len); 470 } 471 } 472 } 473 474 static void 475 nfs4args_lockt_free(nfs_argop4 *argop) 476 { 477 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 478 479 if (lowner->owner_val != NULL) { 480 kmem_free(lowner->owner_val, lowner->owner_len); 481 } 482 } 483 484 static void 485 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 486 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 487 nfs4_stateid_types_t *sid_types) 488 { 489 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 490 mntinfo4_t *mi; 491 492 argop->argop = OP_SETATTR; 493 /* 494 * The stateid is set to 0 if client is not modifying the size 495 * and otherwise to whatever nfs4_get_stateid() returns. 496 * 497 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 498 * state struct could be found for the process/file pair. We may 499 * want to change this in the future (by OPENing the file). See 500 * bug # 4474852. 
501 */ 502 if (vap->va_mask & AT_SIZE) { 503 504 ASSERT(rp != NULL); 505 mi = VTOMI4(RTOV4(rp)); 506 507 argop->nfs_argop4_u.opsetattr.stateid = 508 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 509 OP_SETATTR, sid_types, FALSE); 510 } else { 511 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 512 sizeof (stateid4)); 513 } 514 515 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 516 if (*error) 517 bzero(attr, sizeof (*attr)); 518 } 519 520 static void 521 nfs4args_setattr_free(nfs_argop4 *argop) 522 { 523 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 524 } 525 526 static int 527 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 528 bitmap4 supp) 529 { 530 fattr4 *attr; 531 int error = 0; 532 533 argop->argop = op; 534 switch (op) { 535 case OP_VERIFY: 536 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 537 break; 538 case OP_NVERIFY: 539 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 540 break; 541 default: 542 return (EINVAL); 543 } 544 if (!error) 545 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 546 if (error) 547 bzero(attr, sizeof (*attr)); 548 return (error); 549 } 550 551 static void 552 nfs4args_verify_free(nfs_argop4 *argop) 553 { 554 switch (argop->argop) { 555 case OP_VERIFY: 556 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 557 break; 558 case OP_NVERIFY: 559 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 560 break; 561 default: 562 break; 563 } 564 } 565 566 static void 567 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 568 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 569 { 570 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 571 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 572 573 argop->argop = OP_WRITE; 574 wargs->stable = stable; 575 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 576 mi, OP_WRITE, sid_tp); 577 wargs->mblk = NULL; 578 *wargs_pp = wargs; 579 } 580 581 void 582 
nfs4args_copen_free(OPEN4cargs *open_args) 583 { 584 if (open_args->owner.owner_val) { 585 kmem_free(open_args->owner.owner_val, 586 open_args->owner.owner_len); 587 } 588 if ((open_args->opentype == OPEN4_CREATE) && 589 (open_args->mode != EXCLUSIVE4)) { 590 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 591 } 592 } 593 594 /* 595 * XXX: This is referenced in modstubs.s 596 */ 597 struct vnodeops * 598 nfs4_getvnodeops(void) 599 { 600 return (nfs4_vnodeops); 601 } 602 603 /* 604 * The OPEN operation opens a regular file. 605 */ 606 /*ARGSUSED3*/ 607 static int 608 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 609 { 610 vnode_t *dvp = NULL; 611 rnode4_t *rp, *drp; 612 int error; 613 int just_been_created; 614 char fn[MAXNAMELEN]; 615 616 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 617 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 618 return (EIO); 619 rp = VTOR4(*vpp); 620 621 /* 622 * Check to see if opening something besides a regular file; 623 * if so skip the OTW call 624 */ 625 if ((*vpp)->v_type != VREG) { 626 error = nfs4_open_non_reg_file(vpp, flag, cr); 627 return (error); 628 } 629 630 /* 631 * XXX - would like a check right here to know if the file is 632 * executable or not, so as to skip OTW 633 */ 634 635 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 636 return (error); 637 638 drp = VTOR4(dvp); 639 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 640 return (EINTR); 641 642 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 643 nfs_rw_exit(&drp->r_rwlock); 644 return (error); 645 } 646 647 /* 648 * See if this file has just been CREATEd. 649 * If so, clear the flag and update the dnlc, which was previously 650 * skipped in nfs4_create. 651 * XXX need better serilization on this. 652 * XXX move this into the nf4open_otw call, after we have 653 * XXX acquired the open owner seqid sync. 
654 */ 655 mutex_enter(&rp->r_statev4_lock); 656 if (rp->created_v4) { 657 rp->created_v4 = 0; 658 mutex_exit(&rp->r_statev4_lock); 659 660 dnlc_update(dvp, fn, *vpp); 661 /* This is needed so we don't bump the open ref count */ 662 just_been_created = 1; 663 } else { 664 mutex_exit(&rp->r_statev4_lock); 665 just_been_created = 0; 666 } 667 668 /* 669 * If caller specified O_TRUNC/FTRUNC, then be sure to set 670 * FWRITE (to drive successful setattr(size=0) after open) 671 */ 672 if (flag & FTRUNC) 673 flag |= FWRITE; 674 675 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 676 just_been_created); 677 678 if (!error && !((*vpp)->v_flag & VROOT)) 679 dnlc_update(dvp, fn, *vpp); 680 681 nfs_rw_exit(&drp->r_rwlock); 682 683 /* release the hold from vtodv */ 684 VN_RELE(dvp); 685 686 /* exchange the shadow for the master vnode, if needed */ 687 688 if (error == 0 && IS_SHADOW(*vpp, rp)) 689 sv_exchange(vpp); 690 691 return (error); 692 } 693 694 /* 695 * See if there's a "lost open" request to be saved and recovered. 696 */ 697 static void 698 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 699 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 700 vnode_t *dvp, OPEN4cargs *open_args) 701 { 702 vfs_t *vfsp; 703 char *srccfp; 704 705 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 706 707 if (error != ETIMEDOUT && error != EINTR && 708 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 709 lost_rqstp->lr_op = 0; 710 return; 711 } 712 713 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 714 "nfs4open_save_lost_rqst: error %d", error)); 715 716 lost_rqstp->lr_op = OP_OPEN; 717 718 /* 719 * The vp (if it is not NULL) and dvp are held and rele'd via 720 * the recovery code. See nfs4_save_lost_rqst. 
721 */ 722 lost_rqstp->lr_vp = vp; 723 lost_rqstp->lr_dvp = dvp; 724 lost_rqstp->lr_oop = oop; 725 lost_rqstp->lr_osp = NULL; 726 lost_rqstp->lr_lop = NULL; 727 lost_rqstp->lr_cr = cr; 728 lost_rqstp->lr_flk = NULL; 729 lost_rqstp->lr_oacc = open_args->share_access; 730 lost_rqstp->lr_odeny = open_args->share_deny; 731 lost_rqstp->lr_oclaim = open_args->claim; 732 if (open_args->claim == CLAIM_DELEGATE_CUR) { 733 lost_rqstp->lr_ostateid = 734 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 735 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 736 } else { 737 srccfp = open_args->open_claim4_u.cfile; 738 } 739 lost_rqstp->lr_ofile.utf8string_len = 0; 740 lost_rqstp->lr_ofile.utf8string_val = NULL; 741 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 742 lost_rqstp->lr_putfirst = FALSE; 743 } 744 745 struct nfs4_excl_time { 746 uint32 seconds; 747 uint32 nseconds; 748 }; 749 750 /* 751 * The OPEN operation creates and/or opens a regular file 752 * 753 * ARGSUSED 754 */ 755 static int 756 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 757 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 758 enum createmode4 createmode, int file_just_been_created) 759 { 760 rnode4_t *rp; 761 rnode4_t *drp = VTOR4(dvp); 762 vnode_t *vp = NULL; 763 vnode_t *vpi = *vpp; 764 bool_t needrecov = FALSE; 765 766 int doqueue = 1; 767 768 COMPOUND4args_clnt args; 769 COMPOUND4res_clnt res; 770 nfs_argop4 *argop; 771 nfs_resop4 *resop; 772 int argoplist_size; 773 int idx_open, idx_fattr; 774 775 GETFH4res *gf_res = NULL; 776 OPEN4res *op_res = NULL; 777 nfs4_ga_res_t *garp; 778 fattr4 *attr = NULL; 779 struct nfs4_excl_time verf; 780 bool_t did_excl_setup = FALSE; 781 int created_osp; 782 783 OPEN4cargs *open_args; 784 nfs4_open_owner_t *oop = NULL; 785 nfs4_open_stream_t *osp = NULL; 786 seqid4 seqid = 0; 787 bool_t retry_open = FALSE; 788 nfs4_recov_state_t recov_state; 789 nfs4_lost_rqst_t lost_rqst; 790 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 791 hrtime_t t; 792 int acc = 0; 793 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 794 cred_t *ncr = NULL; 795 796 nfs4_sharedfh_t *otw_sfh; 797 nfs4_sharedfh_t *orig_sfh; 798 int fh_differs = 0; 799 int numops, setgid_flag; 800 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 801 802 /* 803 * Make sure we properly deal with setting the right gid on 804 * a newly created file to reflect the parent's setgid bit 805 */ 806 setgid_flag = 0; 807 if (create_flag && in_va) { 808 809 /* 810 * If there is grpid mount flag used or 811 * the parent's directory has the setgid bit set 812 * _and_ the client was able to get a valid mapping 813 * for the parent dir's owner_group, we want to 814 * append NVERIFY(owner_group == dva.va_gid) and 815 * SETATTR to the CREATE compound. 816 */ 817 mutex_enter(&drp->r_statelock); 818 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID || 819 drp->r_attr.va_mode & VSGID) && 820 drp->r_attr.va_gid != GID_NOBODY) { 821 in_va->va_mask |= AT_GID; 822 in_va->va_gid = drp->r_attr.va_gid; 823 setgid_flag = 1; 824 } 825 mutex_exit(&drp->r_statelock); 826 } 827 828 /* 829 * Normal/non-create compound: 830 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 831 * 832 * Open(create) compound no setgid: 833 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 834 * RESTOREFH + GETATTR 835 * 836 * Open(create) setgid: 837 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 838 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 839 * NVERIFY(grp) + SETATTR 840 */ 841 if (setgid_flag) { 842 numops = 10; 843 idx_open = 1; 844 idx_fattr = 3; 845 } else if (create_flag) { 846 numops = 7; 847 idx_open = 2; 848 idx_fattr = 4; 849 } else { 850 numops = 4; 851 idx_open = 1; 852 idx_fattr = 3; 853 } 854 855 args.array_len = numops; 856 argoplist_size = numops * sizeof (nfs_argop4); 857 argop = kmem_alloc(argoplist_size, KM_SLEEP); 858 859 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 860 "open %s open flag 0x%x cred 
%p", file_name, open_flag, 861 (void *)cr)); 862 863 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 864 if (create_flag) { 865 /* 866 * We are to create a file. Initialize the passed in vnode 867 * pointer. 868 */ 869 vpi = NULL; 870 } else { 871 /* 872 * Check to see if the client owns a read delegation and is 873 * trying to open for write. If so, then return the delegation 874 * to avoid the server doing a cb_recall and returning DELAY. 875 * NB - we don't use the statev4_lock here because we'd have 876 * to drop the lock anyway and the result would be stale. 877 */ 878 if ((open_flag & FWRITE) && 879 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 880 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 881 882 /* 883 * If the file has a delegation, then do an access check up 884 * front. This avoids having to an access check later after 885 * we've already done start_op, which could deadlock. 886 */ 887 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 888 if (open_flag & FREAD && 889 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0) 890 acc |= VREAD; 891 if (open_flag & FWRITE && 892 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0) 893 acc |= VWRITE; 894 } 895 } 896 897 drp = VTOR4(dvp); 898 899 recov_state.rs_flags = 0; 900 recov_state.rs_num_retry_despite_err = 0; 901 cred_otw = cr; 902 903 recov_retry: 904 fh_differs = 0; 905 nfs4_error_zinit(&e); 906 907 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 908 if (e.error) { 909 if (ncr != NULL) 910 crfree(ncr); 911 kmem_free(argop, argoplist_size); 912 return (e.error); 913 } 914 915 args.ctag = TAG_OPEN; 916 args.array_len = numops; 917 args.array = argop; 918 919 /* putfh directory fh */ 920 argop[0].argop = OP_CPUTFH; 921 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 922 923 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 924 argop[idx_open].argop = OP_COPEN; 925 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 926 open_args->claim = CLAIM_NULL; 927 928 /* name of file */ 929 
open_args->open_claim4_u.cfile = file_name; 930 open_args->owner.owner_len = 0; 931 open_args->owner.owner_val = NULL; 932 933 if (create_flag) { 934 /* CREATE a file */ 935 open_args->opentype = OPEN4_CREATE; 936 open_args->mode = createmode; 937 if (createmode == EXCLUSIVE4) { 938 if (did_excl_setup == FALSE) { 939 verf.seconds = zone_get_hostid(NULL); 940 if (verf.seconds != 0) 941 verf.nseconds = newnum(); 942 else { 943 timestruc_t now; 944 945 gethrestime(&now); 946 verf.seconds = now.tv_sec; 947 verf.nseconds = now.tv_nsec; 948 } 949 /* 950 * Since the server will use this value for the 951 * mtime, make sure that it can't overflow. Zero 952 * out the MSB. The actual value does not matter 953 * here, only its uniqeness. 954 */ 955 verf.seconds &= INT32_MAX; 956 did_excl_setup = TRUE; 957 } 958 959 /* Now copy over verifier to OPEN4args. */ 960 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 961 } else { 962 int v_error; 963 bitmap4 supp_attrs; 964 servinfo4_t *svp; 965 966 attr = &open_args->createhow4_u.createattrs; 967 968 svp = drp->r_server; 969 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 970 supp_attrs = svp->sv_supp_attrs; 971 nfs_rw_exit(&svp->sv_lock); 972 973 /* GUARDED4 or UNCHECKED4 */ 974 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 975 supp_attrs); 976 if (v_error) { 977 bzero(attr, sizeof (*attr)); 978 nfs4args_copen_free(open_args); 979 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 980 &recov_state, FALSE); 981 if (ncr != NULL) 982 crfree(ncr); 983 kmem_free(argop, argoplist_size); 984 return (v_error); 985 } 986 } 987 } else { 988 /* NO CREATE */ 989 open_args->opentype = OPEN4_NOCREATE; 990 } 991 992 if (recov_state.rs_sp != NULL) { 993 mutex_enter(&recov_state.rs_sp->s_lock); 994 open_args->owner.clientid = recov_state.rs_sp->clientid; 995 mutex_exit(&recov_state.rs_sp->s_lock); 996 } else { 997 /* XXX should we just fail here? 
*/ 998 open_args->owner.clientid = 0; 999 } 1000 1001 /* 1002 * This increments oop's ref count or creates a temporary 'just_created' 1003 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 1004 * completes. 1005 */ 1006 mutex_enter(&VTOMI4(dvp)->mi_lock); 1007 1008 /* See if a permanent or just created open owner exists */ 1009 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 1010 if (!oop) { 1011 /* 1012 * This open owner does not exist so create a temporary 1013 * just created one. 1014 */ 1015 oop = create_open_owner(cr, VTOMI4(dvp)); 1016 ASSERT(oop != NULL); 1017 } 1018 mutex_exit(&VTOMI4(dvp)->mi_lock); 1019 1020 /* this length never changes, do alloc before seqid sync */ 1021 open_args->owner.owner_len = sizeof (oop->oo_name); 1022 open_args->owner.owner_val = 1023 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1024 1025 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 1026 if (e.error == EAGAIN) { 1027 open_owner_rele(oop); 1028 nfs4args_copen_free(open_args); 1029 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1030 if (ncr != NULL) { 1031 crfree(ncr); 1032 ncr = NULL; 1033 } 1034 goto recov_retry; 1035 } 1036 1037 /* Check to see if we need to do the OTW call */ 1038 if (!create_flag) { 1039 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1040 file_just_been_created, &e.error, acc, &recov_state)) { 1041 1042 /* 1043 * The OTW open is not necessary. Either 1044 * the open can succeed without it (eg. 1045 * delegation, error == 0) or the open 1046 * must fail due to an access failure 1047 * (error != 0). In either case, tidy 1048 * up and return. 
1049 */ 1050 1051 nfs4_end_open_seqid_sync(oop); 1052 open_owner_rele(oop); 1053 nfs4args_copen_free(open_args); 1054 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1055 if (ncr != NULL) 1056 crfree(ncr); 1057 kmem_free(argop, argoplist_size); 1058 return (e.error); 1059 } 1060 } 1061 1062 bcopy(&oop->oo_name, open_args->owner.owner_val, 1063 open_args->owner.owner_len); 1064 1065 seqid = nfs4_get_open_seqid(oop) + 1; 1066 open_args->seqid = seqid; 1067 open_args->share_access = 0; 1068 if (open_flag & FREAD) 1069 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1070 if (open_flag & FWRITE) 1071 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1072 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1073 1074 1075 1076 /* 1077 * getfh w/sanity check for idx_open/idx_fattr 1078 */ 1079 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1080 argop[idx_open + 1].argop = OP_GETFH; 1081 1082 /* getattr */ 1083 argop[idx_fattr].argop = OP_GETATTR; 1084 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1085 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1086 1087 if (setgid_flag) { 1088 vattr_t _v; 1089 servinfo4_t *svp; 1090 bitmap4 supp_attrs; 1091 1092 svp = drp->r_server; 1093 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1094 supp_attrs = svp->sv_supp_attrs; 1095 nfs_rw_exit(&svp->sv_lock); 1096 1097 /* 1098 * For setgid case, we need to: 1099 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1100 */ 1101 argop[4].argop = OP_SAVEFH; 1102 1103 argop[5].argop = OP_CPUTFH; 1104 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1105 1106 argop[6].argop = OP_GETATTR; 1107 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1108 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1109 1110 argop[7].argop = OP_RESTOREFH; 1111 1112 /* 1113 * nverify 1114 */ 1115 _v.va_mask = AT_GID; 1116 _v.va_gid = in_va->va_gid; 1117 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1118 supp_attrs))) { 1119 1120 /* 
1121 * setattr 1122 * 1123 * We _know_ we're not messing with AT_SIZE or 1124 * AT_XTIME, so no need for stateid or flags. 1125 * Also we specify NULL rp since we're only 1126 * interested in setting owner_group attributes. 1127 */ 1128 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1129 supp_attrs, &e.error, 0); 1130 if (e.error) 1131 nfs4args_verify_free(&argop[8]); 1132 } 1133 1134 if (e.error) { 1135 /* 1136 * XXX - Revisit the last argument to nfs4_end_op() 1137 * once 5020486 is fixed. 1138 */ 1139 nfs4_end_open_seqid_sync(oop); 1140 open_owner_rele(oop); 1141 nfs4args_copen_free(open_args); 1142 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1143 if (ncr != NULL) 1144 crfree(ncr); 1145 kmem_free(argop, argoplist_size); 1146 return (e.error); 1147 } 1148 } else if (create_flag) { 1149 argop[1].argop = OP_SAVEFH; 1150 1151 argop[5].argop = OP_RESTOREFH; 1152 1153 argop[6].argop = OP_GETATTR; 1154 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1155 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1156 } 1157 1158 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1159 "nfs4open_otw: %s call, nm %s, rp %s", 1160 needrecov ? 
"recov" : "first", file_name, 1161 rnode4info(VTOR4(dvp)))); 1162 1163 t = gethrtime(); 1164 1165 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1166 1167 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1168 nfs4_set_open_seqid(seqid, oop, args.ctag); 1169 1170 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1171 1172 if (e.error || needrecov) { 1173 bool_t abort = FALSE; 1174 1175 if (needrecov) { 1176 nfs4_bseqid_entry_t *bsep = NULL; 1177 1178 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1179 cred_otw, vpi, dvp, open_args); 1180 1181 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1182 bsep = nfs4_create_bseqid_entry(oop, NULL, 1183 vpi, 0, args.ctag, open_args->seqid); 1184 num_bseqid_retry--; 1185 } 1186 1187 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1188 NULL, lost_rqst.lr_op == OP_OPEN ? 1189 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL); 1190 1191 if (bsep) 1192 kmem_free(bsep, sizeof (*bsep)); 1193 /* give up if we keep getting BAD_SEQID */ 1194 if (num_bseqid_retry == 0) 1195 abort = TRUE; 1196 if (abort == TRUE && e.error == 0) 1197 e.error = geterrno4(res.status); 1198 } 1199 nfs4_end_open_seqid_sync(oop); 1200 open_owner_rele(oop); 1201 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1202 nfs4args_copen_free(open_args); 1203 if (setgid_flag) { 1204 nfs4args_verify_free(&argop[8]); 1205 nfs4args_setattr_free(&argop[9]); 1206 } 1207 if (!e.error) 1208 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1209 if (ncr != NULL) { 1210 crfree(ncr); 1211 ncr = NULL; 1212 } 1213 if (!needrecov || abort == TRUE || e.error == EINTR || 1214 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1215 kmem_free(argop, argoplist_size); 1216 return (e.error); 1217 } 1218 goto recov_retry; 1219 } 1220 1221 /* 1222 * Will check and update lease after checking the rflag for 1223 * OPEN_CONFIRM in the successful OPEN call. 
1224 */ 1225 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1226 1227 /* 1228 * XXX what if we're crossing mount points from server1:/drp 1229 * to server2:/drp/rp. 1230 */ 1231 1232 /* Signal our end of use of the open seqid */ 1233 nfs4_end_open_seqid_sync(oop); 1234 1235 /* 1236 * This will destroy the open owner if it was just created, 1237 * and no one else has put a reference on it. 1238 */ 1239 open_owner_rele(oop); 1240 if (create_flag && (createmode != EXCLUSIVE4) && 1241 res.status == NFS4ERR_BADOWNER) 1242 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1243 1244 e.error = geterrno4(res.status); 1245 nfs4args_copen_free(open_args); 1246 if (setgid_flag) { 1247 nfs4args_verify_free(&argop[8]); 1248 nfs4args_setattr_free(&argop[9]); 1249 } 1250 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1251 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1252 /* 1253 * If the reply is NFS4ERR_ACCESS, it may be because 1254 * we are root (no root net access). If the real uid 1255 * is not root, then retry with the real uid instead. 
1256 */ 1257 if (ncr != NULL) { 1258 crfree(ncr); 1259 ncr = NULL; 1260 } 1261 if (res.status == NFS4ERR_ACCESS && 1262 (ncr = crnetadjust(cred_otw)) != NULL) { 1263 cred_otw = ncr; 1264 goto recov_retry; 1265 } 1266 kmem_free(argop, argoplist_size); 1267 return (e.error); 1268 } 1269 1270 resop = &res.array[idx_open]; /* open res */ 1271 op_res = &resop->nfs_resop4_u.opopen; 1272 1273 #ifdef DEBUG 1274 /* 1275 * verify attrset bitmap 1276 */ 1277 if (create_flag && 1278 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1279 /* make sure attrset returned is what we asked for */ 1280 /* XXX Ignore this 'error' for now */ 1281 if (attr->attrmask != op_res->attrset) 1282 /* EMPTY */; 1283 } 1284 #endif 1285 1286 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1287 mutex_enter(&VTOMI4(dvp)->mi_lock); 1288 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1289 mutex_exit(&VTOMI4(dvp)->mi_lock); 1290 } 1291 1292 resop = &res.array[idx_open + 1]; /* getfh res */ 1293 gf_res = &resop->nfs_resop4_u.opgetfh; 1294 1295 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1296 1297 /* 1298 * The open stateid has been updated on the server but not 1299 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1300 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1301 * WRITE call. That, however, will use the old stateid, so go ahead 1302 * and upate the open stateid now, before any call to makenfs4node. 1303 */ 1304 if (vpi) { 1305 nfs4_open_stream_t *tmp_osp; 1306 rnode4_t *tmp_rp = VTOR4(vpi); 1307 1308 tmp_osp = find_open_stream(oop, tmp_rp); 1309 if (tmp_osp) { 1310 tmp_osp->open_stateid = op_res->stateid; 1311 mutex_exit(&tmp_osp->os_sync_lock); 1312 open_stream_rele(tmp_osp, tmp_rp); 1313 } 1314 1315 /* 1316 * We must determine if the file handle given by the otw open 1317 * is the same as the file handle which was passed in with 1318 * *vpp. 
This case can be reached if the file we are trying 1319 * to open has been removed and another file has been created 1320 * having the same file name. The passed in vnode is released 1321 * later. 1322 */ 1323 orig_sfh = VTOR4(vpi)->r_fh; 1324 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1325 } 1326 1327 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1328 1329 if (create_flag || fh_differs) { 1330 int rnode_err = 0; 1331 1332 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1333 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh)); 1334 1335 if (e.error) 1336 PURGE_ATTRCACHE4(vp); 1337 /* 1338 * For the newly created vp case, make sure the rnode 1339 * isn't bad before using it. 1340 */ 1341 mutex_enter(&(VTOR4(vp))->r_statelock); 1342 if (VTOR4(vp)->r_flags & R4RECOVERR) 1343 rnode_err = EIO; 1344 mutex_exit(&(VTOR4(vp))->r_statelock); 1345 1346 if (rnode_err) { 1347 nfs4_end_open_seqid_sync(oop); 1348 nfs4args_copen_free(open_args); 1349 if (setgid_flag) { 1350 nfs4args_verify_free(&argop[8]); 1351 nfs4args_setattr_free(&argop[9]); 1352 } 1353 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1354 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1355 needrecov); 1356 open_owner_rele(oop); 1357 VN_RELE(vp); 1358 if (ncr != NULL) 1359 crfree(ncr); 1360 sfh4_rele(&otw_sfh); 1361 kmem_free(argop, argoplist_size); 1362 return (EIO); 1363 } 1364 } else { 1365 vp = vpi; 1366 } 1367 sfh4_rele(&otw_sfh); 1368 1369 /* 1370 * It seems odd to get a full set of attrs and then not update 1371 * the object's attrcache in the non-create case. Create case uses 1372 * the attrs since makenfs4node checks to see if the attrs need to 1373 * be updated (and then updates them). The non-create case should 1374 * update attrs also. 1375 */ 1376 if (! create_flag && ! 
fh_differs && !e.error) { 1377 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1378 } 1379 1380 nfs4_error_zinit(&e); 1381 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1382 /* This does not do recovery for vp explicitly. */ 1383 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1384 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1385 1386 if (e.error || e.stat) { 1387 nfs4_end_open_seqid_sync(oop); 1388 nfs4args_copen_free(open_args); 1389 if (setgid_flag) { 1390 nfs4args_verify_free(&argop[8]); 1391 nfs4args_setattr_free(&argop[9]); 1392 } 1393 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1394 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1395 needrecov); 1396 open_owner_rele(oop); 1397 if (create_flag || fh_differs) { 1398 /* rele the makenfs4node */ 1399 VN_RELE(vp); 1400 } 1401 if (ncr != NULL) { 1402 crfree(ncr); 1403 ncr = NULL; 1404 } 1405 if (retry_open == TRUE) { 1406 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1407 "nfs4open_otw: retry the open since OPEN " 1408 "CONFIRM failed with error %d stat %d", 1409 e.error, e.stat)); 1410 if (create_flag && createmode == GUARDED4) { 1411 NFS4_DEBUG(nfs4_client_recov_debug, 1412 (CE_NOTE, "nfs4open_otw: switch " 1413 "createmode from GUARDED4 to " 1414 "UNCHECKED4")); 1415 createmode = UNCHECKED4; 1416 } 1417 goto recov_retry; 1418 } 1419 if (!e.error) { 1420 if (create_flag && (createmode != EXCLUSIVE4) && 1421 e.stat == NFS4ERR_BADOWNER) 1422 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1423 1424 e.error = geterrno4(e.stat); 1425 } 1426 kmem_free(argop, argoplist_size); 1427 return (e.error); 1428 } 1429 } 1430 1431 rp = VTOR4(vp); 1432 1433 mutex_enter(&rp->r_statev4_lock); 1434 if (create_flag) 1435 rp->created_v4 = 1; 1436 mutex_exit(&rp->r_statev4_lock); 1437 1438 mutex_enter(&oop->oo_lock); 1439 /* Doesn't matter if 'oo_just_created' already was set as this */ 1440 oop->oo_just_created = NFS4_PERM_CREATED; 1441 if (oop->oo_cred_otw) 1442 crfree(oop->oo_cred_otw); 1443 
oop->oo_cred_otw = cred_otw; 1444 crhold(oop->oo_cred_otw); 1445 mutex_exit(&oop->oo_lock); 1446 1447 /* returns with 'os_sync_lock' held */ 1448 osp = find_or_create_open_stream(oop, rp, &created_osp); 1449 if (!osp) { 1450 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1451 "nfs4open_otw: failed to create an open stream")); 1452 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1453 "signal our end of use of the open seqid")); 1454 1455 nfs4_end_open_seqid_sync(oop); 1456 open_owner_rele(oop); 1457 nfs4args_copen_free(open_args); 1458 if (setgid_flag) { 1459 nfs4args_verify_free(&argop[8]); 1460 nfs4args_setattr_free(&argop[9]); 1461 } 1462 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1463 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1464 if (create_flag || fh_differs) 1465 VN_RELE(vp); 1466 if (ncr != NULL) 1467 crfree(ncr); 1468 1469 kmem_free(argop, argoplist_size); 1470 return (EINVAL); 1471 1472 } 1473 1474 osp->open_stateid = op_res->stateid; 1475 1476 if (open_flag & FREAD) 1477 osp->os_share_acc_read++; 1478 if (open_flag & FWRITE) 1479 osp->os_share_acc_write++; 1480 osp->os_share_deny_none++; 1481 1482 /* 1483 * Need to reset this bitfield for the possible case where we were 1484 * going to OTW CLOSE the file, got a non-recoverable error, and before 1485 * we could retry the CLOSE, OPENed the file again. 
1486 */ 1487 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1488 osp->os_final_close = 0; 1489 osp->os_force_close = 0; 1490 #ifdef DEBUG 1491 if (osp->os_failed_reopen) 1492 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1493 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1494 (void *)osp, (void *)cr, rnode4info(rp))); 1495 #endif 1496 osp->os_failed_reopen = 0; 1497 1498 mutex_exit(&osp->os_sync_lock); 1499 1500 nfs4_end_open_seqid_sync(oop); 1501 1502 if (created_osp && recov_state.rs_sp != NULL) { 1503 mutex_enter(&recov_state.rs_sp->s_lock); 1504 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1505 mutex_exit(&recov_state.rs_sp->s_lock); 1506 } 1507 1508 /* get rid of our reference to find oop */ 1509 open_owner_rele(oop); 1510 1511 open_stream_rele(osp, rp); 1512 1513 /* accept delegation, if any */ 1514 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1515 1516 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1517 1518 if (createmode == EXCLUSIVE4 && 1519 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1520 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1521 " EXCLUSIVE4: sending a SETATTR")); 1522 /* 1523 * If doing an exclusive create, then generate 1524 * a SETATTR to set the initial attributes. 1525 * Try to set the mtime and the atime to the 1526 * server's current time. It is somewhat 1527 * expected that these fields will be used to 1528 * store the exclusive create cookie. If not, 1529 * server implementors will need to know that 1530 * a SETATTR will follow an exclusive create 1531 * and the cookie should be destroyed if 1532 * appropriate. 1533 * 1534 * The AT_GID and AT_SIZE bits are turned off 1535 * so that the SETATTR request will not attempt 1536 * to process these. The gid will be set 1537 * separately if appropriate. 
The size is turned 1538 * off because it is assumed that a new file will 1539 * be created empty and if the file wasn't empty, 1540 * then the exclusive create will have failed 1541 * because the file must have existed already. 1542 * Therefore, no truncate operation is needed. 1543 */ 1544 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1545 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1546 1547 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1548 if (e.error) { 1549 /* 1550 * Couldn't correct the attributes of 1551 * the newly created file and the 1552 * attributes are wrong. Remove the 1553 * file and return an error to the 1554 * application. 1555 */ 1556 /* XXX will this take care of client state ? */ 1557 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1558 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1559 " remove file", e.error)); 1560 VN_RELE(vp); 1561 (void) nfs4_remove(dvp, file_name, cr, NULL, 0); 1562 /* 1563 * Since we've reled the vnode and removed 1564 * the file we now need to return the error. 1565 * At this point we don't want to update the 1566 * dircaches, call nfs4_waitfor_purge_complete 1567 * or set vpp to vp so we need to skip these 1568 * as well. 1569 */ 1570 goto skip_update_dircaches; 1571 } 1572 } 1573 1574 /* 1575 * If we created or found the correct vnode, due to create_flag or 1576 * fh_differs being set, then update directory cache attribute, readdir 1577 * and dnlc caches. 1578 */ 1579 if (create_flag || fh_differs) { 1580 dirattr_info_t dinfo, *dinfop; 1581 1582 /* 1583 * Make sure getattr succeeded before using results. 1584 * note: op 7 is getattr(dir) for both flavors of 1585 * open(create). 
1586 */ 1587 if (create_flag && res.status == NFS4_OK) { 1588 dinfo.di_time_call = t; 1589 dinfo.di_cred = cr; 1590 dinfo.di_garp = 1591 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1592 dinfop = &dinfo; 1593 } else { 1594 dinfop = NULL; 1595 } 1596 1597 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1598 dinfop); 1599 } 1600 1601 /* 1602 * If the page cache for this file was flushed from actions 1603 * above, it was done asynchronously and if that is true, 1604 * there is a need to wait here for it to complete. This must 1605 * be done outside of start_fop/end_fop. 1606 */ 1607 (void) nfs4_waitfor_purge_complete(vp); 1608 1609 /* 1610 * It is implicit that we are in the open case (create_flag == 0) since 1611 * fh_differs can only be set to a non-zero value in the open case. 1612 */ 1613 if (fh_differs != 0 && vpi != NULL) 1614 VN_RELE(vpi); 1615 1616 /* 1617 * Be sure to set *vpp to the correct value before returning. 1618 */ 1619 *vpp = vp; 1620 1621 skip_update_dircaches: 1622 1623 nfs4args_copen_free(open_args); 1624 if (setgid_flag) { 1625 nfs4args_verify_free(&argop[8]); 1626 nfs4args_setattr_free(&argop[9]); 1627 } 1628 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1629 1630 if (ncr) 1631 crfree(ncr); 1632 kmem_free(argop, argoplist_size); 1633 return (e.error); 1634 } 1635 1636 /* 1637 * Reopen an open instance. cf. nfs4open_otw(). 1638 * 1639 * Errors are returned by the nfs4_error_t parameter. 1640 * - ep->error contains an errno value or zero. 1641 * - if it is zero, ep->stat is set to an NFS status code, if any. 1642 * If the file could not be reopened, but the caller should continue, the 1643 * file is marked dead and no error values are returned. If the caller 1644 * should stop recovering open files and start over, either the ep->error 1645 * value or ep->stat will indicate an error (either something that requires 1646 * recovery or EAGAIN). 
Note that some recovery (e.g., expired volatile 1647 * filehandles) may be handled silently by this routine. 1648 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1649 * will be started, so the caller should not do it. 1650 * 1651 * Gotos: 1652 * - kill_file : reopen failed in such a fashion to constitute marking the 1653 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1654 * is for cases where recovery is not possible. 1655 * - failed_reopen : same as above, except that the file has already been 1656 * marked dead, so no need to do it again. 1657 * - bailout : reopen failed but we are able to recover and retry the reopen - 1658 * either within this function immediately or via the calling function. 1659 */ 1660 1661 void 1662 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1663 open_claim_type4 claim, bool_t frc_use_claim_previous, 1664 bool_t is_recov) 1665 { 1666 COMPOUND4args_clnt args; 1667 COMPOUND4res_clnt res; 1668 nfs_argop4 argop[4]; 1669 nfs_resop4 *resop; 1670 OPEN4res *op_res = NULL; 1671 OPEN4cargs *open_args; 1672 GETFH4res *gf_res; 1673 rnode4_t *rp = VTOR4(vp); 1674 int doqueue = 1; 1675 cred_t *cr = NULL, *cred_otw = NULL; 1676 nfs4_open_owner_t *oop = NULL; 1677 seqid4 seqid; 1678 nfs4_ga_res_t *garp; 1679 char fn[MAXNAMELEN]; 1680 nfs4_recov_state_t recov = {NULL, 0}; 1681 nfs4_lost_rqst_t lost_rqst; 1682 mntinfo4_t *mi = VTOMI4(vp); 1683 bool_t abort; 1684 char *failed_msg = ""; 1685 int fh_different; 1686 hrtime_t t; 1687 nfs4_bseqid_entry_t *bsep = NULL; 1688 1689 ASSERT(nfs4_consistent_type(vp)); 1690 ASSERT(nfs_zone() == mi->mi_zone); 1691 1692 nfs4_error_zinit(ep); 1693 1694 /* this is the cred used to find the open owner */ 1695 cr = state_to_cred(osp); 1696 if (cr == NULL) { 1697 failed_msg = "Couldn't reopen: no cred"; 1698 goto kill_file; 1699 } 1700 /* use this cred for OTW operations */ 1701 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1702 1703 top: 
1704 nfs4_error_zinit(ep); 1705 1706 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1707 /* File system has been unmounted, quit */ 1708 ep->error = EIO; 1709 failed_msg = "Couldn't reopen: file system has been unmounted"; 1710 goto kill_file; 1711 } 1712 1713 oop = osp->os_open_owner; 1714 1715 ASSERT(oop != NULL); 1716 if (oop == NULL) { /* be defensive in non-DEBUG */ 1717 failed_msg = "can't reopen: no open owner"; 1718 goto kill_file; 1719 } 1720 open_owner_hold(oop); 1721 1722 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1723 if (ep->error) { 1724 open_owner_rele(oop); 1725 oop = NULL; 1726 goto bailout; 1727 } 1728 1729 /* 1730 * If the rnode has a delegation and the delegation has been 1731 * recovered and the server didn't request a recall and the caller 1732 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1733 * recovery) and the rnode hasn't been marked dead, then install 1734 * the delegation stateid in the open stream. Otherwise, proceed 1735 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1736 */ 1737 mutex_enter(&rp->r_statev4_lock); 1738 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1739 !rp->r_deleg_return_pending && 1740 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1741 !rp->r_deleg_needs_recall && 1742 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1743 !(rp->r_flags & R4RECOVERR)) { 1744 mutex_enter(&osp->os_sync_lock); 1745 osp->os_delegation = 1; 1746 osp->open_stateid = rp->r_deleg_stateid; 1747 mutex_exit(&osp->os_sync_lock); 1748 mutex_exit(&rp->r_statev4_lock); 1749 goto bailout; 1750 } 1751 mutex_exit(&rp->r_statev4_lock); 1752 1753 /* 1754 * If the file failed recovery, just quit. This failure need not 1755 * affect other reopens, so don't return an error. 
1756 */ 1757 mutex_enter(&rp->r_statelock); 1758 if (rp->r_flags & R4RECOVERR) { 1759 mutex_exit(&rp->r_statelock); 1760 ep->error = 0; 1761 goto failed_reopen; 1762 } 1763 mutex_exit(&rp->r_statelock); 1764 1765 /* 1766 * argop is empty here 1767 * 1768 * PUTFH, OPEN, GETATTR 1769 */ 1770 args.ctag = TAG_REOPEN; 1771 args.array_len = 4; 1772 args.array = argop; 1773 1774 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1775 "nfs4_reopen: file is type %d, id %s", 1776 vp->v_type, rnode4info(VTOR4(vp)))); 1777 1778 argop[0].argop = OP_CPUTFH; 1779 1780 if (claim != CLAIM_PREVIOUS) { 1781 /* 1782 * if this is a file mount then 1783 * use the mntinfo parentfh 1784 */ 1785 argop[0].nfs_argop4_u.opcputfh.sfh = 1786 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1787 VTOSV(vp)->sv_dfh; 1788 } else { 1789 /* putfh fh to reopen */ 1790 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1791 } 1792 1793 argop[1].argop = OP_COPEN; 1794 open_args = &argop[1].nfs_argop4_u.opcopen; 1795 open_args->claim = claim; 1796 1797 if (claim == CLAIM_NULL) { 1798 1799 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1800 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1801 "failed for vp 0x%p for CLAIM_NULL with %m", 1802 (void *)vp); 1803 failed_msg = "Couldn't reopen: vtoname failed for " 1804 "CLAIM_NULL"; 1805 /* nothing allocated yet */ 1806 goto kill_file; 1807 } 1808 1809 open_args->open_claim4_u.cfile = fn; 1810 } else if (claim == CLAIM_PREVIOUS) { 1811 1812 /* 1813 * We have two cases to deal with here: 1814 * 1) We're being called to reopen files in order to satisfy 1815 * a lock operation request which requires us to explicitly 1816 * reopen files which were opened under a delegation. If 1817 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1818 * that case, frc_use_claim_previous is TRUE and we must 1819 * use the rnode's current delegation type (r_deleg_type). 1820 * 2) We're reopening files during some form of recovery. 
1821 * In this case, frc_use_claim_previous is FALSE and we 1822 * use the delegation type appropriate for recovery 1823 * (r_deleg_needs_recovery). 1824 */ 1825 mutex_enter(&rp->r_statev4_lock); 1826 open_args->open_claim4_u.delegate_type = 1827 frc_use_claim_previous ? 1828 rp->r_deleg_type : 1829 rp->r_deleg_needs_recovery; 1830 mutex_exit(&rp->r_statev4_lock); 1831 1832 } else if (claim == CLAIM_DELEGATE_CUR) { 1833 1834 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1835 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1836 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1837 "with %m", (void *)vp); 1838 failed_msg = "Couldn't reopen: vtoname failed for " 1839 "CLAIM_DELEGATE_CUR"; 1840 /* nothing allocated yet */ 1841 goto kill_file; 1842 } 1843 1844 mutex_enter(&rp->r_statev4_lock); 1845 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1846 rp->r_deleg_stateid; 1847 mutex_exit(&rp->r_statev4_lock); 1848 1849 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1850 } 1851 open_args->opentype = OPEN4_NOCREATE; 1852 open_args->owner.clientid = mi2clientid(mi); 1853 open_args->owner.owner_len = sizeof (oop->oo_name); 1854 open_args->owner.owner_val = 1855 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1856 bcopy(&oop->oo_name, open_args->owner.owner_val, 1857 open_args->owner.owner_len); 1858 open_args->share_access = 0; 1859 open_args->share_deny = 0; 1860 1861 mutex_enter(&osp->os_sync_lock); 1862 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1863 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1864 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1865 (void *)osp, (void *)rp, osp->os_share_acc_read, 1866 osp->os_share_acc_write, osp->os_open_ref_count, 1867 osp->os_mmap_read, osp->os_mmap_write, claim)); 1868 1869 if (osp->os_share_acc_read || osp->os_mmap_read) 1870 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1871 if (osp->os_share_acc_write || osp->os_mmap_write) 1872 
open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1873 if (osp->os_share_deny_read) 1874 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1875 if (osp->os_share_deny_write) 1876 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1877 mutex_exit(&osp->os_sync_lock); 1878 1879 seqid = nfs4_get_open_seqid(oop) + 1; 1880 open_args->seqid = seqid; 1881 1882 /* Construct the getfh part of the compound */ 1883 argop[2].argop = OP_GETFH; 1884 1885 /* Construct the getattr part of the compound */ 1886 argop[3].argop = OP_GETATTR; 1887 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1888 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1889 1890 t = gethrtime(); 1891 1892 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1893 1894 if (ep->error) { 1895 if (!is_recov && !frc_use_claim_previous && 1896 (ep->error == EINTR || ep->error == ETIMEDOUT || 1897 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1898 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1899 cred_otw, vp, NULL, open_args); 1900 abort = nfs4_start_recovery(ep, 1901 VTOMI4(vp), vp, NULL, NULL, 1902 lost_rqst.lr_op == OP_OPEN ? 
1903 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL); 1904 nfs4args_copen_free(open_args); 1905 goto bailout; 1906 } 1907 1908 nfs4args_copen_free(open_args); 1909 1910 if (ep->error == EACCES && cred_otw != cr) { 1911 crfree(cred_otw); 1912 cred_otw = cr; 1913 crhold(cred_otw); 1914 nfs4_end_open_seqid_sync(oop); 1915 open_owner_rele(oop); 1916 oop = NULL; 1917 goto top; 1918 } 1919 if (ep->error == ETIMEDOUT) 1920 goto bailout; 1921 failed_msg = "Couldn't reopen: rpc error"; 1922 goto kill_file; 1923 } 1924 1925 if (nfs4_need_to_bump_seqid(&res)) 1926 nfs4_set_open_seqid(seqid, oop, args.ctag); 1927 1928 switch (res.status) { 1929 case NFS4_OK: 1930 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1931 mutex_enter(&rp->r_statelock); 1932 rp->r_delay_interval = 0; 1933 mutex_exit(&rp->r_statelock); 1934 } 1935 break; 1936 case NFS4ERR_BAD_SEQID: 1937 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1938 args.ctag, open_args->seqid); 1939 1940 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1941 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1942 NULL, OP_OPEN, bsep, NULL, NULL); 1943 1944 nfs4args_copen_free(open_args); 1945 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1946 nfs4_end_open_seqid_sync(oop); 1947 open_owner_rele(oop); 1948 oop = NULL; 1949 kmem_free(bsep, sizeof (*bsep)); 1950 1951 goto kill_file; 1952 case NFS4ERR_NO_GRACE: 1953 nfs4args_copen_free(open_args); 1954 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1955 nfs4_end_open_seqid_sync(oop); 1956 open_owner_rele(oop); 1957 oop = NULL; 1958 if (claim == CLAIM_PREVIOUS) { 1959 /* 1960 * Retry as a plain open. We don't need to worry about 1961 * checking the changeinfo: it is acceptable for a 1962 * client to re-open a file and continue processing 1963 * (in the absence of locks). 
1964 */ 1965 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1966 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1967 "will retry as CLAIM_NULL")); 1968 claim = CLAIM_NULL; 1969 nfs4_mi_kstat_inc_no_grace(mi); 1970 goto top; 1971 } 1972 failed_msg = 1973 "Couldn't reopen: tried reclaim outside grace period. "; 1974 goto kill_file; 1975 case NFS4ERR_GRACE: 1976 nfs4_set_grace_wait(mi); 1977 nfs4args_copen_free(open_args); 1978 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1979 nfs4_end_open_seqid_sync(oop); 1980 open_owner_rele(oop); 1981 oop = NULL; 1982 ep->error = nfs4_wait_for_grace(mi, &recov); 1983 if (ep->error != 0) 1984 goto bailout; 1985 goto top; 1986 case NFS4ERR_DELAY: 1987 nfs4_set_delay_wait(vp); 1988 nfs4args_copen_free(open_args); 1989 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1990 nfs4_end_open_seqid_sync(oop); 1991 open_owner_rele(oop); 1992 oop = NULL; 1993 ep->error = nfs4_wait_for_delay(vp, &recov); 1994 nfs4_mi_kstat_inc_delay(mi); 1995 if (ep->error != 0) 1996 goto bailout; 1997 goto top; 1998 case NFS4ERR_FHEXPIRED: 1999 /* recover filehandle and retry */ 2000 abort = nfs4_start_recovery(ep, 2001 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL); 2002 nfs4args_copen_free(open_args); 2003 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2004 nfs4_end_open_seqid_sync(oop); 2005 open_owner_rele(oop); 2006 oop = NULL; 2007 if (abort == FALSE) 2008 goto top; 2009 failed_msg = "Couldn't reopen: recovery aborted"; 2010 goto kill_file; 2011 case NFS4ERR_RESOURCE: 2012 case NFS4ERR_STALE_CLIENTID: 2013 case NFS4ERR_WRONGSEC: 2014 case NFS4ERR_EXPIRED: 2015 /* 2016 * Do not mark the file dead and let the calling 2017 * function initiate recovery. 
2018 */ 2019 nfs4args_copen_free(open_args); 2020 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2021 nfs4_end_open_seqid_sync(oop); 2022 open_owner_rele(oop); 2023 oop = NULL; 2024 goto bailout; 2025 case NFS4ERR_ACCESS: 2026 if (cred_otw != cr) { 2027 crfree(cred_otw); 2028 cred_otw = cr; 2029 crhold(cred_otw); 2030 nfs4args_copen_free(open_args); 2031 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2032 nfs4_end_open_seqid_sync(oop); 2033 open_owner_rele(oop); 2034 oop = NULL; 2035 goto top; 2036 } 2037 /* fall through */ 2038 default: 2039 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2040 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2041 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2042 rnode4info(VTOR4(vp)))); 2043 failed_msg = "Couldn't reopen: NFSv4 error"; 2044 nfs4args_copen_free(open_args); 2045 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2046 goto kill_file; 2047 } 2048 2049 resop = &res.array[1]; /* open res */ 2050 op_res = &resop->nfs_resop4_u.opopen; 2051 2052 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2053 2054 /* 2055 * Check if the path we reopened really is the same 2056 * file. We could end up in a situation where the file 2057 * was removed and a new file created with the same name. 
2058 */ 2059 resop = &res.array[2]; 2060 gf_res = &resop->nfs_resop4_u.opgetfh; 2061 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2062 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2063 if (fh_different) { 2064 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2065 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2066 /* Oops, we don't have the same file */ 2067 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2068 failed_msg = "Couldn't reopen: Persistent " 2069 "file handle changed"; 2070 else 2071 failed_msg = "Couldn't reopen: Volatile " 2072 "(no expire on open) file handle changed"; 2073 2074 nfs4args_copen_free(open_args); 2075 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2076 nfs_rw_exit(&mi->mi_fh_lock); 2077 goto kill_file; 2078 2079 } else { 2080 /* 2081 * We have volatile file handles that don't compare. 2082 * If the fids are the same then we assume that the 2083 * file handle expired but the rnode still refers to 2084 * the same file object. 2085 * 2086 * First check that we have fids or not. 2087 * If we don't we have a dumb server so we will 2088 * just assume every thing is ok for now. 2089 */ 2090 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2091 rp->r_attr.va_mask & AT_NODEID && 2092 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2093 /* 2094 * We have fids, but they don't 2095 * compare. So kill the file. 2096 */ 2097 failed_msg = 2098 "Couldn't reopen: file handle changed" 2099 " due to mismatched fids"; 2100 nfs4args_copen_free(open_args); 2101 (void) xdr_free(xdr_COMPOUND4res_clnt, 2102 (caddr_t)&res); 2103 nfs_rw_exit(&mi->mi_fh_lock); 2104 goto kill_file; 2105 } else { 2106 /* 2107 * We have volatile file handles that refers 2108 * to the same file (at least they have the 2109 * same fid) or we don't have fids so we 2110 * can't tell. :(. We'll be a kind and accepting 2111 * client so we'll update the rnode's file 2112 * handle with the otw handle. 
2113 * 2114 * We need to drop mi->mi_fh_lock since 2115 * sh4_update acquires it. Since there is 2116 * only one recovery thread there is no 2117 * race. 2118 */ 2119 nfs_rw_exit(&mi->mi_fh_lock); 2120 sfh4_update(rp->r_fh, &gf_res->object); 2121 } 2122 } 2123 } else { 2124 nfs_rw_exit(&mi->mi_fh_lock); 2125 } 2126 2127 ASSERT(nfs4_consistent_type(vp)); 2128 2129 /* 2130 * If the server wanted an OPEN_CONFIRM but that fails, just start 2131 * over. Presumably if there is a persistent error it will show up 2132 * when we resend the OPEN. 2133 */ 2134 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2135 bool_t retry_open = FALSE; 2136 2137 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2138 cred_otw, is_recov, &retry_open, 2139 oop, FALSE, ep, NULL); 2140 if (ep->error || ep->stat) { 2141 nfs4args_copen_free(open_args); 2142 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2143 nfs4_end_open_seqid_sync(oop); 2144 open_owner_rele(oop); 2145 oop = NULL; 2146 goto top; 2147 } 2148 } 2149 2150 mutex_enter(&osp->os_sync_lock); 2151 osp->open_stateid = op_res->stateid; 2152 osp->os_delegation = 0; 2153 /* 2154 * Need to reset this bitfield for the possible case where we were 2155 * going to OTW CLOSE the file, got a non-recoverable error, and before 2156 * we could retry the CLOSE, OPENed the file again. 
2157 */ 2158 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2159 osp->os_final_close = 0; 2160 osp->os_force_close = 0; 2161 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2162 osp->os_dc_openacc = open_args->share_access; 2163 mutex_exit(&osp->os_sync_lock); 2164 2165 nfs4_end_open_seqid_sync(oop); 2166 2167 /* accept delegation, if any */ 2168 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2169 2170 nfs4args_copen_free(open_args); 2171 2172 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2173 2174 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2175 2176 ASSERT(nfs4_consistent_type(vp)); 2177 2178 open_owner_rele(oop); 2179 crfree(cr); 2180 crfree(cred_otw); 2181 return; 2182 2183 kill_file: 2184 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2185 failed_reopen: 2186 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2187 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2188 (void *)osp, (void *)cr, rnode4info(rp))); 2189 mutex_enter(&osp->os_sync_lock); 2190 osp->os_failed_reopen = 1; 2191 mutex_exit(&osp->os_sync_lock); 2192 bailout: 2193 if (oop != NULL) { 2194 nfs4_end_open_seqid_sync(oop); 2195 open_owner_rele(oop); 2196 } 2197 if (cr != NULL) 2198 crfree(cr); 2199 if (cred_otw != NULL) 2200 crfree(cred_otw); 2201 } 2202 2203 /* for . and .. OPENs */ 2204 /* ARGSUSED */ 2205 static int 2206 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2207 { 2208 rnode4_t *rp; 2209 nfs4_ga_res_t gar; 2210 2211 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2212 2213 /* 2214 * If close-to-open consistency checking is turned off or 2215 * if there is no cached data, we can avoid 2216 * the over the wire getattr. Otherwise, force a 2217 * call to the server to get fresh attributes and to 2218 * check caches. This is required for close-to-open 2219 * consistency. 
2220 */ 2221 rp = VTOR4(*vpp); 2222 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2223 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2224 return (0); 2225 2226 gar.n4g_va.va_mask = AT_ALL; 2227 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2228 } 2229 2230 /* 2231 * CLOSE a file 2232 */ 2233 /* ARGSUSED */ 2234 static int 2235 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2236 caller_context_t *ct) 2237 { 2238 rnode4_t *rp; 2239 int error = 0; 2240 int r_error = 0; 2241 int n4error = 0; 2242 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2243 2244 /* 2245 * Remove client state for this (lockowner, file) pair. 2246 * Issue otw v4 call to have the server do the same. 2247 */ 2248 2249 rp = VTOR4(vp); 2250 2251 /* 2252 * zone_enter(2) prevents processes from changing zones with NFS files 2253 * open; if we happen to get here from the wrong zone we can't do 2254 * anything over the wire. 2255 */ 2256 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2257 /* 2258 * We could attempt to clean up locks, except we're sure 2259 * that the current process didn't acquire any locks on 2260 * the file: any attempt to lock a file belong to another zone 2261 * will fail, and one can't lock an NFS file and then change 2262 * zones, as that fails too. 2263 * 2264 * Returning an error here is the sane thing to do. A 2265 * subsequent call to VN_RELE() which translates to a 2266 * nfs4_inactive() will clean up state: if the zone of the 2267 * vnode's origin is still alive and kicking, the inactive 2268 * thread will handle the request (from the correct zone), and 2269 * everything (minus the OTW close call) should be OK. If the 2270 * zone is going away nfs4_async_inactive() will throw away 2271 * delegations, open streams and cached pages inline. 2272 */ 2273 return (EIO); 2274 } 2275 2276 /* 2277 * If we are using local locking for this filesystem, then 2278 * release all of the SYSV style record locks. 
Otherwise, 2279 * we are doing network locking and we need to release all 2280 * of the network locks. All of the locks held by this 2281 * process on this file are released no matter what the 2282 * incoming reference count is. 2283 */ 2284 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2285 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2286 cleanshares(vp, ttoproc(curthread)->p_pid); 2287 } else 2288 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2289 2290 if (e.error) { 2291 struct lm_sysid *lmsid; 2292 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2293 if (lmsid == NULL) { 2294 DTRACE_PROBE2(unknown__sysid, int, e.error, 2295 vnode_t *, vp); 2296 } else { 2297 cleanlocks(vp, ttoproc(curthread)->p_pid, 2298 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2299 } 2300 return (e.error); 2301 } 2302 2303 if (count > 1) 2304 return (0); 2305 2306 /* 2307 * If the file has been `unlinked', then purge the 2308 * DNLC so that this vnode will get reycled quicker 2309 * and the .nfs* file on the server will get removed. 2310 */ 2311 if (rp->r_unldvp != NULL) 2312 dnlc_purge_vp(vp); 2313 2314 /* 2315 * If the file was open for write and there are pages, 2316 * do a synchronous flush and commit of all of the 2317 * dirty and uncommitted pages. 2318 */ 2319 ASSERT(!e.error); 2320 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2321 error = nfs4_putpage_commit(vp, 0, 0, cr); 2322 2323 mutex_enter(&rp->r_statelock); 2324 r_error = rp->r_error; 2325 rp->r_error = 0; 2326 mutex_exit(&rp->r_statelock); 2327 2328 /* 2329 * If this file type is one for which no explicit 'open' was 2330 * done, then bail now (ie. no need for protocol 'close'). If 2331 * there was an error w/the vm subsystem, return _that_ error, 2332 * otherwise, return any errors that may've been reported via 2333 * the rnode. 2334 */ 2335 if (vp->v_type != VREG) 2336 return (error ? 
error : r_error); 2337 2338 /* 2339 * The sync putpage commit may have failed above, but since 2340 * we're working w/a regular file, we need to do the protocol 2341 * 'close' (nfs4close_one will figure out if an otw close is 2342 * needed or not). Report any errors _after_ doing the protocol 2343 * 'close'. 2344 */ 2345 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2346 n4error = e.error ? e.error : geterrno4(e.stat); 2347 2348 /* 2349 * Error reporting prio (Hi -> Lo) 2350 * 2351 * i) nfs4_putpage_commit (error) 2352 * ii) rnode's (r_error) 2353 * iii) nfs4close_one (n4error) 2354 */ 2355 return (error ? error : (r_error ? r_error : n4error)); 2356 } 2357 2358 /* 2359 * Initialize *lost_rqstp. 2360 */ 2361 2362 static void 2363 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2364 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2365 vnode_t *vp) 2366 { 2367 if (error != ETIMEDOUT && error != EINTR && 2368 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2369 lost_rqstp->lr_op = 0; 2370 return; 2371 } 2372 2373 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2374 "nfs4close_save_lost_rqst: error %d", error)); 2375 2376 lost_rqstp->lr_op = OP_CLOSE; 2377 /* 2378 * The vp is held and rele'd via the recovery code. 2379 * See nfs4_save_lost_rqst. 2380 */ 2381 lost_rqstp->lr_vp = vp; 2382 lost_rqstp->lr_dvp = NULL; 2383 lost_rqstp->lr_oop = oop; 2384 lost_rqstp->lr_osp = osp; 2385 ASSERT(osp != NULL); 2386 ASSERT(mutex_owned(&osp->os_sync_lock)); 2387 osp->os_pending_close = 1; 2388 lost_rqstp->lr_lop = NULL; 2389 lost_rqstp->lr_cr = cr; 2390 lost_rqstp->lr_flk = NULL; 2391 lost_rqstp->lr_putfirst = FALSE; 2392 } 2393 2394 /* 2395 * Assumes you already have the open seqid sync grabbed as well as the 2396 * 'os_sync_lock'. Note: this will release the open seqid sync and 2397 * 'os_sync_lock' if client recovery starts. Calling functions have to 2398 * be prepared to handle this. 
2399 * 2400 * 'recov' is returned as 1 if the CLOSE operation detected client recovery 2401 * was needed and was started, and that the calling function should retry 2402 * this function; otherwise it is returned as 0. 2403 * 2404 * Errors are returned via the nfs4_error_t parameter. 2405 */ 2406 static void 2407 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, 2408 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp, 2409 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp) 2410 { 2411 COMPOUND4args_clnt args; 2412 COMPOUND4res_clnt res; 2413 CLOSE4args *close_args; 2414 nfs_resop4 *resop; 2415 nfs_argop4 argop[3]; 2416 int doqueue = 1; 2417 mntinfo4_t *mi; 2418 seqid4 seqid; 2419 vnode_t *vp; 2420 bool_t needrecov = FALSE; 2421 nfs4_lost_rqst_t lost_rqst; 2422 hrtime_t t; 2423 2424 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 2425 2426 ASSERT(MUTEX_HELD(&osp->os_sync_lock)); 2427 2428 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw")); 2429 2430 /* Only set this to 1 if recovery is started */ 2431 *recov = 0; 2432 2433 /* do the OTW call to close the file */ 2434 2435 if (close_type == CLOSE_RESEND) 2436 args.ctag = TAG_CLOSE_LOST; 2437 else if (close_type == CLOSE_AFTER_RESEND) 2438 args.ctag = TAG_CLOSE_UNDO; 2439 else 2440 args.ctag = TAG_CLOSE; 2441 2442 args.array_len = 3; 2443 args.array = argop; 2444 2445 vp = RTOV4(rp); 2446 2447 mi = VTOMI4(vp); 2448 2449 /* putfh target fh */ 2450 argop[0].argop = OP_CPUTFH; 2451 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 2452 2453 argop[1].argop = OP_GETATTR; 2454 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 2455 argop[1].nfs_argop4_u.opgetattr.mi = mi; 2456 2457 argop[2].argop = OP_CLOSE; 2458 close_args = &argop[2].nfs_argop4_u.opclose; 2459 2460 seqid = nfs4_get_open_seqid(oop) + 1; 2461 2462 close_args->seqid = seqid; 2463 close_args->open_stateid = osp->open_stateid; 2464 2465 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 2466 
"nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first", 2467 rnode4info(rp))); 2468 2469 t = gethrtime(); 2470 2471 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 2472 2473 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 2474 nfs4_set_open_seqid(seqid, oop, args.ctag); 2475 } 2476 2477 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 2478 if (ep->error && !needrecov) { 2479 /* 2480 * if there was an error and no recovery is to be done 2481 * then then set up the file to flush its cache if 2482 * needed for the next caller. 2483 */ 2484 mutex_enter(&rp->r_statelock); 2485 PURGE_ATTRCACHE4_LOCKED(rp); 2486 rp->r_flags &= ~R4WRITEMODIFIED; 2487 mutex_exit(&rp->r_statelock); 2488 return; 2489 } 2490 2491 if (needrecov) { 2492 bool_t abort; 2493 nfs4_bseqid_entry_t *bsep = NULL; 2494 2495 if (close_type != CLOSE_RESEND) 2496 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 2497 osp, cred_otw, vp); 2498 2499 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 2500 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 2501 0, args.ctag, close_args->seqid); 2502 2503 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2504 "nfs4close_otw: initiating recovery. error %d " 2505 "res.status %d", ep->error, res.status)); 2506 2507 /* 2508 * Drop the 'os_sync_lock' here so we don't hit 2509 * a potential recursive mutex_enter via an 2510 * 'open_stream_hold()'. 2511 */ 2512 mutex_exit(&osp->os_sync_lock); 2513 *have_sync_lockp = 0; 2514 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 2515 (close_type != CLOSE_RESEND && 2516 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL, 2517 OP_CLOSE, bsep, NULL, NULL); 2518 2519 /* drop open seq sync, and let the calling function regrab it */ 2520 nfs4_end_open_seqid_sync(oop); 2521 *did_start_seqid_syncp = 0; 2522 2523 if (bsep) 2524 kmem_free(bsep, sizeof (*bsep)); 2525 /* 2526 * For signals, the caller wants to quit, so don't say to 2527 * retry. 
For forced unmount, if it's a user thread, it 2528 * wants to quit. If it's a recovery thread, the retry 2529 * will happen higher-up on the call stack. Either way, 2530 * don't say to retry. 2531 */ 2532 if (abort == FALSE && ep->error != EINTR && 2533 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) && 2534 close_type != CLOSE_RESEND && 2535 close_type != CLOSE_AFTER_RESEND) 2536 *recov = 1; 2537 else 2538 *recov = 0; 2539 2540 if (!ep->error) 2541 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2542 return; 2543 } 2544 2545 if (res.status) { 2546 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2547 return; 2548 } 2549 2550 mutex_enter(&rp->r_statev4_lock); 2551 rp->created_v4 = 0; 2552 mutex_exit(&rp->r_statev4_lock); 2553 2554 resop = &res.array[2]; 2555 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid; 2556 osp->os_valid = 0; 2557 2558 /* 2559 * This removes the reference obtained at OPEN; ie, when the 2560 * open stream structure was created. 2561 * 2562 * We don't have to worry about calling 'open_stream_rele' 2563 * since we our currently holding a reference to the open 2564 * stream which means the count cannot go to 0 with this 2565 * decrement. 
2566 */ 2567 ASSERT(osp->os_ref_count >= 2); 2568 osp->os_ref_count--; 2569 2570 if (!ep->error) 2571 nfs4_attr_cache(vp, 2572 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2573 t, cred_otw, TRUE, NULL); 2574 2575 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2576 " returning %d", ep->error)); 2577 2578 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2579 } 2580 2581 /* ARGSUSED */ 2582 static int 2583 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2584 caller_context_t *ct) 2585 { 2586 rnode4_t *rp; 2587 u_offset_t off; 2588 offset_t diff; 2589 uint_t on; 2590 uint_t n; 2591 caddr_t base; 2592 uint_t flags; 2593 int error; 2594 mntinfo4_t *mi; 2595 2596 rp = VTOR4(vp); 2597 2598 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2599 2600 if (IS_SHADOW(vp, rp)) 2601 vp = RTOV4(rp); 2602 2603 if (vp->v_type != VREG) 2604 return (EISDIR); 2605 2606 mi = VTOMI4(vp); 2607 2608 if (nfs_zone() != mi->mi_zone) 2609 return (EIO); 2610 2611 if (uiop->uio_resid == 0) 2612 return (0); 2613 2614 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2615 return (EINVAL); 2616 2617 mutex_enter(&rp->r_statelock); 2618 if (rp->r_flags & R4RECOVERRP) 2619 error = (rp->r_error ? rp->r_error : EIO); 2620 else 2621 error = 0; 2622 mutex_exit(&rp->r_statelock); 2623 if (error) 2624 return (error); 2625 2626 /* 2627 * Bypass VM if caching has been disabled (e.g., locking) or if 2628 * using client-side direct I/O and the file is not mmap'd and 2629 * there are no cached pages. 
2630 */ 2631 if ((vp->v_flag & VNOCACHE) || 2632 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2633 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2634 size_t resid = 0; 2635 2636 return (nfs4read(vp, NULL, uiop->uio_loffset, 2637 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2638 } 2639 2640 error = 0; 2641 2642 do { 2643 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2644 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2645 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2646 2647 if (error = nfs4_validate_caches(vp, cr)) 2648 break; 2649 2650 mutex_enter(&rp->r_statelock); 2651 while (rp->r_flags & R4INCACHEPURGE) { 2652 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2653 mutex_exit(&rp->r_statelock); 2654 return (EINTR); 2655 } 2656 } 2657 diff = rp->r_size - uiop->uio_loffset; 2658 mutex_exit(&rp->r_statelock); 2659 if (diff <= 0) 2660 break; 2661 if (diff < n) 2662 n = (uint_t)diff; 2663 2664 if (vpm_enable) { 2665 /* 2666 * Copy data. 2667 */ 2668 error = vpm_data_copy(vp, off + on, n, uiop, 2669 1, NULL, 0, S_READ); 2670 } else { 2671 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2672 S_READ); 2673 2674 error = uiomove(base + on, n, UIO_READ, uiop); 2675 } 2676 2677 if (!error) { 2678 /* 2679 * If read a whole block or read to eof, 2680 * won't need this buffer again soon. 
2681 */ 2682 mutex_enter(&rp->r_statelock); 2683 if (n + on == MAXBSIZE || 2684 uiop->uio_loffset == rp->r_size) 2685 flags = SM_DONTNEED; 2686 else 2687 flags = 0; 2688 mutex_exit(&rp->r_statelock); 2689 if (vpm_enable) { 2690 error = vpm_sync_pages(vp, off, n, flags); 2691 } else { 2692 error = segmap_release(segkmap, base, flags); 2693 } 2694 } else { 2695 if (vpm_enable) { 2696 (void) vpm_sync_pages(vp, off, n, 0); 2697 } else { 2698 (void) segmap_release(segkmap, base, 0); 2699 } 2700 } 2701 } while (!error && uiop->uio_resid > 0); 2702 2703 return (error); 2704 } 2705 2706 /* ARGSUSED */ 2707 static int 2708 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2709 caller_context_t *ct) 2710 { 2711 rlim64_t limit = uiop->uio_llimit; 2712 rnode4_t *rp; 2713 u_offset_t off; 2714 caddr_t base; 2715 uint_t flags; 2716 int remainder; 2717 size_t n; 2718 int on; 2719 int error; 2720 int resid; 2721 u_offset_t offset; 2722 mntinfo4_t *mi; 2723 uint_t bsize; 2724 2725 rp = VTOR4(vp); 2726 2727 if (IS_SHADOW(vp, rp)) 2728 vp = RTOV4(rp); 2729 2730 if (vp->v_type != VREG) 2731 return (EISDIR); 2732 2733 mi = VTOMI4(vp); 2734 2735 if (nfs_zone() != mi->mi_zone) 2736 return (EIO); 2737 2738 if (uiop->uio_resid == 0) 2739 return (0); 2740 2741 mutex_enter(&rp->r_statelock); 2742 if (rp->r_flags & R4RECOVERRP) 2743 error = (rp->r_error ? rp->r_error : EIO); 2744 else 2745 error = 0; 2746 mutex_exit(&rp->r_statelock); 2747 if (error) 2748 return (error); 2749 2750 if (ioflag & FAPPEND) { 2751 struct vattr va; 2752 2753 /* 2754 * Must serialize if appending. 
2755 */ 2756 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2757 nfs_rw_exit(&rp->r_rwlock); 2758 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2759 INTR4(vp))) 2760 return (EINTR); 2761 } 2762 2763 va.va_mask = AT_SIZE; 2764 error = nfs4getattr(vp, &va, cr); 2765 if (error) 2766 return (error); 2767 uiop->uio_loffset = va.va_size; 2768 } 2769 2770 offset = uiop->uio_loffset + uiop->uio_resid; 2771 2772 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2773 return (EINVAL); 2774 2775 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2776 limit = MAXOFFSET_T; 2777 2778 /* 2779 * Check to make sure that the process will not exceed 2780 * its limit on file size. It is okay to write up to 2781 * the limit, but not beyond. Thus, the write which 2782 * reaches the limit will be short and the next write 2783 * will return an error. 2784 */ 2785 remainder = 0; 2786 if (offset > uiop->uio_llimit) { 2787 remainder = offset - uiop->uio_llimit; 2788 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2789 if (uiop->uio_resid <= 0) { 2790 proc_t *p = ttoproc(curthread); 2791 2792 uiop->uio_resid += remainder; 2793 mutex_enter(&p->p_lock); 2794 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2795 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2796 mutex_exit(&p->p_lock); 2797 return (EFBIG); 2798 } 2799 } 2800 2801 /* update the change attribute, if we have a write delegation */ 2802 2803 mutex_enter(&rp->r_statev4_lock); 2804 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2805 rp->r_deleg_change++; 2806 2807 mutex_exit(&rp->r_statev4_lock); 2808 2809 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2810 return (EINTR); 2811 2812 /* 2813 * Bypass VM if caching has been disabled (e.g., locking) or if 2814 * using client-side direct I/O and the file is not mmap'd and 2815 * there are no cached pages. 
2816 */ 2817 if ((vp->v_flag & VNOCACHE) || 2818 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2819 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2820 size_t bufsize; 2821 int count; 2822 u_offset_t org_offset; 2823 stable_how4 stab_comm; 2824 nfs4_fwrite: 2825 if (rp->r_flags & R4STALE) { 2826 resid = uiop->uio_resid; 2827 offset = uiop->uio_loffset; 2828 error = rp->r_error; 2829 /* 2830 * A close may have cleared r_error, if so, 2831 * propagate ESTALE error return properly 2832 */ 2833 if (error == 0) 2834 error = ESTALE; 2835 goto bottom; 2836 } 2837 2838 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2839 base = kmem_alloc(bufsize, KM_SLEEP); 2840 do { 2841 if (ioflag & FDSYNC) 2842 stab_comm = DATA_SYNC4; 2843 else 2844 stab_comm = FILE_SYNC4; 2845 resid = uiop->uio_resid; 2846 offset = uiop->uio_loffset; 2847 count = MIN(uiop->uio_resid, bufsize); 2848 org_offset = uiop->uio_loffset; 2849 error = uiomove(base, count, UIO_WRITE, uiop); 2850 if (!error) { 2851 error = nfs4write(vp, base, org_offset, 2852 count, cr, &stab_comm); 2853 if (!error) { 2854 mutex_enter(&rp->r_statelock); 2855 if (rp->r_size < uiop->uio_loffset) 2856 rp->r_size = uiop->uio_loffset; 2857 mutex_exit(&rp->r_statelock); 2858 } 2859 } 2860 } while (!error && uiop->uio_resid > 0); 2861 kmem_free(base, bufsize); 2862 goto bottom; 2863 } 2864 2865 bsize = vp->v_vfsp->vfs_bsize; 2866 2867 do { 2868 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2869 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2870 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2871 2872 resid = uiop->uio_resid; 2873 offset = uiop->uio_loffset; 2874 2875 if (rp->r_flags & R4STALE) { 2876 error = rp->r_error; 2877 /* 2878 * A close may have cleared r_error, if so, 2879 * propagate ESTALE error return properly 2880 */ 2881 if (error == 0) 2882 error = ESTALE; 2883 break; 2884 } 2885 2886 /* 2887 * Don't create dirty pages faster than they 2888 * can be cleaned so that 
the system doesn't 2889 * get imbalanced. If the async queue is 2890 * maxed out, then wait for it to drain before 2891 * creating more dirty pages. Also, wait for 2892 * any threads doing pagewalks in the vop_getattr 2893 * entry points so that they don't block for 2894 * long periods. 2895 */ 2896 mutex_enter(&rp->r_statelock); 2897 while ((mi->mi_max_threads != 0 && 2898 rp->r_awcount > 2 * mi->mi_max_threads) || 2899 rp->r_gcount > 0) { 2900 if (INTR4(vp)) { 2901 klwp_t *lwp = ttolwp(curthread); 2902 2903 if (lwp != NULL) 2904 lwp->lwp_nostop++; 2905 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2906 mutex_exit(&rp->r_statelock); 2907 if (lwp != NULL) 2908 lwp->lwp_nostop--; 2909 error = EINTR; 2910 goto bottom; 2911 } 2912 if (lwp != NULL) 2913 lwp->lwp_nostop--; 2914 } else 2915 cv_wait(&rp->r_cv, &rp->r_statelock); 2916 } 2917 mutex_exit(&rp->r_statelock); 2918 2919 /* 2920 * Touch the page and fault it in if it is not in core 2921 * before segmap_getmapflt or vpm_data_copy can lock it. 2922 * This is to avoid the deadlock if the buffer is mapped 2923 * to the same file through mmap which we want to write. 2924 */ 2925 uio_prefaultpages((long)n, uiop); 2926 2927 if (vpm_enable) { 2928 /* 2929 * It will use kpm mappings, so no need to 2930 * pass an address. 
2931 */ 2932 error = writerp4(rp, NULL, n, uiop, 0); 2933 } else { 2934 if (segmap_kpm) { 2935 int pon = uiop->uio_loffset & PAGEOFFSET; 2936 size_t pn = MIN(PAGESIZE - pon, 2937 uiop->uio_resid); 2938 int pagecreate; 2939 2940 mutex_enter(&rp->r_statelock); 2941 pagecreate = (pon == 0) && (pn == PAGESIZE || 2942 uiop->uio_loffset + pn >= rp->r_size); 2943 mutex_exit(&rp->r_statelock); 2944 2945 base = segmap_getmapflt(segkmap, vp, off + on, 2946 pn, !pagecreate, S_WRITE); 2947 2948 error = writerp4(rp, base + pon, n, uiop, 2949 pagecreate); 2950 2951 } else { 2952 base = segmap_getmapflt(segkmap, vp, off + on, 2953 n, 0, S_READ); 2954 error = writerp4(rp, base + on, n, uiop, 0); 2955 } 2956 } 2957 2958 if (!error) { 2959 if (mi->mi_flags & MI4_NOAC) 2960 flags = SM_WRITE; 2961 else if ((uiop->uio_loffset % bsize) == 0 || 2962 IS_SWAPVP(vp)) { 2963 /* 2964 * Have written a whole block. 2965 * Start an asynchronous write 2966 * and mark the buffer to 2967 * indicate that it won't be 2968 * needed again soon. 2969 */ 2970 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2971 } else 2972 flags = 0; 2973 if ((ioflag & (FSYNC|FDSYNC)) || 2974 (rp->r_flags & R4OUTOFSPACE)) { 2975 flags &= ~SM_ASYNC; 2976 flags |= SM_WRITE; 2977 } 2978 if (vpm_enable) { 2979 error = vpm_sync_pages(vp, off, n, flags); 2980 } else { 2981 error = segmap_release(segkmap, base, flags); 2982 } 2983 } else { 2984 if (vpm_enable) { 2985 (void) vpm_sync_pages(vp, off, n, 0); 2986 } else { 2987 (void) segmap_release(segkmap, base, 0); 2988 } 2989 /* 2990 * In the event that we got an access error while 2991 * faulting in a page for a write-only file just 2992 * force a write. 
2993 */ 2994 if (error == EACCES) 2995 goto nfs4_fwrite; 2996 } 2997 } while (!error && uiop->uio_resid > 0); 2998 2999 bottom: 3000 if (error) { 3001 uiop->uio_resid = resid + remainder; 3002 uiop->uio_loffset = offset; 3003 } else { 3004 uiop->uio_resid += remainder; 3005 3006 mutex_enter(&rp->r_statev4_lock); 3007 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 3008 gethrestime(&rp->r_attr.va_mtime); 3009 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3010 } 3011 mutex_exit(&rp->r_statev4_lock); 3012 } 3013 3014 nfs_rw_exit(&rp->r_lkserlock); 3015 3016 return (error); 3017 } 3018 3019 /* 3020 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 3021 */ 3022 static int 3023 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 3024 int flags, cred_t *cr) 3025 { 3026 struct buf *bp; 3027 int error; 3028 page_t *savepp; 3029 uchar_t fsdata; 3030 stable_how4 stab_comm; 3031 3032 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3033 bp = pageio_setup(pp, len, vp, flags); 3034 ASSERT(bp != NULL); 3035 3036 /* 3037 * pageio_setup should have set b_addr to 0. This 3038 * is correct since we want to do I/O on a page 3039 * boundary. bp_mapin will use this addr to calculate 3040 * an offset, and then set b_addr to the kernel virtual 3041 * address it allocated for us. 
3042 */ 3043 ASSERT(bp->b_un.b_addr == 0); 3044 3045 bp->b_edev = 0; 3046 bp->b_dev = 0; 3047 bp->b_lblkno = lbtodb(off); 3048 bp->b_file = vp; 3049 bp->b_offset = (offset_t)off; 3050 bp_mapin(bp); 3051 3052 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3053 freemem > desfree) 3054 stab_comm = UNSTABLE4; 3055 else 3056 stab_comm = FILE_SYNC4; 3057 3058 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3059 3060 bp_mapout(bp); 3061 pageio_done(bp); 3062 3063 if (stab_comm == UNSTABLE4) 3064 fsdata = C_DELAYCOMMIT; 3065 else 3066 fsdata = C_NOCOMMIT; 3067 3068 savepp = pp; 3069 do { 3070 pp->p_fsdata = fsdata; 3071 } while ((pp = pp->p_next) != savepp); 3072 3073 return (error); 3074 } 3075 3076 /* 3077 */ 3078 static int 3079 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3080 { 3081 nfs4_open_owner_t *oop; 3082 nfs4_open_stream_t *osp; 3083 rnode4_t *rp = VTOR4(vp); 3084 mntinfo4_t *mi = VTOMI4(vp); 3085 int reopen_needed; 3086 3087 ASSERT(nfs_zone() == mi->mi_zone); 3088 3089 3090 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3091 if (!oop) 3092 return (EIO); 3093 3094 /* returns with 'os_sync_lock' held */ 3095 osp = find_open_stream(oop, rp); 3096 if (!osp) { 3097 open_owner_rele(oop); 3098 return (EIO); 3099 } 3100 3101 if (osp->os_failed_reopen) { 3102 mutex_exit(&osp->os_sync_lock); 3103 open_stream_rele(osp, rp); 3104 open_owner_rele(oop); 3105 return (EIO); 3106 } 3107 3108 /* 3109 * Determine whether a reopen is needed. If this 3110 * is a delegation open stream, then the os_delegation bit 3111 * should be set. 
3112 */ 3113 3114 reopen_needed = osp->os_delegation; 3115 3116 mutex_exit(&osp->os_sync_lock); 3117 open_owner_rele(oop); 3118 3119 if (reopen_needed) { 3120 nfs4_error_zinit(ep); 3121 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3122 mutex_enter(&osp->os_sync_lock); 3123 if (ep->error || ep->stat || osp->os_failed_reopen) { 3124 mutex_exit(&osp->os_sync_lock); 3125 open_stream_rele(osp, rp); 3126 return (EIO); 3127 } 3128 mutex_exit(&osp->os_sync_lock); 3129 } 3130 open_stream_rele(osp, rp); 3131 3132 return (0); 3133 } 3134 3135 /* 3136 * Write to file. Writes to remote server in largest size 3137 * chunks that the server can handle. Write is synchronous. 3138 */ 3139 static int 3140 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3141 stable_how4 *stab_comm) 3142 { 3143 mntinfo4_t *mi; 3144 COMPOUND4args_clnt args; 3145 COMPOUND4res_clnt res; 3146 WRITE4args *wargs; 3147 WRITE4res *wres; 3148 nfs_argop4 argop[2]; 3149 nfs_resop4 *resop; 3150 int tsize; 3151 stable_how4 stable; 3152 rnode4_t *rp; 3153 int doqueue = 1; 3154 bool_t needrecov; 3155 nfs4_recov_state_t recov_state; 3156 nfs4_stateid_types_t sid_types; 3157 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3158 int recov; 3159 3160 rp = VTOR4(vp); 3161 mi = VTOMI4(vp); 3162 3163 ASSERT(nfs_zone() == mi->mi_zone); 3164 3165 stable = *stab_comm; 3166 *stab_comm = FILE_SYNC4; 3167 3168 needrecov = FALSE; 3169 recov_state.rs_flags = 0; 3170 recov_state.rs_num_retry_despite_err = 0; 3171 nfs4_init_stateid_types(&sid_types); 3172 3173 /* Is curthread the recovery thread? */ 3174 mutex_enter(&mi->mi_lock); 3175 recov = (mi->mi_recovthread == curthread); 3176 mutex_exit(&mi->mi_lock); 3177 3178 recov_retry: 3179 args.ctag = TAG_WRITE; 3180 args.array_len = 2; 3181 args.array = argop; 3182 3183 if (!recov) { 3184 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3185 &recov_state, NULL); 3186 if (e.error) 3187 return (e.error); 3188 } 3189 3190 /* 0. 
putfh target fh */ 3191 argop[0].argop = OP_CPUTFH; 3192 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3193 3194 /* 1. write */ 3195 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3196 3197 do { 3198 3199 wargs->offset = (offset4)offset; 3200 wargs->data_val = base; 3201 3202 if (mi->mi_io_kstats) { 3203 mutex_enter(&mi->mi_lock); 3204 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3205 mutex_exit(&mi->mi_lock); 3206 } 3207 3208 if ((vp->v_flag & VNOCACHE) || 3209 (rp->r_flags & R4DIRECTIO) || 3210 (mi->mi_flags & MI4_DIRECTIO)) 3211 tsize = MIN(mi->mi_stsize, count); 3212 else 3213 tsize = MIN(mi->mi_curwrite, count); 3214 wargs->data_len = (uint_t)tsize; 3215 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3216 3217 if (mi->mi_io_kstats) { 3218 mutex_enter(&mi->mi_lock); 3219 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3220 mutex_exit(&mi->mi_lock); 3221 } 3222 3223 if (!recov) { 3224 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3225 if (e.error && !needrecov) { 3226 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3227 &recov_state, needrecov); 3228 return (e.error); 3229 } 3230 } else { 3231 if (e.error) 3232 return (e.error); 3233 } 3234 3235 /* 3236 * Do handling of OLD_STATEID outside 3237 * of the normal recovery framework. 3238 * 3239 * If write receives a BAD stateid error while using a 3240 * delegation stateid, retry using the open stateid (if it 3241 * exists). If it doesn't have an open stateid, reopen the 3242 * file first, then retry. 
3243 */ 3244 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3245 sid_types.cur_sid_type != SPEC_SID) { 3246 nfs4_save_stateid(&wargs->stateid, &sid_types); 3247 if (!recov) 3248 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3249 &recov_state, needrecov); 3250 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3251 goto recov_retry; 3252 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3253 sid_types.cur_sid_type == DEL_SID) { 3254 nfs4_save_stateid(&wargs->stateid, &sid_types); 3255 mutex_enter(&rp->r_statev4_lock); 3256 rp->r_deleg_return_pending = TRUE; 3257 mutex_exit(&rp->r_statev4_lock); 3258 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3259 if (!recov) 3260 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3261 &recov_state, needrecov); 3262 (void) xdr_free(xdr_COMPOUND4res_clnt, 3263 (caddr_t)&res); 3264 return (EIO); 3265 } 3266 if (!recov) 3267 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3268 &recov_state, needrecov); 3269 /* hold needed for nfs4delegreturn_thread */ 3270 VN_HOLD(vp); 3271 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3272 NFS4_DR_DISCARD), FALSE); 3273 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3274 goto recov_retry; 3275 } 3276 3277 if (needrecov) { 3278 bool_t abort; 3279 3280 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3281 "nfs4write: client got error %d, res.status %d" 3282 ", so start recovery", e.error, res.status)); 3283 3284 abort = nfs4_start_recovery(&e, 3285 VTOMI4(vp), vp, NULL, &wargs->stateid, 3286 NULL, OP_WRITE, NULL, NULL, NULL); 3287 if (!e.error) { 3288 e.error = geterrno4(res.status); 3289 (void) xdr_free(xdr_COMPOUND4res_clnt, 3290 (caddr_t)&res); 3291 } 3292 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3293 &recov_state, needrecov); 3294 if (abort == FALSE) 3295 goto recov_retry; 3296 return (e.error); 3297 } 3298 3299 if (res.status) { 3300 e.error = geterrno4(res.status); 3301 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3302 if (!recov) 3303 nfs4_end_fop(VTOMI4(vp), vp, NULL, 
OH_WRITE, 3304 &recov_state, needrecov); 3305 return (e.error); 3306 } 3307 3308 resop = &res.array[1]; /* write res */ 3309 wres = &resop->nfs_resop4_u.opwrite; 3310 3311 if ((int)wres->count > tsize) { 3312 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3313 3314 zcmn_err(getzoneid(), CE_WARN, 3315 "nfs4write: server wrote %u, requested was %u", 3316 (int)wres->count, tsize); 3317 if (!recov) 3318 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3319 &recov_state, needrecov); 3320 return (EIO); 3321 } 3322 if (wres->committed == UNSTABLE4) { 3323 *stab_comm = UNSTABLE4; 3324 if (wargs->stable == DATA_SYNC4 || 3325 wargs->stable == FILE_SYNC4) { 3326 (void) xdr_free(xdr_COMPOUND4res_clnt, 3327 (caddr_t)&res); 3328 zcmn_err(getzoneid(), CE_WARN, 3329 "nfs4write: server %s did not commit " 3330 "to stable storage", 3331 rp->r_server->sv_hostname); 3332 if (!recov) 3333 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3334 OH_WRITE, &recov_state, needrecov); 3335 return (EIO); 3336 } 3337 } 3338 3339 tsize = (int)wres->count; 3340 count -= tsize; 3341 base += tsize; 3342 offset += tsize; 3343 if (mi->mi_io_kstats) { 3344 mutex_enter(&mi->mi_lock); 3345 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3346 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3347 tsize; 3348 mutex_exit(&mi->mi_lock); 3349 } 3350 lwp_stat_update(LWP_STAT_OUBLK, 1); 3351 mutex_enter(&rp->r_statelock); 3352 if (rp->r_flags & R4HAVEVERF) { 3353 if (rp->r_writeverf != wres->writeverf) { 3354 nfs4_set_mod(vp); 3355 rp->r_writeverf = wres->writeverf; 3356 } 3357 } else { 3358 rp->r_writeverf = wres->writeverf; 3359 rp->r_flags |= R4HAVEVERF; 3360 } 3361 PURGE_ATTRCACHE4_LOCKED(rp); 3362 rp->r_flags |= R4WRITEMODIFIED; 3363 gethrestime(&rp->r_attr.va_mtime); 3364 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3365 mutex_exit(&rp->r_statelock); 3366 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3367 } while (count); 3368 3369 if (!recov) 3370 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3371 
needrecov); 3372 3373 return (e.error); 3374 } 3375 3376 /* 3377 * Read from a file. Reads data in largest chunks our interface can handle. 3378 */ 3379 static int 3380 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3381 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3382 { 3383 mntinfo4_t *mi; 3384 COMPOUND4args_clnt args; 3385 COMPOUND4res_clnt res; 3386 READ4args *rargs; 3387 nfs_argop4 argop[2]; 3388 int tsize; 3389 int doqueue; 3390 rnode4_t *rp; 3391 int data_len; 3392 bool_t is_eof; 3393 bool_t needrecov = FALSE; 3394 nfs4_recov_state_t recov_state; 3395 nfs4_stateid_types_t sid_types; 3396 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3397 3398 rp = VTOR4(vp); 3399 mi = VTOMI4(vp); 3400 doqueue = 1; 3401 3402 ASSERT(nfs_zone() == mi->mi_zone); 3403 3404 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3405 3406 args.array_len = 2; 3407 args.array = argop; 3408 3409 nfs4_init_stateid_types(&sid_types); 3410 3411 recov_state.rs_flags = 0; 3412 recov_state.rs_num_retry_despite_err = 0; 3413 3414 recov_retry: 3415 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3416 &recov_state, NULL); 3417 if (e.error) 3418 return (e.error); 3419 3420 /* putfh target fh */ 3421 argop[0].argop = OP_CPUTFH; 3422 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3423 3424 /* read */ 3425 argop[1].argop = OP_READ; 3426 rargs = &argop[1].nfs_argop4_u.opread; 3427 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3428 OP_READ, &sid_types, async); 3429 3430 do { 3431 if (mi->mi_io_kstats) { 3432 mutex_enter(&mi->mi_lock); 3433 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3434 mutex_exit(&mi->mi_lock); 3435 } 3436 3437 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3438 "nfs4read: %s call, rp %s", 3439 needrecov ? 
"recov" : "first", 3440 rnode4info(rp))); 3441 3442 if ((vp->v_flag & VNOCACHE) || 3443 (rp->r_flags & R4DIRECTIO) || 3444 (mi->mi_flags & MI4_DIRECTIO)) 3445 tsize = MIN(mi->mi_tsize, count); 3446 else 3447 tsize = MIN(mi->mi_curread, count); 3448 3449 rargs->offset = (offset4)offset; 3450 rargs->count = (count4)tsize; 3451 rargs->res_data_val_alt = NULL; 3452 rargs->res_mblk = NULL; 3453 rargs->res_uiop = NULL; 3454 rargs->res_maxsize = 0; 3455 rargs->wlist = NULL; 3456 3457 if (uiop) 3458 rargs->res_uiop = uiop; 3459 else 3460 rargs->res_data_val_alt = base; 3461 rargs->res_maxsize = tsize; 3462 3463 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3464 #ifdef DEBUG 3465 if (nfs4read_error_inject) { 3466 res.status = nfs4read_error_inject; 3467 nfs4read_error_inject = 0; 3468 } 3469 #endif 3470 3471 if (mi->mi_io_kstats) { 3472 mutex_enter(&mi->mi_lock); 3473 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3474 mutex_exit(&mi->mi_lock); 3475 } 3476 3477 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3478 if (e.error != 0 && !needrecov) { 3479 nfs4_end_fop(mi, vp, NULL, OH_READ, 3480 &recov_state, needrecov); 3481 return (e.error); 3482 } 3483 3484 /* 3485 * Do proper retry for OLD and BAD stateid errors outside 3486 * of the normal recovery framework. There are two differences 3487 * between async and sync reads. The first is that we allow 3488 * retry on BAD_STATEID for async reads, but not sync reads. 3489 * The second is that we mark the file dead for a failed 3490 * attempt with a special stateid for sync reads, but just 3491 * return EIO for async reads. 3492 * 3493 * If a sync read receives a BAD stateid error while using a 3494 * delegation stateid, retry using the open stateid (if it 3495 * exists). If it doesn't have an open stateid, reopen the 3496 * file first, then retry. 
3497 */ 3498 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3499 res.status == NFS4ERR_BAD_STATEID) && async) { 3500 nfs4_end_fop(mi, vp, NULL, OH_READ, 3501 &recov_state, needrecov); 3502 if (sid_types.cur_sid_type == SPEC_SID) { 3503 (void) xdr_free(xdr_COMPOUND4res_clnt, 3504 (caddr_t)&res); 3505 return (EIO); 3506 } 3507 nfs4_save_stateid(&rargs->stateid, &sid_types); 3508 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3509 goto recov_retry; 3510 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3511 !async && sid_types.cur_sid_type != SPEC_SID) { 3512 nfs4_save_stateid(&rargs->stateid, &sid_types); 3513 nfs4_end_fop(mi, vp, NULL, OH_READ, 3514 &recov_state, needrecov); 3515 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3516 goto recov_retry; 3517 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3518 sid_types.cur_sid_type == DEL_SID) { 3519 nfs4_save_stateid(&rargs->stateid, &sid_types); 3520 mutex_enter(&rp->r_statev4_lock); 3521 rp->r_deleg_return_pending = TRUE; 3522 mutex_exit(&rp->r_statev4_lock); 3523 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3524 nfs4_end_fop(mi, vp, NULL, OH_READ, 3525 &recov_state, needrecov); 3526 (void) xdr_free(xdr_COMPOUND4res_clnt, 3527 (caddr_t)&res); 3528 return (EIO); 3529 } 3530 nfs4_end_fop(mi, vp, NULL, OH_READ, 3531 &recov_state, needrecov); 3532 /* hold needed for nfs4delegreturn_thread */ 3533 VN_HOLD(vp); 3534 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3535 NFS4_DR_DISCARD), FALSE); 3536 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3537 goto recov_retry; 3538 } 3539 if (needrecov) { 3540 bool_t abort; 3541 3542 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3543 "nfs4read: initiating recovery\n")); 3544 abort = nfs4_start_recovery(&e, 3545 mi, vp, NULL, &rargs->stateid, 3546 NULL, OP_READ, NULL, NULL, NULL); 3547 nfs4_end_fop(mi, vp, NULL, OH_READ, 3548 &recov_state, needrecov); 3549 /* 3550 * Do not retry if we got OLD_STATEID using a special 
3551 * stateid. This avoids looping with a broken server. 3552 */ 3553 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3554 sid_types.cur_sid_type == SPEC_SID) 3555 abort = TRUE; 3556 3557 if (abort == FALSE) { 3558 /* 3559 * Need to retry all possible stateids in 3560 * case the recovery error wasn't stateid 3561 * related or the stateids have become 3562 * stale (server reboot). 3563 */ 3564 nfs4_init_stateid_types(&sid_types); 3565 (void) xdr_free(xdr_COMPOUND4res_clnt, 3566 (caddr_t)&res); 3567 goto recov_retry; 3568 } 3569 3570 if (!e.error) { 3571 e.error = geterrno4(res.status); 3572 (void) xdr_free(xdr_COMPOUND4res_clnt, 3573 (caddr_t)&res); 3574 } 3575 return (e.error); 3576 } 3577 3578 if (res.status) { 3579 e.error = geterrno4(res.status); 3580 nfs4_end_fop(mi, vp, NULL, OH_READ, 3581 &recov_state, needrecov); 3582 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3583 return (e.error); 3584 } 3585 3586 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3587 count -= data_len; 3588 if (base) 3589 base += data_len; 3590 offset += data_len; 3591 if (mi->mi_io_kstats) { 3592 mutex_enter(&mi->mi_lock); 3593 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3594 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3595 mutex_exit(&mi->mi_lock); 3596 } 3597 lwp_stat_update(LWP_STAT_INBLK, 1); 3598 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3599 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3600 3601 } while (count && !is_eof); 3602 3603 *residp = count; 3604 3605 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3606 3607 return (e.error); 3608 } 3609 3610 /* ARGSUSED */ 3611 static int 3612 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3613 caller_context_t *ct) 3614 { 3615 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3616 return (EIO); 3617 switch (cmd) { 3618 case _FIODIRECTIO: 3619 return (nfs4_directio(vp, (int)arg, cr)); 3620 default: 3621 return (ENOTTY); 3622 } 3623 } 3624 3625 /* ARGSUSED */ 
3626 int 3627 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3628 caller_context_t *ct) 3629 { 3630 int error; 3631 rnode4_t *rp = VTOR4(vp); 3632 3633 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3634 return (EIO); 3635 /* 3636 * If it has been specified that the return value will 3637 * just be used as a hint, and we are only being asked 3638 * for size, fsid or rdevid, then return the client's 3639 * notion of these values without checking to make sure 3640 * that the attribute cache is up to date. 3641 * The whole point is to avoid an over the wire GETATTR 3642 * call. 3643 */ 3644 if (flags & ATTR_HINT) { 3645 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) { 3646 mutex_enter(&rp->r_statelock); 3647 if (vap->va_mask & AT_SIZE) 3648 vap->va_size = rp->r_size; 3649 if (vap->va_mask & AT_FSID) 3650 vap->va_fsid = rp->r_attr.va_fsid; 3651 if (vap->va_mask & AT_RDEV) 3652 vap->va_rdev = rp->r_attr.va_rdev; 3653 mutex_exit(&rp->r_statelock); 3654 return (0); 3655 } 3656 } 3657 3658 /* 3659 * Only need to flush pages if asking for the mtime 3660 * and if there any dirty pages or any outstanding 3661 * asynchronous (write) requests for this file. 
3662 */ 3663 if (vap->va_mask & AT_MTIME) { 3664 rp = VTOR4(vp); 3665 if (nfs4_has_pages(vp)) { 3666 mutex_enter(&rp->r_statev4_lock); 3667 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3668 mutex_exit(&rp->r_statev4_lock); 3669 if (rp->r_flags & R4DIRTY || 3670 rp->r_awcount > 0) { 3671 mutex_enter(&rp->r_statelock); 3672 rp->r_gcount++; 3673 mutex_exit(&rp->r_statelock); 3674 error = 3675 nfs4_putpage(vp, (u_offset_t)0, 3676 0, 0, cr, NULL); 3677 mutex_enter(&rp->r_statelock); 3678 if (error && (error == ENOSPC || 3679 error == EDQUOT)) { 3680 if (!rp->r_error) 3681 rp->r_error = error; 3682 } 3683 if (--rp->r_gcount == 0) 3684 cv_broadcast(&rp->r_cv); 3685 mutex_exit(&rp->r_statelock); 3686 } 3687 } else { 3688 mutex_exit(&rp->r_statev4_lock); 3689 } 3690 } 3691 } 3692 return (nfs4getattr(vp, vap, cr)); 3693 } 3694 3695 int 3696 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3697 { 3698 /* 3699 * If these are the only two bits cleared 3700 * on the server then return 0 (OK) else 3701 * return 1 (BAD). 3702 */ 3703 on_client &= ~(S_ISUID|S_ISGID); 3704 if (on_client == from_server) 3705 return (0); 3706 else 3707 return (1); 3708 } 3709 3710 /*ARGSUSED4*/ 3711 static int 3712 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3713 caller_context_t *ct) 3714 { 3715 if (vap->va_mask & AT_NOSET) 3716 return (EINVAL); 3717 3718 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3719 return (EIO); 3720 3721 /* 3722 * Don't call secpolicy_vnode_setattr, the client cannot 3723 * use its cached attributes to make security decisions 3724 * as the server may be faking mode bits or mapping uid/gid. 3725 * Always just let the server to the checking. 3726 * If we provide the ability to remove basic priviledges 3727 * to setattr (e.g. basic without chmod) then we will 3728 * need to add a check here before calling the server. 
3729 */ 3730 3731 return (nfs4setattr(vp, vap, flags, cr, NULL)); 3732 } 3733 3734 /* 3735 * To replace the "guarded" version 3 setattr, we use two types of compound 3736 * setattr requests: 3737 * 1. The "normal" setattr, used when the size of the file isn't being 3738 * changed - { Putfh <fh>; Setattr; Getattr }/ 3739 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3740 * with only ctime as the argument. If the server ctime differs from 3741 * what is cached on the client, the verify will fail, but we would 3742 * already have the ctime from the preceding getattr, so just set it 3743 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3744 * Setattr; Getattr }. 3745 * 3746 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3747 * this setattr and NULL if they are not. 3748 */ 3749 static int 3750 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3751 vsecattr_t *vsap) 3752 { 3753 COMPOUND4args_clnt args; 3754 COMPOUND4res_clnt res, *resp = NULL; 3755 nfs4_ga_res_t *garp = NULL; 3756 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3757 nfs_argop4 argop[5]; 3758 int verify_argop = -1; 3759 int setattr_argop = 1; 3760 nfs_resop4 *resop; 3761 vattr_t va; 3762 rnode4_t *rp; 3763 int doqueue = 1; 3764 uint_t mask = vap->va_mask; 3765 mode_t omode; 3766 vsecattr_t *vsp; 3767 timestruc_t ctime; 3768 bool_t needrecov = FALSE; 3769 nfs4_recov_state_t recov_state; 3770 nfs4_stateid_types_t sid_types; 3771 stateid4 stateid; 3772 hrtime_t t; 3773 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3774 servinfo4_t *svp; 3775 bitmap4 supp_attrs; 3776 3777 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3778 rp = VTOR4(vp); 3779 nfs4_init_stateid_types(&sid_types); 3780 3781 /* 3782 * Only need to flush pages if there are any pages and 3783 * if the file is marked as dirty in some fashion. 
The 3784 * file must be flushed so that we can accurately 3785 * determine the size of the file and the cached data 3786 * after the SETATTR returns. A file is considered to 3787 * be dirty if it is either marked with R4DIRTY, has 3788 * outstanding i/o's active, or is mmap'd. In this 3789 * last case, we can't tell whether there are dirty 3790 * pages, so we flush just to be sure. 3791 */ 3792 if (nfs4_has_pages(vp) && 3793 ((rp->r_flags & R4DIRTY) || 3794 rp->r_count > 0 || 3795 rp->r_mapcnt > 0)) { 3796 ASSERT(vp->v_type != VCHR); 3797 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3798 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3799 mutex_enter(&rp->r_statelock); 3800 if (!rp->r_error) 3801 rp->r_error = e.error; 3802 mutex_exit(&rp->r_statelock); 3803 } 3804 } 3805 3806 if (mask & AT_SIZE) { 3807 /* 3808 * Verification setattr compound for non-deleg AT_SIZE: 3809 * { Putfh; Getattr; Verify; Setattr; Getattr } 3810 * Set ctime local here (outside the do_again label) 3811 * so that subsequent retries (after failed VERIFY) 3812 * will use ctime from GETATTR results (from failed 3813 * verify compound) as VERIFY arg. 3814 * If file has delegation, then VERIFY(time_metadata) 3815 * is of little added value, so don't bother. 
3816 */ 3817 mutex_enter(&rp->r_statev4_lock); 3818 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3819 rp->r_deleg_return_pending) { 3820 numops = 5; 3821 ctime = rp->r_attr.va_ctime; 3822 } 3823 mutex_exit(&rp->r_statev4_lock); 3824 } 3825 3826 recov_state.rs_flags = 0; 3827 recov_state.rs_num_retry_despite_err = 0; 3828 3829 args.ctag = TAG_SETATTR; 3830 do_again: 3831 recov_retry: 3832 setattr_argop = numops - 2; 3833 3834 args.array = argop; 3835 args.array_len = numops; 3836 3837 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3838 if (e.error) 3839 return (e.error); 3840 3841 3842 /* putfh target fh */ 3843 argop[0].argop = OP_CPUTFH; 3844 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3845 3846 if (numops == 5) { 3847 /* 3848 * We only care about the ctime, but need to get mtime 3849 * and size for proper cache update. 3850 */ 3851 /* getattr */ 3852 argop[1].argop = OP_GETATTR; 3853 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3854 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3855 3856 /* verify - set later in loop */ 3857 verify_argop = 2; 3858 } 3859 3860 /* setattr */ 3861 svp = rp->r_server; 3862 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3863 supp_attrs = svp->sv_supp_attrs; 3864 nfs_rw_exit(&svp->sv_lock); 3865 3866 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3867 supp_attrs, &e.error, &sid_types); 3868 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3869 if (e.error) { 3870 /* req time field(s) overflow - return immediately */ 3871 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3872 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 
3873 opsetattr.obj_attributes); 3874 return (e.error); 3875 } 3876 omode = rp->r_attr.va_mode; 3877 3878 /* getattr */ 3879 argop[numops-1].argop = OP_GETATTR; 3880 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3881 /* 3882 * If we are setting the ACL (indicated only by vsap != NULL), request 3883 * the ACL in this getattr. The ACL returned from this getattr will be 3884 * used in updating the ACL cache. 3885 */ 3886 if (vsap != NULL) 3887 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3888 FATTR4_ACL_MASK; 3889 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3890 3891 /* 3892 * setattr iterates if the object size is set and the cached ctime 3893 * does not match the file ctime. In that case, verify the ctime first. 3894 */ 3895 3896 do { 3897 if (verify_argop != -1) { 3898 /* 3899 * Verify that the ctime match before doing setattr. 3900 */ 3901 va.va_mask = AT_CTIME; 3902 va.va_ctime = ctime; 3903 svp = rp->r_server; 3904 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3905 supp_attrs = svp->sv_supp_attrs; 3906 nfs_rw_exit(&svp->sv_lock); 3907 e.error = nfs4args_verify(&argop[verify_argop], &va, 3908 OP_VERIFY, supp_attrs); 3909 if (e.error) { 3910 /* req time field(s) overflow - return */ 3911 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3912 needrecov); 3913 break; 3914 } 3915 } 3916 3917 doqueue = 1; 3918 3919 t = gethrtime(); 3920 3921 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3922 3923 /* 3924 * Purge the access cache and ACL cache if changing either the 3925 * owner of the file, the group owner, or the mode. These may 3926 * change the access permissions of the file, so purge old 3927 * information and start over again. 
3928 */ 3929 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3930 (void) nfs4_access_purge_rp(rp); 3931 if (rp->r_secattr != NULL) { 3932 mutex_enter(&rp->r_statelock); 3933 vsp = rp->r_secattr; 3934 rp->r_secattr = NULL; 3935 mutex_exit(&rp->r_statelock); 3936 if (vsp != NULL) 3937 nfs4_acl_free_cache(vsp); 3938 } 3939 } 3940 3941 /* 3942 * If res.array_len == numops, then everything succeeded, 3943 * except for possibly the final getattr. If only the 3944 * last getattr failed, give up, and don't try recovery. 3945 */ 3946 if (res.array_len == numops) { 3947 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3948 needrecov); 3949 if (! e.error) 3950 resp = &res; 3951 break; 3952 } 3953 3954 /* 3955 * if either rpc call failed or completely succeeded - done 3956 */ 3957 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3958 if (e.error) { 3959 PURGE_ATTRCACHE4(vp); 3960 if (!needrecov) { 3961 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3962 needrecov); 3963 break; 3964 } 3965 } 3966 3967 /* 3968 * Do proper retry for OLD_STATEID outside of the normal 3969 * recovery framework. 3970 */ 3971 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3972 sid_types.cur_sid_type != SPEC_SID && 3973 sid_types.cur_sid_type != NO_SID) { 3974 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3975 needrecov); 3976 nfs4_save_stateid(&stateid, &sid_types); 3977 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3978 opsetattr.obj_attributes); 3979 if (verify_argop != -1) { 3980 nfs4args_verify_free(&argop[verify_argop]); 3981 verify_argop = -1; 3982 } 3983 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3984 goto recov_retry; 3985 } 3986 3987 if (needrecov) { 3988 bool_t abort; 3989 3990 abort = nfs4_start_recovery(&e, 3991 VTOMI4(vp), vp, NULL, NULL, NULL, 3992 OP_SETATTR, NULL, NULL, NULL); 3993 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3994 needrecov); 3995 /* 3996 * Do not retry if we failed with OLD_STATEID using 3997 * a special stateid. 
This is done to avoid looping 3998 * with a broken server. 3999 */ 4000 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 4001 (sid_types.cur_sid_type == SPEC_SID || 4002 sid_types.cur_sid_type == NO_SID)) 4003 abort = TRUE; 4004 if (!e.error) { 4005 if (res.status == NFS4ERR_BADOWNER) 4006 nfs4_log_badowner(VTOMI4(vp), 4007 OP_SETATTR); 4008 4009 e.error = geterrno4(res.status); 4010 (void) xdr_free(xdr_COMPOUND4res_clnt, 4011 (caddr_t)&res); 4012 } 4013 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4014 opsetattr.obj_attributes); 4015 if (verify_argop != -1) { 4016 nfs4args_verify_free(&argop[verify_argop]); 4017 verify_argop = -1; 4018 } 4019 if (abort == FALSE) { 4020 /* 4021 * Need to retry all possible stateids in 4022 * case the recovery error wasn't stateid 4023 * related or the stateids have become 4024 * stale (server reboot). 4025 */ 4026 nfs4_init_stateid_types(&sid_types); 4027 goto recov_retry; 4028 } 4029 return (e.error); 4030 } 4031 4032 /* 4033 * Need to call nfs4_end_op before nfs4getattr to 4034 * avoid potential nfs4_start_op deadlock. See RFE 4035 * 4777612. Calls to nfs4_invalidate_pages() and 4036 * nfs4_purge_stale_fh() might also generate over the 4037 * wire calls which my cause nfs4_start_op() deadlock. 4038 */ 4039 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4040 4041 /* 4042 * Check to update lease. 4043 */ 4044 resp = &res; 4045 if (res.status == NFS4_OK) { 4046 break; 4047 } 4048 4049 /* 4050 * Check if verify failed to see if try again 4051 */ 4052 if ((verify_argop == -1) || (res.array_len != 3)) { 4053 /* 4054 * can't continue... 4055 */ 4056 if (res.status == NFS4ERR_BADOWNER) 4057 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 4058 4059 e.error = geterrno4(res.status); 4060 } else { 4061 /* 4062 * When the verify request fails, the client ctime is 4063 * not in sync with the server. 
This is the same as 4064 * the version 3 "not synchronized" error, and we 4065 * handle it in a similar manner (XXX do we need to???). 4066 * Use the ctime returned in the first getattr for 4067 * the input to the next verify. 4068 * If we couldn't get the attributes, then we give up 4069 * because we can't complete the operation as required. 4070 */ 4071 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4072 } 4073 if (e.error) { 4074 PURGE_ATTRCACHE4(vp); 4075 nfs4_purge_stale_fh(e.error, vp, cr); 4076 } else { 4077 /* 4078 * retry with a new verify value 4079 */ 4080 ctime = garp->n4g_va.va_ctime; 4081 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4082 resp = NULL; 4083 } 4084 if (!e.error) { 4085 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4086 opsetattr.obj_attributes); 4087 if (verify_argop != -1) { 4088 nfs4args_verify_free(&argop[verify_argop]); 4089 verify_argop = -1; 4090 } 4091 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4092 goto do_again; 4093 } 4094 } while (!e.error); 4095 4096 if (e.error) { 4097 /* 4098 * If we are here, rfs4call has an irrecoverable error - return 4099 */ 4100 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4101 opsetattr.obj_attributes); 4102 if (verify_argop != -1) { 4103 nfs4args_verify_free(&argop[verify_argop]); 4104 verify_argop = -1; 4105 } 4106 if (resp) 4107 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4108 return (e.error); 4109 } 4110 4111 4112 4113 /* 4114 * If changing the size of the file, invalidate 4115 * any local cached data which is no longer part 4116 * of the file. We also possibly invalidate the 4117 * last page in the file. We could use 4118 * pvn_vpzero(), but this would mark the page as 4119 * modified and require it to be written back to 4120 * the server for no particularly good reason. 4121 * This way, if we access it, then we bring it 4122 * back in. A read should be cheaper than a 4123 * write. 
4124 */ 4125 if (mask & AT_SIZE) { 4126 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4127 } 4128 4129 /* either no error or one of the postop getattr failed */ 4130 4131 /* 4132 * XXX Perform a simplified version of wcc checking. Instead of 4133 * have another getattr to get pre-op, just purge cache if 4134 * any of the ops prior to and including the getattr failed. 4135 * If the getattr succeeded then update the attrcache accordingly. 4136 */ 4137 4138 garp = NULL; 4139 if (res.status == NFS4_OK) { 4140 /* 4141 * Last getattr 4142 */ 4143 resop = &res.array[numops - 1]; 4144 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4145 } 4146 /* 4147 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4148 * rather than filling it. See the function itself for details. 4149 */ 4150 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4151 if (garp != NULL) { 4152 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4153 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4154 vs_ace4_destroy(&garp->n4g_vsa); 4155 } else { 4156 if (vsap != NULL) { 4157 /* 4158 * The ACL was supposed to be set and to be 4159 * returned in the last getattr of this 4160 * compound, but for some reason the getattr 4161 * result doesn't contain the ACL. In this 4162 * case, purge the ACL cache. 4163 */ 4164 if (rp->r_secattr != NULL) { 4165 mutex_enter(&rp->r_statelock); 4166 vsp = rp->r_secattr; 4167 rp->r_secattr = NULL; 4168 mutex_exit(&rp->r_statelock); 4169 if (vsp != NULL) 4170 nfs4_acl_free_cache(vsp); 4171 } 4172 } 4173 } 4174 } 4175 4176 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4177 /* 4178 * Set the size, rather than relying on getting it updated 4179 * via a GETATTR. With delegations the client tries to 4180 * suppress GETATTR calls. 
4181 */ 4182 mutex_enter(&rp->r_statelock); 4183 rp->r_size = vap->va_size; 4184 mutex_exit(&rp->r_statelock); 4185 } 4186 4187 /* 4188 * Can free up request args and res 4189 */ 4190 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4191 opsetattr.obj_attributes); 4192 if (verify_argop != -1) { 4193 nfs4args_verify_free(&argop[verify_argop]); 4194 verify_argop = -1; 4195 } 4196 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4197 4198 /* 4199 * Some servers will change the mode to clear the setuid 4200 * and setgid bits when changing the uid or gid. The 4201 * client needs to compensate appropriately. 4202 */ 4203 if (mask & (AT_UID | AT_GID)) { 4204 int terror, do_setattr; 4205 4206 do_setattr = 0; 4207 va.va_mask = AT_MODE; 4208 terror = nfs4getattr(vp, &va, cr); 4209 if (!terror && 4210 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4211 (!(mask & AT_MODE) && va.va_mode != omode))) { 4212 va.va_mask = AT_MODE; 4213 if (mask & AT_MODE) { 4214 /* 4215 * We asked the mode to be changed and what 4216 * we just got from the server in getattr is 4217 * not what we wanted it to be, so set it now. 4218 */ 4219 va.va_mode = vap->va_mode; 4220 do_setattr = 1; 4221 } else { 4222 /* 4223 * We did not ask the mode to be changed, 4224 * Check to see that the server just cleared 4225 * I_SUID and I_GUID from it. If not then 4226 * set mode to omode with UID/GID cleared. 
4227 */ 4228 if (nfs4_compare_modes(va.va_mode, omode)) { 4229 omode &= ~(S_ISUID|S_ISGID); 4230 va.va_mode = omode; 4231 do_setattr = 1; 4232 } 4233 } 4234 4235 if (do_setattr) 4236 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4237 } 4238 } 4239 4240 return (e.error); 4241 } 4242 4243 /* ARGSUSED */ 4244 static int 4245 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 4246 { 4247 COMPOUND4args_clnt args; 4248 COMPOUND4res_clnt res; 4249 int doqueue; 4250 uint32_t acc, resacc, argacc; 4251 rnode4_t *rp; 4252 cred_t *cred, *ncr, *ncrfree = NULL; 4253 nfs4_access_type_t cacc; 4254 int num_ops; 4255 nfs_argop4 argop[3]; 4256 nfs_resop4 *resop; 4257 bool_t needrecov = FALSE, do_getattr; 4258 nfs4_recov_state_t recov_state; 4259 int rpc_error; 4260 hrtime_t t; 4261 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4262 mntinfo4_t *mi = VTOMI4(vp); 4263 4264 if (nfs_zone() != mi->mi_zone) 4265 return (EIO); 4266 4267 acc = 0; 4268 if (mode & VREAD) 4269 acc |= ACCESS4_READ; 4270 if (mode & VWRITE) { 4271 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4272 return (EROFS); 4273 if (vp->v_type == VDIR) 4274 acc |= ACCESS4_DELETE; 4275 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4276 } 4277 if (mode & VEXEC) { 4278 if (vp->v_type == VDIR) 4279 acc |= ACCESS4_LOOKUP; 4280 else 4281 acc |= ACCESS4_EXECUTE; 4282 } 4283 4284 if (VTOR4(vp)->r_acache != NULL) { 4285 e.error = nfs4_validate_caches(vp, cr); 4286 if (e.error) 4287 return (e.error); 4288 } 4289 4290 rp = VTOR4(vp); 4291 if (vp->v_type == VDIR) 4292 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4293 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4294 else 4295 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4296 ACCESS4_EXECUTE; 4297 recov_state.rs_flags = 0; 4298 recov_state.rs_num_retry_despite_err = 0; 4299 4300 cred = cr; 4301 /* 4302 * ncr and ncrfree both initially 4303 * point to the memory area returned 4304 * by crnetadjust(); 4305 * ncrfree not NULL when exiting 
means 4306 * that we need to release it 4307 */ 4308 ncr = crnetadjust(cred); 4309 ncrfree = ncr; 4310 4311 tryagain: 4312 cacc = nfs4_access_check(rp, acc, cred); 4313 if (cacc == NFS4_ACCESS_ALLOWED) { 4314 if (ncrfree != NULL) 4315 crfree(ncrfree); 4316 return (0); 4317 } 4318 if (cacc == NFS4_ACCESS_DENIED) { 4319 /* 4320 * If the cred can be adjusted, try again 4321 * with the new cred. 4322 */ 4323 if (ncr != NULL) { 4324 cred = ncr; 4325 ncr = NULL; 4326 goto tryagain; 4327 } 4328 if (ncrfree != NULL) 4329 crfree(ncrfree); 4330 return (EACCES); 4331 } 4332 4333 recov_retry: 4334 /* 4335 * Don't take with r_statev4_lock here. r_deleg_type could 4336 * change as soon as lock is released. Since it is an int, 4337 * there is no atomicity issue. 4338 */ 4339 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4340 num_ops = do_getattr ? 3 : 2; 4341 4342 args.ctag = TAG_ACCESS; 4343 4344 args.array_len = num_ops; 4345 args.array = argop; 4346 4347 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4348 &recov_state, NULL)) { 4349 if (ncrfree != NULL) 4350 crfree(ncrfree); 4351 return (e.error); 4352 } 4353 4354 /* putfh target fh */ 4355 argop[0].argop = OP_CPUTFH; 4356 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4357 4358 /* access */ 4359 argop[1].argop = OP_ACCESS; 4360 argop[1].nfs_argop4_u.opaccess.access = argacc; 4361 4362 /* getattr */ 4363 if (do_getattr) { 4364 argop[2].argop = OP_GETATTR; 4365 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4366 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4367 } 4368 4369 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4370 "nfs4_access: %s call, rp %s", needrecov ? 
"recov" : "first", 4371 rnode4info(VTOR4(vp)))); 4372 4373 doqueue = 1; 4374 t = gethrtime(); 4375 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4376 rpc_error = e.error; 4377 4378 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4379 if (needrecov) { 4380 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4381 "nfs4_access: initiating recovery\n")); 4382 4383 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4384 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) { 4385 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4386 &recov_state, needrecov); 4387 if (!e.error) 4388 (void) xdr_free(xdr_COMPOUND4res_clnt, 4389 (caddr_t)&res); 4390 goto recov_retry; 4391 } 4392 } 4393 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4394 4395 if (e.error) 4396 goto out; 4397 4398 if (res.status) { 4399 e.error = geterrno4(res.status); 4400 /* 4401 * This might generate over the wire calls throught 4402 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4403 * here to avoid a deadlock. 4404 */ 4405 nfs4_purge_stale_fh(e.error, vp, cr); 4406 goto out; 4407 } 4408 resop = &res.array[1]; /* access res */ 4409 4410 resacc = resop->nfs_resop4_u.opaccess.access; 4411 4412 if (do_getattr) { 4413 resop++; /* getattr res */ 4414 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4415 t, cr, FALSE, NULL); 4416 } 4417 4418 if (!e.error) { 4419 nfs4_access_cache(rp, argacc, resacc, cred); 4420 /* 4421 * we just cached results with cred; if cred is the 4422 * adjusted credentials from crnetadjust, we do not want 4423 * to release them before exiting: hence setting ncrfree 4424 * to NULL 4425 */ 4426 if (cred != cr) 4427 ncrfree = NULL; 4428 /* XXX check the supported bits too? */ 4429 if ((acc & resacc) != acc) { 4430 /* 4431 * The following code implements the semantic 4432 * that a setuid root program has *at least* the 4433 * permissions of the user that is running the 4434 * program. 
See rfs3call() for more portions 4435 * of the implementation of this functionality. 4436 */ 4437 /* XXX-LP */ 4438 if (ncr != NULL) { 4439 (void) xdr_free(xdr_COMPOUND4res_clnt, 4440 (caddr_t)&res); 4441 cred = ncr; 4442 ncr = NULL; 4443 goto tryagain; 4444 } 4445 e.error = EACCES; 4446 } 4447 } 4448 4449 out: 4450 if (!rpc_error) 4451 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4452 4453 if (ncrfree != NULL) 4454 crfree(ncrfree); 4455 4456 return (e.error); 4457 } 4458 4459 /* ARGSUSED */ 4460 static int 4461 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4462 { 4463 COMPOUND4args_clnt args; 4464 COMPOUND4res_clnt res; 4465 int doqueue; 4466 rnode4_t *rp; 4467 nfs_argop4 argop[3]; 4468 nfs_resop4 *resop; 4469 READLINK4res *lr_res; 4470 nfs4_ga_res_t *garp; 4471 uint_t len; 4472 char *linkdata; 4473 bool_t needrecov = FALSE; 4474 nfs4_recov_state_t recov_state; 4475 hrtime_t t; 4476 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4477 4478 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4479 return (EIO); 4480 /* 4481 * Can't readlink anything other than a symbolic link. 4482 */ 4483 if (vp->v_type != VLNK) 4484 return (EINVAL); 4485 4486 rp = VTOR4(vp); 4487 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4488 e.error = nfs4_validate_caches(vp, cr); 4489 if (e.error) 4490 return (e.error); 4491 mutex_enter(&rp->r_statelock); 4492 if (rp->r_symlink.contents != NULL) { 4493 e.error = uiomove(rp->r_symlink.contents, 4494 rp->r_symlink.len, UIO_READ, uiop); 4495 mutex_exit(&rp->r_statelock); 4496 return (e.error); 4497 } 4498 mutex_exit(&rp->r_statelock); 4499 } 4500 recov_state.rs_flags = 0; 4501 recov_state.rs_num_retry_despite_err = 0; 4502 4503 recov_retry: 4504 args.array_len = 3; 4505 args.array = argop; 4506 args.ctag = TAG_READLINK; 4507 4508 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4509 if (e.error) { 4510 return (e.error); 4511 } 4512 4513 /* 0. 
putfh symlink fh */ 4514 argop[0].argop = OP_CPUTFH; 4515 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4516 4517 /* 1. readlink */ 4518 argop[1].argop = OP_READLINK; 4519 4520 /* 2. getattr */ 4521 argop[2].argop = OP_GETATTR; 4522 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4523 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4524 4525 doqueue = 1; 4526 4527 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4528 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4529 rnode4info(VTOR4(vp)))); 4530 4531 t = gethrtime(); 4532 4533 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4534 4535 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4536 if (needrecov) { 4537 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4538 "nfs4_readlink: initiating recovery\n")); 4539 4540 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4541 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) { 4542 if (!e.error) 4543 (void) xdr_free(xdr_COMPOUND4res_clnt, 4544 (caddr_t)&res); 4545 4546 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4547 needrecov); 4548 goto recov_retry; 4549 } 4550 } 4551 4552 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4553 4554 if (e.error) 4555 return (e.error); 4556 4557 /* 4558 * There is an path in the code below which calls 4559 * nfs4_purge_stale_fh(), which may generate otw calls through 4560 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4561 * here to avoid nfs4_start_op() deadlock. 
4562 */ 4563 4564 if (res.status && (res.array_len < args.array_len)) { 4565 /* 4566 * either Putfh or Link failed 4567 */ 4568 e.error = geterrno4(res.status); 4569 nfs4_purge_stale_fh(e.error, vp, cr); 4570 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4571 return (e.error); 4572 } 4573 4574 resop = &res.array[1]; /* readlink res */ 4575 lr_res = &resop->nfs_resop4_u.opreadlink; 4576 4577 /* 4578 * treat symlink names as data 4579 */ 4580 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4581 if (linkdata != NULL) { 4582 int uio_len = len - 1; 4583 /* len includes null byte, which we won't uiomove */ 4584 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4585 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4586 mutex_enter(&rp->r_statelock); 4587 if (rp->r_symlink.contents == NULL) { 4588 rp->r_symlink.contents = linkdata; 4589 rp->r_symlink.len = uio_len; 4590 rp->r_symlink.size = len; 4591 mutex_exit(&rp->r_statelock); 4592 } else { 4593 mutex_exit(&rp->r_statelock); 4594 kmem_free(linkdata, len); 4595 } 4596 } else { 4597 kmem_free(linkdata, len); 4598 } 4599 } 4600 if (res.status == NFS4_OK) { 4601 resop++; /* getattr res */ 4602 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4603 } 4604 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4605 4606 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4607 4608 /* 4609 * The over the wire error for attempting to readlink something 4610 * other than a symbolic link is ENXIO. However, we need to 4611 * return EINVAL instead of ENXIO, so we map it here. 4612 */ 4613 return (e.error == ENXIO ? EINVAL : e.error); 4614 } 4615 4616 /* 4617 * Flush local dirty pages to stable storage on the server. 4618 * 4619 * If FNODSYNC is specified, then there is nothing to do because 4620 * metadata changes are not cached on the client before being 4621 * sent to the server. 
4622 */ 4623 /* ARGSUSED */ 4624 static int 4625 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4626 { 4627 int error; 4628 4629 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4630 return (0); 4631 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4632 return (EIO); 4633 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4634 if (!error) 4635 error = VTOR4(vp)->r_error; 4636 return (error); 4637 } 4638 4639 /* 4640 * Weirdness: if the file was removed or the target of a rename 4641 * operation while it was open, it got renamed instead. Here we 4642 * remove the renamed file. 4643 */ 4644 /* ARGSUSED */ 4645 void 4646 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4647 { 4648 rnode4_t *rp; 4649 4650 ASSERT(vp != DNLC_NO_VNODE); 4651 4652 rp = VTOR4(vp); 4653 4654 if (IS_SHADOW(vp, rp)) { 4655 sv_inactive(vp); 4656 return; 4657 } 4658 4659 /* 4660 * If this is coming from the wrong zone, we let someone in the right 4661 * zone take care of it asynchronously. We can get here due to 4662 * VN_RELE() being called from pageout() or fsflush(). This call may 4663 * potentially turn into an expensive no-op if, for instance, v_count 4664 * gets incremented in the meantime, but it's still correct. 4665 */ 4666 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4667 nfs4_async_inactive(vp, cr); 4668 return; 4669 } 4670 4671 /* 4672 * Some of the cleanup steps might require over-the-wire 4673 * operations. Since VOP_INACTIVE can get called as a result of 4674 * other over-the-wire operations (e.g., an attribute cache update 4675 * can lead to a DNLC purge), doing those steps now would lead to a 4676 * nested call to the recovery framework, which can deadlock. So 4677 * do any over-the-wire cleanups asynchronously, in a separate 4678 * thread. 
4679 */ 4680 4681 mutex_enter(&rp->r_os_lock); 4682 mutex_enter(&rp->r_statelock); 4683 mutex_enter(&rp->r_statev4_lock); 4684 4685 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4686 mutex_exit(&rp->r_statev4_lock); 4687 mutex_exit(&rp->r_statelock); 4688 mutex_exit(&rp->r_os_lock); 4689 nfs4_async_inactive(vp, cr); 4690 return; 4691 } 4692 4693 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4694 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4695 mutex_exit(&rp->r_statev4_lock); 4696 mutex_exit(&rp->r_statelock); 4697 mutex_exit(&rp->r_os_lock); 4698 nfs4_async_inactive(vp, cr); 4699 return; 4700 } 4701 4702 if (rp->r_unldvp != NULL) { 4703 mutex_exit(&rp->r_statev4_lock); 4704 mutex_exit(&rp->r_statelock); 4705 mutex_exit(&rp->r_os_lock); 4706 nfs4_async_inactive(vp, cr); 4707 return; 4708 } 4709 mutex_exit(&rp->r_statev4_lock); 4710 mutex_exit(&rp->r_statelock); 4711 mutex_exit(&rp->r_os_lock); 4712 4713 rp4_addfree(rp, cr); 4714 } 4715 4716 /* 4717 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4718 * various bits of state. The caller must not refer to vp after this call. 
4719 */ 4720 4721 void 4722 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4723 { 4724 rnode4_t *rp = VTOR4(vp); 4725 nfs4_recov_state_t recov_state; 4726 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4727 vnode_t *unldvp; 4728 char *unlname; 4729 cred_t *unlcred; 4730 COMPOUND4args_clnt args; 4731 COMPOUND4res_clnt res, *resp; 4732 nfs_argop4 argop[2]; 4733 int doqueue; 4734 #ifdef DEBUG 4735 char *name; 4736 #endif 4737 4738 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4739 ASSERT(!IS_SHADOW(vp, rp)); 4740 4741 #ifdef DEBUG 4742 name = fn_name(VTOSV(vp)->sv_name); 4743 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4744 "release vnode %s", name)); 4745 kmem_free(name, MAXNAMELEN); 4746 #endif 4747 4748 if (vp->v_type == VREG) { 4749 bool_t recov_failed = FALSE; 4750 4751 e.error = nfs4close_all(vp, cr); 4752 if (e.error) { 4753 /* Check to see if recovery failed */ 4754 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4755 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4756 recov_failed = TRUE; 4757 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4758 if (!recov_failed) { 4759 mutex_enter(&rp->r_statelock); 4760 if (rp->r_flags & R4RECOVERR) 4761 recov_failed = TRUE; 4762 mutex_exit(&rp->r_statelock); 4763 } 4764 if (recov_failed) { 4765 NFS4_DEBUG(nfs4_client_recov_debug, 4766 (CE_NOTE, "nfs4_inactive_otw: " 4767 "close failed (recovery failure)")); 4768 } 4769 } 4770 } 4771 4772 redo: 4773 if (rp->r_unldvp == NULL) { 4774 rp4_addfree(rp, cr); 4775 return; 4776 } 4777 4778 /* 4779 * Save the vnode pointer for the directory where the 4780 * unlinked-open file got renamed, then set it to NULL 4781 * to prevent another thread from getting here before 4782 * we're done with the remove. While we have the 4783 * statelock, make local copies of the pertinent rnode 4784 * fields. If we weren't to do this in an atomic way, the 4785 * the unl* fields could become inconsistent with respect 4786 * to each other due to a race condition between this 4787 * code and nfs_remove(). 
See bug report 1034328. 4788 */ 4789 mutex_enter(&rp->r_statelock); 4790 if (rp->r_unldvp == NULL) { 4791 mutex_exit(&rp->r_statelock); 4792 rp4_addfree(rp, cr); 4793 return; 4794 } 4795 4796 unldvp = rp->r_unldvp; 4797 rp->r_unldvp = NULL; 4798 unlname = rp->r_unlname; 4799 rp->r_unlname = NULL; 4800 unlcred = rp->r_unlcred; 4801 rp->r_unlcred = NULL; 4802 mutex_exit(&rp->r_statelock); 4803 4804 /* 4805 * If there are any dirty pages left, then flush 4806 * them. This is unfortunate because they just 4807 * may get thrown away during the remove operation, 4808 * but we have to do this for correctness. 4809 */ 4810 if (nfs4_has_pages(vp) && 4811 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4812 ASSERT(vp->v_type != VCHR); 4813 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL); 4814 if (e.error) { 4815 mutex_enter(&rp->r_statelock); 4816 if (!rp->r_error) 4817 rp->r_error = e.error; 4818 mutex_exit(&rp->r_statelock); 4819 } 4820 } 4821 4822 recov_state.rs_flags = 0; 4823 recov_state.rs_num_retry_despite_err = 0; 4824 recov_retry_remove: 4825 /* 4826 * Do the remove operation on the renamed file 4827 */ 4828 args.ctag = TAG_INACTIVE; 4829 4830 /* 4831 * Remove ops: putfh dir; remove 4832 */ 4833 args.array_len = 2; 4834 args.array = argop; 4835 4836 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4837 if (e.error) { 4838 kmem_free(unlname, MAXNAMELEN); 4839 crfree(unlcred); 4840 VN_RELE(unldvp); 4841 /* 4842 * Try again; this time around r_unldvp will be NULL, so we'll 4843 * just call rp4_addfree() and return. 4844 */ 4845 goto redo; 4846 } 4847 4848 /* putfh directory */ 4849 argop[0].argop = OP_CPUTFH; 4850 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4851 4852 /* remove */ 4853 argop[1].argop = OP_CREMOVE; 4854 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4855 4856 doqueue = 1; 4857 resp = &res; 4858 4859 #if 0 /* notyet */ 4860 /* 4861 * Can't do this yet. 
We may be being called from 4862 * dnlc_purge_XXX while that routine is holding a 4863 * mutex lock to the nc_rele list. The calls to 4864 * nfs3_cache_wcc_data may result in calls to 4865 * dnlc_purge_XXX. This will result in a deadlock. 4866 */ 4867 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4868 if (e.error) { 4869 PURGE_ATTRCACHE4(unldvp); 4870 resp = NULL; 4871 } else if (res.status) { 4872 e.error = geterrno4(res.status); 4873 PURGE_ATTRCACHE4(unldvp); 4874 /* 4875 * This code is inactive right now 4876 * but if made active there should 4877 * be a nfs4_end_op() call before 4878 * nfs4_purge_stale_fh to avoid start_op() 4879 * deadlock. See BugId: 4948726 4880 */ 4881 nfs4_purge_stale_fh(error, unldvp, cr); 4882 } else { 4883 nfs_resop4 *resop; 4884 REMOVE4res *rm_res; 4885 4886 resop = &res.array[1]; 4887 rm_res = &resop->nfs_resop4_u.opremove; 4888 /* 4889 * Update directory cache attribute, 4890 * readdir and dnlc caches. 4891 */ 4892 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4893 } 4894 #else 4895 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4896 4897 PURGE_ATTRCACHE4(unldvp); 4898 #endif 4899 4900 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4901 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4902 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 4903 if (!e.error) 4904 (void) xdr_free(xdr_COMPOUND4res_clnt, 4905 (caddr_t)&res); 4906 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4907 &recov_state, TRUE); 4908 goto recov_retry_remove; 4909 } 4910 } 4911 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4912 4913 /* 4914 * Release stuff held for the remove 4915 */ 4916 VN_RELE(unldvp); 4917 if (!e.error && resp) 4918 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4919 4920 kmem_free(unlname, MAXNAMELEN); 4921 crfree(unlcred); 4922 goto redo; 4923 } 4924 4925 /* 4926 * Remote file system operations having to do with directory manipulation. 
4927 */ 4928 /* ARGSUSED3 */ 4929 int 4930 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4931 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 4932 int *direntflags, pathname_t *realpnp) 4933 { 4934 int error; 4935 vnode_t *vp, *avp = NULL; 4936 rnode4_t *drp; 4937 4938 *vpp = NULL; 4939 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4940 return (EPERM); 4941 /* 4942 * if LOOKUP_XATTR, must replace dvp (object) with 4943 * object's attrdir before continuing with lookup 4944 */ 4945 if (flags & LOOKUP_XATTR) { 4946 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4947 if (error) 4948 return (error); 4949 4950 dvp = avp; 4951 4952 /* 4953 * If lookup is for "", just return dvp now. The attrdir 4954 * has already been activated (from nfs4lookup_xattr), and 4955 * the caller will RELE the original dvp -- not 4956 * the attrdir. So, set vpp and return. 4957 * Currently, when the LOOKUP_XATTR flag is 4958 * passed to VOP_LOOKUP, the name is always empty, and 4959 * shortcircuiting here avoids 3 unneeded lock/unlock 4960 * pairs. 4961 * 4962 * If a non-empty name was provided, then it is the 4963 * attribute name, and it will be looked up below. 4964 */ 4965 if (*nm == '\0') { 4966 *vpp = dvp; 4967 return (0); 4968 } 4969 4970 /* 4971 * The vfs layer never sends a name when asking for the 4972 * attrdir, so we should never get here (unless of course 4973 * name is passed at some time in future -- at which time 4974 * we'll blow up here). 4975 */ 4976 ASSERT(0); 4977 } 4978 4979 drp = VTOR4(dvp); 4980 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4981 return (EINTR); 4982 4983 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4984 nfs_rw_exit(&drp->r_rwlock); 4985 4986 /* 4987 * If vnode is a device, create special vnode. 
4988 */ 4989 if (!error && ISVDEV((*vpp)->v_type)) { 4990 vp = *vpp; 4991 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 4992 VN_RELE(vp); 4993 } 4994 4995 return (error); 4996 } 4997 4998 /* ARGSUSED */ 4999 static int 5000 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 5001 { 5002 int error; 5003 rnode4_t *drp; 5004 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 5005 mntinfo4_t *mi; 5006 5007 mi = VTOMI4(dvp); 5008 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) && 5009 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS)) 5010 return (EINVAL); 5011 5012 drp = VTOR4(dvp); 5013 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5014 return (EINTR); 5015 5016 mutex_enter(&drp->r_statelock); 5017 /* 5018 * If the server doesn't support xattrs just return EINVAL 5019 */ 5020 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 5021 mutex_exit(&drp->r_statelock); 5022 nfs_rw_exit(&drp->r_rwlock); 5023 return (EINVAL); 5024 } 5025 5026 /* 5027 * If there is a cached xattr directory entry, 5028 * use it as long as the attributes are valid. If the 5029 * attributes are not valid, take the simple approach and 5030 * free the cached value and re-fetch a new value. 5031 * 5032 * We don't negative entry cache for now, if we did we 5033 * would need to check if the file has changed on every 5034 * lookup. But xattrs don't exist very often and failing 5035 * an openattr is not much more expensive than and NVERIFY or GETATTR 5036 * so do an openattr over the wire for now. 
5037 */ 5038 if (drp->r_xattr_dir != NULL) { 5039 if (ATTRCACHE4_VALID(dvp)) { 5040 VN_HOLD(drp->r_xattr_dir); 5041 *vpp = drp->r_xattr_dir; 5042 mutex_exit(&drp->r_statelock); 5043 nfs_rw_exit(&drp->r_rwlock); 5044 return (0); 5045 } 5046 VN_RELE(drp->r_xattr_dir); 5047 drp->r_xattr_dir = NULL; 5048 } 5049 mutex_exit(&drp->r_statelock); 5050 5051 error = nfs4openattr(dvp, vpp, cflag, cr); 5052 5053 nfs_rw_exit(&drp->r_rwlock); 5054 5055 return (error); 5056 } 5057 5058 static int 5059 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 5060 { 5061 int error; 5062 rnode4_t *drp; 5063 5064 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5065 5066 /* 5067 * If lookup is for "", just return dvp. Don't need 5068 * to send it over the wire, look it up in the dnlc, 5069 * or perform any access checks. 5070 */ 5071 if (*nm == '\0') { 5072 VN_HOLD(dvp); 5073 *vpp = dvp; 5074 return (0); 5075 } 5076 5077 /* 5078 * Can't do lookups in non-directories. 5079 */ 5080 if (dvp->v_type != VDIR) 5081 return (ENOTDIR); 5082 5083 /* 5084 * If lookup is for ".", just return dvp. Don't need 5085 * to send it over the wire or look it up in the dnlc, 5086 * just need to check access. 5087 */ 5088 if (nm[0] == '.' && nm[1] == '\0') { 5089 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5090 if (error) 5091 return (error); 5092 VN_HOLD(dvp); 5093 *vpp = dvp; 5094 return (0); 5095 } 5096 5097 drp = VTOR4(dvp); 5098 if (!(drp->r_flags & R4LOOKUP)) { 5099 mutex_enter(&drp->r_statelock); 5100 drp->r_flags |= R4LOOKUP; 5101 mutex_exit(&drp->r_statelock); 5102 } 5103 5104 *vpp = NULL; 5105 /* 5106 * Lookup this name in the DNLC. If there is no entry 5107 * lookup over the wire. 5108 */ 5109 if (!skipdnlc) 5110 *vpp = dnlc_lookup(dvp, nm); 5111 if (*vpp == NULL) { 5112 /* 5113 * We need to go over the wire to lookup the name. 
5114 */ 5115 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5116 } 5117 5118 /* 5119 * We hit on the dnlc 5120 */ 5121 if (*vpp != DNLC_NO_VNODE || 5122 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5123 /* 5124 * But our attrs may not be valid. 5125 */ 5126 if (ATTRCACHE4_VALID(dvp)) { 5127 error = nfs4_waitfor_purge_complete(dvp); 5128 if (error) { 5129 VN_RELE(*vpp); 5130 *vpp = NULL; 5131 return (error); 5132 } 5133 5134 /* 5135 * If after the purge completes, check to make sure 5136 * our attrs are still valid. 5137 */ 5138 if (ATTRCACHE4_VALID(dvp)) { 5139 /* 5140 * If we waited for a purge we may have 5141 * lost our vnode so look it up again. 5142 */ 5143 VN_RELE(*vpp); 5144 *vpp = dnlc_lookup(dvp, nm); 5145 if (*vpp == NULL) 5146 return (nfs4lookupnew_otw(dvp, 5147 nm, vpp, cr)); 5148 5149 /* 5150 * The access cache should almost always hit 5151 */ 5152 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5153 5154 if (error) { 5155 VN_RELE(*vpp); 5156 *vpp = NULL; 5157 return (error); 5158 } 5159 if (*vpp == DNLC_NO_VNODE) { 5160 VN_RELE(*vpp); 5161 *vpp = NULL; 5162 return (ENOENT); 5163 } 5164 return (0); 5165 } 5166 } 5167 } 5168 5169 ASSERT(*vpp != NULL); 5170 5171 /* 5172 * We may have gotten here we have one of the following cases: 5173 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5174 * need to validate them. 5175 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5176 * must validate. 5177 * 5178 * Go to the server and check if the directory has changed, if 5179 * it hasn't we are done and can use the dnlc entry. 5180 */ 5181 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5182 } 5183 5184 /* 5185 * Go to the server and check if the directory has changed, if 5186 * it hasn't we are done and can use the dnlc entry. If it 5187 * has changed we get a new copy of its attributes and check 5188 * the access for VEXEC, then relookup the filename and 5189 * get its filehandle and attributes. 
5190 * 5191 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5192 * if the NVERIFY failed we must 5193 * purge the caches 5194 * cache new attributes (will set r_time_attr_inval) 5195 * cache new access 5196 * recheck VEXEC access 5197 * add name to dnlc, possibly negative 5198 * if LOOKUP succeeded 5199 * cache new attributes 5200 * else 5201 * set a new r_time_attr_inval for dvp 5202 * check to make sure we have access 5203 * 5204 * The vpp returned is the vnode passed in if the directory is valid, 5205 * a new vnode if successful lookup, or NULL on error. 5206 */ 5207 static int 5208 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5209 { 5210 COMPOUND4args_clnt args; 5211 COMPOUND4res_clnt res; 5212 fattr4 *ver_fattr; 5213 fattr4_change dchange; 5214 int32_t *ptr; 5215 int argoplist_size = 7 * sizeof (nfs_argop4); 5216 nfs_argop4 *argop; 5217 int doqueue; 5218 mntinfo4_t *mi; 5219 nfs4_recov_state_t recov_state; 5220 hrtime_t t; 5221 int isdotdot; 5222 vnode_t *nvp; 5223 nfs_fh4 *fhp; 5224 nfs4_sharedfh_t *sfhp; 5225 nfs4_access_type_t cacc; 5226 rnode4_t *nrp; 5227 rnode4_t *drp = VTOR4(dvp); 5228 nfs4_ga_res_t *garp = NULL; 5229 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5230 5231 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5232 ASSERT(nm != NULL); 5233 ASSERT(nm[0] != '\0'); 5234 ASSERT(dvp->v_type == VDIR); 5235 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5236 ASSERT(*vpp != NULL); 5237 5238 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5239 isdotdot = 1; 5240 args.ctag = TAG_LOOKUP_VPARENT; 5241 } else { 5242 /* 5243 * If dvp were a stub, it should have triggered and caused 5244 * a mount for us to get this far. 
5245 */ 5246 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5247 5248 isdotdot = 0; 5249 args.ctag = TAG_LOOKUP_VALID; 5250 } 5251 5252 mi = VTOMI4(dvp); 5253 recov_state.rs_flags = 0; 5254 recov_state.rs_num_retry_despite_err = 0; 5255 5256 nvp = NULL; 5257 5258 /* Save the original mount point security information */ 5259 (void) save_mnt_secinfo(mi->mi_curr_serv); 5260 5261 recov_retry: 5262 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5263 &recov_state, NULL); 5264 if (e.error) { 5265 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5266 VN_RELE(*vpp); 5267 *vpp = NULL; 5268 return (e.error); 5269 } 5270 5271 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5272 5273 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5274 args.array_len = 7; 5275 args.array = argop; 5276 5277 /* 0. putfh file */ 5278 argop[0].argop = OP_CPUTFH; 5279 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5280 5281 /* 1. nverify the change info */ 5282 argop[1].argop = OP_NVERIFY; 5283 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5284 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5285 ver_fattr->attrlist4 = (char *)&dchange; 5286 ptr = (int32_t *)&dchange; 5287 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5288 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5289 5290 /* 2. getattr directory */ 5291 argop[2].argop = OP_GETATTR; 5292 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5293 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5294 5295 /* 3. access directory */ 5296 argop[3].argop = OP_ACCESS; 5297 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5298 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5299 5300 /* 4. lookup name */ 5301 if (isdotdot) { 5302 argop[4].argop = OP_LOOKUPP; 5303 } else { 5304 argop[4].argop = OP_CLOOKUP; 5305 argop[4].nfs_argop4_u.opclookup.cname = nm; 5306 } 5307 5308 /* 5. resulting file handle */ 5309 argop[5].argop = OP_GETFH; 5310 5311 /* 6. 
resulting file attributes */ 5312 argop[6].argop = OP_GETATTR; 5313 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5314 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5315 5316 doqueue = 1; 5317 t = gethrtime(); 5318 5319 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5320 5321 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5322 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5323 if (e.error != 0 && *vpp != NULL) 5324 VN_RELE(*vpp); 5325 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5326 &recov_state, FALSE); 5327 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5328 kmem_free(argop, argoplist_size); 5329 return (e.error); 5330 } 5331 5332 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5333 /* 5334 * For WRONGSEC of a non-dotdot case, send secinfo directly 5335 * from this thread, do not go thru the recovery thread since 5336 * we need the nm information. 5337 * 5338 * Not doing dotdot case because there is no specification 5339 * for (PUTFH, SECINFO "..") yet. 
5340 */ 5341 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5342 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5343 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5344 &recov_state, FALSE); 5345 else 5346 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5347 &recov_state, TRUE); 5348 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5349 kmem_free(argop, argoplist_size); 5350 if (!e.error) 5351 goto recov_retry; 5352 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5353 VN_RELE(*vpp); 5354 *vpp = NULL; 5355 return (e.error); 5356 } 5357 5358 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5359 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5360 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5361 &recov_state, TRUE); 5362 5363 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5364 kmem_free(argop, argoplist_size); 5365 goto recov_retry; 5366 } 5367 } 5368 5369 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5370 5371 if (e.error || res.array_len == 0) { 5372 /* 5373 * If e.error isn't set, then reply has no ops (or we couldn't 5374 * be here). The only legal way to reply without an op array 5375 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5376 * be in the reply for all other status values. 5377 * 5378 * For valid replies without an ops array, return ENOTSUP 5379 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5380 * return EIO -- don't trust status. 5381 */ 5382 if (e.error == 0) 5383 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5384 ENOTSUP : EIO; 5385 VN_RELE(*vpp); 5386 *vpp = NULL; 5387 kmem_free(argop, argoplist_size); 5388 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5389 return (e.error); 5390 } 5391 5392 if (res.status != NFS4ERR_SAME) { 5393 e.error = geterrno4(res.status); 5394 5395 /* 5396 * The NVERIFY "failed" so the directory has changed 5397 * First make sure PUTFH succeeded and NVERIFY "failed" 5398 * cleanly. 
5399 */ 5400 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5401 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5402 nfs4_purge_stale_fh(e.error, dvp, cr); 5403 VN_RELE(*vpp); 5404 *vpp = NULL; 5405 goto exit; 5406 } 5407 5408 /* 5409 * We know the NVERIFY "failed" so we must: 5410 * purge the caches (access and indirectly dnlc if needed) 5411 */ 5412 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5413 5414 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5415 nfs4_purge_stale_fh(e.error, dvp, cr); 5416 VN_RELE(*vpp); 5417 *vpp = NULL; 5418 goto exit; 5419 } 5420 5421 /* 5422 * Install new cached attributes for the directory 5423 */ 5424 nfs4_attr_cache(dvp, 5425 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5426 t, cr, FALSE, NULL); 5427 5428 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5429 nfs4_purge_stale_fh(e.error, dvp, cr); 5430 VN_RELE(*vpp); 5431 *vpp = NULL; 5432 e.error = geterrno4(res.status); 5433 goto exit; 5434 } 5435 5436 /* 5437 * Now we know the directory is valid, 5438 * cache new directory access 5439 */ 5440 nfs4_access_cache(drp, 5441 args.array[3].nfs_argop4_u.opaccess.access, 5442 res.array[3].nfs_resop4_u.opaccess.access, cr); 5443 5444 /* 5445 * recheck VEXEC access 5446 */ 5447 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5448 if (cacc != NFS4_ACCESS_ALLOWED) { 5449 /* 5450 * Directory permissions might have been revoked 5451 */ 5452 if (cacc == NFS4_ACCESS_DENIED) { 5453 e.error = EACCES; 5454 VN_RELE(*vpp); 5455 *vpp = NULL; 5456 goto exit; 5457 } 5458 5459 /* 5460 * Somehow we must not have asked for enough 5461 * so try a singleton ACCESS, should never happen. 
5462 */ 5463 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5464 if (e.error) { 5465 VN_RELE(*vpp); 5466 *vpp = NULL; 5467 goto exit; 5468 } 5469 } 5470 5471 e.error = geterrno4(res.status); 5472 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5473 /* 5474 * The lookup failed, probably no entry 5475 */ 5476 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5477 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5478 } else { 5479 /* 5480 * Might be some other error, so remove 5481 * the dnlc entry to make sure we start all 5482 * over again, next time. 5483 */ 5484 dnlc_remove(dvp, nm); 5485 } 5486 VN_RELE(*vpp); 5487 *vpp = NULL; 5488 goto exit; 5489 } 5490 5491 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5492 /* 5493 * The file exists but we can't get its fh for 5494 * some unknown reason. Remove it from the dnlc 5495 * and error out to be safe. 5496 */ 5497 dnlc_remove(dvp, nm); 5498 VN_RELE(*vpp); 5499 *vpp = NULL; 5500 goto exit; 5501 } 5502 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5503 if (fhp->nfs_fh4_len == 0) { 5504 /* 5505 * The file exists but a bogus fh 5506 * some unknown reason. Remove it from the dnlc 5507 * and error out to be safe. 5508 */ 5509 e.error = ENOENT; 5510 dnlc_remove(dvp, nm); 5511 VN_RELE(*vpp); 5512 *vpp = NULL; 5513 goto exit; 5514 } 5515 sfhp = sfh4_get(fhp, mi); 5516 5517 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5518 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5519 5520 /* 5521 * Make the new rnode 5522 */ 5523 if (isdotdot) { 5524 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5525 if (e.error) { 5526 sfh4_rele(&sfhp); 5527 VN_RELE(*vpp); 5528 *vpp = NULL; 5529 goto exit; 5530 } 5531 /* 5532 * XXX if nfs4_make_dotdot uses an existing rnode 5533 * XXX it doesn't update the attributes. 
5534 * XXX for now just save them again to save an OTW 5535 */ 5536 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5537 } else { 5538 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5539 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5540 /* 5541 * If v_type == VNON, then garp was NULL because 5542 * the last op in the compound failed and makenfs4node 5543 * could not find the vnode for sfhp. It created 5544 * a new vnode, so we have nothing to purge here. 5545 */ 5546 if (nvp->v_type == VNON) { 5547 vattr_t vattr; 5548 5549 vattr.va_mask = AT_TYPE; 5550 /* 5551 * N.B. We've already called nfs4_end_fop above. 5552 */ 5553 e.error = nfs4getattr(nvp, &vattr, cr); 5554 if (e.error) { 5555 sfh4_rele(&sfhp); 5556 VN_RELE(*vpp); 5557 *vpp = NULL; 5558 VN_RELE(nvp); 5559 goto exit; 5560 } 5561 nvp->v_type = vattr.va_type; 5562 } 5563 } 5564 sfh4_rele(&sfhp); 5565 5566 nrp = VTOR4(nvp); 5567 mutex_enter(&nrp->r_statev4_lock); 5568 if (!nrp->created_v4) { 5569 mutex_exit(&nrp->r_statev4_lock); 5570 dnlc_update(dvp, nm, nvp); 5571 } else 5572 mutex_exit(&nrp->r_statev4_lock); 5573 5574 VN_RELE(*vpp); 5575 *vpp = nvp; 5576 } else { 5577 hrtime_t now; 5578 hrtime_t delta = 0; 5579 5580 e.error = 0; 5581 5582 /* 5583 * Because the NVERIFY "succeeded" we know that the 5584 * directory attributes are still valid 5585 * so update r_time_attr_inval 5586 */ 5587 now = gethrtime(); 5588 mutex_enter(&drp->r_statelock); 5589 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5590 delta = now - drp->r_time_attr_saved; 5591 if (delta < mi->mi_acdirmin) 5592 delta = mi->mi_acdirmin; 5593 else if (delta > mi->mi_acdirmax) 5594 delta = mi->mi_acdirmax; 5595 } 5596 drp->r_time_attr_inval = now + delta; 5597 mutex_exit(&drp->r_statelock); 5598 dnlc_update(dvp, nm, *vpp); 5599 5600 /* 5601 * Even though we have a valid directory attr cache 5602 * and dnlc entry, we may not have access. 5603 * This should almost always hit the cache. 
5604 */ 5605 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5606 if (e.error) { 5607 VN_RELE(*vpp); 5608 *vpp = NULL; 5609 } 5610 5611 if (*vpp == DNLC_NO_VNODE) { 5612 VN_RELE(*vpp); 5613 *vpp = NULL; 5614 e.error = ENOENT; 5615 } 5616 } 5617 5618 exit: 5619 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5620 kmem_free(argop, argoplist_size); 5621 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5622 return (e.error); 5623 } 5624 5625 /* 5626 * We need to go over the wire to lookup the name, but 5627 * while we are there verify the directory has not 5628 * changed but if it has, get new attributes and check access 5629 * 5630 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5631 * NVERIFY GETATTR ACCESS 5632 * 5633 * With the results: 5634 * if the NVERIFY failed we must purge the caches, add new attributes, 5635 * and cache new access. 5636 * set a new r_time_attr_inval 5637 * add name to dnlc, possibly negative 5638 * if LOOKUP succeeded 5639 * cache new attributes 5640 */ 5641 static int 5642 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5643 { 5644 COMPOUND4args_clnt args; 5645 COMPOUND4res_clnt res; 5646 fattr4 *ver_fattr; 5647 fattr4_change dchange; 5648 int32_t *ptr; 5649 nfs4_ga_res_t *garp = NULL; 5650 int argoplist_size = 9 * sizeof (nfs_argop4); 5651 nfs_argop4 *argop; 5652 int doqueue; 5653 mntinfo4_t *mi; 5654 nfs4_recov_state_t recov_state; 5655 hrtime_t t; 5656 int isdotdot; 5657 vnode_t *nvp; 5658 nfs_fh4 *fhp; 5659 nfs4_sharedfh_t *sfhp; 5660 nfs4_access_type_t cacc; 5661 rnode4_t *nrp; 5662 rnode4_t *drp = VTOR4(dvp); 5663 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5664 5665 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5666 ASSERT(nm != NULL); 5667 ASSERT(nm[0] != '\0'); 5668 ASSERT(dvp->v_type == VDIR); 5669 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5670 ASSERT(*vpp == NULL); 5671 5672 if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] == '\0') { 5673 isdotdot = 1; 5674 args.ctag = TAG_LOOKUP_PARENT; 5675 } else { 5676 /* 5677 * If dvp were a stub, it should have triggered and caused 5678 * a mount for us to get this far. 5679 */ 5680 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5681 5682 isdotdot = 0; 5683 args.ctag = TAG_LOOKUP; 5684 } 5685 5686 mi = VTOMI4(dvp); 5687 recov_state.rs_flags = 0; 5688 recov_state.rs_num_retry_despite_err = 0; 5689 5690 nvp = NULL; 5691 5692 /* Save the original mount point security information */ 5693 (void) save_mnt_secinfo(mi->mi_curr_serv); 5694 5695 recov_retry: 5696 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5697 &recov_state, NULL); 5698 if (e.error) { 5699 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5700 return (e.error); 5701 } 5702 5703 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5704 5705 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5706 args.array_len = 9; 5707 args.array = argop; 5708 5709 /* 0. putfh file */ 5710 argop[0].argop = OP_CPUTFH; 5711 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5712 5713 /* 1. savefh for the nverify */ 5714 argop[1].argop = OP_SAVEFH; 5715 5716 /* 2. lookup name */ 5717 if (isdotdot) { 5718 argop[2].argop = OP_LOOKUPP; 5719 } else { 5720 argop[2].argop = OP_CLOOKUP; 5721 argop[2].nfs_argop4_u.opclookup.cname = nm; 5722 } 5723 5724 /* 3. resulting file handle */ 5725 argop[3].argop = OP_GETFH; 5726 5727 /* 4. resulting file attributes */ 5728 argop[4].argop = OP_GETATTR; 5729 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5730 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5731 5732 /* 5. restorefh back the directory for the nverify */ 5733 argop[5].argop = OP_RESTOREFH; 5734 5735 /* 6. 
nverify the change info */ 5736 argop[6].argop = OP_NVERIFY; 5737 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5738 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5739 ver_fattr->attrlist4 = (char *)&dchange; 5740 ptr = (int32_t *)&dchange; 5741 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5742 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5743 5744 /* 7. getattr directory */ 5745 argop[7].argop = OP_GETATTR; 5746 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5747 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5748 5749 /* 8. access directory */ 5750 argop[8].argop = OP_ACCESS; 5751 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5752 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5753 5754 doqueue = 1; 5755 t = gethrtime(); 5756 5757 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5758 5759 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5760 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5761 if (e.error != 0 && *vpp != NULL) 5762 VN_RELE(*vpp); 5763 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5764 &recov_state, FALSE); 5765 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5766 kmem_free(argop, argoplist_size); 5767 return (e.error); 5768 } 5769 5770 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5771 /* 5772 * For WRONGSEC of a non-dotdot case, send secinfo directly 5773 * from this thread, do not go thru the recovery thread since 5774 * we need the nm information. 5775 * 5776 * Not doing dotdot case because there is no specification 5777 * for (PUTFH, SECINFO "..") yet. 
5778 */ 5779 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5780 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5781 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5782 &recov_state, FALSE); 5783 else 5784 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5785 &recov_state, TRUE); 5786 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5787 kmem_free(argop, argoplist_size); 5788 if (!e.error) 5789 goto recov_retry; 5790 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5791 return (e.error); 5792 } 5793 5794 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5795 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5796 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5797 &recov_state, TRUE); 5798 5799 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5800 kmem_free(argop, argoplist_size); 5801 goto recov_retry; 5802 } 5803 } 5804 5805 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5806 5807 if (e.error || res.array_len == 0) { 5808 /* 5809 * If e.error isn't set, then reply has no ops (or we couldn't 5810 * be here). The only legal way to reply without an op array 5811 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5812 * be in the reply for all other status values. 5813 * 5814 * For valid replies without an ops array, return ENOTSUP 5815 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5816 * return EIO -- don't trust status. 5817 */ 5818 if (e.error == 0) 5819 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5820 ENOTSUP : EIO; 5821 5822 kmem_free(argop, argoplist_size); 5823 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5824 return (e.error); 5825 } 5826 5827 e.error = geterrno4(res.status); 5828 5829 /* 5830 * The PUTFH and SAVEFH may have failed. 
5831 */ 5832 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5833 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5834 nfs4_purge_stale_fh(e.error, dvp, cr); 5835 goto exit; 5836 } 5837 5838 /* 5839 * Check if the file exists, if it does delay entering 5840 * into the dnlc until after we update the directory 5841 * attributes so we don't cause it to get purged immediately. 5842 */ 5843 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5844 /* 5845 * The lookup failed, probably no entry 5846 */ 5847 if (e.error == ENOENT && nfs4_lookup_neg_cache) 5848 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5849 goto exit; 5850 } 5851 5852 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5853 /* 5854 * The file exists but we can't get its fh for 5855 * some unknown reason. Error out to be safe. 5856 */ 5857 goto exit; 5858 } 5859 5860 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5861 if (fhp->nfs_fh4_len == 0) { 5862 /* 5863 * The file exists but a bogus fh 5864 * some unknown reason. Error out to be safe. 5865 */ 5866 e.error = EIO; 5867 goto exit; 5868 } 5869 sfhp = sfh4_get(fhp, mi); 5870 5871 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5872 sfh4_rele(&sfhp); 5873 goto exit; 5874 } 5875 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5876 5877 /* 5878 * The RESTOREFH may have failed 5879 */ 5880 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5881 sfh4_rele(&sfhp); 5882 e.error = EIO; 5883 goto exit; 5884 } 5885 5886 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5887 /* 5888 * First make sure the NVERIFY failed as we expected, 5889 * if it didn't then be conservative and error out 5890 * as we can't trust the directory. 
5891 */ 5892 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5893 sfh4_rele(&sfhp); 5894 e.error = EIO; 5895 goto exit; 5896 } 5897 5898 /* 5899 * We know the NVERIFY "failed" so the directory has changed, 5900 * so we must: 5901 * purge the caches (access and indirectly dnlc if needed) 5902 */ 5903 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5904 5905 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5906 sfh4_rele(&sfhp); 5907 goto exit; 5908 } 5909 nfs4_attr_cache(dvp, 5910 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5911 t, cr, FALSE, NULL); 5912 5913 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5914 nfs4_purge_stale_fh(e.error, dvp, cr); 5915 sfh4_rele(&sfhp); 5916 e.error = geterrno4(res.status); 5917 goto exit; 5918 } 5919 5920 /* 5921 * Now we know the directory is valid, 5922 * cache new directory access 5923 */ 5924 nfs4_access_cache(drp, 5925 args.array[8].nfs_argop4_u.opaccess.access, 5926 res.array[8].nfs_resop4_u.opaccess.access, cr); 5927 5928 /* 5929 * recheck VEXEC access 5930 */ 5931 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5932 if (cacc != NFS4_ACCESS_ALLOWED) { 5933 /* 5934 * Directory permissions might have been revoked 5935 */ 5936 if (cacc == NFS4_ACCESS_DENIED) { 5937 sfh4_rele(&sfhp); 5938 e.error = EACCES; 5939 goto exit; 5940 } 5941 5942 /* 5943 * Somehow we must not have asked for enough 5944 * so try a singleton ACCESS should never happen 5945 */ 5946 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5947 if (e.error) { 5948 sfh4_rele(&sfhp); 5949 goto exit; 5950 } 5951 } 5952 5953 e.error = geterrno4(res.status); 5954 } else { 5955 hrtime_t now; 5956 hrtime_t delta = 0; 5957 5958 e.error = 0; 5959 5960 /* 5961 * Because the NVERIFY "succeeded" we know that the 5962 * directory attributes are still valid 5963 * so update r_time_attr_inval 5964 */ 5965 now = gethrtime(); 5966 mutex_enter(&drp->r_statelock); 5967 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5968 
delta = now - drp->r_time_attr_saved; 5969 if (delta < mi->mi_acdirmin) 5970 delta = mi->mi_acdirmin; 5971 else if (delta > mi->mi_acdirmax) 5972 delta = mi->mi_acdirmax; 5973 } 5974 drp->r_time_attr_inval = now + delta; 5975 mutex_exit(&drp->r_statelock); 5976 5977 /* 5978 * Even though we have a valid directory attr cache, 5979 * we may not have access. 5980 * This should almost always hit the cache. 5981 */ 5982 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5983 if (e.error) { 5984 sfh4_rele(&sfhp); 5985 goto exit; 5986 } 5987 } 5988 5989 /* 5990 * Now we have successfully completed the lookup, if the 5991 * directory has changed we now have the valid attributes. 5992 * We also know we have directory access. 5993 * Create the new rnode and insert it in the dnlc. 5994 */ 5995 if (isdotdot) { 5996 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5997 if (e.error) { 5998 sfh4_rele(&sfhp); 5999 goto exit; 6000 } 6001 /* 6002 * XXX if nfs4_make_dotdot uses an existing rnode 6003 * XXX it doesn't update the attributes. 
6004 * XXX for now just save them again to save an OTW 6005 */ 6006 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 6007 } else { 6008 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 6009 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 6010 } 6011 sfh4_rele(&sfhp); 6012 6013 nrp = VTOR4(nvp); 6014 mutex_enter(&nrp->r_statev4_lock); 6015 if (!nrp->created_v4) { 6016 mutex_exit(&nrp->r_statev4_lock); 6017 dnlc_update(dvp, nm, nvp); 6018 } else 6019 mutex_exit(&nrp->r_statev4_lock); 6020 6021 *vpp = nvp; 6022 6023 exit: 6024 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6025 kmem_free(argop, argoplist_size); 6026 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 6027 return (e.error); 6028 } 6029 6030 #ifdef DEBUG 6031 void 6032 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 6033 { 6034 uint_t i, len; 6035 zoneid_t zoneid = getzoneid(); 6036 char *s; 6037 6038 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 6039 for (i = 0; i < argcnt; i++) { 6040 nfs_argop4 *op = &argbase[i]; 6041 switch (op->argop) { 6042 case OP_CPUTFH: 6043 case OP_PUTFH: 6044 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 6045 break; 6046 case OP_PUTROOTFH: 6047 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 6048 break; 6049 case OP_CLOOKUP: 6050 s = op->nfs_argop4_u.opclookup.cname; 6051 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6052 break; 6053 case OP_LOOKUP: 6054 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 6055 &len, NULL); 6056 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6057 kmem_free(s, len); 6058 break; 6059 case OP_LOOKUPP: 6060 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 6061 break; 6062 case OP_GETFH: 6063 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 6064 break; 6065 case OP_GETATTR: 6066 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 6067 break; 6068 case OP_OPENATTR: 6069 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 6070 break; 6071 default: 6072 zcmn_err(zoneid, CE_NOTE, "\t op 
%d, opcode %d", i, 6073 op->argop); 6074 break; 6075 } 6076 } 6077 } 6078 #endif 6079 6080 /* 6081 * nfs4lookup_setup - constructs a multi-lookup compound request. 6082 * 6083 * Given the path "nm1/nm2/.../nmn", the following compound requests 6084 * may be created: 6085 * 6086 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 6087 * is faster, for now. 6088 * 6089 * l4_getattrs indicates the type of compound requested. 6090 * 6091 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 6092 * 6093 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 6094 * 6095 * total number of ops is n + 1. 6096 * 6097 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 6098 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 6099 * before the last component, and only get attributes 6100 * for the last component. Note that the second-to-last 6101 * pathname component is XATTR_RPATH, which does NOT go 6102 * over-the-wire as a lookup. 6103 * 6104 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 6105 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 6106 * 6107 * and total number of ops is n + 5. 6108 * 6109 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 6110 * attribute directory: create lookups plus an OPENATTR 6111 * replacing the last lookup. Note that the last pathname 6112 * component is XATTR_RPATH, which does NOT go over-the-wire 6113 * as a lookup. 6114 * 6115 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 6116 * Openattr; Getfh; Getattr } 6117 * 6118 * and total number of ops is n + 5. 6119 * 6120 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 6121 * nodes too. 6122 * 6123 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6124 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6125 * 6126 * and total number of ops is 3*n + 1. 
6127 * 6128 * All cases: returns the index in the arg array of the final LOOKUP op, or 6129 * -1 if no LOOKUPs were used. 6130 */ 6131 int 6132 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6133 { 6134 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6135 nfs_argop4 *argbase, *argop; 6136 int arglen, argcnt; 6137 int n = 1; /* number of components */ 6138 int nga = 1; /* number of Getattr's in request */ 6139 char c = '\0', *s, *p; 6140 int lookup_idx = -1; 6141 int argoplist_size; 6142 6143 /* set lookuparg response result to 0 */ 6144 lookupargp->resp->status = NFS4_OK; 6145 6146 /* skip leading "/" or "." e.g. ".//./" if there is */ 6147 for (; ; nm++) { 6148 if (*nm != '/' && *nm != '.') 6149 break; 6150 6151 /* ".." is counted as 1 component */ 6152 if (*nm == '.' && *(nm + 1) != '/') 6153 break; 6154 } 6155 6156 /* 6157 * Find n = number of components - nm must be null terminated 6158 * Skip "." components. 6159 */ 6160 if (*nm != '\0') 6161 for (n = 1, s = nm; *s != '\0'; s++) { 6162 if ((*s == '/') && (*(s + 1) != '/') && 6163 (*(s + 1) != '\0') && 6164 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6165 *(s + 2) == '\0'))) 6166 n++; 6167 } 6168 else 6169 n = 0; 6170 6171 /* 6172 * nga is number of components that need Getfh+Getattr 6173 */ 6174 switch (l4_getattrs) { 6175 case LKP4_NO_ATTRIBUTES: 6176 nga = 0; 6177 break; 6178 case LKP4_ALL_ATTRIBUTES: 6179 nga = n; 6180 /* 6181 * Always have at least 1 getfh, getattr pair 6182 */ 6183 if (nga == 0) 6184 nga++; 6185 break; 6186 case LKP4_LAST_ATTRDIR: 6187 case LKP4_LAST_NAMED_ATTR: 6188 nga = n+1; 6189 break; 6190 } 6191 6192 /* 6193 * If change to use the filehandle attr instead of getfh 6194 * the following line can be deleted. 
6195 */ 6196 nga *= 2; 6197 6198 /* 6199 * calculate number of ops in request as 6200 * header + trailer + lookups + getattrs 6201 */ 6202 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6203 6204 argoplist_size = arglen * sizeof (nfs_argop4); 6205 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6206 lookupargp->argsp->array = argop; 6207 6208 argcnt = lookupargp->header_len; 6209 argop += argcnt; 6210 6211 /* 6212 * loop and create a lookup op and possibly getattr/getfh for 6213 * each component. Skip "." components. 6214 */ 6215 for (s = nm; *s != '\0'; s = p) { 6216 /* 6217 * Set up a pathname struct for each component if needed 6218 */ 6219 while (*s == '/') 6220 s++; 6221 if (*s == '\0') 6222 break; 6223 6224 for (p = s; (*p != '/') && (*p != '\0'); p++) 6225 ; 6226 c = *p; 6227 *p = '\0'; 6228 6229 if (s[0] == '.' && s[1] == '\0') { 6230 *p = c; 6231 continue; 6232 } 6233 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6234 strcmp(s, XATTR_RPATH) == 0) { 6235 /* getfh XXX may not be needed in future */ 6236 argop->argop = OP_GETFH; 6237 argop++; 6238 argcnt++; 6239 6240 /* getattr */ 6241 argop->argop = OP_GETATTR; 6242 argop->nfs_argop4_u.opgetattr.attr_request = 6243 lookupargp->ga_bits; 6244 argop->nfs_argop4_u.opgetattr.mi = 6245 lookupargp->mi; 6246 argop++; 6247 argcnt++; 6248 6249 /* openattr */ 6250 argop->argop = OP_OPENATTR; 6251 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6252 strcmp(s, XATTR_RPATH) == 0) { 6253 /* openattr */ 6254 argop->argop = OP_OPENATTR; 6255 argop++; 6256 argcnt++; 6257 6258 /* getfh XXX may not be needed in future */ 6259 argop->argop = OP_GETFH; 6260 argop++; 6261 argcnt++; 6262 6263 /* getattr */ 6264 argop->argop = OP_GETATTR; 6265 argop->nfs_argop4_u.opgetattr.attr_request = 6266 lookupargp->ga_bits; 6267 argop->nfs_argop4_u.opgetattr.mi = 6268 lookupargp->mi; 6269 argop++; 6270 argcnt++; 6271 *p = c; 6272 continue; 6273 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6274 /* lookupp */ 6275 argop->argop = OP_LOOKUPP; 6276 } else { 6277 /* lookup */ 6278 argop->argop = OP_LOOKUP; 6279 (void) str_to_utf8(s, 6280 &argop->nfs_argop4_u.oplookup.objname); 6281 } 6282 lookup_idx = argcnt; 6283 argop++; 6284 argcnt++; 6285 6286 *p = c; 6287 6288 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6289 /* getfh XXX may not be needed in future */ 6290 argop->argop = OP_GETFH; 6291 argop++; 6292 argcnt++; 6293 6294 /* getattr */ 6295 argop->argop = OP_GETATTR; 6296 argop->nfs_argop4_u.opgetattr.attr_request = 6297 lookupargp->ga_bits; 6298 argop->nfs_argop4_u.opgetattr.mi = 6299 lookupargp->mi; 6300 argop++; 6301 argcnt++; 6302 } 6303 } 6304 6305 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6306 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6307 if (needgetfh) { 6308 /* stick in a post-lookup getfh */ 6309 argop->argop = OP_GETFH; 6310 argcnt++; 6311 argop++; 6312 } 6313 /* post-lookup getattr */ 6314 argop->argop = OP_GETATTR; 6315 argop->nfs_argop4_u.opgetattr.attr_request = 6316 lookupargp->ga_bits; 6317 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6318 argcnt++; 6319 } 6320 argcnt += lookupargp->trailer_len; /* actual op count */ 6321 lookupargp->argsp->array_len = argcnt; 6322 lookupargp->arglen = arglen; 6323 6324 #ifdef DEBUG 6325 if (nfs4_client_lookup_debug) 6326 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6327 #endif 6328 6329 return (lookup_idx); 6330 } 6331 6332 static int 6333 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6334 { 6335 COMPOUND4args_clnt args; 6336 COMPOUND4res_clnt res; 6337 GETFH4res *gf_res = NULL; 6338 nfs_argop4 argop[4]; 6339 nfs_resop4 *resop = NULL; 6340 nfs4_sharedfh_t *sfhp; 6341 hrtime_t t; 6342 nfs4_error_t e; 6343 6344 rnode4_t *drp; 6345 int doqueue = 1; 6346 vnode_t *vp; 6347 int needrecov = 0; 6348 nfs4_recov_state_t recov_state; 6349 6350 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6351 6352 *avp = NULL; 6353 
recov_state.rs_flags = 0; 6354 recov_state.rs_num_retry_despite_err = 0; 6355 6356 recov_retry: 6357 /* COMPOUND: putfh, openattr, getfh, getattr */ 6358 args.array_len = 4; 6359 args.array = argop; 6360 args.ctag = TAG_OPENATTR; 6361 6362 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6363 if (e.error) 6364 return (e.error); 6365 6366 drp = VTOR4(dvp); 6367 6368 /* putfh */ 6369 argop[0].argop = OP_CPUTFH; 6370 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6371 6372 /* openattr */ 6373 argop[1].argop = OP_OPENATTR; 6374 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6375 6376 /* getfh */ 6377 argop[2].argop = OP_GETFH; 6378 6379 /* getattr */ 6380 argop[3].argop = OP_GETATTR; 6381 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6382 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6383 6384 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6385 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6386 rnode4info(drp))); 6387 6388 t = gethrtime(); 6389 6390 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6391 6392 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6393 if (needrecov) { 6394 bool_t abort; 6395 6396 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6397 "nfs4openattr: initiating recovery\n")); 6398 6399 abort = nfs4_start_recovery(&e, 6400 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6401 OP_OPENATTR, NULL, NULL, NULL); 6402 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6403 if (!e.error) { 6404 e.error = geterrno4(res.status); 6405 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6406 } 6407 if (abort == FALSE) 6408 goto recov_retry; 6409 return (e.error); 6410 } 6411 6412 if (e.error) { 6413 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6414 return (e.error); 6415 } 6416 6417 if (res.status) { 6418 /* 6419 * If OTW errro is NOTSUPP, then it should be 6420 * translated to EINVAL. 
All Solaris file system 6421 * implementations return EINVAL to the syscall layer 6422 * when the attrdir cannot be created due to an 6423 * implementation restriction or noxattr mount option. 6424 */ 6425 if (res.status == NFS4ERR_NOTSUPP) { 6426 mutex_enter(&drp->r_statelock); 6427 if (drp->r_xattr_dir) 6428 VN_RELE(drp->r_xattr_dir); 6429 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6430 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6431 mutex_exit(&drp->r_statelock); 6432 6433 e.error = EINVAL; 6434 } else { 6435 e.error = geterrno4(res.status); 6436 } 6437 6438 if (e.error) { 6439 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6440 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6441 needrecov); 6442 return (e.error); 6443 } 6444 } 6445 6446 resop = &res.array[0]; /* putfh res */ 6447 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6448 6449 resop = &res.array[1]; /* openattr res */ 6450 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6451 6452 resop = &res.array[2]; /* getfh res */ 6453 gf_res = &resop->nfs_resop4_u.opgetfh; 6454 if (gf_res->object.nfs_fh4_len == 0) { 6455 *avp = NULL; 6456 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6457 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6458 return (ENOENT); 6459 } 6460 6461 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6462 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6463 dvp->v_vfsp, t, cr, dvp, 6464 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6465 sfh4_rele(&sfhp); 6466 6467 if (e.error) 6468 PURGE_ATTRCACHE4(vp); 6469 6470 mutex_enter(&vp->v_lock); 6471 vp->v_flag |= V_XATTRDIR; 6472 mutex_exit(&vp->v_lock); 6473 6474 *avp = vp; 6475 6476 mutex_enter(&drp->r_statelock); 6477 if (drp->r_xattr_dir) 6478 VN_RELE(drp->r_xattr_dir); 6479 VN_HOLD(vp); 6480 drp->r_xattr_dir = vp; 6481 6482 /* 6483 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6484 * NULL. 
xattrs could be created at any time, and we have no 6485 * way to update pc4_xattr_exists in the base object if/when 6486 * it happens. 6487 */ 6488 drp->r_pathconf.pc4_xattr_valid = 0; 6489 6490 mutex_exit(&drp->r_statelock); 6491 6492 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6493 6494 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6495 6496 return (0); 6497 } 6498 6499 /* ARGSUSED */ 6500 static int 6501 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6502 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6503 vsecattr_t *vsecp) 6504 { 6505 int error; 6506 vnode_t *vp = NULL; 6507 rnode4_t *rp; 6508 struct vattr vattr; 6509 rnode4_t *drp; 6510 vnode_t *tempvp; 6511 enum createmode4 createmode; 6512 bool_t must_trunc = FALSE; 6513 int truncating = 0; 6514 6515 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6516 return (EPERM); 6517 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6518 return (EINVAL); 6519 } 6520 6521 /* . and .. have special meaning in the protocol, reject them. */ 6522 6523 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6524 return (EISDIR); 6525 6526 drp = VTOR4(dvp); 6527 6528 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6529 return (EINTR); 6530 6531 top: 6532 /* 6533 * We make a copy of the attributes because the caller does not 6534 * expect us to change what va points to. 6535 */ 6536 vattr = *va; 6537 6538 /* 6539 * If the pathname is "", then dvp is the root vnode of 6540 * a remote file mounted over a local directory. 6541 * All that needs to be done is access 6542 * checking and truncation. Note that we avoid doing 6543 * open w/ create because the parent directory might 6544 * be in pseudo-fs and the open would fail. 
6545 */ 6546 if (*nm == '\0') { 6547 error = 0; 6548 VN_HOLD(dvp); 6549 vp = dvp; 6550 must_trunc = TRUE; 6551 } else { 6552 /* 6553 * We need to go over the wire, just to be sure whether the 6554 * file exists or not. Using the DNLC can be dangerous in 6555 * this case when making a decision regarding existence. 6556 */ 6557 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6558 } 6559 6560 if (exclusive) 6561 createmode = EXCLUSIVE4; 6562 else 6563 createmode = GUARDED4; 6564 6565 /* 6566 * error would be set if the file does not exist on the 6567 * server, so lets go create it. 6568 */ 6569 if (error) { 6570 goto create_otw; 6571 } 6572 6573 /* 6574 * File does exist on the server 6575 */ 6576 if (exclusive == EXCL) 6577 error = EEXIST; 6578 else if (vp->v_type == VDIR && (mode & VWRITE)) 6579 error = EISDIR; 6580 else { 6581 /* 6582 * If vnode is a device, create special vnode. 6583 */ 6584 if (ISVDEV(vp->v_type)) { 6585 tempvp = vp; 6586 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6587 VN_RELE(tempvp); 6588 } 6589 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6590 if ((vattr.va_mask & AT_SIZE) && 6591 vp->v_type == VREG) { 6592 rp = VTOR4(vp); 6593 /* 6594 * Check here for large file handled 6595 * by LF-unaware process (as 6596 * ufs_create() does) 6597 */ 6598 if (!(flags & FOFFMAX)) { 6599 mutex_enter(&rp->r_statelock); 6600 if (rp->r_size > MAXOFF32_T) 6601 error = EOVERFLOW; 6602 mutex_exit(&rp->r_statelock); 6603 } 6604 6605 /* if error is set then we need to return */ 6606 if (error) { 6607 nfs_rw_exit(&drp->r_rwlock); 6608 VN_RELE(vp); 6609 return (error); 6610 } 6611 6612 if (must_trunc) { 6613 vattr.va_mask = AT_SIZE; 6614 error = nfs4setattr(vp, &vattr, 0, cr, 6615 NULL); 6616 } else { 6617 /* 6618 * we know we have a regular file that already 6619 * exists and we may end up truncating the file 6620 * as a result of the open_otw, so flush out 6621 * any dirty pages for this file first. 
6622 */ 6623 if (nfs4_has_pages(vp) && 6624 ((rp->r_flags & R4DIRTY) || 6625 rp->r_count > 0 || 6626 rp->r_mapcnt > 0)) { 6627 error = nfs4_putpage(vp, 6628 (offset_t)0, 0, 0, cr, ct); 6629 if (error && (error == ENOSPC || 6630 error == EDQUOT)) { 6631 mutex_enter( 6632 &rp->r_statelock); 6633 if (!rp->r_error) 6634 rp->r_error = 6635 error; 6636 mutex_exit( 6637 &rp->r_statelock); 6638 } 6639 } 6640 vattr.va_mask = (AT_SIZE | 6641 AT_TYPE | AT_MODE); 6642 vattr.va_type = VREG; 6643 createmode = UNCHECKED4; 6644 truncating = 1; 6645 goto create_otw; 6646 } 6647 } 6648 } 6649 } 6650 nfs_rw_exit(&drp->r_rwlock); 6651 if (error) { 6652 VN_RELE(vp); 6653 } else { 6654 vnode_t *tvp; 6655 rnode4_t *trp; 6656 /* 6657 * existing file got truncated, notify. 6658 */ 6659 tvp = vp; 6660 if (vp->v_type == VREG) { 6661 trp = VTOR4(vp); 6662 if (IS_SHADOW(vp, trp)) 6663 tvp = RTOV4(trp); 6664 } 6665 vnevent_create(tvp, ct); 6666 *vpp = vp; 6667 } 6668 return (error); 6669 6670 create_otw: 6671 dnlc_remove(dvp, nm); 6672 6673 ASSERT(vattr.va_mask & AT_TYPE); 6674 6675 /* 6676 * If not a regular file let nfs4mknod() handle it. 6677 */ 6678 if (vattr.va_type != VREG) { 6679 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6680 nfs_rw_exit(&drp->r_rwlock); 6681 return (error); 6682 } 6683 6684 /* 6685 * It _is_ a regular file. 6686 */ 6687 ASSERT(vattr.va_mask & AT_MODE); 6688 if (MANDMODE(vattr.va_mode)) { 6689 nfs_rw_exit(&drp->r_rwlock); 6690 return (EACCES); 6691 } 6692 6693 /* 6694 * If this happens to be a mknod of a regular file, then flags will 6695 * have neither FREAD or FWRITE. However, we must set at least one 6696 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6697 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6698 * set (based on openmode specified by app). 
6699 */ 6700 if ((flags & (FREAD|FWRITE)) == 0) 6701 flags |= (FREAD|FWRITE); 6702 6703 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6704 6705 if (vp != NULL) { 6706 /* if create was successful, throw away the file's pages */ 6707 if (!error && (vattr.va_mask & AT_SIZE)) 6708 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6709 cr); 6710 /* release the lookup hold */ 6711 VN_RELE(vp); 6712 vp = NULL; 6713 } 6714 6715 /* 6716 * validate that we opened a regular file. This handles a misbehaving 6717 * server that returns an incorrect FH. 6718 */ 6719 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6720 error = EISDIR; 6721 VN_RELE(*vpp); 6722 } 6723 6724 /* 6725 * If this is not an exclusive create, then the CREATE 6726 * request will be made with the GUARDED mode set. This 6727 * means that the server will return EEXIST if the file 6728 * exists. The file could exist because of a retransmitted 6729 * request. In this case, we recover by starting over and 6730 * checking to see whether the file exists. This second 6731 * time through it should and a CREATE request will not be 6732 * sent. 6733 * 6734 * This handles the problem of a dangling CREATE request 6735 * which contains attributes which indicate that the file 6736 * should be truncated. This retransmitted request could 6737 * possibly truncate valid data in the file if not caught 6738 * by the duplicate request mechanism on the server or if 6739 * not caught by other means. The scenario is: 6740 * 6741 * Client transmits CREATE request with size = 0 6742 * Client times out, retransmits request. 6743 * Response to the first request arrives from the server 6744 * and the client proceeds on. 6745 * Client writes data to the file. 6746 * The server now processes retransmitted CREATE request 6747 * and truncates file. 
6748 * 6749 * The use of the GUARDED CREATE request prevents this from 6750 * happening because the retransmitted CREATE would fail 6751 * with EEXIST and would not truncate the file. 6752 */ 6753 if (error == EEXIST && exclusive == NONEXCL) { 6754 #ifdef DEBUG 6755 nfs4_create_misses++; 6756 #endif 6757 goto top; 6758 } 6759 nfs_rw_exit(&drp->r_rwlock); 6760 if (truncating && !error && *vpp) { 6761 vnode_t *tvp; 6762 rnode4_t *trp; 6763 /* 6764 * existing file got truncated, notify. 6765 */ 6766 tvp = *vpp; 6767 trp = VTOR4(tvp); 6768 if (IS_SHADOW(tvp, trp)) 6769 tvp = RTOV4(trp); 6770 vnevent_create(tvp, ct); 6771 } 6772 return (error); 6773 } 6774 6775 /* 6776 * Create compound (for mkdir, mknod, symlink): 6777 * { Putfh <dfh>; Create; Getfh; Getattr } 6778 * It's okay if setattr failed to set gid - this is not considered 6779 * an error, but purge attrs in that case. 6780 */ 6781 static int 6782 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6783 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6784 { 6785 int need_end_op = FALSE; 6786 COMPOUND4args_clnt args; 6787 COMPOUND4res_clnt res, *resp = NULL; 6788 nfs_argop4 *argop; 6789 nfs_resop4 *resop; 6790 int doqueue; 6791 mntinfo4_t *mi; 6792 rnode4_t *drp = VTOR4(dvp); 6793 change_info4 *cinfo; 6794 GETFH4res *gf_res; 6795 struct vattr vattr; 6796 vnode_t *vp; 6797 fattr4 *crattr; 6798 bool_t needrecov = FALSE; 6799 nfs4_recov_state_t recov_state; 6800 nfs4_sharedfh_t *sfhp = NULL; 6801 hrtime_t t; 6802 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6803 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6804 dirattr_info_t dinfo, *dinfop; 6805 servinfo4_t *svp; 6806 bitmap4 supp_attrs; 6807 6808 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6809 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6810 6811 mi = VTOMI4(dvp); 6812 6813 /* 6814 * Make sure we properly deal with setting the right gid 6815 * on a new directory to reflect the parent's setgid bit 
6816 */ 6817 setgid_flag = 0; 6818 if (type == NF4DIR) { 6819 struct vattr dva; 6820 6821 va->va_mode &= ~VSGID; 6822 dva.va_mask = AT_MODE | AT_GID; 6823 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6824 6825 /* 6826 * If the parent's directory has the setgid bit set 6827 * _and_ the client was able to get a valid mapping 6828 * for the parent dir's owner_group, we want to 6829 * append NVERIFY(owner_group == dva.va_gid) and 6830 * SETTATTR to the CREATE compound. 6831 */ 6832 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6833 setgid_flag = 1; 6834 va->va_mode |= VSGID; 6835 if (dva.va_gid != GID_NOBODY) { 6836 va->va_mask |= AT_GID; 6837 va->va_gid = dva.va_gid; 6838 } 6839 } 6840 } 6841 } 6842 6843 /* 6844 * Create ops: 6845 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6846 * 5:restorefh(dir) 6:getattr(dir) 6847 * 6848 * if (setgid) 6849 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6850 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6851 * 8:nverify 9:setattr 6852 */ 6853 if (setgid_flag) { 6854 numops = 10; 6855 idx_create = 1; 6856 idx_fattr = 3; 6857 } else { 6858 numops = 7; 6859 idx_create = 2; 6860 idx_fattr = 4; 6861 } 6862 6863 ASSERT(nfs_zone() == mi->mi_zone); 6864 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6865 return (EINTR); 6866 } 6867 recov_state.rs_flags = 0; 6868 recov_state.rs_num_retry_despite_err = 0; 6869 6870 argoplist_size = numops * sizeof (nfs_argop4); 6871 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6872 6873 recov_retry: 6874 if (type == NF4LNK) 6875 args.ctag = TAG_SYMLINK; 6876 else if (type == NF4DIR) 6877 args.ctag = TAG_MKDIR; 6878 else 6879 args.ctag = TAG_MKNOD; 6880 6881 args.array_len = numops; 6882 args.array = argop; 6883 6884 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6885 nfs_rw_exit(&drp->r_rwlock); 6886 kmem_free(argop, argoplist_size); 6887 return (e.error); 6888 } 6889 need_end_op = TRUE; 6890 6891 6892 /* 0: putfh directory */ 
6893 argop[0].argop = OP_CPUTFH; 6894 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6895 6896 /* 1/2: Create object */ 6897 argop[idx_create].argop = OP_CCREATE; 6898 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6899 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6900 if (type == NF4LNK) { 6901 /* 6902 * symlink, treat name as data 6903 */ 6904 ASSERT(data != NULL); 6905 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6906 (char *)data; 6907 } 6908 if (type == NF4BLK || type == NF4CHR) { 6909 ASSERT(data != NULL); 6910 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6911 *((specdata4 *)data); 6912 } 6913 6914 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6915 6916 svp = drp->r_server; 6917 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6918 supp_attrs = svp->sv_supp_attrs; 6919 nfs_rw_exit(&svp->sv_lock); 6920 6921 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6922 nfs_rw_exit(&drp->r_rwlock); 6923 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6924 e.error = EINVAL; 6925 kmem_free(argop, argoplist_size); 6926 return (e.error); 6927 } 6928 6929 /* 2/3: getfh fh of created object */ 6930 ASSERT(idx_create + 1 == idx_fattr - 1); 6931 argop[idx_create + 1].argop = OP_GETFH; 6932 6933 /* 3/4: getattr of new object */ 6934 argop[idx_fattr].argop = OP_GETATTR; 6935 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6936 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6937 6938 if (setgid_flag) { 6939 vattr_t _v; 6940 6941 argop[4].argop = OP_SAVEFH; 6942 6943 argop[5].argop = OP_CPUTFH; 6944 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6945 6946 argop[6].argop = OP_GETATTR; 6947 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6948 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6949 6950 argop[7].argop = OP_RESTOREFH; 6951 6952 /* 6953 * nverify 6954 * 6955 * XXX - Revisit the last argument to nfs4_end_op() 6956 * once 5020486 is fixed. 
6957 */ 6958 _v.va_mask = AT_GID; 6959 _v.va_gid = va->va_gid; 6960 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6961 supp_attrs)) { 6962 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6963 nfs_rw_exit(&drp->r_rwlock); 6964 nfs4_fattr4_free(crattr); 6965 kmem_free(argop, argoplist_size); 6966 return (e.error); 6967 } 6968 6969 /* 6970 * setattr 6971 * 6972 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6973 * so no need for stateid or flags. Also we specify NULL 6974 * rp since we're only interested in setting owner_group 6975 * attributes. 6976 */ 6977 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6978 &e.error, 0); 6979 6980 if (e.error) { 6981 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6982 nfs_rw_exit(&drp->r_rwlock); 6983 nfs4_fattr4_free(crattr); 6984 nfs4args_verify_free(&argop[8]); 6985 kmem_free(argop, argoplist_size); 6986 return (e.error); 6987 } 6988 } else { 6989 argop[1].argop = OP_SAVEFH; 6990 6991 argop[5].argop = OP_RESTOREFH; 6992 6993 argop[6].argop = OP_GETATTR; 6994 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6995 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6996 } 6997 6998 dnlc_remove(dvp, nm); 6999 7000 doqueue = 1; 7001 t = gethrtime(); 7002 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7003 7004 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7005 if (e.error) { 7006 PURGE_ATTRCACHE4(dvp); 7007 if (!needrecov) 7008 goto out; 7009 } 7010 7011 if (needrecov) { 7012 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 7013 OP_CREATE, NULL, NULL, NULL) == FALSE) { 7014 nfs4_end_op(mi, dvp, NULL, &recov_state, 7015 needrecov); 7016 need_end_op = FALSE; 7017 nfs4_fattr4_free(crattr); 7018 if (setgid_flag) { 7019 nfs4args_verify_free(&argop[8]); 7020 nfs4args_setattr_free(&argop[9]); 7021 } 7022 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7023 goto recov_retry; 7024 } 7025 } 7026 7027 resp = &res; 7028 7029 if (res.status != NFS4_OK && res.array_len <= idx_fattr 
+ 1) { 7030 7031 if (res.status == NFS4ERR_BADOWNER) 7032 nfs4_log_badowner(mi, OP_CREATE); 7033 7034 e.error = geterrno4(res.status); 7035 7036 /* 7037 * This check is left over from when create was implemented 7038 * using a setattr op (instead of createattrs). If the 7039 * putfh/create/getfh failed, the error was returned. If 7040 * setattr/getattr failed, we keep going. 7041 * 7042 * It might be better to get rid of the GETFH also, and just 7043 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 7044 * Then if any of the operations failed, we could return the 7045 * error now, and remove much of the error code below. 7046 */ 7047 if (res.array_len <= idx_fattr) { 7048 /* 7049 * Either Putfh, Create or Getfh failed. 7050 */ 7051 PURGE_ATTRCACHE4(dvp); 7052 /* 7053 * nfs4_purge_stale_fh() may generate otw calls through 7054 * nfs4_invalidate_pages. Hence the need to call 7055 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7056 */ 7057 nfs4_end_op(mi, dvp, NULL, &recov_state, 7058 needrecov); 7059 need_end_op = FALSE; 7060 nfs4_purge_stale_fh(e.error, dvp, cr); 7061 goto out; 7062 } 7063 } 7064 7065 resop = &res.array[idx_create]; /* create res */ 7066 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7067 7068 resop = &res.array[idx_create + 1]; /* getfh res */ 7069 gf_res = &resop->nfs_resop4_u.opgetfh; 7070 7071 sfhp = sfh4_get(&gf_res->object, mi); 7072 if (e.error) { 7073 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7074 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7075 if (vp->v_type == VNON) { 7076 vattr.va_mask = AT_TYPE; 7077 /* 7078 * Need to call nfs4_end_op before nfs4getattr to avoid 7079 * potential nfs4_start_op deadlock. See RFE 4777612. 
7080 */ 7081 nfs4_end_op(mi, dvp, NULL, &recov_state, 7082 needrecov); 7083 need_end_op = FALSE; 7084 e.error = nfs4getattr(vp, &vattr, cr); 7085 if (e.error) { 7086 VN_RELE(vp); 7087 *vpp = NULL; 7088 goto out; 7089 } 7090 vp->v_type = vattr.va_type; 7091 } 7092 e.error = 0; 7093 } else { 7094 *vpp = vp = makenfs4node(sfhp, 7095 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7096 dvp->v_vfsp, t, cr, 7097 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7098 } 7099 7100 /* 7101 * If compound succeeded, then update dir attrs 7102 */ 7103 if (res.status == NFS4_OK) { 7104 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7105 dinfo.di_cred = cr; 7106 dinfo.di_time_call = t; 7107 dinfop = &dinfo; 7108 } else 7109 dinfop = NULL; 7110 7111 /* Update directory cache attribute, readdir and dnlc caches */ 7112 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7113 7114 out: 7115 if (sfhp != NULL) 7116 sfh4_rele(&sfhp); 7117 nfs_rw_exit(&drp->r_rwlock); 7118 nfs4_fattr4_free(crattr); 7119 if (setgid_flag) { 7120 nfs4args_verify_free(&argop[8]); 7121 nfs4args_setattr_free(&argop[9]); 7122 } 7123 if (resp) 7124 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7125 if (need_end_op) 7126 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7127 7128 kmem_free(argop, argoplist_size); 7129 return (e.error); 7130 } 7131 7132 /* ARGSUSED */ 7133 static int 7134 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7135 int mode, vnode_t **vpp, cred_t *cr) 7136 { 7137 int error; 7138 vnode_t *vp; 7139 nfs_ftype4 type; 7140 specdata4 spec, *specp = NULL; 7141 7142 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7143 7144 switch (va->va_type) { 7145 case VCHR: 7146 case VBLK: 7147 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 7148 spec.specdata1 = getmajor(va->va_rdev); 7149 spec.specdata2 = getminor(va->va_rdev); 7150 specp = &spec; 7151 break; 7152 7153 case VFIFO: 7154 type = NF4FIFO; 7155 break; 7156 case VSOCK: 7157 type = NF4SOCK; 7158 break; 7159 7160 default: 7161 return (EINVAL); 7162 } 7163 7164 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7165 if (error) { 7166 return (error); 7167 } 7168 7169 /* 7170 * This might not be needed any more; special case to deal 7171 * with problematic v2/v3 servers. Since create was unable 7172 * to set group correctly, not sure what hope setattr has. 7173 */ 7174 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7175 va->va_mask = AT_GID; 7176 (void) nfs4setattr(vp, va, 0, cr, NULL); 7177 } 7178 7179 /* 7180 * If vnode is a device create special vnode 7181 */ 7182 if (ISVDEV(vp->v_type)) { 7183 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7184 VN_RELE(vp); 7185 } else { 7186 *vpp = vp; 7187 } 7188 return (error); 7189 } 7190 7191 /* 7192 * Remove requires that the current fh be the target directory. 7193 * After the operation, the current fh is unchanged. 7194 * The compound op structure is: 7195 * PUTFH(targetdir), REMOVE 7196 * 7197 * Weirdness: if the vnode to be removed is open 7198 * we rename it instead of removing it and nfs_inactive 7199 * will remove the new name. 
7200 */ 7201 /* ARGSUSED */ 7202 static int 7203 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7204 { 7205 COMPOUND4args_clnt args; 7206 COMPOUND4res_clnt res, *resp = NULL; 7207 REMOVE4res *rm_res; 7208 nfs_argop4 argop[3]; 7209 nfs_resop4 *resop; 7210 vnode_t *vp; 7211 char *tmpname; 7212 int doqueue; 7213 mntinfo4_t *mi; 7214 rnode4_t *rp; 7215 rnode4_t *drp; 7216 int needrecov = 0; 7217 nfs4_recov_state_t recov_state; 7218 int isopen; 7219 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7220 dirattr_info_t dinfo; 7221 7222 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7223 return (EPERM); 7224 drp = VTOR4(dvp); 7225 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7226 return (EINTR); 7227 7228 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7229 if (e.error) { 7230 nfs_rw_exit(&drp->r_rwlock); 7231 return (e.error); 7232 } 7233 7234 if (vp->v_type == VDIR) { 7235 VN_RELE(vp); 7236 nfs_rw_exit(&drp->r_rwlock); 7237 return (EISDIR); 7238 } 7239 7240 /* 7241 * First just remove the entry from the name cache, as it 7242 * is most likely the only entry for this vp. 7243 */ 7244 dnlc_remove(dvp, nm); 7245 7246 rp = VTOR4(vp); 7247 7248 /* 7249 * For regular file types, check to see if the file is open by looking 7250 * at the open streams. 7251 * For all other types, check the reference count on the vnode. Since 7252 * they are not opened OTW they never have an open stream. 7253 * 7254 * If the file is open, rename it to .nfsXXXX. 7255 */ 7256 if (vp->v_type != VREG) { 7257 /* 7258 * If the file has a v_count > 1 then there may be more than one 7259 * entry in the name cache due multiple links or an open file, 7260 * but we don't have the real reference count so flush all 7261 * possible entries. 7262 */ 7263 if (vp->v_count > 1) 7264 dnlc_purge_vp(vp); 7265 7266 /* 7267 * Now we have the real reference count. 
7268 */ 7269 isopen = vp->v_count > 1; 7270 } else { 7271 mutex_enter(&rp->r_os_lock); 7272 isopen = list_head(&rp->r_open_streams) != NULL; 7273 mutex_exit(&rp->r_os_lock); 7274 } 7275 7276 mutex_enter(&rp->r_statelock); 7277 if (isopen && 7278 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7279 mutex_exit(&rp->r_statelock); 7280 tmpname = newname(); 7281 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7282 if (e.error) 7283 kmem_free(tmpname, MAXNAMELEN); 7284 else { 7285 mutex_enter(&rp->r_statelock); 7286 if (rp->r_unldvp == NULL) { 7287 VN_HOLD(dvp); 7288 rp->r_unldvp = dvp; 7289 if (rp->r_unlcred != NULL) 7290 crfree(rp->r_unlcred); 7291 crhold(cr); 7292 rp->r_unlcred = cr; 7293 rp->r_unlname = tmpname; 7294 } else { 7295 kmem_free(rp->r_unlname, MAXNAMELEN); 7296 rp->r_unlname = tmpname; 7297 } 7298 mutex_exit(&rp->r_statelock); 7299 } 7300 VN_RELE(vp); 7301 nfs_rw_exit(&drp->r_rwlock); 7302 return (e.error); 7303 } 7304 /* 7305 * Actually remove the file/dir 7306 */ 7307 mutex_exit(&rp->r_statelock); 7308 7309 /* 7310 * We need to flush any dirty pages which happen to 7311 * be hanging around before removing the file. 7312 * This shouldn't happen very often since in NFSv4 7313 * we should be close to open consistent. 
7314 */ 7315 if (nfs4_has_pages(vp) && 7316 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7317 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7318 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7319 mutex_enter(&rp->r_statelock); 7320 if (!rp->r_error) 7321 rp->r_error = e.error; 7322 mutex_exit(&rp->r_statelock); 7323 } 7324 } 7325 7326 mi = VTOMI4(dvp); 7327 7328 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7329 recov_state.rs_flags = 0; 7330 recov_state.rs_num_retry_despite_err = 0; 7331 7332 recov_retry: 7333 /* 7334 * Remove ops: putfh dir; remove 7335 */ 7336 args.ctag = TAG_REMOVE; 7337 args.array_len = 3; 7338 args.array = argop; 7339 7340 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7341 if (e.error) { 7342 nfs_rw_exit(&drp->r_rwlock); 7343 VN_RELE(vp); 7344 return (e.error); 7345 } 7346 7347 /* putfh directory */ 7348 argop[0].argop = OP_CPUTFH; 7349 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7350 7351 /* remove */ 7352 argop[1].argop = OP_CREMOVE; 7353 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7354 7355 /* getattr dir */ 7356 argop[2].argop = OP_GETATTR; 7357 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7358 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7359 7360 doqueue = 1; 7361 dinfo.di_time_call = gethrtime(); 7362 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7363 7364 PURGE_ATTRCACHE4(vp); 7365 7366 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7367 if (e.error) 7368 PURGE_ATTRCACHE4(dvp); 7369 7370 if (needrecov) { 7371 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7372 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 7373 if (!e.error) 7374 (void) xdr_free(xdr_COMPOUND4res_clnt, 7375 (caddr_t)&res); 7376 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7377 needrecov); 7378 goto recov_retry; 7379 } 7380 } 7381 7382 /* 7383 * Matching nfs4_end_op() for start_op() above. 
7384 * There is a path in the code below which calls 7385 * nfs4_purge_stale_fh(), which may generate otw calls through 7386 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7387 * here to avoid nfs4_start_op() deadlock. 7388 */ 7389 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7390 7391 if (!e.error) { 7392 resp = &res; 7393 7394 if (res.status) { 7395 e.error = geterrno4(res.status); 7396 PURGE_ATTRCACHE4(dvp); 7397 nfs4_purge_stale_fh(e.error, dvp, cr); 7398 } else { 7399 resop = &res.array[1]; /* remove res */ 7400 rm_res = &resop->nfs_resop4_u.opremove; 7401 7402 dinfo.di_garp = 7403 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7404 dinfo.di_cred = cr; 7405 7406 /* Update directory attr, readdir and dnlc caches */ 7407 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7408 &dinfo); 7409 } 7410 } 7411 nfs_rw_exit(&drp->r_rwlock); 7412 if (resp) 7413 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7414 7415 if (e.error == 0) { 7416 vnode_t *tvp; 7417 rnode4_t *trp; 7418 trp = VTOR4(vp); 7419 tvp = vp; 7420 if (IS_SHADOW(vp, trp)) 7421 tvp = RTOV4(trp); 7422 vnevent_remove(tvp, dvp, nm, ct); 7423 } 7424 VN_RELE(vp); 7425 return (e.error); 7426 } 7427 7428 /* 7429 * Link requires that the current fh be the target directory and the 7430 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7431 * Thus the compound op structure is: 7432 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7433 * GETATTR(file) 7434 */ 7435 /* ARGSUSED */ 7436 static int 7437 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7438 caller_context_t *ct, int flags) 7439 { 7440 COMPOUND4args_clnt args; 7441 COMPOUND4res_clnt res, *resp = NULL; 7442 LINK4res *ln_res; 7443 int argoplist_size = 7 * sizeof (nfs_argop4); 7444 nfs_argop4 *argop; 7445 nfs_resop4 *resop; 7446 vnode_t *realvp, *nvp; 7447 int doqueue; 7448 mntinfo4_t *mi; 7449 rnode4_t *tdrp; 7450 bool_t needrecov = FALSE; 7451 nfs4_recov_state_t recov_state; 7452 hrtime_t t; 7453 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7454 dirattr_info_t dinfo; 7455 7456 ASSERT(*tnm != '\0'); 7457 ASSERT(tdvp->v_type == VDIR); 7458 ASSERT(nfs4_consistent_type(tdvp)); 7459 ASSERT(nfs4_consistent_type(svp)); 7460 7461 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7462 return (EPERM); 7463 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7464 svp = realvp; 7465 ASSERT(nfs4_consistent_type(svp)); 7466 } 7467 7468 tdrp = VTOR4(tdvp); 7469 mi = VTOMI4(svp); 7470 7471 if (!(mi->mi_flags & MI4_LINK)) { 7472 return (EOPNOTSUPP); 7473 } 7474 recov_state.rs_flags = 0; 7475 recov_state.rs_num_retry_despite_err = 0; 7476 7477 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7478 return (EINTR); 7479 7480 recov_retry: 7481 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7482 7483 args.ctag = TAG_LINK; 7484 7485 /* 7486 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7487 * restorefh; getattr(fl) 7488 */ 7489 args.array_len = 7; 7490 args.array = argop; 7491 7492 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7493 if (e.error) { 7494 kmem_free(argop, argoplist_size); 7495 nfs_rw_exit(&tdrp->r_rwlock); 7496 return (e.error); 7497 } 7498 7499 /* 0. putfh file */ 7500 argop[0].argop = OP_CPUTFH; 7501 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7502 7503 /* 1. 
save current fh to free up the space for the dir */ 7504 argop[1].argop = OP_SAVEFH; 7505 7506 /* 2. putfh targetdir */ 7507 argop[2].argop = OP_CPUTFH; 7508 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7509 7510 /* 3. link: current_fh is targetdir, saved_fh is source */ 7511 argop[3].argop = OP_CLINK; 7512 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7513 7514 /* 4. Get attributes of dir */ 7515 argop[4].argop = OP_GETATTR; 7516 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7517 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7518 7519 /* 5. If link was successful, restore current vp to file */ 7520 argop[5].argop = OP_RESTOREFH; 7521 7522 /* 6. Get attributes of linked object */ 7523 argop[6].argop = OP_GETATTR; 7524 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7525 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7526 7527 dnlc_remove(tdvp, tnm); 7528 7529 doqueue = 1; 7530 t = gethrtime(); 7531 7532 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7533 7534 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7535 if (e.error != 0 && !needrecov) { 7536 PURGE_ATTRCACHE4(tdvp); 7537 PURGE_ATTRCACHE4(svp); 7538 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7539 goto out; 7540 } 7541 7542 if (needrecov) { 7543 bool_t abort; 7544 7545 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7546 NULL, NULL, OP_LINK, NULL, NULL, NULL); 7547 if (abort == FALSE) { 7548 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7549 needrecov); 7550 kmem_free(argop, argoplist_size); 7551 if (!e.error) 7552 (void) xdr_free(xdr_COMPOUND4res_clnt, 7553 (caddr_t)&res); 7554 goto recov_retry; 7555 } else { 7556 if (e.error != 0) { 7557 PURGE_ATTRCACHE4(tdvp); 7558 PURGE_ATTRCACHE4(svp); 7559 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7560 &recov_state, needrecov); 7561 goto out; 7562 } 7563 /* fall through for res.status case */ 7564 } 7565 } 7566 7567 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7568 7569 resp = &res; 
7570 if (res.status) { 7571 /* If link succeeded, then don't return error */ 7572 e.error = geterrno4(res.status); 7573 if (res.array_len <= 4) { 7574 /* 7575 * Either Putfh, Savefh, Putfh dir, or Link failed 7576 */ 7577 PURGE_ATTRCACHE4(svp); 7578 PURGE_ATTRCACHE4(tdvp); 7579 if (e.error == EOPNOTSUPP) { 7580 mutex_enter(&mi->mi_lock); 7581 mi->mi_flags &= ~MI4_LINK; 7582 mutex_exit(&mi->mi_lock); 7583 } 7584 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7585 /* XXX-LP */ 7586 if (e.error == EISDIR && crgetuid(cr) != 0) 7587 e.error = EPERM; 7588 goto out; 7589 } 7590 } 7591 7592 /* either no error or one of the postop getattr failed */ 7593 7594 /* 7595 * XXX - if LINK succeeded, but no attrs were returned for link 7596 * file, purge its cache. 7597 * 7598 * XXX Perform a simplified version of wcc checking. Instead of 7599 * have another getattr to get pre-op, just purge cache if 7600 * any of the ops prior to and including the getattr failed. 7601 * If the getattr succeeded then update the attrcache accordingly. 7602 */ 7603 7604 /* 7605 * update cache with link file postattrs. 7606 * Note: at this point resop points to link res. 7607 */ 7608 resop = &res.array[3]; /* link res */ 7609 ln_res = &resop->nfs_resop4_u.oplink; 7610 if (res.status == NFS4_OK) 7611 e.error = nfs4_update_attrcache(res.status, 7612 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7613 t, svp, cr); 7614 7615 /* 7616 * Call makenfs4node to create the new shadow vp for tnm. 7617 * We pass NULL attrs because we just cached attrs for 7618 * the src object. All we're trying to accomplish is to 7619 * to create the new shadow vnode. 
7620 */ 7621 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7622 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7623 7624 /* Update target cache attribute, readdir and dnlc caches */ 7625 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7626 dinfo.di_time_call = t; 7627 dinfo.di_cred = cr; 7628 7629 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7630 ASSERT(nfs4_consistent_type(tdvp)); 7631 ASSERT(nfs4_consistent_type(svp)); 7632 ASSERT(nfs4_consistent_type(nvp)); 7633 VN_RELE(nvp); 7634 7635 if (!e.error) { 7636 vnode_t *tvp; 7637 rnode4_t *trp; 7638 /* 7639 * Notify the source file of this link operation. 7640 */ 7641 trp = VTOR4(svp); 7642 tvp = svp; 7643 if (IS_SHADOW(svp, trp)) 7644 tvp = RTOV4(trp); 7645 vnevent_link(tvp, ct); 7646 } 7647 out: 7648 kmem_free(argop, argoplist_size); 7649 if (resp) 7650 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7651 7652 nfs_rw_exit(&tdrp->r_rwlock); 7653 7654 return (e.error); 7655 } 7656 7657 /* ARGSUSED */ 7658 static int 7659 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7660 caller_context_t *ct, int flags) 7661 { 7662 vnode_t *realvp; 7663 7664 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7665 return (EPERM); 7666 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7667 ndvp = realvp; 7668 7669 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7670 } 7671 7672 /* 7673 * nfs4rename does the real work of renaming in NFS Version 4. 7674 * 7675 * A file handle is considered volatile for renaming purposes if either 7676 * of the volatile bits are turned on. However, the compound may differ 7677 * based on the likelihood of the filehandle to change during rename. 
7678 */ 7679 static int 7680 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7681 caller_context_t *ct) 7682 { 7683 int error; 7684 mntinfo4_t *mi; 7685 vnode_t *nvp = NULL; 7686 vnode_t *ovp = NULL; 7687 char *tmpname = NULL; 7688 rnode4_t *rp; 7689 rnode4_t *odrp; 7690 rnode4_t *ndrp; 7691 int did_link = 0; 7692 int do_link = 1; 7693 nfsstat4 stat = NFS4_OK; 7694 7695 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7696 ASSERT(nfs4_consistent_type(odvp)); 7697 ASSERT(nfs4_consistent_type(ndvp)); 7698 7699 if (onm[0] == '.' && (onm[1] == '\0' || 7700 (onm[1] == '.' && onm[2] == '\0'))) 7701 return (EINVAL); 7702 7703 if (nnm[0] == '.' && (nnm[1] == '\0' || 7704 (nnm[1] == '.' && nnm[2] == '\0'))) 7705 return (EINVAL); 7706 7707 odrp = VTOR4(odvp); 7708 ndrp = VTOR4(ndvp); 7709 if ((intptr_t)odrp < (intptr_t)ndrp) { 7710 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7711 return (EINTR); 7712 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7713 nfs_rw_exit(&odrp->r_rwlock); 7714 return (EINTR); 7715 } 7716 } else { 7717 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7718 return (EINTR); 7719 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7720 nfs_rw_exit(&ndrp->r_rwlock); 7721 return (EINTR); 7722 } 7723 } 7724 7725 /* 7726 * Lookup the target file. If it exists, it needs to be 7727 * checked to see whether it is a mount point and whether 7728 * it is active (open). 7729 */ 7730 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7731 if (!error) { 7732 int isactive; 7733 7734 ASSERT(nfs4_consistent_type(nvp)); 7735 /* 7736 * If this file has been mounted on, then just 7737 * return busy because renaming to it would remove 7738 * the mounted file system from the name space. 
7739 */ 7740 if (vn_ismntpt(nvp)) { 7741 VN_RELE(nvp); 7742 nfs_rw_exit(&odrp->r_rwlock); 7743 nfs_rw_exit(&ndrp->r_rwlock); 7744 return (EBUSY); 7745 } 7746 7747 /* 7748 * First just remove the entry from the name cache, as it 7749 * is most likely the only entry for this vp. 7750 */ 7751 dnlc_remove(ndvp, nnm); 7752 7753 rp = VTOR4(nvp); 7754 7755 if (nvp->v_type != VREG) { 7756 /* 7757 * Purge the name cache of all references to this vnode 7758 * so that we can check the reference count to infer 7759 * whether it is active or not. 7760 */ 7761 if (nvp->v_count > 1) 7762 dnlc_purge_vp(nvp); 7763 7764 isactive = nvp->v_count > 1; 7765 } else { 7766 mutex_enter(&rp->r_os_lock); 7767 isactive = list_head(&rp->r_open_streams) != NULL; 7768 mutex_exit(&rp->r_os_lock); 7769 } 7770 7771 /* 7772 * If the vnode is active and is not a directory, 7773 * arrange to rename it to a 7774 * temporary file so that it will continue to be 7775 * accessible. This implements the "unlink-open-file" 7776 * semantics for the target of a rename operation. 7777 * Before doing this though, make sure that the 7778 * source and target files are not already the same. 7779 */ 7780 if (isactive && nvp->v_type != VDIR) { 7781 /* 7782 * Lookup the source name. 7783 */ 7784 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7785 7786 /* 7787 * The source name *should* already exist. 7788 */ 7789 if (error) { 7790 VN_RELE(nvp); 7791 nfs_rw_exit(&odrp->r_rwlock); 7792 nfs_rw_exit(&ndrp->r_rwlock); 7793 return (error); 7794 } 7795 7796 ASSERT(nfs4_consistent_type(ovp)); 7797 7798 /* 7799 * Compare the two vnodes. If they are the same, 7800 * just release all held vnodes and return success. 7801 */ 7802 if (VN_CMP(ovp, nvp)) { 7803 VN_RELE(ovp); 7804 VN_RELE(nvp); 7805 nfs_rw_exit(&odrp->r_rwlock); 7806 nfs_rw_exit(&ndrp->r_rwlock); 7807 return (0); 7808 } 7809 7810 /* 7811 * Can't mix and match directories and non- 7812 * directories in rename operations. 
We already 7813 * know that the target is not a directory. If 7814 * the source is a directory, return an error. 7815 */ 7816 if (ovp->v_type == VDIR) { 7817 VN_RELE(ovp); 7818 VN_RELE(nvp); 7819 nfs_rw_exit(&odrp->r_rwlock); 7820 nfs_rw_exit(&ndrp->r_rwlock); 7821 return (ENOTDIR); 7822 } 7823 link_call: 7824 /* 7825 * The target file exists, is not the same as 7826 * the source file, and is active. We first 7827 * try to Link it to a temporary filename to 7828 * avoid having the server removing the file 7829 * completely (which could cause data loss to 7830 * the user's POV in the event the Rename fails 7831 * -- see bug 1165874). 7832 */ 7833 /* 7834 * The do_link and did_link booleans are 7835 * introduced in the event we get NFS4ERR_FILE_OPEN 7836 * returned for the Rename. Some servers can 7837 * not Rename over an Open file, so they return 7838 * this error. The client needs to Remove the 7839 * newly created Link and do two Renames, just 7840 * as if the server didn't support LINK. 
7841 */ 7842 tmpname = newname(); 7843 error = 0; 7844 7845 if (do_link) { 7846 error = nfs4_link(ndvp, nvp, tmpname, cr, 7847 NULL, 0); 7848 } 7849 if (error == EOPNOTSUPP || !do_link) { 7850 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7851 cr, NULL, 0); 7852 did_link = 0; 7853 } else { 7854 did_link = 1; 7855 } 7856 if (error) { 7857 kmem_free(tmpname, MAXNAMELEN); 7858 VN_RELE(ovp); 7859 VN_RELE(nvp); 7860 nfs_rw_exit(&odrp->r_rwlock); 7861 nfs_rw_exit(&ndrp->r_rwlock); 7862 return (error); 7863 } 7864 7865 mutex_enter(&rp->r_statelock); 7866 if (rp->r_unldvp == NULL) { 7867 VN_HOLD(ndvp); 7868 rp->r_unldvp = ndvp; 7869 if (rp->r_unlcred != NULL) 7870 crfree(rp->r_unlcred); 7871 crhold(cr); 7872 rp->r_unlcred = cr; 7873 rp->r_unlname = tmpname; 7874 } else { 7875 if (rp->r_unlname) 7876 kmem_free(rp->r_unlname, MAXNAMELEN); 7877 rp->r_unlname = tmpname; 7878 } 7879 mutex_exit(&rp->r_statelock); 7880 } 7881 7882 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7883 7884 ASSERT(nfs4_consistent_type(nvp)); 7885 } 7886 7887 if (ovp == NULL) { 7888 /* 7889 * When renaming directories to be a subdirectory of a 7890 * different parent, the dnlc entry for ".." will no 7891 * longer be valid, so it must be removed. 7892 * 7893 * We do a lookup here to determine whether we are renaming 7894 * a directory and we need to check if we are renaming 7895 * an unlinked file. This might have already been done 7896 * in previous code, so we check ovp == NULL to avoid 7897 * doing it twice. 7898 */ 7899 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7900 /* 7901 * The source name *should* already exist. 7902 */ 7903 if (error) { 7904 nfs_rw_exit(&odrp->r_rwlock); 7905 nfs_rw_exit(&ndrp->r_rwlock); 7906 if (nvp) { 7907 VN_RELE(nvp); 7908 } 7909 return (error); 7910 } 7911 ASSERT(ovp != NULL); 7912 ASSERT(nfs4_consistent_type(ovp)); 7913 } 7914 7915 /* 7916 * Is the object being renamed a dir, and if so, is 7917 * it being renamed to a child of itself? 
The underlying 7918 * fs should ultimately return EINVAL for this case; 7919 * however, buggy beta non-Solaris NFSv4 servers at 7920 * interop testing events have allowed this behavior, 7921 * and it caused our client to panic due to a recursive 7922 * mutex_enter in fn_move. 7923 * 7924 * The tedious locking in fn_move could be changed to 7925 * deal with this case, and the client could avoid the 7926 * panic; however, the client would just confuse itself 7927 * later and misbehave. A better way to handle the broken 7928 * server is to detect this condition and return EINVAL 7929 * without ever sending the the bogus rename to the server. 7930 * We know the rename is invalid -- just fail it now. 7931 */ 7932 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 7933 VN_RELE(ovp); 7934 nfs_rw_exit(&odrp->r_rwlock); 7935 nfs_rw_exit(&ndrp->r_rwlock); 7936 if (nvp) { 7937 VN_RELE(nvp); 7938 } 7939 return (EINVAL); 7940 } 7941 7942 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7943 7944 /* 7945 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 7946 * possible for the filehandle to change due to the rename. 7947 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 7948 * the fh will not change because of the rename, but we still need 7949 * to update its rnode entry with the new name for 7950 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN 7951 * has no effect on these for now, but for future improvements, 7952 * we might want to use it too to simplify handling of files 7953 * that are open with that flag on. 
(XXX) 7954 */ 7955 mi = VTOMI4(odvp); 7956 if (NFS4_VOLATILE_FH(mi)) 7957 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7958 &stat); 7959 else 7960 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7961 &stat); 7962 7963 ASSERT(nfs4_consistent_type(odvp)); 7964 ASSERT(nfs4_consistent_type(ndvp)); 7965 ASSERT(nfs4_consistent_type(ovp)); 7966 7967 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7968 do_link = 0; 7969 /* 7970 * Before the 'link_call' code, we did a nfs4_lookup 7971 * that puts a VN_HOLD on nvp. After the nfs4_link 7972 * call we call VN_RELE to match that hold. We need 7973 * to place an additional VN_HOLD here since we will 7974 * be hitting that VN_RELE again. 7975 */ 7976 VN_HOLD(nvp); 7977 7978 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0); 7979 7980 /* Undo the unlinked file naming stuff we just did */ 7981 mutex_enter(&rp->r_statelock); 7982 if (rp->r_unldvp) { 7983 VN_RELE(ndvp); 7984 rp->r_unldvp = NULL; 7985 if (rp->r_unlcred != NULL) 7986 crfree(rp->r_unlcred); 7987 rp->r_unlcred = NULL; 7988 /* rp->r_unlanme points to tmpname */ 7989 if (rp->r_unlname) 7990 kmem_free(rp->r_unlname, MAXNAMELEN); 7991 rp->r_unlname = NULL; 7992 } 7993 mutex_exit(&rp->r_statelock); 7994 7995 if (nvp) { 7996 VN_RELE(nvp); 7997 } 7998 goto link_call; 7999 } 8000 8001 if (error) { 8002 VN_RELE(ovp); 8003 nfs_rw_exit(&odrp->r_rwlock); 8004 nfs_rw_exit(&ndrp->r_rwlock); 8005 if (nvp) { 8006 VN_RELE(nvp); 8007 } 8008 return (error); 8009 } 8010 8011 /* 8012 * when renaming directories to be a subdirectory of a 8013 * different parent, the dnlc entry for ".." will no 8014 * longer be valid, so it must be removed 8015 */ 8016 rp = VTOR4(ovp); 8017 if (ndvp != odvp) { 8018 if (ovp->v_type == VDIR) { 8019 dnlc_remove(ovp, ".."); 8020 if (rp->r_dir != NULL) 8021 nfs4_purge_rddir_cache(ovp); 8022 } 8023 } 8024 8025 /* 8026 * If we are renaming the unlinked file, update the 8027 * r_unldvp and r_unlname as needed. 
8028 */ 8029 mutex_enter(&rp->r_statelock); 8030 if (rp->r_unldvp != NULL) { 8031 if (strcmp(rp->r_unlname, onm) == 0) { 8032 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 8033 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 8034 if (ndvp != rp->r_unldvp) { 8035 VN_RELE(rp->r_unldvp); 8036 rp->r_unldvp = ndvp; 8037 VN_HOLD(ndvp); 8038 } 8039 } 8040 } 8041 mutex_exit(&rp->r_statelock); 8042 8043 /* 8044 * Notify the rename vnevents to source vnode, and to the target 8045 * vnode if it already existed. 8046 */ 8047 if (error == 0) { 8048 vnode_t *tvp; 8049 rnode4_t *trp; 8050 /* 8051 * Notify the vnode. Each links is represented by 8052 * a different vnode, in nfsv4. 8053 */ 8054 if (nvp) { 8055 trp = VTOR4(nvp); 8056 tvp = nvp; 8057 if (IS_SHADOW(nvp, trp)) 8058 tvp = RTOV4(trp); 8059 vnevent_rename_dest(tvp, ndvp, nnm, ct); 8060 } 8061 8062 /* 8063 * if the source and destination directory are not the 8064 * same notify the destination directory. 8065 */ 8066 if (VTOR4(odvp) != VTOR4(ndvp)) { 8067 trp = VTOR4(ndvp); 8068 tvp = ndvp; 8069 if (IS_SHADOW(ndvp, trp)) 8070 tvp = RTOV4(trp); 8071 vnevent_rename_dest_dir(tvp, ct); 8072 } 8073 8074 trp = VTOR4(ovp); 8075 tvp = ovp; 8076 if (IS_SHADOW(ovp, trp)) 8077 tvp = RTOV4(trp); 8078 vnevent_rename_src(tvp, odvp, onm, ct); 8079 } 8080 8081 if (nvp) { 8082 VN_RELE(nvp); 8083 } 8084 VN_RELE(ovp); 8085 8086 nfs_rw_exit(&odrp->r_rwlock); 8087 nfs_rw_exit(&ndrp->r_rwlock); 8088 8089 return (error); 8090 } 8091 8092 /* 8093 * When the parent directory has changed, sv_dfh must be updated 8094 */ 8095 static void 8096 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp) 8097 { 8098 svnode_t *sv = VTOSV(vp); 8099 nfs4_sharedfh_t *old_dfh = sv->sv_dfh; 8100 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh; 8101 8102 sfh4_hold(new_dfh); 8103 sv->sv_dfh = new_dfh; 8104 sfh4_rele(&old_dfh); 8105 } 8106 8107 /* 8108 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4, 8109 * when it is known that the filehandle is persistent 
through rename. 8110 * 8111 * Rename requires that the current fh be the target directory and the 8112 * saved fh be the source directory. After the operation, the current fh 8113 * is unchanged. 8114 * The compound op structure for persistent fh rename is: 8115 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 8116 * Rather than bother with the directory postop args, we'll simply 8117 * update that a change occurred in the cache, so no post-op getattrs. 8118 */ 8119 static int 8120 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8121 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8122 { 8123 COMPOUND4args_clnt args; 8124 COMPOUND4res_clnt res, *resp = NULL; 8125 nfs_argop4 *argop; 8126 nfs_resop4 *resop; 8127 int doqueue, argoplist_size; 8128 mntinfo4_t *mi; 8129 rnode4_t *odrp = VTOR4(odvp); 8130 rnode4_t *ndrp = VTOR4(ndvp); 8131 RENAME4res *rn_res; 8132 bool_t needrecov; 8133 nfs4_recov_state_t recov_state; 8134 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8135 dirattr_info_t dinfo, *dinfop; 8136 8137 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8138 8139 recov_state.rs_flags = 0; 8140 recov_state.rs_num_retry_despite_err = 0; 8141 8142 /* 8143 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8144 * 8145 * If source/target are different dirs, then append putfh(src); getattr 8146 */ 8147 args.array_len = (odvp == ndvp) ? 
5 : 7; 8148 argoplist_size = args.array_len * sizeof (nfs_argop4); 8149 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 8150 8151 recov_retry: 8152 *statp = NFS4_OK; 8153 8154 /* No need to Lookup the file, persistent fh */ 8155 args.ctag = TAG_RENAME; 8156 8157 mi = VTOMI4(odvp); 8158 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 8159 if (e.error) { 8160 kmem_free(argop, argoplist_size); 8161 return (e.error); 8162 } 8163 8164 /* 0: putfh source directory */ 8165 argop[0].argop = OP_CPUTFH; 8166 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8167 8168 /* 1: Save source fh to free up current for target */ 8169 argop[1].argop = OP_SAVEFH; 8170 8171 /* 2: putfh targetdir */ 8172 argop[2].argop = OP_CPUTFH; 8173 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8174 8175 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 8176 argop[3].argop = OP_CRENAME; 8177 argop[3].nfs_argop4_u.opcrename.coldname = onm; 8178 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 8179 8180 /* 4: getattr (targetdir) */ 8181 argop[4].argop = OP_GETATTR; 8182 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8183 argop[4].nfs_argop4_u.opgetattr.mi = mi; 8184 8185 if (ndvp != odvp) { 8186 8187 /* 5: putfh (sourcedir) */ 8188 argop[5].argop = OP_CPUTFH; 8189 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8190 8191 /* 6: getattr (sourcedir) */ 8192 argop[6].argop = OP_GETATTR; 8193 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8194 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8195 } 8196 8197 dnlc_remove(odvp, onm); 8198 dnlc_remove(ndvp, nnm); 8199 8200 doqueue = 1; 8201 dinfo.di_time_call = gethrtime(); 8202 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8203 8204 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8205 if (e.error) { 8206 PURGE_ATTRCACHE4(odvp); 8207 PURGE_ATTRCACHE4(ndvp); 8208 } else { 8209 *statp = res.status; 8210 } 8211 8212 if (needrecov) { 8213 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8214 
OP_RENAME, NULL, NULL, NULL) == FALSE) { 8215 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8216 if (!e.error) 8217 (void) xdr_free(xdr_COMPOUND4res_clnt, 8218 (caddr_t)&res); 8219 goto recov_retry; 8220 } 8221 } 8222 8223 if (!e.error) { 8224 resp = &res; 8225 /* 8226 * as long as OP_RENAME 8227 */ 8228 if (res.status != NFS4_OK && res.array_len <= 4) { 8229 e.error = geterrno4(res.status); 8230 PURGE_ATTRCACHE4(odvp); 8231 PURGE_ATTRCACHE4(ndvp); 8232 /* 8233 * System V defines rename to return EEXIST, not 8234 * ENOTEMPTY if the target directory is not empty. 8235 * Over the wire, the error is NFSERR_ENOTEMPTY 8236 * which geterrno4 maps to ENOTEMPTY. 8237 */ 8238 if (e.error == ENOTEMPTY) 8239 e.error = EEXIST; 8240 } else { 8241 8242 resop = &res.array[3]; /* rename res */ 8243 rn_res = &resop->nfs_resop4_u.oprename; 8244 8245 if (res.status == NFS4_OK) { 8246 /* 8247 * Update target attribute, readdir and dnlc 8248 * caches. 8249 */ 8250 dinfo.di_garp = 8251 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8252 dinfo.di_cred = cr; 8253 dinfop = &dinfo; 8254 } else 8255 dinfop = NULL; 8256 8257 nfs4_update_dircaches(&rn_res->target_cinfo, 8258 ndvp, NULL, NULL, dinfop); 8259 8260 /* 8261 * Update source attribute, readdir and dnlc caches 8262 * 8263 */ 8264 if (ndvp != odvp) { 8265 update_parentdir_sfh(renvp, ndvp); 8266 8267 if (dinfop) 8268 dinfo.di_garp = 8269 &(res.array[6].nfs_resop4_u. 
8270 opgetattr.ga_res); 8271 8272 nfs4_update_dircaches(&rn_res->source_cinfo, 8273 odvp, NULL, NULL, dinfop); 8274 } 8275 8276 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8277 nnm); 8278 } 8279 } 8280 8281 if (resp) 8282 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8283 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8284 kmem_free(argop, argoplist_size); 8285 8286 return (e.error); 8287 } 8288 8289 /* 8290 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8291 * it is possible for the filehandle to change due to the rename. 8292 * 8293 * The compound req in this case includes a post-rename lookup and getattr 8294 * to ensure that we have the correct fh and attributes for the object. 8295 * 8296 * Rename requires that the current fh be the target directory and the 8297 * saved fh be the source directory. After the operation, the current fh 8298 * is unchanged. 8299 * 8300 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8301 * update the filehandle for the renamed object. We also get the old 8302 * filehandle for historical reasons; this should be taken out sometime. 8303 * This results in a rather cumbersome compound... 
8304 * 8305 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8306 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8307 * 8308 */ 8309 static int 8310 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8311 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8312 { 8313 COMPOUND4args_clnt args; 8314 COMPOUND4res_clnt res, *resp = NULL; 8315 int argoplist_size; 8316 nfs_argop4 *argop; 8317 nfs_resop4 *resop; 8318 int doqueue; 8319 mntinfo4_t *mi; 8320 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8321 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8322 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8323 RENAME4res *rn_res; 8324 GETFH4res *ngf_res; 8325 bool_t needrecov; 8326 nfs4_recov_state_t recov_state; 8327 hrtime_t t; 8328 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8329 dirattr_info_t dinfo, *dinfop = &dinfo; 8330 8331 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8332 8333 recov_state.rs_flags = 0; 8334 recov_state.rs_num_retry_despite_err = 0; 8335 8336 recov_retry: 8337 *statp = NFS4_OK; 8338 8339 /* 8340 * There is a window between the RPC and updating the path and 8341 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8342 * code, so that it doesn't try to use the old path during that 8343 * window. 8344 */ 8345 mutex_enter(&orp->r_statelock); 8346 while (orp->r_flags & R4RECEXPFH) { 8347 klwp_t *lwp = ttolwp(curthread); 8348 8349 if (lwp != NULL) 8350 lwp->lwp_nostop++; 8351 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8352 mutex_exit(&orp->r_statelock); 8353 if (lwp != NULL) 8354 lwp->lwp_nostop--; 8355 return (EINTR); 8356 } 8357 if (lwp != NULL) 8358 lwp->lwp_nostop--; 8359 } 8360 orp->r_flags |= R4RECEXPFH; 8361 mutex_exit(&orp->r_statelock); 8362 8363 mi = VTOMI4(odvp); 8364 8365 args.ctag = TAG_RENAME_VFH; 8366 args.array_len = (odvp == ndvp) ? 
10 : 12; 8367 argoplist_size = args.array_len * sizeof (nfs_argop4); 8368 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8369 8370 /* 8371 * Rename ops: 8372 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8373 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8374 * LOOKUP(trgt), GETFH(new), GETATTR, 8375 * 8376 * if (odvp != ndvp) 8377 * add putfh(sourcedir), getattr(sourcedir) } 8378 */ 8379 args.array = argop; 8380 8381 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8382 &recov_state, NULL); 8383 if (e.error) { 8384 kmem_free(argop, argoplist_size); 8385 mutex_enter(&orp->r_statelock); 8386 orp->r_flags &= ~R4RECEXPFH; 8387 cv_broadcast(&orp->r_cv); 8388 mutex_exit(&orp->r_statelock); 8389 return (e.error); 8390 } 8391 8392 /* 0: putfh source directory */ 8393 argop[0].argop = OP_CPUTFH; 8394 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8395 8396 /* 1: Save source fh to free up current for target */ 8397 argop[1].argop = OP_SAVEFH; 8398 8399 /* 2: Lookup pre-rename fh of renamed object */ 8400 argop[2].argop = OP_CLOOKUP; 8401 argop[2].nfs_argop4_u.opclookup.cname = onm; 8402 8403 /* 3: getfh fh of renamed object (before rename) */ 8404 argop[3].argop = OP_GETFH; 8405 8406 /* 4: putfh targetdir */ 8407 argop[4].argop = OP_CPUTFH; 8408 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8409 8410 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8411 argop[5].argop = OP_CRENAME; 8412 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8413 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8414 8415 /* 6: getattr of target dir (post op attrs) */ 8416 argop[6].argop = OP_GETATTR; 8417 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8418 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8419 8420 /* 7: Lookup post-rename fh of renamed object */ 8421 argop[7].argop = OP_CLOOKUP; 8422 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8423 8424 /* 8: getfh fh of renamed object (after rename) */ 8425 argop[8].argop = OP_GETFH; 8426 8427 /* 9: getattr of 
renamed object */ 8428 argop[9].argop = OP_GETATTR; 8429 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8430 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8431 8432 /* 8433 * If source/target dirs are different, then get new post-op 8434 * attrs for source dir also. 8435 */ 8436 if (ndvp != odvp) { 8437 /* 10: putfh (sourcedir) */ 8438 argop[10].argop = OP_CPUTFH; 8439 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8440 8441 /* 11: getattr (sourcedir) */ 8442 argop[11].argop = OP_GETATTR; 8443 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8444 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8445 } 8446 8447 dnlc_remove(odvp, onm); 8448 dnlc_remove(ndvp, nnm); 8449 8450 doqueue = 1; 8451 t = gethrtime(); 8452 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8453 8454 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8455 if (e.error) { 8456 PURGE_ATTRCACHE4(odvp); 8457 PURGE_ATTRCACHE4(ndvp); 8458 if (!needrecov) { 8459 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8460 &recov_state, needrecov); 8461 goto out; 8462 } 8463 } else { 8464 *statp = res.status; 8465 } 8466 8467 if (needrecov) { 8468 bool_t abort; 8469 8470 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8471 OP_RENAME, NULL, NULL, NULL); 8472 if (abort == FALSE) { 8473 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8474 &recov_state, needrecov); 8475 kmem_free(argop, argoplist_size); 8476 if (!e.error) 8477 (void) xdr_free(xdr_COMPOUND4res_clnt, 8478 (caddr_t)&res); 8479 mutex_enter(&orp->r_statelock); 8480 orp->r_flags &= ~R4RECEXPFH; 8481 cv_broadcast(&orp->r_cv); 8482 mutex_exit(&orp->r_statelock); 8483 goto recov_retry; 8484 } else { 8485 if (e.error != 0) { 8486 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8487 &recov_state, needrecov); 8488 goto out; 8489 } 8490 /* fall through for res.status case */ 8491 } 8492 } 8493 8494 resp = &res; 8495 /* 8496 * If OP_RENAME (or any prev op) failed, then return an error. 
8497 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8498 */ 8499 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8500 /* 8501 * Error in an op other than last Getattr 8502 */ 8503 e.error = geterrno4(res.status); 8504 PURGE_ATTRCACHE4(odvp); 8505 PURGE_ATTRCACHE4(ndvp); 8506 /* 8507 * System V defines rename to return EEXIST, not 8508 * ENOTEMPTY if the target directory is not empty. 8509 * Over the wire, the error is NFSERR_ENOTEMPTY 8510 * which geterrno4 maps to ENOTEMPTY. 8511 */ 8512 if (e.error == ENOTEMPTY) 8513 e.error = EEXIST; 8514 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8515 needrecov); 8516 goto out; 8517 } 8518 8519 /* rename results */ 8520 rn_res = &res.array[5].nfs_resop4_u.oprename; 8521 8522 if (res.status == NFS4_OK) { 8523 /* Update target attribute, readdir and dnlc caches */ 8524 dinfo.di_garp = 8525 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8526 dinfo.di_cred = cr; 8527 dinfo.di_time_call = t; 8528 } else 8529 dinfop = NULL; 8530 8531 /* Update source cache attribute, readdir and dnlc caches */ 8532 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8533 8534 /* Update source cache attribute, readdir and dnlc caches */ 8535 if (ndvp != odvp) { 8536 update_parentdir_sfh(ovp, ndvp); 8537 8538 /* 8539 * If dinfop is non-NULL, then compound succeded, so 8540 * set di_garp to attrs for source dir. dinfop is only 8541 * set to NULL when compound fails. 8542 */ 8543 if (dinfop) 8544 dinfo.di_garp = 8545 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8546 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8547 dinfop); 8548 } 8549 8550 /* 8551 * Update the rnode with the new component name and args, 8552 * and if the file handle changed, also update it with the new fh. 8553 * This is only necessary if the target object has an rnode 8554 * entry and there is no need to create one for it. 
8555 */ 8556 resop = &res.array[8]; /* getfh new res */ 8557 ngf_res = &resop->nfs_resop4_u.opgetfh; 8558 8559 /* 8560 * Update the path and filehandle for the renamed object. 8561 */ 8562 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8563 8564 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8565 8566 if (res.status == NFS4_OK) { 8567 resop++; /* getattr res */ 8568 e.error = nfs4_update_attrcache(res.status, 8569 &resop->nfs_resop4_u.opgetattr.ga_res, 8570 t, ovp, cr); 8571 } 8572 8573 out: 8574 kmem_free(argop, argoplist_size); 8575 if (resp) 8576 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8577 mutex_enter(&orp->r_statelock); 8578 orp->r_flags &= ~R4RECEXPFH; 8579 cv_broadcast(&orp->r_cv); 8580 mutex_exit(&orp->r_statelock); 8581 8582 return (e.error); 8583 } 8584 8585 /* ARGSUSED */ 8586 static int 8587 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 8588 caller_context_t *ct, int flags, vsecattr_t *vsecp) 8589 { 8590 int error; 8591 vnode_t *vp; 8592 8593 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8594 return (EPERM); 8595 /* 8596 * As ".." has special meaning and rather than send a mkdir 8597 * over the wire to just let the server freak out, we just 8598 * short circuit it here and return EEXIST 8599 */ 8600 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8601 return (EEXIST); 8602 8603 /* 8604 * Decision to get the right gid and setgid bit of the 8605 * new directory is now made in call_nfs4_create_req. 8606 */ 8607 va->va_mask |= AT_MODE; 8608 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8609 if (error) 8610 return (error); 8611 8612 *vpp = vp; 8613 return (0); 8614 } 8615 8616 8617 /* 8618 * rmdir is using the same remove v4 op as does remove. 8619 * Remove requires that the current fh be the target directory. 8620 * After the operation, the current fh is unchanged. 
8621 * The compound op structure is: 8622 * PUTFH(targetdir), REMOVE 8623 */ 8624 /*ARGSUSED4*/ 8625 static int 8626 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 8627 caller_context_t *ct, int flags) 8628 { 8629 int need_end_op = FALSE; 8630 COMPOUND4args_clnt args; 8631 COMPOUND4res_clnt res, *resp = NULL; 8632 REMOVE4res *rm_res; 8633 nfs_argop4 argop[3]; 8634 nfs_resop4 *resop; 8635 vnode_t *vp; 8636 int doqueue; 8637 mntinfo4_t *mi; 8638 rnode4_t *drp; 8639 bool_t needrecov = FALSE; 8640 nfs4_recov_state_t recov_state; 8641 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8642 dirattr_info_t dinfo, *dinfop; 8643 8644 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8645 return (EPERM); 8646 /* 8647 * As ".." has special meaning and rather than send a rmdir 8648 * over the wire to just let the server freak out, we just 8649 * short circuit it here and return EEXIST 8650 */ 8651 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8652 return (EEXIST); 8653 8654 drp = VTOR4(dvp); 8655 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8656 return (EINTR); 8657 8658 /* 8659 * Attempt to prevent a rmdir(".") from succeeding. 8660 */ 8661 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8662 if (e.error) { 8663 nfs_rw_exit(&drp->r_rwlock); 8664 return (e.error); 8665 } 8666 if (vp == cdir) { 8667 VN_RELE(vp); 8668 nfs_rw_exit(&drp->r_rwlock); 8669 return (EINVAL); 8670 } 8671 8672 /* 8673 * Since nfsv4 remove op works on both files and directories, 8674 * check that the removed object is indeed a directory. 8675 */ 8676 if (vp->v_type != VDIR) { 8677 VN_RELE(vp); 8678 nfs_rw_exit(&drp->r_rwlock); 8679 return (ENOTDIR); 8680 } 8681 8682 /* 8683 * First just remove the entry from the name cache, as it 8684 * is most likely an entry for this vp. 8685 */ 8686 dnlc_remove(dvp, nm); 8687 8688 /* 8689 * If there vnode reference count is greater than one, then 8690 * there may be additional references in the DNLC which will 8691 * need to be purged. 
First, trying removing the entry for 8692 * the parent directory and see if that removes the additional 8693 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8694 * to completely remove any references to the directory which 8695 * might still exist in the DNLC. 8696 */ 8697 if (vp->v_count > 1) { 8698 dnlc_remove(vp, ".."); 8699 if (vp->v_count > 1) 8700 dnlc_purge_vp(vp); 8701 } 8702 8703 mi = VTOMI4(dvp); 8704 recov_state.rs_flags = 0; 8705 recov_state.rs_num_retry_despite_err = 0; 8706 8707 recov_retry: 8708 args.ctag = TAG_RMDIR; 8709 8710 /* 8711 * Rmdir ops: putfh dir; remove 8712 */ 8713 args.array_len = 3; 8714 args.array = argop; 8715 8716 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8717 if (e.error) { 8718 nfs_rw_exit(&drp->r_rwlock); 8719 return (e.error); 8720 } 8721 need_end_op = TRUE; 8722 8723 /* putfh directory */ 8724 argop[0].argop = OP_CPUTFH; 8725 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8726 8727 /* remove */ 8728 argop[1].argop = OP_CREMOVE; 8729 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8730 8731 /* getattr (postop attrs for dir that contained removed dir) */ 8732 argop[2].argop = OP_GETATTR; 8733 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8734 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8735 8736 dinfo.di_time_call = gethrtime(); 8737 doqueue = 1; 8738 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8739 8740 PURGE_ATTRCACHE4(vp); 8741 8742 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8743 if (e.error) { 8744 PURGE_ATTRCACHE4(dvp); 8745 } 8746 8747 if (needrecov) { 8748 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8749 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 8750 if (!e.error) 8751 (void) xdr_free(xdr_COMPOUND4res_clnt, 8752 (caddr_t)&res); 8753 8754 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8755 needrecov); 8756 need_end_op = FALSE; 8757 goto recov_retry; 8758 } 8759 } 8760 8761 if (!e.error) { 8762 resp = &res; 8763 8764 /* 8765 * Only 
return error if first 2 ops (OP_REMOVE or earlier) 8766 * failed. 8767 */ 8768 if (res.status != NFS4_OK && res.array_len <= 2) { 8769 e.error = geterrno4(res.status); 8770 PURGE_ATTRCACHE4(dvp); 8771 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8772 &recov_state, needrecov); 8773 need_end_op = FALSE; 8774 nfs4_purge_stale_fh(e.error, dvp, cr); 8775 /* 8776 * System V defines rmdir to return EEXIST, not 8777 * ENOTEMPTY if the directory is not empty. Over 8778 * the wire, the error is NFSERR_ENOTEMPTY which 8779 * geterrno4 maps to ENOTEMPTY. 8780 */ 8781 if (e.error == ENOTEMPTY) 8782 e.error = EEXIST; 8783 } else { 8784 resop = &res.array[1]; /* remove res */ 8785 rm_res = &resop->nfs_resop4_u.opremove; 8786 8787 if (res.status == NFS4_OK) { 8788 resop = &res.array[2]; /* dir attrs */ 8789 dinfo.di_garp = 8790 &resop->nfs_resop4_u.opgetattr.ga_res; 8791 dinfo.di_cred = cr; 8792 dinfop = &dinfo; 8793 } else 8794 dinfop = NULL; 8795 8796 /* Update dir attribute, readdir and dnlc caches */ 8797 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8798 dinfop); 8799 8800 /* destroy rddir cache for dir that was removed */ 8801 if (VTOR4(vp)->r_dir != NULL) 8802 nfs4_purge_rddir_cache(vp); 8803 } 8804 } 8805 8806 if (need_end_op) 8807 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8808 8809 nfs_rw_exit(&drp->r_rwlock); 8810 8811 if (resp) 8812 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8813 8814 if (e.error == 0) { 8815 vnode_t *tvp; 8816 rnode4_t *trp; 8817 trp = VTOR4(vp); 8818 tvp = vp; 8819 if (IS_SHADOW(vp, trp)) 8820 tvp = RTOV4(trp); 8821 vnevent_rmdir(tvp, dvp, nm, ct); 8822 } 8823 8824 VN_RELE(vp); 8825 8826 return (e.error); 8827 } 8828 8829 /* ARGSUSED */ 8830 static int 8831 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 8832 caller_context_t *ct, int flags) 8833 { 8834 int error; 8835 vnode_t *vp; 8836 rnode4_t *rp; 8837 char *contents; 8838 mntinfo4_t *mi = VTOMI4(dvp); 8839 8840 if (nfs_zone() != 
mi->mi_zone) 8841 return (EPERM); 8842 if (!(mi->mi_flags & MI4_SYMLINK)) 8843 return (EOPNOTSUPP); 8844 8845 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8846 if (error) 8847 return (error); 8848 8849 ASSERT(nfs4_consistent_type(vp)); 8850 rp = VTOR4(vp); 8851 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8852 8853 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8854 8855 if (contents != NULL) { 8856 mutex_enter(&rp->r_statelock); 8857 if (rp->r_symlink.contents == NULL) { 8858 rp->r_symlink.len = strlen(tnm); 8859 bcopy(tnm, contents, rp->r_symlink.len); 8860 rp->r_symlink.contents = contents; 8861 rp->r_symlink.size = MAXPATHLEN; 8862 mutex_exit(&rp->r_statelock); 8863 } else { 8864 mutex_exit(&rp->r_statelock); 8865 kmem_free((void *)contents, MAXPATHLEN); 8866 } 8867 } 8868 } 8869 VN_RELE(vp); 8870 8871 return (error); 8872 } 8873 8874 8875 /* 8876 * Read directory entries. 8877 * There are some weird things to look out for here. The uio_loffset 8878 * field is either 0 or it is the offset returned from a previous 8879 * readdir. It is an opaque value used by the server to find the 8880 * correct directory block to read. The count field is the number 8881 * of blocks to read on the server. This is advisory only, the server 8882 * may return only one block's worth of entries. Entries may be compressed 8883 * on the server. 8884 */ 8885 /* ARGSUSED */ 8886 static int 8887 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, 8888 caller_context_t *ct, int flags) 8889 { 8890 int error; 8891 uint_t count; 8892 rnode4_t *rp; 8893 rddir4_cache *rdc; 8894 rddir4_cache *rrdc; 8895 8896 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8897 return (EIO); 8898 rp = VTOR4(vp); 8899 8900 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8901 8902 /* 8903 * Make sure that the directory cache is valid. 
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	/*
	 * Only the length of the first iovec is considered, and the
	 * request is capped at MAXBSIZE.
	 */
	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once. This will set r_direof which
	 * can be used to find the value of the last cookie.
	 *
	 * r_statelock is taken here; every return path below is
	 * responsible for dropping it.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry. Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		/*
		 * Claim the fill before dropping the lock: clear RDDIRREQ
		 * and set RDDIR (nfs4readdir() asserts that RDDIR is set).
		 */
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		/* Setting RDDIRREQ again makes a later call retry the fill. */
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread. Note that r_statelock is still held across the uiomove.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead. Don't if we
	 * have already read to the end of directory.
	 * (A uiomove error does not short-circuit this; it is simply
	 * returned by whichever path below is taken.)
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.
Otherwise, allocate a new entry, add 9041 * it to the cache, and then initiate an asynchronous readdir 9042 * operation to fill it. 9043 */ 9044 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 9045 9046 /* 9047 * A readdir cache entry could not be obtained for the readahead. In 9048 * this case we skip the readahead and return. 9049 */ 9050 if (rrdc == NULL) { 9051 rddir4_cache_rele(rp, rdc); 9052 mutex_exit(&rp->r_statelock); 9053 return (error); 9054 } 9055 9056 /* 9057 * Check to see if we need to fill this entry in. 9058 */ 9059 if (rrdc->flags & RDDIRREQ) { 9060 rrdc->flags &= ~RDDIRREQ; 9061 rrdc->flags |= RDDIR; 9062 rddir4_cache_rele(rp, rdc); 9063 mutex_exit(&rp->r_statelock); 9064 #ifdef DEBUG 9065 nfs4_readdir_readahead++; 9066 #endif 9067 /* 9068 * Do the readdir. 9069 */ 9070 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 9071 return (error); 9072 } 9073 9074 rddir4_cache_rele(rp, rrdc); 9075 rddir4_cache_rele(rp, rdc); 9076 mutex_exit(&rp->r_statelock); 9077 return (error); 9078 } 9079 9080 static int 9081 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9082 { 9083 int error; 9084 rnode4_t *rp; 9085 9086 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9087 9088 rp = VTOR4(vp); 9089 9090 /* 9091 * Obtain the readdir results for the caller. 9092 */ 9093 nfs4readdir(vp, rdc, cr); 9094 9095 mutex_enter(&rp->r_statelock); 9096 /* 9097 * The entry is now complete 9098 */ 9099 rdc->flags &= ~RDDIR; 9100 9101 error = rdc->error; 9102 if (error) 9103 rdc->flags |= RDDIRREQ; 9104 rddir4_cache_rele(rp, rdc); 9105 mutex_exit(&rp->r_statelock); 9106 9107 return (error); 9108 } 9109 9110 /* 9111 * Read directory entries. 9112 * There are some weird things to look out for here. The uio_loffset 9113 * field is either 0 or it is the offset returned from a previous 9114 * readdir. It is an opaque value used by the server to find the 9115 * correct directory block to read. The count field is the number 9116 * of blocks to read on the server. 
This is advisory only, the server
 * may return only one block's worth of entries. Entries may be compressed
 * on the server.
 *
 * Generates the following compound request:
 * 1. If readdir offset is zero and no dnlc entry for parent exists,
 *    must include a Lookupp as well. In this case, send:
 *    { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
 * 2. Otherwise just do: { Putfh <fh>; Readdir }
 *
 * Get complete attributes and filehandles for entries if this is the
 * first read of the directory. Otherwise, just get fileid's.
 *
 * The function returns nothing; failure is reported through rdc->error.
 * The caller must have set RDDIR in rdc->flags and rdc->entries must be
 * NULL on entry (both are asserted).
 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	/* 2 ops: PUTFH + READDIR. Raised to 5 below if LOOKUPP is needed. */
	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and "..",
		 * the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 *
		 * nodeid and pnodeid are only assigned (and only used
		 * later) for these two cookie values.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0; /* set later by getattr parent */
			}
			/* Released for any non-NULL result, DNLC_NO_VNODE too */
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		/* start_fop failed, so there is no end_fop to pair with. */
		return;
	}

	/*
	 * Determine which attrs to request for dirents. This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 *
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		/* R4READDIRWATTR is one-shot: consume it here. */
		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory. Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * request mounted on fileid if supported, else request
		 * fileid. maybe we should verify that fileid is supported
		 * and request something else if not.
		 *
		 * NOTE(review): the code below only ever adds
		 * mounted_on_fileid; nothing here requests fileid when
		 * mounted_on_fileid is unsupported, despite the text above.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	/*
	 * NOTE(review): rdc, dvp, mi and cr ride along in the READDIR
	 * args, presumably so the reply decoder can fill rdc directly;
	 * confirm against the XDR code.
	 */
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;


	/*
	 * If count < than the minimum required, we return no entries
	 * and fail with EINVAL
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL, NULL, NULL);
		if (abort == FALSE) {
			/*
			 * Retry: end this fop, release the reply (there is
			 * only one to free when the RPC itself succeeded)
			 * and discard any entries from the failed attempt,
			 * restoring the rdc->entries == NULL precondition.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If compound failed first 2 ops (PUTFH+READDIR), then return
	 * failure here. Subsequent ops are for filling out dot-dot
	 * dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x. In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm. We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR. However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			/* end_fop was already called above; skip "out". */
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	/* Remember the verifier for the next non-zero-cookie request. */
	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 *	seek(cookie=0) -> "." entry with d_off = 1
	 *	seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		/* Only use the extra results if all five ops succeeded. */
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}


/*
 * Perform the transfer described by bp against the server: nfs4read()
 * when B_READ is set in bp->b_flags, nfs4write() otherwise.
 *
 * stab_comm is passed through to nfs4write(); readahead is passed
 * through to nfs4read(). cr is the caller's credential; the credential
 * actually used over the wire comes from nfs4_get_otw_cred_by_osp(), and
 * an EACCES result is retried with the next credential until last_time
 * is set.
 *
 * Returns 0, an errno, or NFS_EOF when a read returned no data at or
 * beyond r_size. bp->b_error is set, and B_ERROR is added to b_flags
 * for any error other than NFS_EOF.
 */
static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		/* The credential hold is dropped before any retry. */
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF. Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			/* Never write past the current r_size. */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status. Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations. Any errors occurring
				 * from other operations should be saved
				 * by the caller.
9612 */ 9613 mutex_enter(&rp->r_statelock); 9614 if (error == ESTALE) { 9615 rp->r_flags |= R4STALE; 9616 if (!rp->r_error) 9617 rp->r_error = error; 9618 } else if (!rp->r_error && 9619 (bp->b_flags & 9620 (B_INVAL|B_FORCE|B_ASYNC)) == 9621 (B_INVAL|B_FORCE|B_ASYNC)) { 9622 rp->r_error = error; 9623 } 9624 mutex_exit(&rp->r_statelock); 9625 } 9626 crfree(cred_otw); 9627 } else { 9628 error = rp->r_error; 9629 /* 9630 * A close may have cleared r_error, if so, 9631 * propagate ESTALE error return properly 9632 */ 9633 if (error == 0) 9634 error = ESTALE; 9635 } 9636 } 9637 9638 if (error != 0 && error != NFS_EOF) 9639 bp->b_flags |= B_ERROR; 9640 9641 if (osp) 9642 open_stream_rele(osp, rp); 9643 9644 DTRACE_IO1(done, struct buf *, bp); 9645 9646 return (error); 9647 } 9648 9649 /* ARGSUSED */ 9650 int 9651 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 9652 { 9653 return (EREMOTE); 9654 } 9655 9656 /* ARGSUSED2 */ 9657 int 9658 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9659 { 9660 rnode4_t *rp = VTOR4(vp); 9661 9662 if (!write_lock) { 9663 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9664 return (V_WRITELOCK_FALSE); 9665 } 9666 9667 if ((rp->r_flags & R4DIRECTIO) || 9668 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9669 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9670 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9671 return (V_WRITELOCK_FALSE); 9672 nfs_rw_exit(&rp->r_rwlock); 9673 } 9674 9675 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9676 return (V_WRITELOCK_TRUE); 9677 } 9678 9679 /* ARGSUSED */ 9680 void 9681 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9682 { 9683 rnode4_t *rp = VTOR4(vp); 9684 9685 nfs_rw_exit(&rp->r_rwlock); 9686 } 9687 9688 /* ARGSUSED */ 9689 static int 9690 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 9691 { 9692 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9693 return (EIO); 9694 9695 /* 9696 * Because we stuff the 
readdir cookie into the offset field 9697 * someone may attempt to do an lseek with the cookie which 9698 * we want to succeed. 9699 */ 9700 if (vp->v_type == VDIR) 9701 return (0); 9702 if (*noffp < 0) 9703 return (EINVAL); 9704 return (0); 9705 } 9706 9707 9708 /* 9709 * Return all the pages from [off..off+len) in file 9710 */ 9711 /* ARGSUSED */ 9712 static int 9713 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9714 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9715 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 9716 { 9717 rnode4_t *rp; 9718 int error; 9719 mntinfo4_t *mi; 9720 9721 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9722 return (EIO); 9723 rp = VTOR4(vp); 9724 if (IS_SHADOW(vp, rp)) 9725 vp = RTOV4(rp); 9726 9727 if (vp->v_flag & VNOMAP) 9728 return (ENOSYS); 9729 9730 if (protp != NULL) 9731 *protp = PROT_ALL; 9732 9733 /* 9734 * Now validate that the caches are up to date. 9735 */ 9736 if (error = nfs4_validate_caches(vp, cr)) 9737 return (error); 9738 9739 mi = VTOMI4(vp); 9740 retry: 9741 mutex_enter(&rp->r_statelock); 9742 9743 /* 9744 * Don't create dirty pages faster than they 9745 * can be cleaned so that the system doesn't 9746 * get imbalanced. If the async queue is 9747 * maxed out, then wait for it to drain before 9748 * creating more dirty pages. Also, wait for 9749 * any threads doing pagewalks in the vop_getattr 9750 * entry points so that they don't block for 9751 * long periods. 9752 */ 9753 if (rw == S_CREATE) { 9754 while ((mi->mi_max_threads != 0 && 9755 rp->r_awcount > 2 * mi->mi_max_threads) || 9756 rp->r_gcount > 0) 9757 cv_wait(&rp->r_cv, &rp->r_statelock); 9758 } 9759 9760 /* 9761 * If we are getting called as a side effect of an nfs_write() 9762 * operation the local file size might not be extended yet. 9763 * In this case we want to be able to return pages of zeroes. 
9764 */ 9765 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9766 NFS4_DEBUG(nfs4_pageio_debug, 9767 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9768 "len=%llu, size=%llu, attrsize =%llu", off, 9769 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9770 mutex_exit(&rp->r_statelock); 9771 return (EFAULT); /* beyond EOF */ 9772 } 9773 9774 mutex_exit(&rp->r_statelock); 9775 9776 if (len <= PAGESIZE) { 9777 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9778 seg, addr, rw, cr); 9779 NFS4_DEBUG(nfs4_pageio_debug && error, 9780 (CE_NOTE, "getpage error %d; off=%lld, " 9781 "len=%lld", error, off, (u_longlong_t)len)); 9782 } else { 9783 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9784 pl, plsz, seg, addr, rw, cr); 9785 NFS4_DEBUG(nfs4_pageio_debug && error, 9786 (CE_NOTE, "getpages error %d; off=%lld, " 9787 "len=%lld", error, off, (u_longlong_t)len)); 9788 } 9789 9790 switch (error) { 9791 case NFS_EOF: 9792 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9793 goto retry; 9794 case ESTALE: 9795 nfs4_purge_stale_fh(error, vp, cr); 9796 } 9797 9798 return (error); 9799 } 9800 9801 /* 9802 * Called from pvn_getpages or nfs4_getpage to get a particular page. 
 *
 * When pl is NULL no page is returned; if the page is not already
 * present an asynchronous read of its block is queued instead.
 * Otherwise, on success, pl[] holds the page(s): an existing page is
 * locked SE_EXCL for S_CREATE and SE_SHARED for other accesses, and a
 * newly read or created run of pages is set up by pvn_plist_init().
 * Returns 0 or an errno; NFS_EOF from nfs4_bio() is mapped to 0 for
 * segkmap accesses and to EFAULT otherwise.
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window; /* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	/* Restart point when a page seen by page_exists() has vanished. */
reread:
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or close.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			/* r_statelock is not held across the async request */
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				/* retake the lock for the exit after the loop */
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				/*
				 * Offset 0, or an offset that does not follow
				 * r_nextr with no readahead issued: read
				 * just the one page.
				 */
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0. This
			 * is correct since we want to do I/O on a page
			 * boundary. bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here. We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			/*
			 * Tag every page in the circular p_next list with
			 * C_NOCOMMIT; the loop ends with pp back at the head.
			 */
			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error. We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

	/* No goto in this function targets this label. */
out:
	if (pl == NULL)
		return (error);

	if (error) {
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs4_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}

static void
nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
    cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode4_t *rp = VTOR4(vp);
	page_t *savepp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc). In all
	 * cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0. This is correct since
	 * we want to do I/O on a page boundary.
bp_mapin() will use this addr 10126 * to calculate an offset, and then set b_addr to the kernel virtual 10127 * address it allocated for us. 10128 */ 10129 ASSERT(bp->b_un.b_addr == 0); 10130 10131 bp->b_edev = 0; 10132 bp->b_dev = 0; 10133 bp->b_lblkno = lbtodb(io_off); 10134 bp->b_file = vp; 10135 bp->b_offset = (offset_t)blkoff; 10136 bp_mapin(bp); 10137 10138 /* 10139 * If doing a write beyond what we believe is EOF, don't bother trying 10140 * to read the pages from the server, we'll just zero the pages here. 10141 * We don't check that the rw flag is S_WRITE here because some 10142 * implementations may attempt a read access to the buffer before 10143 * copying data. 10144 */ 10145 mutex_enter(&rp->r_statelock); 10146 if (io_off >= rp->r_size && seg == segkmap) { 10147 mutex_exit(&rp->r_statelock); 10148 bzero(bp->b_un.b_addr, io_len); 10149 error = 0; 10150 } else { 10151 mutex_exit(&rp->r_statelock); 10152 error = nfs4_bio(bp, NULL, cr, TRUE); 10153 if (error == NFS_EOF) 10154 error = 0; 10155 } 10156 10157 /* 10158 * Unmap the buffer before freeing it. 10159 */ 10160 bp_mapout(bp); 10161 pageio_done(bp); 10162 10163 savepp = pp; 10164 do { 10165 pp->p_fsdata = C_NOCOMMIT; 10166 } while ((pp = pp->p_next) != savepp); 10167 10168 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10169 10170 /* 10171 * In case of error set readahead offset 10172 * to the lowest offset. 10173 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10174 */ 10175 if (error && rp->r_nextr > io_off) { 10176 mutex_enter(&rp->r_statelock); 10177 if (rp->r_nextr > io_off) 10178 rp->r_nextr = io_off; 10179 mutex_exit(&rp->r_statelock); 10180 } 10181 } 10182 10183 /* 10184 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10185 * If len == 0, do from off to EOF. 10186 * 10187 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10188 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10189 * (from pageout). 
10190 */ 10191 /* ARGSUSED */ 10192 static int 10193 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10194 caller_context_t *ct) 10195 { 10196 int error; 10197 rnode4_t *rp; 10198 10199 ASSERT(cr != NULL); 10200 10201 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10202 return (EIO); 10203 10204 rp = VTOR4(vp); 10205 if (IS_SHADOW(vp, rp)) 10206 vp = RTOV4(rp); 10207 10208 /* 10209 * XXX - Why should this check be made here? 10210 */ 10211 if (vp->v_flag & VNOMAP) 10212 return (ENOSYS); 10213 10214 if (len == 0 && !(flags & B_INVAL) && 10215 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10216 return (0); 10217 10218 mutex_enter(&rp->r_statelock); 10219 rp->r_count++; 10220 mutex_exit(&rp->r_statelock); 10221 error = nfs4_putpages(vp, off, len, flags, cr); 10222 mutex_enter(&rp->r_statelock); 10223 rp->r_count--; 10224 cv_broadcast(&rp->r_cv); 10225 mutex_exit(&rp->r_statelock); 10226 10227 return (error); 10228 } 10229 10230 /* 10231 * Write out a single page, possibly klustering adjacent dirty pages. 10232 */ 10233 int 10234 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10235 int flags, cred_t *cr) 10236 { 10237 u_offset_t io_off; 10238 u_offset_t lbn_off; 10239 u_offset_t lbn; 10240 size_t io_len; 10241 uint_t bsize; 10242 int error; 10243 rnode4_t *rp; 10244 10245 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10246 ASSERT(pp != NULL); 10247 ASSERT(cr != NULL); 10248 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10249 10250 rp = VTOR4(vp); 10251 ASSERT(rp->r_count > 0); 10252 ASSERT(!IS_SHADOW(vp, rp)); 10253 10254 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10255 lbn = pp->p_offset / bsize; 10256 lbn_off = lbn * bsize; 10257 10258 /* 10259 * Find a kluster that fits in one block, or in 10260 * one page if pages are bigger than blocks. If 10261 * there is less file space allocated than a whole 10262 * page, we'll shorten the i/o request below. 
10263 */ 10264 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10265 roundup(bsize, PAGESIZE), flags); 10266 10267 /* 10268 * pvn_write_kluster shouldn't have returned a page with offset 10269 * behind the original page we were given. Verify that. 10270 */ 10271 ASSERT((pp->p_offset / bsize) >= lbn); 10272 10273 /* 10274 * Now pp will have the list of kept dirty pages marked for 10275 * write back. It will also handle invalidation and freeing 10276 * of pages that are not dirty. Check for page length rounding 10277 * problems. 10278 */ 10279 if (io_off + io_len > lbn_off + bsize) { 10280 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10281 io_len = lbn_off + bsize - io_off; 10282 } 10283 /* 10284 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10285 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10286 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10287 * progress and the r_size has not been made consistent with the 10288 * new size of the file. When the uiomove() completes the r_size is 10289 * updated and the R4MODINPROGRESS flag is cleared. 10290 * 10291 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10292 * consistent value of r_size. Without this handshaking, it is 10293 * possible that nfs4_bio() picks up the old value of r_size 10294 * before the uiomove() in writerp4() completes. This will result 10295 * in the write through nfs4_bio() being dropped. 10296 * 10297 * More precisely, there is a window between the time the uiomove() 10298 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10299 * operation intervenes in this window, the page will be picked up, 10300 * because it is dirty (it will be unlocked, unless it was 10301 * pagecreate'd). When the page is picked up as dirty, the dirty 10302 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10303 * checked. This will still be the old size. Therefore the page will 10304 * not be written out. 
When segmap_release() calls VOP_PUTPAGE(), 10305 * the page will be found to be clean and the write will be dropped. 10306 */ 10307 if (rp->r_flags & R4MODINPROGRESS) { 10308 mutex_enter(&rp->r_statelock); 10309 if ((rp->r_flags & R4MODINPROGRESS) && 10310 rp->r_modaddr + MAXBSIZE > io_off && 10311 rp->r_modaddr < io_off + io_len) { 10312 page_t *plist; 10313 /* 10314 * A write is in progress for this region of the file. 10315 * If we did not detect R4MODINPROGRESS here then this 10316 * path through nfs_putapage() would eventually go to 10317 * nfs4_bio() and may not write out all of the data 10318 * in the pages. We end up losing data. So we decide 10319 * to set the modified bit on each page in the page 10320 * list and mark the rnode with R4DIRTY. This write 10321 * will be restarted at some later time. 10322 */ 10323 plist = pp; 10324 while (plist != NULL) { 10325 pp = plist; 10326 page_sub(&plist, pp); 10327 hat_setmod(pp); 10328 page_io_unlock(pp); 10329 page_unlock(pp); 10330 } 10331 rp->r_flags |= R4DIRTY; 10332 mutex_exit(&rp->r_statelock); 10333 if (offp) 10334 *offp = io_off; 10335 if (lenp) 10336 *lenp = io_len; 10337 return (0); 10338 } 10339 mutex_exit(&rp->r_statelock); 10340 } 10341 10342 if (flags & B_ASYNC) { 10343 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10344 nfs4_sync_putapage); 10345 } else 10346 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10347 10348 if (offp) 10349 *offp = io_off; 10350 if (lenp) 10351 *lenp = io_len; 10352 return (error); 10353 } 10354 10355 static int 10356 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10357 int flags, cred_t *cr) 10358 { 10359 int error; 10360 rnode4_t *rp; 10361 10362 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10363 10364 flags |= B_WRITE; 10365 10366 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10367 10368 rp = VTOR4(vp); 10369 10370 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10371 error == EACCES) && 
10372 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10373 if (!(rp->r_flags & R4OUTOFSPACE)) { 10374 mutex_enter(&rp->r_statelock); 10375 rp->r_flags |= R4OUTOFSPACE; 10376 mutex_exit(&rp->r_statelock); 10377 } 10378 flags |= B_ERROR; 10379 pvn_write_done(pp, flags); 10380 /* 10381 * If this was not an async thread, then try again to 10382 * write out the pages, but this time, also destroy 10383 * them whether or not the write is successful. This 10384 * will prevent memory from filling up with these 10385 * pages and destroying them is the only alternative 10386 * if they can't be written out. 10387 * 10388 * Don't do this if this is an async thread because 10389 * when the pages are unlocked in pvn_write_done, 10390 * some other thread could have come along, locked 10391 * them, and queued for an async thread. It would be 10392 * possible for all of the async threads to be tied 10393 * up waiting to lock the pages again and they would 10394 * all already be locked and waiting for an async 10395 * thread to handle them. Deadlock. 
10396 */ 10397 if (!(flags & B_ASYNC)) { 10398 error = nfs4_putpage(vp, io_off, io_len, 10399 B_INVAL | B_FORCE, cr, NULL); 10400 } 10401 } else { 10402 if (error) 10403 flags |= B_ERROR; 10404 else if (rp->r_flags & R4OUTOFSPACE) { 10405 mutex_enter(&rp->r_statelock); 10406 rp->r_flags &= ~R4OUTOFSPACE; 10407 mutex_exit(&rp->r_statelock); 10408 } 10409 pvn_write_done(pp, flags); 10410 if (freemem < desfree) 10411 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10412 NFS4_WRITE_NOWAIT); 10413 } 10414 10415 return (error); 10416 } 10417 10418 #ifdef DEBUG 10419 int nfs4_force_open_before_mmap = 0; 10420 #endif 10421 10422 /* ARGSUSED */ 10423 static int 10424 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10425 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10426 caller_context_t *ct) 10427 { 10428 struct segvn_crargs vn_a; 10429 int error = 0; 10430 rnode4_t *rp = VTOR4(vp); 10431 mntinfo4_t *mi = VTOMI4(vp); 10432 10433 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10434 return (EIO); 10435 10436 if (vp->v_flag & VNOMAP) 10437 return (ENOSYS); 10438 10439 if (off < 0 || (off + len) < 0) 10440 return (ENXIO); 10441 10442 if (vp->v_type != VREG) 10443 return (ENODEV); 10444 10445 /* 10446 * If the file is delegated to the client don't do anything. 10447 * If the file is not delegated, then validate the data cache. 10448 */ 10449 mutex_enter(&rp->r_statev4_lock); 10450 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10451 mutex_exit(&rp->r_statev4_lock); 10452 error = nfs4_validate_caches(vp, cr); 10453 if (error) 10454 return (error); 10455 } else { 10456 mutex_exit(&rp->r_statev4_lock); 10457 } 10458 10459 /* 10460 * Check to see if the vnode is currently marked as not cachable. 10461 * This means portions of the file are locked (through VOP_FRLOCK). 10462 * In this case the map request must be refused. We use 10463 * rp->r_lkserlock to avoid a race with concurrent lock requests. 
10464 * 10465 * Atomically increment r_inmap after acquiring r_rwlock. The 10466 * idea here is to acquire r_rwlock to block read/write and 10467 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10468 * that we are in nfs4_map(). Now, r_rwlock is acquired in order 10469 * and we can prevent the deadlock that would have occurred 10470 * when nfs4_addmap() would have acquired it out of order. 10471 * 10472 * Since we are not protecting r_inmap by any lock, we do not 10473 * hold any lock when we decrement it. We atomically decrement 10474 * r_inmap after we release r_lkserlock. 10475 */ 10476 10477 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp))) 10478 return (EINTR); 10479 atomic_add_int(&rp->r_inmap, 1); 10480 nfs_rw_exit(&rp->r_rwlock); 10481 10482 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10483 atomic_add_int(&rp->r_inmap, -1); 10484 return (EINTR); 10485 } 10486 10487 10488 if (vp->v_flag & VNOCACHE) { 10489 error = EAGAIN; 10490 goto done; 10491 } 10492 10493 /* 10494 * Don't allow concurrent locks and mapping if mandatory locking is 10495 * enabled. 10496 */ 10497 if (flk_has_remote_locks(vp)) { 10498 struct vattr va; 10499 va.va_mask = AT_MODE; 10500 error = nfs4getattr(vp, &va, cr); 10501 if (error != 0) 10502 goto done; 10503 if (MANDLOCK(vp, va.va_mode)) { 10504 error = EAGAIN; 10505 goto done; 10506 } 10507 } 10508 10509 /* 10510 * It is possible that the rnode has a lost lock request that we 10511 * are still trying to recover, and that the request conflicts with 10512 * this map request. 10513 * 10514 * An alternative approach would be for nfs4_safemap() to consider 10515 * queued lock requests when deciding whether to set or clear 10516 * VNOCACHE. This would require the frlock code path to call 10517 * nfs4_safemap() after enqueing a lost request. 
10518 */ 10519 if (nfs4_map_lost_lock_conflict(vp)) { 10520 error = EAGAIN; 10521 goto done; 10522 } 10523 10524 as_rangelock(as); 10525 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10526 if (error != 0) { 10527 as_rangeunlock(as); 10528 goto done; 10529 } 10530 10531 if (vp->v_type == VREG) { 10532 /* 10533 * We need to retrieve the open stream 10534 */ 10535 nfs4_open_stream_t *osp = NULL; 10536 nfs4_open_owner_t *oop = NULL; 10537 10538 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10539 if (oop != NULL) { 10540 /* returns with 'os_sync_lock' held */ 10541 osp = find_open_stream(oop, rp); 10542 open_owner_rele(oop); 10543 } 10544 if (osp == NULL) { 10545 #ifdef DEBUG 10546 if (nfs4_force_open_before_mmap) { 10547 error = EIO; 10548 goto done; 10549 } 10550 #endif 10551 /* returns with 'os_sync_lock' held */ 10552 error = open_and_get_osp(vp, cr, &osp); 10553 if (osp == NULL) { 10554 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10555 "nfs4_map: we tried to OPEN the file " 10556 "but again no osp, so fail with EIO")); 10557 goto done; 10558 } 10559 } 10560 10561 if (osp->os_failed_reopen) { 10562 mutex_exit(&osp->os_sync_lock); 10563 open_stream_rele(osp, rp); 10564 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10565 "nfs4_map: os_failed_reopen set on " 10566 "osp %p, cr %p, rp %s", (void *)osp, 10567 (void *)cr, rnode4info(rp))); 10568 error = EIO; 10569 goto done; 10570 } 10571 mutex_exit(&osp->os_sync_lock); 10572 open_stream_rele(osp, rp); 10573 } 10574 10575 vn_a.vp = vp; 10576 vn_a.offset = off; 10577 vn_a.type = (flags & MAP_TYPE); 10578 vn_a.prot = (uchar_t)prot; 10579 vn_a.maxprot = (uchar_t)maxprot; 10580 vn_a.flags = (flags & ~MAP_TYPE); 10581 vn_a.cred = cr; 10582 vn_a.amp = NULL; 10583 vn_a.szc = 0; 10584 vn_a.lgrp_mem_policy_flags = 0; 10585 10586 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10587 as_rangeunlock(as); 10588 10589 done: 10590 nfs_rw_exit(&rp->r_lkserlock); 10591 atomic_add_int(&rp->r_inmap, -1); 10592 return 
(error); 10593 } 10594 10595 /* 10596 * We're most likely dealing with a kernel module that likes to READ 10597 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10598 * officially OPEN the file to create the necessary client state 10599 * for bookkeeping of os_mmap_read/write counts. 10600 * 10601 * Since VOP_MAP only passes in a pointer to the vnode rather than 10602 * a double pointer, we can't handle the case where nfs4open_otw() 10603 * returns a different vnode than the one passed into VOP_MAP (since 10604 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10605 * we return NULL and let nfs4_map() fail. Note: the only case where 10606 * this should happen is if the file got removed and replaced with the 10607 * same name on the server (in addition to the fact that we're trying 10608 * to VOP_MAP withouth VOP_OPENing the file in the first place). 10609 */ 10610 static int 10611 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10612 { 10613 rnode4_t *rp, *drp; 10614 vnode_t *dvp, *open_vp; 10615 char file_name[MAXNAMELEN]; 10616 int just_created; 10617 nfs4_open_stream_t *osp; 10618 nfs4_open_owner_t *oop; 10619 int error; 10620 10621 *ospp = NULL; 10622 open_vp = map_vp; 10623 10624 rp = VTOR4(open_vp); 10625 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10626 return (error); 10627 drp = VTOR4(dvp); 10628 10629 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10630 VN_RELE(dvp); 10631 return (EINTR); 10632 } 10633 10634 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10635 nfs_rw_exit(&drp->r_rwlock); 10636 VN_RELE(dvp); 10637 return (error); 10638 } 10639 10640 mutex_enter(&rp->r_statev4_lock); 10641 if (rp->created_v4) { 10642 rp->created_v4 = 0; 10643 mutex_exit(&rp->r_statev4_lock); 10644 10645 dnlc_update(dvp, file_name, open_vp); 10646 /* This is needed so we don't bump the open ref count */ 10647 just_created = 1; 10648 } else { 10649 
mutex_exit(&rp->r_statev4_lock); 10650 just_created = 0; 10651 } 10652 10653 VN_HOLD(map_vp); 10654 10655 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10656 just_created); 10657 if (error) { 10658 nfs_rw_exit(&drp->r_rwlock); 10659 VN_RELE(dvp); 10660 VN_RELE(map_vp); 10661 return (error); 10662 } 10663 10664 nfs_rw_exit(&drp->r_rwlock); 10665 VN_RELE(dvp); 10666 10667 /* 10668 * If nfs4open_otw() returned a different vnode then "undo" 10669 * the open and return failure to the caller. 10670 */ 10671 if (!VN_CMP(open_vp, map_vp)) { 10672 nfs4_error_t e; 10673 10674 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10675 "open returned a different vnode")); 10676 /* 10677 * If there's an error, ignore it, 10678 * and let VOP_INACTIVE handle it. 10679 */ 10680 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10681 CLOSE_NORM, 0, 0, 0); 10682 VN_RELE(map_vp); 10683 return (EIO); 10684 } 10685 10686 VN_RELE(map_vp); 10687 10688 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10689 if (!oop) { 10690 nfs4_error_t e; 10691 10692 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10693 "no open owner")); 10694 /* 10695 * If there's an error, ignore it, 10696 * and let VOP_INACTIVE handle it. 10697 */ 10698 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10699 CLOSE_NORM, 0, 0, 0); 10700 return (EIO); 10701 } 10702 osp = find_open_stream(oop, rp); 10703 open_owner_rele(oop); 10704 *ospp = osp; 10705 return (0); 10706 } 10707 10708 /* 10709 * Please be aware that when this function is called, the address space write 10710 * a_lock is held. Do not put over the wire calls in this function. 
10711 */ 10712 /* ARGSUSED */ 10713 static int 10714 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10715 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10716 caller_context_t *ct) 10717 { 10718 rnode4_t *rp; 10719 int error = 0; 10720 mntinfo4_t *mi; 10721 10722 mi = VTOMI4(vp); 10723 rp = VTOR4(vp); 10724 10725 if (nfs_zone() != mi->mi_zone) 10726 return (EIO); 10727 if (vp->v_flag & VNOMAP) 10728 return (ENOSYS); 10729 10730 /* 10731 * Don't need to update the open stream first, since this 10732 * mmap can't add any additional share access that isn't 10733 * already contained in the open stream (for the case where we 10734 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10735 * take into account os_mmap_read[write] counts). 10736 */ 10737 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10738 10739 if (vp->v_type == VREG) { 10740 /* 10741 * We need to retrieve the open stream and update the counts. 10742 * If there is no open stream here, something is wrong. 10743 */ 10744 nfs4_open_stream_t *osp = NULL; 10745 nfs4_open_owner_t *oop = NULL; 10746 10747 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10748 if (oop != NULL) { 10749 /* returns with 'os_sync_lock' held */ 10750 osp = find_open_stream(oop, rp); 10751 open_owner_rele(oop); 10752 } 10753 if (osp == NULL) { 10754 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10755 "nfs4_addmap: we should have an osp" 10756 "but we don't, so fail with EIO")); 10757 error = EIO; 10758 goto out; 10759 } 10760 10761 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10762 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10763 10764 /* 10765 * Update the map count in the open stream. 10766 * This is necessary in the case where we 10767 * open/mmap/close/, then the server reboots, and we 10768 * attempt to reopen. If the mmap doesn't add share 10769 * access then we send an invalid reopen with 10770 * access = NONE. 
10771 * 10772 * We need to specifically check each PROT_* so a mmap 10773 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10774 * read and write access. A simple comparison of prot 10775 * to ~PROT_WRITE to determine read access is insufficient 10776 * since prot can be |= with PROT_USER, etc. 10777 */ 10778 10779 /* 10780 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10781 */ 10782 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10783 osp->os_mmap_write += btopr(len); 10784 if (maxprot & PROT_READ) 10785 osp->os_mmap_read += btopr(len); 10786 if (maxprot & PROT_EXEC) 10787 osp->os_mmap_read += btopr(len); 10788 /* 10789 * Ensure that os_mmap_read gets incremented, even if 10790 * maxprot were to look like PROT_NONE. 10791 */ 10792 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10793 !(maxprot & PROT_EXEC)) 10794 osp->os_mmap_read += btopr(len); 10795 osp->os_mapcnt += btopr(len); 10796 mutex_exit(&osp->os_sync_lock); 10797 open_stream_rele(osp, rp); 10798 } 10799 10800 out: 10801 /* 10802 * If we got an error, then undo our 10803 * incrementing of 'r_mapcnt'. 
10804 */ 10805 10806 if (error) { 10807 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10808 ASSERT(rp->r_mapcnt >= 0); 10809 } 10810 return (error); 10811 } 10812 10813 /* ARGSUSED */ 10814 static int 10815 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10816 { 10817 10818 return (VTOR4(vp1) == VTOR4(vp2)); 10819 } 10820 10821 /* ARGSUSED */ 10822 static int 10823 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10824 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10825 caller_context_t *ct) 10826 { 10827 int rc; 10828 u_offset_t start, end; 10829 rnode4_t *rp; 10830 int error = 0, intr = INTR4(vp); 10831 nfs4_error_t e; 10832 10833 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10834 return (EIO); 10835 10836 /* check for valid cmd parameter */ 10837 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10838 return (EINVAL); 10839 10840 /* Verify l_type. */ 10841 switch (bfp->l_type) { 10842 case F_RDLCK: 10843 if (cmd != F_GETLK && !(flag & FREAD)) 10844 return (EBADF); 10845 break; 10846 case F_WRLCK: 10847 if (cmd != F_GETLK && !(flag & FWRITE)) 10848 return (EBADF); 10849 break; 10850 case F_UNLCK: 10851 intr = 0; 10852 break; 10853 10854 default: 10855 return (EINVAL); 10856 } 10857 10858 /* check the validity of the lock range */ 10859 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10860 return (rc); 10861 if (rc = flk_check_lock_data(start, end, MAXEND)) 10862 return (rc); 10863 10864 /* 10865 * If the filesystem is mounted using local locking, pass the 10866 * request off to the local locking code. 10867 */ 10868 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10869 if (cmd == F_SETLK || cmd == F_SETLKW) { 10870 /* 10871 * For complete safety, we should be holding 10872 * r_lkserlock. However, we can't call 10873 * nfs4_safelock and then fs_frlock while 10874 * holding r_lkserlock, so just invoke 10875 * nfs4_safelock and expect that this will 10876 * catch enough of the cases. 
10877 */ 10878 if (!nfs4_safelock(vp, bfp, cr)) 10879 return (EAGAIN); 10880 } 10881 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10882 } 10883 10884 rp = VTOR4(vp); 10885 10886 /* 10887 * Check whether the given lock request can proceed, given the 10888 * current file mappings. 10889 */ 10890 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10891 return (EINTR); 10892 if (cmd == F_SETLK || cmd == F_SETLKW) { 10893 if (!nfs4_safelock(vp, bfp, cr)) { 10894 rc = EAGAIN; 10895 goto done; 10896 } 10897 } 10898 10899 /* 10900 * Flush the cache after waiting for async I/O to finish. For new 10901 * locks, this is so that the process gets the latest bits from the 10902 * server. For unlocks, this is so that other clients see the 10903 * latest bits once the file has been unlocked. If currently dirty 10904 * pages can't be flushed, then don't allow a lock to be set. But 10905 * allow unlocks to succeed, to avoid having orphan locks on the 10906 * server. 10907 */ 10908 if (cmd != F_GETLK) { 10909 mutex_enter(&rp->r_statelock); 10910 while (rp->r_count > 0) { 10911 if (intr) { 10912 klwp_t *lwp = ttolwp(curthread); 10913 10914 if (lwp != NULL) 10915 lwp->lwp_nostop++; 10916 if (cv_wait_sig(&rp->r_cv, 10917 &rp->r_statelock) == 0) { 10918 if (lwp != NULL) 10919 lwp->lwp_nostop--; 10920 rc = EINTR; 10921 break; 10922 } 10923 if (lwp != NULL) 10924 lwp->lwp_nostop--; 10925 } else 10926 cv_wait(&rp->r_cv, &rp->r_statelock); 10927 } 10928 mutex_exit(&rp->r_statelock); 10929 if (rc != 0) 10930 goto done; 10931 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 10932 if (error) { 10933 if (error == ENOSPC || error == EDQUOT) { 10934 mutex_enter(&rp->r_statelock); 10935 if (!rp->r_error) 10936 rp->r_error = error; 10937 mutex_exit(&rp->r_statelock); 10938 } 10939 if (bfp->l_type != F_UNLCK) { 10940 rc = ENOLCK; 10941 goto done; 10942 } 10943 } 10944 } 10945 10946 /* 10947 * Call the lock manager to do the real work of contacting 10948 * the 
server and obtaining the lock. 10949 */ 10950 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10951 cr, &e, NULL, NULL); 10952 rc = e.error; 10953 10954 if (rc == 0) 10955 nfs4_lockcompletion(vp, cmd); 10956 10957 done: 10958 nfs_rw_exit(&rp->r_lkserlock); 10959 10960 return (rc); 10961 } 10962 10963 /* 10964 * Free storage space associated with the specified vnode. The portion 10965 * to be freed is specified by bfp->l_start and bfp->l_len (already 10966 * normalized to a "whence" of 0). 10967 * 10968 * This is an experimental facility whose continued existence is not 10969 * guaranteed. Currently, we only support the special case 10970 * of l_len == 0, meaning free to end of file. 10971 */ 10972 /* ARGSUSED */ 10973 static int 10974 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10975 offset_t offset, cred_t *cr, caller_context_t *ct) 10976 { 10977 int error; 10978 10979 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10980 return (EIO); 10981 ASSERT(vp->v_type == VREG); 10982 if (cmd != F_FREESP) 10983 return (EINVAL); 10984 10985 error = convoff(vp, bfp, 0, offset); 10986 if (!error) { 10987 ASSERT(bfp->l_start >= 0); 10988 if (bfp->l_len == 0) { 10989 struct vattr va; 10990 10991 va.va_mask = AT_SIZE; 10992 va.va_size = bfp->l_start; 10993 error = nfs4setattr(vp, &va, 0, cr, NULL); 10994 } else 10995 error = EINVAL; 10996 } 10997 10998 return (error); 10999 } 11000 11001 /* ARGSUSED */ 11002 int 11003 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 11004 { 11005 rnode4_t *rp; 11006 rp = VTOR4(vp); 11007 11008 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 11009 vp = RTOV4(rp); 11010 } 11011 *vpp = vp; 11012 return (0); 11013 } 11014 11015 /* 11016 * Setup and add an address space callback to do the work of the delmap call. 11017 * The callback will (and must be) deleted in the actual callback function. 
11018 * 11019 * This is done in order to take care of the problem that we have with holding 11020 * the address space's a_lock for a long period of time (e.g. if the NFS server 11021 * is down). Callbacks will be executed in the address space code while the 11022 * a_lock is not held. Holding the address space's a_lock causes things such 11023 * as ps and fork to hang because they are trying to acquire this lock as well. 11024 */ 11025 /* ARGSUSED */ 11026 static int 11027 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 11028 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 11029 caller_context_t *ct) 11030 { 11031 int caller_found; 11032 int error; 11033 rnode4_t *rp; 11034 nfs4_delmap_args_t *dmapp; 11035 nfs4_delmapcall_t *delmap_call; 11036 11037 if (vp->v_flag & VNOMAP) 11038 return (ENOSYS); 11039 11040 /* 11041 * A process may not change zones if it has NFS pages mmap'ed 11042 * in, so we can't legitimately get here from the wrong zone. 11043 */ 11044 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11045 11046 rp = VTOR4(vp); 11047 11048 /* 11049 * The way that the address space of this process deletes its mapping 11050 * of this file is via the following call chains: 11051 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11052 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11053 * 11054 * With the use of address space callbacks we are allowed to drop the 11055 * address space lock, a_lock, while executing the NFS operations that 11056 * need to go over the wire. Returning EAGAIN to the caller of this 11057 * function is what drives the execution of the callback that we add 11058 * below. The callback will be executed by the address space code 11059 * after dropping the a_lock. 
When the callback is finished, since 11060 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11061 * is called again on the same segment to finish the rest of the work 11062 * that needs to happen during unmapping. 11063 * 11064 * This action of calling back into the segment driver causes 11065 * nfs4_delmap() to get called again, but since the callback was 11066 * already executed at this point, it already did the work and there 11067 * is nothing left for us to do. 11068 * 11069 * To Summarize: 11070 * - The first time nfs4_delmap is called by the current thread is when 11071 * we add the caller associated with this delmap to the delmap caller 11072 * list, add the callback, and return EAGAIN. 11073 * - The second time in this call chain when nfs4_delmap is called we 11074 * will find this caller in the delmap caller list and realize there 11075 * is no more work to do thus removing this caller from the list and 11076 * returning the error that was set in the callback execution. 11077 */ 11078 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11079 if (caller_found) { 11080 /* 11081 * 'error' is from the actual delmap operations. To avoid 11082 * hangs, we need to handle the return of EAGAIN differently 11083 * since this is what drives the callback execution. 11084 * In this case, we don't want to return EAGAIN and do the 11085 * callback execution because there are none to execute. 
11086 */ 11087 if (error == EAGAIN) 11088 return (0); 11089 else 11090 return (error); 11091 } 11092 11093 /* current caller was not in the list */ 11094 delmap_call = nfs4_init_delmapcall(); 11095 11096 mutex_enter(&rp->r_statelock); 11097 list_insert_tail(&rp->r_indelmap, delmap_call); 11098 mutex_exit(&rp->r_statelock); 11099 11100 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11101 11102 dmapp->vp = vp; 11103 dmapp->off = off; 11104 dmapp->addr = addr; 11105 dmapp->len = len; 11106 dmapp->prot = prot; 11107 dmapp->maxprot = maxprot; 11108 dmapp->flags = flags; 11109 dmapp->cr = cr; 11110 dmapp->caller = delmap_call; 11111 11112 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11113 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11114 11115 return (error ? error : EAGAIN); 11116 } 11117 11118 static nfs4_delmapcall_t * 11119 nfs4_init_delmapcall() 11120 { 11121 nfs4_delmapcall_t *delmap_call; 11122 11123 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11124 delmap_call->call_id = curthread; 11125 delmap_call->error = 0; 11126 11127 return (delmap_call); 11128 } 11129 11130 static void 11131 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11132 { 11133 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11134 } 11135 11136 /* 11137 * Searches for the current delmap caller (based on curthread) in the list of 11138 * callers. If it is found, we remove it and free the delmap caller. 11139 * Returns: 11140 * 0 if the caller wasn't found 11141 * 1 if the caller was found, removed and freed. *errp will be set 11142 * to what the result of the delmap was. 11143 */ 11144 static int 11145 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11146 { 11147 nfs4_delmapcall_t *delmap_call; 11148 11149 /* 11150 * If the list doesn't exist yet, we create it and return 11151 * that the caller wasn't found. No list = no callers. 
/*
 * Address space callback that does the real work of a delmap; it is
 * registered by nfs4_delmap() and receives the nfs4_delmap_args_t that
 * nfs4_delmap() filled in.
 *
 * The rnode's count of mapped pages is reduced by the size of the range
 * being unmapped.  If the vnode has pages, the file system is not
 * read-only, and the mapping was shared and could have been writable,
 * the range is flushed and committed.  For a regular file the close
 * logic is then given the chance to run.  Any failure is queued as a
 * fact and stored in the caller record so that the second pass through
 * nfs4_delmap() can return it.
 *
 * Before returning, the callback deletes itself from the address space
 * and frees its argument structure.
 */
/* ARGSUSED */
static void
nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
{
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;

	rp = VTOR4(dmapp->vp);
	mi = VTOMI4(dmapp->vp);

	/* Drop the mapped page count by the number of pages in the range. */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush and potential commit if there are
	 * pages, the file system was not mounted readonly, the segment
	 * was mapped shared, and the pages themselves were writeable.
	 */
	if (nfs4_has_pages(dmapp->vp) &&
	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);
		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
		    dmapp->len, dmapp->cr);
		if (!e.error) {
			/*
			 * The flush itself succeeded; pick up (and clear)
			 * any error recorded on the rnode instead.
			 */
			mutex_enter(&rp->r_statelock);
			e.error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		e.error = 0;

	/* With direct I/O in effect, also invalidate the pages in the range. */
	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr, NULL);

	if (e.error) {
		/* Report the flush/commit failure and save it for the caller. */
		e.stat = puterrno4(e.error);
		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
		    OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
		dmapp->caller->error = e.error;
	}

	/* Check to see if we need to close the file */

	if (dmapp->vp->v_type == VREG) {
		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);

		if (e.error != 0 || e.stat != NFS4_OK) {
			/*
			 * Since it is possible that e.error == 0 and
			 * e.stat != NFS4_OK (and vice versa),
			 * we do the proper checking in order to get both
			 * e.error and e.stat reporting the correct info.
			 */
			if (e.stat == NFS4_OK)
				e.stat = puterrno4(e.error);
			if (e.error == 0)
				e.error = geterrno4(e.stat);

			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
			dmapp->caller->error = e.error;
		}
	}

	/* The callback must remove itself and release its arguments. */
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
}
/*
 * Return the number of bits needed to represent ll, i.e. the position
 * (counting from 1) of its most significant set bit, or 0 when ll is 0.
 */
static uint_t
fattr4_maxfilesize_to_bits(uint64_t ll)
{
	uint_t nbits;
	uint_t half;

	if (ll == 0)
		return (0);

	/*
	 * Binary search for the top set bit: whenever anything is set
	 * above the low "half" bits, account for those bits and discard
	 * them, then repeat with half the width.
	 */
	nbits = 1;
	for (half = 32; half != 0; half >>= 1) {
		if ((ll >> half) != 0) {
			nbits += half;
			ll >>= half;
		}
	}

	return (nbits);
}
determining whether generic user attributes exists. 11339 * The attribute directory could only be a transient directory 11340 * used for Solaris sysattr support. Do a small readdir 11341 * to verify if the only entries are sysattrs or not. 11342 * 11343 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11344 * is NULL. Once the xadir vp exists, we can create xattrs, 11345 * and we don't have any way to update the "base" object's 11346 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11347 * could help out. 11348 */ 11349 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11350 rp->r_xattr_dir == NULL) { 11351 return (nfs4_have_xattrs(vp, valp, cr)); 11352 } 11353 } else { /* OLD CODE */ 11354 if (ATTRCACHE4_VALID(vp)) { 11355 mutex_enter(&rp->r_statelock); 11356 if (rp->r_pathconf.pc4_cache_valid) { 11357 error = 0; 11358 switch (cmd) { 11359 case _PC_FILESIZEBITS: 11360 *valp = 11361 rp->r_pathconf.pc4_filesizebits; 11362 break; 11363 case _PC_LINK_MAX: 11364 *valp = 11365 rp->r_pathconf.pc4_link_max; 11366 break; 11367 case _PC_NAME_MAX: 11368 *valp = 11369 rp->r_pathconf.pc4_name_max; 11370 break; 11371 case _PC_CHOWN_RESTRICTED: 11372 *valp = 11373 rp->r_pathconf.pc4_chown_restricted; 11374 break; 11375 case _PC_NO_TRUNC: 11376 *valp = 11377 rp->r_pathconf.pc4_no_trunc; 11378 break; 11379 default: 11380 error = EINVAL; 11381 break; 11382 } 11383 mutex_exit(&rp->r_statelock); 11384 #ifdef DEBUG 11385 nfs4_pathconf_cache_hits++; 11386 #endif 11387 return (error); 11388 } 11389 mutex_exit(&rp->r_statelock); 11390 } 11391 } 11392 #ifdef DEBUG 11393 nfs4_pathconf_cache_misses++; 11394 #endif 11395 11396 t = gethrtime(); 11397 11398 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11399 11400 if (error) { 11401 mutex_enter(&rp->r_statelock); 11402 rp->r_pathconf.pc4_cache_valid = FALSE; 11403 rp->r_pathconf.pc4_xattr_valid = FALSE; 11404 mutex_exit(&rp->r_statelock); 11405 return (error); 11406 } 11407 11408 /* interpret 
the max filesize */ 11409 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11410 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11411 11412 /* Store the attributes we just received */ 11413 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11414 11415 switch (cmd) { 11416 case _PC_FILESIZEBITS: 11417 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11418 break; 11419 case _PC_LINK_MAX: 11420 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11421 break; 11422 case _PC_NAME_MAX: 11423 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11424 break; 11425 case _PC_CHOWN_RESTRICTED: 11426 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11427 break; 11428 case _PC_NO_TRUNC: 11429 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11430 break; 11431 case _PC_XATTR_EXISTS: 11432 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11433 if (error = nfs4_have_xattrs(vp, valp, cr)) 11434 return (error); 11435 } 11436 break; 11437 default: 11438 return (EINVAL); 11439 } 11440 11441 return (0); 11442 } 11443 11444 /* 11445 * Called by async thread to do synchronous pageio. Do the i/o, wait 11446 * for it to complete, and cleanup the page list when done. 11447 */ 11448 static int 11449 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11450 int flags, cred_t *cr) 11451 { 11452 int error; 11453 11454 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11455 11456 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11457 if (flags & B_READ) 11458 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11459 else 11460 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11461 return (error); 11462 } 11463 11464 /* ARGSUSED */ 11465 static int 11466 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11467 int flags, cred_t *cr, caller_context_t *ct) 11468 { 11469 int error; 11470 rnode4_t *rp; 11471 11472 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11473 return (EIO); 11474 11475 if (pp == NULL) 11476 return (EINVAL); 11477 11478 rp = VTOR4(vp); 11479 mutex_enter(&rp->r_statelock); 11480 rp->r_count++; 11481 mutex_exit(&rp->r_statelock); 11482 11483 if (flags & B_ASYNC) { 11484 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11485 nfs4_sync_pageio); 11486 } else 11487 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11488 mutex_enter(&rp->r_statelock); 11489 rp->r_count--; 11490 cv_broadcast(&rp->r_cv); 11491 mutex_exit(&rp->r_statelock); 11492 return (error); 11493 } 11494 11495 /* ARGSUSED */ 11496 static void 11497 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11498 caller_context_t *ct) 11499 { 11500 int error; 11501 rnode4_t *rp; 11502 page_t *plist; 11503 page_t *pptr; 11504 offset3 offset; 11505 count3 len; 11506 k_sigset_t smask; 11507 11508 /* 11509 * We should get called with fl equal to either B_FREE or 11510 * B_INVAL. Any other value is illegal. 11511 * 11512 * The page that we are either supposed to free or destroy 11513 * should be exclusive locked and its io lock should not 11514 * be held. 11515 */ 11516 ASSERT(fl == B_FREE || fl == B_INVAL); 11517 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11518 11519 rp = VTOR4(vp); 11520 11521 /* 11522 * If the page doesn't need to be committed or we shouldn't 11523 * even bother attempting to commit it, then just make sure 11524 * that the p_fsdata byte is clear and then either free or 11525 * destroy the page as appropriate. 
/*
 * VOP_DISPOSE for NFSv4: free (fl == B_FREE) or destroy (fl == B_INVAL)
 * the page pp of vp.
 *
 * A page that needs no commit is freed or destroyed right away.  A page
 * that still needs a commit is either left for a later commit (it is
 * just unlocked) or is committed here, together with every other
 * committable page of the file that can be gathered, before it is freed
 * or destroyed.  When called by pageout, fsflush, or from a zone other
 * than the mount's zone, the commit is handed to the async machinery
 * instead of being issued from this thread.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of committing
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.   This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want to do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			/* Promote to "commit needed" and leave it for later. */
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * pageout, fsflush, and callers in a zone other than the mount's
	 * zone hand the gathered pages to the async commit path rather
	 * than issuing the commit from this thread.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are no longer
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, needs to
	 * be rewritten so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
/*
 * Send a COMMIT for the given byte range of vp to the server.
 *
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *      PUTFH(file), COMMIT
 *
 * The request is retried with a different over-the-wire credential when
 * it fails with EACCES and more credentials remain to be tried, and it
 * is reissued after recovery has been started when the failure calls
 * for recovery.
 *
 * Returns 0 when the server's write verifier matches the one saved in
 * the rnode.  Returns NFS_VERF_MISMATCH when it differs; in that case
 * the pages have been marked via nfs4_set_mod() and the new verifier
 * saved.  Otherwise returns an errno; an error derived from the reply
 * status is also recorded in r_error if none was recorded yet.
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		/*
		 * Plain failure.  Give up unless it was EACCES and there
		 * is still another credential to try.
		 */
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			/* Free the reply only when one was decoded. */
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* Verifier unchanged: the commit is good. */
			mutex_exit(&rp->r_statelock);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * Verifier changed: mark the pages, remember the new
		 * verifier, and tell the caller about the mismatch.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
11806 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11807 &recov_state, needrecov); 11808 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11809 goto get_commit_cred; 11810 } 11811 /* 11812 * Can't do a nfs4_purge_stale_fh here because this 11813 * can cause a deadlock. nfs4_commit can 11814 * be called from nfs4_dispose which can be called 11815 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11816 * can call back to pvn_vplist_dirty. 11817 */ 11818 if (e.error == ESTALE) { 11819 mutex_enter(&rp->r_statelock); 11820 rp->r_flags |= R4STALE; 11821 if (!rp->r_error) 11822 rp->r_error = e.error; 11823 mutex_exit(&rp->r_statelock); 11824 PURGE_ATTRCACHE4(vp); 11825 } else { 11826 mutex_enter(&rp->r_statelock); 11827 if (!rp->r_error) 11828 rp->r_error = e.error; 11829 mutex_exit(&rp->r_statelock); 11830 } 11831 } else { 11832 ASSERT(rp->r_flags & R4HAVEVERF); 11833 resop = &res.array[1]; /* commit res */ 11834 cm_res = &resop->nfs_resop4_u.opcommit; 11835 mutex_enter(&rp->r_statelock); 11836 if (cm_res->writeverf == rp->r_writeverf) { 11837 mutex_exit(&rp->r_statelock); 11838 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11839 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11840 &recov_state, needrecov); 11841 crfree(cred_otw); 11842 if (osp != NULL) 11843 open_stream_rele(osp, rp); 11844 return (0); 11845 } 11846 nfs4_set_mod(vp); 11847 rp->r_writeverf = cm_res->writeverf; 11848 mutex_exit(&rp->r_statelock); 11849 e.error = NFS_VERF_MISMATCH; 11850 } 11851 11852 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11853 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11854 crfree(cred_otw); 11855 if (osp != NULL) 11856 open_stream_rele(osp, rp); 11857 11858 return (e.error); 11859 } 11860 11861 static void 11862 nfs4_set_mod(vnode_t *vp) 11863 { 11864 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11865 11866 /* make sure we're looking at the master vnode, not a shadow */ 11867 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), 
/*
 * This function is used to gather a page list of the pages which
 * can be committed on the server.
 *
 * The calling thread must have set R4COMMIT.  This bit is used to
 * serialize access to the commit structure in the rnode.  As long
 * as the thread has set R4COMMIT, then it can manipulate the commit
 * structure without requiring any other locks.
 *
 * When this function is called from nfs4_dispose() the page passed
 * into nfs4_dispose() will be SE_EXCL locked, and so this function
 * will skip it.  This is not a problem since we initially add the
 * page to the r_commit page list.
 *
 * Every page gathered is left SE_EXCL locked on r_commit.c_pages, and
 * r_commit.c_commbase/c_commlen are widened to span the gathered pages.
 */
static void
nfs4_get_commit(vnode_t *vp)
{
	rnode4_t *rp;
	page_t *pp;
	kmutex_t *vphm;

	rp = VTOR4(vp);

	ASSERT(rp->r_flags & R4COMMIT);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Step through all of the pages associated with this vnode
	 * looking for pages which need to be committed.
	 *
	 * Note that each `continue' below jumps to the loop condition,
	 * which is what advances pp to the next page.
	 */
	do {
		/* Skip marker pages. */
		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
			continue;

		/*
		 * First short-cut everything (without the page_lock)
		 * and see if this page does not need to be committed
		 * or is modified if so then we'll just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
			continue;

		/*
		 * Attempt to lock the page.  If we can't, then
		 * someone else is messing with it or we have been
		 * called from nfs4_dispose and this is the page that
		 * nfs4_dispose was called with.. anyway just skip it.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;

		/*
		 * Lets check again now that we have the page lock.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		/* this had better not be a free page */
		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_commit.c_pages.  The range is started by the
		 * first page, grown downward for a page below the
		 * current base, and grown upward for a page at or
		 * beyond the current end.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else if (pp->p_offset < rp->r_commit.c_commbase) {
			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
		    <= pp->p_offset) {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	} while ((pp = pp->p_vpnext) != vp->v_pages);

	mutex_exit(vphm);
}
11982 */ 11983 static void 11984 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11985 { 11986 11987 rnode4_t *rp; 11988 page_t *pp; 11989 u_offset_t end; 11990 u_offset_t off; 11991 ASSERT(len != 0); 11992 rp = VTOR4(vp); 11993 ASSERT(rp->r_flags & R4COMMIT); 11994 11995 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11996 11997 /* make sure we're looking at the master vnode, not a shadow */ 11998 11999 if (IS_SHADOW(vp, rp)) 12000 vp = RTOV4(rp); 12001 12002 /* 12003 * If there are no pages associated with this vnode, then 12004 * just return. 12005 */ 12006 if ((pp = vp->v_pages) == NULL) 12007 return; 12008 /* 12009 * Calculate the ending offset. 12010 */ 12011 end = soff + len; 12012 for (off = soff; off < end; off += PAGESIZE) { 12013 /* 12014 * Lookup each page by vp, offset. 12015 */ 12016 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 12017 continue; 12018 /* 12019 * If this page does not need to be committed or is 12020 * modified, then just skip it. 12021 */ 12022 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 12023 page_unlock(pp); 12024 continue; 12025 } 12026 12027 ASSERT(PP_ISFREE(pp) == 0); 12028 /* 12029 * The page needs to be committed and we locked it. 12030 * Update the base and length parameters and add it 12031 * to r_pages. 12032 */ 12033 if (rp->r_commit.c_pages == NULL) { 12034 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12035 rp->r_commit.c_commlen = PAGESIZE; 12036 } else { 12037 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12038 rp->r_commit.c_commbase + PAGESIZE; 12039 } 12040 page_add(&rp->r_commit.c_pages, pp); 12041 } 12042 } 12043 12044 /* 12045 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 12046 * Flushes and commits data to the server. 
12047 */ 12048 static int 12049 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 12050 { 12051 int error; 12052 verifier4 write_verf; 12053 rnode4_t *rp = VTOR4(vp); 12054 12055 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12056 12057 /* 12058 * Flush the data portion of the file and then commit any 12059 * portions which need to be committed. This may need to 12060 * be done twice if the server has changed state since 12061 * data was last written. The data will need to be 12062 * rewritten to the server and then a new commit done. 12063 * 12064 * In fact, this may need to be done several times if the 12065 * server is having problems and crashing while we are 12066 * attempting to do this. 12067 */ 12068 12069 top: 12070 /* 12071 * Do a flush based on the poff and plen arguments. This 12072 * will synchronously write out any modified pages in the 12073 * range specified by (poff, plen). This starts all of the 12074 * i/o operations which will be waited for in the next 12075 * call to nfs4_putpage 12076 */ 12077 12078 mutex_enter(&rp->r_statelock); 12079 write_verf = rp->r_writeverf; 12080 mutex_exit(&rp->r_statelock); 12081 12082 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12083 if (error == EAGAIN) 12084 error = 0; 12085 12086 /* 12087 * Do a flush based on the poff and plen arguments. This 12088 * will synchronously write out any modified pages in the 12089 * range specified by (poff, plen) and wait until all of 12090 * the asynchronous i/o's in that range are done as well. 12091 */ 12092 if (!error) 12093 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12094 12095 if (error) 12096 return (error); 12097 12098 mutex_enter(&rp->r_statelock); 12099 if (rp->r_writeverf != write_verf) { 12100 mutex_exit(&rp->r_statelock); 12101 goto top; 12102 } 12103 mutex_exit(&rp->r_statelock); 12104 12105 /* 12106 * Now commit any pages which might need to be committed. 
12107 * If the error, NFS_VERF_MISMATCH, is returned, then 12108 * start over with the flush operation. 12109 */ 12110 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12111 12112 if (error == NFS_VERF_MISMATCH) 12113 goto top; 12114 12115 return (error); 12116 } 12117 12118 /* 12119 * nfs4_commit_vp() will wait for other pending commits and 12120 * will either commit the whole file or a range, plen dictates 12121 * if we commit whole file. a value of zero indicates the whole 12122 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 12123 */ 12124 static int 12125 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12126 cred_t *cr, int wait_on_writes) 12127 { 12128 rnode4_t *rp; 12129 page_t *plist; 12130 offset3 offset; 12131 count3 len; 12132 12133 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12134 12135 rp = VTOR4(vp); 12136 12137 /* 12138 * before we gather commitable pages make 12139 * sure there are no outstanding async writes 12140 */ 12141 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12142 mutex_enter(&rp->r_statelock); 12143 while (rp->r_count > 0) { 12144 cv_wait(&rp->r_cv, &rp->r_statelock); 12145 } 12146 mutex_exit(&rp->r_statelock); 12147 } 12148 12149 /* 12150 * Set the `commit inprogress' state bit. We must 12151 * first wait until any current one finishes. 12152 */ 12153 mutex_enter(&rp->r_statelock); 12154 while (rp->r_flags & R4COMMIT) { 12155 rp->r_flags |= R4COMMITWAIT; 12156 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12157 rp->r_flags &= ~R4COMMITWAIT; 12158 } 12159 rp->r_flags |= R4COMMIT; 12160 mutex_exit(&rp->r_statelock); 12161 12162 /* 12163 * Gather all of the pages which need to be 12164 * committed. 12165 */ 12166 if (plen == 0) 12167 nfs4_get_commit(vp); 12168 else 12169 nfs4_get_commit_range(vp, poff, plen); 12170 12171 /* 12172 * Clear the `commit inprogress' bit and disconnect the 12173 * page list which was gathered by nfs4_get_commit. 
12174 */ 12175 plist = rp->r_commit.c_pages; 12176 rp->r_commit.c_pages = NULL; 12177 offset = rp->r_commit.c_commbase; 12178 len = rp->r_commit.c_commlen; 12179 mutex_enter(&rp->r_statelock); 12180 rp->r_flags &= ~R4COMMIT; 12181 cv_broadcast(&rp->r_commit.c_cv); 12182 mutex_exit(&rp->r_statelock); 12183 12184 /* 12185 * If any pages need to be committed, commit them and 12186 * then unlock them so that they can be freed some 12187 * time later. 12188 */ 12189 if (plist == NULL) 12190 return (0); 12191 12192 /* 12193 * No error occurred during the flush portion 12194 * of this operation, so now attempt to commit 12195 * the data to stable storage on the server. 12196 * 12197 * This will unlock all of the pages on the list. 12198 */ 12199 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12200 } 12201 12202 static int 12203 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12204 cred_t *cr) 12205 { 12206 int error; 12207 page_t *pp; 12208 12209 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12210 12211 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 12212 12213 /* 12214 * If we got an error, then just unlock all of the pages 12215 * on the list. 12216 */ 12217 if (error) { 12218 while (plist != NULL) { 12219 pp = plist; 12220 page_sub(&plist, pp); 12221 page_unlock(pp); 12222 } 12223 return (error); 12224 } 12225 /* 12226 * We've tried as hard as we can to commit the data to stable 12227 * storage on the server. We just unlock the pages and clear 12228 * the commit required state. They will get freed later. 
12229 */ 12230 while (plist != NULL) { 12231 pp = plist; 12232 page_sub(&plist, pp); 12233 pp->p_fsdata = C_NOCOMMIT; 12234 page_unlock(pp); 12235 } 12236 12237 return (error); 12238 } 12239 12240 static void 12241 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12242 cred_t *cr) 12243 { 12244 12245 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12246 } 12247 12248 /*ARGSUSED*/ 12249 static int 12250 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12251 caller_context_t *ct) 12252 { 12253 int error = 0; 12254 mntinfo4_t *mi; 12255 vattr_t va; 12256 vsecattr_t nfsace4_vsap; 12257 12258 mi = VTOMI4(vp); 12259 if (nfs_zone() != mi->mi_zone) 12260 return (EIO); 12261 if (mi->mi_flags & MI4_ACL) { 12262 /* if we have a delegation, return it */ 12263 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12264 (void) nfs4delegreturn(VTOR4(vp), 12265 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12266 12267 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12268 NFS4_ACL_SET); 12269 if (error) /* EINVAL */ 12270 return (error); 12271 12272 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12273 /* 12274 * These are aclent_t type entries. 12275 */ 12276 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12277 vp->v_type == VDIR, FALSE); 12278 if (error) 12279 return (error); 12280 } else { 12281 /* 12282 * These are ace_t type entries. 
12283 */ 12284 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12285 FALSE); 12286 if (error) 12287 return (error); 12288 } 12289 bzero(&va, sizeof (va)); 12290 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12291 vs_ace4_destroy(&nfsace4_vsap); 12292 return (error); 12293 } 12294 return (ENOSYS); 12295 } 12296 12297 /* ARGSUSED */ 12298 int 12299 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12300 caller_context_t *ct) 12301 { 12302 int error; 12303 mntinfo4_t *mi; 12304 nfs4_ga_res_t gar; 12305 rnode4_t *rp = VTOR4(vp); 12306 12307 mi = VTOMI4(vp); 12308 if (nfs_zone() != mi->mi_zone) 12309 return (EIO); 12310 12311 bzero(&gar, sizeof (gar)); 12312 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12313 12314 /* 12315 * vsecattr->vsa_mask holds the original acl request mask. 12316 * This is needed when determining what to return. 12317 * (See: nfs4_create_getsecattr_return()) 12318 */ 12319 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12320 if (error) /* EINVAL */ 12321 return (error); 12322 12323 /* 12324 * If this is a referral stub, don't try to go OTW for an ACL 12325 */ 12326 if (RP_ISSTUB_REFERRAL(VTOR4(vp))) 12327 return (fs_fab_acl(vp, vsecattr, flag, cr, ct)); 12328 12329 if (mi->mi_flags & MI4_ACL) { 12330 /* 12331 * Check if the data is cached and the cache is valid. If it 12332 * is we don't go over the wire. 12333 */ 12334 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12335 mutex_enter(&rp->r_statelock); 12336 if (rp->r_secattr != NULL) { 12337 error = nfs4_create_getsecattr_return( 12338 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12339 rp->r_attr.va_gid, 12340 vp->v_type == VDIR); 12341 if (!error) { /* error == 0 - Success! 
*/ 12342 mutex_exit(&rp->r_statelock); 12343 return (error); 12344 } 12345 } 12346 mutex_exit(&rp->r_statelock); 12347 } 12348 12349 /* 12350 * The getattr otw call will always get both the acl, in 12351 * the form of a list of nfsace4's, and the number of acl 12352 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 12353 */ 12354 gar.n4g_va.va_mask = AT_ALL; 12355 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12356 if (error) { 12357 vs_ace4_destroy(&gar.n4g_vsa); 12358 if (error == ENOTSUP || error == EOPNOTSUPP) 12359 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12360 return (error); 12361 } 12362 12363 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12364 /* 12365 * No error was returned, but according to the response 12366 * bitmap, neither was an acl. 12367 */ 12368 vs_ace4_destroy(&gar.n4g_vsa); 12369 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12370 return (error); 12371 } 12372 12373 /* 12374 * Update the cache with the ACL. 12375 */ 12376 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12377 12378 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12379 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12380 vp->v_type == VDIR); 12381 vs_ace4_destroy(&gar.n4g_vsa); 12382 if ((error) && (vsecattr->vsa_mask & 12383 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12384 (error != EACCES)) { 12385 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12386 } 12387 return (error); 12388 } 12389 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12390 return (error); 12391 } 12392 12393 /* 12394 * The function returns: 12395 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12396 * - EINVAL if the passed in "acl_mask" is an invalid request. 12397 * 12398 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12399 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12400 * 12401 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12402 * - We have a mixture of ACE and ACL requests (e.g. 
VSA_ACL | VSA_ACE) 12403 * - We have a count field set without the corresponding acl field set. (e.g. - 12404 * VSA_ACECNT is set, but VSA_ACE is not) 12405 */ 12406 static int 12407 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12408 { 12409 /* Shortcut the masks that are always valid. */ 12410 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12411 return (0); 12412 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12413 return (0); 12414 12415 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12416 /* 12417 * We can't have any VSA_ACL type stuff in the mask now. 12418 */ 12419 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12420 VSA_DFACLCNT)) 12421 return (EINVAL); 12422 12423 if (op == NFS4_ACL_SET) { 12424 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12425 return (EINVAL); 12426 } 12427 } 12428 12429 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12430 /* 12431 * We can't have any VSA_ACE type stuff in the mask now. 12432 */ 12433 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12434 return (EINVAL); 12435 12436 if (op == NFS4_ACL_SET) { 12437 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12438 return (EINVAL); 12439 12440 if ((acl_mask & VSA_DFACLCNT) && 12441 !(acl_mask & VSA_DFACL)) 12442 return (EINVAL); 12443 } 12444 } 12445 return (0); 12446 } 12447 12448 /* 12449 * The theory behind creating the correct getsecattr return is simply this: 12450 * "Don't return anything that the caller is not expecting to have to free." 12451 */ 12452 static int 12453 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12454 uid_t uid, gid_t gid, int isdir) 12455 { 12456 int error = 0; 12457 /* Save the mask since the translators modify it. 
*/ 12458 uint_t orig_mask = vsap->vsa_mask; 12459 12460 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12461 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE); 12462 12463 if (error) 12464 return (error); 12465 12466 /* 12467 * If the caller only asked for the ace count (VSA_ACECNT) 12468 * don't give them the full acl (VSA_ACE), free it. 12469 */ 12470 if (!orig_mask & VSA_ACE) { 12471 if (vsap->vsa_aclentp != NULL) { 12472 kmem_free(vsap->vsa_aclentp, 12473 vsap->vsa_aclcnt * sizeof (ace_t)); 12474 vsap->vsa_aclentp = NULL; 12475 } 12476 } 12477 vsap->vsa_mask = orig_mask; 12478 12479 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12480 VSA_DFACLCNT)) { 12481 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12482 isdir, FALSE); 12483 12484 if (error) 12485 return (error); 12486 12487 /* 12488 * If the caller only asked for the acl count (VSA_ACLCNT) 12489 * and/or the default acl count (VSA_DFACLCNT) don't give them 12490 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 
12491 */ 12492 if (!orig_mask & VSA_ACL) { 12493 if (vsap->vsa_aclentp != NULL) { 12494 kmem_free(vsap->vsa_aclentp, 12495 vsap->vsa_aclcnt * sizeof (aclent_t)); 12496 vsap->vsa_aclentp = NULL; 12497 } 12498 } 12499 12500 if (!orig_mask & VSA_DFACL) { 12501 if (vsap->vsa_dfaclentp != NULL) { 12502 kmem_free(vsap->vsa_dfaclentp, 12503 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12504 vsap->vsa_dfaclentp = NULL; 12505 } 12506 } 12507 vsap->vsa_mask = orig_mask; 12508 } 12509 return (0); 12510 } 12511 12512 /* ARGSUSED */ 12513 int 12514 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12515 caller_context_t *ct) 12516 { 12517 int error; 12518 12519 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12520 return (EIO); 12521 /* 12522 * check for valid cmd parameter 12523 */ 12524 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12525 return (EINVAL); 12526 12527 /* 12528 * Check access permissions 12529 */ 12530 if ((cmd & F_SHARE) && 12531 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12532 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12533 return (EBADF); 12534 12535 /* 12536 * If the filesystem is mounted using local locking, pass the 12537 * request off to the local share code. 12538 */ 12539 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12540 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12541 12542 switch (cmd) { 12543 case F_SHARE: 12544 case F_UNSHARE: 12545 /* 12546 * This will be properly implemented later, 12547 * see RFE: 4823948 . 
12548 */ 12549 error = EAGAIN; 12550 break; 12551 12552 case F_HASREMOTELOCKS: 12553 /* 12554 * NFS client can't store remote locks itself 12555 */ 12556 shr->s_access = 0; 12557 error = 0; 12558 break; 12559 12560 default: 12561 error = EINVAL; 12562 break; 12563 } 12564 12565 return (error); 12566 } 12567 12568 /* 12569 * Common code called by directory ops to update the attrcache 12570 */ 12571 static int 12572 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12573 hrtime_t t, vnode_t *vp, cred_t *cr) 12574 { 12575 int error = 0; 12576 12577 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12578 12579 if (status != NFS4_OK) { 12580 /* getattr not done or failed */ 12581 PURGE_ATTRCACHE4(vp); 12582 return (error); 12583 } 12584 12585 if (garp) { 12586 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12587 } else { 12588 PURGE_ATTRCACHE4(vp); 12589 } 12590 return (error); 12591 } 12592 12593 /* 12594 * Update directory caches for directory modification ops (link, rename, etc.) 12595 * When dinfo is NULL, manage dircaches in the old way. 12596 */ 12597 static void 12598 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12599 dirattr_info_t *dinfo) 12600 { 12601 rnode4_t *drp = VTOR4(dvp); 12602 12603 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12604 12605 /* Purge rddir cache for dir since it changed */ 12606 if (drp->r_dir != NULL) 12607 nfs4_purge_rddir_cache(dvp); 12608 12609 /* 12610 * If caller provided dinfo, then use it to manage dir caches. 
12611 */ 12612 if (dinfo != NULL) { 12613 if (vp != NULL) { 12614 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12615 if (!VTOR4(vp)->created_v4) { 12616 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12617 dnlc_update(dvp, nm, vp); 12618 } else { 12619 /* 12620 * XXX don't update if the created_v4 flag is 12621 * set 12622 */ 12623 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12624 NFS4_DEBUG(nfs4_client_state_debug, 12625 (CE_NOTE, "nfs4_update_dircaches: " 12626 "don't update dnlc: created_v4 flag")); 12627 } 12628 } 12629 12630 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12631 dinfo->di_cred, FALSE, cinfo); 12632 12633 return; 12634 } 12635 12636 /* 12637 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12638 * Since caller modified dir but didn't receive post-dirmod-op dir 12639 * attrs, the dir's attrs must be purged. 12640 * 12641 * XXX this check and dnlc update/purge should really be atomic, 12642 * XXX but can't use rnode statelock because it'll deadlock in 12643 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12644 * XXX does occur. 12645 * 12646 * XXX We also may want to check that atomic is true in the 12647 * XXX change_info struct. If it is not, the change_info may 12648 * XXX reflect changes by more than one clients which means that 12649 * XXX our cache may not be valid. 
12650 */ 12651 PURGE_ATTRCACHE4(dvp); 12652 if (drp->r_change == cinfo->before) { 12653 /* no changes took place in the directory prior to our link */ 12654 if (vp != NULL) { 12655 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12656 if (!VTOR4(vp)->created_v4) { 12657 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12658 dnlc_update(dvp, nm, vp); 12659 } else { 12660 /* 12661 * XXX dont' update if the created_v4 flag 12662 * is set 12663 */ 12664 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12665 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12666 "nfs4_update_dircaches: don't" 12667 " update dnlc: created_v4 flag")); 12668 } 12669 } 12670 } else { 12671 /* Another client modified directory - purge its dnlc cache */ 12672 dnlc_purge_vp(dvp); 12673 } 12674 } 12675 12676 /* 12677 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12678 * file. 12679 * 12680 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12681 * file (ie: client recovery) and otherwise set to FALSE. 12682 * 12683 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12684 * initiated) calling functions. 12685 * 12686 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12687 * of resending a 'lost' open request. 12688 * 12689 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12690 * server that hands out BAD_SEQID on open confirm. 12691 * 12692 * Errors are returned via the nfs4_error_t parameter. 
12693 */ 12694 void 12695 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12696 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12697 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12698 { 12699 COMPOUND4args_clnt args; 12700 COMPOUND4res_clnt res; 12701 nfs_argop4 argop[2]; 12702 nfs_resop4 *resop; 12703 int doqueue = 1; 12704 mntinfo4_t *mi; 12705 OPEN_CONFIRM4args *open_confirm_args; 12706 int needrecov; 12707 12708 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12709 #if DEBUG 12710 mutex_enter(&oop->oo_lock); 12711 ASSERT(oop->oo_seqid_inuse); 12712 mutex_exit(&oop->oo_lock); 12713 #endif 12714 12715 recov_retry_confirm: 12716 nfs4_error_zinit(ep); 12717 *retry_open = FALSE; 12718 12719 if (resend) 12720 args.ctag = TAG_OPEN_CONFIRM_LOST; 12721 else 12722 args.ctag = TAG_OPEN_CONFIRM; 12723 12724 args.array_len = 2; 12725 args.array = argop; 12726 12727 /* putfh target fh */ 12728 argop[0].argop = OP_CPUTFH; 12729 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12730 12731 argop[1].argop = OP_OPEN_CONFIRM; 12732 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12733 12734 (*seqid) += 1; 12735 open_confirm_args->seqid = *seqid; 12736 open_confirm_args->open_stateid = *stateid; 12737 12738 mi = VTOMI4(vp); 12739 12740 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12741 12742 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12743 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12744 } 12745 12746 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12747 if (!needrecov && ep->error) 12748 return; 12749 12750 if (needrecov) { 12751 bool_t abort = FALSE; 12752 12753 if (reopening_file == FALSE) { 12754 nfs4_bseqid_entry_t *bsep = NULL; 12755 12756 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12757 bsep = nfs4_create_bseqid_entry(oop, NULL, 12758 vp, 0, args.ctag, 12759 open_confirm_args->seqid); 12760 12761 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 12762 NULL, NULL, 
OP_OPEN_CONFIRM, bsep, NULL, NULL); 12763 if (bsep) { 12764 kmem_free(bsep, sizeof (*bsep)); 12765 if (num_bseqid_retryp && 12766 --(*num_bseqid_retryp) == 0) 12767 abort = TRUE; 12768 } 12769 } 12770 if ((ep->error == ETIMEDOUT || 12771 res.status == NFS4ERR_RESOURCE) && 12772 abort == FALSE && resend == FALSE) { 12773 if (!ep->error) 12774 (void) xdr_free(xdr_COMPOUND4res_clnt, 12775 (caddr_t)&res); 12776 12777 delay(SEC_TO_TICK(confirm_retry_sec)); 12778 goto recov_retry_confirm; 12779 } 12780 /* State may have changed so retry the entire OPEN op */ 12781 if (abort == FALSE) 12782 *retry_open = TRUE; 12783 else 12784 *retry_open = FALSE; 12785 if (!ep->error) 12786 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12787 return; 12788 } 12789 12790 if (res.status) { 12791 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12792 return; 12793 } 12794 12795 resop = &res.array[1]; /* open confirm res */ 12796 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12797 stateid, sizeof (*stateid)); 12798 12799 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12800 } 12801 12802 /* 12803 * Return the credentials associated with a client state object. The 12804 * caller is responsible for freeing the credentials. 12805 */ 12806 12807 static cred_t * 12808 state_to_cred(nfs4_open_stream_t *osp) 12809 { 12810 cred_t *cr; 12811 12812 /* 12813 * It's ok to not lock the open stream and open owner to get 12814 * the oo_cred since this is only written once (upon creation) 12815 * and will not change. 12816 */ 12817 cr = osp->os_open_owner->oo_cred; 12818 crhold(cr); 12819 12820 return (cr); 12821 } 12822 12823 /* 12824 * nfs4_find_sysid 12825 * 12826 * Find the sysid for the knetconfig associated with the given mi. 
12827 */ 12828 static struct lm_sysid * 12829 nfs4_find_sysid(mntinfo4_t *mi) 12830 { 12831 ASSERT(nfs_zone() == mi->mi_zone); 12832 12833 /* 12834 * Switch from RDMA knconf to original mount knconf 12835 */ 12836 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12837 mi->mi_curr_serv->sv_hostname, NULL)); 12838 } 12839 12840 #ifdef DEBUG 12841 /* 12842 * Return a string version of the call type for easy reading. 12843 */ 12844 static char * 12845 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12846 { 12847 switch (ctype) { 12848 case NFS4_LCK_CTYPE_NORM: 12849 return ("NORMAL"); 12850 case NFS4_LCK_CTYPE_RECLAIM: 12851 return ("RECLAIM"); 12852 case NFS4_LCK_CTYPE_RESEND: 12853 return ("RESEND"); 12854 case NFS4_LCK_CTYPE_REINSTATE: 12855 return ("REINSTATE"); 12856 default: 12857 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12858 "type %d", ctype); 12859 return (""); 12860 } 12861 } 12862 #endif 12863 12864 /* 12865 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12866 * Unlock requests don't have an over-the-wire locktype, so we just return 12867 * something non-threatening. 12868 */ 12869 12870 static nfs_lock_type4 12871 flk_to_locktype(int cmd, int l_type) 12872 { 12873 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12874 12875 switch (l_type) { 12876 case F_UNLCK: 12877 return (READ_LT); 12878 case F_RDLCK: 12879 if (cmd == F_SETLK) 12880 return (READ_LT); 12881 else 12882 return (READW_LT); 12883 case F_WRLCK: 12884 if (cmd == F_SETLK) 12885 return (WRITE_LT); 12886 else 12887 return (WRITEW_LT); 12888 } 12889 panic("flk_to_locktype"); 12890 /*NOTREACHED*/ 12891 } 12892 12893 /* 12894 * Do some preliminary checks for nfs4frlock. 
12895 */ 12896 static int 12897 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12898 u_offset_t offset) 12899 { 12900 int error = 0; 12901 12902 /* 12903 * If we are setting a lock, check that the file is opened 12904 * with the correct mode. 12905 */ 12906 if (cmd == F_SETLK || cmd == F_SETLKW) { 12907 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12908 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12909 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12910 "nfs4frlock_validate_args: file was opened with " 12911 "incorrect mode")); 12912 return (EBADF); 12913 } 12914 } 12915 12916 /* Convert the offset. It may need to be restored before returning. */ 12917 if (error = convoff(vp, flk, 0, offset)) { 12918 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12919 "nfs4frlock_validate_args: convoff => error= %d\n", 12920 error)); 12921 return (error); 12922 } 12923 12924 return (error); 12925 } 12926 12927 /* 12928 * Set the flock64's lm_sysid for nfs4frlock. 12929 */ 12930 static int 12931 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12932 { 12933 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12934 12935 /* Find the lm_sysid */ 12936 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12937 12938 if (*lspp == NULL) { 12939 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12940 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12941 return (ENOLCK); 12942 } 12943 12944 flk->l_sysid = lm_sysidt(*lspp); 12945 12946 return (0); 12947 } 12948 12949 /* 12950 * Do the remaining preliminary setup for nfs4frlock. 12951 */ 12952 static void 12953 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12954 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12955 cred_t **cred_otw) 12956 { 12957 /* 12958 * set tick_delay to the base delay time. 
12959 * (NFS4_BASE_WAIT_TIME is in secs) 12960 */ 12961 12962 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12963 12964 /* 12965 * If lock is relative to EOF, we need the newest length of the 12966 * file. Therefore invalidate the ATTR_CACHE. 12967 */ 12968 12969 *whencep = flk->l_whence; 12970 12971 if (*whencep == 2) /* SEEK_END */ 12972 PURGE_ATTRCACHE4(vp); 12973 12974 recov_statep->rs_flags = 0; 12975 recov_statep->rs_num_retry_despite_err = 0; 12976 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12977 } 12978 12979 /* 12980 * Initialize and allocate the data structures necessary for 12981 * the nfs4frlock call. 12982 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 12983 */ 12984 static void 12985 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12986 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12987 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12988 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12989 { 12990 int argoplist_size; 12991 int num_ops = 2; 12992 12993 *retry = FALSE; 12994 *did_start_fop = FALSE; 12995 *skip_get_err = FALSE; 12996 lost_rqstp->lr_op = 0; 12997 argoplist_size = num_ops * sizeof (nfs_argop4); 12998 /* fill array with zero */ 12999 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 13000 13001 *argspp = argsp; 13002 *respp = NULL; 13003 13004 argsp->array_len = num_ops; 13005 argsp->array = *argopp; 13006 13007 /* initialize in case of error; will get real value down below */ 13008 argsp->ctag = TAG_NONE; 13009 13010 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 13011 *op_hintp = OH_LOCKU; 13012 else 13013 *op_hintp = OH_OTHER; 13014 } 13015 13016 /* 13017 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 13018 * the proper nfs4_server_t for this instance of nfs4frlock. 13019 * Returns 0 (success) or an errno value. 
13020 */ 13021 static int 13022 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 13023 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 13024 bool_t *did_start_fop, bool_t *startrecovp) 13025 { 13026 int error = 0; 13027 rnode4_t *rp; 13028 13029 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13030 13031 if (ctype == NFS4_LCK_CTYPE_NORM) { 13032 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 13033 recov_statep, startrecovp); 13034 if (error) 13035 return (error); 13036 *did_start_fop = TRUE; 13037 } else { 13038 *did_start_fop = FALSE; 13039 *startrecovp = FALSE; 13040 } 13041 13042 if (!error) { 13043 rp = VTOR4(vp); 13044 13045 /* If the file failed recovery, just quit. */ 13046 mutex_enter(&rp->r_statelock); 13047 if (rp->r_flags & R4RECOVERR) { 13048 error = EIO; 13049 } 13050 mutex_exit(&rp->r_statelock); 13051 } 13052 13053 return (error); 13054 } 13055 13056 /* 13057 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13058 * resend nfs4frlock call is initiated by the recovery framework. 13059 * Acquires the lop and oop seqid synchronization. 
13060 */ 13061 static void 13062 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13063 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13064 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13065 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13066 { 13067 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13068 int error; 13069 13070 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13071 (CE_NOTE, 13072 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13073 ASSERT(resend_rqstp != NULL); 13074 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13075 resend_rqstp->lr_op == OP_LOCKU); 13076 13077 *oopp = resend_rqstp->lr_oop; 13078 if (resend_rqstp->lr_oop) { 13079 open_owner_hold(resend_rqstp->lr_oop); 13080 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13081 ASSERT(error == 0); /* recov thread always succeeds */ 13082 } 13083 13084 /* Must resend this lost lock/locku request. */ 13085 ASSERT(resend_rqstp->lr_lop != NULL); 13086 *lopp = resend_rqstp->lr_lop; 13087 lock_owner_hold(resend_rqstp->lr_lop); 13088 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13089 ASSERT(error == 0); /* recov thread always succeeds */ 13090 13091 *ospp = resend_rqstp->lr_osp; 13092 if (*ospp) 13093 open_stream_hold(resend_rqstp->lr_osp); 13094 13095 if (resend_rqstp->lr_op == OP_LOCK) { 13096 LOCK4args *lock_args; 13097 13098 argop->argop = OP_LOCK; 13099 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13100 lock_args->locktype = resend_rqstp->lr_locktype; 13101 lock_args->reclaim = 13102 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13103 lock_args->offset = resend_rqstp->lr_flk->l_start; 13104 lock_args->length = resend_rqstp->lr_flk->l_len; 13105 if (lock_args->length == 0) 13106 lock_args->length = ~lock_args->length; 13107 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13108 mi2clientid(mi), &lock_args->locker); 13109 13110 switch (resend_rqstp->lr_ctype) { 13111 case 
NFS4_LCK_CTYPE_RESEND: 13112 argsp->ctag = TAG_LOCK_RESEND; 13113 break; 13114 case NFS4_LCK_CTYPE_REINSTATE: 13115 argsp->ctag = TAG_LOCK_REINSTATE; 13116 break; 13117 case NFS4_LCK_CTYPE_RECLAIM: 13118 argsp->ctag = TAG_LOCK_RECLAIM; 13119 break; 13120 default: 13121 argsp->ctag = TAG_LOCK_UNKNOWN; 13122 break; 13123 } 13124 } else { 13125 LOCKU4args *locku_args; 13126 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13127 13128 argop->argop = OP_LOCKU; 13129 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13130 locku_args->locktype = READ_LT; 13131 locku_args->seqid = lop->lock_seqid + 1; 13132 mutex_enter(&lop->lo_lock); 13133 locku_args->lock_stateid = lop->lock_stateid; 13134 mutex_exit(&lop->lo_lock); 13135 locku_args->offset = resend_rqstp->lr_flk->l_start; 13136 locku_args->length = resend_rqstp->lr_flk->l_len; 13137 if (locku_args->length == 0) 13138 locku_args->length = ~locku_args->length; 13139 13140 switch (resend_rqstp->lr_ctype) { 13141 case NFS4_LCK_CTYPE_RESEND: 13142 argsp->ctag = TAG_LOCKU_RESEND; 13143 break; 13144 case NFS4_LCK_CTYPE_REINSTATE: 13145 argsp->ctag = TAG_LOCKU_REINSTATE; 13146 break; 13147 default: 13148 argsp->ctag = TAG_LOCK_UNKNOWN; 13149 break; 13150 } 13151 } 13152 } 13153 13154 /* 13155 * Setup the LOCKT4 arguments. 13156 */ 13157 static void 13158 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13159 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 13160 rnode4_t *rp) 13161 { 13162 LOCKT4args *lockt_args; 13163 13164 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13165 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13166 argop->argop = OP_LOCKT; 13167 argsp->ctag = TAG_LOCKT; 13168 lockt_args = &argop->nfs_argop4_u.oplockt; 13169 13170 /* 13171 * The locktype will be READ_LT unless it's 13172 * a write lock. We do this because the Solaris 13173 * system call allows the combination of 13174 * F_UNLCK and F_GETLK* and so in that case the 13175 * unlock is mapped to a read. 
13176 */ 13177 if (flk->l_type == F_WRLCK) 13178 lockt_args->locktype = WRITE_LT; 13179 else 13180 lockt_args->locktype = READ_LT; 13181 13182 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13183 /* set the lock owner4 args */ 13184 nfs4_setlockowner_args(&lockt_args->owner, rp, 13185 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13186 flk->l_pid); 13187 lockt_args->offset = flk->l_start; 13188 lockt_args->length = flk->l_len; 13189 if (flk->l_len == 0) 13190 lockt_args->length = ~lockt_args->length; 13191 13192 *lockt_argsp = lockt_args; 13193 } 13194 13195 /* 13196 * If the client is holding a delegation, and the open stream to be used 13197 * with this lock request is a delegation open stream, then re-open the stream. 13198 * Sets the nfs4_error_t to all zeros unless the open stream has already 13199 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13200 * means the caller should retry (like a recovery retry). 13201 */ 13202 static void 13203 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13204 { 13205 open_delegation_type4 dt; 13206 bool_t reopen_needed, force; 13207 nfs4_open_stream_t *osp; 13208 open_claim_type4 oclaim; 13209 rnode4_t *rp = VTOR4(vp); 13210 mntinfo4_t *mi = VTOMI4(vp); 13211 13212 ASSERT(nfs_zone() == mi->mi_zone); 13213 13214 nfs4_error_zinit(ep); 13215 13216 mutex_enter(&rp->r_statev4_lock); 13217 dt = rp->r_deleg_type; 13218 mutex_exit(&rp->r_statev4_lock); 13219 13220 if (dt != OPEN_DELEGATE_NONE) { 13221 nfs4_open_owner_t *oop; 13222 13223 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13224 if (!oop) { 13225 ep->stat = NFS4ERR_IO; 13226 return; 13227 } 13228 /* returns with 'os_sync_lock' held */ 13229 osp = find_open_stream(oop, rp); 13230 if (!osp) { 13231 open_owner_rele(oop); 13232 ep->stat = NFS4ERR_IO; 13233 return; 13234 } 13235 13236 if (osp->os_failed_reopen) { 13237 NFS4_DEBUG((nfs4_open_stream_debug || 13238 nfs4_client_lock_debug), (CE_NOTE, 13239 
"nfs4frlock_check_deleg: os_failed_reopen set " 13240 "for osp %p, cr %p, rp %s", (void *)osp, 13241 (void *)cr, rnode4info(rp))); 13242 mutex_exit(&osp->os_sync_lock); 13243 open_stream_rele(osp, rp); 13244 open_owner_rele(oop); 13245 ep->stat = NFS4ERR_IO; 13246 return; 13247 } 13248 13249 /* 13250 * Determine whether a reopen is needed. If this 13251 * is a delegation open stream, then send the open 13252 * to the server to give visibility to the open owner. 13253 * Even if it isn't a delegation open stream, we need 13254 * to check if the previous open CLAIM_DELEGATE_CUR 13255 * was sufficient. 13256 */ 13257 13258 reopen_needed = osp->os_delegation || 13259 ((lt == F_RDLCK && 13260 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13261 (lt == F_WRLCK && 13262 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13263 13264 mutex_exit(&osp->os_sync_lock); 13265 open_owner_rele(oop); 13266 13267 if (reopen_needed) { 13268 /* 13269 * Always use CLAIM_PREVIOUS after server reboot. 13270 * The server will reject CLAIM_DELEGATE_CUR if 13271 * it is used during the grace period. 13272 */ 13273 mutex_enter(&mi->mi_lock); 13274 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13275 oclaim = CLAIM_PREVIOUS; 13276 force = TRUE; 13277 } else { 13278 oclaim = CLAIM_DELEGATE_CUR; 13279 force = FALSE; 13280 } 13281 mutex_exit(&mi->mi_lock); 13282 13283 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13284 if (ep->error == EAGAIN) { 13285 nfs4_error_zinit(ep); 13286 ep->stat = NFS4ERR_DELAY; 13287 } 13288 } 13289 open_stream_rele(osp, rp); 13290 osp = NULL; 13291 } 13292 } 13293 13294 /* 13295 * Setup the LOCKU4 arguments. 13296 * Returns errors via the nfs4_error_t. 13297 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13298 * over-the-wire. The caller must release the 13299 * reference on *lopp. 13300 * NFS4ERR_DELAY caller should retry (like recovery retry) 13301 * (other) unrecoverable error. 
13302 */ 13303 static void 13304 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13305 LOCKU4args **locku_argsp, flock64_t *flk, 13306 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13307 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13308 bool_t *skip_get_err, bool_t *go_otwp) 13309 { 13310 nfs4_lock_owner_t *lop = NULL; 13311 LOCKU4args *locku_args; 13312 pid_t pid; 13313 bool_t is_spec = FALSE; 13314 rnode4_t *rp = VTOR4(vp); 13315 13316 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13317 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13318 13319 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13320 if (ep->error || ep->stat) 13321 return; 13322 13323 argop->argop = OP_LOCKU; 13324 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13325 argsp->ctag = TAG_LOCKU_REINSTATE; 13326 else 13327 argsp->ctag = TAG_LOCKU; 13328 locku_args = &argop->nfs_argop4_u.oplocku; 13329 *locku_argsp = locku_args; 13330 13331 /* 13332 * XXX what should locku_args->locktype be? 13333 * setting to ALWAYS be READ_LT so at least 13334 * it is a valid locktype. 13335 */ 13336 13337 locku_args->locktype = READ_LT; 13338 13339 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13340 flk->l_pid; 13341 13342 /* 13343 * Get the lock owner stateid. If no lock owner 13344 * exists, return success. 13345 */ 13346 lop = find_lock_owner(rp, pid, LOWN_ANY); 13347 *lopp = lop; 13348 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13349 is_spec = TRUE; 13350 if (!lop || is_spec) { 13351 /* 13352 * No lock owner so no locks to unlock. 13353 * Return success. If there was a failed 13354 * reclaim earlier, the lock might still be 13355 * registered with the local locking code, 13356 * so notify it of the unlock. 13357 * 13358 * If the lockowner is using a special stateid, 13359 * then the original lock request (that created 13360 * this lockowner) was never successful, so we 13361 * have no lock to undo OTW. 
13362 */ 13363 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13364 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13365 "(%ld) so return success", (long)pid)); 13366 13367 if (ctype == NFS4_LCK_CTYPE_NORM) 13368 flk->l_pid = curproc->p_pid; 13369 nfs4_register_lock_locally(vp, flk, flag, offset); 13370 /* 13371 * Release our hold and NULL out so final_cleanup 13372 * doesn't try to end a lock seqid sync we 13373 * never started. 13374 */ 13375 if (is_spec) { 13376 lock_owner_rele(lop); 13377 *lopp = NULL; 13378 } 13379 *skip_get_err = TRUE; 13380 *go_otwp = FALSE; 13381 return; 13382 } 13383 13384 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13385 if (ep->error == EAGAIN) { 13386 lock_owner_rele(lop); 13387 *lopp = NULL; 13388 return; 13389 } 13390 13391 mutex_enter(&lop->lo_lock); 13392 locku_args->lock_stateid = lop->lock_stateid; 13393 mutex_exit(&lop->lo_lock); 13394 locku_args->seqid = lop->lock_seqid + 1; 13395 13396 /* leave the ref count on lop, rele after RPC call */ 13397 13398 locku_args->offset = flk->l_start; 13399 locku_args->length = flk->l_len; 13400 if (flk->l_len == 0) 13401 locku_args->length = ~locku_args->length; 13402 13403 *go_otwp = TRUE; 13404 } 13405 13406 /* 13407 * Setup the LOCK4 arguments. 13408 * 13409 * Returns errors via the nfs4_error_t. 
13410 * NFS4_OK no problems 13411 * NFS4ERR_DELAY caller should retry (like recovery retry) 13412 * (other) unrecoverable error 13413 */ 13414 static void 13415 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13416 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13417 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13418 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13419 { 13420 LOCK4args *lock_args; 13421 nfs4_open_owner_t *oop = NULL; 13422 nfs4_open_stream_t *osp = NULL; 13423 nfs4_lock_owner_t *lop = NULL; 13424 pid_t pid; 13425 rnode4_t *rp = VTOR4(vp); 13426 13427 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13428 13429 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13430 if (ep->error || ep->stat != NFS4_OK) 13431 return; 13432 13433 argop->argop = OP_LOCK; 13434 if (ctype == NFS4_LCK_CTYPE_NORM) 13435 argsp->ctag = TAG_LOCK; 13436 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13437 argsp->ctag = TAG_RELOCK; 13438 else 13439 argsp->ctag = TAG_LOCK_REINSTATE; 13440 lock_args = &argop->nfs_argop4_u.oplock; 13441 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13442 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13443 /* 13444 * Get the lock owner. If no lock owner exists, 13445 * create a 'temporary' one and grab the open seqid 13446 * synchronization (which puts a hold on the open 13447 * owner and open stream). 13448 * This also grabs the lock seqid synchronization. 13449 */ 13450 pid = ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : flk->l_pid; 13451 ep->stat = 13452 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13453 13454 if (ep->stat != NFS4_OK) 13455 goto out; 13456 13457 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13458 &lock_args->locker); 13459 13460 lock_args->offset = flk->l_start; 13461 lock_args->length = flk->l_len; 13462 if (flk->l_len == 0) 13463 lock_args->length = ~lock_args->length; 13464 *lock_argsp = lock_args; 13465 out: 13466 *oopp = oop; 13467 *ospp = osp; 13468 *lopp = lop; 13469 } 13470 13471 /* 13472 * After we get the reply from the server, record the proper information 13473 * for possible resend lock requests. 13474 * 13475 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13476 */ 13477 static void 13478 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13479 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13480 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13481 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13482 { 13483 bool_t unlock = (flk->l_type == F_UNLCK); 13484 13485 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13486 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13487 ctype == NFS4_LCK_CTYPE_REINSTATE); 13488 13489 if (error != 0 && !unlock) { 13490 NFS4_DEBUG((nfs4_lost_rqst_debug || 13491 nfs4_client_lock_debug), (CE_NOTE, 13492 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13493 " for lop %p", (void *)lop)); 13494 ASSERT(lop != NULL); 13495 mutex_enter(&lop->lo_lock); 13496 lop->lo_pending_rqsts = 1; 13497 mutex_exit(&lop->lo_lock); 13498 } 13499 13500 lost_rqstp->lr_putfirst = FALSE; 13501 lost_rqstp->lr_op = 0; 13502 13503 /* 13504 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13505 * recovery purposes so that the lock request that was sent 13506 * can be saved and re-issued later. Ditto for EIO from a forced 13507 * unmount. 
This is done to have the client's local locking state 13508 * match the v4 server's state; that is, the request was 13509 * potentially received and accepted by the server but the client 13510 * thinks it was not. 13511 */ 13512 if (error == ETIMEDOUT || error == EINTR || 13513 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13514 NFS4_DEBUG((nfs4_lost_rqst_debug || 13515 nfs4_client_lock_debug), (CE_NOTE, 13516 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13517 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13518 (void *)lop, (void *)oop, (void *)osp)); 13519 if (unlock) 13520 lost_rqstp->lr_op = OP_LOCKU; 13521 else { 13522 lost_rqstp->lr_op = OP_LOCK; 13523 lost_rqstp->lr_locktype = locktype; 13524 } 13525 /* 13526 * Objects are held and rele'd via the recovery code. 13527 * See nfs4_save_lost_rqst. 13528 */ 13529 lost_rqstp->lr_vp = vp; 13530 lost_rqstp->lr_dvp = NULL; 13531 lost_rqstp->lr_oop = oop; 13532 lost_rqstp->lr_osp = osp; 13533 lost_rqstp->lr_lop = lop; 13534 lost_rqstp->lr_cr = cr; 13535 switch (ctype) { 13536 case NFS4_LCK_CTYPE_NORM: 13537 flk->l_pid = ttoproc(curthread)->p_pid; 13538 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13539 break; 13540 case NFS4_LCK_CTYPE_REINSTATE: 13541 lost_rqstp->lr_putfirst = TRUE; 13542 lost_rqstp->lr_ctype = ctype; 13543 break; 13544 default: 13545 break; 13546 } 13547 lost_rqstp->lr_flk = flk; 13548 } 13549 } 13550 13551 /* 13552 * Update lop's seqid. Also update the seqid stored in a resend request, 13553 * if any. (Some recovery errors increment the seqid, and we may have to 13554 * send the resend request again.) 
13555 */ 13556 13557 static void 13558 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13559 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13560 { 13561 if (lock_args) { 13562 if (lock_args->locker.new_lock_owner == TRUE) 13563 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13564 else { 13565 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13566 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13567 } 13568 } else if (locku_args) { 13569 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13570 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13571 } 13572 } 13573 13574 /* 13575 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13576 * COMPOUND4 args/res for calls that need to retry. 13577 * Switches the *cred_otwp to base_cr. 13578 */ 13579 static void 13580 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13581 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13582 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13583 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13584 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13585 { 13586 nfs4_open_owner_t *oop = *oopp; 13587 nfs4_open_stream_t *osp = *ospp; 13588 nfs4_lock_owner_t *lop = *lopp; 13589 nfs_argop4 *argop = (*argspp)->array; 13590 13591 if (*did_start_fop) { 13592 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13593 needrecov); 13594 *did_start_fop = FALSE; 13595 } 13596 ASSERT((*argspp)->array_len == 2); 13597 if (argop[1].argop == OP_LOCK) 13598 nfs4args_lock_free(&argop[1]); 13599 else if (argop[1].argop == OP_LOCKT) 13600 nfs4args_lockt_free(&argop[1]); 13601 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13602 if (!error) 13603 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13604 *argspp = NULL; 13605 *respp = NULL; 13606 13607 if (lop) { 13608 nfs4_end_lock_seqid_sync(lop); 13609 lock_owner_rele(lop); 13610 *lopp = NULL; 13611 } 13612 13613 /* need to free up 
the reference on osp for lock args */ 13614 if (osp != NULL) { 13615 open_stream_rele(osp, VTOR4(vp)); 13616 *ospp = NULL; 13617 } 13618 13619 /* need to free up the reference on oop for lock args */ 13620 if (oop != NULL) { 13621 nfs4_end_open_seqid_sync(oop); 13622 open_owner_rele(oop); 13623 *oopp = NULL; 13624 } 13625 13626 crfree(*cred_otwp); 13627 *cred_otwp = base_cr; 13628 crhold(*cred_otwp); 13629 } 13630 13631 /* 13632 * Function to process the client's recovery for nfs4frlock. 13633 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13634 * 13635 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13636 * COMPOUND4 args/res for calls that need to retry. 13637 * 13638 * Note: the rp's r_lkserlock is *not* dropped during this path. 13639 */ 13640 static bool_t 13641 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13642 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13643 LOCK4args *lock_args, LOCKU4args *locku_args, 13644 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13645 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13646 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13647 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13648 { 13649 nfs4_open_owner_t *oop = *oopp; 13650 nfs4_open_stream_t *osp = *ospp; 13651 nfs4_lock_owner_t *lop = *lopp; 13652 13653 bool_t abort, retry; 13654 13655 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13656 ASSERT((*argspp) != NULL); 13657 ASSERT((*respp) != NULL); 13658 if (lock_args || locku_args) 13659 ASSERT(lop != NULL); 13660 13661 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13662 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13663 13664 retry = TRUE; 13665 abort = FALSE; 13666 if (needrecov) { 13667 nfs4_bseqid_entry_t *bsep = NULL; 13668 nfs_opnum4 op; 13669 13670 op = lock_args ? OP_LOCK : locku_args ? 
OP_LOCKU : OP_LOCKT; 13671 13672 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13673 seqid4 seqid; 13674 13675 if (lock_args) { 13676 if (lock_args->locker.new_lock_owner == TRUE) 13677 seqid = lock_args->locker.locker4_u. 13678 open_owner.open_seqid; 13679 else 13680 seqid = lock_args->locker.locker4_u. 13681 lock_owner.lock_seqid; 13682 } else if (locku_args) { 13683 seqid = locku_args->seqid; 13684 } else { 13685 seqid = 0; 13686 } 13687 13688 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13689 flk->l_pid, (*argspp)->ctag, seqid); 13690 } 13691 13692 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13693 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13694 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13695 NULL, op, bsep, NULL, NULL); 13696 13697 if (bsep) 13698 kmem_free(bsep, sizeof (*bsep)); 13699 } 13700 13701 /* 13702 * Return that we do not want to retry the request for 3 cases: 13703 * 1. If we received EINTR or are bailing out because of a forced 13704 * unmount, we came into this code path just for the sake of 13705 * initiating recovery, we now need to return the error. 13706 * 2. If we have aborted recovery. 13707 * 3. We received NFS4ERR_BAD_SEQID. 
13708 */ 13709 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13710 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13711 retry = FALSE; 13712 13713 if (*did_start_fop == TRUE) { 13714 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13715 needrecov); 13716 *did_start_fop = FALSE; 13717 } 13718 13719 if (retry == TRUE) { 13720 nfs_argop4 *argop; 13721 13722 argop = (*argspp)->array; 13723 ASSERT((*argspp)->array_len == 2); 13724 13725 if (argop[1].argop == OP_LOCK) 13726 nfs4args_lock_free(&argop[1]); 13727 else if (argop[1].argop == OP_LOCKT) 13728 nfs4args_lockt_free(&argop[1]); 13729 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13730 if (!ep->error) 13731 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13732 *respp = NULL; 13733 *argspp = NULL; 13734 } 13735 13736 if (lop != NULL) { 13737 nfs4_end_lock_seqid_sync(lop); 13738 lock_owner_rele(lop); 13739 } 13740 13741 *lopp = NULL; 13742 13743 /* need to free up the reference on osp for lock args */ 13744 if (osp != NULL) { 13745 open_stream_rele(osp, rp); 13746 *ospp = NULL; 13747 } 13748 13749 /* need to free up the reference on oop for lock args */ 13750 if (oop != NULL) { 13751 nfs4_end_open_seqid_sync(oop); 13752 open_owner_rele(oop); 13753 *oopp = NULL; 13754 } 13755 13756 return (retry); 13757 } 13758 13759 /* 13760 * Handles the successful reply from the server for nfs4frlock. 
13761 */ 13762 static void 13763 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13764 vnode_t *vp, int flag, u_offset_t offset, 13765 nfs4_lost_rqst_t *resend_rqstp) 13766 { 13767 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13768 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13769 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13770 if (ctype == NFS4_LCK_CTYPE_NORM) { 13771 flk->l_pid = ttoproc(curthread)->p_pid; 13772 /* 13773 * We do not register lost locks locally in 13774 * the 'resend' case since the user/application 13775 * doesn't think we have the lock. 13776 */ 13777 ASSERT(!resend_rqstp); 13778 nfs4_register_lock_locally(vp, flk, flag, offset); 13779 } 13780 } 13781 } 13782 13783 /* 13784 * Handle the DENIED reply from the server for nfs4frlock. 13785 * Returns TRUE if we should retry the request; FALSE otherwise. 13786 * 13787 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13788 * COMPOUND4 args/res for calls that need to retry. Can also 13789 * drop and regrab the r_lkserlock. 13790 */ 13791 static bool_t 13792 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13793 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13794 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13795 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13796 nfs4_recov_state_t *recov_statep, int needrecov, 13797 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13798 clock_t *tick_delayp, short *whencep, int *errorp, 13799 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13800 bool_t *skip_get_err) 13801 { 13802 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13803 13804 if (lock_args) { 13805 nfs4_open_owner_t *oop = *oopp; 13806 nfs4_open_stream_t *osp = *ospp; 13807 nfs4_lock_owner_t *lop = *lopp; 13808 int intr; 13809 13810 /* 13811 * Blocking lock needs to sleep and retry from the request. 
13812 * 13813 * Do not block and wait for 'resend' or 'reinstate' 13814 * lock requests, just return the error. 13815 * 13816 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13817 */ 13818 if (cmd == F_SETLKW) { 13819 rnode4_t *rp = VTOR4(vp); 13820 nfs_argop4 *argop = (*argspp)->array; 13821 13822 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13823 13824 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13825 recov_statep, needrecov); 13826 *did_start_fop = FALSE; 13827 ASSERT((*argspp)->array_len == 2); 13828 if (argop[1].argop == OP_LOCK) 13829 nfs4args_lock_free(&argop[1]); 13830 else if (argop[1].argop == OP_LOCKT) 13831 nfs4args_lockt_free(&argop[1]); 13832 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13833 if (*respp) 13834 (void) xdr_free(xdr_COMPOUND4res_clnt, 13835 (caddr_t)*respp); 13836 *argspp = NULL; 13837 *respp = NULL; 13838 nfs4_end_lock_seqid_sync(lop); 13839 lock_owner_rele(lop); 13840 *lopp = NULL; 13841 if (osp != NULL) { 13842 open_stream_rele(osp, rp); 13843 *ospp = NULL; 13844 } 13845 if (oop != NULL) { 13846 nfs4_end_open_seqid_sync(oop); 13847 open_owner_rele(oop); 13848 *oopp = NULL; 13849 } 13850 13851 nfs_rw_exit(&rp->r_lkserlock); 13852 13853 intr = nfs4_block_and_wait(tick_delayp, rp); 13854 13855 if (intr) { 13856 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13857 RW_WRITER, FALSE); 13858 *errorp = EINTR; 13859 return (FALSE); 13860 } 13861 13862 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13863 RW_WRITER, FALSE); 13864 13865 /* 13866 * Make sure we are still safe to lock with 13867 * regards to mmapping. 
13868 */ 13869 if (!nfs4_safelock(vp, flk, cr)) { 13870 *errorp = EAGAIN; 13871 return (FALSE); 13872 } 13873 13874 return (TRUE); 13875 } 13876 if (ctype == NFS4_LCK_CTYPE_NORM) 13877 *errorp = EAGAIN; 13878 *skip_get_err = TRUE; 13879 flk->l_whence = 0; 13880 *whencep = 0; 13881 return (FALSE); 13882 } else if (lockt_args) { 13883 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13884 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13885 13886 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13887 flk, lockt_args); 13888 13889 /* according to NLM code */ 13890 *errorp = 0; 13891 *whencep = 0; 13892 *skip_get_err = TRUE; 13893 return (FALSE); 13894 } 13895 return (FALSE); 13896 } 13897 13898 /* 13899 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13900 */ 13901 static void 13902 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13903 { 13904 switch (resp->status) { 13905 case NFS4ERR_ACCESS: 13906 case NFS4ERR_ADMIN_REVOKED: 13907 case NFS4ERR_BADHANDLE: 13908 case NFS4ERR_BAD_RANGE: 13909 case NFS4ERR_BAD_SEQID: 13910 case NFS4ERR_BAD_STATEID: 13911 case NFS4ERR_BADXDR: 13912 case NFS4ERR_DEADLOCK: 13913 case NFS4ERR_DELAY: 13914 case NFS4ERR_EXPIRED: 13915 case NFS4ERR_FHEXPIRED: 13916 case NFS4ERR_GRACE: 13917 case NFS4ERR_INVAL: 13918 case NFS4ERR_ISDIR: 13919 case NFS4ERR_LEASE_MOVED: 13920 case NFS4ERR_LOCK_NOTSUPP: 13921 case NFS4ERR_LOCK_RANGE: 13922 case NFS4ERR_MOVED: 13923 case NFS4ERR_NOFILEHANDLE: 13924 case NFS4ERR_NO_GRACE: 13925 case NFS4ERR_OLD_STATEID: 13926 case NFS4ERR_OPENMODE: 13927 case NFS4ERR_RECLAIM_BAD: 13928 case NFS4ERR_RECLAIM_CONFLICT: 13929 case NFS4ERR_RESOURCE: 13930 case NFS4ERR_SERVERFAULT: 13931 case NFS4ERR_STALE: 13932 case NFS4ERR_STALE_CLIENTID: 13933 case NFS4ERR_STALE_STATEID: 13934 return; 13935 default: 13936 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13937 "nfs4frlock_results_default: got unrecognizable " 13938 "res.status %d", resp->status)); 13939 *errorp = NFS4ERR_INVAL; 
13940 } 13941 } 13942 13943 /* 13944 * The lock request was successful, so update the client's state. 13945 */ 13946 static void 13947 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13948 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13949 vnode_t *vp, flock64_t *flk, cred_t *cr, 13950 nfs4_lost_rqst_t *resend_rqstp) 13951 { 13952 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13953 13954 if (lock_args) { 13955 LOCK4res *lock_res; 13956 13957 lock_res = &resop->nfs_resop4_u.oplock; 13958 /* update the stateid with server's response */ 13959 13960 if (lock_args->locker.new_lock_owner == TRUE) { 13961 mutex_enter(&lop->lo_lock); 13962 lop->lo_just_created = NFS4_PERM_CREATED; 13963 mutex_exit(&lop->lo_lock); 13964 } 13965 13966 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13967 13968 /* 13969 * If the lock was the result of a resending a lost 13970 * request, we've synched up the stateid and seqid 13971 * with the server, but now the server might be out of sync 13972 * with what the application thinks it has for locks. 13973 * Clean that up here. It's unclear whether we should do 13974 * this even if the filesystem has been forcibly unmounted. 13975 * For most servers, it's probably wasted effort, but 13976 * RFC3530 lets servers require that unlocks exactly match 13977 * the locks that are held. 
13978 */ 13979 if (resend_rqstp != NULL && 13980 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13981 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13982 } else { 13983 flk->l_whence = 0; 13984 } 13985 } else if (locku_args) { 13986 LOCKU4res *locku_res; 13987 13988 locku_res = &resop->nfs_resop4_u.oplocku; 13989 13990 /* Update the stateid with the server's response */ 13991 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13992 } else if (lockt_args) { 13993 /* Switch the lock type to express success, see fcntl */ 13994 flk->l_type = F_UNLCK; 13995 flk->l_whence = 0; 13996 } 13997 } 13998 13999 /* 14000 * Do final cleanup before exiting nfs4frlock. 14001 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 14002 * COMPOUND4 args/res for calls that haven't already. 14003 */ 14004 static void 14005 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 14006 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 14007 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 14008 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 14009 short whence, u_offset_t offset, struct lm_sysid *ls, 14010 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 14011 bool_t did_start_fop, bool_t skip_get_err, 14012 cred_t *cred_otw, cred_t *cred) 14013 { 14014 mntinfo4_t *mi = VTOMI4(vp); 14015 rnode4_t *rp = VTOR4(vp); 14016 int error = *errorp; 14017 nfs_argop4 *argop; 14018 int do_flush_pages = 0; 14019 14020 ASSERT(nfs_zone() == mi->mi_zone); 14021 /* 14022 * The client recovery code wants the raw status information, 14023 * so don't map the NFS status code to an errno value for 14024 * non-normal call types. 
14025 */ 14026 if (ctype == NFS4_LCK_CTYPE_NORM) { 14027 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 14028 *errorp = geterrno4(resp->status); 14029 if (did_start_fop == TRUE) 14030 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 14031 needrecov); 14032 14033 /* 14034 * We've established a new lock on the server, so invalidate 14035 * the pages associated with the vnode to get the most up to 14036 * date pages from the server after acquiring the lock. We 14037 * want to be sure that the read operation gets the newest data. 14038 * N.B. 14039 * We used to do this in nfs4frlock_results_ok but that doesn't 14040 * work since VOP_PUTPAGE can call nfs4_commit which calls 14041 * nfs4_start_fop. We flush the pages below after calling 14042 * nfs4_end_fop above 14043 * The flush of the page cache must be done after 14044 * nfs4_end_open_seqid_sync() to avoid a 4-way hang. 14045 */ 14046 if (!error && resp && resp->status == NFS4_OK) 14047 do_flush_pages = 1; 14048 } 14049 if (argsp) { 14050 ASSERT(argsp->array_len == 2); 14051 argop = argsp->array; 14052 if (argop[1].argop == OP_LOCK) 14053 nfs4args_lock_free(&argop[1]); 14054 else if (argop[1].argop == OP_LOCKT) 14055 nfs4args_lockt_free(&argop[1]); 14056 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14057 if (resp) 14058 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14059 } 14060 14061 /* free the reference on the lock owner */ 14062 if (lop != NULL) { 14063 nfs4_end_lock_seqid_sync(lop); 14064 lock_owner_rele(lop); 14065 } 14066 14067 /* need to free up the reference on osp for lock args */ 14068 if (osp != NULL) 14069 open_stream_rele(osp, rp); 14070 14071 /* need to free up the reference on oop for lock args */ 14072 if (oop != NULL) { 14073 nfs4_end_open_seqid_sync(oop); 14074 open_owner_rele(oop); 14075 } 14076 14077 if (do_flush_pages) 14078 nfs4_flush_pages(vp, cred); 14079 14080 (void) convoff(vp, flk, whence, offset); 14081 14082 lm_rel_sysid(ls); 14083 14084 /* 14085 * Record debug 
information in the event we get EINVAL. 14086 */ 14087 mutex_enter(&mi->mi_lock); 14088 if (*errorp == EINVAL && (lock_args || locku_args) && 14089 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14090 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14091 zcmn_err(getzoneid(), CE_NOTE, 14092 "%s operation failed with " 14093 "EINVAL probably since the server, %s," 14094 " doesn't support POSIX style locking", 14095 lock_args ? "LOCK" : "LOCKU", 14096 mi->mi_curr_serv->sv_hostname); 14097 mi->mi_flags |= MI4_LOCK_DEBUG; 14098 } 14099 } 14100 mutex_exit(&mi->mi_lock); 14101 14102 if (cred_otw) 14103 crfree(cred_otw); 14104 } 14105 14106 /* 14107 * This calls the server and the local locking code. 14108 * 14109 * Client locks are registerred locally by oring the sysid with 14110 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 14111 * We need to distinguish between the two to avoid collision in case one 14112 * machine is used as both client and server. 14113 * 14114 * Blocking lock requests will continually retry to acquire the lock 14115 * forever. 14116 * 14117 * The ctype is defined as follows: 14118 * NFS4_LCK_CTYPE_NORM: normal lock request. 14119 * 14120 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14121 * recovery, get the pid from flk instead of curproc, and don't reregister 14122 * the lock locally. 14123 * 14124 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14125 * that we will use the information passed in via resend_rqstp to setup the 14126 * lock/locku request. This resend is the exact same request as the 'lost 14127 * lock', and is initiated by the recovery framework. A successful resend 14128 * request can initiate one or more reinstate requests. 14129 * 14130 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14131 * does not trigger additional reinstate requests. 
This lock call type is
 * set for setting the v4 server's locking state back to match what the
 * client's local locking state is in the event of a received 'lost lock'.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay; /* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly; /* just queue request */
	int frc_no_reclaim = 0;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	/*
	 * Argument validation and sysid lookup happen once, before the
	 * retry point; a failure in either returns immediately without
	 * going through the cleanup at 'out'.
	 */
	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Retry point: every 'goto recov_retry' below comes back here, so
	 * nfs4frlock_call_init() and nfs4frlock_start_call() run again on
	 * each pass.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		/*
		 * The error is set to EINTR here; for an unlock request
		 * the code after 'out' turns EINTR back into success.
		 */
		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * End the fop and free the two-element
				 * argument array before starting over.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		/* the setup code may have decided no OTW call is needed */
		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * frc_no_reclaim is set below when a reclaim attempt was answered
	 * with NFS4ERR_NO_GRACE; on the retry the request is sent as an
	 * ordinary (non-reclaim) lock.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *	need to swap credentials due to access error
	 *	recovery is needed
	 *	different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* the owner/stream references must be gone by now */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set.  Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		/* array[0] answers the PUTFH; array[1] is the lock op */
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch.  Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}

/*
 * nfs4_safelock:
 *
 * Return non-zero if the given lock request can be handled without
 * violating the constraints on concurrent mapping and locking.
 *
 * vp is the file being locked, bfp the lock request (only l_type,
 * l_start and l_len are examined) and cr the credential used for the
 * VOP_GETATTR() call.  Returns 1 when the request is safe, 0 otherwise.
 */

static int
nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	struct vattr va;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(rp->r_mapcnt >= 0);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
	    "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
	    "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
	    bfp->l_start, bfp->l_len, rp->r_mapcnt));

	if (rp->r_mapcnt == 0)
		return (1);		/* always safe if not mapped */

	/*
	 * If the file is already mapped and there are locks, then they
	 * should be all safe locks.  So adding or removing a lock is safe
	 * as long as the new request is safe (i.e., whole-file, meaning
	 * length and starting offset are both zero).
	 */

	if (bfp->l_start != 0 || bfp->l_len != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot lock a memory mapped file unless locking the "
		    "entire file: start %"PRIx64", len %"PRIx64,
		    bfp->l_start, bfp->l_len));
		return (0);
	}

	/* mandatory locking and mapping don't mix */
	va.va_mask = AT_MODE;
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "getattr error %d", error));
		return (0);		/* treat errors conservatively */
	}
	if (MANDLOCK(vp, va.va_mode)) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot mandatory lock and mmap a file"));
		return (0);
	}

	return (1);
}


/*
 * Register the lock locally within Solaris.
 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
 * recording locks locally.
 *
 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
 * are registered locally.
14553 */ 14554 void 14555 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14556 u_offset_t offset) 14557 { 14558 int oldsysid; 14559 int error; 14560 #ifdef DEBUG 14561 char *name; 14562 #endif 14563 14564 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14565 14566 #ifdef DEBUG 14567 name = fn_name(VTOSV(vp)->sv_name); 14568 NFS4_DEBUG(nfs4_client_lock_debug, 14569 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14570 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14571 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14572 flk->l_sysid)); 14573 kmem_free(name, MAXNAMELEN); 14574 #endif 14575 14576 /* register the lock with local locking */ 14577 oldsysid = flk->l_sysid; 14578 flk->l_sysid |= LM_SYSID_CLIENT; 14579 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14580 #ifdef DEBUG 14581 if (error != 0) { 14582 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14583 "nfs4_register_lock_locally: could not register with" 14584 " local locking")); 14585 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14586 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14587 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14588 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14589 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14590 flk->l_type, flk->l_start, flk->l_len)); 14591 (void) reclock(vp, flk, 0, flag, offset, NULL); 14592 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14593 "blocked by pid %d sysid 0x%x type %d " 14594 "off 0x%" PRIx64 " len 0x%" PRIx64, 14595 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14596 flk->l_len)); 14597 } 14598 #endif 14599 flk->l_sysid = oldsysid; 14600 } 14601 14602 /* 14603 * nfs4_lockrelease: 14604 * 14605 * Release any locks on the given vnode that are held by the current 14606 * process. Also removes the lock owner (if one exists) from the rnode's 14607 * list. 
14608 */ 14609 static int 14610 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14611 { 14612 flock64_t ld; 14613 int ret, error; 14614 rnode4_t *rp; 14615 nfs4_lock_owner_t *lop; 14616 nfs4_recov_state_t recov_state; 14617 mntinfo4_t *mi; 14618 bool_t possible_orphan = FALSE; 14619 bool_t recovonly; 14620 14621 ASSERT((uintptr_t)vp > KERNELBASE); 14622 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14623 14624 rp = VTOR4(vp); 14625 mi = VTOMI4(vp); 14626 14627 /* 14628 * If we have not locked anything then we can 14629 * just return since we have no work to do. 14630 */ 14631 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14632 return (0); 14633 } 14634 14635 /* 14636 * We need to comprehend that another thread may 14637 * kick off recovery and the lock_owner we have stashed 14638 * in lop might be invalid so we should NOT cache it 14639 * locally! 14640 */ 14641 recov_state.rs_flags = 0; 14642 recov_state.rs_num_retry_despite_err = 0; 14643 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14644 &recovonly); 14645 if (error) { 14646 mutex_enter(&rp->r_statelock); 14647 rp->r_flags |= R4LODANGLERS; 14648 mutex_exit(&rp->r_statelock); 14649 return (error); 14650 } 14651 14652 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14653 14654 /* 14655 * Check if the lock owner might have a lock (request was sent but 14656 * no response was received). Also check if there are any remote 14657 * locks on the file. (In theory we shouldn't have to make this 14658 * second check if there's no lock owner, but for now we'll be 14659 * conservative and do it anyway.) If either condition is true, 14660 * send an unlock for the entire file to the server. 14661 * 14662 * Note that no explicit synchronization is needed here. At worst, 14663 * flk_has_remote_locks() will return a false positive, in which case 14664 * the unlock call wastes time but doesn't harm correctness. 
14665 */ 14666 14667 if (lop) { 14668 mutex_enter(&lop->lo_lock); 14669 possible_orphan = lop->lo_pending_rqsts; 14670 mutex_exit(&lop->lo_lock); 14671 lock_owner_rele(lop); 14672 } 14673 14674 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14675 14676 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14677 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14678 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14679 (void *)lop)); 14680 14681 if (possible_orphan || flk_has_remote_locks(vp)) { 14682 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14683 ld.l_whence = 0; /* unlock from start of file */ 14684 ld.l_start = 0; 14685 ld.l_len = 0; /* do entire file */ 14686 14687 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14688 cr, NULL); 14689 14690 if (ret != 0) { 14691 /* 14692 * If VOP_FRLOCK fails, make sure we unregister 14693 * local locks before we continue. 14694 */ 14695 ld.l_pid = ttoproc(curthread)->p_pid; 14696 nfs4_register_lock_locally(vp, &ld, flag, offset); 14697 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14698 "nfs4_lockrelease: lock release error on vp" 14699 " %p: error %d.\n", (void *)vp, ret)); 14700 } 14701 } 14702 14703 recov_state.rs_flags = 0; 14704 recov_state.rs_num_retry_despite_err = 0; 14705 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14706 &recovonly); 14707 if (error) { 14708 mutex_enter(&rp->r_statelock); 14709 rp->r_flags |= R4LODANGLERS; 14710 mutex_exit(&rp->r_statelock); 14711 return (error); 14712 } 14713 14714 /* 14715 * So, here we're going to need to retrieve the lock-owner 14716 * again (in case recovery has done a switch-a-roo) and 14717 * remove it because we can. 
14718 */ 14719 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14720 14721 if (lop) { 14722 nfs4_rnode_remove_lock_owner(rp, lop); 14723 lock_owner_rele(lop); 14724 } 14725 14726 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14727 return (0); 14728 } 14729 14730 /* 14731 * Wait for 'tick_delay' clock ticks. 14732 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14733 * NOTE: lock_lease_time is in seconds. 14734 * 14735 * XXX For future improvements, should implement a waiting queue scheme. 14736 */ 14737 static int 14738 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14739 { 14740 long milliseconds_delay; 14741 time_t lock_lease_time; 14742 14743 /* wait tick_delay clock ticks or siginteruptus */ 14744 if (delay_sig(*tick_delay)) { 14745 return (EINTR); 14746 } 14747 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14748 "reissue the lock request: blocked for %ld clock ticks: %ld " 14749 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14750 14751 /* get the lease time */ 14752 lock_lease_time = r2lease_time(rp); 14753 14754 /* drv_hztousec converts ticks to microseconds */ 14755 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14756 if (milliseconds_delay < lock_lease_time * 1000) { 14757 *tick_delay = 2 * *tick_delay; 14758 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14759 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14760 } 14761 return (0); 14762 } 14763 14764 14765 void 14766 nfs4_vnops_init(void) 14767 { 14768 } 14769 14770 void 14771 nfs4_vnops_fini(void) 14772 { 14773 } 14774 14775 /* 14776 * Return a reference to the directory (parent) vnode for a given vnode, 14777 * using the saved pathname information and the directory file handle. The 14778 * caller is responsible for disposing of the reference. 14779 * Returns zero or an errno value. 
14780 * 14781 * Caller should set need_start_op to FALSE if it is the recovery 14782 * thread, or if a start_fop has already been done. Otherwise, TRUE. 14783 */ 14784 int 14785 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14786 { 14787 svnode_t *svnp; 14788 vnode_t *dvp = NULL; 14789 servinfo4_t *svp; 14790 nfs4_fname_t *mfname; 14791 int error; 14792 14793 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14794 14795 if (vp->v_flag & VROOT) { 14796 nfs4_sharedfh_t *sfh; 14797 nfs_fh4 fh; 14798 mntinfo4_t *mi; 14799 14800 ASSERT(vp->v_type == VREG); 14801 14802 mi = VTOMI4(vp); 14803 svp = mi->mi_curr_serv; 14804 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14805 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14806 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14807 sfh = sfh4_get(&fh, VTOMI4(vp)); 14808 nfs_rw_exit(&svp->sv_lock); 14809 mfname = mi->mi_fname; 14810 fn_hold(mfname); 14811 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14812 sfh4_rele(&sfh); 14813 14814 if (dvp->v_type == VNON) 14815 dvp->v_type = VDIR; 14816 *dvpp = dvp; 14817 return (0); 14818 } 14819 14820 svnp = VTOSV(vp); 14821 14822 if (svnp == NULL) { 14823 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14824 "shadow node is NULL")); 14825 return (EINVAL); 14826 } 14827 14828 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14829 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14830 "shadow node name or dfh val == NULL")); 14831 return (EINVAL); 14832 } 14833 14834 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14835 (int)need_start_op); 14836 if (error != 0) { 14837 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14838 "nfs4_make_dotdot returned %d", error)); 14839 return (error); 14840 } 14841 if (!dvp) { 14842 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14843 "nfs4_make_dotdot returned a NULL dvp")); 14844 return (EIO); 14845 } 14846 if (dvp->v_type == VNON) 14847 dvp->v_type = VDIR; 14848 
ASSERT(dvp->v_type == VDIR); 14849 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14850 mutex_enter(&dvp->v_lock); 14851 dvp->v_flag |= V_XATTRDIR; 14852 mutex_exit(&dvp->v_lock); 14853 } 14854 *dvpp = dvp; 14855 return (0); 14856 } 14857 14858 /* 14859 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14860 * length that fnamep can accept, including the trailing null. 14861 * Returns 0 if okay, returns an errno value if there was a problem. 14862 */ 14863 14864 int 14865 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14866 { 14867 char *fn; 14868 int err = 0; 14869 servinfo4_t *svp; 14870 svnode_t *shvp; 14871 14872 /* 14873 * If the file being opened has VROOT set, then this is 14874 * a "file" mount. sv_name will not be interesting, so 14875 * go back to the servinfo4 to get the original mount 14876 * path and strip off all but the final edge. Otherwise 14877 * just return the name from the shadow vnode. 14878 */ 14879 14880 if (vp->v_flag & VROOT) { 14881 14882 svp = VTOMI4(vp)->mi_curr_serv; 14883 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14884 14885 fn = strrchr(svp->sv_path, '/'); 14886 if (fn == NULL) 14887 err = EINVAL; 14888 else 14889 fn++; 14890 } else { 14891 shvp = VTOSV(vp); 14892 fn = fn_name(shvp->sv_name); 14893 } 14894 14895 if (err == 0) 14896 if (strlen(fn) < maxlen) 14897 (void) strcpy(fnamep, fn); 14898 else 14899 err = ENAMETOOLONG; 14900 14901 if (vp->v_flag & VROOT) 14902 nfs_rw_exit(&svp->sv_lock); 14903 else 14904 kmem_free(fn, MAXNAMELEN); 14905 14906 return (err); 14907 } 14908 14909 /* 14910 * Bookkeeping for a close that doesn't need to go over the wire. 14911 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14912 * it is left at 1. 
 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	/* the caller must hold os_sync_lock and say so via *have_lockp */
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/*
	 * Nothing to do (and the lock stays held) if the open stream is
	 * already invalid or is still referenced by an open or a mapping.
	 */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count can not go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}

/*
 * Close all remaining open streams on the rnode.  These open streams
 * could be here because:
 * - The close attempted at either close or delmap failed
 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
 * - Someone did mknod on a regular file but never opened it
 *
 * Returns 0, or the first failure reported by nfs4close_one(): its
 * e.error if set, otherwise its e.stat mapped through geterrno4().
 * Later failures do not replace an error that was already recorded.
 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1.  Since then,
	 * the vnode could have been re-activated.  We want to
	 * loop through the open streams and close each one, but
	 * we have to be careful since once we release the rnode
	 * hash bucket lock, someone else is free to come in and
	 * re-activate the rnode and add new open streams.  The
	 * strategy is take the rnode hash bucket lock, verify that
	 * the count is still 1, grab the open stream off the
	 * head of the list and mark it invalid, then release the
	 * rnode hash bucket lock and proceed with that open stream.
	 * This is ok because nfs4close_one() will acquire the proper
	 * open/create to close/destroy synchronization for open
	 * streams, and will ensure that if someone has reopened
	 * the open stream after we've dropped the hash bucket lock
	 * then we'll just simply return without destroying the
	 * open stream.
	 * Repeat until the list is empty.
	 */

	for (;;) {

		/* make sure vnode hasn't been reactivated */
		rw_enter(&rp->r_hashq->r_lock, RW_READER);
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			mutex_exit(&vp->v_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}
		/*
		 * Grabbing r_os_lock before releasing v_lock prevents
		 * a window where the rnode/open stream could get
		 * reactivated (and os_force_close set to 0) before we
		 * had a chance to set os_force_close to 1.
		 */
		mutex_enter(&rp->r_os_lock);
		mutex_exit(&vp->v_lock);

		osp = list_head(&rp->r_open_streams);
		if (!osp) {
			/* nothing left to CLOSE OTW, so return */
			mutex_exit(&rp->r_os_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}

		mutex_enter(&rp->r_statev4_lock);
		/* the file can't still be mem mapped */
		ASSERT(rp->r_mapcnt == 0);
		if (rp->created_v4)
			rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		/*
		 * Grab a ref on this open stream; nfs4close_one
		 * will mark it as invalid
		 */
		mutex_enter(&osp->os_sync_lock);
		osp->os_ref_count++;
		osp->os_force_close = 1;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);

		/* all locks are dropped before the close itself */
		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);

		/* Update error if it isn't already non-zero */
		if (error == 0) {
			if (e.error)
				error = e.error;
			else if (e.stat)
				error = geterrno4(e.stat);
		}

#ifdef DEBUG
		nfs4close_all_cnt++;
#endif
		/* Release the ref on osp acquired above. */
		open_stream_rele(osp, rp);

		/* Proceed to the next open stream, if any */
	}
	return (error);
}

/*
 * nfs4close_one - close one open stream for a file if needed.
 *
 * "close_type" indicates which close path this is:
 *	CLOSE_NORM: close initiated via VOP_CLOSE.
 *	CLOSE_DELMAP: close initiated via VOP_DELMAP.
 *	CLOSE_FORCE: close initiated via VOP_INACTIVE.  This path forces
 *		the close and release of client state for this open stream
 *		(unless someone else has the open stream open).
 *	CLOSE_RESEND: indicates the request is a replay of an earlier request
 *		(e.g., due to abort because of a signal).
 *	CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15071 * 15072 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15073 * recovery. Instead, the caller is expected to deal with retries. 15074 * 15075 * The caller can either pass in the osp ('provided_osp') or not. 15076 * 15077 * 'access_bits' represents the access we are closing/downgrading. 15078 * 15079 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15080 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15081 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15082 * 15083 * Errors are returned via the nfs4_error_t. 15084 */ 15085 void 15086 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15087 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15088 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15089 uint_t mmap_flags) 15090 { 15091 nfs4_open_owner_t *oop; 15092 nfs4_open_stream_t *osp = NULL; 15093 int retry = 0; 15094 int num_retries = NFS4_NUM_RECOV_RETRIES; 15095 rnode4_t *rp; 15096 mntinfo4_t *mi; 15097 nfs4_recov_state_t recov_state; 15098 cred_t *cred_otw = NULL; 15099 bool_t recovonly = FALSE; 15100 int isrecov; 15101 int force_close; 15102 int close_failed = 0; 15103 int did_dec_count = 0; 15104 int did_start_op = 0; 15105 int did_force_recovlock = 0; 15106 int did_start_seqid_sync = 0; 15107 int have_sync_lock = 0; 15108 15109 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15110 15111 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15112 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15113 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15114 len, maxprot, mmap_flags, access_bits)); 15115 15116 nfs4_error_zinit(ep); 15117 rp = VTOR4(vp); 15118 mi = VTOMI4(vp); 15119 isrecov = (close_type == CLOSE_RESEND || 15120 close_type == CLOSE_AFTER_RESEND); 15121 15122 /* 15123 * First get the open owner. 
15124 */ 15125 if (!provided_osp) { 15126 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15127 } else { 15128 oop = provided_osp->os_open_owner; 15129 ASSERT(oop != NULL); 15130 open_owner_hold(oop); 15131 } 15132 15133 if (!oop) { 15134 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15135 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15136 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15137 (void *)provided_osp, close_type)); 15138 ep->error = EIO; 15139 goto out; 15140 } 15141 15142 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15143 recov_retry: 15144 osp = NULL; 15145 close_failed = 0; 15146 force_close = (close_type == CLOSE_FORCE); 15147 retry = 0; 15148 did_start_op = 0; 15149 did_force_recovlock = 0; 15150 did_start_seqid_sync = 0; 15151 have_sync_lock = 0; 15152 recovonly = FALSE; 15153 recov_state.rs_flags = 0; 15154 recov_state.rs_num_retry_despite_err = 0; 15155 15156 /* 15157 * Second synchronize with recovery. 15158 */ 15159 if (!isrecov) { 15160 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15161 &recov_state, &recovonly); 15162 if (!ep->error) { 15163 did_start_op = 1; 15164 } else { 15165 close_failed = 1; 15166 /* 15167 * If we couldn't get start_fop, but have to 15168 * cleanup state, then at least acquire the 15169 * mi_recovlock so we can synchronize with 15170 * recovery. 15171 */ 15172 if (close_type == CLOSE_FORCE) { 15173 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15174 RW_READER, FALSE); 15175 did_force_recovlock = 1; 15176 } else 15177 goto out; 15178 } 15179 } 15180 15181 /* 15182 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15183 * set 'recovonly' to TRUE since most likely this is due to 15184 * reovery being active (MI4_RECOV_ACTIV). If recovery is active, 15185 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15186 * to retry, causing us to loop until recovery finishes. 
Plus we 15187 * don't need protection over the open seqid since we're not going 15188 * OTW, hence don't need to use the seqid. 15189 */ 15190 if (recovonly == FALSE) { 15191 /* need to grab the open owner sync before 'os_sync_lock' */ 15192 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15193 if (ep->error == EAGAIN) { 15194 ASSERT(!isrecov); 15195 if (did_start_op) 15196 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15197 &recov_state, TRUE); 15198 if (did_force_recovlock) 15199 nfs_rw_exit(&mi->mi_recovlock); 15200 goto recov_retry; 15201 } 15202 did_start_seqid_sync = 1; 15203 } 15204 15205 /* 15206 * Third get an open stream and acquire 'os_sync_lock' to 15207 * sychronize the opening/creating of an open stream with the 15208 * closing/destroying of an open stream. 15209 */ 15210 if (!provided_osp) { 15211 /* returns with 'os_sync_lock' held */ 15212 osp = find_open_stream(oop, rp); 15213 if (!osp) { 15214 ep->error = EIO; 15215 goto out; 15216 } 15217 } else { 15218 osp = provided_osp; 15219 open_stream_hold(osp); 15220 mutex_enter(&osp->os_sync_lock); 15221 } 15222 have_sync_lock = 1; 15223 15224 ASSERT(oop == osp->os_open_owner); 15225 15226 /* 15227 * Fourth, do any special pre-OTW CLOSE processing 15228 * based on the specific close type. 15229 */ 15230 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15231 !did_dec_count) { 15232 ASSERT(osp->os_open_ref_count > 0); 15233 osp->os_open_ref_count--; 15234 did_dec_count = 1; 15235 if (osp->os_open_ref_count == 0) 15236 osp->os_final_close = 1; 15237 } 15238 15239 if (close_type == CLOSE_FORCE) { 15240 /* see if somebody reopened the open stream. 
*/ 15241 if (!osp->os_force_close) { 15242 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15243 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15244 "was reopened, vp %p", (void *)osp, (void *)vp)); 15245 ep->error = 0; 15246 ep->stat = NFS4_OK; 15247 goto out; 15248 } 15249 15250 if (!osp->os_final_close && !did_dec_count) { 15251 osp->os_open_ref_count--; 15252 did_dec_count = 1; 15253 } 15254 15255 /* 15256 * We can't depend on os_open_ref_count being 0 due to the 15257 * way executables are opened (VN_RELE to match a VOP_OPEN). 15258 */ 15259 #ifdef NOTYET 15260 ASSERT(osp->os_open_ref_count == 0); 15261 #endif 15262 if (osp->os_open_ref_count != 0) { 15263 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15264 "nfs4close_one: should panic here on an " 15265 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15266 "since this is probably the exec problem.")); 15267 15268 osp->os_open_ref_count = 0; 15269 } 15270 15271 /* 15272 * There is the possibility that nfs4close_one() 15273 * for close_type == CLOSE_DELMAP couldn't find the 15274 * open stream, thus couldn't decrement its os_mapcnt; 15275 * therefore we can't use this ASSERT yet. 
15276 */ 15277 #ifdef NOTYET 15278 ASSERT(osp->os_mapcnt == 0); 15279 #endif 15280 osp->os_mapcnt = 0; 15281 } 15282 15283 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15284 ASSERT(osp->os_mapcnt >= btopr(len)); 15285 15286 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15287 osp->os_mmap_write -= btopr(len); 15288 if (maxprot & PROT_READ) 15289 osp->os_mmap_read -= btopr(len); 15290 if (maxprot & PROT_EXEC) 15291 osp->os_mmap_read -= btopr(len); 15292 /* mirror the PROT_NONE check in nfs4_addmap() */ 15293 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15294 !(maxprot & PROT_EXEC)) 15295 osp->os_mmap_read -= btopr(len); 15296 osp->os_mapcnt -= btopr(len); 15297 did_dec_count = 1; 15298 } 15299 15300 if (recovonly) { 15301 nfs4_lost_rqst_t lost_rqst; 15302 15303 /* request should not already be in recovery queue */ 15304 ASSERT(lrp == NULL); 15305 nfs4_error_init(ep, EINTR); 15306 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15307 osp, cred_otw, vp); 15308 mutex_exit(&osp->os_sync_lock); 15309 have_sync_lock = 0; 15310 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15311 lost_rqst.lr_op == OP_CLOSE ? 15312 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL); 15313 close_failed = 1; 15314 force_close = 0; 15315 goto close_cleanup; 15316 } 15317 15318 /* 15319 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15320 * we stopped operating on the open owner's <old oo_name, old seqid> 15321 * space, which means we stopped operating on the open stream 15322 * too. So don't go OTW (as the seqid is likely bad, and the 15323 * stateid could be stale, potentially triggering a false 15324 * setclientid), and just clean up the client's internal state. 
15325 */ 15326 if (osp->os_orig_oo_name != oop->oo_name) { 15327 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15328 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15329 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15330 "oo_name %" PRIx64")", 15331 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15332 oop->oo_name)); 15333 close_failed = 1; 15334 } 15335 15336 /* If the file failed recovery, just quit. */ 15337 mutex_enter(&rp->r_statelock); 15338 if (rp->r_flags & R4RECOVERR) { 15339 close_failed = 1; 15340 } 15341 mutex_exit(&rp->r_statelock); 15342 15343 /* 15344 * If the force close path failed to obtain start_fop 15345 * then skip the OTW close and just remove the state. 15346 */ 15347 if (close_failed) 15348 goto close_cleanup; 15349 15350 /* 15351 * Fifth, check to see if there are still mapped pages or other 15352 * opens using this open stream. If there are then we can't 15353 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15354 */ 15355 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15356 nfs4_lost_rqst_t new_lost_rqst; 15357 bool_t needrecov = FALSE; 15358 cred_t *odg_cred_otw = NULL; 15359 seqid4 open_dg_seqid = 0; 15360 15361 if (osp->os_delegation) { 15362 /* 15363 * If this open stream was never OPENed OTW then we 15364 * surely can't DOWNGRADE it (especially since the 15365 * osp->open_stateid is really a delegation stateid 15366 * when os_delegation is 1). 
15367 */ 15368 if (access_bits & FREAD) 15369 osp->os_share_acc_read--; 15370 if (access_bits & FWRITE) 15371 osp->os_share_acc_write--; 15372 osp->os_share_deny_none--; 15373 nfs4_error_zinit(ep); 15374 goto out; 15375 } 15376 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15377 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15378 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15379 if (needrecov && !isrecov) { 15380 bool_t abort; 15381 nfs4_bseqid_entry_t *bsep = NULL; 15382 15383 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15384 bsep = nfs4_create_bseqid_entry(oop, NULL, 15385 vp, 0, 15386 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15387 open_dg_seqid); 15388 15389 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15390 oop, osp, odg_cred_otw, vp, access_bits, 0); 15391 mutex_exit(&osp->os_sync_lock); 15392 have_sync_lock = 0; 15393 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15394 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15395 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15396 bsep, NULL, NULL); 15397 if (odg_cred_otw) 15398 crfree(odg_cred_otw); 15399 if (bsep) 15400 kmem_free(bsep, sizeof (*bsep)); 15401 15402 if (abort == TRUE) 15403 goto out; 15404 15405 if (did_start_seqid_sync) { 15406 nfs4_end_open_seqid_sync(oop); 15407 did_start_seqid_sync = 0; 15408 } 15409 open_stream_rele(osp, rp); 15410 15411 if (did_start_op) 15412 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15413 &recov_state, FALSE); 15414 if (did_force_recovlock) 15415 nfs_rw_exit(&mi->mi_recovlock); 15416 15417 goto recov_retry; 15418 } else { 15419 if (odg_cred_otw) 15420 crfree(odg_cred_otw); 15421 } 15422 goto out; 15423 } 15424 15425 /* 15426 * If this open stream was created as the results of an open 15427 * while holding a delegation, then just release it; no need 15428 * to do an OTW close. Otherwise do a "normal" OTW close. 
15429 */ 15430 if (osp->os_delegation) { 15431 nfs4close_notw(vp, osp, &have_sync_lock); 15432 nfs4_error_zinit(ep); 15433 goto out; 15434 } 15435 15436 /* 15437 * If this stream is not valid, we're done. 15438 */ 15439 if (!osp->os_valid) { 15440 nfs4_error_zinit(ep); 15441 goto out; 15442 } 15443 15444 /* 15445 * Last open or mmap ref has vanished, need to do an OTW close. 15446 * First check to see if a close is still necessary. 15447 */ 15448 if (osp->os_failed_reopen) { 15449 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15450 "don't close OTW osp %p since reopen failed.", 15451 (void *)osp)); 15452 /* 15453 * Reopen of the open stream failed, hence the 15454 * stateid of the open stream is invalid/stale, and 15455 * sending this OTW would incorrectly cause another 15456 * round of recovery. In this case, we need to set 15457 * the 'os_valid' bit to 0 so another thread doesn't 15458 * come in and re-open this open stream before 15459 * this "closing" thread cleans up state (decrementing 15460 * the nfs4_server_t's state_ref_count and decrementing 15461 * the os_ref_count). 15462 */ 15463 osp->os_valid = 0; 15464 /* 15465 * This removes the reference obtained at OPEN; ie, 15466 * when the open stream structure was created. 15467 * 15468 * We don't have to worry about calling 'open_stream_rele' 15469 * since we our currently holding a reference to this 15470 * open stream which means the count can not go to 0 with 15471 * this decrement. 15472 */ 15473 ASSERT(osp->os_ref_count >= 2); 15474 osp->os_ref_count--; 15475 nfs4_error_zinit(ep); 15476 close_failed = 0; 15477 goto close_cleanup; 15478 } 15479 15480 ASSERT(osp->os_ref_count > 1); 15481 15482 /* 15483 * Sixth, try the CLOSE OTW. 
15484 */ 15485 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15486 close_type, ep, &have_sync_lock); 15487 15488 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15489 /* 15490 * Let the recovery thread be responsible for 15491 * removing the state for CLOSE. 15492 */ 15493 close_failed = 1; 15494 force_close = 0; 15495 retry = 0; 15496 } 15497 15498 /* See if we need to retry with a different cred */ 15499 if ((ep->error == EACCES || 15500 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15501 cred_otw != cr) { 15502 crfree(cred_otw); 15503 cred_otw = cr; 15504 crhold(cred_otw); 15505 retry = 1; 15506 } 15507 15508 if (ep->error || ep->stat) 15509 close_failed = 1; 15510 15511 if (retry && !isrecov && num_retries-- > 0) { 15512 if (have_sync_lock) { 15513 mutex_exit(&osp->os_sync_lock); 15514 have_sync_lock = 0; 15515 } 15516 if (did_start_seqid_sync) { 15517 nfs4_end_open_seqid_sync(oop); 15518 did_start_seqid_sync = 0; 15519 } 15520 open_stream_rele(osp, rp); 15521 15522 if (did_start_op) 15523 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15524 &recov_state, FALSE); 15525 if (did_force_recovlock) 15526 nfs_rw_exit(&mi->mi_recovlock); 15527 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15528 "nfs4close_one: need to retry the close " 15529 "operation")); 15530 goto recov_retry; 15531 } 15532 close_cleanup: 15533 /* 15534 * Seventh and lastly, process our results. 15535 */ 15536 if (close_failed && force_close) { 15537 /* 15538 * It's ok to drop and regrab the 'os_sync_lock' since 15539 * nfs4close_notw() will recheck to make sure the 15540 * "close"/removal of state should happen. 15541 */ 15542 if (!have_sync_lock) { 15543 mutex_enter(&osp->os_sync_lock); 15544 have_sync_lock = 1; 15545 } 15546 /* 15547 * This is last call, remove the ref on the open 15548 * stream created by open and clean everything up. 
15549 */ 15550 osp->os_pending_close = 0; 15551 nfs4close_notw(vp, osp, &have_sync_lock); 15552 nfs4_error_zinit(ep); 15553 } 15554 15555 if (!close_failed) { 15556 if (have_sync_lock) { 15557 osp->os_pending_close = 0; 15558 mutex_exit(&osp->os_sync_lock); 15559 have_sync_lock = 0; 15560 } else { 15561 mutex_enter(&osp->os_sync_lock); 15562 osp->os_pending_close = 0; 15563 mutex_exit(&osp->os_sync_lock); 15564 } 15565 if (did_start_op && recov_state.rs_sp != NULL) { 15566 mutex_enter(&recov_state.rs_sp->s_lock); 15567 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15568 mutex_exit(&recov_state.rs_sp->s_lock); 15569 } else { 15570 nfs4_dec_state_ref_count(mi); 15571 } 15572 nfs4_error_zinit(ep); 15573 } 15574 15575 out: 15576 if (have_sync_lock) 15577 mutex_exit(&osp->os_sync_lock); 15578 if (did_start_op) 15579 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15580 recovonly ? TRUE : FALSE); 15581 if (did_force_recovlock) 15582 nfs_rw_exit(&mi->mi_recovlock); 15583 if (cred_otw) 15584 crfree(cred_otw); 15585 if (osp) 15586 open_stream_rele(osp, rp); 15587 if (oop) { 15588 if (did_start_seqid_sync) 15589 nfs4_end_open_seqid_sync(oop); 15590 open_owner_rele(oop); 15591 } 15592 } 15593 15594 /* 15595 * Convert information returned by the server in the LOCK4denied 15596 * structure to the form required by fcntl. 15597 */ 15598 static void 15599 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args) 15600 { 15601 nfs4_lo_name_t *lo; 15602 15603 #ifdef DEBUG 15604 if (denied_to_flk_debug) { 15605 lockt_denied_debug = lockt_denied; 15606 debug_enter("lockt_denied"); 15607 } 15608 #endif 15609 15610 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK; 15611 flk->l_whence = 0; /* aka SEEK_SET */ 15612 flk->l_start = lockt_denied->offset; 15613 flk->l_len = lockt_denied->length; 15614 15615 /* 15616 * If the blocking clientid matches our client id, then we can 15617 * interpret the lockowner (since we built it). 
If not, then 15618 * fabricate a sysid and pid. Note that the l_sysid field 15619 * in *flk already has the local sysid. 15620 */ 15621 15622 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) { 15623 15624 if (lockt_denied->owner.owner_len == sizeof (*lo)) { 15625 lo = (nfs4_lo_name_t *) 15626 lockt_denied->owner.owner_val; 15627 15628 flk->l_pid = lo->ln_pid; 15629 } else { 15630 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15631 "denied_to_flk: bad lock owner length\n")); 15632 15633 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15634 } 15635 } else { 15636 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15637 "denied_to_flk: foreign clientid\n")); 15638 15639 /* 15640 * Construct a new sysid which should be different from 15641 * sysids of other systems. 15642 */ 15643 15644 flk->l_sysid++; 15645 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15646 } 15647 } 15648 15649 static pid_t 15650 lo_to_pid(lock_owner4 *lop) 15651 { 15652 pid_t pid = 0; 15653 uchar_t *cp; 15654 int i; 15655 15656 cp = (uchar_t *)&lop->clientid; 15657 15658 for (i = 0; i < sizeof (lop->clientid); i++) 15659 pid += (pid_t)*cp++; 15660 15661 cp = (uchar_t *)lop->owner_val; 15662 15663 for (i = 0; i < lop->owner_len; i++) 15664 pid += (pid_t)*cp++; 15665 15666 return (pid); 15667 } 15668 15669 /* 15670 * Given a lock pointer, returns the length of that lock. 15671 * "end" is the last locked offset the "l_len" covers from 15672 * the start of the lock. 15673 */ 15674 static off64_t 15675 lock_to_end(flock64_t *lock) 15676 { 15677 off64_t lock_end; 15678 15679 if (lock->l_len == 0) 15680 lock_end = (off64_t)MAXEND; 15681 else 15682 lock_end = lock->l_start + lock->l_len - 1; 15683 15684 return (lock_end); 15685 } 15686 15687 /* 15688 * Given the end of a lock, it will return you the length "l_len" for that lock. 
15689 */ 15690 static off64_t 15691 end_to_len(off64_t start, off64_t end) 15692 { 15693 off64_t lock_len; 15694 15695 ASSERT(end >= start); 15696 if (end == MAXEND) 15697 lock_len = 0; 15698 else 15699 lock_len = end - start + 1; 15700 15701 return (lock_len); 15702 } 15703 15704 /* 15705 * On given end for a lock it determines if it is the last locked offset 15706 * or not, if so keeps it as is, else adds one to return the length for 15707 * valid start. 15708 */ 15709 static off64_t 15710 start_check(off64_t x) 15711 { 15712 if (x == MAXEND) 15713 return (x); 15714 else 15715 return (x + 1); 15716 } 15717 15718 /* 15719 * See if these two locks overlap, and if so return 1; 15720 * otherwise, return 0. 15721 */ 15722 static int 15723 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15724 { 15725 off64_t llfp_end, curfp_end; 15726 15727 llfp_end = lock_to_end(llfp); 15728 curfp_end = lock_to_end(curfp); 15729 15730 if (((llfp_end >= curfp->l_start) && 15731 (llfp->l_start <= curfp->l_start)) || 15732 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15733 return (1); 15734 return (0); 15735 } 15736 15737 /* 15738 * Determine what the intersecting lock region is, and add that to the 15739 * 'nl_llpp' locklist in increasing order (by l_start). 
15740 */ 15741 static void 15742 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15743 locklist_t **nl_llpp, vnode_t *vp) 15744 { 15745 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15746 off64_t lost_flp_end, local_flp_end, len, start; 15747 15748 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15749 15750 if (!locks_intersect(lost_flp, local_flp)) 15751 return; 15752 15753 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15754 "locks intersect")); 15755 15756 lost_flp_end = lock_to_end(lost_flp); 15757 local_flp_end = lock_to_end(local_flp); 15758 15759 /* Find the starting point of the intersecting region */ 15760 if (local_flp->l_start > lost_flp->l_start) 15761 start = local_flp->l_start; 15762 else 15763 start = lost_flp->l_start; 15764 15765 /* Find the lenght of the intersecting region */ 15766 if (lost_flp_end < local_flp_end) 15767 len = end_to_len(start, lost_flp_end); 15768 else 15769 len = end_to_len(start, local_flp_end); 15770 15771 /* 15772 * Prepare the flock structure for the intersection found and insert 15773 * it into the new list in increasing l_start order. This list contains 15774 * intersections of locks registered by the client with the local host 15775 * and the lost lock. 15776 * The lock type of this lock is the same as that of the local_flp. 
15777 */ 15778 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15779 intersect_llp->ll_flock.l_start = start; 15780 intersect_llp->ll_flock.l_len = len; 15781 intersect_llp->ll_flock.l_type = local_flp->l_type; 15782 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15783 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15784 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15785 intersect_llp->ll_vp = vp; 15786 15787 tmp_fllp = *nl_llpp; 15788 cur_fllp = NULL; 15789 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15790 intersect_llp->ll_flock.l_start) { 15791 cur_fllp = tmp_fllp; 15792 tmp_fllp = tmp_fllp->ll_next; 15793 } 15794 if (cur_fllp == NULL) { 15795 /* first on the list */ 15796 intersect_llp->ll_next = *nl_llpp; 15797 *nl_llpp = intersect_llp; 15798 } else { 15799 intersect_llp->ll_next = cur_fllp->ll_next; 15800 cur_fllp->ll_next = intersect_llp; 15801 } 15802 15803 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15804 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15805 intersect_llp->ll_flock.l_start, 15806 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15807 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15808 } 15809 15810 /* 15811 * Our local locking current state is potentially different than 15812 * what the NFSv4 server thinks we have due to a lost lock that was 15813 * resent and then received. We need to reset our "NFSv4" locking 15814 * state to match the current local locking state for this pid since 15815 * that is what the user/application sees as what the world is. 15816 * 15817 * We cannot afford to drop the open/lock seqid sync since then we can 15818 * get confused about what the current local locking state "is" versus 15819 * "was". 15820 * 15821 * If we are unable to fix up the locks, we send SIGLOST to the affected 15822 * process. 
This is not done if the filesystem has been forcibly 15823 * unmounted, in case the process has already exited and a new process 15824 * exists with the same pid. 15825 */ 15826 static void 15827 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15828 nfs4_lock_owner_t *lop) 15829 { 15830 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15831 mntinfo4_t *mi = VTOMI4(vp); 15832 const int cmd = F_SETLK; 15833 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15834 flock64_t ul_fl; 15835 15836 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15837 "nfs4_reinstitute_local_lock_state")); 15838 15839 /* 15840 * Find active locks for this vp from the local locking code. 15841 * Scan through this list and find out the locks that intersect with 15842 * the lost lock. Once we find the lock that intersects, add the 15843 * intersection area as a new lock to a new list "ri_llp". The lock 15844 * type of the intersection region lock added to ri_llp is the same 15845 * as that found in the active lock list, "list". The intersecting 15846 * region locks are added to ri_llp in increasing l_start order. 15847 */ 15848 ASSERT(nfs_zone() == mi->mi_zone); 15849 15850 locks = flk_active_locks_for_vp(vp); 15851 ri_llp = NULL; 15852 15853 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15854 ASSERT(llp->ll_vp == vp); 15855 /* 15856 * Pick locks that belong to this pid/lockowner 15857 */ 15858 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15859 continue; 15860 15861 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15862 } 15863 15864 /* 15865 * Now we have the list of intersections with the lost lock. These are 15866 * the locks that were/are active before the server replied to the 15867 * last/lost lock. Issue these locks to the server here. Playing these 15868 * locks to the server will re-establish aur current local locking state 15869 * with the v4 server. 15870 * If we get an error, send SIGLOST to the application for that lock. 
15871 */ 15872 15873 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15874 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15875 "nfs4_reinstitute_local_lock_state: need to issue " 15876 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15877 llp->ll_flock.l_start, 15878 llp->ll_flock.l_start + llp->ll_flock.l_len, 15879 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15880 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15881 /* 15882 * No need to relock what we already have 15883 */ 15884 if (llp->ll_flock.l_type == lost_flp->l_type) 15885 continue; 15886 15887 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15888 } 15889 15890 /* 15891 * Now keeping the start of the lost lock as our reference parse the 15892 * newly created ri_llp locklist to find the ranges that we have locked 15893 * with the v4 server but not in the current local locking. We need 15894 * to unlock these ranges. 15895 * These ranges can also be reffered to as those ranges, where the lost 15896 * lock does not overlap with the locks in the ri_llp but are locked 15897 * since the server replied to the lost lock. 
15898 */ 15899 cur_start = lost_flp->l_start; 15900 lost_flp_end = lock_to_end(lost_flp); 15901 15902 ul_fl.l_type = F_UNLCK; 15903 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15904 ul_fl.l_sysid = lost_flp->l_sysid; 15905 ul_fl.l_pid = lost_flp->l_pid; 15906 15907 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15908 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15909 15910 if (llp->ll_flock.l_start <= cur_start) { 15911 cur_start = start_check(llp_ll_flock_end); 15912 continue; 15913 } 15914 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15915 "nfs4_reinstitute_local_lock_state: " 15916 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15917 cur_start, llp->ll_flock.l_start)); 15918 15919 ul_fl.l_start = cur_start; 15920 ul_fl.l_len = end_to_len(cur_start, 15921 (llp->ll_flock.l_start - 1)); 15922 15923 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15924 cur_start = start_check(llp_ll_flock_end); 15925 } 15926 15927 /* 15928 * In the case where the lost lock ends after all intersecting locks, 15929 * unlock the last part of the lost lock range. 15930 */ 15931 if (cur_start != start_check(lost_flp_end)) { 15932 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15933 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15934 "lost lock region [%"PRIx64" - %"PRIx64"]", 15935 cur_start, lost_flp->l_start + lost_flp->l_len)); 15936 15937 ul_fl.l_start = cur_start; 15938 /* 15939 * Is it an to-EOF lock? 
if so unlock till the end 15940 */ 15941 if (lost_flp->l_len == 0) 15942 ul_fl.l_len = 0; 15943 else 15944 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15945 15946 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15947 } 15948 15949 if (locks != NULL) 15950 flk_free_locklist(locks); 15951 15952 /* Free up our newly created locklist */ 15953 for (llp = ri_llp; llp != NULL; ) { 15954 tmp_llp = llp->ll_next; 15955 kmem_free(llp, sizeof (locklist_t)); 15956 llp = tmp_llp; 15957 } 15958 15959 /* 15960 * Now return back to the original calling nfs4frlock() 15961 * and let us naturally drop our seqid syncs. 15962 */ 15963 } 15964 15965 /* 15966 * Create a lost state record for the given lock reinstantiation request 15967 * and push it onto the lost state queue. 15968 */ 15969 static void 15970 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 15971 nfs4_lock_owner_t *lop) 15972 { 15973 nfs4_lost_rqst_t req; 15974 nfs_lock_type4 locktype; 15975 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 15976 15977 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15978 15979 locktype = flk_to_locktype(cmd, flk->l_type); 15980 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 15981 NULL, NULL, lop, flk, &req, cr, vp); 15982 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 15983 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 15984 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 15985 NULL, NULL, NULL); 15986 }