1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 27 */ 28 29 /* 30 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 31 * All Rights Reserved 32 */ 33 34 /* 35 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 36 */ 37 38 #include <sys/param.h> 39 #include <sys/types.h> 40 #include <sys/systm.h> 41 #include <sys/cred.h> 42 #include <sys/time.h> 43 #include <sys/vnode.h> 44 #include <sys/vfs.h> 45 #include <sys/vfs_opreg.h> 46 #include <sys/file.h> 47 #include <sys/filio.h> 48 #include <sys/uio.h> 49 #include <sys/buf.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/dirent.h> 53 #include <sys/debug.h> 54 #include <sys/vmsystm.h> 55 #include <sys/fcntl.h> 56 #include <sys/flock.h> 57 #include <sys/swap.h> 58 #include <sys/errno.h> 59 #include <sys/strsubr.h> 60 #include <sys/sysmacros.h> 61 #include <sys/kmem.h> 62 #include <sys/cmn_err.h> 63 #include <sys/pathconf.h> 64 #include <sys/utsname.h> 65 #include <sys/dnlc.h> 66 #include <sys/acl.h> 67 #include <sys/systeminfo.h> 68 #include <sys/policy.h> 69 #include <sys/sdt.h> 70 #include <sys/list.h> 71 #include <sys/stat.h> 72 #include <sys/zone.h> 73 74 #include <rpc/types.h> 75 #include <rpc/auth.h> 76 #include <rpc/clnt.h> 77 78 #include <nfs/nfs.h> 79 #include <nfs/nfs_clnt.h> 80 #include <nfs/nfs_acl.h> 81 #include <nfs/lm.h> 82 #include <nfs/nfs4.h> 83 #include <nfs/nfs4_kprot.h> 84 #include <nfs/rnode4.h> 85 #include <nfs/nfs4_clnt.h> 86 87 #include <vm/hat.h> 88 #include <vm/as.h> 89 #include <vm/page.h> 90 #include <vm/pvn.h> 91 #include <vm/seg.h> 92 #include <vm/seg_map.h> 93 #include <vm/seg_kpm.h> 94 #include <vm/seg_vn.h> 95 96 #include <fs/fs_subr.h> 97 98 #include <sys/ddi.h> 99 #include <sys/int_fmtio.h> 100 #include <sys/fs/autofs.h> 101 102 typedef struct { 103 nfs4_ga_res_t *di_garp; 104 cred_t *di_cred; 105 hrtime_t di_time_call; 106 } dirattr_info_t; 107 108 typedef enum nfs4_acl_op { 109 NFS4_ACL_GET, 110 NFS4_ACL_SET 111 } nfs4_acl_op_t; 112 113 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi); 114 115 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 116 char *, dirattr_info_t *); 117 118 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 119 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 120 nfs4_error_t *, int *); 121 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 122 cred_t *); 123 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 124 stable_how4 *); 125 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 
126 cred_t *, bool_t, struct uio *); 127 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 128 vsecattr_t *); 129 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 130 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 131 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 132 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 133 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 134 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 135 int, vnode_t **, cred_t *); 136 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 137 cred_t *, int, int, enum createmode4, int); 138 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 139 caller_context_t *); 140 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 141 vnode_t *, char *, cred_t *, nfsstat4 *); 142 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 143 vnode_t *, char *, cred_t *, nfsstat4 *); 144 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 145 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 146 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 147 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 148 page_t *[], size_t, struct seg *, caddr_t, 149 enum seg_rw, cred_t *); 150 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 151 cred_t *); 152 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 153 int, cred_t *); 154 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 155 int, cred_t *); 156 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 157 static void nfs4_set_mod(vnode_t *); 158 static void nfs4_get_commit(vnode_t *); 159 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 160 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 161 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 162 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 163 cred_t *); 164 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 165 cred_t *); 166 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 167 hrtime_t, vnode_t *, cred_t *); 168 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 169 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 170 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 171 u_offset_t); 172 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 173 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 174 static cred_t *state_to_cred(nfs4_open_stream_t *); 175 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 176 static pid_t lo_to_pid(lock_owner4 *); 177 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 178 cred_t *, nfs4_lock_owner_t *); 179 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 180 nfs4_lock_owner_t *); 181 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 182 static void nfs4_delmap_callback(struct as *, void *, uint_t); 183 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 184 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 185 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 186 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 187 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 188 uid_t, gid_t, 
int); 189 190 /* 191 * Routines that implement the setting of v4 args for the misc. ops 192 */ 193 static void nfs4args_lock_free(nfs_argop4 *); 194 static void nfs4args_lockt_free(nfs_argop4 *); 195 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 196 int, rnode4_t *, cred_t *, bitmap4, int *, 197 nfs4_stateid_types_t *); 198 static void nfs4args_setattr_free(nfs_argop4 *); 199 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 200 bitmap4); 201 static void nfs4args_verify_free(nfs_argop4 *); 202 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 203 WRITE4args **, nfs4_stateid_types_t *); 204 205 /* 206 * These are the vnode ops functions that implement the vnode interface to 207 * the networked file system. See more comments below at nfs4_vnodeops. 208 */ 209 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *); 210 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *, 211 caller_context_t *); 212 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 213 caller_context_t *); 214 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 215 caller_context_t *); 216 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 217 caller_context_t *); 218 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 219 caller_context_t *); 220 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *); 221 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *, 222 caller_context_t *); 223 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *); 224 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 225 int, vnode_t **, cred_t *, int, caller_context_t *, 226 vsecattr_t *); 227 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *, 228 int); 229 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *, 230 caller_context_t *, int); 231 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 232 caller_context_t *, int); 233 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, 234 cred_t *, caller_context_t *, int, vsecattr_t *); 235 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 236 caller_context_t *, int); 237 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 238 cred_t *, caller_context_t *, int); 239 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *, 240 caller_context_t *, int); 241 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 242 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 243 page_t *[], size_t, struct seg *, caddr_t, 244 enum seg_rw, cred_t *, caller_context_t *); 245 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 246 caller_context_t *); 247 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 248 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 249 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 250 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 251 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *); 252 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 253 struct flk_callback *, cred_t *, caller_context_t *); 254 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 255 cred_t *, caller_context_t *); 256 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 257 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 258 
static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 259 cred_t *, caller_context_t *); 260 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *, 261 caller_context_t *); 262 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 263 caller_context_t *); 264 /* 265 * These vnode ops are required to be called from outside this source file, 266 * e.g. by ephemeral mount stub vnode ops, and so may not be declared 267 * as static. 268 */ 269 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 270 caller_context_t *); 271 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 272 int nfs4_lookup(vnode_t *, char *, vnode_t **, 273 struct pathname *, int, vnode_t *, cred_t *, 274 caller_context_t *, int *, pathname_t *); 275 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 276 int nfs4_rwlock(vnode_t *, int, caller_context_t *); 277 void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 278 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 279 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 280 caller_context_t *); 281 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 282 caller_context_t *); 283 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 284 caller_context_t *); 285 286 /* 287 * Used for nfs4_commit_vp() to indicate if we should 288 * wait on pending writes. 289 */ 290 #define NFS4_WRITE_NOWAIT 0 291 #define NFS4_WRITE_WAIT 1 292 293 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 294 295 /* 296 * Error flags used to pass information about certain special errors 297 * which need to be handled specially. 298 */ 299 #define NFS_EOF -98 300 #define NFS_VERF_MISMATCH -97 301 302 /* 303 * Flags used to differentiate between which operation drove the 304 * potential CLOSE OTW. 
(see nfs4_close_otw_if_necessary) 305 */ 306 #define NFS4_CLOSE_OP 0x1 307 #define NFS4_DELMAP_OP 0x2 308 #define NFS4_INACTIVE_OP 0x3 309 310 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 311 312 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 313 #define ALIGN64(x, ptr, sz) \ 314 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 315 if (x) { \ 316 x = sizeof (uint64_t) - (x); \ 317 sz -= (x); \ 318 ptr += (x); \ 319 } 320 321 #ifdef DEBUG 322 int nfs4_client_attr_debug = 0; 323 int nfs4_client_state_debug = 0; 324 int nfs4_client_shadow_debug = 0; 325 int nfs4_client_lock_debug = 0; 326 int nfs4_seqid_sync = 0; 327 int nfs4_client_map_debug = 0; 328 static int nfs4_pageio_debug = 0; 329 int nfs4_client_inactive_debug = 0; 330 int nfs4_client_recov_debug = 0; 331 int nfs4_client_failover_debug = 0; 332 int nfs4_client_call_debug = 0; 333 int nfs4_client_lookup_debug = 0; 334 int nfs4_client_zone_debug = 0; 335 int nfs4_lost_rqst_debug = 0; 336 int nfs4_rdattrerr_debug = 0; 337 int nfs4_open_stream_debug = 0; 338 339 int nfs4read_error_inject; 340 341 static int nfs4_create_misses = 0; 342 343 static int nfs4_readdir_cache_shorts = 0; 344 static int nfs4_readdir_readahead = 0; 345 346 static int nfs4_bio_do_stop = 0; 347 348 static int nfs4_lostpage = 0; /* number of times we lost original page */ 349 350 int nfs4_mmap_debug = 0; 351 352 static int nfs4_pathconf_cache_hits = 0; 353 static int nfs4_pathconf_cache_misses = 0; 354 355 int nfs4close_all_cnt; 356 int nfs4close_one_debug = 0; 357 int nfs4close_notw_debug = 0; 358 359 int denied_to_flk_debug = 0; 360 void *lockt_denied_debug; 361 362 #endif 363 364 /* 365 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 366 * or NFS4ERR_RESOURCE. 367 */ 368 static int confirm_retry_sec = 30; 369 370 static int nfs4_lookup_neg_cache = 1; 371 372 /* 373 * number of pages to read ahead 374 * optimized for 100 base-T. 375 */ 376 static int nfs4_nra = 4; 377 378 static int nfs4_do_symlink_cache = 1; 379 380 static int nfs4_pathconf_disable_cache = 0; 381 382 /* 383 * These are the vnode ops routines which implement the vnode interface to 384 * the networked file system. These routines just take their parameters, 385 * make them look networkish by putting the right info into interface structs, 386 * and then calling the appropriate remote routine(s) to do the work. 387 * 388 * Note on directory name lookup cacheing: If we detect a stale fhandle, 389 * we purge the directory cache relative to that vnode. This way, the 390 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 391 * more details on rnode locking. 
392 */ 393 394 struct vnodeops *nfs4_vnodeops; 395 396 const fs_operation_def_t nfs4_vnodeops_template[] = { 397 VOPNAME_OPEN, { .vop_open = nfs4_open }, 398 VOPNAME_CLOSE, { .vop_close = nfs4_close }, 399 VOPNAME_READ, { .vop_read = nfs4_read }, 400 VOPNAME_WRITE, { .vop_write = nfs4_write }, 401 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl }, 402 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr }, 403 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr }, 404 VOPNAME_ACCESS, { .vop_access = nfs4_access }, 405 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup }, 406 VOPNAME_CREATE, { .vop_create = nfs4_create }, 407 VOPNAME_REMOVE, { .vop_remove = nfs4_remove }, 408 VOPNAME_LINK, { .vop_link = nfs4_link }, 409 VOPNAME_RENAME, { .vop_rename = nfs4_rename }, 410 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir }, 411 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir }, 412 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir }, 413 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink }, 414 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink }, 415 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync }, 416 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 417 VOPNAME_FID, { .vop_fid = nfs4_fid }, 418 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 419 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 420 VOPNAME_SEEK, { .vop_seek = nfs4_seek }, 421 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock }, 422 VOPNAME_SPACE, { .vop_space = nfs4_space }, 423 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 424 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage }, 425 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage }, 426 VOPNAME_MAP, { .vop_map = nfs4_map }, 427 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap }, 428 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap }, 429 /* no separate nfs4_dump */ 430 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 431 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 432 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio }, 433 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose }, 434 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr }, 435 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 436 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock }, 437 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 438 NULL, NULL 439 }; 440 441 /* 442 * The following are subroutines and definitions to set args or get res 443 * for the different nfsv4 ops 444 */ 445 446 void 447 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 448 { 449 int i; 450 451 for (i = 0; i < arglen; i++) { 452 if (argop[i].argop == OP_LOOKUP) { 453 kmem_free( 454 argop[i].nfs_argop4_u.oplookup. 455 objname.utf8string_val, 456 argop[i].nfs_argop4_u.oplookup. 
457 objname.utf8string_len); 458 } 459 } 460 } 461 462 static void 463 nfs4args_lock_free(nfs_argop4 *argop) 464 { 465 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 466 467 if (locker->new_lock_owner == TRUE) { 468 open_to_lock_owner4 *open_owner; 469 470 open_owner = &locker->locker4_u.open_owner; 471 if (open_owner->lock_owner.owner_val != NULL) { 472 kmem_free(open_owner->lock_owner.owner_val, 473 open_owner->lock_owner.owner_len); 474 } 475 } 476 } 477 478 static void 479 nfs4args_lockt_free(nfs_argop4 *argop) 480 { 481 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 482 483 if (lowner->owner_val != NULL) { 484 kmem_free(lowner->owner_val, lowner->owner_len); 485 } 486 } 487 488 static void 489 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 490 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 491 nfs4_stateid_types_t *sid_types) 492 { 493 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 494 mntinfo4_t *mi; 495 496 argop->argop = OP_SETATTR; 497 /* 498 * The stateid is set to 0 if client is not modifying the size 499 * and otherwise to whatever nfs4_get_stateid() returns. 500 * 501 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 502 * state struct could be found for the process/file pair. We may 503 * want to change this in the future (by OPENing the file). See 504 * bug # 4474852. 505 */ 506 if (vap->va_mask & AT_SIZE) { 507 508 ASSERT(rp != NULL); 509 mi = VTOMI4(RTOV4(rp)); 510 511 argop->nfs_argop4_u.opsetattr.stateid = 512 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 513 OP_SETATTR, sid_types, FALSE); 514 } else { 515 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 516 sizeof (stateid4)); 517 } 518 519 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 520 if (*error) 521 bzero(attr, sizeof (*attr)); 522 } 523 524 static void 525 nfs4args_setattr_free(nfs_argop4 *argop) 526 { 527 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 528 } 529 530 static int 531 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 532 bitmap4 supp) 533 { 534 fattr4 *attr; 535 int error = 0; 536 537 argop->argop = op; 538 switch (op) { 539 case OP_VERIFY: 540 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 541 break; 542 case OP_NVERIFY: 543 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 544 break; 545 default: 546 return (EINVAL); 547 } 548 if (!error) 549 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 550 if (error) 551 bzero(attr, sizeof (*attr)); 552 return (error); 553 } 554 555 static void 556 nfs4args_verify_free(nfs_argop4 *argop) 557 { 558 switch (argop->argop) { 559 case OP_VERIFY: 560 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 561 break; 562 case OP_NVERIFY: 563 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 564 break; 565 default: 566 break; 567 } 568 } 569 570 static void 571 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 572 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 573 { 574 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 575 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 576 577 argop->argop = OP_WRITE; 578 wargs->stable = stable; 579 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 580 mi, OP_WRITE, sid_tp); 581 wargs->mblk = NULL; 582 *wargs_pp = wargs; 583 } 584 585 void 586 nfs4args_copen_free(OPEN4cargs *open_args) 587 { 588 if (open_args->owner.owner_val) { 589 kmem_free(open_args->owner.owner_val, 590 open_args->owner.owner_len); 
591 } 592 if ((open_args->opentype == OPEN4_CREATE) && 593 (open_args->mode != EXCLUSIVE4)) { 594 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 595 } 596 } 597 598 /* 599 * XXX: This is referenced in modstubs.s 600 */ 601 struct vnodeops * 602 nfs4_getvnodeops(void) 603 { 604 return (nfs4_vnodeops); 605 } 606 607 /* 608 * The OPEN operation opens a regular file. 609 */ 610 /*ARGSUSED3*/ 611 static int 612 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 613 { 614 vnode_t *dvp = NULL; 615 rnode4_t *rp, *drp; 616 int error; 617 int just_been_created; 618 char fn[MAXNAMELEN]; 619 620 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 621 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 622 return (EIO); 623 rp = VTOR4(*vpp); 624 625 /* 626 * Check to see if opening something besides a regular file; 627 * if so skip the OTW call 628 */ 629 if ((*vpp)->v_type != VREG) { 630 error = nfs4_open_non_reg_file(vpp, flag, cr); 631 return (error); 632 } 633 634 /* 635 * XXX - would like a check right here to know if the file is 636 * executable or not, so as to skip OTW 637 */ 638 639 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 640 return (error); 641 642 drp = VTOR4(dvp); 643 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 644 return (EINTR); 645 646 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 647 nfs_rw_exit(&drp->r_rwlock); 648 return (error); 649 } 650 651 /* 652 * See if this file has just been CREATEd. 653 * If so, clear the flag and update the dnlc, which was previously 654 * skipped in nfs4_create. 655 * XXX need better serilization on this. 656 * XXX move this into the nf4open_otw call, after we have 657 * XXX acquired the open owner seqid sync. 658 */ 659 mutex_enter(&rp->r_statev4_lock); 660 if (rp->created_v4) { 661 rp->created_v4 = 0; 662 mutex_exit(&rp->r_statev4_lock); 663 664 dnlc_update(dvp, fn, *vpp); 665 /* This is needed so we don't bump the open ref count */ 666 just_been_created = 1; 667 } else { 668 mutex_exit(&rp->r_statev4_lock); 669 just_been_created = 0; 670 } 671 672 /* 673 * If caller specified O_TRUNC/FTRUNC, then be sure to set 674 * FWRITE (to drive successful setattr(size=0) after open) 675 */ 676 if (flag & FTRUNC) 677 flag |= FWRITE; 678 679 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 680 just_been_created); 681 682 if (!error && !((*vpp)->v_flag & VROOT)) 683 dnlc_update(dvp, fn, *vpp); 684 685 nfs_rw_exit(&drp->r_rwlock); 686 687 /* release the hold from vtodv */ 688 VN_RELE(dvp); 689 690 /* exchange the shadow for the master vnode, if needed */ 691 692 if (error == 0 && IS_SHADOW(*vpp, rp)) 693 sv_exchange(vpp); 694 695 return (error); 696 } 697 698 /* 699 * See if there's a "lost open" request to be saved and recovered. 700 */ 701 static void 702 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 703 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 704 vnode_t *dvp, OPEN4cargs *open_args) 705 { 706 vfs_t *vfsp; 707 char *srccfp; 708 709 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 710 711 if (error != ETIMEDOUT && error != EINTR && 712 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 713 lost_rqstp->lr_op = 0; 714 return; 715 } 716 717 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 718 "nfs4open_save_lost_rqst: error %d", error)); 719 720 lost_rqstp->lr_op = OP_OPEN; 721 722 /* 723 * The vp (if it is not NULL) and dvp are held and rele'd via 724 * the recovery code. See nfs4_save_lost_rqst. 
725 */ 726 lost_rqstp->lr_vp = vp; 727 lost_rqstp->lr_dvp = dvp; 728 lost_rqstp->lr_oop = oop; 729 lost_rqstp->lr_osp = NULL; 730 lost_rqstp->lr_lop = NULL; 731 lost_rqstp->lr_cr = cr; 732 lost_rqstp->lr_flk = NULL; 733 lost_rqstp->lr_oacc = open_args->share_access; 734 lost_rqstp->lr_odeny = open_args->share_deny; 735 lost_rqstp->lr_oclaim = open_args->claim; 736 if (open_args->claim == CLAIM_DELEGATE_CUR) { 737 lost_rqstp->lr_ostateid = 738 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 739 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 740 } else { 741 srccfp = open_args->open_claim4_u.cfile; 742 } 743 lost_rqstp->lr_ofile.utf8string_len = 0; 744 lost_rqstp->lr_ofile.utf8string_val = NULL; 745 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 746 lost_rqstp->lr_putfirst = FALSE; 747 } 748 749 struct nfs4_excl_time { 750 uint32 seconds; 751 uint32 nseconds; 752 }; 753 754 /* 755 * The OPEN operation creates and/or opens a regular file 756 * 757 * ARGSUSED 758 */ 759 static int 760 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 761 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 762 enum createmode4 createmode, int file_just_been_created) 763 { 764 rnode4_t *rp; 765 rnode4_t *drp = VTOR4(dvp); 766 vnode_t *vp = NULL; 767 vnode_t *vpi = *vpp; 768 bool_t needrecov = FALSE; 769 770 int doqueue = 1; 771 772 COMPOUND4args_clnt args; 773 COMPOUND4res_clnt res; 774 nfs_argop4 *argop; 775 nfs_resop4 *resop; 776 int argoplist_size; 777 int idx_open, idx_fattr; 778 779 GETFH4res *gf_res = NULL; 780 OPEN4res *op_res = NULL; 781 nfs4_ga_res_t *garp; 782 fattr4 *attr = NULL; 783 struct nfs4_excl_time verf; 784 bool_t did_excl_setup = FALSE; 785 int created_osp; 786 787 OPEN4cargs *open_args; 788 nfs4_open_owner_t *oop = NULL; 789 nfs4_open_stream_t *osp = NULL; 790 seqid4 seqid = 0; 791 bool_t retry_open = FALSE; 792 nfs4_recov_state_t recov_state; 793 nfs4_lost_rqst_t lost_rqst; 794 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 795 hrtime_t t; 796 int acc = 0; 797 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 798 cred_t *ncr = NULL; 799 800 nfs4_sharedfh_t *otw_sfh; 801 nfs4_sharedfh_t *orig_sfh; 802 int fh_differs = 0; 803 int numops, setgid_flag; 804 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 805 806 /* 807 * Make sure we properly deal with setting the right gid on 808 * a newly created file to reflect the parent's setgid bit 809 */ 810 setgid_flag = 0; 811 if (create_flag && in_va) { 812 813 /* 814 * If there is grpid mount flag used or 815 * the parent's directory has the setgid bit set 816 * _and_ the client was able to get a valid mapping 817 * for the parent dir's owner_group, we want to 818 * append NVERIFY(owner_group == dva.va_gid) and 819 * SETATTR to the CREATE compound. 
820 */ 821 mutex_enter(&drp->r_statelock); 822 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID || 823 drp->r_attr.va_mode & VSGID) && 824 drp->r_attr.va_gid != GID_NOBODY) { 825 in_va->va_mask |= AT_GID; 826 in_va->va_gid = drp->r_attr.va_gid; 827 setgid_flag = 1; 828 } 829 mutex_exit(&drp->r_statelock); 830 } 831 832 /* 833 * Normal/non-create compound: 834 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 835 * 836 * Open(create) compound no setgid: 837 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 838 * RESTOREFH + GETATTR 839 * 840 * Open(create) setgid: 841 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 842 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 843 * NVERIFY(grp) + SETATTR 844 */ 845 if (setgid_flag) { 846 numops = 10; 847 idx_open = 1; 848 idx_fattr = 3; 849 } else if (create_flag) { 850 numops = 7; 851 idx_open = 2; 852 idx_fattr = 4; 853 } else { 854 numops = 4; 855 idx_open = 1; 856 idx_fattr = 3; 857 } 858 859 args.array_len = numops; 860 argoplist_size = numops * sizeof (nfs_argop4); 861 argop = kmem_alloc(argoplist_size, KM_SLEEP); 862 863 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 864 "open %s open flag 0x%x cred %p", file_name, open_flag, 865 (void *)cr)); 866 867 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 868 if (create_flag) { 869 /* 870 * We are to create a file. Initialize the passed in vnode 871 * pointer. 872 */ 873 vpi = NULL; 874 } else { 875 /* 876 * Check to see if the client owns a read delegation and is 877 * trying to open for write. If so, then return the delegation 878 * to avoid the server doing a cb_recall and returning DELAY. 879 * NB - we don't use the statev4_lock here because we'd have 880 * to drop the lock anyway and the result would be stale. 881 */ 882 if ((open_flag & FWRITE) && 883 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 884 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 885 886 /* 887 * If the file has a delegation, then do an access check up 888 * front. This avoids having to an access check later after 889 * we've already done start_op, which could deadlock. 
890 */ 891 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 892 if (open_flag & FREAD && 893 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0) 894 acc |= VREAD; 895 if (open_flag & FWRITE && 896 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0) 897 acc |= VWRITE; 898 } 899 } 900 901 drp = VTOR4(dvp); 902 903 recov_state.rs_flags = 0; 904 recov_state.rs_num_retry_despite_err = 0; 905 cred_otw = cr; 906 907 recov_retry: 908 fh_differs = 0; 909 nfs4_error_zinit(&e); 910 911 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 912 if (e.error) { 913 if (ncr != NULL) 914 crfree(ncr); 915 kmem_free(argop, argoplist_size); 916 return (e.error); 917 } 918 919 args.ctag = TAG_OPEN; 920 args.array_len = numops; 921 args.array = argop; 922 923 /* putfh directory fh */ 924 argop[0].argop = OP_CPUTFH; 925 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 926 927 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 928 argop[idx_open].argop = OP_COPEN; 929 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 930 open_args->claim = CLAIM_NULL; 931 932 /* name of file */ 933 open_args->open_claim4_u.cfile = file_name; 934 open_args->owner.owner_len = 0; 935 open_args->owner.owner_val = NULL; 936 937 if (create_flag) { 938 /* CREATE a file */ 939 open_args->opentype = OPEN4_CREATE; 940 open_args->mode = createmode; 941 if (createmode == EXCLUSIVE4) { 942 if (did_excl_setup == FALSE) { 943 verf.seconds = zone_get_hostid(NULL); 944 if (verf.seconds != 0) 945 verf.nseconds = newnum(); 946 else { 947 timestruc_t now; 948 949 gethrestime(&now); 950 verf.seconds = now.tv_sec; 951 verf.nseconds = now.tv_nsec; 952 } 953 /* 954 * Since the server will use this value for the 955 * mtime, make sure that it can't overflow. Zero 956 * out the MSB. The actual value does not matter 957 * here, only its uniqeness. 958 */ 959 verf.seconds &= INT32_MAX; 960 did_excl_setup = TRUE; 961 } 962 963 /* Now copy over verifier to OPEN4args. */ 964 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 965 } else { 966 int v_error; 967 bitmap4 supp_attrs; 968 servinfo4_t *svp; 969 970 attr = &open_args->createhow4_u.createattrs; 971 972 svp = drp->r_server; 973 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 974 supp_attrs = svp->sv_supp_attrs; 975 nfs_rw_exit(&svp->sv_lock); 976 977 /* GUARDED4 or UNCHECKED4 */ 978 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 979 supp_attrs); 980 if (v_error) { 981 bzero(attr, sizeof (*attr)); 982 nfs4args_copen_free(open_args); 983 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 984 &recov_state, FALSE); 985 if (ncr != NULL) 986 crfree(ncr); 987 kmem_free(argop, argoplist_size); 988 return (v_error); 989 } 990 } 991 } else { 992 /* NO CREATE */ 993 open_args->opentype = OPEN4_NOCREATE; 994 } 995 996 if (recov_state.rs_sp != NULL) { 997 mutex_enter(&recov_state.rs_sp->s_lock); 998 open_args->owner.clientid = recov_state.rs_sp->clientid; 999 mutex_exit(&recov_state.rs_sp->s_lock); 1000 } else { 1001 /* XXX should we just fail here? */ 1002 open_args->owner.clientid = 0; 1003 } 1004 1005 /* 1006 * This increments oop's ref count or creates a temporary 'just_created' 1007 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 1008 * completes. 1009 */ 1010 mutex_enter(&VTOMI4(dvp)->mi_lock); 1011 1012 /* See if a permanent or just created open owner exists */ 1013 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 1014 if (!oop) { 1015 /* 1016 * This open owner does not exist so create a temporary 1017 * just created one. 
1018 */ 1019 oop = create_open_owner(cr, VTOMI4(dvp)); 1020 ASSERT(oop != NULL); 1021 } 1022 mutex_exit(&VTOMI4(dvp)->mi_lock); 1023 1024 /* this length never changes, do alloc before seqid sync */ 1025 open_args->owner.owner_len = sizeof (oop->oo_name); 1026 open_args->owner.owner_val = 1027 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1028 1029 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 1030 if (e.error == EAGAIN) { 1031 open_owner_rele(oop); 1032 nfs4args_copen_free(open_args); 1033 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1034 if (ncr != NULL) { 1035 crfree(ncr); 1036 ncr = NULL; 1037 } 1038 goto recov_retry; 1039 } 1040 1041 /* Check to see if we need to do the OTW call */ 1042 if (!create_flag) { 1043 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1044 file_just_been_created, &e.error, acc, &recov_state)) { 1045 1046 /* 1047 * The OTW open is not necessary. Either 1048 * the open can succeed without it (eg. 1049 * delegation, error == 0) or the open 1050 * must fail due to an access failure 1051 * (error != 0). In either case, tidy 1052 * up and return. 1053 */ 1054 1055 nfs4_end_open_seqid_sync(oop); 1056 open_owner_rele(oop); 1057 nfs4args_copen_free(open_args); 1058 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1059 if (ncr != NULL) 1060 crfree(ncr); 1061 kmem_free(argop, argoplist_size); 1062 return (e.error); 1063 } 1064 } 1065 1066 bcopy(&oop->oo_name, open_args->owner.owner_val, 1067 open_args->owner.owner_len); 1068 1069 seqid = nfs4_get_open_seqid(oop) + 1; 1070 open_args->seqid = seqid; 1071 open_args->share_access = 0; 1072 if (open_flag & FREAD) 1073 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1074 if (open_flag & FWRITE) 1075 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1076 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1077 1078 1079 1080 /* 1081 * getfh w/sanity check for idx_open/idx_fattr 1082 */ 1083 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1084 argop[idx_open + 1].argop = OP_GETFH; 1085 1086 /* getattr */ 1087 argop[idx_fattr].argop = OP_GETATTR; 1088 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1089 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1090 1091 if (setgid_flag) { 1092 vattr_t _v; 1093 servinfo4_t *svp; 1094 bitmap4 supp_attrs; 1095 1096 svp = drp->r_server; 1097 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1098 supp_attrs = svp->sv_supp_attrs; 1099 nfs_rw_exit(&svp->sv_lock); 1100 1101 /* 1102 * For setgid case, we need to: 1103 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1104 */ 1105 argop[4].argop = OP_SAVEFH; 1106 1107 argop[5].argop = OP_CPUTFH; 1108 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1109 1110 argop[6].argop = OP_GETATTR; 1111 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1112 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1113 1114 argop[7].argop = OP_RESTOREFH; 1115 1116 /* 1117 * nverify 1118 */ 1119 _v.va_mask = AT_GID; 1120 _v.va_gid = in_va->va_gid; 1121 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1122 supp_attrs))) { 1123 1124 /* 1125 * setattr 1126 * 1127 * We _know_ we're not messing with AT_SIZE or 1128 * AT_XTIME, so no need for stateid or flags. 1129 * Also we specify NULL rp since we're only 1130 * interested in setting owner_group attributes. 
1131 */ 1132 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1133 supp_attrs, &e.error, 0); 1134 if (e.error) 1135 nfs4args_verify_free(&argop[8]); 1136 } 1137 1138 if (e.error) { 1139 /* 1140 * XXX - Revisit the last argument to nfs4_end_op() 1141 * once 5020486 is fixed. 1142 */ 1143 nfs4_end_open_seqid_sync(oop); 1144 open_owner_rele(oop); 1145 nfs4args_copen_free(open_args); 1146 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1147 if (ncr != NULL) 1148 crfree(ncr); 1149 kmem_free(argop, argoplist_size); 1150 return (e.error); 1151 } 1152 } else if (create_flag) { 1153 argop[1].argop = OP_SAVEFH; 1154 1155 argop[5].argop = OP_RESTOREFH; 1156 1157 argop[6].argop = OP_GETATTR; 1158 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1159 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1160 } 1161 1162 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1163 "nfs4open_otw: %s call, nm %s, rp %s", 1164 needrecov ? "recov" : "first", file_name, 1165 rnode4info(VTOR4(dvp)))); 1166 1167 t = gethrtime(); 1168 1169 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1170 1171 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1172 nfs4_set_open_seqid(seqid, oop, args.ctag); 1173 1174 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1175 1176 if (e.error || needrecov) { 1177 bool_t abort = FALSE; 1178 1179 if (needrecov) { 1180 nfs4_bseqid_entry_t *bsep = NULL; 1181 1182 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1183 cred_otw, vpi, dvp, open_args); 1184 1185 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1186 bsep = nfs4_create_bseqid_entry(oop, NULL, 1187 vpi, 0, args.ctag, open_args->seqid); 1188 num_bseqid_retry--; 1189 } 1190 1191 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1192 NULL, lost_rqst.lr_op == OP_OPEN ? 1193 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL); 1194 1195 if (bsep) 1196 kmem_free(bsep, sizeof (*bsep)); 1197 /* give up if we keep getting BAD_SEQID */ 1198 if (num_bseqid_retry == 0) 1199 abort = TRUE; 1200 if (abort == TRUE && e.error == 0) 1201 e.error = geterrno4(res.status); 1202 } 1203 nfs4_end_open_seqid_sync(oop); 1204 open_owner_rele(oop); 1205 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1206 nfs4args_copen_free(open_args); 1207 if (setgid_flag) { 1208 nfs4args_verify_free(&argop[8]); 1209 nfs4args_setattr_free(&argop[9]); 1210 } 1211 if (!e.error) 1212 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1213 if (ncr != NULL) { 1214 crfree(ncr); 1215 ncr = NULL; 1216 } 1217 if (!needrecov || abort == TRUE || e.error == EINTR || 1218 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1219 kmem_free(argop, argoplist_size); 1220 return (e.error); 1221 } 1222 goto recov_retry; 1223 } 1224 1225 /* 1226 * Will check and update lease after checking the rflag for 1227 * OPEN_CONFIRM in the successful OPEN call. 1228 */ 1229 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1230 1231 /* 1232 * XXX what if we're crossing mount points from server1:/drp 1233 * to server2:/drp/rp. 1234 */ 1235 1236 /* Signal our end of use of the open seqid */ 1237 nfs4_end_open_seqid_sync(oop); 1238 1239 /* 1240 * This will destroy the open owner if it was just created, 1241 * and no one else has put a reference on it. 
1242 */ 1243 open_owner_rele(oop); 1244 if (create_flag && (createmode != EXCLUSIVE4) && 1245 res.status == NFS4ERR_BADOWNER) 1246 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1247 1248 e.error = geterrno4(res.status); 1249 nfs4args_copen_free(open_args); 1250 if (setgid_flag) { 1251 nfs4args_verify_free(&argop[8]); 1252 nfs4args_setattr_free(&argop[9]); 1253 } 1254 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1255 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1256 /* 1257 * If the reply is NFS4ERR_ACCESS, it may be because 1258 * we are root (no root net access). If the real uid 1259 * is not root, then retry with the real uid instead. 1260 */ 1261 if (ncr != NULL) { 1262 crfree(ncr); 1263 ncr = NULL; 1264 } 1265 if (res.status == NFS4ERR_ACCESS && 1266 (ncr = crnetadjust(cred_otw)) != NULL) { 1267 cred_otw = ncr; 1268 goto recov_retry; 1269 } 1270 kmem_free(argop, argoplist_size); 1271 return (e.error); 1272 } 1273 1274 resop = &res.array[idx_open]; /* open res */ 1275 op_res = &resop->nfs_resop4_u.opopen; 1276 1277 #ifdef DEBUG 1278 /* 1279 * verify attrset bitmap 1280 */ 1281 if (create_flag && 1282 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1283 /* make sure attrset returned is what we asked for */ 1284 /* XXX Ignore this 'error' for now */ 1285 if (attr->attrmask != op_res->attrset) 1286 /* EMPTY */; 1287 } 1288 #endif 1289 1290 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1291 mutex_enter(&VTOMI4(dvp)->mi_lock); 1292 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1293 mutex_exit(&VTOMI4(dvp)->mi_lock); 1294 } 1295 1296 resop = &res.array[idx_open + 1]; /* getfh res */ 1297 gf_res = &resop->nfs_resop4_u.opgetfh; 1298 1299 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1300 1301 /* 1302 * The open stateid has been updated on the server but not 1303 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1304 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1305 * WRITE call. That, however, will use the old stateid, so go ahead 1306 * and upate the open stateid now, before any call to makenfs4node. 1307 */ 1308 if (vpi) { 1309 nfs4_open_stream_t *tmp_osp; 1310 rnode4_t *tmp_rp = VTOR4(vpi); 1311 1312 tmp_osp = find_open_stream(oop, tmp_rp); 1313 if (tmp_osp) { 1314 tmp_osp->open_stateid = op_res->stateid; 1315 mutex_exit(&tmp_osp->os_sync_lock); 1316 open_stream_rele(tmp_osp, tmp_rp); 1317 } 1318 1319 /* 1320 * We must determine if the file handle given by the otw open 1321 * is the same as the file handle which was passed in with 1322 * *vpp. This case can be reached if the file we are trying 1323 * to open has been removed and another file has been created 1324 * having the same file name. The passed in vnode is released 1325 * later. 1326 */ 1327 orig_sfh = VTOR4(vpi)->r_fh; 1328 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1329 } 1330 1331 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1332 1333 if (create_flag || fh_differs) { 1334 int rnode_err = 0; 1335 1336 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1337 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh)); 1338 1339 if (e.error) 1340 PURGE_ATTRCACHE4(vp); 1341 /* 1342 * For the newly created vp case, make sure the rnode 1343 * isn't bad before using it. 
1344 */ 1345 mutex_enter(&(VTOR4(vp))->r_statelock); 1346 if (VTOR4(vp)->r_flags & R4RECOVERR) 1347 rnode_err = EIO; 1348 mutex_exit(&(VTOR4(vp))->r_statelock); 1349 1350 if (rnode_err) { 1351 nfs4_end_open_seqid_sync(oop); 1352 nfs4args_copen_free(open_args); 1353 if (setgid_flag) { 1354 nfs4args_verify_free(&argop[8]); 1355 nfs4args_setattr_free(&argop[9]); 1356 } 1357 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1358 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1359 needrecov); 1360 open_owner_rele(oop); 1361 VN_RELE(vp); 1362 if (ncr != NULL) 1363 crfree(ncr); 1364 sfh4_rele(&otw_sfh); 1365 kmem_free(argop, argoplist_size); 1366 return (EIO); 1367 } 1368 } else { 1369 vp = vpi; 1370 } 1371 sfh4_rele(&otw_sfh); 1372 1373 /* 1374 * It seems odd to get a full set of attrs and then not update 1375 * the object's attrcache in the non-create case. Create case uses 1376 * the attrs since makenfs4node checks to see if the attrs need to 1377 * be updated (and then updates them). The non-create case should 1378 * update attrs also. 1379 */ 1380 if (! create_flag && ! fh_differs && !e.error) { 1381 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1382 } 1383 1384 nfs4_error_zinit(&e); 1385 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1386 /* This does not do recovery for vp explicitly. */ 1387 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1388 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1389 1390 if (e.error || e.stat) { 1391 nfs4_end_open_seqid_sync(oop); 1392 nfs4args_copen_free(open_args); 1393 if (setgid_flag) { 1394 nfs4args_verify_free(&argop[8]); 1395 nfs4args_setattr_free(&argop[9]); 1396 } 1397 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1398 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1399 needrecov); 1400 open_owner_rele(oop); 1401 if (create_flag || fh_differs) { 1402 /* rele the makenfs4node */ 1403 VN_RELE(vp); 1404 } 1405 if (ncr != NULL) { 1406 crfree(ncr); 1407 ncr = NULL; 1408 } 1409 if (retry_open == TRUE) { 1410 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1411 "nfs4open_otw: retry the open since OPEN " 1412 "CONFIRM failed with error %d stat %d", 1413 e.error, e.stat)); 1414 if (create_flag && createmode == GUARDED4) { 1415 NFS4_DEBUG(nfs4_client_recov_debug, 1416 (CE_NOTE, "nfs4open_otw: switch " 1417 "createmode from GUARDED4 to " 1418 "UNCHECKED4")); 1419 createmode = UNCHECKED4; 1420 } 1421 goto recov_retry; 1422 } 1423 if (!e.error) { 1424 if (create_flag && (createmode != EXCLUSIVE4) && 1425 e.stat == NFS4ERR_BADOWNER) 1426 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1427 1428 e.error = geterrno4(e.stat); 1429 } 1430 kmem_free(argop, argoplist_size); 1431 return (e.error); 1432 } 1433 } 1434 1435 rp = VTOR4(vp); 1436 1437 mutex_enter(&rp->r_statev4_lock); 1438 if (create_flag) 1439 rp->created_v4 = 1; 1440 mutex_exit(&rp->r_statev4_lock); 1441 1442 mutex_enter(&oop->oo_lock); 1443 /* Doesn't matter if 'oo_just_created' already was set as this */ 1444 oop->oo_just_created = NFS4_PERM_CREATED; 1445 if (oop->oo_cred_otw) 1446 crfree(oop->oo_cred_otw); 1447 oop->oo_cred_otw = cred_otw; 1448 crhold(oop->oo_cred_otw); 1449 mutex_exit(&oop->oo_lock); 1450 1451 /* returns with 'os_sync_lock' held */ 1452 osp = find_or_create_open_stream(oop, rp, &created_osp); 1453 if (!osp) { 1454 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1455 "nfs4open_otw: failed to create an open stream")); 1456 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1457 "signal our end of use of the open seqid")); 1458 1459 nfs4_end_open_seqid_sync(oop); 
1460 open_owner_rele(oop); 1461 nfs4args_copen_free(open_args); 1462 if (setgid_flag) { 1463 nfs4args_verify_free(&argop[8]); 1464 nfs4args_setattr_free(&argop[9]); 1465 } 1466 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1467 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1468 if (create_flag || fh_differs) 1469 VN_RELE(vp); 1470 if (ncr != NULL) 1471 crfree(ncr); 1472 1473 kmem_free(argop, argoplist_size); 1474 return (EINVAL); 1475 1476 } 1477 1478 osp->open_stateid = op_res->stateid; 1479 1480 if (open_flag & FREAD) 1481 osp->os_share_acc_read++; 1482 if (open_flag & FWRITE) 1483 osp->os_share_acc_write++; 1484 osp->os_share_deny_none++; 1485 1486 /* 1487 * Need to reset this bitfield for the possible case where we were 1488 * going to OTW CLOSE the file, got a non-recoverable error, and before 1489 * we could retry the CLOSE, OPENed the file again. 1490 */ 1491 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1492 osp->os_final_close = 0; 1493 osp->os_force_close = 0; 1494 #ifdef DEBUG 1495 if (osp->os_failed_reopen) 1496 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1497 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1498 (void *)osp, (void *)cr, rnode4info(rp))); 1499 #endif 1500 osp->os_failed_reopen = 0; 1501 1502 mutex_exit(&osp->os_sync_lock); 1503 1504 nfs4_end_open_seqid_sync(oop); 1505 1506 if (created_osp && recov_state.rs_sp != NULL) { 1507 mutex_enter(&recov_state.rs_sp->s_lock); 1508 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1509 mutex_exit(&recov_state.rs_sp->s_lock); 1510 } 1511 1512 /* get rid of our reference to find oop */ 1513 open_owner_rele(oop); 1514 1515 open_stream_rele(osp, rp); 1516 1517 /* accept delegation, if any */ 1518 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1519 1520 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1521 1522 if (createmode == EXCLUSIVE4 && 1523 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1524 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1525 " EXCLUSIVE4: sending a SETATTR")); 1526 /* 1527 * If doing an exclusive create, then generate 1528 * a SETATTR to set the initial attributes. 1529 * Try to set the mtime and the atime to the 1530 * server's current time. It is somewhat 1531 * expected that these fields will be used to 1532 * store the exclusive create cookie. If not, 1533 * server implementors will need to know that 1534 * a SETATTR will follow an exclusive create 1535 * and the cookie should be destroyed if 1536 * appropriate. 1537 * 1538 * The AT_GID and AT_SIZE bits are turned off 1539 * so that the SETATTR request will not attempt 1540 * to process these. The gid will be set 1541 * separately if appropriate. The size is turned 1542 * off because it is assumed that a new file will 1543 * be created empty and if the file wasn't empty, 1544 * then the exclusive create will have failed 1545 * because the file must have existed already. 1546 * Therefore, no truncate operation is needed. 1547 */ 1548 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1549 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1550 1551 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1552 if (e.error) { 1553 /* 1554 * Couldn't correct the attributes of 1555 * the newly created file and the 1556 * attributes are wrong. Remove the 1557 * file and return an error to the 1558 * application. 1559 */ 1560 /* XXX will this take care of client state ? 
*/ 1561 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1562 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1563 " remove file", e.error)); 1564 VN_RELE(vp); 1565 (void) nfs4_remove(dvp, file_name, cr, NULL, 0); 1566 /* 1567 * Since we've reled the vnode and removed 1568 * the file we now need to return the error. 1569 * At this point we don't want to update the 1570 * dircaches, call nfs4_waitfor_purge_complete 1571 * or set vpp to vp so we need to skip these 1572 * as well. 1573 */ 1574 goto skip_update_dircaches; 1575 } 1576 } 1577 1578 /* 1579 * If we created or found the correct vnode, due to create_flag or 1580 * fh_differs being set, then update directory cache attribute, readdir 1581 * and dnlc caches. 1582 */ 1583 if (create_flag || fh_differs) { 1584 dirattr_info_t dinfo, *dinfop; 1585 1586 /* 1587 * Make sure getattr succeeded before using results. 1588 * note: op 7 is getattr(dir) for both flavors of 1589 * open(create). 1590 */ 1591 if (create_flag && res.status == NFS4_OK) { 1592 dinfo.di_time_call = t; 1593 dinfo.di_cred = cr; 1594 dinfo.di_garp = 1595 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1596 dinfop = &dinfo; 1597 } else { 1598 dinfop = NULL; 1599 } 1600 1601 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1602 dinfop); 1603 } 1604 1605 /* 1606 * If the page cache for this file was flushed from actions 1607 * above, it was done asynchronously and if that is true, 1608 * there is a need to wait here for it to complete. This must 1609 * be done outside of start_fop/end_fop. 1610 */ 1611 (void) nfs4_waitfor_purge_complete(vp); 1612 1613 /* 1614 * It is implicit that we are in the open case (create_flag == 0) since 1615 * fh_differs can only be set to a non-zero value in the open case. 1616 */ 1617 if (fh_differs != 0 && vpi != NULL) 1618 VN_RELE(vpi); 1619 1620 /* 1621 * Be sure to set *vpp to the correct value before returning. 1622 */ 1623 *vpp = vp; 1624 1625 skip_update_dircaches: 1626 1627 nfs4args_copen_free(open_args); 1628 if (setgid_flag) { 1629 nfs4args_verify_free(&argop[8]); 1630 nfs4args_setattr_free(&argop[9]); 1631 } 1632 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1633 1634 if (ncr) 1635 crfree(ncr); 1636 kmem_free(argop, argoplist_size); 1637 return (e.error); 1638 } 1639 1640 /* 1641 * Reopen an open instance. cf. nfs4open_otw(). 1642 * 1643 * Errors are returned by the nfs4_error_t parameter. 1644 * - ep->error contains an errno value or zero. 1645 * - if it is zero, ep->stat is set to an NFS status code, if any. 1646 * If the file could not be reopened, but the caller should continue, the 1647 * file is marked dead and no error values are returned. If the caller 1648 * should stop recovering open files and start over, either the ep->error 1649 * value or ep->stat will indicate an error (either something that requires 1650 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile 1651 * filehandles) may be handled silently by this routine. 1652 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1653 * will be started, so the caller should not do it. 1654 * 1655 * Gotos: 1656 * - kill_file : reopen failed in such a fashion to constitute marking the 1657 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1658 * is for cases where recovery is not possible. 1659 * - failed_reopen : same as above, except that the file has already been 1660 * marked dead, so no need to do it again. 
1661 * - bailout : reopen failed but we are able to recover and retry the reopen - 1662 * either within this function immediately or via the calling function. 1663 */ 1664 1665 void 1666 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1667 open_claim_type4 claim, bool_t frc_use_claim_previous, 1668 bool_t is_recov) 1669 { 1670 COMPOUND4args_clnt args; 1671 COMPOUND4res_clnt res; 1672 nfs_argop4 argop[4]; 1673 nfs_resop4 *resop; 1674 OPEN4res *op_res = NULL; 1675 OPEN4cargs *open_args; 1676 GETFH4res *gf_res; 1677 rnode4_t *rp = VTOR4(vp); 1678 int doqueue = 1; 1679 cred_t *cr = NULL, *cred_otw = NULL; 1680 nfs4_open_owner_t *oop = NULL; 1681 seqid4 seqid; 1682 nfs4_ga_res_t *garp; 1683 char fn[MAXNAMELEN]; 1684 nfs4_recov_state_t recov = {NULL, 0}; 1685 nfs4_lost_rqst_t lost_rqst; 1686 mntinfo4_t *mi = VTOMI4(vp); 1687 bool_t abort; 1688 char *failed_msg = ""; 1689 int fh_different; 1690 hrtime_t t; 1691 nfs4_bseqid_entry_t *bsep = NULL; 1692 1693 ASSERT(nfs4_consistent_type(vp)); 1694 ASSERT(nfs_zone() == mi->mi_zone); 1695 1696 nfs4_error_zinit(ep); 1697 1698 /* this is the cred used to find the open owner */ 1699 cr = state_to_cred(osp); 1700 if (cr == NULL) { 1701 failed_msg = "Couldn't reopen: no cred"; 1702 goto kill_file; 1703 } 1704 /* use this cred for OTW operations */ 1705 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1706 1707 top: 1708 nfs4_error_zinit(ep); 1709 1710 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1711 /* File system has been unmounted, quit */ 1712 ep->error = EIO; 1713 failed_msg = "Couldn't reopen: file system has been unmounted"; 1714 goto kill_file; 1715 } 1716 1717 oop = osp->os_open_owner; 1718 1719 ASSERT(oop != NULL); 1720 if (oop == NULL) { /* be defensive in non-DEBUG */ 1721 failed_msg = "can't reopen: no open owner"; 1722 goto kill_file; 1723 } 1724 open_owner_hold(oop); 1725 1726 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1727 if (ep->error) { 1728 open_owner_rele(oop); 1729 oop = NULL; 1730 goto bailout; 1731 } 1732 1733 /* 1734 * If the rnode has a delegation and the delegation has been 1735 * recovered and the server didn't request a recall and the caller 1736 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1737 * recovery) and the rnode hasn't been marked dead, then install 1738 * the delegation stateid in the open stream. Otherwise, proceed 1739 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1740 */ 1741 mutex_enter(&rp->r_statev4_lock); 1742 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1743 !rp->r_deleg_return_pending && 1744 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1745 !rp->r_deleg_needs_recall && 1746 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1747 !(rp->r_flags & R4RECOVERR)) { 1748 mutex_enter(&osp->os_sync_lock); 1749 osp->os_delegation = 1; 1750 osp->open_stateid = rp->r_deleg_stateid; 1751 mutex_exit(&osp->os_sync_lock); 1752 mutex_exit(&rp->r_statev4_lock); 1753 goto bailout; 1754 } 1755 mutex_exit(&rp->r_statev4_lock); 1756 1757 /* 1758 * If the file failed recovery, just quit. This failure need not 1759 * affect other reopens, so don't return an error. 
1760 */ 1761 mutex_enter(&rp->r_statelock); 1762 if (rp->r_flags & R4RECOVERR) { 1763 mutex_exit(&rp->r_statelock); 1764 ep->error = 0; 1765 goto failed_reopen; 1766 } 1767 mutex_exit(&rp->r_statelock); 1768 1769 /* 1770 * argop is empty here 1771 * 1772 * PUTFH, OPEN, GETATTR 1773 */ 1774 args.ctag = TAG_REOPEN; 1775 args.array_len = 4; 1776 args.array = argop; 1777 1778 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1779 "nfs4_reopen: file is type %d, id %s", 1780 vp->v_type, rnode4info(VTOR4(vp)))); 1781 1782 argop[0].argop = OP_CPUTFH; 1783 1784 if (claim != CLAIM_PREVIOUS) { 1785 /* 1786 * if this is a file mount then 1787 * use the mntinfo parentfh 1788 */ 1789 argop[0].nfs_argop4_u.opcputfh.sfh = 1790 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1791 VTOSV(vp)->sv_dfh; 1792 } else { 1793 /* putfh fh to reopen */ 1794 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1795 } 1796 1797 argop[1].argop = OP_COPEN; 1798 open_args = &argop[1].nfs_argop4_u.opcopen; 1799 open_args->claim = claim; 1800 1801 if (claim == CLAIM_NULL) { 1802 1803 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1804 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1805 "failed for vp 0x%p for CLAIM_NULL with %m", 1806 (void *)vp); 1807 failed_msg = "Couldn't reopen: vtoname failed for " 1808 "CLAIM_NULL"; 1809 /* nothing allocated yet */ 1810 goto kill_file; 1811 } 1812 1813 open_args->open_claim4_u.cfile = fn; 1814 } else if (claim == CLAIM_PREVIOUS) { 1815 1816 /* 1817 * We have two cases to deal with here: 1818 * 1) We're being called to reopen files in order to satisfy 1819 * a lock operation request which requires us to explicitly 1820 * reopen files which were opened under a delegation. If 1821 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1822 * that case, frc_use_claim_previous is TRUE and we must 1823 * use the rnode's current delegation type (r_deleg_type). 1824 * 2) We're reopening files during some form of recovery. 1825 * In this case, frc_use_claim_previous is FALSE and we 1826 * use the delegation type appropriate for recovery 1827 * (r_deleg_needs_recovery). 1828 */ 1829 mutex_enter(&rp->r_statev4_lock); 1830 open_args->open_claim4_u.delegate_type = 1831 frc_use_claim_previous ? 
1832 rp->r_deleg_type : 1833 rp->r_deleg_needs_recovery; 1834 mutex_exit(&rp->r_statev4_lock); 1835 1836 } else if (claim == CLAIM_DELEGATE_CUR) { 1837 1838 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1839 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1840 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1841 "with %m", (void *)vp); 1842 failed_msg = "Couldn't reopen: vtoname failed for " 1843 "CLAIM_DELEGATE_CUR"; 1844 /* nothing allocated yet */ 1845 goto kill_file; 1846 } 1847 1848 mutex_enter(&rp->r_statev4_lock); 1849 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1850 rp->r_deleg_stateid; 1851 mutex_exit(&rp->r_statev4_lock); 1852 1853 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1854 } 1855 open_args->opentype = OPEN4_NOCREATE; 1856 open_args->owner.clientid = mi2clientid(mi); 1857 open_args->owner.owner_len = sizeof (oop->oo_name); 1858 open_args->owner.owner_val = 1859 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1860 bcopy(&oop->oo_name, open_args->owner.owner_val, 1861 open_args->owner.owner_len); 1862 open_args->share_access = 0; 1863 open_args->share_deny = 0; 1864 1865 mutex_enter(&osp->os_sync_lock); 1866 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1867 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1868 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1869 (void *)osp, (void *)rp, osp->os_share_acc_read, 1870 osp->os_share_acc_write, osp->os_open_ref_count, 1871 osp->os_mmap_read, osp->os_mmap_write, claim)); 1872 1873 if (osp->os_share_acc_read || osp->os_mmap_read) 1874 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1875 if (osp->os_share_acc_write || osp->os_mmap_write) 1876 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1877 if (osp->os_share_deny_read) 1878 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1879 if (osp->os_share_deny_write) 1880 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1881 mutex_exit(&osp->os_sync_lock); 1882 1883 seqid = nfs4_get_open_seqid(oop) + 1; 1884 open_args->seqid = seqid; 1885 1886 /* Construct the getfh part of the compound */ 1887 argop[2].argop = OP_GETFH; 1888 1889 /* Construct the getattr part of the compound */ 1890 argop[3].argop = OP_GETATTR; 1891 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1892 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1893 1894 t = gethrtime(); 1895 1896 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1897 1898 if (ep->error) { 1899 if (!is_recov && !frc_use_claim_previous && 1900 (ep->error == EINTR || ep->error == ETIMEDOUT || 1901 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1902 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1903 cred_otw, vp, NULL, open_args); 1904 abort = nfs4_start_recovery(ep, 1905 VTOMI4(vp), vp, NULL, NULL, 1906 lost_rqst.lr_op == OP_OPEN ? 
1907 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL); 1908 nfs4args_copen_free(open_args); 1909 goto bailout; 1910 } 1911 1912 nfs4args_copen_free(open_args); 1913 1914 if (ep->error == EACCES && cred_otw != cr) { 1915 crfree(cred_otw); 1916 cred_otw = cr; 1917 crhold(cred_otw); 1918 nfs4_end_open_seqid_sync(oop); 1919 open_owner_rele(oop); 1920 oop = NULL; 1921 goto top; 1922 } 1923 if (ep->error == ETIMEDOUT) 1924 goto bailout; 1925 failed_msg = "Couldn't reopen: rpc error"; 1926 goto kill_file; 1927 } 1928 1929 if (nfs4_need_to_bump_seqid(&res)) 1930 nfs4_set_open_seqid(seqid, oop, args.ctag); 1931 1932 switch (res.status) { 1933 case NFS4_OK: 1934 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1935 mutex_enter(&rp->r_statelock); 1936 rp->r_delay_interval = 0; 1937 mutex_exit(&rp->r_statelock); 1938 } 1939 break; 1940 case NFS4ERR_BAD_SEQID: 1941 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1942 args.ctag, open_args->seqid); 1943 1944 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1945 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1946 NULL, OP_OPEN, bsep, NULL, NULL); 1947 1948 nfs4args_copen_free(open_args); 1949 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1950 nfs4_end_open_seqid_sync(oop); 1951 open_owner_rele(oop); 1952 oop = NULL; 1953 kmem_free(bsep, sizeof (*bsep)); 1954 1955 goto kill_file; 1956 case NFS4ERR_NO_GRACE: 1957 nfs4args_copen_free(open_args); 1958 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1959 nfs4_end_open_seqid_sync(oop); 1960 open_owner_rele(oop); 1961 oop = NULL; 1962 if (claim == CLAIM_PREVIOUS) { 1963 /* 1964 * Retry as a plain open. We don't need to worry about 1965 * checking the changeinfo: it is acceptable for a 1966 * client to re-open a file and continue processing 1967 * (in the absence of locks). 1968 */ 1969 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1970 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1971 "will retry as CLAIM_NULL")); 1972 claim = CLAIM_NULL; 1973 nfs4_mi_kstat_inc_no_grace(mi); 1974 goto top; 1975 } 1976 failed_msg = 1977 "Couldn't reopen: tried reclaim outside grace period. "; 1978 goto kill_file; 1979 case NFS4ERR_GRACE: 1980 nfs4_set_grace_wait(mi); 1981 nfs4args_copen_free(open_args); 1982 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1983 nfs4_end_open_seqid_sync(oop); 1984 open_owner_rele(oop); 1985 oop = NULL; 1986 ep->error = nfs4_wait_for_grace(mi, &recov); 1987 if (ep->error != 0) 1988 goto bailout; 1989 goto top; 1990 case NFS4ERR_DELAY: 1991 nfs4_set_delay_wait(vp); 1992 nfs4args_copen_free(open_args); 1993 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1994 nfs4_end_open_seqid_sync(oop); 1995 open_owner_rele(oop); 1996 oop = NULL; 1997 ep->error = nfs4_wait_for_delay(vp, &recov); 1998 nfs4_mi_kstat_inc_delay(mi); 1999 if (ep->error != 0) 2000 goto bailout; 2001 goto top; 2002 case NFS4ERR_FHEXPIRED: 2003 /* recover filehandle and retry */ 2004 abort = nfs4_start_recovery(ep, 2005 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL); 2006 nfs4args_copen_free(open_args); 2007 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2008 nfs4_end_open_seqid_sync(oop); 2009 open_owner_rele(oop); 2010 oop = NULL; 2011 if (abort == FALSE) 2012 goto top; 2013 failed_msg = "Couldn't reopen: recovery aborted"; 2014 goto kill_file; 2015 case NFS4ERR_RESOURCE: 2016 case NFS4ERR_STALE_CLIENTID: 2017 case NFS4ERR_WRONGSEC: 2018 case NFS4ERR_EXPIRED: 2019 /* 2020 * Do not mark the file dead and let the calling 2021 * function initiate recovery. 
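		 *
		 * These conditions are not specific to this file, so
		 * the open stream is left intact and the reopen can be
		 * retried once the caller has driven recovery (e.g.
		 * re-established the clientid).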
2022 		 */
2023 		nfs4args_copen_free(open_args);
2024 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2025 		nfs4_end_open_seqid_sync(oop);
2026 		open_owner_rele(oop);
2027 		oop = NULL;
2028 		goto bailout;
2029 	case NFS4ERR_ACCESS:
2030 		if (cred_otw != cr) {
2031 			crfree(cred_otw);
2032 			cred_otw = cr;
2033 			crhold(cred_otw);
2034 			nfs4args_copen_free(open_args);
2035 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2036 			nfs4_end_open_seqid_sync(oop);
2037 			open_owner_rele(oop);
2038 			oop = NULL;
2039 			goto top;
2040 		}
2041 		/* fall through */
2042 	default:
2043 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2044 		    "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2045 		    (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2046 		    rnode4info(VTOR4(vp))));
2047 		failed_msg = "Couldn't reopen: NFSv4 error";
2048 		nfs4args_copen_free(open_args);
2049 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2050 		goto kill_file;
2051 	}
2052 
2053 	resop = &res.array[1];  /* open res */
2054 	op_res = &resop->nfs_resop4_u.opopen;
2055 
2056 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2057 
2058 	/*
2059 	 * Check if the path we reopened really is the same
2060 	 * file. We could end up in a situation where the file
2061 	 * was removed and a new file created with the same name.
2062 	 */
2063 	resop = &res.array[2];
2064 	gf_res = &resop->nfs_resop4_u.opgetfh;
2065 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2066 	fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2067 	if (fh_different) {
2068 		if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2069 		    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2070 			/* Oops, we don't have the same file */
2071 			if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2072 				failed_msg = "Couldn't reopen: Persistent "
2073 				    "file handle changed";
2074 			else
2075 				failed_msg = "Couldn't reopen: Volatile "
2076 				    "(no expire on open) file handle changed";
2077 
2078 			nfs4args_copen_free(open_args);
2079 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2080 			nfs_rw_exit(&mi->mi_fh_lock);
2081 			goto kill_file;
2082 
2083 		} else {
2084 			/*
2085 			 * We have volatile file handles that don't compare.
2086 			 * If the fids are the same then we assume that the
2087 			 * file handle expired but the rnode still refers to
2088 			 * the same file object.
2089 			 *
2090 			 * First check whether we have fids at all.
2091 			 * If we don't, we have a dumb server, so we will
2092 			 * just assume everything is ok for now.
2093 			 */
2094 			if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2095 			    rp->r_attr.va_mask & AT_NODEID &&
2096 			    rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2097 				/*
2098 				 * We have fids, but they don't
2099 				 * compare. So kill the file.
2100 				 */
2101 				failed_msg =
2102 				    "Couldn't reopen: file handle changed"
2103 				    " due to mismatched fids";
2104 				nfs4args_copen_free(open_args);
2105 				(void) xdr_free(xdr_COMPOUND4res_clnt,
2106 				    (caddr_t)&res);
2107 				nfs_rw_exit(&mi->mi_fh_lock);
2108 				goto kill_file;
2109 			} else {
2110 				/*
2111 				 * We have volatile file handles that refer
2112 				 * to the same file (at least they have the
2113 				 * same fid) or we don't have fids so we
2114 				 * can't tell. :(. We'll be a kind and accepting
2115 				 * client so we'll update the rnode's file
2116 				 * handle with the otw handle.
2117 				 *
2118 				 * We need to drop mi->mi_fh_lock since
2119 				 * sfh4_update acquires it. Since there is
2120 				 * only one recovery thread there is no
2121 				 * race.
2122 */ 2123 nfs_rw_exit(&mi->mi_fh_lock); 2124 sfh4_update(rp->r_fh, &gf_res->object); 2125 } 2126 } 2127 } else { 2128 nfs_rw_exit(&mi->mi_fh_lock); 2129 } 2130 2131 ASSERT(nfs4_consistent_type(vp)); 2132 2133 /* 2134 * If the server wanted an OPEN_CONFIRM but that fails, just start 2135 * over. Presumably if there is a persistent error it will show up 2136 * when we resend the OPEN. 2137 */ 2138 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2139 bool_t retry_open = FALSE; 2140 2141 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2142 cred_otw, is_recov, &retry_open, 2143 oop, FALSE, ep, NULL); 2144 if (ep->error || ep->stat) { 2145 nfs4args_copen_free(open_args); 2146 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2147 nfs4_end_open_seqid_sync(oop); 2148 open_owner_rele(oop); 2149 oop = NULL; 2150 goto top; 2151 } 2152 } 2153 2154 mutex_enter(&osp->os_sync_lock); 2155 osp->open_stateid = op_res->stateid; 2156 osp->os_delegation = 0; 2157 /* 2158 * Need to reset this bitfield for the possible case where we were 2159 * going to OTW CLOSE the file, got a non-recoverable error, and before 2160 * we could retry the CLOSE, OPENed the file again. 2161 */ 2162 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2163 osp->os_final_close = 0; 2164 osp->os_force_close = 0; 2165 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2166 osp->os_dc_openacc = open_args->share_access; 2167 mutex_exit(&osp->os_sync_lock); 2168 2169 nfs4_end_open_seqid_sync(oop); 2170 2171 /* accept delegation, if any */ 2172 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2173 2174 nfs4args_copen_free(open_args); 2175 2176 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2177 2178 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2179 2180 ASSERT(nfs4_consistent_type(vp)); 2181 2182 open_owner_rele(oop); 2183 crfree(cr); 2184 crfree(cred_otw); 2185 return; 2186 2187 kill_file: 2188 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2189 failed_reopen: 2190 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2191 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2192 (void *)osp, (void *)cr, rnode4info(rp))); 2193 mutex_enter(&osp->os_sync_lock); 2194 osp->os_failed_reopen = 1; 2195 mutex_exit(&osp->os_sync_lock); 2196 bailout: 2197 if (oop != NULL) { 2198 nfs4_end_open_seqid_sync(oop); 2199 open_owner_rele(oop); 2200 } 2201 if (cr != NULL) 2202 crfree(cr); 2203 if (cred_otw != NULL) 2204 crfree(cred_otw); 2205 } 2206 2207 /* for . and .. OPENs */ 2208 /* ARGSUSED */ 2209 static int 2210 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2211 { 2212 rnode4_t *rp; 2213 nfs4_ga_res_t gar; 2214 2215 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2216 2217 /* 2218 * If close-to-open consistency checking is turned off or 2219 * if there is no cached data, we can avoid 2220 * the over the wire getattr. Otherwise, force a 2221 * call to the server to get fresh attributes and to 2222 * check caches. This is required for close-to-open 2223 * consistency. 
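	 *
	 * Concretely: the GETATTR is skipped when MI4_NOCTO is set
	 * or when the rnode has neither cached readdir results nor
	 * cached pages, as checked below.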
2224 	 */
2225 	rp = VTOR4(*vpp);
2226 	if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2227 	    (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2228 		return (0);
2229 
2230 	gar.n4g_va.va_mask = AT_ALL;
2231 	return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2232 }
2233 
2234 /*
2235  * CLOSE a file
2236  */
2237 /* ARGSUSED */
2238 static int
2239 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2240 	caller_context_t *ct)
2241 {
2242 	rnode4_t *rp;
2243 	int error = 0;
2244 	int r_error = 0;
2245 	int n4error = 0;
2246 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2247 
2248 	/*
2249 	 * Remove client state for this (lockowner, file) pair.
2250 	 * Issue otw v4 call to have the server do the same.
2251 	 */
2252 
2253 	rp = VTOR4(vp);
2254 
2255 	/*
2256 	 * zone_enter(2) prevents processes from changing zones with NFS files
2257 	 * open; if we happen to get here from the wrong zone we can't do
2258 	 * anything over the wire.
2259 	 */
2260 	if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2261 		/*
2262 		 * We could attempt to clean up locks, except we're sure
2263 		 * that the current process didn't acquire any locks on
2264 		 * the file: any attempt to lock a file belonging to another
2265 		 * zone will fail, and one can't lock an NFS file and then
2266 		 * change zones, as that fails too.
2267 		 *
2268 		 * Returning an error here is the sane thing to do.  A
2269 		 * subsequent call to VN_RELE() which translates to a
2270 		 * nfs4_inactive() will clean up state: if the zone of the
2271 		 * vnode's origin is still alive and kicking, the inactive
2272 		 * thread will handle the request (from the correct zone), and
2273 		 * everything (minus the OTW close call) should be OK.  If the
2274 		 * zone is going away nfs4_async_inactive() will throw away
2275 		 * delegations, open streams and cached pages inline.
2276 		 */
2277 		return (EIO);
2278 	}
2279 
2280 	/*
2281 	 * If we are using local locking for this filesystem, then
2282 	 * release all of the SYSV style record locks.  Otherwise,
2283 	 * we are doing network locking and we need to release all
2284 	 * of the network locks.  All of the locks held by this
2285 	 * process on this file are released no matter what the
2286 	 * incoming reference count is.
2287 	 */
2288 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2289 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2290 		cleanshares(vp, ttoproc(curthread)->p_pid);
2291 	} else
2292 		e.error = nfs4_lockrelease(vp, flag, offset, cr);
2293 
2294 	if (e.error) {
2295 		struct lm_sysid *lmsid;
2296 		lmsid = nfs4_find_sysid(VTOMI4(vp));
2297 		if (lmsid == NULL) {
2298 			DTRACE_PROBE2(unknown__sysid, int, e.error,
2299 			    vnode_t *, vp);
2300 		} else {
2301 			cleanlocks(vp, ttoproc(curthread)->p_pid,
2302 			    (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2303 		}
2304 		return (e.error);
2305 	}
2306 
2307 	if (count > 1)
2308 		return (0);
2309 
2310 	/*
2311 	 * If the file has been `unlinked', then purge the
2312 	 * DNLC so that this vnode will get recycled quicker
2313 	 * and the .nfs* file on the server will get removed.
2314 	 */
2315 	if (rp->r_unldvp != NULL)
2316 		dnlc_purge_vp(vp);
2317 
2318 	/*
2319 	 * If the file was open for write and there are pages,
2320 	 * do a synchronous flush and commit of all of the
2321 	 * dirty and uncommitted pages.
2322 	 */
2323 	ASSERT(!e.error);
2324 	if ((flag & FWRITE) && nfs4_has_pages(vp))
2325 		error = nfs4_putpage_commit(vp, 0, 0, cr);
2326 
2327 	mutex_enter(&rp->r_statelock);
2328 	r_error = rp->r_error;
2329 	rp->r_error = 0;
2330 	mutex_exit(&rp->r_statelock);
2331 
2332 	/*
2333 	 * If this file type is one for which no explicit 'open' was
2334 	 * done, then bail now (ie. no need for protocol 'close'). If
2335 	 * there was an error w/the vm subsystem, return _that_ error,
2336 	 * otherwise, return any errors that may've been reported via
2337 	 * the rnode.
2338 	 */
2339 	if (vp->v_type != VREG)
2340 		return (error ? error : r_error);
2341 
2342 	/*
2343 	 * The sync putpage commit may have failed above, but since
2344 	 * we're working w/a regular file, we need to do the protocol
2345 	 * 'close' (nfs4close_one will figure out if an otw close is
2346 	 * needed or not). Report any errors _after_ doing the protocol
2347 	 * 'close'.
2348 	 */
2349 	nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2350 	n4error = e.error ? e.error : geterrno4(e.stat);
2351 
2352 	/*
2353 	 * Error reporting prio (Hi -> Lo)
2354 	 *
2355 	 * i) nfs4_putpage_commit (error)
2356 	 * ii) rnode's (r_error)
2357 	 * iii) nfs4close_one (n4error)
2358 	 */
2359 	return (error ? error : (r_error ? r_error : n4error));
2360 }
2361 
2362 /*
2363  * Initialize *lost_rqstp.
2364  */
2365 
2366 static void
2367 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2368     nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2369     vnode_t *vp)
2370 {
2371 	if (error != ETIMEDOUT && error != EINTR &&
2372 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2373 		lost_rqstp->lr_op = 0;
2374 		return;
2375 	}
2376 
2377 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2378 	    "nfs4close_save_lost_rqst: error %d", error));
2379 
2380 	lost_rqstp->lr_op = OP_CLOSE;
2381 	/*
2382 	 * The vp is held and rele'd via the recovery code.
2383 	 * See nfs4_save_lost_rqst.
2384 	 */
2385 	lost_rqstp->lr_vp = vp;
2386 	lost_rqstp->lr_dvp = NULL;
2387 	lost_rqstp->lr_oop = oop;
2388 	lost_rqstp->lr_osp = osp;
2389 	ASSERT(osp != NULL);
2390 	ASSERT(mutex_owned(&osp->os_sync_lock));
2391 	osp->os_pending_close = 1;
2392 	lost_rqstp->lr_lop = NULL;
2393 	lost_rqstp->lr_cr = cr;
2394 	lost_rqstp->lr_flk = NULL;
2395 	lost_rqstp->lr_putfirst = FALSE;
2396 }
2397 
2398 /*
2399  * Assumes you already have the open seqid sync grabbed as well as the
2400  * 'os_sync_lock'.  Note: this will release the open seqid sync and
2401  * 'os_sync_lock' if client recovery starts.  Calling functions have to
2402  * be prepared to handle this.
2403  *
2404  * 'recov' is returned as 1 if the CLOSE operation detected that client
2405  * recovery was needed and started it; in that case the calling function
2406  * should retry this function.  Otherwise it is returned as 0.
2407  *
2408  * Errors are returned via the nfs4_error_t parameter.
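 *
 * The over-the-wire request is the compound
 * { Putfh <fh>; Getattr; Close }; the GETATTR result is used to
 * refresh the attribute cache once the CLOSE succeeds.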
2409  */
2410 static void
2411 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2412     nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2413     nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2414 {
2415 	COMPOUND4args_clnt args;
2416 	COMPOUND4res_clnt res;
2417 	CLOSE4args *close_args;
2418 	nfs_resop4 *resop;
2419 	nfs_argop4 argop[3];
2420 	int doqueue = 1;
2421 	mntinfo4_t *mi;
2422 	seqid4 seqid;
2423 	vnode_t *vp;
2424 	bool_t needrecov = FALSE;
2425 	nfs4_lost_rqst_t lost_rqst;
2426 	hrtime_t t;
2427 
2428 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2429 
2430 	ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2431 
2432 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2433 
2434 	/* Only set this to 1 if recovery is started */
2435 	*recov = 0;
2436 
2437 	/* do the OTW call to close the file */
2438 
2439 	if (close_type == CLOSE_RESEND)
2440 		args.ctag = TAG_CLOSE_LOST;
2441 	else if (close_type == CLOSE_AFTER_RESEND)
2442 		args.ctag = TAG_CLOSE_UNDO;
2443 	else
2444 		args.ctag = TAG_CLOSE;
2445 
2446 	args.array_len = 3;
2447 	args.array = argop;
2448 
2449 	vp = RTOV4(rp);
2450 
2451 	mi = VTOMI4(vp);
2452 
2453 	/* putfh target fh */
2454 	argop[0].argop = OP_CPUTFH;
2455 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2456 
2457 	argop[1].argop = OP_GETATTR;
2458 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2459 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
2460 
2461 	argop[2].argop = OP_CLOSE;
2462 	close_args = &argop[2].nfs_argop4_u.opclose;
2463 
2464 	seqid = nfs4_get_open_seqid(oop) + 1;
2465 
2466 	close_args->seqid = seqid;
2467 	close_args->open_stateid = osp->open_stateid;
2468 
2469 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2470 	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2471 	    rnode4info(rp)));
2472 
2473 	t = gethrtime();
2474 
2475 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2476 
2477 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2478 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2479 	}
2480 
2481 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2482 	if (ep->error && !needrecov) {
2483 		/*
2484 		 * If there was an error and no recovery is to be done,
2485 		 * then set up the file to flush its cache if
2486 		 * needed for the next caller.
2487 		 */
2488 		mutex_enter(&rp->r_statelock);
2489 		PURGE_ATTRCACHE4_LOCKED(rp);
2490 		rp->r_flags &= ~R4WRITEMODIFIED;
2491 		mutex_exit(&rp->r_statelock);
2492 		return;
2493 	}
2494 
2495 	if (needrecov) {
2496 		bool_t abort;
2497 		nfs4_bseqid_entry_t *bsep = NULL;
2498 
2499 		if (close_type != CLOSE_RESEND)
2500 			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2501 			    osp, cred_otw, vp);
2502 
2503 		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2504 			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2505 			    0, args.ctag, close_args->seqid);
2506 
2507 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2508 		    "nfs4close_otw: initiating recovery. error %d "
2509 		    "res.status %d", ep->error, res.status));
2510 
2511 		/*
2512 		 * Drop the 'os_sync_lock' here so we don't hit
2513 		 * a potential recursive mutex_enter via an
2514 		 * 'open_stream_hold()'.
2515 		 */
2516 		mutex_exit(&osp->os_sync_lock);
2517 		*have_sync_lockp = 0;
2518 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2519 		    (close_type != CLOSE_RESEND &&
2520 		    lost_rqst.lr_op == OP_CLOSE) ?
&lost_rqst : NULL,
2521 		    OP_CLOSE, bsep, NULL, NULL);
2522 
2523 		/* drop open seq sync, and let the calling function regrab it */
2524 		nfs4_end_open_seqid_sync(oop);
2525 		*did_start_seqid_syncp = 0;
2526 
2527 		if (bsep)
2528 			kmem_free(bsep, sizeof (*bsep));
2529 		/*
2530 		 * For signals, the caller wants to quit, so don't say to
2531 		 * retry.  For forced unmount, if it's a user thread, it
2532 		 * wants to quit.  If it's a recovery thread, the retry
2533 		 * will happen higher-up on the call stack.  Either way,
2534 		 * don't say to retry.
2535 		 */
2536 		if (abort == FALSE && ep->error != EINTR &&
2537 		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2538 		    close_type != CLOSE_RESEND &&
2539 		    close_type != CLOSE_AFTER_RESEND)
2540 			*recov = 1;
2541 		else
2542 			*recov = 0;
2543 
2544 		if (!ep->error)
2545 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2546 		return;
2547 	}
2548 
2549 	if (res.status) {
2550 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2551 		return;
2552 	}
2553 
2554 	mutex_enter(&rp->r_statev4_lock);
2555 	rp->created_v4 = 0;
2556 	mutex_exit(&rp->r_statev4_lock);
2557 
2558 	resop = &res.array[2];
2559 	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2560 	osp->os_valid = 0;
2561 
2562 	/*
2563 	 * This removes the reference obtained at OPEN; ie,
2564 	 * when the open stream structure was created.
2565 	 *
2566 	 * We don't have to worry about calling 'open_stream_rele'
2567 	 * since we are currently holding a reference to the open
2568 	 * stream which means the count cannot go to 0 with this
2569 	 * decrement.
2570 	 */
2571 	ASSERT(osp->os_ref_count >= 2);
2572 	osp->os_ref_count--;
2573 
2574 	if (!ep->error)
2575 		nfs4_attr_cache(vp,
2576 		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2577 		    t, cred_otw, TRUE, NULL);
2578 
2579 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2580 	    " returning %d", ep->error));
2581 
2582 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2583 }
2584 
2585 /* ARGSUSED */
2586 static int
2587 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2588     caller_context_t *ct)
2589 {
2590 	rnode4_t *rp;
2591 	u_offset_t off;
2592 	offset_t diff;
2593 	uint_t on;
2594 	uint_t n;
2595 	caddr_t base;
2596 	uint_t flags;
2597 	int error;
2598 	mntinfo4_t *mi;
2599 
2600 	rp = VTOR4(vp);
2601 
2602 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2603 
2604 	if (IS_SHADOW(vp, rp))
2605 		vp = RTOV4(rp);
2606 
2607 	if (vp->v_type != VREG)
2608 		return (EISDIR);
2609 
2610 	mi = VTOMI4(vp);
2611 
2612 	if (nfs_zone() != mi->mi_zone)
2613 		return (EIO);
2614 
2615 	if (uiop->uio_resid == 0)
2616 		return (0);
2617 
2618 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2619 		return (EINVAL);
2620 
2621 	mutex_enter(&rp->r_statelock);
2622 	if (rp->r_flags & R4RECOVERRP)
2623 		error = (rp->r_error ? rp->r_error : EIO);
2624 	else
2625 		error = 0;
2626 	mutex_exit(&rp->r_statelock);
2627 	if (error)
2628 		return (error);
2629 
2630 	/*
2631 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2632 	 * using client-side direct I/O and the file is not mmap'd and
2633 	 * there are no cached pages.
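	 *
	 * In the bypass case the uio is handed directly to nfs4read(),
	 * which issues the OTW READs without copying through
	 * segmap/vpm.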
2634 */ 2635 if ((vp->v_flag & VNOCACHE) || 2636 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2637 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2638 size_t resid = 0; 2639 2640 return (nfs4read(vp, NULL, uiop->uio_loffset, 2641 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2642 } 2643 2644 error = 0; 2645 2646 do { 2647 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2648 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2649 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2650 2651 if (error = nfs4_validate_caches(vp, cr)) 2652 break; 2653 2654 mutex_enter(&rp->r_statelock); 2655 while (rp->r_flags & R4INCACHEPURGE) { 2656 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2657 mutex_exit(&rp->r_statelock); 2658 return (EINTR); 2659 } 2660 } 2661 diff = rp->r_size - uiop->uio_loffset; 2662 mutex_exit(&rp->r_statelock); 2663 if (diff <= 0) 2664 break; 2665 if (diff < n) 2666 n = (uint_t)diff; 2667 2668 if (vpm_enable) { 2669 /* 2670 * Copy data. 2671 */ 2672 error = vpm_data_copy(vp, off + on, n, uiop, 2673 1, NULL, 0, S_READ); 2674 } else { 2675 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2676 S_READ); 2677 2678 error = uiomove(base + on, n, UIO_READ, uiop); 2679 } 2680 2681 if (!error) { 2682 /* 2683 * If read a whole block or read to eof, 2684 * won't need this buffer again soon. 2685 */ 2686 mutex_enter(&rp->r_statelock); 2687 if (n + on == MAXBSIZE || 2688 uiop->uio_loffset == rp->r_size) 2689 flags = SM_DONTNEED; 2690 else 2691 flags = 0; 2692 mutex_exit(&rp->r_statelock); 2693 if (vpm_enable) { 2694 error = vpm_sync_pages(vp, off, n, flags); 2695 } else { 2696 error = segmap_release(segkmap, base, flags); 2697 } 2698 } else { 2699 if (vpm_enable) { 2700 (void) vpm_sync_pages(vp, off, n, 0); 2701 } else { 2702 (void) segmap_release(segkmap, base, 0); 2703 } 2704 } 2705 } while (!error && uiop->uio_resid > 0); 2706 2707 return (error); 2708 } 2709 2710 /* ARGSUSED */ 2711 static int 2712 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2713 caller_context_t *ct) 2714 { 2715 rlim64_t limit = uiop->uio_llimit; 2716 rnode4_t *rp; 2717 u_offset_t off; 2718 caddr_t base; 2719 uint_t flags; 2720 int remainder; 2721 size_t n; 2722 int on; 2723 int error; 2724 int resid; 2725 u_offset_t offset; 2726 mntinfo4_t *mi; 2727 uint_t bsize; 2728 2729 rp = VTOR4(vp); 2730 2731 if (IS_SHADOW(vp, rp)) 2732 vp = RTOV4(rp); 2733 2734 if (vp->v_type != VREG) 2735 return (EISDIR); 2736 2737 mi = VTOMI4(vp); 2738 2739 if (nfs_zone() != mi->mi_zone) 2740 return (EIO); 2741 2742 if (uiop->uio_resid == 0) 2743 return (0); 2744 2745 mutex_enter(&rp->r_statelock); 2746 if (rp->r_flags & R4RECOVERRP) 2747 error = (rp->r_error ? rp->r_error : EIO); 2748 else 2749 error = 0; 2750 mutex_exit(&rp->r_statelock); 2751 if (error) 2752 return (error); 2753 2754 if (ioflag & FAPPEND) { 2755 struct vattr va; 2756 2757 /* 2758 * Must serialize if appending. 
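		 *
		 * The r_rwlock is upgraded to a writer lock so that
		 * fetching the file size (via nfs4getattr below) and
		 * setting uio_loffset to it cannot race with another
		 * appending writer.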
2759 		 */
2760 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2761 			nfs_rw_exit(&rp->r_rwlock);
2762 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2763 			    INTR4(vp)))
2764 				return (EINTR);
2765 		}
2766 
2767 		va.va_mask = AT_SIZE;
2768 		error = nfs4getattr(vp, &va, cr);
2769 		if (error)
2770 			return (error);
2771 		uiop->uio_loffset = va.va_size;
2772 	}
2773 
2774 	offset = uiop->uio_loffset + uiop->uio_resid;
2775 
2776 	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2777 		return (EINVAL);
2778 
2779 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2780 		limit = MAXOFFSET_T;
2781 
2782 	/*
2783 	 * Check to make sure that the process will not exceed
2784 	 * its limit on file size.  It is okay to write up to
2785 	 * the limit, but not beyond.  Thus, the write which
2786 	 * reaches the limit will be short and the next write
2787 	 * will return an error.
2788 	 */
2789 	remainder = 0;
2790 	if (offset > limit) {
2791 		remainder = offset - limit;
2792 		uiop->uio_resid = limit - uiop->uio_loffset;
2793 		if (uiop->uio_resid <= 0) {
2794 			proc_t *p = ttoproc(curthread);
2795 
2796 			uiop->uio_resid += remainder;
2797 			mutex_enter(&p->p_lock);
2798 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2799 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2800 			mutex_exit(&p->p_lock);
2801 			return (EFBIG);
2802 		}
2803 	}
2804 
2805 	/* update the change attribute, if we have a write delegation */
2806 
2807 	mutex_enter(&rp->r_statev4_lock);
2808 	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2809 		rp->r_deleg_change++;
2810 
2811 	mutex_exit(&rp->r_statev4_lock);
2812 
2813 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2814 		return (EINTR);
2815 
2816 	/*
2817 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2818 	 * using client-side direct I/O and the file is not mmap'd and
2819 	 * there are no cached pages.
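	 *
	 * In the bypass case the data is staged through a kmem
	 * buffer of at most mi_stsize bytes and pushed with
	 * nfs4write(), using DATA_SYNC4 for FDSYNC writes and
	 * FILE_SYNC4 otherwise.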
2820 */ 2821 if ((vp->v_flag & VNOCACHE) || 2822 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2823 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2824 size_t bufsize; 2825 int count; 2826 u_offset_t org_offset; 2827 stable_how4 stab_comm; 2828 nfs4_fwrite: 2829 if (rp->r_flags & R4STALE) { 2830 resid = uiop->uio_resid; 2831 offset = uiop->uio_loffset; 2832 error = rp->r_error; 2833 /* 2834 * A close may have cleared r_error, if so, 2835 * propagate ESTALE error return properly 2836 */ 2837 if (error == 0) 2838 error = ESTALE; 2839 goto bottom; 2840 } 2841 2842 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2843 base = kmem_alloc(bufsize, KM_SLEEP); 2844 do { 2845 if (ioflag & FDSYNC) 2846 stab_comm = DATA_SYNC4; 2847 else 2848 stab_comm = FILE_SYNC4; 2849 resid = uiop->uio_resid; 2850 offset = uiop->uio_loffset; 2851 count = MIN(uiop->uio_resid, bufsize); 2852 org_offset = uiop->uio_loffset; 2853 error = uiomove(base, count, UIO_WRITE, uiop); 2854 if (!error) { 2855 error = nfs4write(vp, base, org_offset, 2856 count, cr, &stab_comm); 2857 if (!error) { 2858 mutex_enter(&rp->r_statelock); 2859 if (rp->r_size < uiop->uio_loffset) 2860 rp->r_size = uiop->uio_loffset; 2861 mutex_exit(&rp->r_statelock); 2862 } 2863 } 2864 } while (!error && uiop->uio_resid > 0); 2865 kmem_free(base, bufsize); 2866 goto bottom; 2867 } 2868 2869 bsize = vp->v_vfsp->vfs_bsize; 2870 2871 do { 2872 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2873 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2874 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2875 2876 resid = uiop->uio_resid; 2877 offset = uiop->uio_loffset; 2878 2879 if (rp->r_flags & R4STALE) { 2880 error = rp->r_error; 2881 /* 2882 * A close may have cleared r_error, if so, 2883 * propagate ESTALE error return properly 2884 */ 2885 if (error == 0) 2886 error = ESTALE; 2887 break; 2888 } 2889 2890 /* 2891 * Don't create dirty pages faster than they 2892 * can be cleaned so that the system doesn't 2893 * get imbalanced. If the async queue is 2894 * maxed out, then wait for it to drain before 2895 * creating more dirty pages. Also, wait for 2896 * any threads doing pagewalks in the vop_getattr 2897 * entry points so that they don't block for 2898 * long periods. 2899 */ 2900 mutex_enter(&rp->r_statelock); 2901 while ((mi->mi_max_threads != 0 && 2902 rp->r_awcount > 2 * mi->mi_max_threads) || 2903 rp->r_gcount > 0) { 2904 if (INTR4(vp)) { 2905 klwp_t *lwp = ttolwp(curthread); 2906 2907 if (lwp != NULL) 2908 lwp->lwp_nostop++; 2909 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2910 mutex_exit(&rp->r_statelock); 2911 if (lwp != NULL) 2912 lwp->lwp_nostop--; 2913 error = EINTR; 2914 goto bottom; 2915 } 2916 if (lwp != NULL) 2917 lwp->lwp_nostop--; 2918 } else 2919 cv_wait(&rp->r_cv, &rp->r_statelock); 2920 } 2921 mutex_exit(&rp->r_statelock); 2922 2923 /* 2924 * Touch the page and fault it in if it is not in core 2925 * before segmap_getmapflt or vpm_data_copy can lock it. 2926 * This is to avoid the deadlock if the buffer is mapped 2927 * to the same file through mmap which we want to write. 2928 */ 2929 uio_prefaultpages((long)n, uiop); 2930 2931 if (vpm_enable) { 2932 /* 2933 * It will use kpm mappings, so no need to 2934 * pass an address. 
2935 */ 2936 error = writerp4(rp, NULL, n, uiop, 0); 2937 } else { 2938 if (segmap_kpm) { 2939 int pon = uiop->uio_loffset & PAGEOFFSET; 2940 size_t pn = MIN(PAGESIZE - pon, 2941 uiop->uio_resid); 2942 int pagecreate; 2943 2944 mutex_enter(&rp->r_statelock); 2945 pagecreate = (pon == 0) && (pn == PAGESIZE || 2946 uiop->uio_loffset + pn >= rp->r_size); 2947 mutex_exit(&rp->r_statelock); 2948 2949 base = segmap_getmapflt(segkmap, vp, off + on, 2950 pn, !pagecreate, S_WRITE); 2951 2952 error = writerp4(rp, base + pon, n, uiop, 2953 pagecreate); 2954 2955 } else { 2956 base = segmap_getmapflt(segkmap, vp, off + on, 2957 n, 0, S_READ); 2958 error = writerp4(rp, base + on, n, uiop, 0); 2959 } 2960 } 2961 2962 if (!error) { 2963 if (mi->mi_flags & MI4_NOAC) 2964 flags = SM_WRITE; 2965 else if ((uiop->uio_loffset % bsize) == 0 || 2966 IS_SWAPVP(vp)) { 2967 /* 2968 * Have written a whole block. 2969 * Start an asynchronous write 2970 * and mark the buffer to 2971 * indicate that it won't be 2972 * needed again soon. 2973 */ 2974 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2975 } else 2976 flags = 0; 2977 if ((ioflag & (FSYNC|FDSYNC)) || 2978 (rp->r_flags & R4OUTOFSPACE)) { 2979 flags &= ~SM_ASYNC; 2980 flags |= SM_WRITE; 2981 } 2982 if (vpm_enable) { 2983 error = vpm_sync_pages(vp, off, n, flags); 2984 } else { 2985 error = segmap_release(segkmap, base, flags); 2986 } 2987 } else { 2988 if (vpm_enable) { 2989 (void) vpm_sync_pages(vp, off, n, 0); 2990 } else { 2991 (void) segmap_release(segkmap, base, 0); 2992 } 2993 /* 2994 * In the event that we got an access error while 2995 * faulting in a page for a write-only file just 2996 * force a write. 2997 */ 2998 if (error == EACCES) 2999 goto nfs4_fwrite; 3000 } 3001 } while (!error && uiop->uio_resid > 0); 3002 3003 bottom: 3004 if (error) { 3005 uiop->uio_resid = resid + remainder; 3006 uiop->uio_loffset = offset; 3007 } else { 3008 uiop->uio_resid += remainder; 3009 3010 mutex_enter(&rp->r_statev4_lock); 3011 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 3012 gethrestime(&rp->r_attr.va_mtime); 3013 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3014 } 3015 mutex_exit(&rp->r_statev4_lock); 3016 } 3017 3018 nfs_rw_exit(&rp->r_lkserlock); 3019 3020 return (error); 3021 } 3022 3023 /* 3024 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 3025 */ 3026 static int 3027 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 3028 int flags, cred_t *cr) 3029 { 3030 struct buf *bp; 3031 int error; 3032 page_t *savepp; 3033 uchar_t fsdata; 3034 stable_how4 stab_comm; 3035 3036 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3037 bp = pageio_setup(pp, len, vp, flags); 3038 ASSERT(bp != NULL); 3039 3040 /* 3041 * pageio_setup should have set b_addr to 0. This 3042 * is correct since we want to do I/O on a page 3043 * boundary. bp_mapin will use this addr to calculate 3044 * an offset, and then set b_addr to the kernel virtual 3045 * address it allocated for us. 
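	 *
	 * After the I/O completes, each page is tagged C_DELAYCOMMIT
	 * if the write went out UNSTABLE4 (a later COMMIT is still
	 * required) and C_NOCOMMIT otherwise.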
3046 */ 3047 ASSERT(bp->b_un.b_addr == 0); 3048 3049 bp->b_edev = 0; 3050 bp->b_dev = 0; 3051 bp->b_lblkno = lbtodb(off); 3052 bp->b_file = vp; 3053 bp->b_offset = (offset_t)off; 3054 bp_mapin(bp); 3055 3056 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3057 freemem > desfree) 3058 stab_comm = UNSTABLE4; 3059 else 3060 stab_comm = FILE_SYNC4; 3061 3062 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3063 3064 bp_mapout(bp); 3065 pageio_done(bp); 3066 3067 if (stab_comm == UNSTABLE4) 3068 fsdata = C_DELAYCOMMIT; 3069 else 3070 fsdata = C_NOCOMMIT; 3071 3072 savepp = pp; 3073 do { 3074 pp->p_fsdata = fsdata; 3075 } while ((pp = pp->p_next) != savepp); 3076 3077 return (error); 3078 } 3079 3080 /* 3081 */ 3082 static int 3083 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3084 { 3085 nfs4_open_owner_t *oop; 3086 nfs4_open_stream_t *osp; 3087 rnode4_t *rp = VTOR4(vp); 3088 mntinfo4_t *mi = VTOMI4(vp); 3089 int reopen_needed; 3090 3091 ASSERT(nfs_zone() == mi->mi_zone); 3092 3093 3094 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3095 if (!oop) 3096 return (EIO); 3097 3098 /* returns with 'os_sync_lock' held */ 3099 osp = find_open_stream(oop, rp); 3100 if (!osp) { 3101 open_owner_rele(oop); 3102 return (EIO); 3103 } 3104 3105 if (osp->os_failed_reopen) { 3106 mutex_exit(&osp->os_sync_lock); 3107 open_stream_rele(osp, rp); 3108 open_owner_rele(oop); 3109 return (EIO); 3110 } 3111 3112 /* 3113 * Determine whether a reopen is needed. If this 3114 * is a delegation open stream, then the os_delegation bit 3115 * should be set. 3116 */ 3117 3118 reopen_needed = osp->os_delegation; 3119 3120 mutex_exit(&osp->os_sync_lock); 3121 open_owner_rele(oop); 3122 3123 if (reopen_needed) { 3124 nfs4_error_zinit(ep); 3125 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3126 mutex_enter(&osp->os_sync_lock); 3127 if (ep->error || ep->stat || osp->os_failed_reopen) { 3128 mutex_exit(&osp->os_sync_lock); 3129 open_stream_rele(osp, rp); 3130 return (EIO); 3131 } 3132 mutex_exit(&osp->os_sync_lock); 3133 } 3134 open_stream_rele(osp, rp); 3135 3136 return (0); 3137 } 3138 3139 /* 3140 * Write to file. Writes to remote server in largest size 3141 * chunks that the server can handle. Write is synchronous. 3142 */ 3143 static int 3144 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3145 stable_how4 *stab_comm) 3146 { 3147 mntinfo4_t *mi; 3148 COMPOUND4args_clnt args; 3149 COMPOUND4res_clnt res; 3150 WRITE4args *wargs; 3151 WRITE4res *wres; 3152 nfs_argop4 argop[2]; 3153 nfs_resop4 *resop; 3154 int tsize; 3155 stable_how4 stable; 3156 rnode4_t *rp; 3157 int doqueue = 1; 3158 bool_t needrecov; 3159 nfs4_recov_state_t recov_state; 3160 nfs4_stateid_types_t sid_types; 3161 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3162 int recov; 3163 3164 rp = VTOR4(vp); 3165 mi = VTOMI4(vp); 3166 3167 ASSERT(nfs_zone() == mi->mi_zone); 3168 3169 stable = *stab_comm; 3170 *stab_comm = FILE_SYNC4; 3171 3172 needrecov = FALSE; 3173 recov_state.rs_flags = 0; 3174 recov_state.rs_num_retry_despite_err = 0; 3175 nfs4_init_stateid_types(&sid_types); 3176 3177 /* Is curthread the recovery thread? */ 3178 mutex_enter(&mi->mi_lock); 3179 recov = (mi->mi_recovthread == curthread); 3180 mutex_exit(&mi->mi_lock); 3181 3182 recov_retry: 3183 args.ctag = TAG_WRITE; 3184 args.array_len = 2; 3185 args.array = argop; 3186 3187 if (!recov) { 3188 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3189 &recov_state, NULL); 3190 if (e.error) 3191 return (e.error); 3192 } 3193 3194 /* 0. 
putfh target fh */ 3195 argop[0].argop = OP_CPUTFH; 3196 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3197 3198 /* 1. write */ 3199 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3200 3201 do { 3202 3203 wargs->offset = (offset4)offset; 3204 wargs->data_val = base; 3205 3206 if (mi->mi_io_kstats) { 3207 mutex_enter(&mi->mi_lock); 3208 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3209 mutex_exit(&mi->mi_lock); 3210 } 3211 3212 if ((vp->v_flag & VNOCACHE) || 3213 (rp->r_flags & R4DIRECTIO) || 3214 (mi->mi_flags & MI4_DIRECTIO)) 3215 tsize = MIN(mi->mi_stsize, count); 3216 else 3217 tsize = MIN(mi->mi_curwrite, count); 3218 wargs->data_len = (uint_t)tsize; 3219 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3220 3221 if (mi->mi_io_kstats) { 3222 mutex_enter(&mi->mi_lock); 3223 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3224 mutex_exit(&mi->mi_lock); 3225 } 3226 3227 if (!recov) { 3228 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3229 if (e.error && !needrecov) { 3230 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3231 &recov_state, needrecov); 3232 return (e.error); 3233 } 3234 } else { 3235 if (e.error) 3236 return (e.error); 3237 } 3238 3239 /* 3240 * Do handling of OLD_STATEID outside 3241 * of the normal recovery framework. 3242 * 3243 * If write receives a BAD stateid error while using a 3244 * delegation stateid, retry using the open stateid (if it 3245 * exists). If it doesn't have an open stateid, reopen the 3246 * file first, then retry. 3247 */ 3248 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3249 sid_types.cur_sid_type != SPEC_SID) { 3250 nfs4_save_stateid(&wargs->stateid, &sid_types); 3251 if (!recov) 3252 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3253 &recov_state, needrecov); 3254 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3255 goto recov_retry; 3256 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3257 sid_types.cur_sid_type == DEL_SID) { 3258 nfs4_save_stateid(&wargs->stateid, &sid_types); 3259 mutex_enter(&rp->r_statev4_lock); 3260 rp->r_deleg_return_pending = TRUE; 3261 mutex_exit(&rp->r_statev4_lock); 3262 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3263 if (!recov) 3264 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3265 &recov_state, needrecov); 3266 (void) xdr_free(xdr_COMPOUND4res_clnt, 3267 (caddr_t)&res); 3268 return (EIO); 3269 } 3270 if (!recov) 3271 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3272 &recov_state, needrecov); 3273 /* hold needed for nfs4delegreturn_thread */ 3274 VN_HOLD(vp); 3275 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3276 NFS4_DR_DISCARD), FALSE); 3277 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3278 goto recov_retry; 3279 } 3280 3281 if (needrecov) { 3282 bool_t abort; 3283 3284 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3285 "nfs4write: client got error %d, res.status %d" 3286 ", so start recovery", e.error, res.status)); 3287 3288 abort = nfs4_start_recovery(&e, 3289 VTOMI4(vp), vp, NULL, &wargs->stateid, 3290 NULL, OP_WRITE, NULL, NULL, NULL); 3291 if (!e.error) { 3292 e.error = geterrno4(res.status); 3293 (void) xdr_free(xdr_COMPOUND4res_clnt, 3294 (caddr_t)&res); 3295 } 3296 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3297 &recov_state, needrecov); 3298 if (abort == FALSE) 3299 goto recov_retry; 3300 return (e.error); 3301 } 3302 3303 if (res.status) { 3304 e.error = geterrno4(res.status); 3305 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3306 if (!recov) 3307 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3308 &recov_state, needrecov); 3309 return 
(e.error); 3310 } 3311 3312 resop = &res.array[1]; /* write res */ 3313 wres = &resop->nfs_resop4_u.opwrite; 3314 3315 if ((int)wres->count > tsize) { 3316 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3317 3318 zcmn_err(getzoneid(), CE_WARN, 3319 "nfs4write: server wrote %u, requested was %u", 3320 (int)wres->count, tsize); 3321 if (!recov) 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3323 &recov_state, needrecov); 3324 return (EIO); 3325 } 3326 if (wres->committed == UNSTABLE4) { 3327 *stab_comm = UNSTABLE4; 3328 if (wargs->stable == DATA_SYNC4 || 3329 wargs->stable == FILE_SYNC4) { 3330 (void) xdr_free(xdr_COMPOUND4res_clnt, 3331 (caddr_t)&res); 3332 zcmn_err(getzoneid(), CE_WARN, 3333 "nfs4write: server %s did not commit " 3334 "to stable storage", 3335 rp->r_server->sv_hostname); 3336 if (!recov) 3337 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3338 OH_WRITE, &recov_state, needrecov); 3339 return (EIO); 3340 } 3341 } 3342 3343 tsize = (int)wres->count; 3344 count -= tsize; 3345 base += tsize; 3346 offset += tsize; 3347 if (mi->mi_io_kstats) { 3348 mutex_enter(&mi->mi_lock); 3349 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3350 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3351 tsize; 3352 mutex_exit(&mi->mi_lock); 3353 } 3354 lwp_stat_update(LWP_STAT_OUBLK, 1); 3355 mutex_enter(&rp->r_statelock); 3356 if (rp->r_flags & R4HAVEVERF) { 3357 if (rp->r_writeverf != wres->writeverf) { 3358 nfs4_set_mod(vp); 3359 rp->r_writeverf = wres->writeverf; 3360 } 3361 } else { 3362 rp->r_writeverf = wres->writeverf; 3363 rp->r_flags |= R4HAVEVERF; 3364 } 3365 PURGE_ATTRCACHE4_LOCKED(rp); 3366 rp->r_flags |= R4WRITEMODIFIED; 3367 gethrestime(&rp->r_attr.va_mtime); 3368 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3369 mutex_exit(&rp->r_statelock); 3370 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3371 } while (count); 3372 3373 if (!recov) 3374 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3375 needrecov); 3376 3377 return (e.error); 3378 } 3379 3380 /* 3381 * Read from a file. Reads data in largest chunks our interface can handle. 3382 */ 3383 static int 3384 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3385 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3386 { 3387 mntinfo4_t *mi; 3388 COMPOUND4args_clnt args; 3389 COMPOUND4res_clnt res; 3390 READ4args *rargs; 3391 nfs_argop4 argop[2]; 3392 int tsize; 3393 int doqueue; 3394 rnode4_t *rp; 3395 int data_len; 3396 bool_t is_eof; 3397 bool_t needrecov = FALSE; 3398 nfs4_recov_state_t recov_state; 3399 nfs4_stateid_types_t sid_types; 3400 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3401 3402 rp = VTOR4(vp); 3403 mi = VTOMI4(vp); 3404 doqueue = 1; 3405 3406 ASSERT(nfs_zone() == mi->mi_zone); 3407 3408 args.ctag = async ? 
TAG_READAHEAD : TAG_READ; 3409 3410 args.array_len = 2; 3411 args.array = argop; 3412 3413 nfs4_init_stateid_types(&sid_types); 3414 3415 recov_state.rs_flags = 0; 3416 recov_state.rs_num_retry_despite_err = 0; 3417 3418 recov_retry: 3419 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3420 &recov_state, NULL); 3421 if (e.error) 3422 return (e.error); 3423 3424 /* putfh target fh */ 3425 argop[0].argop = OP_CPUTFH; 3426 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3427 3428 /* read */ 3429 argop[1].argop = OP_READ; 3430 rargs = &argop[1].nfs_argop4_u.opread; 3431 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3432 OP_READ, &sid_types, async); 3433 3434 do { 3435 if (mi->mi_io_kstats) { 3436 mutex_enter(&mi->mi_lock); 3437 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3438 mutex_exit(&mi->mi_lock); 3439 } 3440 3441 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3442 "nfs4read: %s call, rp %s", 3443 needrecov ? "recov" : "first", 3444 rnode4info(rp))); 3445 3446 if ((vp->v_flag & VNOCACHE) || 3447 (rp->r_flags & R4DIRECTIO) || 3448 (mi->mi_flags & MI4_DIRECTIO)) 3449 tsize = MIN(mi->mi_tsize, count); 3450 else 3451 tsize = MIN(mi->mi_curread, count); 3452 3453 rargs->offset = (offset4)offset; 3454 rargs->count = (count4)tsize; 3455 rargs->res_data_val_alt = NULL; 3456 rargs->res_mblk = NULL; 3457 rargs->res_uiop = NULL; 3458 rargs->res_maxsize = 0; 3459 rargs->wlist = NULL; 3460 3461 if (uiop) 3462 rargs->res_uiop = uiop; 3463 else 3464 rargs->res_data_val_alt = base; 3465 rargs->res_maxsize = tsize; 3466 3467 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3468 #ifdef DEBUG 3469 if (nfs4read_error_inject) { 3470 res.status = nfs4read_error_inject; 3471 nfs4read_error_inject = 0; 3472 } 3473 #endif 3474 3475 if (mi->mi_io_kstats) { 3476 mutex_enter(&mi->mi_lock); 3477 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3478 mutex_exit(&mi->mi_lock); 3479 } 3480 3481 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3482 if (e.error != 0 && !needrecov) { 3483 nfs4_end_fop(mi, vp, NULL, OH_READ, 3484 &recov_state, needrecov); 3485 return (e.error); 3486 } 3487 3488 /* 3489 * Do proper retry for OLD and BAD stateid errors outside 3490 * of the normal recovery framework. There are two differences 3491 * between async and sync reads. The first is that we allow 3492 * retry on BAD_STATEID for async reads, but not sync reads. 3493 * The second is that we mark the file dead for a failed 3494 * attempt with a special stateid for sync reads, but just 3495 * return EIO for async reads. 3496 * 3497 * If a sync read receives a BAD stateid error while using a 3498 * delegation stateid, retry using the open stateid (if it 3499 * exists). If it doesn't have an open stateid, reopen the 3500 * file first, then retry. 
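		 *
		 * In summary, for the stateid cases handled below:
		 *	async + OLD/BAD, special sid:	fail with EIO
		 *	async + OLD/BAD, other sid:	retry with next sid type
		 *	sync + OLD, non-special sid:	retry with next sid type
		 *	sync + BAD, delegation sid:	delegreturn, then retry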
3501 */ 3502 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3503 res.status == NFS4ERR_BAD_STATEID) && async) { 3504 nfs4_end_fop(mi, vp, NULL, OH_READ, 3505 &recov_state, needrecov); 3506 if (sid_types.cur_sid_type == SPEC_SID) { 3507 (void) xdr_free(xdr_COMPOUND4res_clnt, 3508 (caddr_t)&res); 3509 return (EIO); 3510 } 3511 nfs4_save_stateid(&rargs->stateid, &sid_types); 3512 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3513 goto recov_retry; 3514 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3515 !async && sid_types.cur_sid_type != SPEC_SID) { 3516 nfs4_save_stateid(&rargs->stateid, &sid_types); 3517 nfs4_end_fop(mi, vp, NULL, OH_READ, 3518 &recov_state, needrecov); 3519 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3520 goto recov_retry; 3521 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3522 sid_types.cur_sid_type == DEL_SID) { 3523 nfs4_save_stateid(&rargs->stateid, &sid_types); 3524 mutex_enter(&rp->r_statev4_lock); 3525 rp->r_deleg_return_pending = TRUE; 3526 mutex_exit(&rp->r_statev4_lock); 3527 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3528 nfs4_end_fop(mi, vp, NULL, OH_READ, 3529 &recov_state, needrecov); 3530 (void) xdr_free(xdr_COMPOUND4res_clnt, 3531 (caddr_t)&res); 3532 return (EIO); 3533 } 3534 nfs4_end_fop(mi, vp, NULL, OH_READ, 3535 &recov_state, needrecov); 3536 /* hold needed for nfs4delegreturn_thread */ 3537 VN_HOLD(vp); 3538 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3539 NFS4_DR_DISCARD), FALSE); 3540 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3541 goto recov_retry; 3542 } 3543 if (needrecov) { 3544 bool_t abort; 3545 3546 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3547 "nfs4read: initiating recovery\n")); 3548 abort = nfs4_start_recovery(&e, 3549 mi, vp, NULL, &rargs->stateid, 3550 NULL, OP_READ, NULL, NULL, NULL); 3551 nfs4_end_fop(mi, vp, NULL, OH_READ, 3552 &recov_state, needrecov); 3553 /* 3554 * Do not retry if we got OLD_STATEID using a special 3555 * stateid. This avoids looping with a broken server. 3556 */ 3557 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3558 sid_types.cur_sid_type == SPEC_SID) 3559 abort = TRUE; 3560 3561 if (abort == FALSE) { 3562 /* 3563 * Need to retry all possible stateids in 3564 * case the recovery error wasn't stateid 3565 * related or the stateids have become 3566 * stale (server reboot). 
3567 			 */
3568 			nfs4_init_stateid_types(&sid_types);
3569 			(void) xdr_free(xdr_COMPOUND4res_clnt,
3570 			    (caddr_t)&res);
3571 			goto recov_retry;
3572 		}
3573 
3574 		if (!e.error) {
3575 			e.error = geterrno4(res.status);
3576 			(void) xdr_free(xdr_COMPOUND4res_clnt,
3577 			    (caddr_t)&res);
3578 		}
3579 		return (e.error);
3580 	}
3581 
3582 	if (res.status) {
3583 		e.error = geterrno4(res.status);
3584 		nfs4_end_fop(mi, vp, NULL, OH_READ,
3585 		    &recov_state, needrecov);
3586 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3587 		return (e.error);
3588 	}
3589 
3590 	data_len = res.array[1].nfs_resop4_u.opread.data_len;
3591 	count -= data_len;
3592 	if (base)
3593 		base += data_len;
3594 	offset += data_len;
3595 	if (mi->mi_io_kstats) {
3596 		mutex_enter(&mi->mi_lock);
3597 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3598 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3599 		mutex_exit(&mi->mi_lock);
3600 	}
3601 	lwp_stat_update(LWP_STAT_INBLK, 1);
3602 	is_eof = res.array[1].nfs_resop4_u.opread.eof;
3603 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3604 
3605 	} while (count && !is_eof);
3606 
3607 	*residp = count;
3608 
3609 	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3610 
3611 	return (e.error);
3612 }
3613 
3614 /* ARGSUSED */
3615 static int
3616 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3617 	caller_context_t *ct)
3618 {
3619 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3620 		return (EIO);
3621 	switch (cmd) {
3622 	case _FIODIRECTIO:
3623 		return (nfs4_directio(vp, (int)arg, cr));
3624 	default:
3625 		return (ENOTTY);
3626 	}
3627 }
3628 
3629 /* ARGSUSED */
3630 int
3631 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3632     caller_context_t *ct)
3633 {
3634 	int error;
3635 	rnode4_t *rp = VTOR4(vp);
3636 
3637 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3638 		return (EIO);
3639 	/*
3640 	 * If it has been specified that the return value will
3641 	 * just be used as a hint, and we are only being asked
3642 	 * for size, fsid or rdevid, then return the client's
3643 	 * notion of these values without checking to make sure
3644 	 * that the attribute cache is up to date.
3645 	 * The whole point is to avoid an over the wire GETATTR
3646 	 * call.
3647 	 */
3648 	if (flags & ATTR_HINT) {
3649 		if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3650 			mutex_enter(&rp->r_statelock);
3651 			if (vap->va_mask & AT_SIZE)
3652 				vap->va_size = rp->r_size;
3653 			if (vap->va_mask & AT_FSID)
3654 				vap->va_fsid = rp->r_attr.va_fsid;
3655 			if (vap->va_mask & AT_RDEV)
3656 				vap->va_rdev = rp->r_attr.va_rdev;
3657 			mutex_exit(&rp->r_statelock);
3658 			return (0);
3659 		}
3660 	}
3661 
3662 	/*
3663 	 * Only need to flush pages if asking for the mtime
3664 	 * and if there are any dirty pages or any outstanding
3665 	 * asynchronous (write) requests for this file.
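	 *
	 * If we hold a write delegation, the client's cached mtime is
	 * authoritative, so the flush is skipped (see the
	 * OPEN_DELEGATE_WRITE check below).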
3666 	 */
3667 	if (vap->va_mask & AT_MTIME) {
3668 		rp = VTOR4(vp);
3669 		if (nfs4_has_pages(vp)) {
3670 			mutex_enter(&rp->r_statev4_lock);
3671 			if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3672 				mutex_exit(&rp->r_statev4_lock);
3673 				if (rp->r_flags & R4DIRTY ||
3674 				    rp->r_awcount > 0) {
3675 					mutex_enter(&rp->r_statelock);
3676 					rp->r_gcount++;
3677 					mutex_exit(&rp->r_statelock);
3678 					error =
3679 					    nfs4_putpage(vp, (u_offset_t)0,
3680 					    0, 0, cr, NULL);
3681 					mutex_enter(&rp->r_statelock);
3682 					if (error && (error == ENOSPC ||
3683 					    error == EDQUOT)) {
3684 						if (!rp->r_error)
3685 							rp->r_error = error;
3686 					}
3687 					if (--rp->r_gcount == 0)
3688 						cv_broadcast(&rp->r_cv);
3689 					mutex_exit(&rp->r_statelock);
3690 				}
3691 			} else {
3692 				mutex_exit(&rp->r_statev4_lock);
3693 			}
3694 		}
3695 	}
3696 	return (nfs4getattr(vp, vap, cr));
3697 }
3698 
3699 int
3700 nfs4_compare_modes(mode_t from_server, mode_t on_client)
3701 {
3702 	/*
3703 	 * If these are the only two bits cleared
3704 	 * on the server then return 0 (OK) else
3705 	 * return 1 (BAD).
3706 	 */
3707 	on_client &= ~(S_ISUID|S_ISGID);
3708 	if (on_client == from_server)
3709 		return (0);
3710 	else
3711 		return (1);
3712 }
3713 
3714 /*ARGSUSED4*/
3715 static int
3716 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3717     caller_context_t *ct)
3718 {
3719 	if (vap->va_mask & AT_NOSET)
3720 		return (EINVAL);
3721 
3722 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3723 		return (EIO);
3724 
3725 	/*
3726 	 * Don't call secpolicy_vnode_setattr, the client cannot
3727 	 * use its cached attributes to make security decisions
3728 	 * as the server may be faking mode bits or mapping uid/gid.
3729 	 * Always just let the server do the checking.
3730 	 * If we provide the ability to remove basic privileges
3731 	 * to setattr (e.g. basic without chmod) then we will
3732 	 * need to add a check here before calling the server.
3733 	 */
3734 
3735 	return (nfs4setattr(vp, vap, flags, cr, NULL));
3736 }
3737 
3738 /*
3739  * To replace the "guarded" version 3 setattr, we use two types of compound
3740  * setattr requests:
3741  * 1. The "normal" setattr, used when the size of the file isn't being
3742  *    changed - { Putfh <fh>; Setattr; Getattr }.
3743  * 2. If the size is changed, precede Setattr with: Getattr; Verify
3744  *    with only ctime as the argument.  If the server ctime differs from
3745  *    what is cached on the client, the verify will fail, but we would
3746  *    already have the ctime from the preceding getattr, so just set it
3747  *    and retry.  Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3748  *    Setattr; Getattr }.
3749  *
3750  * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3751  * this setattr and NULL if they are not.
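 *
 * numops selects between the two forms below: 3 for the plain
 * { Putfh; Setattr; Getattr } compound, and 5 when the size is being
 * changed and the file has no delegation (or its delegation return is
 * pending).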
3752 */ 3753 static int 3754 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3755 vsecattr_t *vsap) 3756 { 3757 COMPOUND4args_clnt args; 3758 COMPOUND4res_clnt res, *resp = NULL; 3759 nfs4_ga_res_t *garp = NULL; 3760 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3761 nfs_argop4 argop[5]; 3762 int verify_argop = -1; 3763 int setattr_argop = 1; 3764 nfs_resop4 *resop; 3765 vattr_t va; 3766 rnode4_t *rp; 3767 int doqueue = 1; 3768 uint_t mask = vap->va_mask; 3769 mode_t omode; 3770 vsecattr_t *vsp; 3771 timestruc_t ctime; 3772 bool_t needrecov = FALSE; 3773 nfs4_recov_state_t recov_state; 3774 nfs4_stateid_types_t sid_types; 3775 stateid4 stateid; 3776 hrtime_t t; 3777 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3778 servinfo4_t *svp; 3779 bitmap4 supp_attrs; 3780 3781 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3782 rp = VTOR4(vp); 3783 nfs4_init_stateid_types(&sid_types); 3784 3785 /* 3786 * Only need to flush pages if there are any pages and 3787 * if the file is marked as dirty in some fashion. The 3788 * file must be flushed so that we can accurately 3789 * determine the size of the file and the cached data 3790 * after the SETATTR returns. A file is considered to 3791 * be dirty if it is either marked with R4DIRTY, has 3792 * outstanding i/o's active, or is mmap'd. In this 3793 * last case, we can't tell whether there are dirty 3794 * pages, so we flush just to be sure. 3795 */ 3796 if (nfs4_has_pages(vp) && 3797 ((rp->r_flags & R4DIRTY) || 3798 rp->r_count > 0 || 3799 rp->r_mapcnt > 0)) { 3800 ASSERT(vp->v_type != VCHR); 3801 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3802 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3803 mutex_enter(&rp->r_statelock); 3804 if (!rp->r_error) 3805 rp->r_error = e.error; 3806 mutex_exit(&rp->r_statelock); 3807 } 3808 } 3809 3810 if (mask & AT_SIZE) { 3811 /* 3812 * Verification setattr compound for non-deleg AT_SIZE: 3813 * { Putfh; Getattr; Verify; Setattr; Getattr } 3814 * Set ctime local here (outside the do_again label) 3815 * so that subsequent retries (after failed VERIFY) 3816 * will use ctime from GETATTR results (from failed 3817 * verify compound) as VERIFY arg. 3818 * If file has delegation, then VERIFY(time_metadata) 3819 * is of little added value, so don't bother. 3820 */ 3821 mutex_enter(&rp->r_statev4_lock); 3822 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3823 rp->r_deleg_return_pending) { 3824 numops = 5; 3825 ctime = rp->r_attr.va_ctime; 3826 } 3827 mutex_exit(&rp->r_statev4_lock); 3828 } 3829 3830 recov_state.rs_flags = 0; 3831 recov_state.rs_num_retry_despite_err = 0; 3832 3833 args.ctag = TAG_SETATTR; 3834 do_again: 3835 recov_retry: 3836 setattr_argop = numops - 2; 3837 3838 args.array = argop; 3839 args.array_len = numops; 3840 3841 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3842 if (e.error) 3843 return (e.error); 3844 3845 3846 /* putfh target fh */ 3847 argop[0].argop = OP_CPUTFH; 3848 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3849 3850 if (numops == 5) { 3851 /* 3852 * We only care about the ctime, but need to get mtime 3853 * and size for proper cache update. 
3854 */ 3855 /* getattr */ 3856 argop[1].argop = OP_GETATTR; 3857 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3858 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3859 3860 /* verify - set later in loop */ 3861 verify_argop = 2; 3862 } 3863 3864 /* setattr */ 3865 svp = rp->r_server; 3866 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3867 supp_attrs = svp->sv_supp_attrs; 3868 nfs_rw_exit(&svp->sv_lock); 3869 3870 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3871 supp_attrs, &e.error, &sid_types); 3872 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3873 if (e.error) { 3874 /* req time field(s) overflow - return immediately */ 3875 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3876 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3877 opsetattr.obj_attributes); 3878 return (e.error); 3879 } 3880 omode = rp->r_attr.va_mode; 3881 3882 /* getattr */ 3883 argop[numops-1].argop = OP_GETATTR; 3884 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3885 /* 3886 * If we are setting the ACL (indicated only by vsap != NULL), request 3887 * the ACL in this getattr. The ACL returned from this getattr will be 3888 * used in updating the ACL cache. 3889 */ 3890 if (vsap != NULL) 3891 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3892 FATTR4_ACL_MASK; 3893 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3894 3895 /* 3896 * setattr iterates if the object size is set and the cached ctime 3897 * does not match the file ctime. In that case, verify the ctime first. 3898 */ 3899 3900 do { 3901 if (verify_argop != -1) { 3902 /* 3903 * Verify that the ctime matches before doing the setattr. 3904 */ 3905 va.va_mask = AT_CTIME; 3906 va.va_ctime = ctime; 3907 svp = rp->r_server; 3908 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3909 supp_attrs = svp->sv_supp_attrs; 3910 nfs_rw_exit(&svp->sv_lock); 3911 e.error = nfs4args_verify(&argop[verify_argop], &va, 3912 OP_VERIFY, supp_attrs); 3913 if (e.error) { 3914 /* req time field(s) overflow - return */ 3915 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3916 needrecov); 3917 break; 3918 } 3919 } 3920 3921 doqueue = 1; 3922 3923 t = gethrtime(); 3924 3925 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3926 3927 /* 3928 * Purge the access cache and ACL cache if changing either the 3929 * owner of the file, the group owner, or the mode. These may 3930 * change the access permissions of the file, so purge old 3931 * information and start over again. 3932 */ 3933 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3934 (void) nfs4_access_purge_rp(rp); 3935 if (rp->r_secattr != NULL) { 3936 mutex_enter(&rp->r_statelock); 3937 vsp = rp->r_secattr; 3938 rp->r_secattr = NULL; 3939 mutex_exit(&rp->r_statelock); 3940 if (vsp != NULL) 3941 nfs4_acl_free_cache(vsp); 3942 } 3943 } 3944 3945 /* 3946 * If res.array_len == numops, then everything succeeded, 3947 * except for possibly the final getattr. If only the 3948 * last getattr failed, give up, and don't try recovery. 3949 */ 3950 if (res.array_len == numops) { 3951 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3952 needrecov); 3953 if (!
e.error) 3954 resp = &res; 3955 break; 3956 } 3957 3958 /* 3959 * if either rpc call failed or completely succeeded - done 3960 */ 3961 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3962 if (e.error) { 3963 PURGE_ATTRCACHE4(vp); 3964 if (!needrecov) { 3965 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3966 needrecov); 3967 break; 3968 } 3969 } 3970 3971 /* 3972 * Do proper retry for OLD_STATEID outside of the normal 3973 * recovery framework. 3974 */ 3975 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3976 sid_types.cur_sid_type != SPEC_SID && 3977 sid_types.cur_sid_type != NO_SID) { 3978 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3979 needrecov); 3980 nfs4_save_stateid(&stateid, &sid_types); 3981 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3982 opsetattr.obj_attributes); 3983 if (verify_argop != -1) { 3984 nfs4args_verify_free(&argop[verify_argop]); 3985 verify_argop = -1; 3986 } 3987 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3988 goto recov_retry; 3989 } 3990 3991 if (needrecov) { 3992 bool_t abort; 3993 3994 abort = nfs4_start_recovery(&e, 3995 VTOMI4(vp), vp, NULL, NULL, NULL, 3996 OP_SETATTR, NULL, NULL, NULL); 3997 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3998 needrecov); 3999 /* 4000 * Do not retry if we failed with OLD_STATEID using 4001 * a special stateid. This is done to avoid looping 4002 * with a broken server. 4003 */ 4004 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 4005 (sid_types.cur_sid_type == SPEC_SID || 4006 sid_types.cur_sid_type == NO_SID)) 4007 abort = TRUE; 4008 if (!e.error) { 4009 if (res.status == NFS4ERR_BADOWNER) 4010 nfs4_log_badowner(VTOMI4(vp), 4011 OP_SETATTR); 4012 4013 e.error = geterrno4(res.status); 4014 (void) xdr_free(xdr_COMPOUND4res_clnt, 4015 (caddr_t)&res); 4016 } 4017 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4018 opsetattr.obj_attributes); 4019 if (verify_argop != -1) { 4020 nfs4args_verify_free(&argop[verify_argop]); 4021 verify_argop = -1; 4022 } 4023 if (abort == FALSE) { 4024 /* 4025 * Need to retry all possible stateids in 4026 * case the recovery error wasn't stateid 4027 * related or the stateids have become 4028 * stale (server reboot). 4029 */ 4030 nfs4_init_stateid_types(&sid_types); 4031 goto recov_retry; 4032 } 4033 return (e.error); 4034 } 4035 4036 /* 4037 * Need to call nfs4_end_op before nfs4getattr to 4038 * avoid potential nfs4_start_op deadlock. See RFE 4039 * 4777612. Calls to nfs4_invalidate_pages() and 4040 * nfs4_purge_stale_fh() might also generate over the 4041 * wire calls which may cause nfs4_start_op() deadlock. 4042 */ 4043 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4044 4045 /* 4046 * Check to update lease. 4047 */ 4048 resp = &res; 4049 if (res.status == NFS4_OK) { 4050 break; 4051 } 4052 4053 /* 4054 * Check if the verify failed, to see whether to try again. 4055 */ 4056 if ((verify_argop == -1) || (res.array_len != 3)) { 4057 /* 4058 * can't continue... 4059 */ 4060 if (res.status == NFS4ERR_BADOWNER) 4061 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 4062 4063 e.error = geterrno4(res.status); 4064 } else { 4065 /* 4066 * When the verify request fails, the client ctime is 4067 * not in sync with the server. This is the same as 4068 * the version 3 "not synchronized" error, and we 4069 * handle it in a similar manner (XXX do we need to???). 4070 * Use the ctime returned in the first getattr for 4071 * the input to the next verify.
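 * (That is the do_again path below: free the setattr and verify
 * args, xdr_free the stale results, and re-drive the compound with
 * the fresh ctime.)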
4072 * If we couldn't get the attributes, then we give up 4073 * because we can't complete the operation as required. 4074 */ 4075 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4076 } 4077 if (e.error) { 4078 PURGE_ATTRCACHE4(vp); 4079 nfs4_purge_stale_fh(e.error, vp, cr); 4080 } else { 4081 /* 4082 * retry with a new verify value 4083 */ 4084 ctime = garp->n4g_va.va_ctime; 4085 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4086 resp = NULL; 4087 } 4088 if (!e.error) { 4089 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4090 opsetattr.obj_attributes); 4091 if (verify_argop != -1) { 4092 nfs4args_verify_free(&argop[verify_argop]); 4093 verify_argop = -1; 4094 } 4095 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4096 goto do_again; 4097 } 4098 } while (!e.error); 4099 4100 if (e.error) { 4101 /* 4102 * If we are here, rfs4call has an irrecoverable error - return 4103 */ 4104 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4105 opsetattr.obj_attributes); 4106 if (verify_argop != -1) { 4107 nfs4args_verify_free(&argop[verify_argop]); 4108 verify_argop = -1; 4109 } 4110 if (resp) 4111 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4112 return (e.error); 4113 } 4114 4115 4116 4117 /* 4118 * If changing the size of the file, invalidate 4119 * any local cached data which is no longer part 4120 * of the file. We also possibly invalidate the 4121 * last page in the file. We could use 4122 * pvn_vpzero(), but this would mark the page as 4123 * modified and require it to be written back to 4124 * the server for no particularly good reason. 4125 * This way, if we access it, then we bring it 4126 * back in. A read should be cheaper than a 4127 * write. 4128 */ 4129 if (mask & AT_SIZE) { 4130 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4131 } 4132 4133 /* either no error or one of the post-op getattrs failed */ 4134 4135 /* 4136 * XXX Perform a simplified version of wcc checking. Instead of 4137 * having another getattr to get pre-op, just purge cache if 4138 * any of the ops prior to and including the getattr failed. 4139 * If the getattr succeeded then update the attrcache accordingly. 4140 */ 4141 4142 garp = NULL; 4143 if (res.status == NFS4_OK) { 4144 /* 4145 * Last getattr 4146 */ 4147 resop = &res.array[numops - 1]; 4148 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4149 } 4150 /* 4151 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4152 * rather than filling it. See the function itself for details. 4153 */ 4154 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4155 if (garp != NULL) { 4156 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4157 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4158 vs_ace4_destroy(&garp->n4g_vsa); 4159 } else { 4160 if (vsap != NULL) { 4161 /* 4162 * The ACL was supposed to be set and to be 4163 * returned in the last getattr of this 4164 * compound, but for some reason the getattr 4165 * result doesn't contain the ACL. In this 4166 * case, purge the ACL cache. 4167 */ 4168 if (rp->r_secattr != NULL) { 4169 mutex_enter(&rp->r_statelock); 4170 vsp = rp->r_secattr; 4171 rp->r_secattr = NULL; 4172 mutex_exit(&rp->r_statelock); 4173 if (vsp != NULL) 4174 nfs4_acl_free_cache(vsp); 4175 } 4176 } 4177 } 4178 } 4179 4180 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4181 /* 4182 * Set the size, rather than relying on getting it updated 4183 * via a GETATTR. With delegations the client tries to 4184 * suppress GETATTR calls.
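 * Assigning r_size directly (under r_statelock below) keeps the
 * cached size coherent without costing another over-the-wire
 * GETATTR.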
4185 */ 4186 mutex_enter(&rp->r_statelock); 4187 rp->r_size = vap->va_size; 4188 mutex_exit(&rp->r_statelock); 4189 } 4190 4191 /* 4192 * Can free up request args and res 4193 */ 4194 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4195 opsetattr.obj_attributes); 4196 if (verify_argop != -1) { 4197 nfs4args_verify_free(&argop[verify_argop]); 4198 verify_argop = -1; 4199 } 4200 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4201 4202 /* 4203 * Some servers will change the mode to clear the setuid 4204 * and setgid bits when changing the uid or gid. The 4205 * client needs to compensate appropriately. 4206 */ 4207 if (mask & (AT_UID | AT_GID)) { 4208 int terror, do_setattr; 4209 4210 do_setattr = 0; 4211 va.va_mask = AT_MODE; 4212 terror = nfs4getattr(vp, &va, cr); 4213 if (!terror && 4214 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4215 (!(mask & AT_MODE) && va.va_mode != omode))) { 4216 va.va_mask = AT_MODE; 4217 if (mask & AT_MODE) { 4218 /* 4219 * We asked the mode to be changed and what 4220 * we just got from the server in getattr is 4221 * not what we wanted it to be, so set it now. 4222 */ 4223 va.va_mode = vap->va_mode; 4224 do_setattr = 1; 4225 } else { 4226 /* 4227 * We did not ask for the mode to be changed. 4228 * Check to see that the server just cleared 4229 * S_ISUID and S_ISGID from it. If not, then 4230 * set mode to omode with those bits cleared. 4231 */ 4232 if (nfs4_compare_modes(va.va_mode, omode)) { 4233 omode &= ~(S_ISUID|S_ISGID); 4234 va.va_mode = omode; 4235 do_setattr = 1; 4236 } 4237 } 4238 4239 if (do_setattr) 4240 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4241 } 4242 } 4243 4244 return (e.error); 4245 } 4246 4247 /* ARGSUSED */ 4248 static int 4249 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 4250 { 4251 COMPOUND4args_clnt args; 4252 COMPOUND4res_clnt res; 4253 int doqueue; 4254 uint32_t acc, resacc, argacc; 4255 rnode4_t *rp; 4256 cred_t *cred, *ncr, *ncrfree = NULL; 4257 nfs4_access_type_t cacc; 4258 int num_ops; 4259 nfs_argop4 argop[3]; 4260 nfs_resop4 *resop; 4261 bool_t needrecov = FALSE, do_getattr; 4262 nfs4_recov_state_t recov_state; 4263 int rpc_error; 4264 hrtime_t t; 4265 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4266 mntinfo4_t *mi = VTOMI4(vp); 4267 4268 if (nfs_zone() != mi->mi_zone) 4269 return (EIO); 4270 4271 acc = 0; 4272 if (mode & VREAD) 4273 acc |= ACCESS4_READ; 4274 if (mode & VWRITE) { 4275 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4276 return (EROFS); 4277 if (vp->v_type == VDIR) 4278 acc |= ACCESS4_DELETE; 4279 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4280 } 4281 if (mode & VEXEC) { 4282 if (vp->v_type == VDIR) 4283 acc |= ACCESS4_LOOKUP; 4284 else 4285 acc |= ACCESS4_EXECUTE; 4286 } 4287 4288 if (VTOR4(vp)->r_acache != NULL) { 4289 e.error = nfs4_validate_caches(vp, cr); 4290 if (e.error) 4291 return (e.error); 4292 } 4293 4294 rp = VTOR4(vp); 4295 if (vp->v_type == VDIR) 4296 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4297 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4298 else 4299 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4300 ACCESS4_EXECUTE; 4301 recov_state.rs_flags = 0; 4302 recov_state.rs_num_retry_despite_err = 0; 4303 4304 cred = cr; 4305 /* 4306 * ncr and ncrfree both initially 4307 * point to the memory area returned 4308 * by crnetadjust(). 4309 * ncrfree being non-NULL when exiting means 4310 * that we need to release it. 4311 */ 4312 ncr = crnetadjust(cred); 4313 ncrfree = ncr; 4314 4315 tryagain: 4316 cacc = nfs4_access_check(rp, acc,
cred); 4317 if (cacc == NFS4_ACCESS_ALLOWED) { 4318 if (ncrfree != NULL) 4319 crfree(ncrfree); 4320 return (0); 4321 } 4322 if (cacc == NFS4_ACCESS_DENIED) { 4323 /* 4324 * If the cred can be adjusted, try again 4325 * with the new cred. 4326 */ 4327 if (ncr != NULL) { 4328 cred = ncr; 4329 ncr = NULL; 4330 goto tryagain; 4331 } 4332 if (ncrfree != NULL) 4333 crfree(ncrfree); 4334 return (EACCES); 4335 } 4336 4337 recov_retry: 4338 /* 4339 * Don't take the r_statev4_lock here. r_deleg_type could 4340 * change as soon as the lock is released. Since it is an int, 4341 * there is no atomicity issue. 4342 */ 4343 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4344 num_ops = do_getattr ? 3 : 2; 4345 4346 args.ctag = TAG_ACCESS; 4347 4348 args.array_len = num_ops; 4349 args.array = argop; 4350 4351 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4352 &recov_state, NULL)) { 4353 if (ncrfree != NULL) 4354 crfree(ncrfree); 4355 return (e.error); 4356 } 4357 4358 /* putfh target fh */ 4359 argop[0].argop = OP_CPUTFH; 4360 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4361 4362 /* access */ 4363 argop[1].argop = OP_ACCESS; 4364 argop[1].nfs_argop4_u.opaccess.access = argacc; 4365 4366 /* getattr */ 4367 if (do_getattr) { 4368 argop[2].argop = OP_GETATTR; 4369 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4370 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4371 } 4372 4373 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4374 "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first", 4375 rnode4info(VTOR4(vp)))); 4376 4377 doqueue = 1; 4378 t = gethrtime(); 4379 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4380 rpc_error = e.error; 4381 4382 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4383 if (needrecov) { 4384 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4385 "nfs4_access: initiating recovery\n")); 4386 4387 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4388 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) { 4389 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4390 &recov_state, needrecov); 4391 if (!e.error) 4392 (void) xdr_free(xdr_COMPOUND4res_clnt, 4393 (caddr_t)&res); 4394 goto recov_retry; 4395 } 4396 } 4397 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4398 4399 if (e.error) 4400 goto out; 4401 4402 if (res.status) { 4403 e.error = geterrno4(res.status); 4404 /* 4405 * This might generate over the wire calls through 4406 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4407 * here to avoid a deadlock. 4408 */ 4409 nfs4_purge_stale_fh(e.error, vp, cr); 4410 goto out; 4411 } 4412 resop = &res.array[1]; /* access res */ 4413 4414 resacc = resop->nfs_resop4_u.opaccess.access; 4415 4416 if (do_getattr) { 4417 resop++; /* getattr res */ 4418 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4419 t, cr, FALSE, NULL); 4420 } 4421 4422 if (!e.error) { 4423 nfs4_access_cache(rp, argacc, resacc, cred); 4424 /* 4425 * we just cached results with cred; if cred is the 4426 * adjusted credentials from crnetadjust, we do not want 4427 * to release them before exiting: hence setting ncrfree 4428 * to NULL 4429 */ 4430 if (cred != cr) 4431 ncrfree = NULL; 4432 /* XXX check the supported bits too? */ 4433 if ((acc & resacc) != acc) { 4434 /* 4435 * The following code implements the semantic 4436 * that a setuid root program has *at least* the 4437 * permissions of the user that is running the 4438 * program. See rfs3call() for more portions 4439 * of the implementation of this functionality.
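 * In outline (an informal sketch of the code below): the first
 * ACCESS goes out with the caller's cred; if the server grants less
 * than requested and crnetadjust() produced an alternate cred (ncr),
 * the stale results are freed and the request is retried once with
 * that cred via the tryagain label.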
4440 */ 4441 /* XXX-LP */ 4442 if (ncr != NULL) { 4443 (void) xdr_free(xdr_COMPOUND4res_clnt, 4444 (caddr_t)&res); 4445 cred = ncr; 4446 ncr = NULL; 4447 goto tryagain; 4448 } 4449 e.error = EACCES; 4450 } 4451 } 4452 4453 out: 4454 if (!rpc_error) 4455 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4456 4457 if (ncrfree != NULL) 4458 crfree(ncrfree); 4459 4460 return (e.error); 4461 } 4462 4463 /* ARGSUSED */ 4464 static int 4465 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4466 { 4467 COMPOUND4args_clnt args; 4468 COMPOUND4res_clnt res; 4469 int doqueue; 4470 rnode4_t *rp; 4471 nfs_argop4 argop[3]; 4472 nfs_resop4 *resop; 4473 READLINK4res *lr_res; 4474 nfs4_ga_res_t *garp; 4475 uint_t len; 4476 char *linkdata; 4477 bool_t needrecov = FALSE; 4478 nfs4_recov_state_t recov_state; 4479 hrtime_t t; 4480 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4481 4482 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4483 return (EIO); 4484 /* 4485 * Can't readlink anything other than a symbolic link. 4486 */ 4487 if (vp->v_type != VLNK) 4488 return (EINVAL); 4489 4490 rp = VTOR4(vp); 4491 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4492 e.error = nfs4_validate_caches(vp, cr); 4493 if (e.error) 4494 return (e.error); 4495 mutex_enter(&rp->r_statelock); 4496 if (rp->r_symlink.contents != NULL) { 4497 e.error = uiomove(rp->r_symlink.contents, 4498 rp->r_symlink.len, UIO_READ, uiop); 4499 mutex_exit(&rp->r_statelock); 4500 return (e.error); 4501 } 4502 mutex_exit(&rp->r_statelock); 4503 } 4504 recov_state.rs_flags = 0; 4505 recov_state.rs_num_retry_despite_err = 0; 4506 4507 recov_retry: 4508 args.array_len = 3; 4509 args.array = argop; 4510 args.ctag = TAG_READLINK; 4511 4512 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4513 if (e.error) { 4514 return (e.error); 4515 } 4516 4517 /* 0. putfh symlink fh */ 4518 argop[0].argop = OP_CPUTFH; 4519 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4520 4521 /* 1. readlink */ 4522 argop[1].argop = OP_READLINK; 4523 4524 /* 2. getattr */ 4525 argop[2].argop = OP_GETATTR; 4526 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4527 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4528 4529 doqueue = 1; 4530 4531 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4532 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4533 rnode4info(VTOR4(vp)))); 4534 4535 t = gethrtime(); 4536 4537 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4538 4539 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4540 if (needrecov) { 4541 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4542 "nfs4_readlink: initiating recovery\n")); 4543 4544 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4545 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) { 4546 if (!e.error) 4547 (void) xdr_free(xdr_COMPOUND4res_clnt, 4548 (caddr_t)&res); 4549 4550 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4551 needrecov); 4552 goto recov_retry; 4553 } 4554 } 4555 4556 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4557 4558 if (e.error) 4559 return (e.error); 4560 4561 /* 4562 * There is a path in the code below which calls 4563 * nfs4_purge_stale_fh(), which may generate otw calls through 4564 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4565 * here to avoid nfs4_start_op() deadlock.
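 *
 * On success the reply parallels the request; sketching the result
 * array consumed below:
 *	res.array[0]  PUTFH
 *	res.array[1]  READLINK	(link text as a counted utf8 string)
 *	res.array[2]  GETATTR	(post-op attributes)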
4566 */ 4567 4568 if (res.status && (res.array_len < args.array_len)) { 4569 /* 4570 * either Putfh or Readlink failed 4571 */ 4572 e.error = geterrno4(res.status); 4573 nfs4_purge_stale_fh(e.error, vp, cr); 4574 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4575 return (e.error); 4576 } 4577 4578 resop = &res.array[1]; /* readlink res */ 4579 lr_res = &resop->nfs_resop4_u.opreadlink; 4580 4581 /* 4582 * treat symlink names as data 4583 */ 4584 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4585 if (linkdata != NULL) { 4586 int uio_len = len - 1; 4587 /* len includes null byte, which we won't uiomove */ 4588 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4589 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4590 mutex_enter(&rp->r_statelock); 4591 if (rp->r_symlink.contents == NULL) { 4592 rp->r_symlink.contents = linkdata; 4593 rp->r_symlink.len = uio_len; 4594 rp->r_symlink.size = len; 4595 mutex_exit(&rp->r_statelock); 4596 } else { 4597 mutex_exit(&rp->r_statelock); 4598 kmem_free(linkdata, len); 4599 } 4600 } else { 4601 kmem_free(linkdata, len); 4602 } 4603 } 4604 if (res.status == NFS4_OK) { 4605 resop++; /* getattr res */ 4606 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4607 } 4608 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4609 4610 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4611 4612 /* 4613 * The over the wire error for attempting to readlink something 4614 * other than a symbolic link is ENXIO. However, we need to 4615 * return EINVAL instead of ENXIO, so we map it here. 4616 */ 4617 return (e.error == ENXIO ? EINVAL : e.error); 4618 } 4619 4620 /* 4621 * Flush local dirty pages to stable storage on the server. 4622 * 4623 * If FNODSYNC is specified, then there is nothing to do because 4624 * metadata changes are not cached on the client before being 4625 * sent to the server. 4626 */ 4627 /* ARGSUSED */ 4628 static int 4629 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4630 { 4631 int error; 4632 4633 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4634 return (0); 4635 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4636 return (EIO); 4637 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4638 if (!error) 4639 error = VTOR4(vp)->r_error; 4640 return (error); 4641 } 4642 4643 /* 4644 * Weirdness: if the file was removed or the target of a rename 4645 * operation while it was open, it got renamed instead. Here we 4646 * remove the renamed file. 4647 */ 4648 /* ARGSUSED */ 4649 void 4650 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4651 { 4652 rnode4_t *rp; 4653 4654 ASSERT(vp != DNLC_NO_VNODE); 4655 4656 rp = VTOR4(vp); 4657 4658 if (IS_SHADOW(vp, rp)) { 4659 sv_inactive(vp); 4660 return; 4661 } 4662 4663 /* 4664 * If this is coming from the wrong zone, we let someone in the right 4665 * zone take care of it asynchronously. We can get here due to 4666 * VN_RELE() being called from pageout() or fsflush(). This call may 4667 * potentially turn into an expensive no-op if, for instance, v_count 4668 * gets incremented in the meantime, but it's still correct. 4669 */ 4670 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4671 nfs4_async_inactive(vp, cr); 4672 return; 4673 } 4674 4675 /* 4676 * Some of the cleanup steps might require over-the-wire 4677 * operations.
Since VOP_INACTIVE can get called as a result of 4678 * other over-the-wire operations (e.g., an attribute cache update 4679 * can lead to a DNLC purge), doing those steps now would lead to a 4680 * nested call to the recovery framework, which can deadlock. So 4681 * do any over-the-wire cleanups asynchronously, in a separate 4682 * thread. 4683 */ 4684 4685 mutex_enter(&rp->r_os_lock); 4686 mutex_enter(&rp->r_statelock); 4687 mutex_enter(&rp->r_statev4_lock); 4688 4689 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4690 mutex_exit(&rp->r_statev4_lock); 4691 mutex_exit(&rp->r_statelock); 4692 mutex_exit(&rp->r_os_lock); 4693 nfs4_async_inactive(vp, cr); 4694 return; 4695 } 4696 4697 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4698 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4699 mutex_exit(&rp->r_statev4_lock); 4700 mutex_exit(&rp->r_statelock); 4701 mutex_exit(&rp->r_os_lock); 4702 nfs4_async_inactive(vp, cr); 4703 return; 4704 } 4705 4706 if (rp->r_unldvp != NULL) { 4707 mutex_exit(&rp->r_statev4_lock); 4708 mutex_exit(&rp->r_statelock); 4709 mutex_exit(&rp->r_os_lock); 4710 nfs4_async_inactive(vp, cr); 4711 return; 4712 } 4713 mutex_exit(&rp->r_statev4_lock); 4714 mutex_exit(&rp->r_statelock); 4715 mutex_exit(&rp->r_os_lock); 4716 4717 rp4_addfree(rp, cr); 4718 } 4719 4720 /* 4721 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4722 * various bits of state. The caller must not refer to vp after this call. 4723 */ 4724 4725 void 4726 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4727 { 4728 rnode4_t *rp = VTOR4(vp); 4729 nfs4_recov_state_t recov_state; 4730 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4731 vnode_t *unldvp; 4732 char *unlname; 4733 cred_t *unlcred; 4734 COMPOUND4args_clnt args; 4735 COMPOUND4res_clnt res, *resp; 4736 nfs_argop4 argop[2]; 4737 int doqueue; 4738 #ifdef DEBUG 4739 char *name; 4740 #endif 4741 4742 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4743 ASSERT(!IS_SHADOW(vp, rp)); 4744 4745 #ifdef DEBUG 4746 name = fn_name(VTOSV(vp)->sv_name); 4747 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4748 "release vnode %s", name)); 4749 kmem_free(name, MAXNAMELEN); 4750 #endif 4751 4752 if (vp->v_type == VREG) { 4753 bool_t recov_failed = FALSE; 4754 4755 e.error = nfs4close_all(vp, cr); 4756 if (e.error) { 4757 /* Check to see if recovery failed */ 4758 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4759 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4760 recov_failed = TRUE; 4761 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4762 if (!recov_failed) { 4763 mutex_enter(&rp->r_statelock); 4764 if (rp->r_flags & R4RECOVERR) 4765 recov_failed = TRUE; 4766 mutex_exit(&rp->r_statelock); 4767 } 4768 if (recov_failed) { 4769 NFS4_DEBUG(nfs4_client_recov_debug, 4770 (CE_NOTE, "nfs4_inactive_otw: " 4771 "close failed (recovery failure)")); 4772 } 4773 } 4774 } 4775 4776 redo: 4777 if (rp->r_unldvp == NULL) { 4778 rp4_addfree(rp, cr); 4779 return; 4780 } 4781 4782 /* 4783 * Save the vnode pointer for the directory where the 4784 * unlinked-open file got renamed, then set it to NULL 4785 * to prevent another thread from getting here before 4786 * we're done with the remove. While we have the 4787 * statelock, make local copies of the pertinent rnode 4788 * fields. If we didn't do this in an atomic way, the 4789 * unl* fields could become inconsistent with respect 4790 * to each other due to a race condition between this 4791 * code and nfs_remove(). See bug report 1034328.
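 * (The claim is made by copying each field to a local and NULLing
 * the rnode copy under a single r_statelock hold, so exactly one
 * thread goes on to do the REMOVE.)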
4792 */ 4793 mutex_enter(&rp->r_statelock); 4794 if (rp->r_unldvp == NULL) { 4795 mutex_exit(&rp->r_statelock); 4796 rp4_addfree(rp, cr); 4797 return; 4798 } 4799 4800 unldvp = rp->r_unldvp; 4801 rp->r_unldvp = NULL; 4802 unlname = rp->r_unlname; 4803 rp->r_unlname = NULL; 4804 unlcred = rp->r_unlcred; 4805 rp->r_unlcred = NULL; 4806 mutex_exit(&rp->r_statelock); 4807 4808 /* 4809 * If there are any dirty pages left, then flush 4810 * them. This is unfortunate because they just 4811 * may get thrown away during the remove operation, 4812 * but we have to do this for correctness. 4813 */ 4814 if (nfs4_has_pages(vp) && 4815 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4816 ASSERT(vp->v_type != VCHR); 4817 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL); 4818 if (e.error) { 4819 mutex_enter(&rp->r_statelock); 4820 if (!rp->r_error) 4821 rp->r_error = e.error; 4822 mutex_exit(&rp->r_statelock); 4823 } 4824 } 4825 4826 recov_state.rs_flags = 0; 4827 recov_state.rs_num_retry_despite_err = 0; 4828 recov_retry_remove: 4829 /* 4830 * Do the remove operation on the renamed file 4831 */ 4832 args.ctag = TAG_INACTIVE; 4833 4834 /* 4835 * Remove ops: putfh dir; remove 4836 */ 4837 args.array_len = 2; 4838 args.array = argop; 4839 4840 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4841 if (e.error) { 4842 kmem_free(unlname, MAXNAMELEN); 4843 crfree(unlcred); 4844 VN_RELE(unldvp); 4845 /* 4846 * Try again; this time around r_unldvp will be NULL, so we'll 4847 * just call rp4_addfree() and return. 4848 */ 4849 goto redo; 4850 } 4851 4852 /* putfh directory */ 4853 argop[0].argop = OP_CPUTFH; 4854 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4855 4856 /* remove */ 4857 argop[1].argop = OP_CREMOVE; 4858 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4859 4860 doqueue = 1; 4861 resp = &res; 4862 4863 #if 0 /* notyet */ 4864 /* 4865 * Can't do this yet. We may be being called from 4866 * dnlc_purge_XXX while that routine is holding a 4867 * mutex lock to the nc_rele list. The calls to 4868 * nfs3_cache_wcc_data may result in calls to 4869 * dnlc_purge_XXX. This will result in a deadlock. 4870 */ 4871 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4872 if (e.error) { 4873 PURGE_ATTRCACHE4(unldvp); 4874 resp = NULL; 4875 } else if (res.status) { 4876 e.error = geterrno4(res.status); 4877 PURGE_ATTRCACHE4(unldvp); 4878 /* 4879 * This code is inactive right now 4880 * but if made active there should 4881 * be a nfs4_end_op() call before 4882 * nfs4_purge_stale_fh to avoid start_op() 4883 * deadlock. See BugId: 4948726 4884 */ 4885 nfs4_purge_stale_fh(error, unldvp, cr); 4886 } else { 4887 nfs_resop4 *resop; 4888 REMOVE4res *rm_res; 4889 4890 resop = &res.array[1]; 4891 rm_res = &resop->nfs_resop4_u.opremove; 4892 /* 4893 * Update directory cache attribute, 4894 * readdir and dnlc caches. 
4895 */ 4896 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4897 } 4898 #else 4899 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4900 4901 PURGE_ATTRCACHE4(unldvp); 4902 #endif 4903 4904 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4905 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4906 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 4907 if (!e.error) 4908 (void) xdr_free(xdr_COMPOUND4res_clnt, 4909 (caddr_t)&res); 4910 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4911 &recov_state, TRUE); 4912 goto recov_retry_remove; 4913 } 4914 } 4915 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4916 4917 /* 4918 * Release stuff held for the remove 4919 */ 4920 VN_RELE(unldvp); 4921 if (!e.error && resp) 4922 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4923 4924 kmem_free(unlname, MAXNAMELEN); 4925 crfree(unlcred); 4926 goto redo; 4927 } 4928 4929 /* 4930 * Remote file system operations having to do with directory manipulation. 4931 */ 4932 /* ARGSUSED3 */ 4933 int 4934 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4935 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 4936 int *direntflags, pathname_t *realpnp) 4937 { 4938 int error; 4939 vnode_t *vp, *avp = NULL; 4940 rnode4_t *drp; 4941 4942 *vpp = NULL; 4943 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4944 return (EPERM); 4945 /* 4946 * if LOOKUP_XATTR, must replace dvp (object) with 4947 * object's attrdir before continuing with lookup 4948 */ 4949 if (flags & LOOKUP_XATTR) { 4950 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4951 if (error) 4952 return (error); 4953 4954 dvp = avp; 4955 4956 /* 4957 * If lookup is for "", just return dvp now. The attrdir 4958 * has already been activated (from nfs4lookup_xattr), and 4959 * the caller will RELE the original dvp -- not 4960 * the attrdir. So, set vpp and return. 4961 * Currently, when the LOOKUP_XATTR flag is 4962 * passed to VOP_LOOKUP, the name is always empty, and 4963 * shortcircuiting here avoids 3 unneeded lock/unlock 4964 * pairs. 4965 * 4966 * If a non-empty name was provided, then it is the 4967 * attribute name, and it will be looked up below. 4968 */ 4969 if (*nm == '\0') { 4970 *vpp = dvp; 4971 return (0); 4972 } 4973 4974 /* 4975 * The vfs layer never sends a name when asking for the 4976 * attrdir, so we should never get here (unless of course 4977 * name is passed at some time in future -- at which time 4978 * we'll blow up here). 4979 */ 4980 ASSERT(0); 4981 } 4982 4983 drp = VTOR4(dvp); 4984 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4985 return (EINTR); 4986 4987 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4988 nfs_rw_exit(&drp->r_rwlock); 4989 4990 /* 4991 * If vnode is a device, create special vnode. 
4992 */ 4993 if (!error && ISVDEV((*vpp)->v_type)) { 4994 vp = *vpp; 4995 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 4996 VN_RELE(vp); 4997 } 4998 4999 return (error); 5000 } 5001 5002 /* ARGSUSED */ 5003 static int 5004 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 5005 { 5006 int error; 5007 rnode4_t *drp; 5008 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 5009 mntinfo4_t *mi; 5010 5011 mi = VTOMI4(dvp); 5012 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) && 5013 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS)) 5014 return (EINVAL); 5015 5016 drp = VTOR4(dvp); 5017 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5018 return (EINTR); 5019 5020 mutex_enter(&drp->r_statelock); 5021 /* 5022 * If the server doesn't support xattrs just return EINVAL 5023 */ 5024 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 5025 mutex_exit(&drp->r_statelock); 5026 nfs_rw_exit(&drp->r_rwlock); 5027 return (EINVAL); 5028 } 5029 5030 /* 5031 * If there is a cached xattr directory entry, 5032 * use it as long as the attributes are valid. If the 5033 * attributes are not valid, take the simple approach and 5034 * free the cached value and re-fetch a new value. 5035 * 5036 * We don't do negative entry caching for now; if we did, we 5037 * would need to check if the file has changed on every 5038 * lookup. But xattrs don't exist very often and failing 5039 * an openattr is not much more expensive than an NVERIFY or GETATTR, 5040 * so do an openattr over the wire for now. 5041 */ 5042 if (drp->r_xattr_dir != NULL) { 5043 if (ATTRCACHE4_VALID(dvp)) { 5044 VN_HOLD(drp->r_xattr_dir); 5045 *vpp = drp->r_xattr_dir; 5046 mutex_exit(&drp->r_statelock); 5047 nfs_rw_exit(&drp->r_rwlock); 5048 return (0); 5049 } 5050 VN_RELE(drp->r_xattr_dir); 5051 drp->r_xattr_dir = NULL; 5052 } 5053 mutex_exit(&drp->r_statelock); 5054 5055 error = nfs4openattr(dvp, vpp, cflag, cr); 5056 5057 nfs_rw_exit(&drp->r_rwlock); 5058 5059 return (error); 5060 } 5061 5062 static int 5063 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 5064 { 5065 int error; 5066 rnode4_t *drp; 5067 5068 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5069 5070 /* 5071 * If lookup is for "", just return dvp. Don't need 5072 * to send it over the wire, look it up in the dnlc, 5073 * or perform any access checks. 5074 */ 5075 if (*nm == '\0') { 5076 VN_HOLD(dvp); 5077 *vpp = dvp; 5078 return (0); 5079 } 5080 5081 /* 5082 * Can't do lookups in non-directories. 5083 */ 5084 if (dvp->v_type != VDIR) 5085 return (ENOTDIR); 5086 5087 /* 5088 * If lookup is for ".", just return dvp. Don't need 5089 * to send it over the wire or look it up in the dnlc, 5090 * just need to check access. 5091 */ 5092 if (nm[0] == '.' && nm[1] == '\0') { 5093 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5094 if (error) 5095 return (error); 5096 VN_HOLD(dvp); 5097 *vpp = dvp; 5098 return (0); 5099 } 5100 5101 drp = VTOR4(dvp); 5102 if (!(drp->r_flags & R4LOOKUP)) { 5103 mutex_enter(&drp->r_statelock); 5104 drp->r_flags |= R4LOOKUP; 5105 mutex_exit(&drp->r_statelock); 5106 } 5107 5108 *vpp = NULL; 5109 /* 5110 * Lookup this name in the DNLC. If there is no entry 5111 * lookup over the wire. 5112 */ 5113 if (!skipdnlc) 5114 *vpp = dnlc_lookup(dvp, nm); 5115 if (*vpp == NULL) { 5116 /* 5117 * We need to go over the wire to lookup the name.
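 * nfs4lookupnew_otw() both looks up the name and, in the same
 * compound, checks whether the cached directory attributes are
 * still valid; see its block comment further below.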
5118 */ 5119 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5120 } 5121 5122 /* 5123 * We hit on the dnlc 5124 */ 5125 if (*vpp != DNLC_NO_VNODE || 5126 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5127 /* 5128 * But our attrs may not be valid. 5129 */ 5130 if (ATTRCACHE4_VALID(dvp)) { 5131 error = nfs4_waitfor_purge_complete(dvp); 5132 if (error) { 5133 VN_RELE(*vpp); 5134 *vpp = NULL; 5135 return (error); 5136 } 5137 5138 /* 5139 * After the purge completes, check to make sure 5140 * our attrs are still valid. 5141 */ 5142 if (ATTRCACHE4_VALID(dvp)) { 5143 /* 5144 * If we waited for a purge we may have 5145 * lost our vnode so look it up again. 5146 */ 5147 VN_RELE(*vpp); 5148 *vpp = dnlc_lookup(dvp, nm); 5149 if (*vpp == NULL) 5150 return (nfs4lookupnew_otw(dvp, 5151 nm, vpp, cr)); 5152 5153 /* 5154 * The access cache should almost always hit 5155 */ 5156 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5157 5158 if (error) { 5159 VN_RELE(*vpp); 5160 *vpp = NULL; 5161 return (error); 5162 } 5163 if (*vpp == DNLC_NO_VNODE) { 5164 VN_RELE(*vpp); 5165 *vpp = NULL; 5166 return (ENOENT); 5167 } 5168 return (0); 5169 } 5170 } 5171 } 5172 5173 ASSERT(*vpp != NULL); 5174 5175 /* 5176 * We may have gotten here with one of the following cases: 5177 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5178 * need to validate them. 5179 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5180 * must validate. 5181 * 5182 * Go to the server and check if the directory has changed, if 5183 * it hasn't we are done and can use the dnlc entry. 5184 */ 5185 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5186 } 5187 5188 /* 5189 * Go to the server and check if the directory has changed, if 5190 * it hasn't we are done and can use the dnlc entry. If it 5191 * has changed we get a new copy of its attributes and check 5192 * the access for VEXEC, then relookup the filename and 5193 * get its filehandle and attributes. 5194 * 5195 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5196 * if the NVERIFY failed we must 5197 * purge the caches 5198 * cache new attributes (will set r_time_attr_inval) 5199 * cache new access 5200 * recheck VEXEC access 5201 * add name to dnlc, possibly negative 5202 * if LOOKUP succeeded 5203 * cache new attributes 5204 * else 5205 * set a new r_time_attr_inval for dvp 5206 * check to make sure we have access 5207 * 5208 * The vpp returned is the vnode passed in if the directory is valid, 5209 * a new vnode if successful lookup, or NULL on error. 5210 */ 5211 static int 5212 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5213 { 5214 COMPOUND4args_clnt args; 5215 COMPOUND4res_clnt res; 5216 fattr4 *ver_fattr; 5217 fattr4_change dchange; 5218 int32_t *ptr; 5219 int argoplist_size = 7 * sizeof (nfs_argop4); 5220 nfs_argop4 *argop; 5221 int doqueue; 5222 mntinfo4_t *mi; 5223 nfs4_recov_state_t recov_state; 5224 hrtime_t t; 5225 int isdotdot; 5226 vnode_t *nvp; 5227 nfs_fh4 *fhp; 5228 nfs4_sharedfh_t *sfhp; 5229 nfs4_access_type_t cacc; 5230 rnode4_t *nrp; 5231 rnode4_t *drp = VTOR4(dvp); 5232 nfs4_ga_res_t *garp = NULL; 5233 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5234 5235 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5236 ASSERT(nm != NULL); 5237 ASSERT(nm[0] != '\0'); 5238 ASSERT(dvp->v_type == VDIR); 5239 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5240 ASSERT(*vpp != NULL); 5241 5242 if (nm[0] == '.' && nm[1] == '.'
&& nm[2] == '\0') { 5243 isdotdot = 1; 5244 args.ctag = TAG_LOOKUP_VPARENT; 5245 } else { 5246 /* 5247 * If dvp were a stub, it should have triggered and caused 5248 * a mount for us to get this far. 5249 */ 5250 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5251 5252 isdotdot = 0; 5253 args.ctag = TAG_LOOKUP_VALID; 5254 } 5255 5256 mi = VTOMI4(dvp); 5257 recov_state.rs_flags = 0; 5258 recov_state.rs_num_retry_despite_err = 0; 5259 5260 nvp = NULL; 5261 5262 /* Save the original mount point security information */ 5263 (void) save_mnt_secinfo(mi->mi_curr_serv); 5264 5265 recov_retry: 5266 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5267 &recov_state, NULL); 5268 if (e.error) { 5269 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5270 VN_RELE(*vpp); 5271 *vpp = NULL; 5272 return (e.error); 5273 } 5274 5275 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5276 5277 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5278 args.array_len = 7; 5279 args.array = argop; 5280 5281 /* 0. putfh file */ 5282 argop[0].argop = OP_CPUTFH; 5283 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5284 5285 /* 1. nverify the change info */ 5286 argop[1].argop = OP_NVERIFY; 5287 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5288 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5289 ver_fattr->attrlist4 = (char *)&dchange; 5290 ptr = (int32_t *)&dchange; 5291 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5292 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5293 5294 /* 2. getattr directory */ 5295 argop[2].argop = OP_GETATTR; 5296 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5297 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5298 5299 /* 3. access directory */ 5300 argop[3].argop = OP_ACCESS; 5301 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5302 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5303 5304 /* 4. lookup name */ 5305 if (isdotdot) { 5306 argop[4].argop = OP_LOOKUPP; 5307 } else { 5308 argop[4].argop = OP_CLOOKUP; 5309 argop[4].nfs_argop4_u.opclookup.cname = nm; 5310 } 5311 5312 /* 5. resulting file handle */ 5313 argop[5].argop = OP_GETFH; 5314 5315 /* 6. resulting file attributes */ 5316 argop[6].argop = OP_GETATTR; 5317 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5318 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5319 5320 doqueue = 1; 5321 t = gethrtime(); 5322 5323 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5324 5325 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5326 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5327 if (e.error != 0 && *vpp != NULL) 5328 VN_RELE(*vpp); 5329 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5330 &recov_state, FALSE); 5331 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5332 kmem_free(argop, argoplist_size); 5333 return (e.error); 5334 } 5335 5336 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5337 /* 5338 * For WRONGSEC of a non-dotdot case, send secinfo directly 5339 * from this thread, do not go thru the recovery thread since 5340 * we need the nm information. 5341 * 5342 * Not doing dotdot case because there is no specification 5343 * for (PUTFH, SECINFO "..") yet. 
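 * (SECINFO takes a component name relative to the directory
 * filehandle, so there is nothing to name for the parent case;
 * for the non-dotdot case we fetch fresh security flavors with
 * nfs4_secinfo_vnode_otw() and then re-drive the whole compound.)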
5344 */ 5345 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5346 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5347 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5348 &recov_state, FALSE); 5349 else 5350 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5351 &recov_state, TRUE); 5352 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5353 kmem_free(argop, argoplist_size); 5354 if (!e.error) 5355 goto recov_retry; 5356 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5357 VN_RELE(*vpp); 5358 *vpp = NULL; 5359 return (e.error); 5360 } 5361 5362 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5363 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5364 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5365 &recov_state, TRUE); 5366 5367 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5368 kmem_free(argop, argoplist_size); 5369 goto recov_retry; 5370 } 5371 } 5372 5373 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5374 5375 if (e.error || res.array_len == 0) { 5376 /* 5377 * If e.error isn't set, then reply has no ops (or we couldn't 5378 * be here). The only legal way to reply without an op array 5379 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5380 * be in the reply for all other status values. 5381 * 5382 * For valid replies without an ops array, return ENOTSUP 5383 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5384 * return EIO -- don't trust status. 5385 */ 5386 if (e.error == 0) 5387 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5388 ENOTSUP : EIO; 5389 VN_RELE(*vpp); 5390 *vpp = NULL; 5391 kmem_free(argop, argoplist_size); 5392 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5393 return (e.error); 5394 } 5395 5396 if (res.status != NFS4ERR_SAME) { 5397 e.error = geterrno4(res.status); 5398 5399 /* 5400 * The NVERIFY "failed" so the directory has changed. 5401 * First make sure PUTFH succeeded and NVERIFY "failed" 5402 * cleanly.
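 *
 * For reference, the result array examined below parallels the
 * request (sketch):
 *	[0] PUTFH  [1] NVERIFY  [2] GETATTR (dir)  [3] ACCESS (dir)
 *	[4] LOOKUP  [5] GETFH  [6] GETATTR (new object)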
5403 */ 5404 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5405 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5406 nfs4_purge_stale_fh(e.error, dvp, cr); 5407 VN_RELE(*vpp); 5408 *vpp = NULL; 5409 goto exit; 5410 } 5411 5412 /* 5413 * We know the NVERIFY "failed" so we must: 5414 * purge the caches (access and indirectly dnlc if needed) 5415 */ 5416 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5417 5418 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5419 nfs4_purge_stale_fh(e.error, dvp, cr); 5420 VN_RELE(*vpp); 5421 *vpp = NULL; 5422 goto exit; 5423 } 5424 5425 /* 5426 * Install new cached attributes for the directory 5427 */ 5428 nfs4_attr_cache(dvp, 5429 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5430 t, cr, FALSE, NULL); 5431 5432 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5433 nfs4_purge_stale_fh(e.error, dvp, cr); 5434 VN_RELE(*vpp); 5435 *vpp = NULL; 5436 e.error = geterrno4(res.status); 5437 goto exit; 5438 } 5439 5440 /* 5441 * Now we know the directory is valid, 5442 * cache new directory access 5443 */ 5444 nfs4_access_cache(drp, 5445 args.array[3].nfs_argop4_u.opaccess.access, 5446 res.array[3].nfs_resop4_u.opaccess.access, cr); 5447 5448 /* 5449 * recheck VEXEC access 5450 */ 5451 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5452 if (cacc != NFS4_ACCESS_ALLOWED) { 5453 /* 5454 * Directory permissions might have been revoked 5455 */ 5456 if (cacc == NFS4_ACCESS_DENIED) { 5457 e.error = EACCES; 5458 VN_RELE(*vpp); 5459 *vpp = NULL; 5460 goto exit; 5461 } 5462 5463 /* 5464 * Somehow we must not have asked for enough, 5465 * so try a singleton ACCESS; this should never happen. 5466 */ 5467 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5468 if (e.error) { 5469 VN_RELE(*vpp); 5470 *vpp = NULL; 5471 goto exit; 5472 } 5473 } 5474 5475 e.error = geterrno4(res.status); 5476 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5477 /* 5478 * The lookup failed, probably no entry 5479 */ 5480 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5481 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5482 } else { 5483 /* 5484 * Might be some other error, so remove 5485 * the dnlc entry to make sure we start all 5486 * over again, next time. 5487 */ 5488 dnlc_remove(dvp, nm); 5489 } 5490 VN_RELE(*vpp); 5491 *vpp = NULL; 5492 goto exit; 5493 } 5494 5495 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5496 /* 5497 * The file exists but we can't get its fh for 5498 * some unknown reason. Remove it from the dnlc 5499 * and error out to be safe. 5500 */ 5501 dnlc_remove(dvp, nm); 5502 VN_RELE(*vpp); 5503 *vpp = NULL; 5504 goto exit; 5505 } 5506 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5507 if (fhp->nfs_fh4_len == 0) { 5508 /* 5509 * The file exists but has a bogus fh for 5510 * some unknown reason. Remove it from the dnlc 5511 * and error out to be safe. 5512 */ 5513 e.error = ENOENT; 5514 dnlc_remove(dvp, nm); 5515 VN_RELE(*vpp); 5516 *vpp = NULL; 5517 goto exit; 5518 } 5519 sfhp = sfh4_get(fhp, mi); 5520 5521 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5522 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5523 5524 /* 5525 * Make the new rnode 5526 */ 5527 if (isdotdot) { 5528 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5529 if (e.error) { 5530 sfh4_rele(&sfhp); 5531 VN_RELE(*vpp); 5532 *vpp = NULL; 5533 goto exit; 5534 } 5535 /* 5536 * XXX if nfs4_make_dotdot uses an existing rnode 5537 * XXX it doesn't update the attributes.
5538 * XXX for now just save them again to save an OTW 5539 */ 5540 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5541 } else { 5542 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5543 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5544 /* 5545 * If v_type == VNON, then garp was NULL because 5546 * the last op in the compound failed and makenfs4node 5547 * could not find the vnode for sfhp. It created 5548 * a new vnode, so we have nothing to purge here. 5549 */ 5550 if (nvp->v_type == VNON) { 5551 vattr_t vattr; 5552 5553 vattr.va_mask = AT_TYPE; 5554 /* 5555 * N.B. We've already called nfs4_end_fop above. 5556 */ 5557 e.error = nfs4getattr(nvp, &vattr, cr); 5558 if (e.error) { 5559 sfh4_rele(&sfhp); 5560 VN_RELE(*vpp); 5561 *vpp = NULL; 5562 VN_RELE(nvp); 5563 goto exit; 5564 } 5565 nvp->v_type = vattr.va_type; 5566 } 5567 } 5568 sfh4_rele(&sfhp); 5569 5570 nrp = VTOR4(nvp); 5571 mutex_enter(&nrp->r_statev4_lock); 5572 if (!nrp->created_v4) { 5573 mutex_exit(&nrp->r_statev4_lock); 5574 dnlc_update(dvp, nm, nvp); 5575 } else 5576 mutex_exit(&nrp->r_statev4_lock); 5577 5578 VN_RELE(*vpp); 5579 *vpp = nvp; 5580 } else { 5581 hrtime_t now; 5582 hrtime_t delta = 0; 5583 5584 e.error = 0; 5585 5586 /* 5587 * Because the NVERIFY "succeeded" we know that the 5588 * directory attributes are still valid 5589 * so update r_time_attr_inval 5590 */ 5591 now = gethrtime(); 5592 mutex_enter(&drp->r_statelock); 5593 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5594 delta = now - drp->r_time_attr_saved; 5595 if (delta < mi->mi_acdirmin) 5596 delta = mi->mi_acdirmin; 5597 else if (delta > mi->mi_acdirmax) 5598 delta = mi->mi_acdirmax; 5599 } 5600 drp->r_time_attr_inval = now + delta; 5601 mutex_exit(&drp->r_statelock); 5602 dnlc_update(dvp, nm, *vpp); 5603 5604 /* 5605 * Even though we have a valid directory attr cache 5606 * and dnlc entry, we may not have access. 5607 * This should almost always hit the cache. 5608 */ 5609 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5610 if (e.error) { 5611 VN_RELE(*vpp); 5612 *vpp = NULL; 5613 } 5614 5615 if (*vpp == DNLC_NO_VNODE) { 5616 VN_RELE(*vpp); 5617 *vpp = NULL; 5618 e.error = ENOENT; 5619 } 5620 } 5621 5622 exit: 5623 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5624 kmem_free(argop, argoplist_size); 5625 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5626 return (e.error); 5627 } 5628 5629 /* 5630 * We need to go over the wire to lookup the name, but 5631 * while we are there verify the directory has not 5632 * changed; if it has, get new attributes and check access. 5633 * 5634 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5635 * NVERIFY GETATTR ACCESS 5636 * 5637 * With the results: 5638 * if the NVERIFY failed we must purge the caches, add new attributes, 5639 * and cache new access.
5640 * set a new r_time_attr_inval 5641 * add name to dnlc, possibly negative 5642 * if LOOKUP succeeded 5643 * cache new attributes 5644 */ 5645 static int 5646 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5647 { 5648 COMPOUND4args_clnt args; 5649 COMPOUND4res_clnt res; 5650 fattr4 *ver_fattr; 5651 fattr4_change dchange; 5652 int32_t *ptr; 5653 nfs4_ga_res_t *garp = NULL; 5654 int argoplist_size = 9 * sizeof (nfs_argop4); 5655 nfs_argop4 *argop; 5656 int doqueue; 5657 mntinfo4_t *mi; 5658 nfs4_recov_state_t recov_state; 5659 hrtime_t t; 5660 int isdotdot; 5661 vnode_t *nvp; 5662 nfs_fh4 *fhp; 5663 nfs4_sharedfh_t *sfhp; 5664 nfs4_access_type_t cacc; 5665 rnode4_t *nrp; 5666 rnode4_t *drp = VTOR4(dvp); 5667 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5668 5669 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5670 ASSERT(nm != NULL); 5671 ASSERT(nm[0] != '\0'); 5672 ASSERT(dvp->v_type == VDIR); 5673 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5674 ASSERT(*vpp == NULL); 5675 5676 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5677 isdotdot = 1; 5678 args.ctag = TAG_LOOKUP_PARENT; 5679 } else { 5680 /* 5681 * If dvp were a stub, it should have triggered and caused 5682 * a mount for us to get this far. 5683 */ 5684 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5685 5686 isdotdot = 0; 5687 args.ctag = TAG_LOOKUP; 5688 } 5689 5690 mi = VTOMI4(dvp); 5691 recov_state.rs_flags = 0; 5692 recov_state.rs_num_retry_despite_err = 0; 5693 5694 nvp = NULL; 5695 5696 /* Save the original mount point security information */ 5697 (void) save_mnt_secinfo(mi->mi_curr_serv); 5698 5699 recov_retry: 5700 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5701 &recov_state, NULL); 5702 if (e.error) { 5703 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5704 return (e.error); 5705 } 5706 5707 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5708 5709 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5710 args.array_len = 9; 5711 args.array = argop; 5712 5713 /* 0. putfh file */ 5714 argop[0].argop = OP_CPUTFH; 5715 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5716 5717 /* 1. savefh for the nverify */ 5718 argop[1].argop = OP_SAVEFH; 5719 5720 /* 2. lookup name */ 5721 if (isdotdot) { 5722 argop[2].argop = OP_LOOKUPP; 5723 } else { 5724 argop[2].argop = OP_CLOOKUP; 5725 argop[2].nfs_argop4_u.opclookup.cname = nm; 5726 } 5727 5728 /* 3. resulting file handle */ 5729 argop[3].argop = OP_GETFH; 5730 5731 /* 4. resulting file attributes */ 5732 argop[4].argop = OP_GETATTR; 5733 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5734 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5735 5736 /* 5. restorefh back the directory for the nverify */ 5737 argop[5].argop = OP_RESTOREFH; 5738 5739 /* 6. nverify the change info */ 5740 argop[6].argop = OP_NVERIFY; 5741 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5742 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5743 ver_fattr->attrlist4 = (char *)&dchange; 5744 ptr = (int32_t *)&dchange; 5745 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5746 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5747 5748 /* 7. getattr directory */ 5749 argop[7].argop = OP_GETATTR; 5750 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5751 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5752 5753 /* 8. 
access directory */ 5754 argop[8].argop = OP_ACCESS; 5755 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5756 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5757 5758 doqueue = 1; 5759 t = gethrtime(); 5760 5761 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5762 5763 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5764 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5765 if (e.error != 0 && *vpp != NULL) 5766 VN_RELE(*vpp); 5767 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5768 &recov_state, FALSE); 5769 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5770 kmem_free(argop, argoplist_size); 5771 return (e.error); 5772 } 5773 5774 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5775 /* 5776 * For WRONGSEC of a non-dotdot case, send secinfo directly 5777 * from this thread, do not go thru the recovery thread since 5778 * we need the nm information. 5779 * 5780 * Not doing dotdot case because there is no specification 5781 * for (PUTFH, SECINFO "..") yet. 5782 */ 5783 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5784 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5785 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5786 &recov_state, FALSE); 5787 else 5788 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5789 &recov_state, TRUE); 5790 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5791 kmem_free(argop, argoplist_size); 5792 if (!e.error) 5793 goto recov_retry; 5794 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5795 return (e.error); 5796 } 5797 5798 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5799 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5800 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5801 &recov_state, TRUE); 5802 5803 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5804 kmem_free(argop, argoplist_size); 5805 goto recov_retry; 5806 } 5807 } 5808 5809 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5810 5811 if (e.error || res.array_len == 0) { 5812 /* 5813 * If e.error isn't set, then reply has no ops (or we couldn't 5814 * be here). The only legal way to reply without an op array 5815 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5816 * be in the reply for all other status values. 5817 * 5818 * For valid replies without an ops array, return ENOTSUP 5819 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5820 * return EIO -- don't trust status. 5821 */ 5822 if (e.error == 0) 5823 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5824 ENOTSUP : EIO; 5825 5826 kmem_free(argop, argoplist_size); 5827 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5828 return (e.error); 5829 } 5830 5831 e.error = geterrno4(res.status); 5832 5833 /* 5834 * The PUTFH and SAVEFH may have failed. 5835 */ 5836 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5837 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5838 nfs4_purge_stale_fh(e.error, dvp, cr); 5839 goto exit; 5840 } 5841 5842 /* 5843 * Check if the file exists, if it does delay entering 5844 * into the dnlc until after we update the directory 5845 * attributes so we don't cause it to get purged immediately. 5846 */ 5847 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5848 /* 5849 * The lookup failed, probably no entry 5850 */ 5851 if (e.error == ENOENT && nfs4_lookup_neg_cache) 5852 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5853 goto exit; 5854 } 5855 5856 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5857 /* 5858 * The file exists but we can't get its fh for 5859 * some unknown reason. Error out to be safe. 
5860 	 */
5861 		goto exit;
5862 	}
5863 
5864 	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
5865 	if (fhp->nfs_fh4_len == 0) {
5866 		/*
5867 		 * The file exists but with a bogus fh for
5868 		 * some unknown reason.  Error out to be safe.
5869 		 */
5870 		e.error = EIO;
5871 		goto exit;
5872 	}
5873 	sfhp = sfh4_get(fhp, mi);
5874 
5875 	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5876 		sfh4_rele(&sfhp);
5877 		goto exit;
5878 	}
5879 	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
5880 
5881 	/*
5882 	 * The RESTOREFH may have failed
5883 	 */
5884 	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
5885 		sfh4_rele(&sfhp);
5886 		e.error = EIO;
5887 		goto exit;
5888 	}
5889 
5890 	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
5891 		/*
5892 		 * First make sure the NVERIFY failed as we expected;
5893 		 * if it didn't, then be conservative and error out,
5894 		 * as we can't trust the directory.
5895 		 */
5896 		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
5897 			sfh4_rele(&sfhp);
5898 			e.error = EIO;
5899 			goto exit;
5900 		}
5901 
5902 		/*
5903 		 * We know the NVERIFY "failed" so the directory has changed,
5904 		 * so we must:
5905 		 *	purge the caches (access and indirectly dnlc if needed)
5906 		 */
5907 		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5908 
5909 		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5910 			sfh4_rele(&sfhp);
5911 			goto exit;
5912 		}
5913 		nfs4_attr_cache(dvp,
5914 		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
5915 		    t, cr, FALSE, NULL);
5916 
5917 		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
5918 			nfs4_purge_stale_fh(e.error, dvp, cr);
5919 			sfh4_rele(&sfhp);
5920 			e.error = geterrno4(res.status);
5921 			goto exit;
5922 		}
5923 
5924 		/*
5925 		 * Now we know the directory is valid,
5926 		 * cache new directory access
5927 		 */
5928 		nfs4_access_cache(drp,
5929 		    args.array[8].nfs_argop4_u.opaccess.access,
5930 		    res.array[8].nfs_resop4_u.opaccess.access, cr);
5931 
5932 		/*
5933 		 * recheck VEXEC access
5934 		 */
5935 		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5936 		if (cacc != NFS4_ACCESS_ALLOWED) {
5937 			/*
5938 			 * Directory permissions might have been revoked
5939 			 */
5940 			if (cacc == NFS4_ACCESS_DENIED) {
5941 				sfh4_rele(&sfhp);
5942 				e.error = EACCES;
5943 				goto exit;
5944 			}
5945 
5946 			/*
5947 			 * Somehow we must not have asked for enough,
5948 			 * so try a singleton ACCESS; this should never happen.
5949 			 */
5950 			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5951 			if (e.error) {
5952 				sfh4_rele(&sfhp);
5953 				goto exit;
5954 			}
5955 		}
5956 
5957 		e.error = geterrno4(res.status);
5958 	} else {
5959 		hrtime_t now;
5960 		hrtime_t delta = 0;
5961 
5962 		e.error = 0;
5963 
5964 		/*
5965 		 * Because the NVERIFY "succeeded" we know that the
5966 		 * directory attributes are still valid,
5967 		 * so update r_time_attr_inval
5968 		 */
5969 		now = gethrtime();
5970 		mutex_enter(&drp->r_statelock);
5971 		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5972 			delta = now - drp->r_time_attr_saved;
5973 			if (delta < mi->mi_acdirmin)
5974 				delta = mi->mi_acdirmin;
5975 			else if (delta > mi->mi_acdirmax)
5976 				delta = mi->mi_acdirmax;
5977 		}
5978 		drp->r_time_attr_inval = now + delta;
5979 		mutex_exit(&drp->r_statelock);
5980 
5981 		/*
5982 		 * Even though we have a valid directory attr cache,
5983 		 * we may not have access.
5984 		 * This should almost always hit the cache.
5985 		 */
5986 		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5987 		if (e.error) {
5988 			sfh4_rele(&sfhp);
5989 			goto exit;
5990 		}
5991 	}
5992 
5993 	/*
5994 	 * Now we have successfully completed the lookup; if the
5995 	 * directory has changed we now have the valid attributes.
5996 	 * We also know we have directory access.
5997 	 * Create the new rnode and insert it in the dnlc.
5998 	 */
5999 	if (isdotdot) {
6000 		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
6001 		if (e.error) {
6002 			sfh4_rele(&sfhp);
6003 			goto exit;
6004 		}
6005 		/*
6006 		 * XXX if nfs4_make_dotdot uses an existing rnode
6007 		 * XXX it doesn't update the attributes.
6008 		 * XXX for now just save them again to save an OTW
6009 		 */
6010 		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
6011 	} else {
6012 		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
6013 		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
6014 	}
6015 	sfh4_rele(&sfhp);
6016 
6017 	nrp = VTOR4(nvp);
6018 	mutex_enter(&nrp->r_statev4_lock);
6019 	if (!nrp->created_v4) {
6020 		mutex_exit(&nrp->r_statev4_lock);
6021 		dnlc_update(dvp, nm, nvp);
6022 	} else
6023 		mutex_exit(&nrp->r_statev4_lock);
6024 
6025 	*vpp = nvp;
6026 
6027 exit:
6028 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6029 	kmem_free(argop, argoplist_size);
6030 	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
6031 	return (e.error);
6032 }
6033 
6034 #ifdef DEBUG
6035 void
6036 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
6037 {
6038 	uint_t i, len;
6039 	zoneid_t zoneid = getzoneid();
6040 	char *s;
6041 
6042 	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
6043 	for (i = 0; i < argcnt; i++) {
6044 		nfs_argop4 *op = &argbase[i];
6045 		switch (op->argop) {
6046 		case OP_CPUTFH:
6047 		case OP_PUTFH:
6048 			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
6049 			break;
6050 		case OP_PUTROOTFH:
6051 			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
6052 			break;
6053 		case OP_CLOOKUP:
6054 			s = op->nfs_argop4_u.opclookup.cname;
6055 			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6056 			break;
6057 		case OP_LOOKUP:
6058 			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
6059 			    &len, NULL);
6060 			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6061 			kmem_free(s, len);
6062 			break;
6063 		case OP_LOOKUPP:
6064 			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
6065 			break;
6066 		case OP_GETFH:
6067 			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
6068 			break;
6069 		case OP_GETATTR:
6070 			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
6071 			break;
6072 		case OP_OPENATTR:
6073 			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
6074 			break;
6075 		default:
6076 			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
6077 			    op->argop);
6078 			break;
6079 		}
6080 	}
6081 }
6082 #endif
6083 
6084 /*
6085  * nfs4lookup_setup - constructs a multi-lookup compound request.
6086  *
6087  * Given the path "nm1/nm2/.../nmn", the following compound requests
6088  * may be created:
6089  *
6090  * Note: Getfh should not be needed because the filehandle attr is
6091  * mandatory, but it is faster, for now.
6092  *
6093  * l4_getattrs indicates the type of compound requested.
6094  *
6095  * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
6096  *
6097  *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6098  *
6099  *	total number of ops is n + 1.
6100  *
6101  * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6102  * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6103  * before the last component, and only get attributes
6104  * for the last component.  Note that the second-to-last
6105  * pathname component is XATTR_RPATH, which does NOT go
6106  * over-the-wire as a lookup.
6107  *
6108  *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6109  *	Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6110  *
6111  *	and total number of ops is n + 5.
6112  *
6113  * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6114  * attribute directory: create lookups plus an OPENATTR
6115  * replacing the last lookup.  Note that the last pathname
6116  * component is XATTR_RPATH, which does NOT go over-the-wire
6117  * as a lookup.
6118  *
6119  *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6120  *	Openattr; Getfh; Getattr }
6121  *
6122  *	and total number of ops is n + 5.
6123  *
6124  * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6125  * nodes too.
6126  *
6127  *	compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6128  *	Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6129  *
6130  *	and total number of ops is 3*n + 1.
6131  *
6132  * All cases: returns the index in the arg array of the final LOOKUP op, or
6133  * -1 if no LOOKUPs were used.
6134  */
6135 int
6136 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
6137 {
6138 	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
6139 	nfs_argop4 *argbase, *argop;
6140 	int arglen, argcnt;
6141 	int n = 1;	/* number of components */
6142 	int nga = 1;	/* number of Getattr's in request */
6143 	char c = '\0', *s, *p;
6144 	int lookup_idx = -1;
6145 	int argoplist_size;
6146 
6147 	/* set lookuparg response result to 0 */
6148 	lookupargp->resp->status = NFS4_OK;
6149 
6150 	/* skip any leading "/" or "." components, e.g. ".//./" */
6151 	for (; ; nm++) {
6152 		if (*nm != '/' && *nm != '.')
6153 			break;
6154 
6155 		/* ".." is counted as 1 component */
6156 		if (*nm == '.' && *(nm + 1) != '/')
6157 			break;
6158 	}
6159 
6160 	/*
6161 	 * Find n = number of components - nm must be null terminated
6162 	 * Skip "." components.
6163 	 */
6164 	if (*nm != '\0')
6165 		for (n = 1, s = nm; *s != '\0'; s++) {
6166 			if ((*s == '/') && (*(s + 1) != '/') &&
6167 			    (*(s + 1) != '\0') &&
6168 			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6169 			    *(s + 2) == '\0')))
6170 				n++;
6171 		}
6172 	else
6173 		n = 0;
6174 
6175 	/*
6176 	 * nga is number of components that need Getfh+Getattr
6177 	 */
6178 	switch (l4_getattrs) {
6179 	case LKP4_NO_ATTRIBUTES:
6180 		nga = 0;
6181 		break;
6182 	case LKP4_ALL_ATTRIBUTES:
6183 		nga = n;
6184 		/*
6185 		 * Always have at least 1 getfh, getattr pair
6186 		 */
6187 		if (nga == 0)
6188 			nga++;
6189 		break;
6190 	case LKP4_LAST_ATTRDIR:
6191 	case LKP4_LAST_NAMED_ATTR:
6192 		nga = n + 1;
6193 		break;
6194 	}
6195 
6196 	/*
6197 	 * If we change to use the filehandle attr instead of getfh,
6198 	 * the following line can be deleted.
6199 	 */
6200 	nga *= 2;
6201 
6202 	/*
6203 	 * calculate number of ops in request as
6204 	 * header + trailer + lookups + getattrs
6205 	 */
6206 	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6207 
6208 	argoplist_size = arglen * sizeof (nfs_argop4);
6209 	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6210 	lookupargp->argsp->array = argop;
6211 
6212 	argcnt = lookupargp->header_len;
6213 	argop += argcnt;
6214 
6215 	/*
6216 	 * loop and create a lookup op and possibly getattr/getfh for
6217 	 * each component.  Skip "." components.
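 *
 * For example (an illustrative sketch, not taken from the code): with
 * nm = "a/.//b" and LKP4_ALL_ATTRIBUTES, the loop below emits
 *
 *	Lookup {a}; Getfh; Getattr; Lookup {b}; Getfh; Getattr
 *
 * skipping the "." component and the empty component produced by "//".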
6218 */ 6219 for (s = nm; *s != '\0'; s = p) { 6220 /* 6221 * Set up a pathname struct for each component if needed 6222 */ 6223 while (*s == '/') 6224 s++; 6225 if (*s == '\0') 6226 break; 6227 6228 for (p = s; (*p != '/') && (*p != '\0'); p++) 6229 ; 6230 c = *p; 6231 *p = '\0'; 6232 6233 if (s[0] == '.' && s[1] == '\0') { 6234 *p = c; 6235 continue; 6236 } 6237 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6238 strcmp(s, XATTR_RPATH) == 0) { 6239 /* getfh XXX may not be needed in future */ 6240 argop->argop = OP_GETFH; 6241 argop++; 6242 argcnt++; 6243 6244 /* getattr */ 6245 argop->argop = OP_GETATTR; 6246 argop->nfs_argop4_u.opgetattr.attr_request = 6247 lookupargp->ga_bits; 6248 argop->nfs_argop4_u.opgetattr.mi = 6249 lookupargp->mi; 6250 argop++; 6251 argcnt++; 6252 6253 /* openattr */ 6254 argop->argop = OP_OPENATTR; 6255 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6256 strcmp(s, XATTR_RPATH) == 0) { 6257 /* openattr */ 6258 argop->argop = OP_OPENATTR; 6259 argop++; 6260 argcnt++; 6261 6262 /* getfh XXX may not be needed in future */ 6263 argop->argop = OP_GETFH; 6264 argop++; 6265 argcnt++; 6266 6267 /* getattr */ 6268 argop->argop = OP_GETATTR; 6269 argop->nfs_argop4_u.opgetattr.attr_request = 6270 lookupargp->ga_bits; 6271 argop->nfs_argop4_u.opgetattr.mi = 6272 lookupargp->mi; 6273 argop++; 6274 argcnt++; 6275 *p = c; 6276 continue; 6277 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') { 6278 /* lookupp */ 6279 argop->argop = OP_LOOKUPP; 6280 } else { 6281 /* lookup */ 6282 argop->argop = OP_LOOKUP; 6283 (void) str_to_utf8(s, 6284 &argop->nfs_argop4_u.oplookup.objname); 6285 } 6286 lookup_idx = argcnt; 6287 argop++; 6288 argcnt++; 6289 6290 *p = c; 6291 6292 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6293 /* getfh XXX may not be needed in future */ 6294 argop->argop = OP_GETFH; 6295 argop++; 6296 argcnt++; 6297 6298 /* getattr */ 6299 argop->argop = OP_GETATTR; 6300 argop->nfs_argop4_u.opgetattr.attr_request = 6301 lookupargp->ga_bits; 6302 argop->nfs_argop4_u.opgetattr.mi = 6303 lookupargp->mi; 6304 argop++; 6305 argcnt++; 6306 } 6307 } 6308 6309 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6310 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6311 if (needgetfh) { 6312 /* stick in a post-lookup getfh */ 6313 argop->argop = OP_GETFH; 6314 argcnt++; 6315 argop++; 6316 } 6317 /* post-lookup getattr */ 6318 argop->argop = OP_GETATTR; 6319 argop->nfs_argop4_u.opgetattr.attr_request = 6320 lookupargp->ga_bits; 6321 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6322 argcnt++; 6323 } 6324 argcnt += lookupargp->trailer_len; /* actual op count */ 6325 lookupargp->argsp->array_len = argcnt; 6326 lookupargp->arglen = arglen; 6327 6328 #ifdef DEBUG 6329 if (nfs4_client_lookup_debug) 6330 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6331 #endif 6332 6333 return (lookup_idx); 6334 } 6335 6336 static int 6337 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6338 { 6339 COMPOUND4args_clnt args; 6340 COMPOUND4res_clnt res; 6341 GETFH4res *gf_res = NULL; 6342 nfs_argop4 argop[4]; 6343 nfs_resop4 *resop = NULL; 6344 nfs4_sharedfh_t *sfhp; 6345 hrtime_t t; 6346 nfs4_error_t e; 6347 6348 rnode4_t *drp; 6349 int doqueue = 1; 6350 vnode_t *vp; 6351 int needrecov = 0; 6352 nfs4_recov_state_t recov_state; 6353 6354 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6355 6356 *avp = NULL; 6357 recov_state.rs_flags = 0; 6358 recov_state.rs_num_retry_despite_err = 0; 6359 6360 recov_retry: 6361 /* COMPOUND: putfh, openattr, getfh, getattr */ 6362 
args.array_len = 4;
6363 	args.array = argop;
6364 	args.ctag = TAG_OPENATTR;
6365 
6366 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6367 	if (e.error)
6368 		return (e.error);
6369 
6370 	drp = VTOR4(dvp);
6371 
6372 	/* putfh */
6373 	argop[0].argop = OP_CPUTFH;
6374 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6375 
6376 	/* openattr */
6377 	argop[1].argop = OP_OPENATTR;
6378 	argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6379 
6380 	/* getfh */
6381 	argop[2].argop = OP_GETFH;
6382 
6383 	/* getattr */
6384 	argop[3].argop = OP_GETATTR;
6385 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6386 	argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6387 
6388 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6389 	    "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6390 	    rnode4info(drp)));
6391 
6392 	t = gethrtime();
6393 
6394 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6395 
6396 	needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6397 	if (needrecov) {
6398 		bool_t abort;
6399 
6400 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6401 		    "nfs4openattr: initiating recovery\n"));
6402 
6403 		abort = nfs4_start_recovery(&e,
6404 		    VTOMI4(dvp), dvp, NULL, NULL, NULL,
6405 		    OP_OPENATTR, NULL, NULL, NULL);
6406 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6407 		if (!e.error) {
6408 			e.error = geterrno4(res.status);
6409 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6410 		}
6411 		if (abort == FALSE)
6412 			goto recov_retry;
6413 		return (e.error);
6414 	}
6415 
6416 	if (e.error) {
6417 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6418 		return (e.error);
6419 	}
6420 
6421 	if (res.status) {
6422 		/*
6423 		 * If the OTW error is NOTSUPP, then it should be
6424 		 * translated to EINVAL.  All Solaris file system
6425 		 * implementations return EINVAL to the syscall layer
6426 		 * when the attrdir cannot be created due to an
6427 		 * implementation restriction or noxattr mount option.
6428 */ 6429 if (res.status == NFS4ERR_NOTSUPP) { 6430 mutex_enter(&drp->r_statelock); 6431 if (drp->r_xattr_dir) 6432 VN_RELE(drp->r_xattr_dir); 6433 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6434 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6435 mutex_exit(&drp->r_statelock); 6436 6437 e.error = EINVAL; 6438 } else { 6439 e.error = geterrno4(res.status); 6440 } 6441 6442 if (e.error) { 6443 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6444 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6445 needrecov); 6446 return (e.error); 6447 } 6448 } 6449 6450 resop = &res.array[0]; /* putfh res */ 6451 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6452 6453 resop = &res.array[1]; /* openattr res */ 6454 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6455 6456 resop = &res.array[2]; /* getfh res */ 6457 gf_res = &resop->nfs_resop4_u.opgetfh; 6458 if (gf_res->object.nfs_fh4_len == 0) { 6459 *avp = NULL; 6460 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6461 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6462 return (ENOENT); 6463 } 6464 6465 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6466 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6467 dvp->v_vfsp, t, cr, dvp, 6468 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6469 sfh4_rele(&sfhp); 6470 6471 if (e.error) 6472 PURGE_ATTRCACHE4(vp); 6473 6474 mutex_enter(&vp->v_lock); 6475 vp->v_flag |= V_XATTRDIR; 6476 mutex_exit(&vp->v_lock); 6477 6478 *avp = vp; 6479 6480 mutex_enter(&drp->r_statelock); 6481 if (drp->r_xattr_dir) 6482 VN_RELE(drp->r_xattr_dir); 6483 VN_HOLD(vp); 6484 drp->r_xattr_dir = vp; 6485 6486 /* 6487 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6488 * NULL. xattrs could be created at any time, and we have no 6489 * way to update pc4_xattr_exists in the base object if/when 6490 * it happens. 6491 */ 6492 drp->r_pathconf.pc4_xattr_valid = 0; 6493 6494 mutex_exit(&drp->r_statelock); 6495 6496 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6497 6498 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6499 6500 return (0); 6501 } 6502 6503 /* ARGSUSED */ 6504 static int 6505 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6506 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6507 vsecattr_t *vsecp) 6508 { 6509 int error; 6510 vnode_t *vp = NULL; 6511 rnode4_t *rp; 6512 struct vattr vattr; 6513 rnode4_t *drp; 6514 vnode_t *tempvp; 6515 enum createmode4 createmode; 6516 bool_t must_trunc = FALSE; 6517 int truncating = 0; 6518 6519 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6520 return (EPERM); 6521 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6522 return (EINVAL); 6523 } 6524 6525 /* . and .. have special meaning in the protocol, reject them. */ 6526 6527 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6528 return (EISDIR); 6529 6530 drp = VTOR4(dvp); 6531 6532 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6533 return (EINTR); 6534 6535 top: 6536 /* 6537 * We make a copy of the attributes because the caller does not 6538 * expect us to change what va points to. 6539 */ 6540 vattr = *va; 6541 6542 /* 6543 * If the pathname is "", then dvp is the root vnode of 6544 * a remote file mounted over a local directory. 6545 * All that needs to be done is access 6546 * checking and truncation. Note that we avoid doing 6547 * open w/ create because the parent directory might 6548 * be in pseudo-fs and the open would fail. 
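 *
 * (Illustrative case, restating the situation above: with a single
 * remote file mounted over a local directory, an open(O_CREAT) of the
 * mount point itself arrives here with nm == "", and dvp already
 * names the remote file.)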
6549 */ 6550 if (*nm == '\0') { 6551 error = 0; 6552 VN_HOLD(dvp); 6553 vp = dvp; 6554 must_trunc = TRUE; 6555 } else { 6556 /* 6557 * We need to go over the wire, just to be sure whether the 6558 * file exists or not. Using the DNLC can be dangerous in 6559 * this case when making a decision regarding existence. 6560 */ 6561 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6562 } 6563 6564 if (exclusive) 6565 createmode = EXCLUSIVE4; 6566 else 6567 createmode = GUARDED4; 6568 6569 /* 6570 * error would be set if the file does not exist on the 6571 * server, so lets go create it. 6572 */ 6573 if (error) { 6574 goto create_otw; 6575 } 6576 6577 /* 6578 * File does exist on the server 6579 */ 6580 if (exclusive == EXCL) 6581 error = EEXIST; 6582 else if (vp->v_type == VDIR && (mode & VWRITE)) 6583 error = EISDIR; 6584 else { 6585 /* 6586 * If vnode is a device, create special vnode. 6587 */ 6588 if (ISVDEV(vp->v_type)) { 6589 tempvp = vp; 6590 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6591 VN_RELE(tempvp); 6592 } 6593 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6594 if ((vattr.va_mask & AT_SIZE) && 6595 vp->v_type == VREG) { 6596 rp = VTOR4(vp); 6597 /* 6598 * Check here for large file handled 6599 * by LF-unaware process (as 6600 * ufs_create() does) 6601 */ 6602 if (!(flags & FOFFMAX)) { 6603 mutex_enter(&rp->r_statelock); 6604 if (rp->r_size > MAXOFF32_T) 6605 error = EOVERFLOW; 6606 mutex_exit(&rp->r_statelock); 6607 } 6608 6609 /* if error is set then we need to return */ 6610 if (error) { 6611 nfs_rw_exit(&drp->r_rwlock); 6612 VN_RELE(vp); 6613 return (error); 6614 } 6615 6616 if (must_trunc) { 6617 vattr.va_mask = AT_SIZE; 6618 error = nfs4setattr(vp, &vattr, 0, cr, 6619 NULL); 6620 } else { 6621 /* 6622 * we know we have a regular file that already 6623 * exists and we may end up truncating the file 6624 * as a result of the open_otw, so flush out 6625 * any dirty pages for this file first. 6626 */ 6627 if (nfs4_has_pages(vp) && 6628 ((rp->r_flags & R4DIRTY) || 6629 rp->r_count > 0 || 6630 rp->r_mapcnt > 0)) { 6631 error = nfs4_putpage(vp, 6632 (offset_t)0, 0, 0, cr, ct); 6633 if (error && (error == ENOSPC || 6634 error == EDQUOT)) { 6635 mutex_enter( 6636 &rp->r_statelock); 6637 if (!rp->r_error) 6638 rp->r_error = 6639 error; 6640 mutex_exit( 6641 &rp->r_statelock); 6642 } 6643 } 6644 vattr.va_mask = (AT_SIZE | 6645 AT_TYPE | AT_MODE); 6646 vattr.va_type = VREG; 6647 createmode = UNCHECKED4; 6648 truncating = 1; 6649 goto create_otw; 6650 } 6651 } 6652 } 6653 } 6654 nfs_rw_exit(&drp->r_rwlock); 6655 if (error) { 6656 VN_RELE(vp); 6657 } else { 6658 vnode_t *tvp; 6659 rnode4_t *trp; 6660 tvp = vp; 6661 if (vp->v_type == VREG) { 6662 trp = VTOR4(vp); 6663 if (IS_SHADOW(vp, trp)) 6664 tvp = RTOV4(trp); 6665 } 6666 6667 if (must_trunc) { 6668 /* 6669 * existing file got truncated, notify. 6670 */ 6671 vnevent_create(tvp, ct); 6672 } 6673 6674 *vpp = vp; 6675 } 6676 return (error); 6677 6678 create_otw: 6679 dnlc_remove(dvp, nm); 6680 6681 ASSERT(vattr.va_mask & AT_TYPE); 6682 6683 /* 6684 * If not a regular file let nfs4mknod() handle it. 6685 */ 6686 if (vattr.va_type != VREG) { 6687 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6688 nfs_rw_exit(&drp->r_rwlock); 6689 return (error); 6690 } 6691 6692 /* 6693 * It _is_ a regular file. 
6694 	 */
6695 	ASSERT(vattr.va_mask & AT_MODE);
6696 	if (MANDMODE(vattr.va_mode)) {
6697 		nfs_rw_exit(&drp->r_rwlock);
6698 		return (EACCES);
6699 	}
6700 
6701 	/*
6702 	 * If this happens to be a mknod of a regular file, then flags will
6703 	 * have neither FREAD nor FWRITE.  However, we must set at least one
6704 	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
6705 	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6706 	 * set (based on openmode specified by app).
6707 	 */
6708 	if ((flags & (FREAD|FWRITE)) == 0)
6709 		flags |= (FREAD|FWRITE);
6710 
6711 	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6712 
6713 	if (vp != NULL) {
6714 		/* if create was successful, throw away the file's pages */
6715 		if (!error && (vattr.va_mask & AT_SIZE))
6716 			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6717 			    cr);
6718 		/* release the lookup hold */
6719 		VN_RELE(vp);
6720 		vp = NULL;
6721 	}
6722 
6723 	/*
6724 	 * validate that we opened a regular file. This handles a misbehaving
6725 	 * server that returns an incorrect FH.
6726 	 */
6727 	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6728 		error = EISDIR;
6729 		VN_RELE(*vpp);
6730 	}
6731 
6732 	/*
6733 	 * If this is not an exclusive create, then the CREATE
6734 	 * request will be made with the GUARDED mode set.  This
6735 	 * means that the server will return EEXIST if the file
6736 	 * exists.  The file could exist because of a retransmitted
6737 	 * request.  In this case, we recover by starting over and
6738 	 * checking to see whether the file exists.  This second
6739 	 * time through it should exist, so a CREATE request will
6740 	 * not be sent.
6741 	 *
6742 	 * This handles the problem of a dangling CREATE request
6743 	 * which contains attributes which indicate that the file
6744 	 * should be truncated.  This retransmitted request could
6745 	 * possibly truncate valid data in the file if not caught
6746 	 * by the duplicate request mechanism on the server or if
6747 	 * not caught by other means.  The scenario is:
6748 	 *
6749 	 * Client transmits CREATE request with size = 0
6750 	 * Client times out, retransmits request.
6751 	 * Response to the first request arrives from the server
6752 	 * and the client proceeds on.
6753 	 * Client writes data to the file.
6754 	 * The server now processes retransmitted CREATE request
6755 	 * and truncates file.
6756 	 *
6757 	 * The use of the GUARDED CREATE request prevents this from
6758 	 * happening because the retransmitted CREATE would fail
6759 	 * with EEXIST and would not truncate the file.
6760 	 */
6761 	if (error == EEXIST && exclusive == NONEXCL) {
6762 #ifdef DEBUG
6763 		nfs4_create_misses++;
6764 #endif
6765 		goto top;
6766 	}
6767 	nfs_rw_exit(&drp->r_rwlock);
6768 	if (truncating && !error && *vpp) {
6769 		vnode_t *tvp;
6770 		rnode4_t *trp;
6771 		/*
6772 		 * existing file got truncated, notify.
6773 		 */
6774 		tvp = *vpp;
6775 		trp = VTOR4(tvp);
6776 		if (IS_SHADOW(tvp, trp))
6777 			tvp = RTOV4(trp);
6778 		vnevent_create(tvp, ct);
6779 	}
6780 	return (error);
6781 }
6782 
6783 /*
6784  * Create compound (for mkdir, mknod, symlink):
6785  * { Putfh <dfh>; Create; Getfh; Getattr }
6786  * It's okay if setattr failed to set gid - this is not considered
6787  * an error, but purge attrs in that case.
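 *
 * Illustrative usage (mirroring the callers in this file): nfs4mknod()
 * below invokes this as
 *
 *	error = call_nfs4_create_req(dvp, nm, &spec, va, &vp, cr, NF4CHR);
 *
 * passing the device's specdata4 through "data" for NF4CHR/NF4BLK; for
 * NF4LNK the symlink target string is passed through "data" instead.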
6788  */
6789 static int
6790 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6791     vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6792 {
6793 	int need_end_op = FALSE;
6794 	COMPOUND4args_clnt args;
6795 	COMPOUND4res_clnt res, *resp = NULL;
6796 	nfs_argop4 *argop;
6797 	nfs_resop4 *resop;
6798 	int doqueue;
6799 	mntinfo4_t *mi;
6800 	rnode4_t *drp = VTOR4(dvp);
6801 	change_info4 *cinfo;
6802 	GETFH4res *gf_res;
6803 	struct vattr vattr;
6804 	vnode_t *vp;
6805 	fattr4 *crattr;
6806 	bool_t needrecov = FALSE;
6807 	nfs4_recov_state_t recov_state;
6808 	nfs4_sharedfh_t *sfhp = NULL;
6809 	hrtime_t t;
6810 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6811 	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6812 	dirattr_info_t dinfo, *dinfop;
6813 	servinfo4_t *svp;
6814 	bitmap4 supp_attrs;
6815 
6816 	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6817 	    type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6818 
6819 	mi = VTOMI4(dvp);
6820 
6821 	/*
6822 	 * Make sure we properly deal with setting the right gid
6823 	 * on a new directory to reflect the parent's setgid bit
6824 	 */
6825 	setgid_flag = 0;
6826 	if (type == NF4DIR) {
6827 		struct vattr dva;
6828 
6829 		va->va_mode &= ~VSGID;
6830 		dva.va_mask = AT_MODE | AT_GID;
6831 		if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6832 
6833 			/*
6834 			 * If the parent's directory has the setgid bit set
6835 			 * _and_ the client was able to get a valid mapping
6836 			 * for the parent dir's owner_group, we want to
6837 			 * append NVERIFY(owner_group == dva.va_gid) and
6838 			 * SETATTR to the CREATE compound.
6839 			 */
6840 			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6841 				setgid_flag = 1;
6842 				va->va_mode |= VSGID;
6843 				if (dva.va_gid != GID_NOBODY) {
6844 					va->va_mask |= AT_GID;
6845 					va->va_gid = dva.va_gid;
6846 				}
6847 			}
6848 		}
6849 	}
6850 
6851 	/*
6852 	 * Create ops:
6853 	 *	0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6854 	 *	5:restorefh(dir) 6:getattr(dir)
6855 	 *
6856 	 * if (setgid)
6857 	 *	0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6858 	 *	4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6859 	 *	8:nverify 9:setattr
6860 	 */
6861 	if (setgid_flag) {
6862 		numops = 10;
6863 		idx_create = 1;
6864 		idx_fattr = 3;
6865 	} else {
6866 		numops = 7;
6867 		idx_create = 2;
6868 		idx_fattr = 4;
6869 	}
6870 
6871 	ASSERT(nfs_zone() == mi->mi_zone);
6872 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6873 		return (EINTR);
6874 	}
6875 	recov_state.rs_flags = 0;
6876 	recov_state.rs_num_retry_despite_err = 0;
6877 
6878 	argoplist_size = numops * sizeof (nfs_argop4);
6879 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
6880 
6881 recov_retry:
6882 	if (type == NF4LNK)
6883 		args.ctag = TAG_SYMLINK;
6884 	else if (type == NF4DIR)
6885 		args.ctag = TAG_MKDIR;
6886 	else
6887 		args.ctag = TAG_MKNOD;
6888 
6889 	args.array_len = numops;
6890 	args.array = argop;
6891 
6892 	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6893 		nfs_rw_exit(&drp->r_rwlock);
6894 		kmem_free(argop, argoplist_size);
6895 		return (e.error);
6896 	}
6897 	need_end_op = TRUE;
6898 
6899 
6900 	/* 0: putfh directory */
6901 	argop[0].argop = OP_CPUTFH;
6902 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6903 
6904 	/* 1/2: Create object */
6905 	argop[idx_create].argop = OP_CCREATE;
6906 	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6907 	argop[idx_create].nfs_argop4_u.opccreate.type = type;
6908 	if (type == NF4LNK) {
6909 		/*
6910 		 * symlink, treat name as data
6911 		 */
6912 		ASSERT(data != NULL);
6913
argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6914 (char *)data; 6915 } 6916 if (type == NF4BLK || type == NF4CHR) { 6917 ASSERT(data != NULL); 6918 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6919 *((specdata4 *)data); 6920 } 6921 6922 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6923 6924 svp = drp->r_server; 6925 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6926 supp_attrs = svp->sv_supp_attrs; 6927 nfs_rw_exit(&svp->sv_lock); 6928 6929 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6930 nfs_rw_exit(&drp->r_rwlock); 6931 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6932 e.error = EINVAL; 6933 kmem_free(argop, argoplist_size); 6934 return (e.error); 6935 } 6936 6937 /* 2/3: getfh fh of created object */ 6938 ASSERT(idx_create + 1 == idx_fattr - 1); 6939 argop[idx_create + 1].argop = OP_GETFH; 6940 6941 /* 3/4: getattr of new object */ 6942 argop[idx_fattr].argop = OP_GETATTR; 6943 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6944 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6945 6946 if (setgid_flag) { 6947 vattr_t _v; 6948 6949 argop[4].argop = OP_SAVEFH; 6950 6951 argop[5].argop = OP_CPUTFH; 6952 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6953 6954 argop[6].argop = OP_GETATTR; 6955 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6956 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6957 6958 argop[7].argop = OP_RESTOREFH; 6959 6960 /* 6961 * nverify 6962 * 6963 * XXX - Revisit the last argument to nfs4_end_op() 6964 * once 5020486 is fixed. 6965 */ 6966 _v.va_mask = AT_GID; 6967 _v.va_gid = va->va_gid; 6968 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6969 supp_attrs)) { 6970 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6971 nfs_rw_exit(&drp->r_rwlock); 6972 nfs4_fattr4_free(crattr); 6973 kmem_free(argop, argoplist_size); 6974 return (e.error); 6975 } 6976 6977 /* 6978 * setattr 6979 * 6980 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6981 * so no need for stateid or flags. Also we specify NULL 6982 * rp since we're only interested in setting owner_group 6983 * attributes. 
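 *
 * Note (illustrative summary of the protocol behavior): NVERIFY
 * returns NFS4ERR_SAME when the attributes already match, which
 * stops the compound, so this trailing SETATTR only executes when
 * the new object's owner_group differs from the parent's gid.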
6984 */ 6985 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6986 &e.error, 0); 6987 6988 if (e.error) { 6989 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6990 nfs_rw_exit(&drp->r_rwlock); 6991 nfs4_fattr4_free(crattr); 6992 nfs4args_verify_free(&argop[8]); 6993 kmem_free(argop, argoplist_size); 6994 return (e.error); 6995 } 6996 } else { 6997 argop[1].argop = OP_SAVEFH; 6998 6999 argop[5].argop = OP_RESTOREFH; 7000 7001 argop[6].argop = OP_GETATTR; 7002 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7003 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7004 } 7005 7006 dnlc_remove(dvp, nm); 7007 7008 doqueue = 1; 7009 t = gethrtime(); 7010 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7011 7012 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7013 if (e.error) { 7014 PURGE_ATTRCACHE4(dvp); 7015 if (!needrecov) 7016 goto out; 7017 } 7018 7019 if (needrecov) { 7020 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 7021 OP_CREATE, NULL, NULL, NULL) == FALSE) { 7022 nfs4_end_op(mi, dvp, NULL, &recov_state, 7023 needrecov); 7024 need_end_op = FALSE; 7025 nfs4_fattr4_free(crattr); 7026 if (setgid_flag) { 7027 nfs4args_verify_free(&argop[8]); 7028 nfs4args_setattr_free(&argop[9]); 7029 } 7030 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7031 goto recov_retry; 7032 } 7033 } 7034 7035 resp = &res; 7036 7037 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 7038 7039 if (res.status == NFS4ERR_BADOWNER) 7040 nfs4_log_badowner(mi, OP_CREATE); 7041 7042 e.error = geterrno4(res.status); 7043 7044 /* 7045 * This check is left over from when create was implemented 7046 * using a setattr op (instead of createattrs). If the 7047 * putfh/create/getfh failed, the error was returned. If 7048 * setattr/getattr failed, we keep going. 7049 * 7050 * It might be better to get rid of the GETFH also, and just 7051 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 7052 * Then if any of the operations failed, we could return the 7053 * error now, and remove much of the error code below. 7054 */ 7055 if (res.array_len <= idx_fattr) { 7056 /* 7057 * Either Putfh, Create or Getfh failed. 7058 */ 7059 PURGE_ATTRCACHE4(dvp); 7060 /* 7061 * nfs4_purge_stale_fh() may generate otw calls through 7062 * nfs4_invalidate_pages. Hence the need to call 7063 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7064 */ 7065 nfs4_end_op(mi, dvp, NULL, &recov_state, 7066 needrecov); 7067 need_end_op = FALSE; 7068 nfs4_purge_stale_fh(e.error, dvp, cr); 7069 goto out; 7070 } 7071 } 7072 7073 resop = &res.array[idx_create]; /* create res */ 7074 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7075 7076 resop = &res.array[idx_create + 1]; /* getfh res */ 7077 gf_res = &resop->nfs_resop4_u.opgetfh; 7078 7079 sfhp = sfh4_get(&gf_res->object, mi); 7080 if (e.error) { 7081 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7082 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7083 if (vp->v_type == VNON) { 7084 vattr.va_mask = AT_TYPE; 7085 /* 7086 * Need to call nfs4_end_op before nfs4getattr to avoid 7087 * potential nfs4_start_op deadlock. See RFE 4777612. 
7088 */ 7089 nfs4_end_op(mi, dvp, NULL, &recov_state, 7090 needrecov); 7091 need_end_op = FALSE; 7092 e.error = nfs4getattr(vp, &vattr, cr); 7093 if (e.error) { 7094 VN_RELE(vp); 7095 *vpp = NULL; 7096 goto out; 7097 } 7098 vp->v_type = vattr.va_type; 7099 } 7100 e.error = 0; 7101 } else { 7102 *vpp = vp = makenfs4node(sfhp, 7103 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7104 dvp->v_vfsp, t, cr, 7105 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7106 } 7107 7108 /* 7109 * If compound succeeded, then update dir attrs 7110 */ 7111 if (res.status == NFS4_OK) { 7112 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7113 dinfo.di_cred = cr; 7114 dinfo.di_time_call = t; 7115 dinfop = &dinfo; 7116 } else 7117 dinfop = NULL; 7118 7119 /* Update directory cache attribute, readdir and dnlc caches */ 7120 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7121 7122 out: 7123 if (sfhp != NULL) 7124 sfh4_rele(&sfhp); 7125 nfs_rw_exit(&drp->r_rwlock); 7126 nfs4_fattr4_free(crattr); 7127 if (setgid_flag) { 7128 nfs4args_verify_free(&argop[8]); 7129 nfs4args_setattr_free(&argop[9]); 7130 } 7131 if (resp) 7132 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7133 if (need_end_op) 7134 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7135 7136 kmem_free(argop, argoplist_size); 7137 return (e.error); 7138 } 7139 7140 /* ARGSUSED */ 7141 static int 7142 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7143 int mode, vnode_t **vpp, cred_t *cr) 7144 { 7145 int error; 7146 vnode_t *vp; 7147 nfs_ftype4 type; 7148 specdata4 spec, *specp = NULL; 7149 7150 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7151 7152 switch (va->va_type) { 7153 case VCHR: 7154 case VBLK: 7155 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK; 7156 spec.specdata1 = getmajor(va->va_rdev); 7157 spec.specdata2 = getminor(va->va_rdev); 7158 specp = &spec; 7159 break; 7160 7161 case VFIFO: 7162 type = NF4FIFO; 7163 break; 7164 case VSOCK: 7165 type = NF4SOCK; 7166 break; 7167 7168 default: 7169 return (EINVAL); 7170 } 7171 7172 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7173 if (error) { 7174 return (error); 7175 } 7176 7177 /* 7178 * This might not be needed any more; special case to deal 7179 * with problematic v2/v3 servers. Since create was unable 7180 * to set group correctly, not sure what hope setattr has. 7181 */ 7182 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7183 va->va_mask = AT_GID; 7184 (void) nfs4setattr(vp, va, 0, cr, NULL); 7185 } 7186 7187 /* 7188 * If vnode is a device create special vnode 7189 */ 7190 if (ISVDEV(vp->v_type)) { 7191 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7192 VN_RELE(vp); 7193 } else { 7194 *vpp = vp; 7195 } 7196 return (error); 7197 } 7198 7199 /* 7200 * Remove requires that the current fh be the target directory. 7201 * After the operation, the current fh is unchanged. 7202 * The compound op structure is: 7203 * PUTFH(targetdir), REMOVE 7204 * 7205 * Weirdness: if the vnode to be removed is open 7206 * we rename it instead of removing it and nfs_inactive 7207 * will remove the new name. 
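 *
 * Illustrative sequence (a sketch of the code below): removing an
 * open file "foo" effectively does
 *
 *	tmpname = newname();	(a ".nfsXXXX" style name)
 *	nfs4rename(dvp, "foo", dvp, tmpname, cr, ct);
 *
 * and records tmpname in r_unlname so that nfs_inactive can remove
 * it once the last reference goes away.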
7208  */
7209 /* ARGSUSED */
7210 static int
7211 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7212 {
7213 	COMPOUND4args_clnt args;
7214 	COMPOUND4res_clnt res, *resp = NULL;
7215 	REMOVE4res *rm_res;
7216 	nfs_argop4 argop[3];
7217 	nfs_resop4 *resop;
7218 	vnode_t *vp;
7219 	char *tmpname;
7220 	int doqueue;
7221 	mntinfo4_t *mi;
7222 	rnode4_t *rp;
7223 	rnode4_t *drp;
7224 	int needrecov = 0;
7225 	nfs4_recov_state_t recov_state;
7226 	int isopen;
7227 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7228 	dirattr_info_t dinfo;
7229 
7230 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7231 		return (EPERM);
7232 	drp = VTOR4(dvp);
7233 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7234 		return (EINTR);
7235 
7236 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7237 	if (e.error) {
7238 		nfs_rw_exit(&drp->r_rwlock);
7239 		return (e.error);
7240 	}
7241 
7242 	if (vp->v_type == VDIR) {
7243 		VN_RELE(vp);
7244 		nfs_rw_exit(&drp->r_rwlock);
7245 		return (EISDIR);
7246 	}
7247 
7248 	/*
7249 	 * First just remove the entry from the name cache, as it
7250 	 * is most likely the only entry for this vp.
7251 	 */
7252 	dnlc_remove(dvp, nm);
7253 
7254 	rp = VTOR4(vp);
7255 
7256 	/*
7257 	 * For regular file types, check to see if the file is open by looking
7258 	 * at the open streams.
7259 	 * For all other types, check the reference count on the vnode.  Since
7260 	 * they are not opened OTW they never have an open stream.
7261 	 *
7262 	 * If the file is open, rename it to .nfsXXXX.
7263 	 */
7264 	if (vp->v_type != VREG) {
7265 		/*
7266 		 * If the file has a v_count > 1 then there may be more than
7267 		 * one entry in the name cache due to multiple links or an
7268 		 * open file, but we don't have the real reference count so
7269 		 * flush all possible entries.
7270 		 */
7271 		if (vp->v_count > 1)
7272 			dnlc_purge_vp(vp);
7273 
7274 		/*
7275 		 * Now we have the real reference count.
7276 		 */
7277 		isopen = vp->v_count > 1;
7278 	} else {
7279 		mutex_enter(&rp->r_os_lock);
7280 		isopen = list_head(&rp->r_open_streams) != NULL;
7281 		mutex_exit(&rp->r_os_lock);
7282 	}
7283 
7284 	mutex_enter(&rp->r_statelock);
7285 	if (isopen &&
7286 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7287 		mutex_exit(&rp->r_statelock);
7288 		tmpname = newname();
7289 		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7290 		if (e.error)
7291 			kmem_free(tmpname, MAXNAMELEN);
7292 		else {
7293 			mutex_enter(&rp->r_statelock);
7294 			if (rp->r_unldvp == NULL) {
7295 				VN_HOLD(dvp);
7296 				rp->r_unldvp = dvp;
7297 				if (rp->r_unlcred != NULL)
7298 					crfree(rp->r_unlcred);
7299 				crhold(cr);
7300 				rp->r_unlcred = cr;
7301 				rp->r_unlname = tmpname;
7302 			} else {
7303 				kmem_free(rp->r_unlname, MAXNAMELEN);
7304 				rp->r_unlname = tmpname;
7305 			}
7306 			mutex_exit(&rp->r_statelock);
7307 		}
7308 		VN_RELE(vp);
7309 		nfs_rw_exit(&drp->r_rwlock);
7310 		return (e.error);
7311 	}
7312 	/*
7313 	 * Actually remove the file/dir
7314 	 */
7315 	mutex_exit(&rp->r_statelock);
7316 
7317 	/*
7318 	 * We need to flush any dirty pages which happen to
7319 	 * be hanging around before removing the file.
7320 	 * This shouldn't happen very often since in NFSv4
7321 	 * we should be close to open consistent.
7322 */ 7323 if (nfs4_has_pages(vp) && 7324 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7325 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7326 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7327 mutex_enter(&rp->r_statelock); 7328 if (!rp->r_error) 7329 rp->r_error = e.error; 7330 mutex_exit(&rp->r_statelock); 7331 } 7332 } 7333 7334 mi = VTOMI4(dvp); 7335 7336 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7337 recov_state.rs_flags = 0; 7338 recov_state.rs_num_retry_despite_err = 0; 7339 7340 recov_retry: 7341 /* 7342 * Remove ops: putfh dir; remove 7343 */ 7344 args.ctag = TAG_REMOVE; 7345 args.array_len = 3; 7346 args.array = argop; 7347 7348 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7349 if (e.error) { 7350 nfs_rw_exit(&drp->r_rwlock); 7351 VN_RELE(vp); 7352 return (e.error); 7353 } 7354 7355 /* putfh directory */ 7356 argop[0].argop = OP_CPUTFH; 7357 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7358 7359 /* remove */ 7360 argop[1].argop = OP_CREMOVE; 7361 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7362 7363 /* getattr dir */ 7364 argop[2].argop = OP_GETATTR; 7365 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7366 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7367 7368 doqueue = 1; 7369 dinfo.di_time_call = gethrtime(); 7370 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7371 7372 PURGE_ATTRCACHE4(vp); 7373 7374 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7375 if (e.error) 7376 PURGE_ATTRCACHE4(dvp); 7377 7378 if (needrecov) { 7379 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7380 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 7381 if (!e.error) 7382 (void) xdr_free(xdr_COMPOUND4res_clnt, 7383 (caddr_t)&res); 7384 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7385 needrecov); 7386 goto recov_retry; 7387 } 7388 } 7389 7390 /* 7391 * Matching nfs4_end_op() for start_op() above. 7392 * There is a path in the code below which calls 7393 * nfs4_purge_stale_fh(), which may generate otw calls through 7394 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7395 * here to avoid nfs4_start_op() deadlock. 7396 */ 7397 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7398 7399 if (!e.error) { 7400 resp = &res; 7401 7402 if (res.status) { 7403 e.error = geterrno4(res.status); 7404 PURGE_ATTRCACHE4(dvp); 7405 nfs4_purge_stale_fh(e.error, dvp, cr); 7406 } else { 7407 resop = &res.array[1]; /* remove res */ 7408 rm_res = &resop->nfs_resop4_u.opremove; 7409 7410 dinfo.di_garp = 7411 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7412 dinfo.di_cred = cr; 7413 7414 /* Update directory attr, readdir and dnlc caches */ 7415 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7416 &dinfo); 7417 } 7418 } 7419 nfs_rw_exit(&drp->r_rwlock); 7420 if (resp) 7421 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7422 7423 if (e.error == 0) { 7424 vnode_t *tvp; 7425 rnode4_t *trp; 7426 trp = VTOR4(vp); 7427 tvp = vp; 7428 if (IS_SHADOW(vp, trp)) 7429 tvp = RTOV4(trp); 7430 vnevent_remove(tvp, dvp, nm, ct); 7431 } 7432 VN_RELE(vp); 7433 return (e.error); 7434 } 7435 7436 /* 7437 * Link requires that the current fh be the target directory and the 7438 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7439 * Thus the compound op structure is: 7440 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7441 * GETATTR(file) 7442 */ 7443 /* ARGSUSED */ 7444 static int 7445 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7446 caller_context_t *ct, int flags) 7447 { 7448 COMPOUND4args_clnt args; 7449 COMPOUND4res_clnt res, *resp = NULL; 7450 LINK4res *ln_res; 7451 int argoplist_size = 7 * sizeof (nfs_argop4); 7452 nfs_argop4 *argop; 7453 nfs_resop4 *resop; 7454 vnode_t *realvp, *nvp; 7455 int doqueue; 7456 mntinfo4_t *mi; 7457 rnode4_t *tdrp; 7458 bool_t needrecov = FALSE; 7459 nfs4_recov_state_t recov_state; 7460 hrtime_t t; 7461 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7462 dirattr_info_t dinfo; 7463 7464 ASSERT(*tnm != '\0'); 7465 ASSERT(tdvp->v_type == VDIR); 7466 ASSERT(nfs4_consistent_type(tdvp)); 7467 ASSERT(nfs4_consistent_type(svp)); 7468 7469 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7470 return (EPERM); 7471 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7472 svp = realvp; 7473 ASSERT(nfs4_consistent_type(svp)); 7474 } 7475 7476 tdrp = VTOR4(tdvp); 7477 mi = VTOMI4(svp); 7478 7479 if (!(mi->mi_flags & MI4_LINK)) { 7480 return (EOPNOTSUPP); 7481 } 7482 recov_state.rs_flags = 0; 7483 recov_state.rs_num_retry_despite_err = 0; 7484 7485 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7486 return (EINTR); 7487 7488 recov_retry: 7489 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7490 7491 args.ctag = TAG_LINK; 7492 7493 /* 7494 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7495 * restorefh; getattr(fl) 7496 */ 7497 args.array_len = 7; 7498 args.array = argop; 7499 7500 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7501 if (e.error) { 7502 kmem_free(argop, argoplist_size); 7503 nfs_rw_exit(&tdrp->r_rwlock); 7504 return (e.error); 7505 } 7506 7507 /* 0. putfh file */ 7508 argop[0].argop = OP_CPUTFH; 7509 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7510 7511 /* 1. save current fh to free up the space for the dir */ 7512 argop[1].argop = OP_SAVEFH; 7513 7514 /* 2. putfh targetdir */ 7515 argop[2].argop = OP_CPUTFH; 7516 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7517 7518 /* 3. link: current_fh is targetdir, saved_fh is source */ 7519 argop[3].argop = OP_CLINK; 7520 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7521 7522 /* 4. Get attributes of dir */ 7523 argop[4].argop = OP_GETATTR; 7524 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7525 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7526 7527 /* 5. If link was successful, restore current vp to file */ 7528 argop[5].argop = OP_RESTOREFH; 7529 7530 /* 6. 
Get attributes of linked object */
7531 	argop[6].argop = OP_GETATTR;
7532 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7533 	argop[6].nfs_argop4_u.opgetattr.mi = mi;
7534 
7535 	dnlc_remove(tdvp, tnm);
7536 
7537 	doqueue = 1;
7538 	t = gethrtime();
7539 
7540 	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7541 
7542 	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7543 	if (e.error != 0 && !needrecov) {
7544 		PURGE_ATTRCACHE4(tdvp);
7545 		PURGE_ATTRCACHE4(svp);
7546 		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7547 		goto out;
7548 	}
7549 
7550 	if (needrecov) {
7551 		bool_t abort;
7552 
7553 		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7554 		    NULL, NULL, OP_LINK, NULL, NULL, NULL);
7555 		if (abort == FALSE) {
7556 			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7557 			    needrecov);
7558 			kmem_free(argop, argoplist_size);
7559 			if (!e.error)
7560 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7561 				    (caddr_t)&res);
7562 			goto recov_retry;
7563 		} else {
7564 			if (e.error != 0) {
7565 				PURGE_ATTRCACHE4(tdvp);
7566 				PURGE_ATTRCACHE4(svp);
7567 				nfs4_end_op(VTOMI4(svp), svp, tdvp,
7568 				    &recov_state, needrecov);
7569 				goto out;
7570 			}
7571 			/* fall through for res.status case */
7572 		}
7573 	}
7574 
7575 	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7576 
7577 	resp = &res;
7578 	if (res.status) {
7579 		/* If link succeeded, then don't return error */
7580 		e.error = geterrno4(res.status);
7581 		if (res.array_len <= 4) {
7582 			/*
7583 			 * Either Putfh, Savefh, Putfh dir, or Link failed
7584 			 */
7585 			PURGE_ATTRCACHE4(svp);
7586 			PURGE_ATTRCACHE4(tdvp);
7587 			if (e.error == EOPNOTSUPP) {
7588 				mutex_enter(&mi->mi_lock);
7589 				mi->mi_flags &= ~MI4_LINK;
7590 				mutex_exit(&mi->mi_lock);
7591 			}
7592 			/* Remap EISDIR to EPERM for non-root user for SVVS */
7593 			/* XXX-LP */
7594 			if (e.error == EISDIR && crgetuid(cr) != 0)
7595 				e.error = EPERM;
7596 			goto out;
7597 		}
7598 	}
7599 
7600 	/* either no error or one of the postop getattr failed */
7601 
7602 	/*
7603 	 * XXX - if LINK succeeded, but no attrs were returned for link
7604 	 * file, purge its cache.
7605 	 *
7606 	 * XXX Perform a simplified version of wcc checking.  Instead of
7607 	 * having another getattr to get pre-op, just purge cache if
7608 	 * any of the ops prior to and including the getattr failed.
7609 	 * If the getattr succeeded then update the attrcache accordingly.
7610 	 */
7611 
7612 	/*
7613 	 * update cache with link file postattrs.
7614 	 * Note: at this point resop points to link res.
7615 	 */
7616 	resop = &res.array[3];	/* link res */
7617 	ln_res = &resop->nfs_resop4_u.oplink;
7618 	if (res.status == NFS4_OK)
7619 		e.error = nfs4_update_attrcache(res.status,
7620 		    &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7621 		    t, svp, cr);
7622 
7623 	/*
7624 	 * Call makenfs4node to create the new shadow vp for tnm.
7625 	 * We pass NULL attrs because we just cached attrs for
7626 	 * the src object.  All we're trying to accomplish is to
7627 	 * create the new shadow vnode.
7628 */ 7629 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7630 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7631 7632 /* Update target cache attribute, readdir and dnlc caches */ 7633 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7634 dinfo.di_time_call = t; 7635 dinfo.di_cred = cr; 7636 7637 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7638 ASSERT(nfs4_consistent_type(tdvp)); 7639 ASSERT(nfs4_consistent_type(svp)); 7640 ASSERT(nfs4_consistent_type(nvp)); 7641 VN_RELE(nvp); 7642 7643 if (!e.error) { 7644 vnode_t *tvp; 7645 rnode4_t *trp; 7646 /* 7647 * Notify the source file of this link operation. 7648 */ 7649 trp = VTOR4(svp); 7650 tvp = svp; 7651 if (IS_SHADOW(svp, trp)) 7652 tvp = RTOV4(trp); 7653 vnevent_link(tvp, ct); 7654 } 7655 out: 7656 kmem_free(argop, argoplist_size); 7657 if (resp) 7658 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7659 7660 nfs_rw_exit(&tdrp->r_rwlock); 7661 7662 return (e.error); 7663 } 7664 7665 /* ARGSUSED */ 7666 static int 7667 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7668 caller_context_t *ct, int flags) 7669 { 7670 vnode_t *realvp; 7671 7672 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7673 return (EPERM); 7674 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7675 ndvp = realvp; 7676 7677 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7678 } 7679 7680 /* 7681 * nfs4rename does the real work of renaming in NFS Version 4. 7682 * 7683 * A file handle is considered volatile for renaming purposes if either 7684 * of the volatile bits are turned on. However, the compound may differ 7685 * based on the likelihood of the filehandle to change during rename. 7686 */ 7687 static int 7688 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7689 caller_context_t *ct) 7690 { 7691 int error; 7692 mntinfo4_t *mi; 7693 vnode_t *nvp = NULL; 7694 vnode_t *ovp = NULL; 7695 char *tmpname = NULL; 7696 rnode4_t *rp; 7697 rnode4_t *odrp; 7698 rnode4_t *ndrp; 7699 int did_link = 0; 7700 int do_link = 1; 7701 nfsstat4 stat = NFS4_OK; 7702 7703 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7704 ASSERT(nfs4_consistent_type(odvp)); 7705 ASSERT(nfs4_consistent_type(ndvp)); 7706 7707 if (onm[0] == '.' && (onm[1] == '\0' || 7708 (onm[1] == '.' && onm[2] == '\0'))) 7709 return (EINVAL); 7710 7711 if (nnm[0] == '.' && (nnm[1] == '\0' || 7712 (nnm[1] == '.' && nnm[2] == '\0'))) 7713 return (EINVAL); 7714 7715 odrp = VTOR4(odvp); 7716 ndrp = VTOR4(ndvp); 7717 if ((intptr_t)odrp < (intptr_t)ndrp) { 7718 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7719 return (EINTR); 7720 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7721 nfs_rw_exit(&odrp->r_rwlock); 7722 return (EINTR); 7723 } 7724 } else { 7725 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7726 return (EINTR); 7727 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7728 nfs_rw_exit(&ndrp->r_rwlock); 7729 return (EINTR); 7730 } 7731 } 7732 7733 /* 7734 * Lookup the target file. If it exists, it needs to be 7735 * checked to see whether it is a mount point and whether 7736 * it is active (open). 7737 */ 7738 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7739 if (!error) { 7740 int isactive; 7741 7742 ASSERT(nfs4_consistent_type(nvp)); 7743 /* 7744 * If this file has been mounted on, then just 7745 * return busy because renaming to it would remove 7746 * the mounted file system from the name space. 
7747 */ 7748 if (vn_ismntpt(nvp)) { 7749 VN_RELE(nvp); 7750 nfs_rw_exit(&odrp->r_rwlock); 7751 nfs_rw_exit(&ndrp->r_rwlock); 7752 return (EBUSY); 7753 } 7754 7755 /* 7756 * First just remove the entry from the name cache, as it 7757 * is most likely the only entry for this vp. 7758 */ 7759 dnlc_remove(ndvp, nnm); 7760 7761 rp = VTOR4(nvp); 7762 7763 if (nvp->v_type != VREG) { 7764 /* 7765 * Purge the name cache of all references to this vnode 7766 * so that we can check the reference count to infer 7767 * whether it is active or not. 7768 */ 7769 if (nvp->v_count > 1) 7770 dnlc_purge_vp(nvp); 7771 7772 isactive = nvp->v_count > 1; 7773 } else { 7774 mutex_enter(&rp->r_os_lock); 7775 isactive = list_head(&rp->r_open_streams) != NULL; 7776 mutex_exit(&rp->r_os_lock); 7777 } 7778 7779 /* 7780 * If the vnode is active and is not a directory, 7781 * arrange to rename it to a 7782 * temporary file so that it will continue to be 7783 * accessible. This implements the "unlink-open-file" 7784 * semantics for the target of a rename operation. 7785 * Before doing this though, make sure that the 7786 * source and target files are not already the same. 7787 */ 7788 if (isactive && nvp->v_type != VDIR) { 7789 /* 7790 * Lookup the source name. 7791 */ 7792 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7793 7794 /* 7795 * The source name *should* already exist. 7796 */ 7797 if (error) { 7798 VN_RELE(nvp); 7799 nfs_rw_exit(&odrp->r_rwlock); 7800 nfs_rw_exit(&ndrp->r_rwlock); 7801 return (error); 7802 } 7803 7804 ASSERT(nfs4_consistent_type(ovp)); 7805 7806 /* 7807 * Compare the two vnodes. If they are the same, 7808 * just release all held vnodes and return success. 7809 */ 7810 if (VN_CMP(ovp, nvp)) { 7811 VN_RELE(ovp); 7812 VN_RELE(nvp); 7813 nfs_rw_exit(&odrp->r_rwlock); 7814 nfs_rw_exit(&ndrp->r_rwlock); 7815 return (0); 7816 } 7817 7818 /* 7819 * Can't mix and match directories and non- 7820 * directories in rename operations. We already 7821 * know that the target is not a directory. If 7822 * the source is a directory, return an error. 7823 */ 7824 if (ovp->v_type == VDIR) { 7825 VN_RELE(ovp); 7826 VN_RELE(nvp); 7827 nfs_rw_exit(&odrp->r_rwlock); 7828 nfs_rw_exit(&ndrp->r_rwlock); 7829 return (ENOTDIR); 7830 } 7831 link_call: 7832 /* 7833 * The target file exists, is not the same as 7834 * the source file, and is active. We first 7835 * try to Link it to a temporary filename to 7836 * avoid having the server removing the file 7837 * completely (which could cause data loss to 7838 * the user's POV in the event the Rename fails 7839 * -- see bug 1165874). 7840 */ 7841 /* 7842 * The do_link and did_link booleans are 7843 * introduced in the event we get NFS4ERR_FILE_OPEN 7844 * returned for the Rename. Some servers can 7845 * not Rename over an Open file, so they return 7846 * this error. The client needs to Remove the 7847 * newly created Link and do two Renames, just 7848 * as if the server didn't support LINK. 
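 *
 * Illustrative flow (a sketch of the retry logic below): the first
 * pass is
 *
 *	LINK nvp -> tmpname; RENAME onm -> nnm
 *
 * and if the server answers the Rename with NFS4ERR_FILE_OPEN, the
 * client Removes tmpname and retries with do_link = 0, i.e.
 *
 *	RENAME nnm -> tmpname; RENAME onm -> nnm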
7849 	 */
7850 		tmpname = newname();
7851 		error = 0;
7852 
7853 		if (do_link) {
7854 			error = nfs4_link(ndvp, nvp, tmpname, cr,
7855 			    NULL, 0);
7856 		}
7857 		if (error == EOPNOTSUPP || !do_link) {
7858 			error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7859 			    cr, NULL, 0);
7860 			did_link = 0;
7861 		} else {
7862 			did_link = 1;
7863 		}
7864 		if (error) {
7865 			kmem_free(tmpname, MAXNAMELEN);
7866 			VN_RELE(ovp);
7867 			VN_RELE(nvp);
7868 			nfs_rw_exit(&odrp->r_rwlock);
7869 			nfs_rw_exit(&ndrp->r_rwlock);
7870 			return (error);
7871 		}
7872 
7873 		mutex_enter(&rp->r_statelock);
7874 		if (rp->r_unldvp == NULL) {
7875 			VN_HOLD(ndvp);
7876 			rp->r_unldvp = ndvp;
7877 			if (rp->r_unlcred != NULL)
7878 				crfree(rp->r_unlcred);
7879 			crhold(cr);
7880 			rp->r_unlcred = cr;
7881 			rp->r_unlname = tmpname;
7882 		} else {
7883 			if (rp->r_unlname)
7884 				kmem_free(rp->r_unlname, MAXNAMELEN);
7885 			rp->r_unlname = tmpname;
7886 		}
7887 		mutex_exit(&rp->r_statelock);
7888 	}
7889 
7890 	(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7891 
7892 	ASSERT(nfs4_consistent_type(nvp));
7893 	}
7894 
7895 	if (ovp == NULL) {
7896 		/*
7897 		 * When renaming directories to be a subdirectory of a
7898 		 * different parent, the dnlc entry for ".." will no
7899 		 * longer be valid, so it must be removed.
7900 		 *
7901 		 * We do a lookup here to determine whether we are renaming
7902 		 * a directory and we need to check if we are renaming
7903 		 * an unlinked file.  This might have already been done
7904 		 * in previous code, so we check ovp == NULL to avoid
7905 		 * doing it twice.
7906 		 */
7907 		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7908 		/*
7909 		 * The source name *should* already exist.
7910 		 */
7911 		if (error) {
7912 			nfs_rw_exit(&odrp->r_rwlock);
7913 			nfs_rw_exit(&ndrp->r_rwlock);
7914 			if (nvp) {
7915 				VN_RELE(nvp);
7916 			}
7917 			return (error);
7918 		}
7919 		ASSERT(ovp != NULL);
7920 		ASSERT(nfs4_consistent_type(ovp));
7921 	}
7922 
7923 	/*
7924 	 * Is the object being renamed a dir, and if so, is
7925 	 * it being renamed to a child of itself?  The underlying
7926 	 * fs should ultimately return EINVAL for this case;
7927 	 * however, buggy beta non-Solaris NFSv4 servers at
7928 	 * interop testing events have allowed this behavior,
7929 	 * and it caused our client to panic due to a recursive
7930 	 * mutex_enter in fn_move.
7931 	 *
7932 	 * The tedious locking in fn_move could be changed to
7933 	 * deal with this case, and the client could avoid the
7934 	 * panic; however, the client would just confuse itself
7935 	 * later and misbehave.  A better way to handle the broken
7936 	 * server is to detect this condition and return EINVAL
7937 	 * without ever sending the bogus rename to the server.
7938 	 * We know the rename is invalid -- just fail it now.
7939 	 */
7940 	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7941 		VN_RELE(ovp);
7942 		nfs_rw_exit(&odrp->r_rwlock);
7943 		nfs_rw_exit(&ndrp->r_rwlock);
7944 		if (nvp) {
7945 			VN_RELE(nvp);
7946 		}
7947 		return (EINVAL);
7948 	}
7949 
7950 	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7951 
7952 	/*
7953 	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7954 	 * possible for the filehandle to change due to the rename.
7955 	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7956 	 * the fh will not change because of the rename, but we still need
7957 	 * to update its rnode entry with the new name for
7958 	 * an eventual fh change due to migration.  The FH4_NOEXPIRE_ON_OPEN
The FH4_NOEXPIRE_ON_OPEN 7959 * has no effect on these for now, but for future improvements, 7960 * we might want to use it too to simplify handling of files 7961 * that are open with that flag on. (XXX) 7962 */ 7963 mi = VTOMI4(odvp); 7964 if (NFS4_VOLATILE_FH(mi)) 7965 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7966 &stat); 7967 else 7968 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7969 &stat); 7970 7971 ASSERT(nfs4_consistent_type(odvp)); 7972 ASSERT(nfs4_consistent_type(ndvp)); 7973 ASSERT(nfs4_consistent_type(ovp)); 7974 7975 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7976 do_link = 0; 7977 /* 7978 * Before the 'link_call' code, we did a nfs4_lookup 7979 * that puts a VN_HOLD on nvp. After the nfs4_link 7980 * call we call VN_RELE to match that hold. We need 7981 * to place an additional VN_HOLD here since we will 7982 * be hitting that VN_RELE again. 7983 */ 7984 VN_HOLD(nvp); 7985 7986 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0); 7987 7988 /* Undo the unlinked file naming stuff we just did */ 7989 mutex_enter(&rp->r_statelock); 7990 if (rp->r_unldvp) { 7991 VN_RELE(ndvp); 7992 rp->r_unldvp = NULL; 7993 if (rp->r_unlcred != NULL) 7994 crfree(rp->r_unlcred); 7995 rp->r_unlcred = NULL; 7996 /* rp->r_unlanme points to tmpname */ 7997 if (rp->r_unlname) 7998 kmem_free(rp->r_unlname, MAXNAMELEN); 7999 rp->r_unlname = NULL; 8000 } 8001 mutex_exit(&rp->r_statelock); 8002 8003 if (nvp) { 8004 VN_RELE(nvp); 8005 } 8006 goto link_call; 8007 } 8008 8009 if (error) { 8010 VN_RELE(ovp); 8011 nfs_rw_exit(&odrp->r_rwlock); 8012 nfs_rw_exit(&ndrp->r_rwlock); 8013 if (nvp) { 8014 VN_RELE(nvp); 8015 } 8016 return (error); 8017 } 8018 8019 /* 8020 * when renaming directories to be a subdirectory of a 8021 * different parent, the dnlc entry for ".." will no 8022 * longer be valid, so it must be removed 8023 */ 8024 rp = VTOR4(ovp); 8025 if (ndvp != odvp) { 8026 if (ovp->v_type == VDIR) { 8027 dnlc_remove(ovp, ".."); 8028 if (rp->r_dir != NULL) 8029 nfs4_purge_rddir_cache(ovp); 8030 } 8031 } 8032 8033 /* 8034 * If we are renaming the unlinked file, update the 8035 * r_unldvp and r_unlname as needed. 8036 */ 8037 mutex_enter(&rp->r_statelock); 8038 if (rp->r_unldvp != NULL) { 8039 if (strcmp(rp->r_unlname, onm) == 0) { 8040 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 8041 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 8042 if (ndvp != rp->r_unldvp) { 8043 VN_RELE(rp->r_unldvp); 8044 rp->r_unldvp = ndvp; 8045 VN_HOLD(ndvp); 8046 } 8047 } 8048 } 8049 mutex_exit(&rp->r_statelock); 8050 8051 /* 8052 * Notify the rename vnevents to source vnode, and to the target 8053 * vnode if it already existed. 8054 */ 8055 if (error == 0) { 8056 vnode_t *tvp; 8057 rnode4_t *trp; 8058 /* 8059 * Notify the vnode. Each links is represented by 8060 * a different vnode, in nfsv4. 8061 */ 8062 if (nvp) { 8063 trp = VTOR4(nvp); 8064 tvp = nvp; 8065 if (IS_SHADOW(nvp, trp)) 8066 tvp = RTOV4(trp); 8067 vnevent_rename_dest(tvp, ndvp, nnm, ct); 8068 } 8069 8070 /* 8071 * if the source and destination directory are not the 8072 * same notify the destination directory. 
8073 */ 8074 if (VTOR4(odvp) != VTOR4(ndvp)) { 8075 trp = VTOR4(ndvp); 8076 tvp = ndvp; 8077 if (IS_SHADOW(ndvp, trp)) 8078 tvp = RTOV4(trp); 8079 vnevent_rename_dest_dir(tvp, ct); 8080 } 8081 8082 trp = VTOR4(ovp); 8083 tvp = ovp; 8084 if (IS_SHADOW(ovp, trp)) 8085 tvp = RTOV4(trp); 8086 vnevent_rename_src(tvp, odvp, onm, ct); 8087 } 8088 8089 if (nvp) { 8090 VN_RELE(nvp); 8091 } 8092 VN_RELE(ovp); 8093 8094 nfs_rw_exit(&odrp->r_rwlock); 8095 nfs_rw_exit(&ndrp->r_rwlock); 8096 8097 return (error); 8098 } 8099 8100 /* 8101 * When the parent directory has changed, sv_dfh must be updated 8102 */ 8103 static void 8104 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp) 8105 { 8106 svnode_t *sv = VTOSV(vp); 8107 nfs4_sharedfh_t *old_dfh = sv->sv_dfh; 8108 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh; 8109 8110 sfh4_hold(new_dfh); 8111 sv->sv_dfh = new_dfh; 8112 sfh4_rele(&old_dfh); 8113 } 8114 8115 /* 8116 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4, 8117 * when it is known that the filehandle is persistent through rename. 8118 * 8119 * Rename requires that the current fh be the target directory and the 8120 * saved fh be the source directory. After the operation, the current fh 8121 * is unchanged. 8122 * The compound op structure for persistent fh rename is: 8123 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 8124 * Rather than bother with the directory postop args, we'll simply 8125 * update that a change occurred in the cache, so no post-op getattrs. 8126 */ 8127 static int 8128 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8129 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8130 { 8131 COMPOUND4args_clnt args; 8132 COMPOUND4res_clnt res, *resp = NULL; 8133 nfs_argop4 *argop; 8134 nfs_resop4 *resop; 8135 int doqueue, argoplist_size; 8136 mntinfo4_t *mi; 8137 rnode4_t *odrp = VTOR4(odvp); 8138 rnode4_t *ndrp = VTOR4(ndvp); 8139 RENAME4res *rn_res; 8140 bool_t needrecov; 8141 nfs4_recov_state_t recov_state; 8142 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8143 dirattr_info_t dinfo, *dinfop; 8144 8145 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8146 8147 recov_state.rs_flags = 0; 8148 recov_state.rs_num_retry_despite_err = 0; 8149 8150 /* 8151 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8152 * 8153 * If source/target are different dirs, then append putfh(src); getattr 8154 */ 8155 args.array_len = (odvp == ndvp) ? 

/*
 * When the parent directory has changed, sv_dfh must be updated.
 */
static void
update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
{
	svnode_t *sv = VTOSV(vp);
	nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
	nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;

	sfh4_hold(new_dfh);
	sv->sv_dfh = new_dfh;
	sfh4_rele(&old_dfh);
}

/*
 * nfs4rename_persistent_fh does the otw portion of renaming in NFS
 * Version 4, when it is known that the filehandle is persistent through
 * rename.
 *
 * Rename requires that the current fh be the target directory and the
 * saved fh be the source directory.  After the operation, the current fh
 * is unchanged.
 * The compound op structure for persistent fh rename is:
 *	PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME
 * Rather than bother with the directory postop args, we'll simply note
 * in the cache that a change occurred, so no post-op getattrs.
 */
static int
nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
    vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int doqueue, argoplist_size;
	mntinfo4_t *mi;
	rnode4_t *odrp = VTOR4(odvp);
	rnode4_t *ndrp = VTOR4(ndvp);
	RENAME4res *rn_res;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
	 *
	 * If source/target are different dirs, then append putfh(src);
	 * getattr
	 */
	args.array_len = (odvp == ndvp) ? 5 : 7;
	argoplist_size = args.array_len * sizeof (nfs_argop4);
	args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);

recov_retry:
	*statp = NFS4_OK;

	/* No need to Lookup the file, persistent fh */
	args.ctag = TAG_RENAME;

	mi = VTOMI4(odvp);
	e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	/* 0: putfh source directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

	/* 1: Save source fh to free up current for target */
	argop[1].argop = OP_SAVEFH;

	/* 2: putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;

	/* 3: current_fh is targetdir, saved_fh is sourcedir */
	argop[3].argop = OP_CRENAME;
	argop[3].nfs_argop4_u.opcrename.coldname = onm;
	argop[3].nfs_argop4_u.opcrename.cnewname = nnm;

	/* 4: getattr (targetdir) */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	if (ndvp != odvp) {

		/* 5: putfh (sourcedir) */
		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

		/* 6: getattr (sourcedir) */
		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = mi;
	}

	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	doqueue = 1;
	dinfo.di_time_call = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
	} else {
		*statp = res.status;
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
		    OP_RENAME, NULL, NULL, NULL) == FALSE) {
			nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return an error if OP_RENAME (or an earlier op)
		 * failed; failure of the trailing getattrs is ignored.
		 */
		if (res.status != NFS4_OK && res.array_len <= 4) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(odvp);
			PURGE_ATTRCACHE4(ndvp);
			/*
			 * System V defines rename to return EEXIST, not
			 * ENOTEMPTY, if the target directory is not empty.
			 * Over the wire, the error is NFSERR_ENOTEMPTY,
			 * which geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {

			resop = &res.array[3];	/* rename res */
			rn_res = &resop->nfs_resop4_u.oprename;

			if (res.status == NFS4_OK) {
				/*
				 * Update target attribute, readdir and dnlc
				 * caches.
				 */
				dinfo.di_garp =
				    &res.array[4].nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			nfs4_update_dircaches(&rn_res->target_cinfo,
			    ndvp, NULL, NULL, dinfop);

			/*
			 * Update source attribute, readdir and dnlc caches.
			 */
			if (ndvp != odvp) {
				update_parentdir_sfh(renvp, ndvp);

				if (dinfop)
					dinfo.di_garp =
					    &(res.array[6].nfs_resop4_u.
					    opgetattr.ga_res);

				nfs4_update_dircaches(&rn_res->source_cinfo,
				    odvp, NULL, NULL, dinfop);
			}

			fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
			    nnm);
		}
	}

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
	kmem_free(argop, argoplist_size);

	return (e.error);
}
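
/*
 * For reference, the result array indices used above when the source and
 * target directories differ (array_len == 7):
 *
 *   0: PUTFH(sourcedir)  1: SAVEFH         2: PUTFH(targetdir)
 *   3: RENAME            4: GETATTR(tdir)  5: PUTFH(sourcedir)
 *   6: GETATTR(sdir)
 *
 * No post-rename LOOKUP/GETFH ops are needed here because the object's
 * filehandle is known to persist across the rename; contrast this with
 * nfs4rename_volatile_fh() below.
 */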

/*
 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4,
 * when it is possible for the filehandle to change due to the rename.
 *
 * The compound req in this case includes a post-rename lookup and getattr
 * to ensure that we have the correct fh and attributes for the object.
 *
 * Rename requires that the current fh be the target directory and the
 * saved fh be the source directory.  After the operation, the current fh
 * is unchanged.
 *
 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
 * update the filehandle for the renamed object.  We also get the old
 * filehandle for historical reasons; this should be taken out sometime.
 * This results in a rather cumbersome compound...
 *
 *    PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
 *    PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
 *
 */
static int
nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
    vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	int argoplist_size;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *odrp = VTOR4(odvp);	/* old directory */
	rnode4_t *ndrp = VTOR4(ndvp);	/* new directory */
	rnode4_t *orp = VTOR4(ovp);	/* object being renamed */
	RENAME4res *rn_res;
	GETFH4res *ngf_res;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop = &dinfo;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	*statp = NFS4_OK;

	/*
	 * There is a window between the RPC and updating the path and
	 * filehandle stored in the rnode.  Lock out the FHEXPIRED recovery
	 * code, so that it doesn't try to use the old path during that
	 * window.
	 */
	mutex_enter(&orp->r_statelock);
	while (orp->r_flags & R4RECEXPFH) {
		klwp_t *lwp = ttolwp(curthread);

		if (lwp != NULL)
			lwp->lwp_nostop++;
		if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
			mutex_exit(&orp->r_statelock);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (EINTR);
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}
	orp->r_flags |= R4RECEXPFH;
	mutex_exit(&orp->r_statelock);

	mi = VTOMI4(odvp);

	args.ctag = TAG_RENAME_VFH;
	args.array_len = (odvp == ndvp) ? 10 : 12;
	argoplist_size = args.array_len * sizeof (nfs_argop4);
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/*
	 * Rename ops:
	 *    PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
	 *    PUTFH(targetdir), RENAME, GETATTR(targetdir),
	 *    LOOKUP(trgt), GETFH(new), GETATTR,
	 *
	 *    if (odvp != ndvp)
	 *	add putfh(sourcedir), getattr(sourcedir)
	 */
	args.array = argop;

	e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
	    &recov_state, NULL);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		mutex_enter(&orp->r_statelock);
		orp->r_flags &= ~R4RECEXPFH;
		cv_broadcast(&orp->r_cv);
		mutex_exit(&orp->r_statelock);
		return (e.error);
	}

	/* 0: putfh source directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

	/* 1: Save source fh to free up current for target */
	argop[1].argop = OP_SAVEFH;

	/* 2: Lookup pre-rename fh of renamed object */
	argop[2].argop = OP_CLOOKUP;
	argop[2].nfs_argop4_u.opclookup.cname = onm;

	/* 3: getfh fh of renamed object (before rename) */
	argop[3].argop = OP_GETFH;

	/* 4: putfh targetdir */
	argop[4].argop = OP_CPUTFH;
	argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;

	/* 5: current_fh is targetdir, saved_fh is sourcedir */
	argop[5].argop = OP_CRENAME;
	argop[5].nfs_argop4_u.opcrename.coldname = onm;
	argop[5].nfs_argop4_u.opcrename.cnewname = nnm;

	/* 6: getattr of target dir (post op attrs) */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = mi;

	/* 7: Lookup post-rename fh of renamed object */
	argop[7].argop = OP_CLOOKUP;
	argop[7].nfs_argop4_u.opclookup.cname = nnm;

	/* 8: getfh fh of renamed object (after rename) */
	argop[8].argop = OP_GETFH;

	/* 9: getattr of renamed object */
	argop[9].argop = OP_GETATTR;
	argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[9].nfs_argop4_u.opgetattr.mi = mi;

	/*
	 * If source/target dirs are different, then get new post-op
	 * attrs for source dir also.
	 */
	if (ndvp != odvp) {
		/* 10: putfh (sourcedir) */
		argop[10].argop = OP_CPUTFH;
		argop[10].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

		/* 11: getattr (sourcedir) */
		argop[11].argop = OP_GETATTR;
		argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[11].nfs_argop4_u.opgetattr.mi = mi;
	}

	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	doqueue = 1;
	t = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
		if (!needrecov) {
			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
			    &recov_state, needrecov);
			goto out;
		}
	} else {
		*statp = res.status;
	}

	if (needrecov) {
		bool_t abort;

		abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
		    OP_RENAME, NULL, NULL, NULL);
		if (abort == FALSE) {
			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
			    &recov_state, needrecov);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			mutex_enter(&orp->r_statelock);
			orp->r_flags &= ~R4RECEXPFH;
			cv_broadcast(&orp->r_cv);
			mutex_exit(&orp->r_statelock);
			goto recov_retry;
		} else {
			if (e.error != 0) {
				nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
				    &recov_state, needrecov);
				goto out;
			}
			/* fall through for res.status case */
		}
	}

	resp = &res;
	/*
	 * If OP_RENAME (or any prev op) failed, then return an error.
	 * OP_RENAME is index 5, so if array len <= 6 we return an error.
	 */
	if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
		/*
		 * Error in an op other than last Getattr
		 */
		e.error = geterrno4(res.status);
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
		/*
		 * System V defines rename to return EEXIST, not
		 * ENOTEMPTY, if the target directory is not empty.
		 * Over the wire, the error is NFSERR_ENOTEMPTY,
		 * which geterrno4 maps to ENOTEMPTY.
		 */
		if (e.error == ENOTEMPTY)
			e.error = EEXIST;
		nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
		    needrecov);
		goto out;
	}

	/* rename results */
	rn_res = &res.array[5].nfs_resop4_u.oprename;

	if (res.status == NFS4_OK) {
		/* Update target attribute, readdir and dnlc caches */
		dinfo.di_garp =
		    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
		dinfo.di_cred = cr;
		dinfo.di_time_call = t;
	} else
		dinfop = NULL;

	/* Update target cache attribute, readdir and dnlc caches */
	nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);

	/* Update source cache attribute, readdir and dnlc caches */
	if (ndvp != odvp) {
		update_parentdir_sfh(ovp, ndvp);

		/*
		 * If dinfop is non-NULL, then the compound succeeded, so
		 * set di_garp to attrs for source dir.  dinfop is only
		 * set to NULL when the compound fails.
		 */
		if (dinfop)
			dinfo.di_garp =
			    &res.array[11].nfs_resop4_u.opgetattr.ga_res;
		nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
		    dinfop);
	}

	/*
	 * Update the rnode with the new component name and args,
	 * and if the file handle changed, also update it with the new fh.
	 * This is only necessary if the target object has an rnode
	 * entry and there is no need to create one for it.
	 */
	resop = &res.array[8];	/* getfh new res */
	ngf_res = &resop->nfs_resop4_u.opgetfh;

	/*
	 * Update the path and filehandle for the renamed object.
	 */
	nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);

	nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);

	if (res.status == NFS4_OK) {
		resop++;	/* getattr res */
		e.error = nfs4_update_attrcache(res.status,
		    &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, ovp, cr);
	}

out:
	kmem_free(argop, argoplist_size);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	mutex_enter(&orp->r_statelock);
	orp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&orp->r_cv);
	mutex_exit(&orp->r_statelock);

	return (e.error);
}
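
/*
 * For reference, the result array indices used above when the source and
 * target directories differ (array_len == 12):
 *
 *   0: PUTFH(sourcedir)  1: SAVEFH         2: LOOKUP(onm)   3: GETFH(old)
 *   4: PUTFH(targetdir)  5: RENAME         6: GETATTR(tdir) 7: LOOKUP(nnm)
 *   8: GETFH(new)        9: GETATTR(obj)  10: PUTFH(sourcedir)
 *  11: GETATTR(sdir)
 */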

/* ARGSUSED */
static int
nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	vnode_t *vp;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * As ".." has special meaning, rather than send a mkdir over the
	 * wire just to let the server freak out, we short circuit it here
	 * and return EEXIST.
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	/*
	 * Decision to get the right gid and setgid bit of the
	 * new directory is now made in call_nfs4_create_req.
	 */
	va->va_mask |= AT_MODE;
	error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
	if (error)
		return (error);

	*vpp = vp;
	return (0);
}


/*
 * rmdir uses the same NFSv4 remove op as remove does.
 * Remove requires that the current fh be the target directory.
 * After the operation, the current fh is unchanged.
 * The compound op structure is:
 *	PUTFH(targetdir), REMOVE
 */
/*ARGSUSED4*/
static int
nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int need_end_op = FALSE;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * As ".." has special meaning, rather than send a rmdir over the
	 * wire just to let the server freak out, we short circuit it here
	 * and return EEXIST.
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * Since the nfsv4 remove op works on both files and directories,
	 * check that the removed object is indeed a directory.
	 */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (ENOTDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If the vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, try removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = TAG_RMDIR;

	/*
	 * Rmdir ops: putfh dir; remove; getattr
	 */
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	need_end_op = TRUE;

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr (postop attrs for dir that contained removed dir) */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	dinfo.di_time_call = gethrtime();
	doqueue = 1;
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(dvp);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
		    NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return error if first 2 ops (OP_REMOVE or earlier)
		 * failed.
		 */
		if (res.status != NFS4_OK && res.array_len <= 2) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
			    &recov_state, needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			/*
			 * System V defines rmdir to return EEXIST, not
			 * ENOTEMPTY, if the directory is not empty.  Over
			 * the wire, the error is NFSERR_ENOTEMPTY which
			 * geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			if (res.status == NFS4_OK) {
				resop = &res.array[2];	/* dir attrs */
				dinfo.di_garp =
				    &resop->nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			/* Update dir attribute, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    dinfop);

			/* destroy rddir cache for dir that was removed */
			if (VTOR4(vp)->r_dir != NULL)
				nfs4_purge_rddir_cache(vp);
		}
	}

	if (need_end_op)
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	nfs_rw_exit(&drp->r_rwlock);

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;

		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_rmdir(tvp, dvp, nm, ct);
	}

	VN_RELE(vp);

	return (e.error);
}

/* ARGSUSED */
static int
nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *vp;
	rnode4_t *rp;
	char *contents;
	mntinfo4_t *mi = VTOMI4(dvp);

	if (nfs_zone() != mi->mi_zone)
		return (EPERM);
	if (!(mi->mi_flags & MI4_SYMLINK))
		return (EOPNOTSUPP);

	error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
	if (error)
		return (error);

	ASSERT(nfs4_consistent_type(vp));
	rp = VTOR4(vp);
	if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {

		contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		if (contents != NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.len = strlen(tnm);
				bcopy(tnm, contents, rp->r_symlink.len);
				rp->r_symlink.contents = contents;
				rp->r_symlink.size = MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)contents, MAXPATHLEN);
			}
		}
	}
	VN_RELE(vp);

	return (error);
}
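
/*
 * Note on the symlink-contents caching in nfs4_symlink() above: the
 * MAXPATHLEN buffer is allocated with KM_SLEEP before r_statelock is
 * taken, so the thread never sleeps in the allocator while holding the
 * lock; r_symlink.contents is then re-checked under the lock, and the
 * buffer is freed if another thread filled the cache first.
 */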

/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead.  Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.  Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead.  In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}

static int
do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * Obtain the readdir results for the caller.
	 */
	nfs4readdir(vp, rdc, cr);

	mutex_enter(&rp->r_statelock);
	/*
	 * The entry is now complete
	 */
	rdc->flags &= ~RDDIR;

	error = rdc->error;
	if (error)
		rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);

	return (error);
}
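
/*
 * For reference, the life cycle of an rddir4_cache entry as driven by
 * nfs4_readdir() and do_nfs4readdir() above:
 *
 *   RDDIRREQ set	entry needs an over-the-wire readdir to fill it
 *   RDDIR set		an over-the-wire readdir is in progress
 *   neither set	entry is complete; rdc->error says whether it is good
 *
 * A failed fill moves the entry back to RDDIRREQ so the next reader
 * retries it.
 */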

/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 *
 * Generates the following compound request:
 * 1. If readdir offset is zero and no dnlc entry for parent exists,
 *    must include a Lookupp as well.  In this case, send:
 *	{ Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
 * 2. Otherwise just do: { Putfh <fh>; Readdir }
 *
 * Get complete attributes and filehandles for entries if this is the
 * first read of the directory.  Otherwise, just get fileids.
 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* ids of dir and its parent */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and
		 * "..", the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0; /* set later by getattr parent */
			}
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If doing a readdir of a node that is a stub for a crossed
		 * mount point, keep the original secinfo flavor for the
		 * current file system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		return;
	}

	/*
	 * Determine which attrs to request for dirents.  This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory.  Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * Request mounted_on_fileid if supported, else request
		 * fileid.  Maybe we should verify that fileid is supported
		 * and request something else if not.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;

	/*
	 * If count is less than the minimum required, we return no
	 * entries and fail with EINVAL.
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If an RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL, NULL, NULL);
		if (abort == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If the compound failed in the first 2 ops (PUTFH+READDIR), then
	 * return failure here.  Subsequent ops are for filling out the
	 * dot-dot dirent, and if they fail, we still want to give the
	 * caller the dirents returned by (the successful) READDIR op, so
	 * we need to silently ignore failure for subsequent ops
	 * (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm.  We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR.  However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */
	if (res.status) {
		if (res_opcnt <= 2) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If doing a readdir of a node that is a stub
			 * for a crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 * seek(cookie=0) -> "." entry with d_off = 1
	 * seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4)0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4)1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}

	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh   */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;

			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If doing a readdir of a node that is a stub for a crossed mount
	 * point, keep the original secinfo flavor for the current file
	 * system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}
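
/*
 * Worked example of the larger nfs4readdir() compound (num_ops == 5),
 * sent when reading from the beginning of the directory with no dnlc
 * entry for the parent:
 *
 *   0: PUTFH(dir)  1: READDIR  2: LOOKUPP  3: GETFH  4: GETATTR
 *
 * The GETFH/GETATTR results supply the ".." filehandle and d_ino that
 * the NFSv4 READDIR op itself does not return.
 */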

static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
	read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				/* bzero(rdaddr + */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF.  Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
		write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status.  Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations.  Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate the ESTALE error return properly.
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}

/* ARGSUSED */
int
nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	return (EREMOTE);
}

/* ARGSUSED2 */
int
nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	if (!write_lock) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		return (V_WRITELOCK_FALSE);
	}

	if ((rp->r_flags & R4DIRECTIO) ||
	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
			return (V_WRITELOCK_FALSE);
		nfs_rw_exit(&rp->r_rwlock);
	}

	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
	return (V_WRITELOCK_TRUE);
}

/* ARGSUSED */
void
nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	nfs_rw_exit(&rp->r_rwlock);
}
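
/*
 * Note on nfs4_rwlock() above: for direct i/o files the writer normally
 * takes only the READER lock, allowing concurrent writers, as long as
 * there are no mappings or cached pages; the exclusive WRITER lock is
 * taken only when pages or mappings exist and cache coherency matters.
 */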

/* ARGSUSED */
static int
nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/*
	 * Because we stuff the readdir cookie into the offset field,
	 * someone may attempt to do an lseek with the cookie, which
	 * we want to succeed.
	 */
	if (vp->v_type == VDIR)
		return (0);
	if (*noffp < 0)
		return (EINVAL);
	return (0);
}


/*
 * Return all the pages from [off..off+len) in file
 */
/* ARGSUSED */
static int
nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;
	int error;
	mntinfo4_t *mi;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (protp != NULL)
		*protp = PROT_ALL;

	/*
	 * Now validate that the caches are up to date.
	 */
	if (error = nfs4_validate_caches(vp, cr))
		return (error);

	mi = VTOMI4(vp);
retry:
	mutex_enter(&rp->r_statelock);

	/*
	 * Don't create dirty pages faster than they
	 * can be cleaned so that the system doesn't
	 * get imbalanced.  If the async queue is
	 * maxed out, then wait for it to drain before
	 * creating more dirty pages.  Also, wait for
	 * any threads doing pagewalks in the vop_getattr
	 * entry points so that they don't block for
	 * long periods.
	 */
	if (rw == S_CREATE) {
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
	}

	/*
	 * If we are getting called as a side effect of an nfs_write()
	 * operation the local file size might not be extended yet.
	 * In this case we want to be able to return pages of zeroes.
	 */
	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
		NFS4_DEBUG(nfs4_pageio_debug,
		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
		    "len=%llu, size=%llu, attrsize =%llu", off,
		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
		mutex_exit(&rp->r_statelock);
		return (EFAULT);	/* beyond EOF */
	}

	mutex_exit(&rp->r_statelock);

	if (len <= PAGESIZE) {
		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
		    seg, addr, rw, cr);
		NFS4_DEBUG(nfs4_pageio_debug && error,
		    (CE_NOTE, "getpage error %d; off=%lld, "
		    "len=%lld", error, off, (u_longlong_t)len));
	} else {
		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
		    pl, plsz, seg, addr, rw, cr);
		NFS4_DEBUG(nfs4_pageio_debug && error,
		    (CE_NOTE, "getpages error %d; off=%lld, "
		    "len=%lld", error, off, (u_longlong_t)len));
	}

	switch (error) {
	case NFS_EOF:
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
		goto retry;
	case ESTALE:
		nfs4_purge_stale_fh(error, vp, cr);
	}

	return (error);
}
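
/*
 * Note on the S_CREATE throttle in nfs4_getpage() above: a writer
 * faulting in new pages waits whenever more than 2 * mi_max_threads
 * async writes are queued (r_awcount), or while a pagewalk from a
 * getattr is in progress (r_gcount), so that dirty pages are not
 * created faster than write-behind can clean them.
 */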

/*
 * Called from pvn_getpages or nfs4_getpage to get a particular page.
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window;		/* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

reread:
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or closed.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */
		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0.  This
			 * is correct since we want to do I/O on a page
			 * boundary.  bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here.  We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error.  We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

out:
	if (pl == NULL)
		return (error);

	if (error) {
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs4_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}

static void
nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
    cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode4_t *rp = VTOR4(vp);
	page_t *savepp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc).  In
	 * all cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This is correct since
	 * we want to do I/O on a page boundary.  bp_mapin() will use this
	 * addr to calculate an offset, and then set b_addr to the kernel
	 * virtual address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(io_off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)blkoff;
	bp_mapin(bp);

	/*
	 * If doing a write beyond what we believe is EOF, don't bother trying
	 * to read the pages from the server, we'll just zero the pages here.
	 * We don't check that the rw flag is S_WRITE here because some
	 * implementations may attempt a read access to the buffer before
	 * copying data.
	 */
	mutex_enter(&rp->r_statelock);
	if (io_off >= rp->r_size && seg == segkmap) {
		mutex_exit(&rp->r_statelock);
		bzero(bp->b_un.b_addr, io_len);
		error = 0;
	} else {
		mutex_exit(&rp->r_statelock);
		error = nfs4_bio(bp, NULL, cr, TRUE);
		if (error == NFS_EOF)
			error = 0;
	}

	/*
	 * Unmap the buffer before freeing it.
	 */
	bp_mapout(bp);
	pageio_done(bp);

	savepp = pp;
	do {
		pp->p_fsdata = C_NOCOMMIT;
	} while ((pp = pp->p_next) != savepp);

	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);

	/*
	 * In case of error set the readahead offset to the lowest offset.
	 * pvn_read_done() calls VN_DISPOSE to destroy the pages.
	 */
	if (error && rp->r_nextr > io_off) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}
10152 */ 10153 mutex_enter(&rp->r_statelock); 10154 if (io_off >= rp->r_size && seg == segkmap) { 10155 mutex_exit(&rp->r_statelock); 10156 bzero(bp->b_un.b_addr, io_len); 10157 error = 0; 10158 } else { 10159 mutex_exit(&rp->r_statelock); 10160 error = nfs4_bio(bp, NULL, cr, TRUE); 10161 if (error == NFS_EOF) 10162 error = 0; 10163 } 10164 10165 /* 10166 * Unmap the buffer before freeing it. 10167 */ 10168 bp_mapout(bp); 10169 pageio_done(bp); 10170 10171 savepp = pp; 10172 do { 10173 pp->p_fsdata = C_NOCOMMIT; 10174 } while ((pp = pp->p_next) != savepp); 10175 10176 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10177 10178 /* 10179 * In case of error set readahead offset 10180 * to the lowest offset. 10181 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10182 */ 10183 if (error && rp->r_nextr > io_off) { 10184 mutex_enter(&rp->r_statelock); 10185 if (rp->r_nextr > io_off) 10186 rp->r_nextr = io_off; 10187 mutex_exit(&rp->r_statelock); 10188 } 10189 } 10190 10191 /* 10192 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10193 * If len == 0, do from off to EOF. 10194 * 10195 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10196 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10197 * (from pageout). 10198 */ 10199 /* ARGSUSED */ 10200 static int 10201 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10202 caller_context_t *ct) 10203 { 10204 int error; 10205 rnode4_t *rp; 10206 10207 ASSERT(cr != NULL); 10208 10209 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10210 return (EIO); 10211 10212 rp = VTOR4(vp); 10213 if (IS_SHADOW(vp, rp)) 10214 vp = RTOV4(rp); 10215 10216 /* 10217 * XXX - Why should this check be made here? 10218 */ 10219 if (vp->v_flag & VNOMAP) 10220 return (ENOSYS); 10221 10222 if (len == 0 && !(flags & B_INVAL) && 10223 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10224 return (0); 10225 10226 mutex_enter(&rp->r_statelock); 10227 rp->r_count++; 10228 mutex_exit(&rp->r_statelock); 10229 error = nfs4_putpages(vp, off, len, flags, cr); 10230 mutex_enter(&rp->r_statelock); 10231 rp->r_count--; 10232 cv_broadcast(&rp->r_cv); 10233 mutex_exit(&rp->r_statelock); 10234 10235 return (error); 10236 } 10237 10238 /* 10239 * Write out a single page, possibly klustering adjacent dirty pages. 10240 */ 10241 int 10242 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10243 int flags, cred_t *cr) 10244 { 10245 u_offset_t io_off; 10246 u_offset_t lbn_off; 10247 u_offset_t lbn; 10248 size_t io_len; 10249 uint_t bsize; 10250 int error; 10251 rnode4_t *rp; 10252 10253 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10254 ASSERT(pp != NULL); 10255 ASSERT(cr != NULL); 10256 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10257 10258 rp = VTOR4(vp); 10259 ASSERT(rp->r_count > 0); 10260 ASSERT(!IS_SHADOW(vp, rp)); 10261 10262 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10263 lbn = pp->p_offset / bsize; 10264 lbn_off = lbn * bsize; 10265 10266 /* 10267 * Find a kluster that fits in one block, or in 10268 * one page if pages are bigger than blocks. If 10269 * there is less file space allocated than a whole 10270 * page, we'll shorten the i/o request below. 10271 */ 10272 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10273 roundup(bsize, PAGESIZE), flags); 10274 10275 /* 10276 * pvn_write_kluster shouldn't have returned a page with offset 10277 * behind the original page we were given. Verify that. 
10278 */
10279 ASSERT((pp->p_offset / bsize) >= lbn);
10280 
10281 /*
10282 * Now pp will have the list of kept dirty pages marked for
10283 * write back. It will also handle invalidation and freeing
10284 * of pages that are not dirty. Check for page length rounding
10285 * problems.
10286 */
10287 if (io_off + io_len > lbn_off + bsize) {
10288 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10289 io_len = lbn_off + bsize - io_off;
10290 }
10291 /*
10292 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10293 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10294 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10295 * progress and the r_size has not been made consistent with the
10296 * new size of the file. When the uiomove() completes the r_size is
10297 * updated and the R4MODINPROGRESS flag is cleared.
10298 *
10299 * This handshaking is what keeps nfs4_bio() and writerp4() in
10300 * step. Without it, it is possible that nfs4_bio() picks up the
10301 * old value of r_size before the uiomove() in writerp4()
10302 * completes. This will result in the write through nfs4_bio()
10303 * being dropped.
10304 *
10305 * More precisely, there is a window between the time the uiomove()
10306 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10307 * operation intervenes in this window, the page will be picked up,
10308 * because it is dirty (it will be unlocked, unless it was
10309 * pagecreate'd). When the page is picked up as dirty, the dirty
10310 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10311 * checked. This will still be the old size. Therefore the page will
10312 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10313 * the page will be found to be clean and the write will be dropped.
10314 */
10315 if (rp->r_flags & R4MODINPROGRESS) {
10316 mutex_enter(&rp->r_statelock);
10317 if ((rp->r_flags & R4MODINPROGRESS) &&
10318 rp->r_modaddr + MAXBSIZE > io_off &&
10319 rp->r_modaddr < io_off + io_len) {
10320 page_t *plist;
10321 /*
10322 * A write is in progress for this region of the file.
10323 * If we did not detect R4MODINPROGRESS here then this
10324 * path through nfs_putapage() would eventually go to
10325 * nfs4_bio() and may not write out all of the data
10326 * in the pages. We end up losing data. So we decide
10327 * to set the modified bit on each page in the page
10328 * list and mark the rnode with R4DIRTY. This write
10329 * will be restarted at some later time.
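 *
 * The restart is driven by the hat_setmod() calls below:
 * re-dirtying the pages guarantees that a later putpage pass will
 * find them and write them out once the uiomove() has finished and
 * r_size is again consistent.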
10330 */ 10331 plist = pp; 10332 while (plist != NULL) { 10333 pp = plist; 10334 page_sub(&plist, pp); 10335 hat_setmod(pp); 10336 page_io_unlock(pp); 10337 page_unlock(pp); 10338 } 10339 rp->r_flags |= R4DIRTY; 10340 mutex_exit(&rp->r_statelock); 10341 if (offp) 10342 *offp = io_off; 10343 if (lenp) 10344 *lenp = io_len; 10345 return (0); 10346 } 10347 mutex_exit(&rp->r_statelock); 10348 } 10349 10350 if (flags & B_ASYNC) { 10351 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10352 nfs4_sync_putapage); 10353 } else 10354 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10355 10356 if (offp) 10357 *offp = io_off; 10358 if (lenp) 10359 *lenp = io_len; 10360 return (error); 10361 } 10362 10363 static int 10364 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10365 int flags, cred_t *cr) 10366 { 10367 int error; 10368 rnode4_t *rp; 10369 10370 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10371 10372 flags |= B_WRITE; 10373 10374 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10375 10376 rp = VTOR4(vp); 10377 10378 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10379 error == EACCES) && 10380 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10381 if (!(rp->r_flags & R4OUTOFSPACE)) { 10382 mutex_enter(&rp->r_statelock); 10383 rp->r_flags |= R4OUTOFSPACE; 10384 mutex_exit(&rp->r_statelock); 10385 } 10386 flags |= B_ERROR; 10387 pvn_write_done(pp, flags); 10388 /* 10389 * If this was not an async thread, then try again to 10390 * write out the pages, but this time, also destroy 10391 * them whether or not the write is successful. This 10392 * will prevent memory from filling up with these 10393 * pages and destroying them is the only alternative 10394 * if they can't be written out. 10395 * 10396 * Don't do this if this is an async thread because 10397 * when the pages are unlocked in pvn_write_done, 10398 * some other thread could have come along, locked 10399 * them, and queued for an async thread. It would be 10400 * possible for all of the async threads to be tied 10401 * up waiting to lock the pages again and they would 10402 * all already be locked and waiting for an async 10403 * thread to handle them. Deadlock. 10404 */ 10405 if (!(flags & B_ASYNC)) { 10406 error = nfs4_putpage(vp, io_off, io_len, 10407 B_INVAL | B_FORCE, cr, NULL); 10408 } 10409 } else { 10410 if (error) 10411 flags |= B_ERROR; 10412 else if (rp->r_flags & R4OUTOFSPACE) { 10413 mutex_enter(&rp->r_statelock); 10414 rp->r_flags &= ~R4OUTOFSPACE; 10415 mutex_exit(&rp->r_statelock); 10416 } 10417 pvn_write_done(pp, flags); 10418 if (freemem < desfree) 10419 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10420 NFS4_WRITE_NOWAIT); 10421 } 10422 10423 return (error); 10424 } 10425 10426 #ifdef DEBUG 10427 int nfs4_force_open_before_mmap = 0; 10428 #endif 10429 10430 /* ARGSUSED */ 10431 static int 10432 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10433 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10434 caller_context_t *ct) 10435 { 10436 struct segvn_crargs vn_a; 10437 int error = 0; 10438 rnode4_t *rp = VTOR4(vp); 10439 mntinfo4_t *mi = VTOMI4(vp); 10440 10441 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10442 return (EIO); 10443 10444 if (vp->v_flag & VNOMAP) 10445 return (ENOSYS); 10446 10447 if (off < 0 || (off + len) < 0) 10448 return (ENXIO); 10449 10450 if (vp->v_type != VREG) 10451 return (ENODEV); 10452 10453 /* 10454 * If the file is delegated to the client don't do anything. 
10455 * If the file is not delegated, then validate the data cache.
10456 */
10457 mutex_enter(&rp->r_statev4_lock);
10458 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10459 mutex_exit(&rp->r_statev4_lock);
10460 error = nfs4_validate_caches(vp, cr);
10461 if (error)
10462 return (error);
10463 } else {
10464 mutex_exit(&rp->r_statev4_lock);
10465 }
10466 
10467 /*
10468 * Check to see if the vnode is currently marked as not cachable.
10469 * This means portions of the file are locked (through VOP_FRLOCK).
10470 * In this case the map request must be refused. We use
10471 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10472 *
10473 * Atomically increment r_inmap after acquiring r_rwlock. The
10474 * idea here is to acquire r_rwlock to block read/write and
10475 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10476 * that we are in nfs4_map(). Because r_rwlock is acquired here in
10477 * the proper order, we avoid the deadlock that would otherwise
10478 * occur if nfs4_addmap() acquired it out of order.
10479 *
10480 * Since we are not protecting r_inmap by any lock, we do not
10481 * hold any lock when we decrement it. We atomically decrement
10482 * r_inmap after we release r_lkserlock.
10483 */
10484 
10485 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10486 return (EINTR);
10487 atomic_add_int(&rp->r_inmap, 1);
10488 nfs_rw_exit(&rp->r_rwlock);
10489 
10490 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10491 atomic_add_int(&rp->r_inmap, -1);
10492 return (EINTR);
10493 }
10494 
10495 
10496 if (vp->v_flag & VNOCACHE) {
10497 error = EAGAIN;
10498 goto done;
10499 }
10500 
10501 /*
10502 * Don't allow concurrent locks and mapping if mandatory locking is
10503 * enabled.
10504 */
10505 if (flk_has_remote_locks(vp)) {
10506 struct vattr va;
10507 va.va_mask = AT_MODE;
10508 error = nfs4getattr(vp, &va, cr);
10509 if (error != 0)
10510 goto done;
10511 if (MANDLOCK(vp, va.va_mode)) {
10512 error = EAGAIN;
10513 goto done;
10514 }
10515 }
10516 
10517 /*
10518 * It is possible that the rnode has a lost lock request that we
10519 * are still trying to recover, and that the request conflicts with
10520 * this map request.
10521 *
10522 * An alternative approach would be for nfs4_safemap() to consider
10523 * queued lock requests when deciding whether to set or clear
10524 * VNOCACHE. This would require the frlock code path to call
10525 * nfs4_safemap() after enqueuing a lost request.
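 *
 * Until something like that is done, nfs4_map_lost_lock_conflict()
 * is checked directly below and the mapping is refused with EAGAIN,
 * the same answer given for the VNOCACHE case above.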
10526 */
10527 if (nfs4_map_lost_lock_conflict(vp)) {
10528 error = EAGAIN;
10529 goto done;
10530 }
10531 
10532 as_rangelock(as);
10533 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10534 if (error != 0) {
10535 as_rangeunlock(as);
10536 goto done;
10537 }
10538 
10539 if (vp->v_type == VREG) {
10540 /*
10541 * We need to retrieve the open stream
10542 */
10543 nfs4_open_stream_t *osp = NULL;
10544 nfs4_open_owner_t *oop = NULL;
10545 
10546 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10547 if (oop != NULL) {
10548 /* returns with 'os_sync_lock' held */
10549 osp = find_open_stream(oop, rp);
10550 open_owner_rele(oop);
10551 }
10552 if (osp == NULL) {
10553 #ifdef DEBUG
10554 if (nfs4_force_open_before_mmap) {
10555 error = EIO;
10556 goto done;
10557 }
10558 #endif
10559 /* returns with 'os_sync_lock' held */
10560 error = open_and_get_osp(vp, cr, &osp);
10561 if (osp == NULL) {
10562 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10563 "nfs4_map: we tried to OPEN the file "
10564 "but again no osp, so fail with EIO"));
10565 goto done;
10566 }
10567 }
10568 
10569 if (osp->os_failed_reopen) {
10570 mutex_exit(&osp->os_sync_lock);
10571 open_stream_rele(osp, rp);
10572 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10573 "nfs4_map: os_failed_reopen set on "
10574 "osp %p, cr %p, rp %s", (void *)osp,
10575 (void *)cr, rnode4info(rp)));
10576 error = EIO;
10577 goto done;
10578 }
10579 mutex_exit(&osp->os_sync_lock);
10580 open_stream_rele(osp, rp);
10581 }
10582 
10583 vn_a.vp = vp;
10584 vn_a.offset = off;
10585 vn_a.type = (flags & MAP_TYPE);
10586 vn_a.prot = (uchar_t)prot;
10587 vn_a.maxprot = (uchar_t)maxprot;
10588 vn_a.flags = (flags & ~MAP_TYPE);
10589 vn_a.cred = cr;
10590 vn_a.amp = NULL;
10591 vn_a.szc = 0;
10592 vn_a.lgrp_mem_policy_flags = 0;
10593 
10594 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10595 as_rangeunlock(as);
10596 
10597 done:
10598 nfs_rw_exit(&rp->r_lkserlock);
10599 atomic_add_int(&rp->r_inmap, -1);
10600 return (error);
10601 }
10602 
10603 /*
10604 * We're most likely dealing with a kernel module that likes to READ
10605 * and mmap without OPENing the file (i.e., lookup/read/mmap), so let's
10606 * officially OPEN the file to create the necessary client state
10607 * for bookkeeping of os_mmap_read/write counts.
10608 *
10609 * Since VOP_MAP only passes in a pointer to the vnode rather than
10610 * a double pointer, we can't handle the case where nfs4open_otw()
10611 * returns a different vnode than the one passed into VOP_MAP (since
10612 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10613 * we return NULL and let nfs4_map() fail. Note: the only case where
10614 * this should happen is if the file got removed and replaced with the
10615 * same name on the server (in addition to the fact that we're trying
10616 * to VOP_MAP without VOP_OPENing the file in the first place).
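 *
 * A typical sequence from such a module, sketched here only for
 * illustration, looks like:
 *
 *	VOP_LOOKUP(dvp, name, &vp, ...);
 *	VOP_READ(vp, &uio, ...);
 *	VOP_MAP(vp, ...);	<- no VOP_OPEN was ever issued
 *
 * so this routine performs the missing OPEN over the wire on the
 * caller's behalf.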
10617 */ 10618 static int 10619 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10620 { 10621 rnode4_t *rp, *drp; 10622 vnode_t *dvp, *open_vp; 10623 char file_name[MAXNAMELEN]; 10624 int just_created; 10625 nfs4_open_stream_t *osp; 10626 nfs4_open_owner_t *oop; 10627 int error; 10628 10629 *ospp = NULL; 10630 open_vp = map_vp; 10631 10632 rp = VTOR4(open_vp); 10633 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10634 return (error); 10635 drp = VTOR4(dvp); 10636 10637 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10638 VN_RELE(dvp); 10639 return (EINTR); 10640 } 10641 10642 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10643 nfs_rw_exit(&drp->r_rwlock); 10644 VN_RELE(dvp); 10645 return (error); 10646 } 10647 10648 mutex_enter(&rp->r_statev4_lock); 10649 if (rp->created_v4) { 10650 rp->created_v4 = 0; 10651 mutex_exit(&rp->r_statev4_lock); 10652 10653 dnlc_update(dvp, file_name, open_vp); 10654 /* This is needed so we don't bump the open ref count */ 10655 just_created = 1; 10656 } else { 10657 mutex_exit(&rp->r_statev4_lock); 10658 just_created = 0; 10659 } 10660 10661 VN_HOLD(map_vp); 10662 10663 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10664 just_created); 10665 if (error) { 10666 nfs_rw_exit(&drp->r_rwlock); 10667 VN_RELE(dvp); 10668 VN_RELE(map_vp); 10669 return (error); 10670 } 10671 10672 nfs_rw_exit(&drp->r_rwlock); 10673 VN_RELE(dvp); 10674 10675 /* 10676 * If nfs4open_otw() returned a different vnode then "undo" 10677 * the open and return failure to the caller. 10678 */ 10679 if (!VN_CMP(open_vp, map_vp)) { 10680 nfs4_error_t e; 10681 10682 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10683 "open returned a different vnode")); 10684 /* 10685 * If there's an error, ignore it, 10686 * and let VOP_INACTIVE handle it. 10687 */ 10688 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10689 CLOSE_NORM, 0, 0, 0); 10690 VN_RELE(map_vp); 10691 return (EIO); 10692 } 10693 10694 VN_RELE(map_vp); 10695 10696 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10697 if (!oop) { 10698 nfs4_error_t e; 10699 10700 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10701 "no open owner")); 10702 /* 10703 * If there's an error, ignore it, 10704 * and let VOP_INACTIVE handle it. 10705 */ 10706 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10707 CLOSE_NORM, 0, 0, 0); 10708 return (EIO); 10709 } 10710 osp = find_open_stream(oop, rp); 10711 open_owner_rele(oop); 10712 *ospp = osp; 10713 return (0); 10714 } 10715 10716 /* 10717 * Please be aware that when this function is called, the address space write 10718 * a_lock is held. Do not put over the wire calls in this function. 10719 */ 10720 /* ARGSUSED */ 10721 static int 10722 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10723 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10724 caller_context_t *ct) 10725 { 10726 rnode4_t *rp; 10727 int error = 0; 10728 mntinfo4_t *mi; 10729 10730 mi = VTOMI4(vp); 10731 rp = VTOR4(vp); 10732 10733 if (nfs_zone() != mi->mi_zone) 10734 return (EIO); 10735 if (vp->v_flag & VNOMAP) 10736 return (ENOSYS); 10737 10738 /* 10739 * Don't need to update the open stream first, since this 10740 * mmap can't add any additional share access that isn't 10741 * already contained in the open stream (for the case where we 10742 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10743 * take into account os_mmap_read[write] counts). 
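 *
 * Note that btopr() rounds the byte count up to whole pages, so
 * r_mapcnt and the os_mmap_* counts below are all maintained in
 * units of pages.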
10744 */ 10745 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10746 10747 if (vp->v_type == VREG) { 10748 /* 10749 * We need to retrieve the open stream and update the counts. 10750 * If there is no open stream here, something is wrong. 10751 */ 10752 nfs4_open_stream_t *osp = NULL; 10753 nfs4_open_owner_t *oop = NULL; 10754 10755 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10756 if (oop != NULL) { 10757 /* returns with 'os_sync_lock' held */ 10758 osp = find_open_stream(oop, rp); 10759 open_owner_rele(oop); 10760 } 10761 if (osp == NULL) { 10762 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10763 "nfs4_addmap: we should have an osp" 10764 "but we don't, so fail with EIO")); 10765 error = EIO; 10766 goto out; 10767 } 10768 10769 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10770 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10771 10772 /* 10773 * Update the map count in the open stream. 10774 * This is necessary in the case where we 10775 * open/mmap/close/, then the server reboots, and we 10776 * attempt to reopen. If the mmap doesn't add share 10777 * access then we send an invalid reopen with 10778 * access = NONE. 10779 * 10780 * We need to specifically check each PROT_* so a mmap 10781 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10782 * read and write access. A simple comparison of prot 10783 * to ~PROT_WRITE to determine read access is insufficient 10784 * since prot can be |= with PROT_USER, etc. 10785 */ 10786 10787 /* 10788 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10789 */ 10790 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10791 osp->os_mmap_write += btopr(len); 10792 if (maxprot & PROT_READ) 10793 osp->os_mmap_read += btopr(len); 10794 if (maxprot & PROT_EXEC) 10795 osp->os_mmap_read += btopr(len); 10796 /* 10797 * Ensure that os_mmap_read gets incremented, even if 10798 * maxprot were to look like PROT_NONE. 10799 */ 10800 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10801 !(maxprot & PROT_EXEC)) 10802 osp->os_mmap_read += btopr(len); 10803 osp->os_mapcnt += btopr(len); 10804 mutex_exit(&osp->os_sync_lock); 10805 open_stream_rele(osp, rp); 10806 } 10807 10808 out: 10809 /* 10810 * If we got an error, then undo our 10811 * incrementing of 'r_mapcnt'. 10812 */ 10813 10814 if (error) { 10815 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10816 ASSERT(rp->r_mapcnt >= 0); 10817 } 10818 return (error); 10819 } 10820 10821 /* ARGSUSED */ 10822 static int 10823 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10824 { 10825 10826 return (VTOR4(vp1) == VTOR4(vp2)); 10827 } 10828 10829 /* ARGSUSED */ 10830 static int 10831 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10832 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10833 caller_context_t *ct) 10834 { 10835 int rc; 10836 u_offset_t start, end; 10837 rnode4_t *rp; 10838 int error = 0, intr = INTR4(vp); 10839 nfs4_error_t e; 10840 10841 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10842 return (EIO); 10843 10844 /* check for valid cmd parameter */ 10845 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10846 return (EINVAL); 10847 10848 /* Verify l_type. 
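 * For F_UNLCK, intr is cleared below so the unlock itself cannot be
 * interrupted; an interrupted unlock could leave an orphaned lock
 * behind on the server.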
*/ 10849 switch (bfp->l_type) { 10850 case F_RDLCK: 10851 if (cmd != F_GETLK && !(flag & FREAD)) 10852 return (EBADF); 10853 break; 10854 case F_WRLCK: 10855 if (cmd != F_GETLK && !(flag & FWRITE)) 10856 return (EBADF); 10857 break; 10858 case F_UNLCK: 10859 intr = 0; 10860 break; 10861 10862 default: 10863 return (EINVAL); 10864 } 10865 10866 /* check the validity of the lock range */ 10867 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10868 return (rc); 10869 if (rc = flk_check_lock_data(start, end, MAXEND)) 10870 return (rc); 10871 10872 /* 10873 * If the filesystem is mounted using local locking, pass the 10874 * request off to the local locking code. 10875 */ 10876 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10877 if (cmd == F_SETLK || cmd == F_SETLKW) { 10878 /* 10879 * For complete safety, we should be holding 10880 * r_lkserlock. However, we can't call 10881 * nfs4_safelock and then fs_frlock while 10882 * holding r_lkserlock, so just invoke 10883 * nfs4_safelock and expect that this will 10884 * catch enough of the cases. 10885 */ 10886 if (!nfs4_safelock(vp, bfp, cr)) 10887 return (EAGAIN); 10888 } 10889 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10890 } 10891 10892 rp = VTOR4(vp); 10893 10894 /* 10895 * Check whether the given lock request can proceed, given the 10896 * current file mappings. 10897 */ 10898 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10899 return (EINTR); 10900 if (cmd == F_SETLK || cmd == F_SETLKW) { 10901 if (!nfs4_safelock(vp, bfp, cr)) { 10902 rc = EAGAIN; 10903 goto done; 10904 } 10905 } 10906 10907 /* 10908 * Flush the cache after waiting for async I/O to finish. For new 10909 * locks, this is so that the process gets the latest bits from the 10910 * server. For unlocks, this is so that other clients see the 10911 * latest bits once the file has been unlocked. If currently dirty 10912 * pages can't be flushed, then don't allow a lock to be set. But 10913 * allow unlocks to succeed, to avoid having orphan locks on the 10914 * server. 10915 */ 10916 if (cmd != F_GETLK) { 10917 mutex_enter(&rp->r_statelock); 10918 while (rp->r_count > 0) { 10919 if (intr) { 10920 klwp_t *lwp = ttolwp(curthread); 10921 10922 if (lwp != NULL) 10923 lwp->lwp_nostop++; 10924 if (cv_wait_sig(&rp->r_cv, 10925 &rp->r_statelock) == 0) { 10926 if (lwp != NULL) 10927 lwp->lwp_nostop--; 10928 rc = EINTR; 10929 break; 10930 } 10931 if (lwp != NULL) 10932 lwp->lwp_nostop--; 10933 } else 10934 cv_wait(&rp->r_cv, &rp->r_statelock); 10935 } 10936 mutex_exit(&rp->r_statelock); 10937 if (rc != 0) 10938 goto done; 10939 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 10940 if (error) { 10941 if (error == ENOSPC || error == EDQUOT) { 10942 mutex_enter(&rp->r_statelock); 10943 if (!rp->r_error) 10944 rp->r_error = error; 10945 mutex_exit(&rp->r_statelock); 10946 } 10947 if (bfp->l_type != F_UNLCK) { 10948 rc = ENOLCK; 10949 goto done; 10950 } 10951 } 10952 } 10953 10954 /* 10955 * Call the lock manager to do the real work of contacting 10956 * the server and obtaining the lock. 10957 */ 10958 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10959 cr, &e, NULL, NULL); 10960 rc = e.error; 10961 10962 if (rc == 0) 10963 nfs4_lockcompletion(vp, cmd); 10964 10965 done: 10966 nfs_rw_exit(&rp->r_lkserlock); 10967 10968 return (rc); 10969 } 10970 10971 /* 10972 * Free storage space associated with the specified vnode. 
The portion 10973 * to be freed is specified by bfp->l_start and bfp->l_len (already 10974 * normalized to a "whence" of 0). 10975 * 10976 * This is an experimental facility whose continued existence is not 10977 * guaranteed. Currently, we only support the special case 10978 * of l_len == 0, meaning free to end of file. 10979 */ 10980 /* ARGSUSED */ 10981 static int 10982 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10983 offset_t offset, cred_t *cr, caller_context_t *ct) 10984 { 10985 int error; 10986 10987 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10988 return (EIO); 10989 ASSERT(vp->v_type == VREG); 10990 if (cmd != F_FREESP) 10991 return (EINVAL); 10992 10993 error = convoff(vp, bfp, 0, offset); 10994 if (!error) { 10995 ASSERT(bfp->l_start >= 0); 10996 if (bfp->l_len == 0) { 10997 struct vattr va; 10998 10999 va.va_mask = AT_SIZE; 11000 va.va_size = bfp->l_start; 11001 error = nfs4setattr(vp, &va, 0, cr, NULL); 11002 } else 11003 error = EINVAL; 11004 } 11005 11006 return (error); 11007 } 11008 11009 /* ARGSUSED */ 11010 int 11011 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 11012 { 11013 rnode4_t *rp; 11014 rp = VTOR4(vp); 11015 11016 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 11017 vp = RTOV4(rp); 11018 } 11019 *vpp = vp; 11020 return (0); 11021 } 11022 11023 /* 11024 * Setup and add an address space callback to do the work of the delmap call. 11025 * The callback will (and must be) deleted in the actual callback function. 11026 * 11027 * This is done in order to take care of the problem that we have with holding 11028 * the address space's a_lock for a long period of time (e.g. if the NFS server 11029 * is down). Callbacks will be executed in the address space code while the 11030 * a_lock is not held. Holding the address space's a_lock causes things such 11031 * as ps and fork to hang because they are trying to acquire this lock as well. 11032 */ 11033 /* ARGSUSED */ 11034 static int 11035 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 11036 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 11037 caller_context_t *ct) 11038 { 11039 int caller_found; 11040 int error; 11041 rnode4_t *rp; 11042 nfs4_delmap_args_t *dmapp; 11043 nfs4_delmapcall_t *delmap_call; 11044 11045 if (vp->v_flag & VNOMAP) 11046 return (ENOSYS); 11047 11048 /* 11049 * A process may not change zones if it has NFS pages mmap'ed 11050 * in, so we can't legitimately get here from the wrong zone. 11051 */ 11052 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11053 11054 rp = VTOR4(vp); 11055 11056 /* 11057 * The way that the address space of this process deletes its mapping 11058 * of this file is via the following call chains: 11059 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11060 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11061 * 11062 * With the use of address space callbacks we are allowed to drop the 11063 * address space lock, a_lock, while executing the NFS operations that 11064 * need to go over the wire. Returning EAGAIN to the caller of this 11065 * function is what drives the execution of the callback that we add 11066 * below. The callback will be executed by the address space code 11067 * after dropping the a_lock. When the callback is finished, since 11068 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11069 * is called again on the same segment to finish the rest of the work 11070 * that needs to happen during unmapping. 
11071 * 11072 * This action of calling back into the segment driver causes 11073 * nfs4_delmap() to get called again, but since the callback was 11074 * already executed at this point, it already did the work and there 11075 * is nothing left for us to do. 11076 * 11077 * To Summarize: 11078 * - The first time nfs4_delmap is called by the current thread is when 11079 * we add the caller associated with this delmap to the delmap caller 11080 * list, add the callback, and return EAGAIN. 11081 * - The second time in this call chain when nfs4_delmap is called we 11082 * will find this caller in the delmap caller list and realize there 11083 * is no more work to do thus removing this caller from the list and 11084 * returning the error that was set in the callback execution. 11085 */ 11086 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11087 if (caller_found) { 11088 /* 11089 * 'error' is from the actual delmap operations. To avoid 11090 * hangs, we need to handle the return of EAGAIN differently 11091 * since this is what drives the callback execution. 11092 * In this case, we don't want to return EAGAIN and do the 11093 * callback execution because there are none to execute. 11094 */ 11095 if (error == EAGAIN) 11096 return (0); 11097 else 11098 return (error); 11099 } 11100 11101 /* current caller was not in the list */ 11102 delmap_call = nfs4_init_delmapcall(); 11103 11104 mutex_enter(&rp->r_statelock); 11105 list_insert_tail(&rp->r_indelmap, delmap_call); 11106 mutex_exit(&rp->r_statelock); 11107 11108 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11109 11110 dmapp->vp = vp; 11111 dmapp->off = off; 11112 dmapp->addr = addr; 11113 dmapp->len = len; 11114 dmapp->prot = prot; 11115 dmapp->maxprot = maxprot; 11116 dmapp->flags = flags; 11117 dmapp->cr = cr; 11118 dmapp->caller = delmap_call; 11119 11120 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11121 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11122 11123 return (error ? error : EAGAIN); 11124 } 11125 11126 static nfs4_delmapcall_t * 11127 nfs4_init_delmapcall() 11128 { 11129 nfs4_delmapcall_t *delmap_call; 11130 11131 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11132 delmap_call->call_id = curthread; 11133 delmap_call->error = 0; 11134 11135 return (delmap_call); 11136 } 11137 11138 static void 11139 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11140 { 11141 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11142 } 11143 11144 /* 11145 * Searches for the current delmap caller (based on curthread) in the list of 11146 * callers. If it is found, we remove it and free the delmap caller. 11147 * Returns: 11148 * 0 if the caller wasn't found 11149 * 1 if the caller was found, removed and freed. *errp will be set 11150 * to what the result of the delmap was. 11151 */ 11152 static int 11153 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11154 { 11155 nfs4_delmapcall_t *delmap_call; 11156 11157 /* 11158 * If the list doesn't exist yet, we create it and return 11159 * that the caller wasn't found. No list = no callers. 
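 *
 * (The list is created lazily, on the first call for a given rnode,
 * presumably because most rnodes never see a delmap; R4DELMAPLIST
 * records that it now exists.)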
11160 */ 11161 mutex_enter(&rp->r_statelock); 11162 if (!(rp->r_flags & R4DELMAPLIST)) { 11163 /* The list does not exist */ 11164 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11165 offsetof(nfs4_delmapcall_t, call_node)); 11166 rp->r_flags |= R4DELMAPLIST; 11167 mutex_exit(&rp->r_statelock); 11168 return (0); 11169 } else { 11170 /* The list exists so search it */ 11171 for (delmap_call = list_head(&rp->r_indelmap); 11172 delmap_call != NULL; 11173 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11174 if (delmap_call->call_id == curthread) { 11175 /* current caller is in the list */ 11176 *errp = delmap_call->error; 11177 list_remove(&rp->r_indelmap, delmap_call); 11178 mutex_exit(&rp->r_statelock); 11179 nfs4_free_delmapcall(delmap_call); 11180 return (1); 11181 } 11182 } 11183 } 11184 mutex_exit(&rp->r_statelock); 11185 return (0); 11186 } 11187 11188 /* 11189 * Remove some pages from an mmap'd vnode. Just update the 11190 * count of pages. If doing close-to-open, then flush and 11191 * commit all of the pages associated with this file. 11192 * Otherwise, start an asynchronous page flush to write out 11193 * any dirty pages. This will also associate a credential 11194 * with the rnode which can be used to write the pages. 11195 */ 11196 /* ARGSUSED */ 11197 static void 11198 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11199 { 11200 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11201 rnode4_t *rp; 11202 mntinfo4_t *mi; 11203 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11204 11205 rp = VTOR4(dmapp->vp); 11206 mi = VTOMI4(dmapp->vp); 11207 11208 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11209 ASSERT(rp->r_mapcnt >= 0); 11210 11211 /* 11212 * Initiate a page flush and potential commit if there are 11213 * pages, the file system was not mounted readonly, the segment 11214 * was mapped shared, and the pages themselves were writeable. 11215 */ 11216 if (nfs4_has_pages(dmapp->vp) && 11217 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11218 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11219 mutex_enter(&rp->r_statelock); 11220 rp->r_flags |= R4DIRTY; 11221 mutex_exit(&rp->r_statelock); 11222 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11223 dmapp->len, dmapp->cr); 11224 if (!e.error) { 11225 mutex_enter(&rp->r_statelock); 11226 e.error = rp->r_error; 11227 rp->r_error = 0; 11228 mutex_exit(&rp->r_statelock); 11229 } 11230 } else 11231 e.error = 0; 11232 11233 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11234 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11235 B_INVAL, dmapp->cr, NULL); 11236 11237 if (e.error) { 11238 e.stat = puterrno4(e.error); 11239 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11240 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11241 dmapp->caller->error = e.error; 11242 } 11243 11244 /* Check to see if we need to close the file */ 11245 11246 if (dmapp->vp->v_type == VREG) { 11247 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11248 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11249 11250 if (e.error != 0 || e.stat != NFS4_OK) { 11251 /* 11252 * Since it is possible that e.error == 0 and 11253 * e.stat != NFS4_OK (and vice versa), 11254 * we do the proper checking in order to get both 11255 * e.error and e.stat reporting the correct info. 
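 *
 * puterrno4() maps a UNIX errno to an nfsstat4 and geterrno4() does
 * the reverse, so whichever of the two fields is unset is simply
 * derived from the other.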
11256 */
11257 if (e.stat == NFS4_OK)
11258 e.stat = puterrno4(e.error);
11259 if (e.error == 0)
11260 e.error = geterrno4(e.stat);
11261 
11262 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11263 OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11264 dmapp->caller->error = e.error;
11265 }
11266 }
11267 
11268 (void) as_delete_callback(as, arg);
11269 kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11270 }
11271 
11272 
/*
 * Return the 1-based position of the most significant bit set in ll,
 * i.e. the number of bits needed to represent the server's advertised
 * maxfilesize; used below to answer the _PC_FILESIZEBITS query.
 */
11273 static uint_t
11274 fattr4_maxfilesize_to_bits(uint64_t ll)
11275 {
11276 uint_t l = 1;
11277 
11278 if (ll == 0) {
11279 return (0);
11280 }
11281 
11282 if (ll & 0xffffffff00000000) {
11283 l += 32; ll >>= 32;
11284 }
11285 if (ll & 0xffff0000) {
11286 l += 16; ll >>= 16;
11287 }
11288 if (ll & 0xff00) {
11289 l += 8; ll >>= 8;
11290 }
11291 if (ll & 0xf0) {
11292 l += 4; ll >>= 4;
11293 }
11294 if (ll & 0xc) {
11295 l += 2; ll >>= 2;
11296 }
11297 if (ll & 0x2) {
11298 l += 1;
11299 }
11300 return (l);
11301 }
11302 
11303 static int
11304 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11305 {
11306 vnode_t *avp = NULL;
11307 int error;
11308 
11309 if ((error = nfs4lookup_xattr(vp, "", &avp,
11310 LOOKUP_XATTR, cr)) == 0)
11311 error = do_xattr_exists_check(avp, valp, cr);
11312 if (avp)
11313 VN_RELE(avp);
11314 
11315 return (error);
11316 }
11317 
11318 /* ARGSUSED */
11319 int
11320 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
11321 caller_context_t *ct)
11322 {
11323 int error;
11324 hrtime_t t;
11325 rnode4_t *rp;
11326 nfs4_ga_res_t gar;
11327 nfs4_ga_ext_res_t ger;
11328 
11329 gar.n4g_ext_res = &ger;
11330 
11331 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11332 return (EIO);
11333 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11334 *valp = MAXPATHLEN;
11335 return (0);
11336 }
11337 if (cmd == _PC_ACL_ENABLED) {
11338 *valp = _ACL_ACE_ENABLED;
11339 return (0);
11340 }
11341 
11342 rp = VTOR4(vp);
11343 if (cmd == _PC_XATTR_EXISTS) {
11344 /*
11345 * The existence of the xattr directory is not sufficient
11346 * for determining whether generic user attributes exist.
11347 * The attribute directory could only be a transient directory
11348 * used for Solaris sysattr support. Do a small readdir
11349 * to verify if the only entries are sysattrs or not.
11350 *
11351 * pc4_xattr_valid can only be trusted when r_xattr_dir
11352 * is NULL. Once the xadir vp exists, we can create xattrs,
11353 * and we don't have any way to update the "base" object's
11354 * pc4_xattr_exists from the xattr or xadir. Maybe FEM
11355 * could help out.
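 *
 * nfs4_have_xattrs() above implements that check: it locates the
 * xattr directory via LOOKUP_XATTR and then lets
 * do_xattr_exists_check() decide whether any real user attributes
 * are present.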
11356 */ 11357 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11358 rp->r_xattr_dir == NULL) { 11359 return (nfs4_have_xattrs(vp, valp, cr)); 11360 } 11361 } else { /* OLD CODE */ 11362 if (ATTRCACHE4_VALID(vp)) { 11363 mutex_enter(&rp->r_statelock); 11364 if (rp->r_pathconf.pc4_cache_valid) { 11365 error = 0; 11366 switch (cmd) { 11367 case _PC_FILESIZEBITS: 11368 *valp = 11369 rp->r_pathconf.pc4_filesizebits; 11370 break; 11371 case _PC_LINK_MAX: 11372 *valp = 11373 rp->r_pathconf.pc4_link_max; 11374 break; 11375 case _PC_NAME_MAX: 11376 *valp = 11377 rp->r_pathconf.pc4_name_max; 11378 break; 11379 case _PC_CHOWN_RESTRICTED: 11380 *valp = 11381 rp->r_pathconf.pc4_chown_restricted; 11382 break; 11383 case _PC_NO_TRUNC: 11384 *valp = 11385 rp->r_pathconf.pc4_no_trunc; 11386 break; 11387 default: 11388 error = EINVAL; 11389 break; 11390 } 11391 mutex_exit(&rp->r_statelock); 11392 #ifdef DEBUG 11393 nfs4_pathconf_cache_hits++; 11394 #endif 11395 return (error); 11396 } 11397 mutex_exit(&rp->r_statelock); 11398 } 11399 } 11400 #ifdef DEBUG 11401 nfs4_pathconf_cache_misses++; 11402 #endif 11403 11404 t = gethrtime(); 11405 11406 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11407 11408 if (error) { 11409 mutex_enter(&rp->r_statelock); 11410 rp->r_pathconf.pc4_cache_valid = FALSE; 11411 rp->r_pathconf.pc4_xattr_valid = FALSE; 11412 mutex_exit(&rp->r_statelock); 11413 return (error); 11414 } 11415 11416 /* interpret the max filesize */ 11417 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11418 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11419 11420 /* Store the attributes we just received */ 11421 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11422 11423 switch (cmd) { 11424 case _PC_FILESIZEBITS: 11425 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11426 break; 11427 case _PC_LINK_MAX: 11428 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11429 break; 11430 case _PC_NAME_MAX: 11431 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11432 break; 11433 case _PC_CHOWN_RESTRICTED: 11434 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11435 break; 11436 case _PC_NO_TRUNC: 11437 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11438 break; 11439 case _PC_XATTR_EXISTS: 11440 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11441 if (error = nfs4_have_xattrs(vp, valp, cr)) 11442 return (error); 11443 } 11444 break; 11445 default: 11446 return (EINVAL); 11447 } 11448 11449 return (0); 11450 } 11451 11452 /* 11453 * Called by async thread to do synchronous pageio. Do the i/o, wait 11454 * for it to complete, and cleanup the page list when done. 11455 */ 11456 static int 11457 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11458 int flags, cred_t *cr) 11459 { 11460 int error; 11461 11462 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11463 11464 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11465 if (flags & B_READ) 11466 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11467 else 11468 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags);
11469 return (error);
11470 }
11471 
11472 /* ARGSUSED */
11473 static int
11474 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11475 int flags, cred_t *cr, caller_context_t *ct)
11476 {
11477 int error;
11478 rnode4_t *rp;
11479 
11480 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11481 return (EIO);
11482 
11483 if (pp == NULL)
11484 return (EINVAL);
11485 
11486 rp = VTOR4(vp);
11487 mutex_enter(&rp->r_statelock);
11488 rp->r_count++;
11489 mutex_exit(&rp->r_statelock);
11490 
11491 if (flags & B_ASYNC) {
11492 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11493 nfs4_sync_pageio);
11494 } else
11495 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11496 mutex_enter(&rp->r_statelock);
11497 rp->r_count--;
11498 cv_broadcast(&rp->r_cv);
11499 mutex_exit(&rp->r_statelock);
11500 return (error);
11501 }
11502 
11503 /* ARGSUSED */
11504 static void
11505 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
11506 caller_context_t *ct)
11507 {
11508 int error;
11509 rnode4_t *rp;
11510 page_t *plist;
11511 page_t *pptr;
11512 offset3 offset;
11513 count3 len;
11514 k_sigset_t smask;
11515 
11516 /*
11517 * We should get called with fl equal to either B_FREE or
11518 * B_INVAL. Any other value is illegal.
11519 *
11520 * The page that we are either supposed to free or destroy
11521 * should be exclusive locked and its io lock should not
11522 * be held.
11523 */
11524 ASSERT(fl == B_FREE || fl == B_INVAL);
11525 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
11526 
11527 rp = VTOR4(vp);
11528 
11529 /*
11530 * If the page doesn't need to be committed or we shouldn't
11531 * even bother attempting to commit it, then just make sure
11532 * that the p_fsdata byte is clear and then either free or
11533 * destroy the page as appropriate.
11534 */
11535 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
11536 pp->p_fsdata = C_NOCOMMIT;
11537 if (fl == B_FREE)
11538 page_free(pp, dn);
11539 else
11540 page_destroy(pp, dn);
11541 return;
11542 }
11543 
11544 /*
11545 * If there is a page invalidation operation going on, then
11546 * if this is one of the pages being destroyed, then just
11547 * clear the p_fsdata byte and then either free or destroy
11548 * the page as appropriate.
11549 */
11550 mutex_enter(&rp->r_statelock);
11551 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
11552 mutex_exit(&rp->r_statelock);
11553 pp->p_fsdata = C_NOCOMMIT;
11554 if (fl == B_FREE)
11555 page_free(pp, dn);
11556 else
11557 page_destroy(pp, dn);
11558 return;
11559 }
11560 
11561 /*
11562 * If we are freeing this page and someone else is already
11563 * waiting to do a commit, then just unlock the page and
11564 * return. That other thread will take care of committing
11565 * this page. The page can be freed sometime after the
11566 * commit has finished. Otherwise, if the page is marked
11567 * as delay commit, then we may be getting called from
11568 * pvn_write_done, one page at a time. This could result
11569 * in one commit per page, so we end up doing lots of small
11570 * commits instead of fewer larger commits. This is bad,
11571 * we want to do as few commits as possible.
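 *
 * Marking the page C_COMMIT below (rather than committing it right
 * away) keeps it eligible for the next batched commit, which is how
 * the flood of small commits described above is avoided.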
11572 */
11573 if (fl == B_FREE) {
11574 if (rp->r_flags & R4COMMITWAIT) {
11575 page_unlock(pp);
11576 mutex_exit(&rp->r_statelock);
11577 return;
11578 }
11579 if (pp->p_fsdata == C_DELAYCOMMIT) {
11580 pp->p_fsdata = C_COMMIT;
11581 page_unlock(pp);
11582 mutex_exit(&rp->r_statelock);
11583 return;
11584 }
11585 }
11586 
11587 /*
11588 * Check to see if there is a signal which would prevent an
11589 * attempt to commit the pages from being successful. If so,
11590 * then don't bother with all of the work to gather pages and
11591 * generate the unsuccessful RPC. Just return from here and
11592 * let the page be committed at some later time.
11593 */
11594 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
11595 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
11596 sigunintr(&smask);
11597 page_unlock(pp);
11598 mutex_exit(&rp->r_statelock);
11599 return;
11600 }
11601 sigunintr(&smask);
11602 
11603 /*
11604 * We are starting to need to commit pages, so let's try
11605 * to commit as many as possible at once to reduce the
11606 * overhead.
11607 *
11608 * Set the `commit inprogress' state bit. We must
11609 * first wait until any current one finishes. Then
11610 * we initialize the c_pages list with this page.
11611 */
11612 while (rp->r_flags & R4COMMIT) {
11613 rp->r_flags |= R4COMMITWAIT;
11614 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11615 rp->r_flags &= ~R4COMMITWAIT;
11616 }
11617 rp->r_flags |= R4COMMIT;
11618 mutex_exit(&rp->r_statelock);
11619 ASSERT(rp->r_commit.c_pages == NULL);
11620 rp->r_commit.c_pages = pp;
11621 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11622 rp->r_commit.c_commlen = PAGESIZE;
11623 
11624 /*
11625 * Gather together all other pages which can be committed.
11626 * They will all be chained off r_commit.c_pages.
11627 */
11628 nfs4_get_commit(vp);
11629 
11630 /*
11631 * Clear the `commit inprogress' status and disconnect
11632 * the list of pages to be committed from the rnode.
11633 * At this same time, we also save the starting offset
11634 * and length of data to be committed on the server.
11635 */
11636 plist = rp->r_commit.c_pages;
11637 rp->r_commit.c_pages = NULL;
11638 offset = rp->r_commit.c_commbase;
11639 len = rp->r_commit.c_commlen;
11640 mutex_enter(&rp->r_statelock);
11641 rp->r_flags &= ~R4COMMIT;
11642 cv_broadcast(&rp->r_commit.c_cv);
11643 mutex_exit(&rp->r_statelock);
11644 
11645 if (curproc == proc_pageout || curproc == proc_fsflush ||
11646 nfs_zone() != VTOMI4(vp)->mi_zone) {
11647 nfs4_async_commit(vp, plist, offset, len,
11648 cr, do_nfs4_async_commit);
11649 return;
11650 }
11651 
11652 /*
11653 * Actually generate the over-the-wire COMMIT operation.
11654 */
11655 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);
11656 
11657 /*
11658 * If we got an error during the commit, just unlock all
11659 * of the pages. The pages will get retransmitted to the
11660 * server during a putpage operation.
11661 */
11662 if (error) {
11663 while (plist != NULL) {
11664 pptr = plist;
11665 page_sub(&plist, pptr);
11666 page_unlock(pptr);
11667 }
11668 return;
11669 }
11670 
11671 /*
11672 * We've tried as hard as we can to commit the data to stable
11673 * storage on the server. We just unlock the rest of the pages
11674 * and clear the commit required state. They will be put
11675 * onto the tail of the cachelist if they are no longer
11676 * mapped.
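 *
 * Note that pp itself is deliberately skipped by the loop below
 * (it runs while plist != pp); pp stays locked and is disposed of
 * at the end, after the hat_ismod() re-check.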
11677 */ 11678 while (plist != pp) { 11679 pptr = plist; 11680 page_sub(&plist, pptr); 11681 pptr->p_fsdata = C_NOCOMMIT; 11682 page_unlock(pptr); 11683 } 11684 11685 /* 11686 * It is possible that nfs4_commit didn't return error but 11687 * some other thread has modified the page we are going 11688 * to free/destroy. 11689 * In this case we need to rewrite the page. Do an explicit check 11690 * before attempting to free/destroy the page. If modified, needs to 11691 * be rewritten so unlock the page and return. 11692 */ 11693 if (hat_ismod(pp)) { 11694 pp->p_fsdata = C_NOCOMMIT; 11695 page_unlock(pp); 11696 return; 11697 } 11698 11699 /* 11700 * Now, as appropriate, either free or destroy the page 11701 * that we were called with. 11702 */ 11703 pp->p_fsdata = C_NOCOMMIT; 11704 if (fl == B_FREE) 11705 page_free(pp, dn); 11706 else 11707 page_destroy(pp, dn); 11708 } 11709 11710 /* 11711 * Commit requires that the current fh be the file written to. 11712 * The compound op structure is: 11713 * PUTFH(file), COMMIT 11714 */ 11715 static int 11716 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11717 { 11718 COMPOUND4args_clnt args; 11719 COMPOUND4res_clnt res; 11720 COMMIT4res *cm_res; 11721 nfs_argop4 argop[2]; 11722 nfs_resop4 *resop; 11723 int doqueue; 11724 mntinfo4_t *mi; 11725 rnode4_t *rp; 11726 cred_t *cred_otw = NULL; 11727 bool_t needrecov = FALSE; 11728 nfs4_recov_state_t recov_state; 11729 nfs4_open_stream_t *osp = NULL; 11730 bool_t first_time = TRUE; /* first time getting OTW cred */ 11731 bool_t last_time = FALSE; /* last time getting OTW cred */ 11732 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11733 11734 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11735 11736 rp = VTOR4(vp); 11737 11738 mi = VTOMI4(vp); 11739 recov_state.rs_flags = 0; 11740 recov_state.rs_num_retry_despite_err = 0; 11741 get_commit_cred: 11742 /* 11743 * Releases the osp, if a valid open stream is provided. 11744 * Puts a hold on the cred_otw and the new osp (if found). 
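 *
 * first_time and last_time track the progression through the
 * possible credentials; on an EACCES reply the code below jumps
 * back to get_commit_cred until last_time is set.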
11745 */
11746 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11747 &first_time, &last_time);
11748 args.ctag = TAG_COMMIT;
11749 recov_retry:
11750 /*
11751 * Commit ops: putfh file; commit
11752 */
11753 args.array_len = 2;
11754 args.array = argop;
11755 
11756 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11757 &recov_state, NULL);
11758 if (e.error) {
11759 crfree(cred_otw);
11760 if (osp != NULL)
11761 open_stream_rele(osp, rp);
11762 return (e.error);
11763 }
11764 
11765 /* putfh file */
11766 argop[0].argop = OP_CPUTFH;
11767 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11768 
11769 /* commit */
11770 argop[1].argop = OP_COMMIT;
11771 argop[1].nfs_argop4_u.opcommit.offset = offset;
11772 argop[1].nfs_argop4_u.opcommit.count = count;
11773 
11774 doqueue = 1;
11775 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11776 
11777 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
11778 if (!needrecov && e.error) {
11779 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11780 needrecov);
11781 crfree(cred_otw);
11782 if (e.error == EACCES && last_time == FALSE)
11783 goto get_commit_cred;
11784 if (osp != NULL)
11785 open_stream_rele(osp, rp);
11786 return (e.error);
11787 }
11788 
11789 if (needrecov) {
11790 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11791 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
11792 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11793 &recov_state, needrecov);
11794 if (!e.error)
11795 (void) xdr_free(xdr_COMPOUND4res_clnt,
11796 (caddr_t)&res);
11797 goto recov_retry;
11798 }
11799 if (e.error) {
11800 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11801 &recov_state, needrecov);
11802 crfree(cred_otw);
11803 if (osp != NULL)
11804 open_stream_rele(osp, rp);
11805 return (e.error);
11806 }
11807 /* fall through for res.status case */
11808 }
11809 
11810 if (res.status) {
11811 e.error = geterrno4(res.status);
11812 if (e.error == EACCES && last_time == FALSE) {
11813 crfree(cred_otw);
11814 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11815 &recov_state, needrecov);
11816 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11817 goto get_commit_cred;
11818 }
11819 /*
11820 * Can't do a nfs4_purge_stale_fh here because this
11821 * can cause a deadlock. nfs4_commit can
11822 * be called from nfs4_dispose which can be called
11823 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh
11824 * can call back to pvn_vplist_dirty.
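 *
 * Instead, the ESTALE case below just flags the rnode R4STALE,
 * records the error, and purges the attribute cache; the stale
 * file handle is then dealt with on a later, safer path.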
11825 */ 11826 if (e.error == ESTALE) { 11827 mutex_enter(&rp->r_statelock); 11828 rp->r_flags |= R4STALE; 11829 if (!rp->r_error) 11830 rp->r_error = e.error; 11831 mutex_exit(&rp->r_statelock); 11832 PURGE_ATTRCACHE4(vp); 11833 } else { 11834 mutex_enter(&rp->r_statelock); 11835 if (!rp->r_error) 11836 rp->r_error = e.error; 11837 mutex_exit(&rp->r_statelock); 11838 } 11839 } else { 11840 ASSERT(rp->r_flags & R4HAVEVERF); 11841 resop = &res.array[1]; /* commit res */ 11842 cm_res = &resop->nfs_resop4_u.opcommit; 11843 mutex_enter(&rp->r_statelock); 11844 if (cm_res->writeverf == rp->r_writeverf) { 11845 mutex_exit(&rp->r_statelock); 11846 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11847 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11848 &recov_state, needrecov); 11849 crfree(cred_otw); 11850 if (osp != NULL) 11851 open_stream_rele(osp, rp); 11852 return (0); 11853 } 11854 nfs4_set_mod(vp); 11855 rp->r_writeverf = cm_res->writeverf; 11856 mutex_exit(&rp->r_statelock); 11857 e.error = NFS_VERF_MISMATCH; 11858 } 11859 11860 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11861 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11862 crfree(cred_otw); 11863 if (osp != NULL) 11864 open_stream_rele(osp, rp); 11865 11866 return (e.error); 11867 } 11868 11869 static void 11870 nfs4_set_mod(vnode_t *vp) 11871 { 11872 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11873 11874 /* make sure we're looking at the master vnode, not a shadow */ 11875 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check); 11876 } 11877 11878 /* 11879 * This function is used to gather a page list of the pages which 11880 * can be committed on the server. 11881 * 11882 * The calling thread must have set R4COMMIT. This bit is used to 11883 * serialize access to the commit structure in the rnode. As long 11884 * as the thread has set R4COMMIT, then it can manipulate the commit 11885 * structure without requiring any other locks. 11886 * 11887 * When this function is called from nfs4_dispose() the page passed 11888 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11889 * will skip it. This is not a problem since we initially add the 11890 * page to the r_commit page list. 11891 * 11892 */ 11893 static void 11894 nfs4_get_commit(vnode_t *vp) 11895 { 11896 rnode4_t *rp; 11897 page_t *pp; 11898 kmutex_t *vphm; 11899 11900 rp = VTOR4(vp); 11901 11902 ASSERT(rp->r_flags & R4COMMIT); 11903 11904 /* make sure we're looking at the master vnode, not a shadow */ 11905 11906 if (IS_SHADOW(vp, rp)) 11907 vp = RTOV4(rp); 11908 11909 vphm = page_vnode_mutex(vp); 11910 mutex_enter(vphm); 11911 11912 /* 11913 * If there are no pages associated with this vnode, then 11914 * just return. 11915 */ 11916 if ((pp = vp->v_pages) == NULL) { 11917 mutex_exit(vphm); 11918 return; 11919 } 11920 11921 /* 11922 * Step through all of the pages associated with this vnode 11923 * looking for pages which need to be committed. 11924 */ 11925 do { 11926 /* Skip marker pages. */ 11927 if (pp->p_hash == PVN_VPLIST_HASH_TAG) 11928 continue; 11929 11930 /* 11931 * First short-cut everything (without the page_lock) 11932 * and see if this page does not need to be committed 11933 * or is modified if so then we'll just skip it. 11934 */ 11935 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11936 continue; 11937 11938 /* 11939 * Attempt to lock the page. 
If we can't, then
11940		 * someone else is messing with it, or we have been
11941		 * called from nfs4_dispose() and this is the page that
11942		 * nfs4_dispose() was called with; either way, just skip it.
11943		 */
11944		if (!page_trylock(pp, SE_EXCL))
11945			continue;
11946
11947		/*
11948		 * Let's check again now that we have the page lock.
11949		 */
11950		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11951			page_unlock(pp);
11952			continue;
11953		}
11954
11955		/* this had better not be a free page */
11956		ASSERT(PP_ISFREE(pp) == 0);
11957
11958		/*
11959		 * The page needs to be committed and we locked it.
11960		 * Update the base and length parameters and add it
11961		 * to r_pages.
11962		 */
11963		if (rp->r_commit.c_pages == NULL) {
11964			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11965			rp->r_commit.c_commlen = PAGESIZE;
11966		} else if (pp->p_offset < rp->r_commit.c_commbase) {
11967			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11968			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
11969			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11970		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11971		    <= pp->p_offset) {
11972			rp->r_commit.c_commlen = (offset3)pp->p_offset -
11973			    rp->r_commit.c_commbase + PAGESIZE;
11974		}
11975		page_add(&rp->r_commit.c_pages, pp);
11976	} while ((pp = pp->p_vpnext) != vp->v_pages);
11977
11978	mutex_exit(vphm);
11979 }
11980
11981 /*
11982  * This routine is used to gather a page list of the pages
11983  * which are to be committed on the server.  This routine must not
11984  * be called if the calling thread holds any locked pages.
11985  *
11986  * The calling thread must have set R4COMMIT.  This bit is used to
11987  * serialize access to the commit structure in the rnode.  As long
11988  * as the thread has set R4COMMIT, then it can manipulate the commit
11989  * structure without requiring any other locks.
11990  */
11991 static void
11992 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
11993 {
11994	rnode4_t *rp;
11995	page_t *pp;
11996	u_offset_t end;
11997	u_offset_t off;
11998
11999	ASSERT(len != 0);
12000	rp = VTOR4(vp);
12001	ASSERT(rp->r_flags & R4COMMIT);
12002
12003	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12004
12005	/* make sure we're looking at the master vnode, not a shadow */
12006
12007	if (IS_SHADOW(vp, rp))
12008		vp = RTOV4(rp);
12009
12010	/*
12011	 * If there are no pages associated with this vnode, then
12012	 * just return.
12013	 */
12014	if ((pp = vp->v_pages) == NULL)
12015		return;
12016	/*
12017	 * Calculate the ending offset.
12018	 */
12019	end = soff + len;
12020	for (off = soff; off < end; off += PAGESIZE) {
12021		/*
12022		 * Lookup each page by vp, offset.
12023		 */
12024		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
12025			continue;
12026		/*
12027		 * If this page does not need to be committed or is
12028		 * modified, then just skip it.
12029		 */
12030		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
12031			page_unlock(pp);
12032			continue;
12033		}
12034
12035		ASSERT(PP_ISFREE(pp) == 0);
12036		/*
12037		 * The page needs to be committed and we locked it.
12038		 * Update the base and length parameters and add it
12039		 * to r_pages.
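		 * Unlike nfs4_get_commit(), pages here are visited in
		 * ascending offset order, so the range only grows forward:
		 * c_commbase is set by the first page found and c_commlen
		 * is stretched to cover each later one.  E.g., assuming
		 * 4K pages, hits at offsets 0x1000 and 0x4000 yield
		 * c_commbase 0x1000 and c_commlen 0x4000, i.e. the range
		 * [0x1000, 0x5000); the committed range may thus
		 * harmlessly cover untouched pages in between.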
12040		 */
12041		if (rp->r_commit.c_pages == NULL) {
12042			rp->r_commit.c_commbase = (offset3)pp->p_offset;
12043			rp->r_commit.c_commlen = PAGESIZE;
12044		} else {
12045			rp->r_commit.c_commlen = (offset3)pp->p_offset -
12046			    rp->r_commit.c_commbase + PAGESIZE;
12047		}
12048		page_add(&rp->r_commit.c_pages, pp);
12049	}
12050 }
12051
12052 /*
12053  * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12054  * Flushes and commits data to the server.
12055  */
12056 static int
12057 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
12058 {
12059	int error;
12060	verifier4 write_verf;
12061	rnode4_t *rp = VTOR4(vp);
12062
12063	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12064
12065	/*
12066	 * Flush the data portion of the file and then commit any
12067	 * portions which need to be committed.  This may need to
12068	 * be done twice if the server has changed state since
12069	 * data was last written.  The data will need to be
12070	 * rewritten to the server and then a new commit done.
12071	 *
12072	 * In fact, this may need to be done several times if the
12073	 * server is having problems and crashing while we are
12074	 * attempting to do this.
12075	 */
12076
12077 top:
12078	/*
12079	 * Do a flush based on the poff and plen arguments.  This
12080	 * will asynchronously write out any modified pages in the
12081	 * range specified by (poff, plen).  This starts all of the
12082	 * i/o operations which will be waited for in the next
12083	 * call to nfs4_putpage.
12084	 */
12085
12086	mutex_enter(&rp->r_statelock);
12087	write_verf = rp->r_writeverf;
12088	mutex_exit(&rp->r_statelock);
12089
12090	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
12091	if (error == EAGAIN)
12092		error = 0;
12093
12094	/*
12095	 * Do a flush based on the poff and plen arguments.  This
12096	 * will synchronously write out any modified pages in the
12097	 * range specified by (poff, plen) and wait until all of
12098	 * the asynchronous i/o's in that range are done as well.
12099	 */
12100	if (!error)
12101		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);
12102
12103	if (error)
12104		return (error);
12105
12106	mutex_enter(&rp->r_statelock);
12107	if (rp->r_writeverf != write_verf) {
12108		mutex_exit(&rp->r_statelock);
12109		goto top;
12110	}
12111	mutex_exit(&rp->r_statelock);
12112
12113	/*
12114	 * Now commit any pages which might need to be committed.
12115	 * If NFS_VERF_MISMATCH is returned, then start over with
12116	 * the flush operation.
12117	 */
12118	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);
12119
12120	if (error == NFS_VERF_MISMATCH)
12121		goto top;
12122
12123	return (error);
12124 }
12125
12126 /*
12127  * nfs4_commit_vp() will wait for other pending commits and will
12128  * either commit the whole file or a range; plen dictates which:
12129  * a value of zero indicates the whole
12130  * file.
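 * A nonzero plen commits only the byte range [poff, poff + plen).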
 * Called from nfs4_putpage_commit() or nfs4_sync_putapage().
12131  */
12132 static int
12133 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
12134     cred_t *cr, int wait_on_writes)
12135 {
12136	rnode4_t *rp;
12137	page_t *plist;
12138	offset3 offset;
12139	count3 len;
12140
12141	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12142
12143	rp = VTOR4(vp);
12144
12145	/*
12146	 * Before we gather committable pages, make sure there are
12147	 * no outstanding async writes.
12148	 */
12149	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
12150		mutex_enter(&rp->r_statelock);
12151		while (rp->r_count > 0) {
12152			cv_wait(&rp->r_cv, &rp->r_statelock);
12153		}
12154		mutex_exit(&rp->r_statelock);
12155	}
12156
12157	/*
12158	 * Set the `commit inprogress' state bit.  We must
12159	 * first wait until any current one finishes.
12160	 */
12161	mutex_enter(&rp->r_statelock);
12162	while (rp->r_flags & R4COMMIT) {
12163		rp->r_flags |= R4COMMITWAIT;
12164		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
12165		rp->r_flags &= ~R4COMMITWAIT;
12166	}
12167	rp->r_flags |= R4COMMIT;
12168	mutex_exit(&rp->r_statelock);
12169
12170	/*
12171	 * Gather all of the pages which need to be
12172	 * committed.
12173	 */
12174	if (plen == 0)
12175		nfs4_get_commit(vp);
12176	else
12177		nfs4_get_commit_range(vp, poff, plen);
12178
12179	/*
12180	 * Clear the `commit inprogress' bit and disconnect the page
12181	 * list which was gathered by nfs4_get_commit() or
12182	 * nfs4_get_commit_range().
12183	 */
12183	plist = rp->r_commit.c_pages;
12184	rp->r_commit.c_pages = NULL;
12185	offset = rp->r_commit.c_commbase;
12186	len = rp->r_commit.c_commlen;
12187	mutex_enter(&rp->r_statelock);
12188	rp->r_flags &= ~R4COMMIT;
12189	cv_broadcast(&rp->r_commit.c_cv);
12190	mutex_exit(&rp->r_statelock);
12191
12192	/*
12193	 * If any pages need to be committed, commit them and
12194	 * then unlock them so that they can be freed some
12195	 * time later.
12196	 */
12197	if (plist == NULL)
12198		return (0);
12199
12200	/*
12201	 * No error occurred during the flush portion
12202	 * of this operation, so now attempt to commit
12203	 * the data to stable storage on the server.
12204	 *
12205	 * This will unlock all of the pages on the list.
12206	 */
12207	return (nfs4_sync_commit(vp, plist, offset, len, cr));
12208 }
12209
12210 static int
12211 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12212     cred_t *cr)
12213 {
12214	int error;
12215	page_t *pp;
12216
12217	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12218
12219	error = nfs4_commit(vp, (offset4)offset, (count4)count, cr);
12220
12221	/*
12222	 * If we got an error, then just unlock all of the pages
12223	 * on the list.
12224	 */
12225	if (error) {
12226		while (plist != NULL) {
12227			pp = plist;
12228			page_sub(&plist, pp);
12229			page_unlock(pp);
12230		}
12231		return (error);
12232	}
12233	/*
12234	 * We've tried as hard as we can to commit the data to stable
12235	 * storage on the server.  We just unlock the pages and clear
12236	 * the commit required state.  They will get freed later.
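	 * (Setting p_fsdata to C_NOCOMMIT below is what clears the
	 * commit-required state; subsequent commit passes skip such
	 * pages.)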
12237	 */
12238	while (plist != NULL) {
12239		pp = plist;
12240		page_sub(&plist, pp);
12241		pp->p_fsdata = C_NOCOMMIT;
12242		page_unlock(pp);
12243	}
12244
12245	return (error);
12246 }
12247
12248 static void
12249 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12250     cred_t *cr)
12251 {
12252
12253	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
12254 }
12255
12256 /*ARGSUSED*/
12257 static int
12258 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12259     caller_context_t *ct)
12260 {
12261	int error = 0;
12262	mntinfo4_t *mi;
12263	vattr_t va;
12264	vsecattr_t nfsace4_vsap;
12265
12266	mi = VTOMI4(vp);
12267	if (nfs_zone() != mi->mi_zone)
12268		return (EIO);
12269	if (mi->mi_flags & MI4_ACL) {
12270		/* if we have a delegation, return it */
12271		if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12272			(void) nfs4delegreturn(VTOR4(vp),
12273			    NFS4_DR_REOPEN|NFS4_DR_PUSH);
12274
12275		error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12276		    NFS4_ACL_SET);
12277		if (error) /* EINVAL */
12278			return (error);
12279
12280		if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12281			/*
12282			 * These are aclent_t type entries.
12283			 */
12284			error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12285			    vp->v_type == VDIR, FALSE);
12286			if (error)
12287				return (error);
12288		} else {
12289			/*
12290			 * These are ace_t type entries.
12291			 */
12292			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12293			    FALSE);
12294			if (error)
12295				return (error);
12296		}
12297		bzero(&va, sizeof (va));
12298		error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12299		vs_ace4_destroy(&nfsace4_vsap);
12300		return (error);
12301	}
12302	return (ENOSYS);
12303 }
12304
12305 /* ARGSUSED */
12306 int
12307 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12308     caller_context_t *ct)
12309 {
12310	int error;
12311	mntinfo4_t *mi;
12312	nfs4_ga_res_t gar;
12313	rnode4_t *rp = VTOR4(vp);
12314
12315	mi = VTOMI4(vp);
12316	if (nfs_zone() != mi->mi_zone)
12317		return (EIO);
12318
12319	bzero(&gar, sizeof (gar));
12320	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;
12321
12322	/*
12323	 * vsecattr->vsa_mask holds the original acl request mask.
12324	 * This is needed when determining what to return.
12325	 * (See: nfs4_create_getsecattr_return())
12326	 */
12327	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
12328	if (error) /* EINVAL */
12329		return (error);
12330
12331	/*
12332	 * If this is a referral stub, don't try to go OTW for an ACL
12333	 */
12334	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
12335		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
12336
12337	if (mi->mi_flags & MI4_ACL) {
12338		/*
12339		 * Check if the data is cached and the cache is valid.  If it
12340		 * is, we don't go over the wire.
12341		 */
12342		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
12343			mutex_enter(&rp->r_statelock);
12344			if (rp->r_secattr != NULL) {
12345				error = nfs4_create_getsecattr_return(
12346				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
12347				    rp->r_attr.va_gid,
12348				    vp->v_type == VDIR);
12349				if (!error) { /* error == 0 - Success! */
12350					mutex_exit(&rp->r_statelock);
12351					return (error);
12352				}
12353			}
12354			mutex_exit(&rp->r_statelock);
12355		}
12356
12357		/*
12358		 * The getattr otw call will always get both the acl, in
12359		 * the form of a list of nfsace4's, and the number of acl
12360		 * entries, independent of the value of gar.n4g_vsa.vsa_mask.
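		 * We therefore ask for AT_ALL and rely on
		 * nfs4_create_getsecattr_return() to trim the result down
		 * to what the caller's original mask requested.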
12361 */ 12362 gar.n4g_va.va_mask = AT_ALL; 12363 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12364 if (error) { 12365 vs_ace4_destroy(&gar.n4g_vsa); 12366 if (error == ENOTSUP || error == EOPNOTSUPP) 12367 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12368 return (error); 12369 } 12370 12371 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12372 /* 12373 * No error was returned, but according to the response 12374 * bitmap, neither was an acl. 12375 */ 12376 vs_ace4_destroy(&gar.n4g_vsa); 12377 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12378 return (error); 12379 } 12380 12381 /* 12382 * Update the cache with the ACL. 12383 */ 12384 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12385 12386 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12387 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12388 vp->v_type == VDIR); 12389 vs_ace4_destroy(&gar.n4g_vsa); 12390 if ((error) && (vsecattr->vsa_mask & 12391 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12392 (error != EACCES)) { 12393 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12394 } 12395 return (error); 12396 } 12397 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12398 return (error); 12399 } 12400 12401 /* 12402 * The function returns: 12403 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12404 * - EINVAL if the passed in "acl_mask" is an invalid request. 12405 * 12406 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12407 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12408 * 12409 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12410 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12411 * - We have a count field set without the corresponding acl field set. (e.g. - 12412 * VSA_ACECNT is set, but VSA_ACE is not) 12413 */ 12414 static int 12415 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12416 { 12417 /* Shortcut the masks that are always valid. */ 12418 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12419 return (0); 12420 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12421 return (0); 12422 12423 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12424 /* 12425 * We can't have any VSA_ACL type stuff in the mask now. 12426 */ 12427 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12428 VSA_DFACLCNT)) 12429 return (EINVAL); 12430 12431 if (op == NFS4_ACL_SET) { 12432 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12433 return (EINVAL); 12434 } 12435 } 12436 12437 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12438 /* 12439 * We can't have any VSA_ACE type stuff in the mask now. 12440 */ 12441 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12442 return (EINVAL); 12443 12444 if (op == NFS4_ACL_SET) { 12445 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12446 return (EINVAL); 12447 12448 if ((acl_mask & VSA_DFACLCNT) && 12449 !(acl_mask & VSA_DFACL)) 12450 return (EINVAL); 12451 } 12452 } 12453 return (0); 12454 } 12455 12456 /* 12457 * The theory behind creating the correct getsecattr return is simply this: 12458 * "Don't return anything that the caller is not expecting to have to free." 12459 */ 12460 static int 12461 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12462 uid_t uid, gid_t gid, int isdir) 12463 { 12464 int error = 0; 12465 /* Save the mask since the translators modify it. 
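	 * (vs_ace4_to_acet() and vs_ace4_to_aent() overwrite vsa_mask
	 * as a side effect, so it is restored before returning.)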
	 */
12466	uint_t orig_mask = vsap->vsa_mask;
12467
12468	if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12469		error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12470
12471		if (error)
12472			return (error);
12473
12474		/*
12475		 * If the caller only asked for the ace count (VSA_ACECNT),
12476		 * don't give them the full acl (VSA_ACE); free it.
12477		 */
12478		if (!(orig_mask & VSA_ACE)) {
12479			if (vsap->vsa_aclentp != NULL) {
12480				kmem_free(vsap->vsa_aclentp,
12481				    vsap->vsa_aclcnt * sizeof (ace_t));
12482				vsap->vsa_aclentp = NULL;
12483			}
12484		}
12485		vsap->vsa_mask = orig_mask;
12486
12487	} else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12488	    VSA_DFACLCNT)) {
12489		error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12490		    isdir, FALSE);
12491
12492		if (error)
12493			return (error);
12494
12495		/*
12496		 * If the caller only asked for the acl count (VSA_ACLCNT)
12497		 * and/or the default acl count (VSA_DFACLCNT), don't give
12498		 * them the acl (VSA_ACL) or default acl (VSA_DFACL); free
12499		 * them.
12499		 */
12500		if (!(orig_mask & VSA_ACL)) {
12501			if (vsap->vsa_aclentp != NULL) {
12502				kmem_free(vsap->vsa_aclentp,
12503				    vsap->vsa_aclcnt * sizeof (aclent_t));
12504				vsap->vsa_aclentp = NULL;
12505			}
12506		}
12507
12508		if (!(orig_mask & VSA_DFACL)) {
12509			if (vsap->vsa_dfaclentp != NULL) {
12510				kmem_free(vsap->vsa_dfaclentp,
12511				    vsap->vsa_dfaclcnt * sizeof (aclent_t));
12512				vsap->vsa_dfaclentp = NULL;
12513			}
12514		}
12515		vsap->vsa_mask = orig_mask;
12516	}
12517	return (0);
12518 }
12519
12520 /* ARGSUSED */
12521 int
12522 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
12523     caller_context_t *ct)
12524 {
12525	int error;
12526
12527	if (nfs_zone() != VTOMI4(vp)->mi_zone)
12528		return (EIO);
12529	/*
12530	 * check for valid cmd parameter
12531	 */
12532	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
12533		return (EINVAL);
12534
12535	/*
12536	 * Check access permissions
12537	 */
12538	if ((cmd & F_SHARE) &&
12539	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
12540	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
12541		return (EBADF);
12542
12543	/*
12544	 * If the filesystem is mounted using local locking, pass the
12545	 * request off to the local share code.
12546	 */
12547	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
12548		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
12549
12550	switch (cmd) {
12551	case F_SHARE:
12552	case F_UNSHARE:
12553		/*
12554		 * This will be properly implemented later,
12555		 * see RFE 4823948.
12556		 */
12557		error = EAGAIN;
12558		break;
12559
12560	case F_HASREMOTELOCKS:
12561		/*
12562		 * NFS client can't store remote locks itself
12563		 */
12564		shr->s_access = 0;
12565		error = 0;
12566		break;
12567
12568	default:
12569		error = EINVAL;
12570		break;
12571	}
12572
12573	return (error);
12574 }
12575
12576 /*
12577  * Common code called by directory ops to update the attrcache
12578  */
12579 static int
12580 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12581     hrtime_t t, vnode_t *vp, cred_t *cr)
12582 {
12583	int error = 0;
12584
12585	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12586
12587	if (status != NFS4_OK) {
12588		/* getattr not done or failed */
12589		PURGE_ATTRCACHE4(vp);
12590		return (error);
12591	}
12592
12593	if (garp) {
12594		nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12595	} else {
12596		PURGE_ATTRCACHE4(vp);
12597	}
12598	return (error);
12599 }
12600
12601 /*
12602  * Update directory caches for directory modification ops (link, rename, etc.)
12603  * When dinfo is NULL, manage dircaches in the old way.
12604  */
12605 static void
12606 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
12607     dirattr_info_t *dinfo)
12608 {
12609	rnode4_t *drp = VTOR4(dvp);
12610
12611	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
12612
12613	/* Purge rddir cache for dir since it changed */
12614	if (drp->r_dir != NULL)
12615		nfs4_purge_rddir_cache(dvp);
12616
12617	/*
12618	 * If caller provided dinfo, then use it to manage dir caches.
12619	 */
12620	if (dinfo != NULL) {
12621		if (vp != NULL) {
12622			mutex_enter(&VTOR4(vp)->r_statev4_lock);
12623			if (!VTOR4(vp)->created_v4) {
12624				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12625				dnlc_update(dvp, nm, vp);
12626			} else {
12627				/*
12628				 * XXX don't update if the created_v4 flag is
12629				 * set
12630				 */
12631				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12632				NFS4_DEBUG(nfs4_client_state_debug,
12633				    (CE_NOTE, "nfs4_update_dircaches: "
12634				    "don't update dnlc: created_v4 flag"));
12635			}
12636		}
12637
12638		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
12639		    dinfo->di_cred, FALSE, cinfo);
12640
12641		return;
12642	}
12643
12644	/*
12645	 * The caller didn't provide dinfo, so check change_info4 to update
12646	 * the DNLC.  Since the caller modified the dir but didn't receive
12647	 * post-dirmod-op dir attrs, the dir's attrs must be purged.
12648	 *
12649	 * XXX this check and dnlc update/purge should really be atomic,
12650	 * XXX but can't use rnode statelock because it'll deadlock in
12651	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
12652	 * XXX does occur.
12653	 *
12654	 * XXX We also may want to check that atomic is true in the
12655	 * XXX change_info struct.  If it is not, the change_info may
12656	 * XXX reflect changes by more than one client, which means that
12657	 * XXX our cache may not be valid.
12658	 */
12659	PURGE_ATTRCACHE4(dvp);
12660	if (drp->r_change == cinfo->before) {
12661		/* no changes took place in the directory prior to our link */
12662		if (vp != NULL) {
12663			mutex_enter(&VTOR4(vp)->r_statev4_lock);
12664			if (!VTOR4(vp)->created_v4) {
12665				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12666				dnlc_update(dvp, nm, vp);
12667			} else {
12668				/*
12669				 * XXX don't update if the created_v4 flag
12670				 * is set
12671				 */
12672				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12673				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
12674				    "nfs4_update_dircaches: don't"
12675				    " update dnlc: created_v4 flag"));
12676			}
12677		}
12678	} else {
12679		/* Another client modified directory - purge its dnlc cache */
12680		dnlc_purge_vp(dvp);
12681	}
12682 }
12683
12684 /*
12685  * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12686  * file.
12687  *
12688  * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12689  * file (ie: client recovery) and otherwise set to FALSE.
12690  *
12691  * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12692  * initiated) calling functions.
12693  *
12694  * 'resend' is set to TRUE if this is an OPEN_CONFIRM issued as a result
12695  * of resending a 'lost' open request.
12696  *
12697  * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12698  * server that hands out BAD_SEQID on open confirm.
12699  *
12700  * Errors are returned via the nfs4_error_t parameter.
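 * If the compound must be retried after recovery, the open seqid is
 * bumped again before the resend (see the recov_retry_confirm loop).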
12701 */ 12702 void 12703 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12704 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12705 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12706 { 12707 COMPOUND4args_clnt args; 12708 COMPOUND4res_clnt res; 12709 nfs_argop4 argop[2]; 12710 nfs_resop4 *resop; 12711 int doqueue = 1; 12712 mntinfo4_t *mi; 12713 OPEN_CONFIRM4args *open_confirm_args; 12714 int needrecov; 12715 12716 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12717 #if DEBUG 12718 mutex_enter(&oop->oo_lock); 12719 ASSERT(oop->oo_seqid_inuse); 12720 mutex_exit(&oop->oo_lock); 12721 #endif 12722 12723 recov_retry_confirm: 12724 nfs4_error_zinit(ep); 12725 *retry_open = FALSE; 12726 12727 if (resend) 12728 args.ctag = TAG_OPEN_CONFIRM_LOST; 12729 else 12730 args.ctag = TAG_OPEN_CONFIRM; 12731 12732 args.array_len = 2; 12733 args.array = argop; 12734 12735 /* putfh target fh */ 12736 argop[0].argop = OP_CPUTFH; 12737 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12738 12739 argop[1].argop = OP_OPEN_CONFIRM; 12740 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12741 12742 (*seqid) += 1; 12743 open_confirm_args->seqid = *seqid; 12744 open_confirm_args->open_stateid = *stateid; 12745 12746 mi = VTOMI4(vp); 12747 12748 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12749 12750 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12751 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12752 } 12753 12754 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12755 if (!needrecov && ep->error) 12756 return; 12757 12758 if (needrecov) { 12759 bool_t abort = FALSE; 12760 12761 if (reopening_file == FALSE) { 12762 nfs4_bseqid_entry_t *bsep = NULL; 12763 12764 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12765 bsep = nfs4_create_bseqid_entry(oop, NULL, 12766 vp, 0, args.ctag, 12767 open_confirm_args->seqid); 12768 12769 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 12770 NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL); 12771 if (bsep) { 12772 kmem_free(bsep, sizeof (*bsep)); 12773 if (num_bseqid_retryp && 12774 --(*num_bseqid_retryp) == 0) 12775 abort = TRUE; 12776 } 12777 } 12778 if ((ep->error == ETIMEDOUT || 12779 res.status == NFS4ERR_RESOURCE) && 12780 abort == FALSE && resend == FALSE) { 12781 if (!ep->error) 12782 (void) xdr_free(xdr_COMPOUND4res_clnt, 12783 (caddr_t)&res); 12784 12785 delay(SEC_TO_TICK(confirm_retry_sec)); 12786 goto recov_retry_confirm; 12787 } 12788 /* State may have changed so retry the entire OPEN op */ 12789 if (abort == FALSE) 12790 *retry_open = TRUE; 12791 else 12792 *retry_open = FALSE; 12793 if (!ep->error) 12794 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12795 return; 12796 } 12797 12798 if (res.status) { 12799 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12800 return; 12801 } 12802 12803 resop = &res.array[1]; /* open confirm res */ 12804 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12805 stateid, sizeof (*stateid)); 12806 12807 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12808 } 12809 12810 /* 12811 * Return the credentials associated with a client state object. The 12812 * caller is responsible for freeing the credentials. 12813 */ 12814 12815 static cred_t * 12816 state_to_cred(nfs4_open_stream_t *osp) 12817 { 12818 cred_t *cr; 12819 12820 /* 12821 * It's ok to not lock the open stream and open owner to get 12822 * the oo_cred since this is only written once (upon creation) 12823 * and will not change. 
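 * The crhold() below takes the reference that the caller must
 * eventually crfree().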
12824 */ 12825 cr = osp->os_open_owner->oo_cred; 12826 crhold(cr); 12827 12828 return (cr); 12829 } 12830 12831 /* 12832 * nfs4_find_sysid 12833 * 12834 * Find the sysid for the knetconfig associated with the given mi. 12835 */ 12836 static struct lm_sysid * 12837 nfs4_find_sysid(mntinfo4_t *mi) 12838 { 12839 ASSERT(nfs_zone() == mi->mi_zone); 12840 12841 /* 12842 * Switch from RDMA knconf to original mount knconf 12843 */ 12844 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12845 mi->mi_curr_serv->sv_hostname, NULL)); 12846 } 12847 12848 #ifdef DEBUG 12849 /* 12850 * Return a string version of the call type for easy reading. 12851 */ 12852 static char * 12853 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12854 { 12855 switch (ctype) { 12856 case NFS4_LCK_CTYPE_NORM: 12857 return ("NORMAL"); 12858 case NFS4_LCK_CTYPE_RECLAIM: 12859 return ("RECLAIM"); 12860 case NFS4_LCK_CTYPE_RESEND: 12861 return ("RESEND"); 12862 case NFS4_LCK_CTYPE_REINSTATE: 12863 return ("REINSTATE"); 12864 default: 12865 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12866 "type %d", ctype); 12867 return (""); 12868 } 12869 } 12870 #endif 12871 12872 /* 12873 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12874 * Unlock requests don't have an over-the-wire locktype, so we just return 12875 * something non-threatening. 12876 */ 12877 12878 static nfs_lock_type4 12879 flk_to_locktype(int cmd, int l_type) 12880 { 12881 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12882 12883 switch (l_type) { 12884 case F_UNLCK: 12885 return (READ_LT); 12886 case F_RDLCK: 12887 if (cmd == F_SETLK) 12888 return (READ_LT); 12889 else 12890 return (READW_LT); 12891 case F_WRLCK: 12892 if (cmd == F_SETLK) 12893 return (WRITE_LT); 12894 else 12895 return (WRITEW_LT); 12896 } 12897 panic("flk_to_locktype"); 12898 /*NOTREACHED*/ 12899 } 12900 12901 /* 12902 * Do some preliminary checks for nfs4frlock. 12903 */ 12904 static int 12905 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12906 u_offset_t offset) 12907 { 12908 int error = 0; 12909 12910 /* 12911 * If we are setting a lock, check that the file is opened 12912 * with the correct mode. 12913 */ 12914 if (cmd == F_SETLK || cmd == F_SETLKW) { 12915 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12916 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12917 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12918 "nfs4frlock_validate_args: file was opened with " 12919 "incorrect mode")); 12920 return (EBADF); 12921 } 12922 } 12923 12924 /* Convert the offset. It may need to be restored before returning. */ 12925 if (error = convoff(vp, flk, 0, offset)) { 12926 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12927 "nfs4frlock_validate_args: convoff => error= %d\n", 12928 error)); 12929 return (error); 12930 } 12931 12932 return (error); 12933 } 12934 12935 /* 12936 * Set the flock64's lm_sysid for nfs4frlock. 12937 */ 12938 static int 12939 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12940 { 12941 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12942 12943 /* Find the lm_sysid */ 12944 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12945 12946 if (*lspp == NULL) { 12947 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12948 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12949 return (ENOLCK); 12950 } 12951 12952 flk->l_sysid = lm_sysidt(*lspp); 12953 12954 return (0); 12955 } 12956 12957 /* 12958 * Do the remaining preliminary setup for nfs4frlock. 
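 * This sets the base retry delay, purges cached attributes when the
 * lock is relative to EOF, initializes the recovery state, and picks
 * the over-the-wire credential.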
12959 */ 12960 static void 12961 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12962 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12963 cred_t **cred_otw) 12964 { 12965 /* 12966 * set tick_delay to the base delay time. 12967 * (NFS4_BASE_WAIT_TIME is in secs) 12968 */ 12969 12970 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12971 12972 /* 12973 * If lock is relative to EOF, we need the newest length of the 12974 * file. Therefore invalidate the ATTR_CACHE. 12975 */ 12976 12977 *whencep = flk->l_whence; 12978 12979 if (*whencep == 2) /* SEEK_END */ 12980 PURGE_ATTRCACHE4(vp); 12981 12982 recov_statep->rs_flags = 0; 12983 recov_statep->rs_num_retry_despite_err = 0; 12984 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12985 } 12986 12987 /* 12988 * Initialize and allocate the data structures necessary for 12989 * the nfs4frlock call. 12990 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 12991 */ 12992 static void 12993 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12994 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12995 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12996 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12997 { 12998 int argoplist_size; 12999 int num_ops = 2; 13000 13001 *retry = FALSE; 13002 *did_start_fop = FALSE; 13003 *skip_get_err = FALSE; 13004 lost_rqstp->lr_op = 0; 13005 argoplist_size = num_ops * sizeof (nfs_argop4); 13006 /* fill array with zero */ 13007 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 13008 13009 *argspp = argsp; 13010 *respp = NULL; 13011 13012 argsp->array_len = num_ops; 13013 argsp->array = *argopp; 13014 13015 /* initialize in case of error; will get real value down below */ 13016 argsp->ctag = TAG_NONE; 13017 13018 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 13019 *op_hintp = OH_LOCKU; 13020 else 13021 *op_hintp = OH_OTHER; 13022 } 13023 13024 /* 13025 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 13026 * the proper nfs4_server_t for this instance of nfs4frlock. 13027 * Returns 0 (success) or an errno value. 13028 */ 13029 static int 13030 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 13031 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 13032 bool_t *did_start_fop, bool_t *startrecovp) 13033 { 13034 int error = 0; 13035 rnode4_t *rp; 13036 13037 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13038 13039 if (ctype == NFS4_LCK_CTYPE_NORM) { 13040 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 13041 recov_statep, startrecovp); 13042 if (error) 13043 return (error); 13044 *did_start_fop = TRUE; 13045 } else { 13046 *did_start_fop = FALSE; 13047 *startrecovp = FALSE; 13048 } 13049 13050 if (!error) { 13051 rp = VTOR4(vp); 13052 13053 /* If the file failed recovery, just quit. */ 13054 mutex_enter(&rp->r_statelock); 13055 if (rp->r_flags & R4RECOVERR) { 13056 error = EIO; 13057 } 13058 mutex_exit(&rp->r_statelock); 13059 } 13060 13061 return (error); 13062 } 13063 13064 /* 13065 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13066 * resend nfs4frlock call is initiated by the recovery framework. 13067 * Acquires the lop and oop seqid synchronization. 
13068 */ 13069 static void 13070 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13071 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13072 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13073 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13074 { 13075 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13076 int error; 13077 13078 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13079 (CE_NOTE, 13080 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13081 ASSERT(resend_rqstp != NULL); 13082 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13083 resend_rqstp->lr_op == OP_LOCKU); 13084 13085 *oopp = resend_rqstp->lr_oop; 13086 if (resend_rqstp->lr_oop) { 13087 open_owner_hold(resend_rqstp->lr_oop); 13088 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13089 ASSERT(error == 0); /* recov thread always succeeds */ 13090 } 13091 13092 /* Must resend this lost lock/locku request. */ 13093 ASSERT(resend_rqstp->lr_lop != NULL); 13094 *lopp = resend_rqstp->lr_lop; 13095 lock_owner_hold(resend_rqstp->lr_lop); 13096 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13097 ASSERT(error == 0); /* recov thread always succeeds */ 13098 13099 *ospp = resend_rqstp->lr_osp; 13100 if (*ospp) 13101 open_stream_hold(resend_rqstp->lr_osp); 13102 13103 if (resend_rqstp->lr_op == OP_LOCK) { 13104 LOCK4args *lock_args; 13105 13106 argop->argop = OP_LOCK; 13107 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13108 lock_args->locktype = resend_rqstp->lr_locktype; 13109 lock_args->reclaim = 13110 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13111 lock_args->offset = resend_rqstp->lr_flk->l_start; 13112 lock_args->length = resend_rqstp->lr_flk->l_len; 13113 if (lock_args->length == 0) 13114 lock_args->length = ~lock_args->length; 13115 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13116 mi2clientid(mi), &lock_args->locker); 13117 13118 switch (resend_rqstp->lr_ctype) { 13119 case NFS4_LCK_CTYPE_RESEND: 13120 argsp->ctag = TAG_LOCK_RESEND; 13121 break; 13122 case NFS4_LCK_CTYPE_REINSTATE: 13123 argsp->ctag = TAG_LOCK_REINSTATE; 13124 break; 13125 case NFS4_LCK_CTYPE_RECLAIM: 13126 argsp->ctag = TAG_LOCK_RECLAIM; 13127 break; 13128 default: 13129 argsp->ctag = TAG_LOCK_UNKNOWN; 13130 break; 13131 } 13132 } else { 13133 LOCKU4args *locku_args; 13134 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13135 13136 argop->argop = OP_LOCKU; 13137 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13138 locku_args->locktype = READ_LT; 13139 locku_args->seqid = lop->lock_seqid + 1; 13140 mutex_enter(&lop->lo_lock); 13141 locku_args->lock_stateid = lop->lock_stateid; 13142 mutex_exit(&lop->lo_lock); 13143 locku_args->offset = resend_rqstp->lr_flk->l_start; 13144 locku_args->length = resend_rqstp->lr_flk->l_len; 13145 if (locku_args->length == 0) 13146 locku_args->length = ~locku_args->length; 13147 13148 switch (resend_rqstp->lr_ctype) { 13149 case NFS4_LCK_CTYPE_RESEND: 13150 argsp->ctag = TAG_LOCKU_RESEND; 13151 break; 13152 case NFS4_LCK_CTYPE_REINSTATE: 13153 argsp->ctag = TAG_LOCKU_REINSTATE; 13154 break; 13155 default: 13156 argsp->ctag = TAG_LOCK_UNKNOWN; 13157 break; 13158 } 13159 } 13160 } 13161 13162 /* 13163 * Setup the LOCKT4 arguments. 
13164 */ 13165 static void 13166 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13167 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 13168 rnode4_t *rp) 13169 { 13170 LOCKT4args *lockt_args; 13171 13172 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13173 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13174 argop->argop = OP_LOCKT; 13175 argsp->ctag = TAG_LOCKT; 13176 lockt_args = &argop->nfs_argop4_u.oplockt; 13177 13178 /* 13179 * The locktype will be READ_LT unless it's 13180 * a write lock. We do this because the Solaris 13181 * system call allows the combination of 13182 * F_UNLCK and F_GETLK* and so in that case the 13183 * unlock is mapped to a read. 13184 */ 13185 if (flk->l_type == F_WRLCK) 13186 lockt_args->locktype = WRITE_LT; 13187 else 13188 lockt_args->locktype = READ_LT; 13189 13190 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13191 /* set the lock owner4 args */ 13192 nfs4_setlockowner_args(&lockt_args->owner, rp, 13193 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13194 flk->l_pid); 13195 lockt_args->offset = flk->l_start; 13196 lockt_args->length = flk->l_len; 13197 if (flk->l_len == 0) 13198 lockt_args->length = ~lockt_args->length; 13199 13200 *lockt_argsp = lockt_args; 13201 } 13202 13203 /* 13204 * If the client is holding a delegation, and the open stream to be used 13205 * with this lock request is a delegation open stream, then re-open the stream. 13206 * Sets the nfs4_error_t to all zeros unless the open stream has already 13207 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13208 * means the caller should retry (like a recovery retry). 13209 */ 13210 static void 13211 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13212 { 13213 open_delegation_type4 dt; 13214 bool_t reopen_needed, force; 13215 nfs4_open_stream_t *osp; 13216 open_claim_type4 oclaim; 13217 rnode4_t *rp = VTOR4(vp); 13218 mntinfo4_t *mi = VTOMI4(vp); 13219 13220 ASSERT(nfs_zone() == mi->mi_zone); 13221 13222 nfs4_error_zinit(ep); 13223 13224 mutex_enter(&rp->r_statev4_lock); 13225 dt = rp->r_deleg_type; 13226 mutex_exit(&rp->r_statev4_lock); 13227 13228 if (dt != OPEN_DELEGATE_NONE) { 13229 nfs4_open_owner_t *oop; 13230 13231 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13232 if (!oop) { 13233 ep->stat = NFS4ERR_IO; 13234 return; 13235 } 13236 /* returns with 'os_sync_lock' held */ 13237 osp = find_open_stream(oop, rp); 13238 if (!osp) { 13239 open_owner_rele(oop); 13240 ep->stat = NFS4ERR_IO; 13241 return; 13242 } 13243 13244 if (osp->os_failed_reopen) { 13245 NFS4_DEBUG((nfs4_open_stream_debug || 13246 nfs4_client_lock_debug), (CE_NOTE, 13247 "nfs4frlock_check_deleg: os_failed_reopen set " 13248 "for osp %p, cr %p, rp %s", (void *)osp, 13249 (void *)cr, rnode4info(rp))); 13250 mutex_exit(&osp->os_sync_lock); 13251 open_stream_rele(osp, rp); 13252 open_owner_rele(oop); 13253 ep->stat = NFS4ERR_IO; 13254 return; 13255 } 13256 13257 /* 13258 * Determine whether a reopen is needed. If this 13259 * is a delegation open stream, then send the open 13260 * to the server to give visibility to the open owner. 13261 * Even if it isn't a delegation open stream, we need 13262 * to check if the previous open CLAIM_DELEGATE_CUR 13263 * was sufficient. 
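		 * E.g., an F_WRLCK request requires
		 * OPEN4_SHARE_ACCESS_WRITE in os_dc_openacc; if the
		 * delegation-based open only granted read access, we
		 * must reopen before locking.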
13264 */ 13265 13266 reopen_needed = osp->os_delegation || 13267 ((lt == F_RDLCK && 13268 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13269 (lt == F_WRLCK && 13270 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13271 13272 mutex_exit(&osp->os_sync_lock); 13273 open_owner_rele(oop); 13274 13275 if (reopen_needed) { 13276 /* 13277 * Always use CLAIM_PREVIOUS after server reboot. 13278 * The server will reject CLAIM_DELEGATE_CUR if 13279 * it is used during the grace period. 13280 */ 13281 mutex_enter(&mi->mi_lock); 13282 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13283 oclaim = CLAIM_PREVIOUS; 13284 force = TRUE; 13285 } else { 13286 oclaim = CLAIM_DELEGATE_CUR; 13287 force = FALSE; 13288 } 13289 mutex_exit(&mi->mi_lock); 13290 13291 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13292 if (ep->error == EAGAIN) { 13293 nfs4_error_zinit(ep); 13294 ep->stat = NFS4ERR_DELAY; 13295 } 13296 } 13297 open_stream_rele(osp, rp); 13298 osp = NULL; 13299 } 13300 } 13301 13302 /* 13303 * Setup the LOCKU4 arguments. 13304 * Returns errors via the nfs4_error_t. 13305 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13306 * over-the-wire. The caller must release the 13307 * reference on *lopp. 13308 * NFS4ERR_DELAY caller should retry (like recovery retry) 13309 * (other) unrecoverable error. 13310 */ 13311 static void 13312 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13313 LOCKU4args **locku_argsp, flock64_t *flk, 13314 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13315 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13316 bool_t *skip_get_err, bool_t *go_otwp) 13317 { 13318 nfs4_lock_owner_t *lop = NULL; 13319 LOCKU4args *locku_args; 13320 pid_t pid; 13321 bool_t is_spec = FALSE; 13322 rnode4_t *rp = VTOR4(vp); 13323 13324 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13325 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13326 13327 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13328 if (ep->error || ep->stat) 13329 return; 13330 13331 argop->argop = OP_LOCKU; 13332 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13333 argsp->ctag = TAG_LOCKU_REINSTATE; 13334 else 13335 argsp->ctag = TAG_LOCKU; 13336 locku_args = &argop->nfs_argop4_u.oplocku; 13337 *locku_argsp = locku_args; 13338 13339 /* 13340 * XXX what should locku_args->locktype be? 13341 * setting to ALWAYS be READ_LT so at least 13342 * it is a valid locktype. 13343 */ 13344 13345 locku_args->locktype = READ_LT; 13346 13347 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13348 flk->l_pid; 13349 13350 /* 13351 * Get the lock owner stateid. If no lock owner 13352 * exists, return success. 13353 */ 13354 lop = find_lock_owner(rp, pid, LOWN_ANY); 13355 *lopp = lop; 13356 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13357 is_spec = TRUE; 13358 if (!lop || is_spec) { 13359 /* 13360 * No lock owner so no locks to unlock. 13361 * Return success. If there was a failed 13362 * reclaim earlier, the lock might still be 13363 * registered with the local locking code, 13364 * so notify it of the unlock. 13365 * 13366 * If the lockowner is using a special stateid, 13367 * then the original lock request (that created 13368 * this lockowner) was never successful, so we 13369 * have no lock to undo OTW. 
13370 */ 13371 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13372 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13373 "(%ld) so return success", (long)pid)); 13374 13375 if (ctype == NFS4_LCK_CTYPE_NORM) 13376 flk->l_pid = curproc->p_pid; 13377 nfs4_register_lock_locally(vp, flk, flag, offset); 13378 /* 13379 * Release our hold and NULL out so final_cleanup 13380 * doesn't try to end a lock seqid sync we 13381 * never started. 13382 */ 13383 if (is_spec) { 13384 lock_owner_rele(lop); 13385 *lopp = NULL; 13386 } 13387 *skip_get_err = TRUE; 13388 *go_otwp = FALSE; 13389 return; 13390 } 13391 13392 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13393 if (ep->error == EAGAIN) { 13394 lock_owner_rele(lop); 13395 *lopp = NULL; 13396 return; 13397 } 13398 13399 mutex_enter(&lop->lo_lock); 13400 locku_args->lock_stateid = lop->lock_stateid; 13401 mutex_exit(&lop->lo_lock); 13402 locku_args->seqid = lop->lock_seqid + 1; 13403 13404 /* leave the ref count on lop, rele after RPC call */ 13405 13406 locku_args->offset = flk->l_start; 13407 locku_args->length = flk->l_len; 13408 if (flk->l_len == 0) 13409 locku_args->length = ~locku_args->length; 13410 13411 *go_otwp = TRUE; 13412 } 13413 13414 /* 13415 * Setup the LOCK4 arguments. 13416 * 13417 * Returns errors via the nfs4_error_t. 13418 * NFS4_OK no problems 13419 * NFS4ERR_DELAY caller should retry (like recovery retry) 13420 * (other) unrecoverable error 13421 */ 13422 static void 13423 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13424 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13425 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13426 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13427 { 13428 LOCK4args *lock_args; 13429 nfs4_open_owner_t *oop = NULL; 13430 nfs4_open_stream_t *osp = NULL; 13431 nfs4_lock_owner_t *lop = NULL; 13432 pid_t pid; 13433 rnode4_t *rp = VTOR4(vp); 13434 13435 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13436 13437 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13438 if (ep->error || ep->stat != NFS4_OK) 13439 return; 13440 13441 argop->argop = OP_LOCK; 13442 if (ctype == NFS4_LCK_CTYPE_NORM) 13443 argsp->ctag = TAG_LOCK; 13444 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13445 argsp->ctag = TAG_RELOCK; 13446 else 13447 argsp->ctag = TAG_LOCK_REINSTATE; 13448 lock_args = &argop->nfs_argop4_u.oplock; 13449 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13450 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13451 /* 13452 * Get the lock owner. If no lock owner exists, 13453 * create a 'temporary' one and grab the open seqid 13454 * synchronization (which puts a hold on the open 13455 * owner and open stream). 13456 * This also grabs the lock seqid synchronization. 13457 */ 13458 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid; 13459 ep->stat = 13460 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13461 13462 if (ep->stat != NFS4_OK) 13463 goto out; 13464 13465 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13466 &lock_args->locker); 13467 13468 lock_args->offset = flk->l_start; 13469 lock_args->length = flk->l_len; 13470 if (flk->l_len == 0) 13471 lock_args->length = ~lock_args->length; 13472 *lock_argsp = lock_args; 13473 out: 13474 *oopp = oop; 13475 *ospp = osp; 13476 *lopp = lop; 13477 } 13478 13479 /* 13480 * After we get the reply from the server, record the proper information 13481 * for possible resend lock requests. 
13482 * 13483 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13484 */ 13485 static void 13486 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13487 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13488 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13489 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13490 { 13491 bool_t unlock = (flk->l_type == F_UNLCK); 13492 13493 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13494 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13495 ctype == NFS4_LCK_CTYPE_REINSTATE); 13496 13497 if (error != 0 && !unlock) { 13498 NFS4_DEBUG((nfs4_lost_rqst_debug || 13499 nfs4_client_lock_debug), (CE_NOTE, 13500 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13501 " for lop %p", (void *)lop)); 13502 ASSERT(lop != NULL); 13503 mutex_enter(&lop->lo_lock); 13504 lop->lo_pending_rqsts = 1; 13505 mutex_exit(&lop->lo_lock); 13506 } 13507 13508 lost_rqstp->lr_putfirst = FALSE; 13509 lost_rqstp->lr_op = 0; 13510 13511 /* 13512 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13513 * recovery purposes so that the lock request that was sent 13514 * can be saved and re-issued later. Ditto for EIO from a forced 13515 * unmount. This is done to have the client's local locking state 13516 * match the v4 server's state; that is, the request was 13517 * potentially received and accepted by the server but the client 13518 * thinks it was not. 13519 */ 13520 if (error == ETIMEDOUT || error == EINTR || 13521 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13522 NFS4_DEBUG((nfs4_lost_rqst_debug || 13523 nfs4_client_lock_debug), (CE_NOTE, 13524 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13525 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13526 (void *)lop, (void *)oop, (void *)osp)); 13527 if (unlock) 13528 lost_rqstp->lr_op = OP_LOCKU; 13529 else { 13530 lost_rqstp->lr_op = OP_LOCK; 13531 lost_rqstp->lr_locktype = locktype; 13532 } 13533 /* 13534 * Objects are held and rele'd via the recovery code. 13535 * See nfs4_save_lost_rqst. 13536 */ 13537 lost_rqstp->lr_vp = vp; 13538 lost_rqstp->lr_dvp = NULL; 13539 lost_rqstp->lr_oop = oop; 13540 lost_rqstp->lr_osp = osp; 13541 lost_rqstp->lr_lop = lop; 13542 lost_rqstp->lr_cr = cr; 13543 switch (ctype) { 13544 case NFS4_LCK_CTYPE_NORM: 13545 flk->l_pid = ttoproc(curthread)->p_pid; 13546 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13547 break; 13548 case NFS4_LCK_CTYPE_REINSTATE: 13549 lost_rqstp->lr_putfirst = TRUE; 13550 lost_rqstp->lr_ctype = ctype; 13551 break; 13552 default: 13553 break; 13554 } 13555 lost_rqstp->lr_flk = flk; 13556 } 13557 } 13558 13559 /* 13560 * Update lop's seqid. Also update the seqid stored in a resend request, 13561 * if any. (Some recovery errors increment the seqid, and we may have to 13562 * send the resend request again.) 
13563 */ 13564 13565 static void 13566 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13567 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13568 { 13569 if (lock_args) { 13570 if (lock_args->locker.new_lock_owner == TRUE) 13571 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13572 else { 13573 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13574 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13575 } 13576 } else if (locku_args) { 13577 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13578 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13579 } 13580 } 13581 13582 /* 13583 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13584 * COMPOUND4 args/res for calls that need to retry. 13585 * Switches the *cred_otwp to base_cr. 13586 */ 13587 static void 13588 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13589 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13590 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13591 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13592 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13593 { 13594 nfs4_open_owner_t *oop = *oopp; 13595 nfs4_open_stream_t *osp = *ospp; 13596 nfs4_lock_owner_t *lop = *lopp; 13597 nfs_argop4 *argop = (*argspp)->array; 13598 13599 if (*did_start_fop) { 13600 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13601 needrecov); 13602 *did_start_fop = FALSE; 13603 } 13604 ASSERT((*argspp)->array_len == 2); 13605 if (argop[1].argop == OP_LOCK) 13606 nfs4args_lock_free(&argop[1]); 13607 else if (argop[1].argop == OP_LOCKT) 13608 nfs4args_lockt_free(&argop[1]); 13609 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13610 if (!error) 13611 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13612 *argspp = NULL; 13613 *respp = NULL; 13614 13615 if (lop) { 13616 nfs4_end_lock_seqid_sync(lop); 13617 lock_owner_rele(lop); 13618 *lopp = NULL; 13619 } 13620 13621 /* need to free up the reference on osp for lock args */ 13622 if (osp != NULL) { 13623 open_stream_rele(osp, VTOR4(vp)); 13624 *ospp = NULL; 13625 } 13626 13627 /* need to free up the reference on oop for lock args */ 13628 if (oop != NULL) { 13629 nfs4_end_open_seqid_sync(oop); 13630 open_owner_rele(oop); 13631 *oopp = NULL; 13632 } 13633 13634 crfree(*cred_otwp); 13635 *cred_otwp = base_cr; 13636 crhold(*cred_otwp); 13637 } 13638 13639 /* 13640 * Function to process the client's recovery for nfs4frlock. 13641 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13642 * 13643 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13644 * COMPOUND4 args/res for calls that need to retry. 13645 * 13646 * Note: the rp's r_lkserlock is *not* dropped during this path. 
13647 */ 13648 static bool_t 13649 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13650 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13651 LOCK4args *lock_args, LOCKU4args *locku_args, 13652 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13653 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13654 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13655 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13656 { 13657 nfs4_open_owner_t *oop = *oopp; 13658 nfs4_open_stream_t *osp = *ospp; 13659 nfs4_lock_owner_t *lop = *lopp; 13660 13661 bool_t abort, retry; 13662 13663 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13664 ASSERT((*argspp) != NULL); 13665 ASSERT((*respp) != NULL); 13666 if (lock_args || locku_args) 13667 ASSERT(lop != NULL); 13668 13669 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13670 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13671 13672 retry = TRUE; 13673 abort = FALSE; 13674 if (needrecov) { 13675 nfs4_bseqid_entry_t *bsep = NULL; 13676 nfs_opnum4 op; 13677 13678 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT; 13679 13680 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13681 seqid4 seqid; 13682 13683 if (lock_args) { 13684 if (lock_args->locker.new_lock_owner == TRUE) 13685 seqid = lock_args->locker.locker4_u. 13686 open_owner.open_seqid; 13687 else 13688 seqid = lock_args->locker.locker4_u. 13689 lock_owner.lock_seqid; 13690 } else if (locku_args) { 13691 seqid = locku_args->seqid; 13692 } else { 13693 seqid = 0; 13694 } 13695 13696 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13697 flk->l_pid, (*argspp)->ctag, seqid); 13698 } 13699 13700 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13701 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13702 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13703 NULL, op, bsep, NULL, NULL); 13704 13705 if (bsep) 13706 kmem_free(bsep, sizeof (*bsep)); 13707 } 13708 13709 /* 13710 * Return that we do not want to retry the request for 3 cases: 13711 * 1. If we received EINTR or are bailing out because of a forced 13712 * unmount, we came into this code path just for the sake of 13713 * initiating recovery, we now need to return the error. 13714 * 2. If we have aborted recovery. 13715 * 3. We received NFS4ERR_BAD_SEQID. 
13716 */ 13717 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13718 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13719 retry = FALSE; 13720 13721 if (*did_start_fop == TRUE) { 13722 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13723 needrecov); 13724 *did_start_fop = FALSE; 13725 } 13726 13727 if (retry == TRUE) { 13728 nfs_argop4 *argop; 13729 13730 argop = (*argspp)->array; 13731 ASSERT((*argspp)->array_len == 2); 13732 13733 if (argop[1].argop == OP_LOCK) 13734 nfs4args_lock_free(&argop[1]); 13735 else if (argop[1].argop == OP_LOCKT) 13736 nfs4args_lockt_free(&argop[1]); 13737 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13738 if (!ep->error) 13739 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13740 *respp = NULL; 13741 *argspp = NULL; 13742 } 13743 13744 if (lop != NULL) { 13745 nfs4_end_lock_seqid_sync(lop); 13746 lock_owner_rele(lop); 13747 } 13748 13749 *lopp = NULL; 13750 13751 /* need to free up the reference on osp for lock args */ 13752 if (osp != NULL) { 13753 open_stream_rele(osp, rp); 13754 *ospp = NULL; 13755 } 13756 13757 /* need to free up the reference on oop for lock args */ 13758 if (oop != NULL) { 13759 nfs4_end_open_seqid_sync(oop); 13760 open_owner_rele(oop); 13761 *oopp = NULL; 13762 } 13763 13764 return (retry); 13765 } 13766 13767 /* 13768 * Handles the successful reply from the server for nfs4frlock. 13769 */ 13770 static void 13771 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13772 vnode_t *vp, int flag, u_offset_t offset, 13773 nfs4_lost_rqst_t *resend_rqstp) 13774 { 13775 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13776 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13777 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13778 if (ctype == NFS4_LCK_CTYPE_NORM) { 13779 flk->l_pid = ttoproc(curthread)->p_pid; 13780 /* 13781 * We do not register lost locks locally in 13782 * the 'resend' case since the user/application 13783 * doesn't think we have the lock. 13784 */ 13785 ASSERT(!resend_rqstp); 13786 nfs4_register_lock_locally(vp, flk, flag, offset); 13787 } 13788 } 13789 } 13790 13791 /* 13792 * Handle the DENIED reply from the server for nfs4frlock. 13793 * Returns TRUE if we should retry the request; FALSE otherwise. 13794 * 13795 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13796 * COMPOUND4 args/res for calls that need to retry. Can also 13797 * drop and regrab the r_lkserlock. 13798 */ 13799 static bool_t 13800 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13801 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13802 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13803 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13804 nfs4_recov_state_t *recov_statep, int needrecov, 13805 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13806 clock_t *tick_delayp, short *whencep, int *errorp, 13807 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13808 bool_t *skip_get_err) 13809 { 13810 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13811 13812 if (lock_args) { 13813 nfs4_open_owner_t *oop = *oopp; 13814 nfs4_open_stream_t *osp = *ospp; 13815 nfs4_lock_owner_t *lop = *lopp; 13816 int intr; 13817 13818 /* 13819 * Blocking lock needs to sleep and retry from the request. 13820 * 13821 * Do not block and wait for 'resend' or 'reinstate' 13822 * lock requests, just return the error. 13823 * 13824 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 
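	 * (A denied reclaim therefore never blocks here.)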
13825 */ 13826 if (cmd == F_SETLKW) { 13827 rnode4_t *rp = VTOR4(vp); 13828 nfs_argop4 *argop = (*argspp)->array; 13829 13830 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13831 13832 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13833 recov_statep, needrecov); 13834 *did_start_fop = FALSE; 13835 ASSERT((*argspp)->array_len == 2); 13836 if (argop[1].argop == OP_LOCK) 13837 nfs4args_lock_free(&argop[1]); 13838 else if (argop[1].argop == OP_LOCKT) 13839 nfs4args_lockt_free(&argop[1]); 13840 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13841 if (*respp) 13842 (void) xdr_free(xdr_COMPOUND4res_clnt, 13843 (caddr_t)*respp); 13844 *argspp = NULL; 13845 *respp = NULL; 13846 nfs4_end_lock_seqid_sync(lop); 13847 lock_owner_rele(lop); 13848 *lopp = NULL; 13849 if (osp != NULL) { 13850 open_stream_rele(osp, rp); 13851 *ospp = NULL; 13852 } 13853 if (oop != NULL) { 13854 nfs4_end_open_seqid_sync(oop); 13855 open_owner_rele(oop); 13856 *oopp = NULL; 13857 } 13858 13859 nfs_rw_exit(&rp->r_lkserlock); 13860 13861 intr = nfs4_block_and_wait(tick_delayp, rp); 13862 13863 if (intr) { 13864 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13865 RW_WRITER, FALSE); 13866 *errorp = EINTR; 13867 return (FALSE); 13868 } 13869 13870 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13871 RW_WRITER, FALSE); 13872 13873 /* 13874 * Make sure we are still safe to lock with 13875 * regards to mmapping. 13876 */ 13877 if (!nfs4_safelock(vp, flk, cr)) { 13878 *errorp = EAGAIN; 13879 return (FALSE); 13880 } 13881 13882 return (TRUE); 13883 } 13884 if (ctype == NFS4_LCK_CTYPE_NORM) 13885 *errorp = EAGAIN; 13886 *skip_get_err = TRUE; 13887 flk->l_whence = 0; 13888 *whencep = 0; 13889 return (FALSE); 13890 } else if (lockt_args) { 13891 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13892 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13893 13894 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13895 flk, lockt_args); 13896 13897 /* according to NLM code */ 13898 *errorp = 0; 13899 *whencep = 0; 13900 *skip_get_err = TRUE; 13901 return (FALSE); 13902 } 13903 return (FALSE); 13904 } 13905 13906 /* 13907 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13908 */ 13909 static void 13910 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13911 { 13912 switch (resp->status) { 13913 case NFS4ERR_ACCESS: 13914 case NFS4ERR_ADMIN_REVOKED: 13915 case NFS4ERR_BADHANDLE: 13916 case NFS4ERR_BAD_RANGE: 13917 case NFS4ERR_BAD_SEQID: 13918 case NFS4ERR_BAD_STATEID: 13919 case NFS4ERR_BADXDR: 13920 case NFS4ERR_DEADLOCK: 13921 case NFS4ERR_DELAY: 13922 case NFS4ERR_EXPIRED: 13923 case NFS4ERR_FHEXPIRED: 13924 case NFS4ERR_GRACE: 13925 case NFS4ERR_INVAL: 13926 case NFS4ERR_ISDIR: 13927 case NFS4ERR_LEASE_MOVED: 13928 case NFS4ERR_LOCK_NOTSUPP: 13929 case NFS4ERR_LOCK_RANGE: 13930 case NFS4ERR_MOVED: 13931 case NFS4ERR_NOFILEHANDLE: 13932 case NFS4ERR_NO_GRACE: 13933 case NFS4ERR_OLD_STATEID: 13934 case NFS4ERR_OPENMODE: 13935 case NFS4ERR_RECLAIM_BAD: 13936 case NFS4ERR_RECLAIM_CONFLICT: 13937 case NFS4ERR_RESOURCE: 13938 case NFS4ERR_SERVERFAULT: 13939 case NFS4ERR_STALE: 13940 case NFS4ERR_STALE_CLIENTID: 13941 case NFS4ERR_STALE_STATEID: 13942 return; 13943 default: 13944 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13945 "nfs4frlock_results_default: got unrecognizable " 13946 "res.status %d", resp->status)); 13947 *errorp = NFS4ERR_INVAL; 13948 } 13949 } 13950 13951 /* 13952 * The lock request was successful, so update the client's state. 
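 *
 * Three cases are handled below: a LOCK reply updates the lock
 * stateid (and, for a resent lost request, may reinstitute local
 * lock state); a LOCKU reply updates the stateid only; a successful
 * LOCKT means no conflicting lock was found, so l_type is set to
 * F_UNLCK, matching what fcntl(2) promises for F_GETLK. An
 * illustrative userland view (hypothetical sketch, not part of this
 * file):
 *
 *	struct flock64 fl;
 *	fl.l_type = F_WRLCK;
 *	fl.l_whence = 0;
 *	fl.l_start = fl.l_len = 0;
 *	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type == F_UNLCK)
 *		... no conflicting lock exists ...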
13953 */
13954 static void
13955 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13956 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13957 vnode_t *vp, flock64_t *flk, cred_t *cr,
13958 nfs4_lost_rqst_t *resend_rqstp)
13959 {
13960 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13961
13962 if (lock_args) {
13963 LOCK4res *lock_res;
13964
13965 lock_res = &resop->nfs_resop4_u.oplock;
13966 /* update the stateid with server's response */
13967
13968 if (lock_args->locker.new_lock_owner == TRUE) {
13969 mutex_enter(&lop->lo_lock);
13970 lop->lo_just_created = NFS4_PERM_CREATED;
13971 mutex_exit(&lop->lo_lock);
13972 }
13973
13974 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13975
13976 /*
13977 * If the lock was the result of resending a lost
13978 * request, we've synched up the stateid and seqid
13979 * with the server, but now the server might be out of sync
13980 * with what the application thinks it has for locks.
13981 * Clean that up here. It's unclear whether we should do
13982 * this even if the filesystem has been forcibly unmounted.
13983 * For most servers, it's probably wasted effort, but
13984 * RFC3530 lets servers require that unlocks exactly match
13985 * the locks that are held.
13986 */
13987 if (resend_rqstp != NULL &&
13988 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13989 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13990 } else {
13991 flk->l_whence = 0;
13992 }
13993 } else if (locku_args) {
13994 LOCKU4res *locku_res;
13995
13996 locku_res = &resop->nfs_resop4_u.oplocku;
13997
13998 /* Update the stateid with the server's response */
13999 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
14000 } else if (lockt_args) {
14001 /* Switch the lock type to express success, see fcntl */
14002 flk->l_type = F_UNLCK;
14003 flk->l_whence = 0;
14004 }
14005 }
14006
14007 /*
14008 * Do final cleanup before exiting nfs4frlock.
14009 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14010 * COMPOUND4 args/res for calls that haven't already.
14011 */
14012 static void
14013 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14014 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14015 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14016 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14017 short whence, u_offset_t offset, struct lm_sysid *ls,
14018 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14019 bool_t did_start_fop, bool_t skip_get_err,
14020 cred_t *cred_otw, cred_t *cred)
14021 {
14022 mntinfo4_t *mi = VTOMI4(vp);
14023 rnode4_t *rp = VTOR4(vp);
14024 int error = *errorp;
14025 nfs_argop4 *argop;
14026 int do_flush_pages = 0;
14027
14028 ASSERT(nfs_zone() == mi->mi_zone);
14029 /*
14030 * The client recovery code wants the raw status information,
14031 * so don't map the NFS status code to an errno value for
14032 * non-normal call types.
14033 */
14034 if (ctype == NFS4_LCK_CTYPE_NORM) {
14035 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14036 *errorp = geterrno4(resp->status);
14037 if (did_start_fop == TRUE)
14038 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14039 needrecov);
14040
14041 /*
14042 * We've established a new lock on the server, so invalidate
14043 * the pages associated with the vnode to get the most up to
14044 * date pages from the server after acquiring the lock. We
14045 * want to be sure that the read operation gets the newest data.
14046 * N.B.
14047 * We used to do this in nfs4frlock_results_ok but that doesn't
14048 * work since VOP_PUTPAGE can call nfs4_commit which calls
14049 * nfs4_start_fop. We flush the pages below after calling
14050 * nfs4_end_fop above.
14051 * The flush of the page cache must be done after
14052 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14053 */
14054 if (!error && resp && resp->status == NFS4_OK)
14055 do_flush_pages = 1;
14056 }
14057 if (argsp) {
14058 ASSERT(argsp->array_len == 2);
14059 argop = argsp->array;
14060 if (argop[1].argop == OP_LOCK)
14061 nfs4args_lock_free(&argop[1]);
14062 else if (argop[1].argop == OP_LOCKT)
14063 nfs4args_lockt_free(&argop[1]);
14064 kmem_free(argop, 2 * sizeof (nfs_argop4));
14065 if (resp)
14066 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14067 }
14068
14069 /* free the reference on the lock owner */
14070 if (lop != NULL) {
14071 nfs4_end_lock_seqid_sync(lop);
14072 lock_owner_rele(lop);
14073 }
14074
14075 /* need to free up the reference on osp for lock args */
14076 if (osp != NULL)
14077 open_stream_rele(osp, rp);
14078
14079 /* need to free up the reference on oop for lock args */
14080 if (oop != NULL) {
14081 nfs4_end_open_seqid_sync(oop);
14082 open_owner_rele(oop);
14083 }
14084
14085 if (do_flush_pages)
14086 nfs4_flush_pages(vp, cred);
14087
14088 (void) convoff(vp, flk, whence, offset);
14089
14090 lm_rel_sysid(ls);
14091
14092 /*
14093 * Record debug information in the event we get EINVAL.
14094 */
14095 mutex_enter(&mi->mi_lock);
14096 if (*errorp == EINVAL && (lock_args || locku_args) &&
14097 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14098 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14099 zcmn_err(getzoneid(), CE_NOTE,
14100 "%s operation failed with "
14101 "EINVAL probably since the server, %s,"
14102 " doesn't support POSIX style locking",
14103 lock_args ? "LOCK" : "LOCKU",
14104 mi->mi_curr_serv->sv_hostname);
14105 mi->mi_flags |= MI4_LOCK_DEBUG;
14106 }
14107 }
14108 mutex_exit(&mi->mi_lock);
14109
14110 if (cred_otw)
14111 crfree(cred_otw);
14112 }
14113
14114 /*
14115 * This calls the server and the local locking code.
14116 *
14117 * Client locks are registered locally by OR'ing the sysid with
14118 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14119 * We need to distinguish between the two to avoid collision in case one
14120 * machine is used as both client and server.
14121 *
14122 * Blocking lock requests will retry forever until the lock is
14123 * acquired.
14124 *
14125 * The ctype is defined as follows:
14126 * NFS4_LCK_CTYPE_NORM: normal lock request.
14127 *
14128 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14129 * recovery, get the pid from flk instead of curproc, and don't reregister
14130 * the lock locally.
14131 *
14132 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14133 * that we will use the information passed in via resend_rqstp to set up the
14134 * lock/locku request. This resend is the exact same request as the 'lost
14135 * lock', and is initiated by the recovery framework. A successful resend
14136 * request can initiate one or more reinstate requests.
14137 *
14138 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14139 * does not trigger additional reinstate requests. This lock call type is
14140 * used to set the v4 server's locking state back to match the client's
14141 * local locking state in the event of a received 'lost lock'.
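 *
 * In summary (an illustrative recap of the call types above, not
 * additional semantics):
 *
 *	ctype				pid taken from	registers locally?
 *	NFS4_LCK_CTYPE_NORM		curproc		yes
 *	NFS4_LCK_CTYPE_RECLAIM		flk->l_pid	no
 *	NFS4_LCK_CTYPE_RESEND		flk->l_pid	no (may reinstate)
 *	NFS4_LCK_CTYPE_REINSTATE	flk->l_pid	no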
14142 *
14143 * Errors are returned via the nfs4_error_t parameter.
14144 */
14145 void
14146 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
14147 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
14148 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
14149 {
14150 COMPOUND4args_clnt args, *argsp = NULL;
14151 COMPOUND4res_clnt res, *resp = NULL;
14152 nfs_argop4 *argop;
14153 nfs_resop4 *resop;
14154 rnode4_t *rp;
14155 int doqueue = 1;
14156 clock_t tick_delay; /* delay in clock ticks */
14157 struct lm_sysid *ls;
14158 LOCK4args *lock_args = NULL;
14159 LOCKU4args *locku_args = NULL;
14160 LOCKT4args *lockt_args = NULL;
14161 nfs4_open_owner_t *oop = NULL;
14162 nfs4_open_stream_t *osp = NULL;
14163 nfs4_lock_owner_t *lop = NULL;
14164 bool_t needrecov = FALSE;
14165 nfs4_recov_state_t recov_state;
14166 short whence;
14167 nfs4_op_hint_t op_hint;
14168 nfs4_lost_rqst_t lost_rqst;
14169 bool_t retry = FALSE;
14170 bool_t did_start_fop = FALSE;
14171 bool_t skip_get_err = FALSE;
14172 cred_t *cred_otw = NULL;
14173 bool_t recovonly; /* just queue request */
14174 int frc_no_reclaim = 0;
14175 #ifdef DEBUG
14176 char *name;
14177 #endif
14178
14179 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14180
14181 #ifdef DEBUG
14182 name = fn_name(VTOSV(vp)->sv_name);
14183 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
14184 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
14185 "length %"PRIu64", pid %d, sysid %d, call type %s, "
14186 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
14187 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
14188 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
14189 resend_rqstp ? "TRUE" : "FALSE"));
14190 kmem_free(name, MAXNAMELEN);
14191 #endif
14192
14193 nfs4_error_zinit(ep);
14194 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
14195 if (ep->error)
14196 return;
14197 ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
14198 if (ep->error)
14199 return;
14200 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
14201 vp, cr, &cred_otw);
14202
14203 recov_retry:
14204 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
14205 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
14206 rp = VTOR4(vp);
14207
14208 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
14209 &did_start_fop, &recovonly);
14210
14211 if (ep->error)
14212 goto out;
14213
14214 if (recovonly) {
14215 /*
14216 * Leave the request for the recovery system to deal with.
14217 */
14218 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
14219 ASSERT(cmd != F_GETLK);
14220 ASSERT(flk->l_type == F_UNLCK);
14221
14222 nfs4_error_init(ep, EINTR);
14223 needrecov = TRUE;
14224 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14225 if (lop != NULL) {
14226 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
14227 NULL, NULL, lop, flk, &lost_rqst, cr, vp);
14228 (void) nfs4_start_recovery(ep,
14229 VTOMI4(vp), vp, NULL, NULL,
14230 (lost_rqst.lr_op == OP_LOCK ||
14231 lost_rqst.lr_op == OP_LOCKU) ?
14232 &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
14233 lock_owner_rele(lop);
14234 lop = NULL;
14235 }
14236 flk->l_pid = curproc->p_pid;
14237 nfs4_register_lock_locally(vp, flk, flag, offset);
14238 goto out;
14239 }
14240
14241 /* putfh target fh */
14242 argop[0].argop = OP_CPUTFH;
14243 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
14244
14245 /*
14246 * Set up the over-the-wire arguments and get references to the
14247 * open owner, etc.
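 *
 * Whatever the call type, the compound built here always has exactly
 * two operations (the array_len == 2 assertions elsewhere rely on
 * this):
 *
 *	argop[0]: OP_CPUTFH	(the locked file's filehandle)
 *	argop[1]: OP_LOCK, OP_LOCKU, or OP_LOCKT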
14248 */
14249
14250 if (ctype == NFS4_LCK_CTYPE_RESEND ||
14251 ctype == NFS4_LCK_CTYPE_REINSTATE) {
14252 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
14253 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
14254 } else {
14255 bool_t go_otw = TRUE;
14256
14257 ASSERT(resend_rqstp == NULL);
14258
14259 switch (cmd) {
14260 case F_GETLK:
14261 case F_O_GETLK:
14262 nfs4frlock_setup_lockt_args(ctype, &argop[1],
14263 &lockt_args, argsp, flk, rp);
14264 break;
14265 case F_SETLKW:
14266 case F_SETLK:
14267 if (flk->l_type == F_UNLCK)
14268 nfs4frlock_setup_locku_args(ctype,
14269 &argop[1], &locku_args, flk,
14270 &lop, ep, argsp,
14271 vp, flag, offset, cr,
14272 &skip_get_err, &go_otw);
14273 else
14274 nfs4frlock_setup_lock_args(ctype,
14275 &lock_args, &oop, &osp, &lop, &argop[1],
14276 argsp, flk, cmd, vp, cr, ep);
14277
14278 if (ep->error)
14279 goto out;
14280
14281 switch (ep->stat) {
14282 case NFS4_OK:
14283 break;
14284 case NFS4ERR_DELAY:
14285 /* recov thread never gets this error */
14286 ASSERT(resend_rqstp == NULL);
14287 ASSERT(did_start_fop);
14288
14289 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
14290 &recov_state, TRUE);
14291 did_start_fop = FALSE;
14292 if (argop[1].argop == OP_LOCK)
14293 nfs4args_lock_free(&argop[1]);
14294 else if (argop[1].argop == OP_LOCKT)
14295 nfs4args_lockt_free(&argop[1]);
14296 kmem_free(argop, 2 * sizeof (nfs_argop4));
14297 argsp = NULL;
14298 goto recov_retry;
14299 default:
14300 ep->error = EIO;
14301 goto out;
14302 }
14303 break;
14304 default:
14305 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14306 "nfs4_frlock: invalid cmd %d", cmd));
14307 ep->error = EINVAL;
14308 goto out;
14309 }
14310
14311 if (!go_otw)
14312 goto out;
14313 }
14314
14315 /* XXX should we use the local reclock as a cache? */
14316 /*
14317 * Unregister the lock with the local locking code before
14318 * contacting the server. This avoids a potential race where
14319 * another process gets notified that it has been granted a lock
14320 * before we can unregister ourselves locally.
14321 */
14322 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
14323 if (ctype == NFS4_LCK_CTYPE_NORM)
14324 flk->l_pid = ttoproc(curthread)->p_pid;
14325 nfs4_register_lock_locally(vp, flk, flag, offset);
14326 }
14327
14328 /*
14329 * Send the server the lock request. Continually loop with a delay
14330 * if we get NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
14331 */
14332 resp = &res;
14333
14334 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
14335 (CE_NOTE,
14336 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
14337 rnode4info(rp)));
14338
14339 if (lock_args && frc_no_reclaim) {
14340 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14341 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14342 "nfs4frlock: frc_no_reclaim: clearing reclaim"));
14343 lock_args->reclaim = FALSE;
14344 if (did_reclaimp)
14345 *did_reclaimp = 0;
14346 }
14347
14348 /*
14349 * Do the OTW call.
14350 */
14351 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);
14352
14353 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14354 "nfs4frlock: error %d, status %d", ep->error, resp->status));
14355
14356 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
14357 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14358 "nfs4frlock: needrecov %d", needrecov));
14359
14360 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
14361 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
14362 args.ctag);
14363
14364 /*
14365 * Check if one of these mutually exclusive error cases has
14366 * happened:
14367 * need to swap credentials due to access error
14368 * recovery is needed
14369 * different error (only known case is missing Kerberos ticket)
14370 */
14371
14372 if ((ep->error == EACCES ||
14373 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
14374 cred_otw != cr) {
14375 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
14376 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
14377 cr, &cred_otw);
14378 goto recov_retry;
14379 }
14380
14381 if (needrecov) {
14382 /*
14383 * LOCKT requests don't need to recover from lost
14384 * requests since they don't create/modify state.
14385 */
14386 if ((ep->error == EINTR ||
14387 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
14388 lockt_args)
14389 goto out;
14390 /*
14391 * Do not attempt recovery for requests initiated by
14392 * the recovery framework. Let the framework redrive them.
14393 */
14394 if (ctype != NFS4_LCK_CTYPE_NORM)
14395 goto out;
14396 else {
14397 ASSERT(resend_rqstp == NULL);
14398 }
14399
14400 nfs4frlock_save_lost_rqst(ctype, ep->error,
14401 flk_to_locktype(cmd, flk->l_type),
14402 oop, osp, lop, flk, &lost_rqst, cred_otw, vp);
14403
14404 retry = nfs4frlock_recovery(needrecov, ep, &argsp,
14405 &resp, lock_args, locku_args, &oop, &osp, &lop,
14406 rp, vp, &recov_state, op_hint, &did_start_fop,
14407 cmd != F_GETLK ? &lost_rqst : NULL, flk);
14408
14409 if (retry) {
14410 ASSERT(oop == NULL);
14411 ASSERT(osp == NULL);
14412 ASSERT(lop == NULL);
14413 goto recov_retry;
14414 }
14415 goto out;
14416 }
14417
14418 /*
14419 * Bail out if we have reached this point with ep->error set. This can
14420 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr),
14421 * e.g. if the Kerberos ticket has expired or has been
14422 * destroyed.
14423 */
14424 if (ep->error != 0)
14425 goto out;
14426
14427 /*
14428 * Process the reply.
14429 */
14430 switch (resp->status) {
14431 case NFS4_OK:
14432 resop = &resp->array[1];
14433 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
14434 resend_rqstp);
14435 /*
14436 * Have a successful lock operation, now update state.
14437 */
14438 nfs4frlock_update_state(lock_args, locku_args, lockt_args,
14439 resop, lop, vp, flk, cr, resend_rqstp);
14440 break;
14441
14442 case NFS4ERR_DENIED:
14443 resop = &resp->array[1];
14444 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
14445 &oop, &osp, &lop, cmd, vp, flk, op_hint,
14446 &recov_state, needrecov, &argsp, &resp,
14447 &tick_delay, &whence, &ep->error, resop, cr,
14448 &did_start_fop, &skip_get_err);
14449
14450 if (retry) {
14451 ASSERT(oop == NULL);
14452 ASSERT(osp == NULL);
14453 ASSERT(lop == NULL);
14454 goto recov_retry;
14455 }
14456 break;
14457 /*
14458 * If the server won't let us reclaim, fall back to trying to lock
14459 * the file from scratch. Code elsewhere will check the changeinfo
14460 * to ensure the file hasn't been changed.
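 *
 * Concretely, the fallback below (restating the code, not new
 * logic) is:
 *
 *	frc_no_reclaim = 1;
 *	needrecov = 0;
 *	(void) nfs4frlock_recovery(...);	cleanup only
 *	goto recov_retry;			lock_args->reclaim is
 *						cleared before the resend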
14461 */ 14462 case NFS4ERR_NO_GRACE: 14463 if (lock_args && lock_args->reclaim == TRUE) { 14464 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14465 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14466 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14467 frc_no_reclaim = 1; 14468 /* clean up before retrying */ 14469 needrecov = 0; 14470 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14471 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14472 &recov_state, op_hint, &did_start_fop, NULL, flk); 14473 goto recov_retry; 14474 } 14475 /* FALLTHROUGH */ 14476 14477 default: 14478 nfs4frlock_results_default(resp, &ep->error); 14479 break; 14480 } 14481 out: 14482 /* 14483 * Process and cleanup from error. Make interrupted unlock 14484 * requests look successful, since they will be handled by the 14485 * client recovery code. 14486 */ 14487 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14488 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14489 lock_args, locku_args, did_start_fop, 14490 skip_get_err, cred_otw, cr); 14491 14492 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14493 (cmd == F_SETLK || cmd == F_SETLKW)) 14494 ep->error = 0; 14495 } 14496 14497 /* 14498 * nfs4_safelock: 14499 * 14500 * Return non-zero if the given lock request can be handled without 14501 * violating the constraints on concurrent mapping and locking. 14502 */ 14503 14504 static int 14505 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14506 { 14507 rnode4_t *rp = VTOR4(vp); 14508 struct vattr va; 14509 int error; 14510 14511 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14512 ASSERT(rp->r_mapcnt >= 0); 14513 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14514 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14515 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock", 14516 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14517 14518 if (rp->r_mapcnt == 0) 14519 return (1); /* always safe if not mapped */ 14520 14521 /* 14522 * If the file is already mapped and there are locks, then they 14523 * should be all safe locks. So adding or removing a lock is safe 14524 * as long as the new request is safe (i.e., whole-file, meaning 14525 * length and starting offset are both zero). 14526 */ 14527 14528 if (bfp->l_start != 0 || bfp->l_len != 0) { 14529 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14530 "cannot lock a memory mapped file unless locking the " 14531 "entire file: start %"PRIx64", len %"PRIx64, 14532 bfp->l_start, bfp->l_len)); 14533 return (0); 14534 } 14535 14536 /* mandatory locking and mapping don't mix */ 14537 va.va_mask = AT_MODE; 14538 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 14539 if (error != 0) { 14540 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14541 "getattr error %d", error)); 14542 return (0); /* treat errors conservatively */ 14543 } 14544 if (MANDLOCK(vp, va.va_mode)) { 14545 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14546 "cannot mandatory lock and mmap a file")); 14547 return (0); 14548 } 14549 14550 return (1); 14551 } 14552 14553 14554 /* 14555 * Register the lock locally within Solaris. 14556 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14557 * recording locks locally. 14558 * 14559 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14560 * are registered locally. 
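 *
 * A minimal sketch of the convention (illustrative only):
 *
 *	client-held lock:	flk->l_sysid = sysid | LM_SYSID_CLIENT;
 *	server-held lock:	flk->l_sysid = sysid;
 *
 * so a loopback configuration, where one host is both client and
 * server for the same filesystem, keeps the two sets of locks under
 * distinct sysids.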
14561 */ 14562 void 14563 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14564 u_offset_t offset) 14565 { 14566 int oldsysid; 14567 int error; 14568 #ifdef DEBUG 14569 char *name; 14570 #endif 14571 14572 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14573 14574 #ifdef DEBUG 14575 name = fn_name(VTOSV(vp)->sv_name); 14576 NFS4_DEBUG(nfs4_client_lock_debug, 14577 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14578 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14579 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14580 flk->l_sysid)); 14581 kmem_free(name, MAXNAMELEN); 14582 #endif 14583 14584 /* register the lock with local locking */ 14585 oldsysid = flk->l_sysid; 14586 flk->l_sysid |= LM_SYSID_CLIENT; 14587 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14588 #ifdef DEBUG 14589 if (error != 0) { 14590 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14591 "nfs4_register_lock_locally: could not register with" 14592 " local locking")); 14593 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14594 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14595 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14596 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14597 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14598 flk->l_type, flk->l_start, flk->l_len)); 14599 (void) reclock(vp, flk, 0, flag, offset, NULL); 14600 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14601 "blocked by pid %d sysid 0x%x type %d " 14602 "off 0x%" PRIx64 " len 0x%" PRIx64, 14603 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14604 flk->l_len)); 14605 } 14606 #endif 14607 flk->l_sysid = oldsysid; 14608 } 14609 14610 /* 14611 * nfs4_lockrelease: 14612 * 14613 * Release any locks on the given vnode that are held by the current 14614 * process. Also removes the lock owner (if one exists) from the rnode's 14615 * list. 14616 */ 14617 static int 14618 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14619 { 14620 flock64_t ld; 14621 int ret, error; 14622 rnode4_t *rp; 14623 nfs4_lock_owner_t *lop; 14624 nfs4_recov_state_t recov_state; 14625 mntinfo4_t *mi; 14626 bool_t possible_orphan = FALSE; 14627 bool_t recovonly; 14628 14629 ASSERT((uintptr_t)vp > KERNELBASE); 14630 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14631 14632 rp = VTOR4(vp); 14633 mi = VTOMI4(vp); 14634 14635 /* 14636 * If we have not locked anything then we can 14637 * just return since we have no work to do. 14638 */ 14639 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14640 return (0); 14641 } 14642 14643 /* 14644 * We need to comprehend that another thread may 14645 * kick off recovery and the lock_owner we have stashed 14646 * in lop might be invalid so we should NOT cache it 14647 * locally! 14648 */ 14649 recov_state.rs_flags = 0; 14650 recov_state.rs_num_retry_despite_err = 0; 14651 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14652 &recovonly); 14653 if (error) { 14654 mutex_enter(&rp->r_statelock); 14655 rp->r_flags |= R4LODANGLERS; 14656 mutex_exit(&rp->r_statelock); 14657 return (error); 14658 } 14659 14660 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14661 14662 /* 14663 * Check if the lock owner might have a lock (request was sent but 14664 * no response was received). Also check if there are any remote 14665 * locks on the file. (In theory we shouldn't have to make this 14666 * second check if there's no lock owner, but for now we'll be 14667 * conservative and do it anyway.) 
If either condition is true,
14668 * send an unlock for the entire file to the server.
14669 *
14670 * Note that no explicit synchronization is needed here. At worst,
14671 * flk_has_remote_locks() will return a false positive, in which case
14672 * the unlock call wastes time but doesn't harm correctness.
14673 */
14674
14675 if (lop) {
14676 mutex_enter(&lop->lo_lock);
14677 possible_orphan = lop->lo_pending_rqsts;
14678 mutex_exit(&lop->lo_lock);
14679 lock_owner_rele(lop);
14680 }
14681
14682 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14683
14684 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14685 "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
14686 "lop %p.", possible_orphan, flk_has_remote_locks(vp),
14687 (void *)lop));
14688
14689 if (possible_orphan || flk_has_remote_locks(vp)) {
14690 ld.l_type = F_UNLCK; /* set to unlock entire file */
14691 ld.l_whence = 0; /* unlock from start of file */
14692 ld.l_start = 0;
14693 ld.l_len = 0; /* do entire file */
14694
14695 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
14696 cr, NULL);
14697
14698 if (ret != 0) {
14699 /*
14700 * If VOP_FRLOCK fails, make sure we unregister
14701 * local locks before we continue.
14702 */
14703 ld.l_pid = ttoproc(curthread)->p_pid;
14704 nfs4_register_lock_locally(vp, &ld, flag, offset);
14705 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14706 "nfs4_lockrelease: lock release error on vp"
14707 " %p: error %d.\n", (void *)vp, ret));
14708 }
14709 }
14710
14711 recov_state.rs_flags = 0;
14712 recov_state.rs_num_retry_despite_err = 0;
14713 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14714 &recovonly);
14715 if (error) {
14716 mutex_enter(&rp->r_statelock);
14717 rp->r_flags |= R4LODANGLERS;
14718 mutex_exit(&rp->r_statelock);
14719 return (error);
14720 }
14721
14722 /*
14723 * So, here we're going to need to retrieve the lock-owner
14724 * again (in case recovery has done a switch-a-roo) and
14725 * remove it because we can.
14726 */
14727 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14728
14729 if (lop) {
14730 nfs4_rnode_remove_lock_owner(rp, lop);
14731 lock_owner_rele(lop);
14732 }
14733
14734 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14735 return (0);
14736 }
14737
14738 /*
14739 * Wait for 'tick_delay' clock ticks.
14740 * Implement exponential backoff until we hit the lease_time of this nfs4_server.
14741 * NOTE: lock_lease_time is in seconds.
14742 *
14743 * XXX For future improvements, should implement a waiting queue scheme.
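 *
 * With the current scheme the delay roughly follows
 * delay = MIN(2 * delay, lease_time); e.g. (illustrative numbers
 * only) an initial delay of 1 second and a lease_time of 90 seconds
 * produce waits of about 1, 2, 4, ..., 64, 90, 90, ... seconds.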
14744 */
14745 static int
14746 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14747 {
14748 long milliseconds_delay;
14749 time_t lock_lease_time;
14750
14751 /* wait tick_delay clock ticks or until interrupted by a signal */
14752 if (delay_sig(*tick_delay)) {
14753 return (EINTR);
14754 }
14755 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14756 "reissue the lock request: blocked for %ld clock ticks: %ld "
14757 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14758
14759 /* get the lease time */
14760 lock_lease_time = r2lease_time(rp);
14761
14762 /* drv_hztousec converts ticks to microseconds */
14763 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14764 if (milliseconds_delay < lock_lease_time * 1000) {
14765 *tick_delay = 2 * *tick_delay;
14766 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14767 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14768 }
14769 return (0);
14770 }
14771
14772
14773 void
14774 nfs4_vnops_init(void)
14775 {
14776 }
14777
14778 void
14779 nfs4_vnops_fini(void)
14780 {
14781 }
14782
14783 /*
14784 * Return a reference to the directory (parent) vnode for a given vnode,
14785 * using the saved pathname information and the directory file handle. The
14786 * caller is responsible for disposing of the reference.
14787 * Returns zero or an errno value.
14788 *
14789 * Caller should set need_start_op to FALSE if it is the recovery
14790 * thread, or if a start_fop has already been done. Otherwise, TRUE.
14791 */
14792 int
14793 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
14794 {
14795 svnode_t *svnp;
14796 vnode_t *dvp = NULL;
14797 servinfo4_t *svp;
14798 nfs4_fname_t *mfname;
14799 int error;
14800
14801 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14802
14803 if (vp->v_flag & VROOT) {
14804 nfs4_sharedfh_t *sfh;
14805 nfs_fh4 fh;
14806 mntinfo4_t *mi;
14807
14808 ASSERT(vp->v_type == VREG);
14809
14810 mi = VTOMI4(vp);
14811 svp = mi->mi_curr_serv;
14812 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14813 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
14814 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
14815 sfh = sfh4_get(&fh, VTOMI4(vp));
14816 nfs_rw_exit(&svp->sv_lock);
14817 mfname = mi->mi_fname;
14818 fn_hold(mfname);
14819 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
14820 sfh4_rele(&sfh);
14821
14822 if (dvp->v_type == VNON)
14823 dvp->v_type = VDIR;
14824 *dvpp = dvp;
14825 return (0);
14826 }
14827
14828 svnp = VTOSV(vp);
14829
14830 if (svnp == NULL) {
14831 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14832 "shadow node is NULL"));
14833 return (EINVAL);
14834 }
14835
14836 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
14837 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14838 "shadow node name or dfh val == NULL"));
14839 return (EINVAL);
14840 }
14841
14842 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
14843 (int)need_start_op);
14844 if (error != 0) {
14845 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14846 "nfs4_make_dotdot returned %d", error));
14847 return (error);
14848 }
14849 if (!dvp) {
14850 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14851 "nfs4_make_dotdot returned a NULL dvp"));
14852 return (EIO);
14853 }
14854 if (dvp->v_type == VNON)
14855 dvp->v_type = VDIR;
14856 ASSERT(dvp->v_type == VDIR);
14857 if (VTOR4(vp)->r_flags & R4ISXATTR) {
14858 mutex_enter(&dvp->v_lock);
14859 dvp->v_flag |= V_XATTRDIR;
14860 mutex_exit(&dvp->v_lock);
14861 }
14862 *dvpp = dvp;
14863
return (0);
14864 }
14865
14866 /*
14867 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14868 * length that fnamep can accept, including the trailing null.
14869 * Returns 0 if okay, returns an errno value if there was a problem.
14870 */
14871
14872 int
14873 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14874 {
14875 char *fn;
14876 int err = 0;
14877 servinfo4_t *svp;
14878 svnode_t *shvp;
14879
14880 /*
14881 * If the file being opened has VROOT set, then this is
14882 * a "file" mount. sv_name will not be interesting, so
14883 * go back to the servinfo4 to get the original mount
14884 * path and strip off all but the final edge. Otherwise
14885 * just return the name from the shadow vnode.
14886 */
14887
14888 if (vp->v_flag & VROOT) {
14889
14890 svp = VTOMI4(vp)->mi_curr_serv;
14891 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14892
14893 fn = strrchr(svp->sv_path, '/');
14894 if (fn == NULL)
14895 err = EINVAL;
14896 else
14897 fn++;
14898 } else {
14899 shvp = VTOSV(vp);
14900 fn = fn_name(shvp->sv_name);
14901 }
14902
14903 if (err == 0)
14904 if (strlen(fn) < maxlen)
14905 (void) strcpy(fnamep, fn);
14906 else
14907 err = ENAMETOOLONG;
14908
14909 if (vp->v_flag & VROOT)
14910 nfs_rw_exit(&svp->sv_lock);
14911 else
14912 kmem_free(fn, MAXNAMELEN);
14913
14914 return (err);
14915 }
14916
14917 /*
14918 * Bookkeeping for a close that doesn't need to go over the wire.
14919 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14920 * it is left at 1.
14921 */
14922 void
14923 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
14924 {
14925 rnode4_t *rp;
14926 mntinfo4_t *mi;
14927
14928 mi = VTOMI4(vp);
14929 rp = VTOR4(vp);
14930
14931 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
14932 "rp=%p osp=%p", (void *)rp, (void *)osp));
14933 ASSERT(nfs_zone() == mi->mi_zone);
14934 ASSERT(mutex_owned(&osp->os_sync_lock));
14935 ASSERT(*have_lockp);
14936
14937 if (!osp->os_valid ||
14938 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
14939 return;
14940 }
14941
14942 /*
14943 * This removes the reference obtained at OPEN; i.e.,
14944 * when the open stream structure was created.
14945 *
14946 * We don't have to worry about calling 'open_stream_rele'
14947 * since we are currently holding a reference to this
14948 * open stream which means the count cannot go to 0 with
14949 * this decrement.
14950 */
14951 ASSERT(osp->os_ref_count >= 2);
14952 osp->os_ref_count--;
14953 osp->os_valid = 0;
14954 mutex_exit(&osp->os_sync_lock);
14955 *have_lockp = 0;
14956
14957 nfs4_dec_state_ref_count(mi);
14958 }
14959
14960 /*
14961 * Close all remaining open streams on the rnode. These open streams
14962 * could be here because:
14963 * - The close attempted at either close or delmap failed
14964 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14965 * - Someone did mknod on a regular file but never opened it
14966 */
14967 int
14968 nfs4close_all(vnode_t *vp, cred_t *cr)
14969 {
14970 nfs4_open_stream_t *osp;
14971 int error;
14972 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14973 rnode4_t *rp;
14974
14975 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14976
14977 error = 0;
14978 rp = VTOR4(vp);
14979
14980 /*
14981 * At this point, all we know is that the last time
14982 * someone called vn_rele, the count was 1. Since then,
14983 * the vnode could have been re-activated.
We want to 14984 * loop through the open streams and close each one, but 14985 * we have to be careful since once we release the rnode 14986 * hash bucket lock, someone else is free to come in and 14987 * re-activate the rnode and add new open streams. The 14988 * strategy is take the rnode hash bucket lock, verify that 14989 * the count is still 1, grab the open stream off the 14990 * head of the list and mark it invalid, then release the 14991 * rnode hash bucket lock and proceed with that open stream. 14992 * This is ok because nfs4close_one() will acquire the proper 14993 * open/create to close/destroy synchronization for open 14994 * streams, and will ensure that if someone has reopened 14995 * the open stream after we've dropped the hash bucket lock 14996 * then we'll just simply return without destroying the 14997 * open stream. 14998 * Repeat until the list is empty. 14999 */ 15000 15001 for (;;) { 15002 15003 /* make sure vnode hasn't been reactivated */ 15004 rw_enter(&rp->r_hashq->r_lock, RW_READER); 15005 mutex_enter(&vp->v_lock); 15006 if (vp->v_count > 1) { 15007 mutex_exit(&vp->v_lock); 15008 rw_exit(&rp->r_hashq->r_lock); 15009 break; 15010 } 15011 /* 15012 * Grabbing r_os_lock before releasing v_lock prevents 15013 * a window where the rnode/open stream could get 15014 * reactivated (and os_force_close set to 0) before we 15015 * had a chance to set os_force_close to 1. 15016 */ 15017 mutex_enter(&rp->r_os_lock); 15018 mutex_exit(&vp->v_lock); 15019 15020 osp = list_head(&rp->r_open_streams); 15021 if (!osp) { 15022 /* nothing left to CLOSE OTW, so return */ 15023 mutex_exit(&rp->r_os_lock); 15024 rw_exit(&rp->r_hashq->r_lock); 15025 break; 15026 } 15027 15028 mutex_enter(&rp->r_statev4_lock); 15029 /* the file can't still be mem mapped */ 15030 ASSERT(rp->r_mapcnt == 0); 15031 if (rp->created_v4) 15032 rp->created_v4 = 0; 15033 mutex_exit(&rp->r_statev4_lock); 15034 15035 /* 15036 * Grab a ref on this open stream; nfs4close_one 15037 * will mark it as invalid 15038 */ 15039 mutex_enter(&osp->os_sync_lock); 15040 osp->os_ref_count++; 15041 osp->os_force_close = 1; 15042 mutex_exit(&osp->os_sync_lock); 15043 mutex_exit(&rp->r_os_lock); 15044 rw_exit(&rp->r_hashq->r_lock); 15045 15046 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 15047 15048 /* Update error if it isn't already non-zero */ 15049 if (error == 0) { 15050 if (e.error) 15051 error = e.error; 15052 else if (e.stat) 15053 error = geterrno4(e.stat); 15054 } 15055 15056 #ifdef DEBUG 15057 nfs4close_all_cnt++; 15058 #endif 15059 /* Release the ref on osp acquired above. */ 15060 open_stream_rele(osp, rp); 15061 15062 /* Proceed to the next open stream, if any */ 15063 } 15064 return (error); 15065 } 15066 15067 /* 15068 * nfs4close_one - close one open stream for a file if needed. 15069 * 15070 * "close_type" indicates which close path this is: 15071 * CLOSE_NORM: close initiated via VOP_CLOSE. 15072 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15073 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15074 * the close and release of client state for this open stream 15075 * (unless someone else has the open stream open). 15076 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15077 * (e.g., due to abort because of a signal). 15078 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 15079 * 15080 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15081 * recovery. 
Instead, the caller is expected to deal with retries.
15082 *
15083 * The caller can either pass in the osp ('provided_osp') or not.
15084 *
15085 * 'access_bits' represents the access we are closing/downgrading.
15086 *
15087 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15088 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15089 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15090 *
15091 * Errors are returned via the nfs4_error_t.
15092 */
15093 void
15094 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15095 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15096 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15097 uint_t mmap_flags)
15098 {
15099 nfs4_open_owner_t *oop;
15100 nfs4_open_stream_t *osp = NULL;
15101 int retry = 0;
15102 int num_retries = NFS4_NUM_RECOV_RETRIES;
15103 rnode4_t *rp;
15104 mntinfo4_t *mi;
15105 nfs4_recov_state_t recov_state;
15106 cred_t *cred_otw = NULL;
15107 bool_t recovonly = FALSE;
15108 int isrecov;
15109 int force_close;
15110 int close_failed = 0;
15111 int did_dec_count = 0;
15112 int did_start_op = 0;
15113 int did_force_recovlock = 0;
15114 int did_start_seqid_sync = 0;
15115 int have_sync_lock = 0;
15116
15117 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15118
15119 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15120 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15121 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15122 len, maxprot, mmap_flags, access_bits));
15123
15124 nfs4_error_zinit(ep);
15125 rp = VTOR4(vp);
15126 mi = VTOMI4(vp);
15127 isrecov = (close_type == CLOSE_RESEND ||
15128 close_type == CLOSE_AFTER_RESEND);
15129
15130 /*
15131 * First, get the open owner.
15132 */
15133 if (!provided_osp) {
15134 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15135 } else {
15136 oop = provided_osp->os_open_owner;
15137 ASSERT(oop != NULL);
15138 open_owner_hold(oop);
15139 }
15140
15141 if (!oop) {
15142 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15143 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15144 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15145 (void *)provided_osp, close_type));
15146 ep->error = EIO;
15147 goto out;
15148 }
15149
15150 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15151 recov_retry:
15152 osp = NULL;
15153 close_failed = 0;
15154 force_close = (close_type == CLOSE_FORCE);
15155 retry = 0;
15156 did_start_op = 0;
15157 did_force_recovlock = 0;
15158 did_start_seqid_sync = 0;
15159 have_sync_lock = 0;
15160 recovonly = FALSE;
15161 recov_state.rs_flags = 0;
15162 recov_state.rs_num_retry_despite_err = 0;
15163
15164 /*
15165 * Second, synchronize with recovery.
15166 */
15167 if (!isrecov) {
15168 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15169 &recov_state, &recovonly);
15170 if (!ep->error) {
15171 did_start_op = 1;
15172 } else {
15173 close_failed = 1;
15174 /*
15175 * If we couldn't get start_fop, but have to
15176 * clean up state, then at least acquire the
15177 * mi_recovlock so we can synchronize with
15178 * recovery.
15179 */
15180 if (close_type == CLOSE_FORCE) {
15181 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15182 RW_READER, FALSE);
15183 did_force_recovlock = 1;
15184 } else
15185 goto out;
15186 }
15187 }
15188
15189 /*
15190 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15191 * set 'recovonly' to TRUE since most likely this is due to
15192 * recovery being active (MI4_RECOV_ACTIV).
If recovery is active,
15193 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15194 * to retry, causing us to loop until recovery finishes. Plus we
15195 * don't need protection over the open seqid since we're not going
15196 * OTW, hence don't need to use the seqid.
15197 */
15198 if (recovonly == FALSE) {
15199 /* need to grab the open owner sync before 'os_sync_lock' */
15200 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15201 if (ep->error == EAGAIN) {
15202 ASSERT(!isrecov);
15203 if (did_start_op)
15204 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15205 &recov_state, TRUE);
15206 if (did_force_recovlock)
15207 nfs_rw_exit(&mi->mi_recovlock);
15208 goto recov_retry;
15209 }
15210 did_start_seqid_sync = 1;
15211 }
15212
15213 /*
15214 * Third, get an open stream and acquire 'os_sync_lock' to
15215 * synchronize the opening/creating of an open stream with the
15216 * closing/destroying of an open stream.
15217 */
15218 if (!provided_osp) {
15219 /* returns with 'os_sync_lock' held */
15220 osp = find_open_stream(oop, rp);
15221 if (!osp) {
15222 ep->error = EIO;
15223 goto out;
15224 }
15225 } else {
15226 osp = provided_osp;
15227 open_stream_hold(osp);
15228 mutex_enter(&osp->os_sync_lock);
15229 }
15230 have_sync_lock = 1;
15231
15232 ASSERT(oop == osp->os_open_owner);
15233
15234 /*
15235 * Fourth, do any special pre-OTW CLOSE processing
15236 * based on the specific close type.
15237 */
15238 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15239 !did_dec_count) {
15240 ASSERT(osp->os_open_ref_count > 0);
15241 osp->os_open_ref_count--;
15242 did_dec_count = 1;
15243 if (osp->os_open_ref_count == 0)
15244 osp->os_final_close = 1;
15245 }
15246
15247 if (close_type == CLOSE_FORCE) {
15248 /* see if somebody reopened the open stream. */
15249 if (!osp->os_force_close) {
15250 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15251 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15252 "was reopened, vp %p", (void *)osp, (void *)vp));
15253 ep->error = 0;
15254 ep->stat = NFS4_OK;
15255 goto out;
15256 }
15257
15258 if (!osp->os_final_close && !did_dec_count) {
15259 osp->os_open_ref_count--;
15260 did_dec_count = 1;
15261 }
15262
15263 /*
15264 * We can't depend on os_open_ref_count being 0 due to the
15265 * way executables are opened (VN_RELE to match a VOP_OPEN).
15266 */
15267 #ifdef NOTYET
15268 ASSERT(osp->os_open_ref_count == 0);
15269 #endif
15270 if (osp->os_open_ref_count != 0) {
15271 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15272 "nfs4close_one: should panic here on an "
15273 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15274 "since this is probably the exec problem."));
15275
15276 osp->os_open_ref_count = 0;
15277 }
15278
15279 /*
15280 * There is the possibility that nfs4close_one()
15281 * for close_type == CLOSE_DELMAP couldn't find the
15282 * open stream, thus couldn't decrement its os_mapcnt;
15283 * therefore we can't use this ASSERT yet.
15284 */ 15285 #ifdef NOTYET 15286 ASSERT(osp->os_mapcnt == 0); 15287 #endif 15288 osp->os_mapcnt = 0; 15289 } 15290 15291 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15292 ASSERT(osp->os_mapcnt >= btopr(len)); 15293 15294 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15295 osp->os_mmap_write -= btopr(len); 15296 if (maxprot & PROT_READ) 15297 osp->os_mmap_read -= btopr(len); 15298 if (maxprot & PROT_EXEC) 15299 osp->os_mmap_read -= btopr(len); 15300 /* mirror the PROT_NONE check in nfs4_addmap() */ 15301 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15302 !(maxprot & PROT_EXEC)) 15303 osp->os_mmap_read -= btopr(len); 15304 osp->os_mapcnt -= btopr(len); 15305 did_dec_count = 1; 15306 } 15307 15308 if (recovonly) { 15309 nfs4_lost_rqst_t lost_rqst; 15310 15311 /* request should not already be in recovery queue */ 15312 ASSERT(lrp == NULL); 15313 nfs4_error_init(ep, EINTR); 15314 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15315 osp, cred_otw, vp); 15316 mutex_exit(&osp->os_sync_lock); 15317 have_sync_lock = 0; 15318 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15319 lost_rqst.lr_op == OP_CLOSE ? 15320 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL); 15321 close_failed = 1; 15322 force_close = 0; 15323 goto close_cleanup; 15324 } 15325 15326 /* 15327 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15328 * we stopped operating on the open owner's <old oo_name, old seqid> 15329 * space, which means we stopped operating on the open stream 15330 * too. So don't go OTW (as the seqid is likely bad, and the 15331 * stateid could be stale, potentially triggering a false 15332 * setclientid), and just clean up the client's internal state. 15333 */ 15334 if (osp->os_orig_oo_name != oop->oo_name) { 15335 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15336 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15337 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15338 "oo_name %" PRIx64")", 15339 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15340 oop->oo_name)); 15341 close_failed = 1; 15342 } 15343 15344 /* If the file failed recovery, just quit. */ 15345 mutex_enter(&rp->r_statelock); 15346 if (rp->r_flags & R4RECOVERR) { 15347 close_failed = 1; 15348 } 15349 mutex_exit(&rp->r_statelock); 15350 15351 /* 15352 * If the force close path failed to obtain start_fop 15353 * then skip the OTW close and just remove the state. 15354 */ 15355 if (close_failed) 15356 goto close_cleanup; 15357 15358 /* 15359 * Fifth, check to see if there are still mapped pages or other 15360 * opens using this open stream. If there are then we can't 15361 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15362 */ 15363 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15364 nfs4_lost_rqst_t new_lost_rqst; 15365 bool_t needrecov = FALSE; 15366 cred_t *odg_cred_otw = NULL; 15367 seqid4 open_dg_seqid = 0; 15368 15369 if (osp->os_delegation) { 15370 /* 15371 * If this open stream was never OPENed OTW then we 15372 * surely can't DOWNGRADE it (especially since the 15373 * osp->open_stateid is really a delegation stateid 15374 * when os_delegation is 1). 
15375 */
15376 if (access_bits & FREAD)
15377 osp->os_share_acc_read--;
15378 if (access_bits & FWRITE)
15379 osp->os_share_acc_write--;
15380 osp->os_share_deny_none--;
15381 nfs4_error_zinit(ep);
15382 goto out;
15383 }
15384 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15385 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15386 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15387 if (needrecov && !isrecov) {
15388 bool_t abort;
15389 nfs4_bseqid_entry_t *bsep = NULL;
15390
15391 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15392 bsep = nfs4_create_bseqid_entry(oop, NULL,
15393 vp, 0,
15394 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15395 open_dg_seqid);
15396
15397 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15398 oop, osp, odg_cred_otw, vp, access_bits, 0);
15399 mutex_exit(&osp->os_sync_lock);
15400 have_sync_lock = 0;
15401 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15402 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15403 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15404 bsep, NULL, NULL);
15405 if (odg_cred_otw)
15406 crfree(odg_cred_otw);
15407 if (bsep)
15408 kmem_free(bsep, sizeof (*bsep));
15409
15410 if (abort == TRUE)
15411 goto out;
15412
15413 if (did_start_seqid_sync) {
15414 nfs4_end_open_seqid_sync(oop);
15415 did_start_seqid_sync = 0;
15416 }
15417 open_stream_rele(osp, rp);
15418
15419 if (did_start_op)
15420 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15421 &recov_state, FALSE);
15422 if (did_force_recovlock)
15423 nfs_rw_exit(&mi->mi_recovlock);
15424
15425 goto recov_retry;
15426 } else {
15427 if (odg_cred_otw)
15428 crfree(odg_cred_otw);
15429 }
15430 goto out;
15431 }
15432
15433 /*
15434 * If this open stream was created as the result of an open
15435 * while holding a delegation, then just release it; no need
15436 * to do an OTW close. Otherwise do a "normal" OTW close.
15437 */
15438 if (osp->os_delegation) {
15439 nfs4close_notw(vp, osp, &have_sync_lock);
15440 nfs4_error_zinit(ep);
15441 goto out;
15442 }
15443
15444 /*
15445 * If this stream is not valid, we're done.
15446 */
15447 if (!osp->os_valid) {
15448 nfs4_error_zinit(ep);
15449 goto out;
15450 }
15451
15452 /*
15453 * Last open or mmap ref has vanished, need to do an OTW close.
15454 * First check to see if a close is still necessary.
15455 */
15456 if (osp->os_failed_reopen) {
15457 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15458 "don't close OTW osp %p since reopen failed.",
15459 (void *)osp));
15460 /*
15461 * Reopen of the open stream failed, hence the
15462 * stateid of the open stream is invalid/stale, and
15463 * sending this OTW would incorrectly cause another
15464 * round of recovery. In this case, we need to set
15465 * the 'os_valid' bit to 0 so another thread doesn't
15466 * come in and re-open this open stream before
15467 * this "closing" thread cleans up state (decrementing
15468 * the nfs4_server_t's state_ref_count and decrementing
15469 * the os_ref_count).
15470 */
15471 osp->os_valid = 0;
15472 /*
15473 * This removes the reference obtained at OPEN; i.e.,
15474 * when the open stream structure was created.
15475 *
15476 * We don't have to worry about calling 'open_stream_rele'
15477 * since we are currently holding a reference to this
15478 * open stream which means the count cannot go to 0 with
15479 * this decrement.
15480 */ 15481 ASSERT(osp->os_ref_count >= 2); 15482 osp->os_ref_count--; 15483 nfs4_error_zinit(ep); 15484 close_failed = 0; 15485 goto close_cleanup; 15486 } 15487 15488 ASSERT(osp->os_ref_count > 1); 15489 15490 /* 15491 * Sixth, try the CLOSE OTW. 15492 */ 15493 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15494 close_type, ep, &have_sync_lock); 15495 15496 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15497 /* 15498 * Let the recovery thread be responsible for 15499 * removing the state for CLOSE. 15500 */ 15501 close_failed = 1; 15502 force_close = 0; 15503 retry = 0; 15504 } 15505 15506 /* See if we need to retry with a different cred */ 15507 if ((ep->error == EACCES || 15508 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15509 cred_otw != cr) { 15510 crfree(cred_otw); 15511 cred_otw = cr; 15512 crhold(cred_otw); 15513 retry = 1; 15514 } 15515 15516 if (ep->error || ep->stat) 15517 close_failed = 1; 15518 15519 if (retry && !isrecov && num_retries-- > 0) { 15520 if (have_sync_lock) { 15521 mutex_exit(&osp->os_sync_lock); 15522 have_sync_lock = 0; 15523 } 15524 if (did_start_seqid_sync) { 15525 nfs4_end_open_seqid_sync(oop); 15526 did_start_seqid_sync = 0; 15527 } 15528 open_stream_rele(osp, rp); 15529 15530 if (did_start_op) 15531 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15532 &recov_state, FALSE); 15533 if (did_force_recovlock) 15534 nfs_rw_exit(&mi->mi_recovlock); 15535 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15536 "nfs4close_one: need to retry the close " 15537 "operation")); 15538 goto recov_retry; 15539 } 15540 close_cleanup: 15541 /* 15542 * Seventh and lastly, process our results. 15543 */ 15544 if (close_failed && force_close) { 15545 /* 15546 * It's ok to drop and regrab the 'os_sync_lock' since 15547 * nfs4close_notw() will recheck to make sure the 15548 * "close"/removal of state should happen. 15549 */ 15550 if (!have_sync_lock) { 15551 mutex_enter(&osp->os_sync_lock); 15552 have_sync_lock = 1; 15553 } 15554 /* 15555 * This is last call, remove the ref on the open 15556 * stream created by open and clean everything up. 15557 */ 15558 osp->os_pending_close = 0; 15559 nfs4close_notw(vp, osp, &have_sync_lock); 15560 nfs4_error_zinit(ep); 15561 } 15562 15563 if (!close_failed) { 15564 if (have_sync_lock) { 15565 osp->os_pending_close = 0; 15566 mutex_exit(&osp->os_sync_lock); 15567 have_sync_lock = 0; 15568 } else { 15569 mutex_enter(&osp->os_sync_lock); 15570 osp->os_pending_close = 0; 15571 mutex_exit(&osp->os_sync_lock); 15572 } 15573 if (did_start_op && recov_state.rs_sp != NULL) { 15574 mutex_enter(&recov_state.rs_sp->s_lock); 15575 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15576 mutex_exit(&recov_state.rs_sp->s_lock); 15577 } else { 15578 nfs4_dec_state_ref_count(mi); 15579 } 15580 nfs4_error_zinit(ep); 15581 } 15582 15583 out: 15584 if (have_sync_lock) 15585 mutex_exit(&osp->os_sync_lock); 15586 if (did_start_op) 15587 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15588 recovonly ? TRUE : FALSE); 15589 if (did_force_recovlock) 15590 nfs_rw_exit(&mi->mi_recovlock); 15591 if (cred_otw) 15592 crfree(cred_otw); 15593 if (osp) 15594 open_stream_rele(osp, rp); 15595 if (oop) { 15596 if (did_start_seqid_sync) 15597 nfs4_end_open_seqid_sync(oop); 15598 open_owner_rele(oop); 15599 } 15600 } 15601 15602 /* 15603 * Convert information returned by the server in the LOCK4denied 15604 * structure to the form required by fcntl. 
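 *
 * Example (illustrative values): a LOCK4denied of { offset 0,
 * length 100, locktype WRITE_LT } becomes { l_type = F_WRLCK,
 * l_whence = 0, l_start = 0, l_len = 100 }. l_pid is taken from the
 * lock owner when the denied clientid matches ours, and is
 * fabricated via lo_to_pid() (with a bumped l_sysid) otherwise.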
/*
 * Convert information returned by the server in the LOCK4denied
 * structure to the form required by fcntl.
 */
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
	nfs4_lo_name_t *lo;

#ifdef DEBUG
	if (denied_to_flk_debug) {
		lockt_denied_debug = lockt_denied;
		debug_enter("lockt_denied");
	}
#endif

	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
	flk->l_whence = 0;	/* aka SEEK_SET */
	flk->l_start = lockt_denied->offset;
	flk->l_len = lockt_denied->length;

	/*
	 * If the blocking clientid matches our client id, then we can
	 * interpret the lockowner (since we built it).  If not, then
	 * fabricate a sysid and pid.  Note that the l_sysid field
	 * in *flk already has the local sysid.
	 */

	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {

		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
			lo = (nfs4_lo_name_t *)
			    lockt_denied->owner.owner_val;

			flk->l_pid = lo->ln_pid;
		} else {
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "denied_to_flk: bad lock owner length\n"));

			flk->l_pid = lo_to_pid(&lockt_denied->owner);
		}
	} else {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "denied_to_flk: foreign clientid\n"));

		/*
		 * Construct a new sysid which should be different from
		 * sysids of other systems.
		 */

		flk->l_sysid++;
		flk->l_pid = lo_to_pid(&lockt_denied->owner);
	}
}

/*
 * Fabricate a pid for a foreign lock owner by summing the bytes of its
 * clientid and owner value.
 */
static pid_t
lo_to_pid(lock_owner4 *lop)
{
	pid_t pid = 0;
	uchar_t *cp;
	int i;

	cp = (uchar_t *)&lop->clientid;

	for (i = 0; i < sizeof (lop->clientid); i++)
		pid += (pid_t)*cp++;

	cp = (uchar_t *)lop->owner_val;

	for (i = 0; i < lop->owner_len; i++)
		pid += (pid_t)*cp++;

	return (pid);
}

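/*
 * Illustration only (a hedged userland sketch, not part of this file's
 * build): the flock64 filled in by denied_to_flk() is ultimately what an
 * application sees when fcntl(F_GETLK) reports a conflicting lock.  The
 * file descriptor and the probe values below are made up for the example.
 *
 *	struct flock fl;
 *
 *	fl.l_type = F_WRLCK;		would a write lock block?
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;			check the whole file
 *	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK)
 *		(void) printf("blocked by sysid %ld pid %ld\n",
 *		    (long)fl.l_sysid, (long)fl.l_pid);
 *
 * For a conflict held by a foreign client, l_pid is the value fabricated
 * by lo_to_pid() and l_sysid is the incremented local sysid, so they
 * identify the blocking owner only approximately.
 */
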
/*
 * Given a lock pointer, returns the last locked offset ("end") that the
 * lock covers, computed from "l_start" and "l_len".
 */
static off64_t
lock_to_end(flock64_t *lock)
{
	off64_t lock_end;

	if (lock->l_len == 0)
		lock_end = (off64_t)MAXEND;
	else
		lock_end = lock->l_start + lock->l_len - 1;

	return (lock_end);
}

/*
 * Given the start and end of a lock, returns the length "l_len" for
 * that lock.
 */
static off64_t
end_to_len(off64_t start, off64_t end)
{
	off64_t lock_len;

	ASSERT(end >= start);
	if (end == MAXEND)
		lock_len = 0;
	else
		lock_len = end - start + 1;

	return (lock_len);
}

/*
 * Given the end of a lock, determine whether it is the last possible
 * locked offset (MAXEND); if so, return it unchanged, otherwise add one
 * to yield the next valid start offset.
 */
static off64_t
start_check(off64_t x)
{
	if (x == MAXEND)
		return (x);
	else
		return (x + 1);
}

/*
 * See if these two locks overlap, and if so return 1;
 * otherwise, return 0.
 */
static int
locks_intersect(flock64_t *llfp, flock64_t *curfp)
{
	off64_t llfp_end, curfp_end;

	llfp_end = lock_to_end(llfp);
	curfp_end = lock_to_end(curfp);

	if (((llfp_end >= curfp->l_start) &&
	    (llfp->l_start <= curfp->l_start)) ||
	    ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
		return (1);
	return (0);
}

/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and insert
	 * it into the new list in increasing l_start order.  This list
	 * contains intersections of locks registered by the client with the
	 * local host and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}

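/*
 * Worked example for the range helpers above (values made up for
 * illustration): take a lost lock [l_start = 100, l_len = 100] and a
 * local lock [l_start = 150, l_len = 150].  Then:
 *
 *	lock_to_end(lost)  = 100 + 100 - 1 = 199
 *	lock_to_end(local) = 150 + 150 - 1 = 299
 *	locks_intersect()  = 1, since 199 >= 150 and 100 <= 150
 *	intersection start = 150 (the larger of the two l_start values)
 *	intersection len   = end_to_len(150, 199) = 199 - 150 + 1 = 50
 *
 * so nfs4_add_lock_range() inserts a lock [150, len 50] into the
 * caller's list, keeping it sorted by l_start.  A lock with l_len == 0
 * extends to MAXEND and is treated as ending at the maximum possible
 * offset throughout.
 */
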
/*
 * Our current local locking state is potentially different than
 * what the NFSv4 server thinks we have due to a lost lock that was
 * resent and then received.  We need to reset our "NFSv4" locking
 * state to match the current local locking state for this pid since
 * that is what the user/application sees as what the world is.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process.  This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Find active locks for this vp from the local locking code.
	 * Scan through this list and find the locks that intersect with
	 * the lost lock.  Once we find a lock that intersects, add the
	 * intersection area as a new lock to a new list "ri_llp".  The lock
	 * type of the intersection region lock added to ri_llp is the same
	 * as that found in the active lock list, "locks".  The intersecting
	 * region locks are added to ri_llp in increasing l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Now we have the list of intersections with the lost lock.  These
	 * are the locks that were/are active before the server replied to
	 * the last/lost lock.  Issue these locks to the server here.
	 * Playing these locks to the server will re-establish our current
	 * local locking state with the v4 server.
	 * If we get an error, send SIGLOST to the application for that lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}

	/*
	 * Now, keeping the start of the lost lock as our reference, parse
	 * the newly created ri_llp locklist to find the ranges that we have
	 * locked with the v4 server but that are not in the current local
	 * locking state.  We need to unlock these ranges.
	 * These ranges can also be referred to as those ranges where the
	 * lost lock does not overlap with the locks in ri_llp but that are
	 * locked since the server replied to the lost lock.
	 */

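	/*
	 * Worked example for the unlock pass below (values made up for
	 * illustration): suppose the lost lock is [l_start = 0,
	 * l_len = 100] (end 99) and ri_llp holds the intersections
	 * [10, len 10] and [40, len 10]:
	 *
	 *	cur_start = 0
	 *	llp [10 - 19]: 10 > 0  -> unlock [0 - 9],   cur_start = 20
	 *	llp [40 - 49]: 40 > 20 -> unlock [20 - 39], cur_start = 50
	 *	after the loop: 50 != start_check(99) = 100
	 *			       -> unlock [50 - 99]
	 *
	 * which removes exactly the ranges the server believes are locked
	 * but the local locking code does not.
	 */
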
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting locks,
	 * unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it a to-EOF lock?  If so, unlock till the end.
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return to the original caller, nfs4frlock(), and let our
	 * seqid syncs drop naturally.
	 */
}

/*
 * Create a lost state record for the given lock reinstatement request
 * and push it onto the lost state queue.
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	nfs4_lost_rqst_t req;
	nfs_lock_type4 locktype;
	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	locktype = flk_to_locktype(cmd, flk->l_type);
	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
	    NULL, NULL, lop, flk, &req, cr, vp);
	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
	    NULL, NULL, NULL);
}
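
/*
 * For example (illustrative values, summarizing the flow above):
 * reinstating a read lock arrives here with flk->l_type == F_RDLCK, so
 * flk_to_locktype(F_SETLK, F_RDLCK) yields READ_LT and the request is
 * queued as an OP_LOCK lost request; an F_UNLCK range is queued as
 * OP_LOCKU instead.  In both cases nfs4_start_recovery() arranges for
 * the recovery thread to replay the request against the server rather
 * than issuing it over the wire here.
 */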