1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 /*
30 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
31 * All Rights Reserved
32 */
33
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/vfs_opreg.h>
42 #include <sys/file.h>
43 #include <sys/filio.h>
44 #include <sys/uio.h>
45 #include <sys/buf.h>
46 #include <sys/mman.h>
47 #include <sys/pathname.h>
48 #include <sys/dirent.h>
49 #include <sys/debug.h>
50 #include <sys/vmsystm.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/swap.h>
54 #include <sys/errno.h>
55 #include <sys/strsubr.h>
56 #include <sys/sysmacros.h>
57 #include <sys/kmem.h>
58 #include <sys/cmn_err.h>
59 #include <sys/pathconf.h>
60 #include <sys/utsname.h>
61 #include <sys/dnlc.h>
62 #include <sys/acl.h>
63 #include <sys/systeminfo.h>
64 #include <sys/policy.h>
65 #include <sys/sdt.h>
66 #include <sys/list.h>
67 #include <sys/stat.h>
68 #include <sys/zone.h>
69
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82
83 #include <vm/hat.h>
84 #include <vm/as.h>
85 #include <vm/page.h>
86 #include <vm/pvn.h>
87 #include <vm/seg.h>
88 #include <vm/seg_map.h>
89 #include <vm/seg_kpm.h>
90 #include <vm/seg_vn.h>
91
92 #include <fs/fs_subr.h>
93
94 #include <sys/ddi.h>
95 #include <sys/int_fmtio.h>
96 #include <sys/fs/autofs.h>
97
98 typedef struct {
99 nfs4_ga_res_t *di_garp;
100 cred_t *di_cred;
101 hrtime_t di_time_call;
102 } dirattr_info_t;
103
/*
 * Direction of an ACL operation; consumed by nfs4_is_acl_mask_valid().
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET,	/* ACL is being fetched */
	NFS4_ACL_SET	/* ACL is being stored */
} nfs4_acl_op_t;
108
109 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);
110
111 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
112 char *, dirattr_info_t *);
113
114 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
115 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
116 nfs4_error_t *, int *);
117 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
118 cred_t *);
119 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
120 stable_how4 *);
121 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
122 cred_t *, bool_t, struct uio *);
123 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
124 vsecattr_t *);
125 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
126 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
127 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
128 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
129 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
130 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
131 int, vnode_t **, cred_t *);
132 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
133 cred_t *, int, int, enum createmode4, int);
134 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
135 caller_context_t *);
136 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
137 vnode_t *, char *, cred_t *, nfsstat4 *);
138 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
139 vnode_t *, char *, cred_t *, nfsstat4 *);
140 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
141 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
142 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
143 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
144 page_t *[], size_t, struct seg *, caddr_t,
145 enum seg_rw, cred_t *);
146 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
147 cred_t *);
148 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
149 int, cred_t *);
150 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
151 int, cred_t *);
152 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
153 static void nfs4_set_mod(vnode_t *);
154 static void nfs4_get_commit(vnode_t *);
155 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
156 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
157 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
158 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
159 cred_t *);
160 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
161 cred_t *);
162 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
163 hrtime_t, vnode_t *, cred_t *);
164 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
165 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
166 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
167 u_offset_t);
168 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
169 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
170 static cred_t *state_to_cred(nfs4_open_stream_t *);
171 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
172 static pid_t lo_to_pid(lock_owner4 *);
173 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
174 cred_t *, nfs4_lock_owner_t *);
175 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
176 nfs4_lock_owner_t *);
177 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
178 static void nfs4_delmap_callback(struct as *, void *, uint_t);
179 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
180 static nfs4_delmapcall_t *nfs4_init_delmapcall();
181 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
182 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
183 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
184 uid_t, gid_t, int);
185
186 /*
187 * Routines that implement the setting of v4 args for the misc. ops
188 */
189 static void nfs4args_lock_free(nfs_argop4 *);
190 static void nfs4args_lockt_free(nfs_argop4 *);
191 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
192 int, rnode4_t *, cred_t *, bitmap4, int *,
193 nfs4_stateid_types_t *);
194 static void nfs4args_setattr_free(nfs_argop4 *);
195 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
196 bitmap4);
197 static void nfs4args_verify_free(nfs_argop4 *);
198 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
199 WRITE4args **, nfs4_stateid_types_t *);
200
201 /*
202 * These are the vnode ops functions that implement the vnode interface to
203 * the networked file system. See more comments below at nfs4_vnodeops.
204 */
205 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
206 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
207 caller_context_t *);
208 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
209 caller_context_t *);
210 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
211 caller_context_t *);
212 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
213 caller_context_t *);
214 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
215 caller_context_t *);
216 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
217 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
218 caller_context_t *);
219 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
220 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
221 int, vnode_t **, cred_t *, int, caller_context_t *,
222 vsecattr_t *);
223 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
224 int);
225 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
226 caller_context_t *, int);
227 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
228 caller_context_t *, int);
229 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
230 cred_t *, caller_context_t *, int, vsecattr_t *);
231 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
232 caller_context_t *, int);
233 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
234 cred_t *, caller_context_t *, int);
235 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
236 caller_context_t *, int);
237 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
238 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
239 page_t *[], size_t, struct seg *, caddr_t,
240 enum seg_rw, cred_t *, caller_context_t *);
241 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
242 caller_context_t *);
243 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
244 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
245 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
246 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
247 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
248 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
249 struct flk_callback *, cred_t *, caller_context_t *);
250 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
251 cred_t *, caller_context_t *);
252 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
253 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
254 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
255 cred_t *, caller_context_t *);
256 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
257 caller_context_t *);
258 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
259 caller_context_t *);
260 /*
261 * These vnode ops are required to be called from outside this source file,
262 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
263 * as static.
264 */
265 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
266 caller_context_t *);
267 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
268 int nfs4_lookup(vnode_t *, char *, vnode_t **,
269 struct pathname *, int, vnode_t *, cred_t *,
270 caller_context_t *, int *, pathname_t *);
271 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
272 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
273 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
274 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
275 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
276 caller_context_t *);
277 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
278 caller_context_t *);
279 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
280 caller_context_t *);
281
282 /*
283 * Used for nfs4_commit_vp() to indicate if we should
284 * wait on pending writes.
285 */
#define	NFS4_WRITE_NOWAIT	0
#define	NFS4_WRITE_WAIT		1

#define	NFS4_BASE_WAIT_TIME	1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 *
 * The negative values are parenthesized so that the macros expand to a
 * single primary expression wherever they are used.
 */
#define	NFS_EOF			(-98)
#define	NFS_VERF_MISMATCH	(-97)

/*
 * Flags used to differentiate between which operation drove the
 * potential CLOSE OTW.  (see nfs4_close_otw_if_necessary)
 *
 * NOTE(review): the three values are not independent bits
 * (0x3 == 0x1 | 0x2), so they are presumably compared for equality
 * rather than tested with a mask -- confirm at the users.
 */
#define	NFS4_CLOSE_OP		0x1
#define	NFS4_DELMAP_OP		0x2
#define	NFS4_INACTIVE_OP	0x3
305
/*
 * True when vnode type t is a block device, character device or FIFO.
 * The argument is parenthesized at each use so that an expression such
 * as ISVDEV(x & mask) is compared as a whole.
 */
#define	ISVDEV(t) (((t) == VBLK) || ((t) == VCHR) || ((t) == VFIFO))
307
/*
 * ALIGN64 advances ptr to the next 64-bit boundary and reduces sz by the
 * number of bytes skipped.  x is scratch: on exit it holds the number of
 * bytes skipped (0 when ptr was already aligned).  ptr and sz must be
 * modifiable lvalues.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and is safe under an unbraced if/else.  Invoke it with a
 * trailing semicolon.
 */
#define	ALIGN64(x, ptr, sz)						\
	do {								\
		x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
		if (x) {						\
			x = sizeof (uint64_t) - (x);			\
			sz -= (x);					\
			ptr += (x);					\
		}							\
	} while (0)
316
#ifdef DEBUG
/*
 * Debug switches and counters; only present in DEBUG builds.  The
 * non-static ones have external linkage and may be referenced from other
 * files.
 */
int		nfs4_client_attr_debug = 0;
int		nfs4_client_state_debug = 0;
int		nfs4_client_shadow_debug = 0;
int		nfs4_client_lock_debug = 0;
int		nfs4_seqid_sync = 0;
int		nfs4_client_map_debug = 0;
static int	nfs4_pageio_debug = 0;
int		nfs4_client_inactive_debug = 0;
int		nfs4_client_recov_debug = 0;
int		nfs4_client_failover_debug = 0;
int		nfs4_client_call_debug = 0;
int		nfs4_client_lookup_debug = 0;
int		nfs4_client_zone_debug = 0;
int		nfs4_lost_rqst_debug = 0;
int		nfs4_rdattrerr_debug = 0;
int		nfs4_open_stream_debug = 0;

int		nfs4read_error_inject;

static int	nfs4_create_misses = 0;

static int	nfs4_readdir_cache_shorts = 0;
static int	nfs4_readdir_readahead = 0;

static int	nfs4_bio_do_stop = 0;

/* number of times we lost original page */
static int	nfs4_lostpage = 0;

int		nfs4_mmap_debug = 0;

static int	nfs4_pathconf_cache_hits = 0;
static int	nfs4_pathconf_cache_misses = 0;

int		nfs4close_all_cnt;
int		nfs4close_one_debug = 0;
int		nfs4close_notw_debug = 0;

int		denied_to_flk_debug = 0;
void		*lockt_denied_debug;

#endif	/* DEBUG */
359
360 /*
361 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
362 * or NFS4ERR_RESOURCE.
363 */
static int	confirm_retry_sec = 30;

/* NOTE(review): presumably enables negative name-cache entries -- confirm */
static int	nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int	nfs4_nra = 4;

/* NOTE(review): presumably enables caching of symlink contents -- confirm */
static int	nfs4_do_symlink_cache = 1;

/* NOTE(review): presumably non-zero bypasses the pathconf cache -- confirm */
static int	nfs4_pathconf_disable_cache = 0;
377
378 /*
379 * These are the vnode ops routines which implement the vnode interface to
380 * the networked file system. These routines just take their parameters,
381 * make them look networkish by putting the right info into interface structs,
382 * and then calling the appropriate remote routine(s) to do the work.
383 *
384 * Note on directory name lookup cacheing: If we detect a stale fhandle,
385 * we purge the directory cache relative to that vnode. This way, the
386 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
387 * more details on rnode locking.
388 */
389
390 struct vnodeops *nfs4_vnodeops;
391
392 const fs_operation_def_t nfs4_vnodeops_template[] = {
393 VOPNAME_OPEN, { .vop_open = nfs4_open },
394 VOPNAME_CLOSE, { .vop_close = nfs4_close },
395 VOPNAME_READ, { .vop_read = nfs4_read },
396 VOPNAME_WRITE, { .vop_write = nfs4_write },
397 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl },
398 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr },
399 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr },
400 VOPNAME_ACCESS, { .vop_access = nfs4_access },
401 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup },
402 VOPNAME_CREATE, { .vop_create = nfs4_create },
403 VOPNAME_REMOVE, { .vop_remove = nfs4_remove },
404 VOPNAME_LINK, { .vop_link = nfs4_link },
405 VOPNAME_RENAME, { .vop_rename = nfs4_rename },
406 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir },
407 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir },
408 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir },
409 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink },
410 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink },
411 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync },
412 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive },
413 VOPNAME_FID, { .vop_fid = nfs4_fid },
414 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock },
415 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock },
416 VOPNAME_SEEK, { .vop_seek = nfs4_seek },
417 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock },
418 VOPNAME_SPACE, { .vop_space = nfs4_space },
419 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp },
420 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage },
421 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage },
422 VOPNAME_MAP, { .vop_map = nfs4_map },
423 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap },
424 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap },
425 /* no separate nfs4_dump */
426 VOPNAME_DUMP, { .vop_dump = nfs_dump },
427 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf },
428 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio },
429 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose },
430 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr },
431 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr },
432 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock },
433 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
434 NULL, NULL
435 };
436
437 /*
438 * The following are subroutines and definitions to set args or get res
439 * for the different nfsv4 ops
440 */
441
442 void
443 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
444 {
445 int i;
446
447 for (i = 0; i < arglen; i++) {
448 if (argop[i].argop == OP_LOOKUP) {
449 kmem_free(
450 argop[i].nfs_argop4_u.oplookup.
451 objname.utf8string_val,
452 argop[i].nfs_argop4_u.oplookup.
453 objname.utf8string_len);
454 }
455 }
456 }
457
458 static void
459 nfs4args_lock_free(nfs_argop4 *argop)
460 {
461 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
462
463 if (locker->new_lock_owner == TRUE) {
464 open_to_lock_owner4 *open_owner;
465
466 open_owner = &locker->locker4_u.open_owner;
467 if (open_owner->lock_owner.owner_val != NULL) {
468 kmem_free(open_owner->lock_owner.owner_val,
469 open_owner->lock_owner.owner_len);
470 }
471 }
472 }
473
474 static void
475 nfs4args_lockt_free(nfs_argop4 *argop)
476 {
477 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
478
479 if (lowner->owner_val != NULL) {
480 kmem_free(lowner->owner_val, lowner->owner_len);
481 }
482 }
483
484 static void
485 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
486 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
487 nfs4_stateid_types_t *sid_types)
488 {
489 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
490 mntinfo4_t *mi;
491
492 argop->argop = OP_SETATTR;
493 /*
494 * The stateid is set to 0 if client is not modifying the size
495 * and otherwise to whatever nfs4_get_stateid() returns.
496 *
497 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
498 * state struct could be found for the process/file pair. We may
499 * want to change this in the future (by OPENing the file). See
500 * bug # 4474852.
501 */
502 if (vap->va_mask & AT_SIZE) {
503
504 ASSERT(rp != NULL);
505 mi = VTOMI4(RTOV4(rp));
506
507 argop->nfs_argop4_u.opsetattr.stateid =
508 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
509 OP_SETATTR, sid_types, FALSE);
510 } else {
511 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
512 sizeof (stateid4));
513 }
514
515 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
516 if (*error)
517 bzero(attr, sizeof (*attr));
518 }
519
520 static void
521 nfs4args_setattr_free(nfs_argop4 *argop)
522 {
523 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
524 }
525
526 static int
527 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
528 bitmap4 supp)
529 {
530 fattr4 *attr;
531 int error = 0;
532
533 argop->argop = op;
534 switch (op) {
535 case OP_VERIFY:
536 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
537 break;
538 case OP_NVERIFY:
539 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
540 break;
541 default:
542 return (EINVAL);
543 }
544 if (!error)
545 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
546 if (error)
547 bzero(attr, sizeof (*attr));
548 return (error);
549 }
550
551 static void
552 nfs4args_verify_free(nfs_argop4 *argop)
553 {
554 switch (argop->argop) {
555 case OP_VERIFY:
556 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
557 break;
558 case OP_NVERIFY:
559 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
560 break;
561 default:
562 break;
563 }
564 }
565
566 static void
567 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
568 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
569 {
570 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
571 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
572
573 argop->argop = OP_WRITE;
574 wargs->stable = stable;
575 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
576 mi, OP_WRITE, sid_tp);
577 wargs->mblk = NULL;
578 *wargs_pp = wargs;
579 }
580
581 void
582 nfs4args_copen_free(OPEN4cargs *open_args)
583 {
584 if (open_args->owner.owner_val) {
585 kmem_free(open_args->owner.owner_val,
586 open_args->owner.owner_len);
587 }
588 if ((open_args->opentype == OPEN4_CREATE) &&
589 (open_args->mode != EXCLUSIVE4)) {
590 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
591 }
592 }
593
594 /*
595 * XXX: This is referenced in modstubs.s
596 */
597 struct vnodeops *
598 nfs4_getvnodeops(void)
599 {
600 return (nfs4_vnodeops);
601 }
602
603 /*
604 * The OPEN operation opens a regular file.
605 */
606 /*ARGSUSED3*/
607 static int
608 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
609 {
610 vnode_t *dvp = NULL;
611 rnode4_t *rp, *drp;
612 int error;
613 int just_been_created;
614 char fn[MAXNAMELEN];
615
616 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
617 if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
618 return (EIO);
619 rp = VTOR4(*vpp);
620
621 /*
622 * Check to see if opening something besides a regular file;
623 * if so skip the OTW call
624 */
625 if ((*vpp)->v_type != VREG) {
626 error = nfs4_open_non_reg_file(vpp, flag, cr);
627 return (error);
628 }
629
630 /*
631 * XXX - would like a check right here to know if the file is
632 * executable or not, so as to skip OTW
633 */
634
635 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
636 return (error);
637
638 drp = VTOR4(dvp);
639 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
640 return (EINTR);
641
642 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
643 nfs_rw_exit(&drp->r_rwlock);
644 return (error);
645 }
646
647 /*
648 * See if this file has just been CREATEd.
649 * If so, clear the flag and update the dnlc, which was previously
650 * skipped in nfs4_create.
651 * XXX need better serilization on this.
652 * XXX move this into the nf4open_otw call, after we have
653 * XXX acquired the open owner seqid sync.
654 */
655 mutex_enter(&rp->r_statev4_lock);
656 if (rp->created_v4) {
657 rp->created_v4 = 0;
658 mutex_exit(&rp->r_statev4_lock);
659
660 dnlc_update(dvp, fn, *vpp);
661 /* This is needed so we don't bump the open ref count */
662 just_been_created = 1;
663 } else {
664 mutex_exit(&rp->r_statev4_lock);
665 just_been_created = 0;
666 }
667
668 /*
669 * If caller specified O_TRUNC/FTRUNC, then be sure to set
670 * FWRITE (to drive successful setattr(size=0) after open)
671 */
672 if (flag & FTRUNC)
673 flag |= FWRITE;
674
675 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
676 just_been_created);
677
678 if (!error && !((*vpp)->v_flag & VROOT))
679 dnlc_update(dvp, fn, *vpp);
680
681 nfs_rw_exit(&drp->r_rwlock);
682
683 /* release the hold from vtodv */
684 VN_RELE(dvp);
685
686 /* exchange the shadow for the master vnode, if needed */
687
688 if (error == 0 && IS_SHADOW(*vpp, rp))
689 sv_exchange(vpp);
690
691 return (error);
692 }
693
694 /*
695 * See if there's a "lost open" request to be saved and recovered.
696 */
697 static void
698 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
699 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
700 vnode_t *dvp, OPEN4cargs *open_args)
701 {
702 vfs_t *vfsp;
703 char *srccfp;
704
705 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
706
707 if (error != ETIMEDOUT && error != EINTR &&
708 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
709 lost_rqstp->lr_op = 0;
710 return;
711 }
712
713 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
714 "nfs4open_save_lost_rqst: error %d", error));
715
716 lost_rqstp->lr_op = OP_OPEN;
717
718 /*
719 * The vp (if it is not NULL) and dvp are held and rele'd via
720 * the recovery code. See nfs4_save_lost_rqst.
721 */
722 lost_rqstp->lr_vp = vp;
723 lost_rqstp->lr_dvp = dvp;
724 lost_rqstp->lr_oop = oop;
725 lost_rqstp->lr_osp = NULL;
726 lost_rqstp->lr_lop = NULL;
727 lost_rqstp->lr_cr = cr;
728 lost_rqstp->lr_flk = NULL;
729 lost_rqstp->lr_oacc = open_args->share_access;
730 lost_rqstp->lr_odeny = open_args->share_deny;
731 lost_rqstp->lr_oclaim = open_args->claim;
732 if (open_args->claim == CLAIM_DELEGATE_CUR) {
733 lost_rqstp->lr_ostateid =
734 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
735 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
736 } else {
737 srccfp = open_args->open_claim4_u.cfile;
738 }
739 lost_rqstp->lr_ofile.utf8string_len = 0;
740 lost_rqstp->lr_ofile.utf8string_val = NULL;
741 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
742 lost_rqstp->lr_putfirst = FALSE;
743 }
744
745 struct nfs4_excl_time {
746 uint32 seconds;
747 uint32 nseconds;
748 };
749
750 /*
751 * The OPEN operation creates and/or opens a regular file
752 *
753 * ARGSUSED
754 */
755 static int
756 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
757 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
758 enum createmode4 createmode, int file_just_been_created)
759 {
760 rnode4_t *rp;
761 rnode4_t *drp = VTOR4(dvp);
762 vnode_t *vp = NULL;
763 vnode_t *vpi = *vpp;
764 bool_t needrecov = FALSE;
765
766 int doqueue = 1;
767
768 COMPOUND4args_clnt args;
769 COMPOUND4res_clnt res;
770 nfs_argop4 *argop;
771 nfs_resop4 *resop;
772 int argoplist_size;
773 int idx_open, idx_fattr;
774
775 GETFH4res *gf_res = NULL;
776 OPEN4res *op_res = NULL;
777 nfs4_ga_res_t *garp;
778 fattr4 *attr = NULL;
779 struct nfs4_excl_time verf;
780 bool_t did_excl_setup = FALSE;
781 int created_osp;
782
783 OPEN4cargs *open_args;
784 nfs4_open_owner_t *oop = NULL;
785 nfs4_open_stream_t *osp = NULL;
786 seqid4 seqid = 0;
787 bool_t retry_open = FALSE;
788 nfs4_recov_state_t recov_state;
789 nfs4_lost_rqst_t lost_rqst;
790 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
791 hrtime_t t;
792 int acc = 0;
793 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
794 cred_t *ncr = NULL;
795
796 nfs4_sharedfh_t *otw_sfh;
797 nfs4_sharedfh_t *orig_sfh;
798 int fh_differs = 0;
799 int numops, setgid_flag;
800 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
801
802 /*
803 * Make sure we properly deal with setting the right gid on
804 * a newly created file to reflect the parent's setgid bit
805 */
806 setgid_flag = 0;
807 if (create_flag && in_va) {
808
809 /*
810 * If there is grpid mount flag used or
811 * the parent's directory has the setgid bit set
812 * _and_ the client was able to get a valid mapping
813 * for the parent dir's owner_group, we want to
814 * append NVERIFY(owner_group == dva.va_gid) and
815 * SETATTR to the CREATE compound.
816 */
817 mutex_enter(&drp->r_statelock);
818 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
819 drp->r_attr.va_mode & VSGID) &&
820 drp->r_attr.va_gid != GID_NOBODY) {
821 in_va->va_mask |= AT_GID;
822 in_va->va_gid = drp->r_attr.va_gid;
823 setgid_flag = 1;
824 }
825 mutex_exit(&drp->r_statelock);
826 }
827
828 /*
829 * Normal/non-create compound:
830 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
831 *
832 * Open(create) compound no setgid:
833 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
834 * RESTOREFH + GETATTR
835 *
836 * Open(create) setgid:
837 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
838 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
839 * NVERIFY(grp) + SETATTR
840 */
841 if (setgid_flag) {
842 numops = 10;
843 idx_open = 1;
844 idx_fattr = 3;
845 } else if (create_flag) {
846 numops = 7;
847 idx_open = 2;
848 idx_fattr = 4;
849 } else {
850 numops = 4;
851 idx_open = 1;
852 idx_fattr = 3;
853 }
854
855 args.array_len = numops;
856 argoplist_size = numops * sizeof (nfs_argop4);
857 argop = kmem_alloc(argoplist_size, KM_SLEEP);
858
859 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
860 "open %s open flag 0x%x cred %p", file_name, open_flag,
861 (void *)cr));
862
863 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
864 if (create_flag) {
865 /*
866 * We are to create a file. Initialize the passed in vnode
867 * pointer.
868 */
869 vpi = NULL;
870 } else {
871 /*
872 * Check to see if the client owns a read delegation and is
873 * trying to open for write. If so, then return the delegation
874 * to avoid the server doing a cb_recall and returning DELAY.
875 * NB - we don't use the statev4_lock here because we'd have
876 * to drop the lock anyway and the result would be stale.
877 */
878 if ((open_flag & FWRITE) &&
879 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
880 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
881
882 /*
883 * If the file has a delegation, then do an access check up
884 * front. This avoids having to do an access check later after
885 * we've already done start_op, which could deadlock.
886 */
887 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
888 if (open_flag & FREAD &&
889 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
890 acc |= VREAD;
891 if (open_flag & FWRITE &&
892 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
893 acc |= VWRITE;
894 }
895 }
896
897 drp = VTOR4(dvp);
898
899 recov_state.rs_flags = 0;
900 recov_state.rs_num_retry_despite_err = 0;
901 cred_otw = cr;
902
903 recov_retry:
904 fh_differs = 0;
905 nfs4_error_zinit(&e);
906
907 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
908 if (e.error) {
909 if (ncr != NULL)
910 crfree(ncr);
911 kmem_free(argop, argoplist_size);
912 return (e.error);
913 }
914
915 args.ctag = TAG_OPEN;
916 args.array_len = numops;
917 args.array = argop;
918
919 /* putfh directory fh */
920 argop[0].argop = OP_CPUTFH;
921 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
922
923 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
924 argop[idx_open].argop = OP_COPEN;
925 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
926 open_args->claim = CLAIM_NULL;
927
928 /* name of file */
929 open_args->open_claim4_u.cfile = file_name;
930 open_args->owner.owner_len = 0;
931 open_args->owner.owner_val = NULL;
932
933 if (create_flag) {
934 /* CREATE a file */
935 open_args->opentype = OPEN4_CREATE;
936 open_args->mode = createmode;
937 if (createmode == EXCLUSIVE4) {
938 if (did_excl_setup == FALSE) {
939 verf.seconds = zone_get_hostid(NULL);
940 if (verf.seconds != 0)
941 verf.nseconds = newnum();
942 else {
943 timestruc_t now;
944
945 gethrestime(&now);
946 verf.seconds = now.tv_sec;
947 verf.nseconds = now.tv_nsec;
948 }
949 /*
950 * Since the server will use this value for the
951 * mtime, make sure that it can't overflow. Zero
952 * out the MSB. The actual value does not matter
953 * here, only its uniqueness.
954 */
955 verf.seconds &= INT32_MAX;
956 did_excl_setup = TRUE;
957 }
958
959 /* Now copy over verifier to OPEN4args. */
960 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
961 } else {
962 int v_error;
963 bitmap4 supp_attrs;
964 servinfo4_t *svp;
965
966 attr = &open_args->createhow4_u.createattrs;
967
968 svp = drp->r_server;
969 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
970 supp_attrs = svp->sv_supp_attrs;
971 nfs_rw_exit(&svp->sv_lock);
972
973 /* GUARDED4 or UNCHECKED4 */
974 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
975 supp_attrs);
976 if (v_error) {
977 bzero(attr, sizeof (*attr));
978 nfs4args_copen_free(open_args);
979 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
980 &recov_state, FALSE);
981 if (ncr != NULL)
982 crfree(ncr);
983 kmem_free(argop, argoplist_size);
984 return (v_error);
985 }
986 }
987 } else {
988 /* NO CREATE */
989 open_args->opentype = OPEN4_NOCREATE;
990 }
991
992 if (recov_state.rs_sp != NULL) {
993 mutex_enter(&recov_state.rs_sp->s_lock);
994 open_args->owner.clientid = recov_state.rs_sp->clientid;
995 mutex_exit(&recov_state.rs_sp->s_lock);
996 } else {
997 /* XXX should we just fail here? */
998 open_args->owner.clientid = 0;
999 }
1000
1001 /*
1002 * This increments oop's ref count or creates a temporary 'just_created'
1003 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1004 * completes.
1005 */
1006 mutex_enter(&VTOMI4(dvp)->mi_lock);
1007
1008 /* See if a permanent or just created open owner exists */
1009 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1010 if (!oop) {
1011 /*
1012 * This open owner does not exist so create a temporary
1013 * just created one.
1014 */
1015 oop = create_open_owner(cr, VTOMI4(dvp));
1016 ASSERT(oop != NULL);
1017 }
1018 mutex_exit(&VTOMI4(dvp)->mi_lock);
1019
1020 /* this length never changes, do alloc before seqid sync */
1021 open_args->owner.owner_len = sizeof (oop->oo_name);
1022 open_args->owner.owner_val =
1023 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1024
1025 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1026 if (e.error == EAGAIN) {
1027 open_owner_rele(oop);
1028 nfs4args_copen_free(open_args);
1029 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1030 if (ncr != NULL) {
1031 crfree(ncr);
1032 ncr = NULL;
1033 }
1034 goto recov_retry;
1035 }
1036
1037 /* Check to see if we need to do the OTW call */
1038 if (!create_flag) {
1039 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1040 file_just_been_created, &e.error, acc, &recov_state)) {
1041
1042 /*
1043 * The OTW open is not necessary. Either
1044 * the open can succeed without it (eg.
1045 * delegation, error == 0) or the open
1046 * must fail due to an access failure
1047 * (error != 0). In either case, tidy
1048 * up and return.
1049 */
1050
1051 nfs4_end_open_seqid_sync(oop);
1052 open_owner_rele(oop);
1053 nfs4args_copen_free(open_args);
1054 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1055 if (ncr != NULL)
1056 crfree(ncr);
1057 kmem_free(argop, argoplist_size);
1058 return (e.error);
1059 }
1060 }
1061
1062 bcopy(&oop->oo_name, open_args->owner.owner_val,
1063 open_args->owner.owner_len);
1064
1065 seqid = nfs4_get_open_seqid(oop) + 1;
1066 open_args->seqid = seqid;
1067 open_args->share_access = 0;
1068 if (open_flag & FREAD)
1069 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1070 if (open_flag & FWRITE)
1071 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1072 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1073
1074
1075
1076 /*
1077 * getfh w/sanity check for idx_open/idx_fattr
1078 */
1079 ASSERT((idx_open + 1) == (idx_fattr - 1));
1080 argop[idx_open + 1].argop = OP_GETFH;
1081
1082 /* getattr */
1083 argop[idx_fattr].argop = OP_GETATTR;
1084 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1085 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1086
1087 if (setgid_flag) {
1088 vattr_t _v;
1089 servinfo4_t *svp;
1090 bitmap4 supp_attrs;
1091
1092 svp = drp->r_server;
1093 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1094 supp_attrs = svp->sv_supp_attrs;
1095 nfs_rw_exit(&svp->sv_lock);
1096
1097 /*
1098 * For setgid case, we need to:
1099 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1100 */
1101 argop[4].argop = OP_SAVEFH;
1102
1103 argop[5].argop = OP_CPUTFH;
1104 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1105
1106 argop[6].argop = OP_GETATTR;
1107 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1108 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1109
1110 argop[7].argop = OP_RESTOREFH;
1111
1112 /*
1113 * nverify
1114 */
1115 _v.va_mask = AT_GID;
1116 _v.va_gid = in_va->va_gid;
1117 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1118 supp_attrs))) {
1119
1120 /*
1121 * setattr
1122 *
1123 * We _know_ we're not messing with AT_SIZE or
1124 * AT_XTIME, so no need for stateid or flags.
1125 * Also we specify NULL rp since we're only
1126 * interested in setting owner_group attributes.
1127 */
1128 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1129 supp_attrs, &e.error, 0);
1130 if (e.error)
1131 nfs4args_verify_free(&argop[8]);
1132 }
1133
1134 if (e.error) {
1135 /*
1136 * XXX - Revisit the last argument to nfs4_end_op()
1137 * once 5020486 is fixed.
1138 */
1139 nfs4_end_open_seqid_sync(oop);
1140 open_owner_rele(oop);
1141 nfs4args_copen_free(open_args);
1142 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1143 if (ncr != NULL)
1144 crfree(ncr);
1145 kmem_free(argop, argoplist_size);
1146 return (e.error);
1147 }
1148 } else if (create_flag) {
1149 argop[1].argop = OP_SAVEFH;
1150
1151 argop[5].argop = OP_RESTOREFH;
1152
1153 argop[6].argop = OP_GETATTR;
1154 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1155 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1156 }
1157
1158 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1159 "nfs4open_otw: %s call, nm %s, rp %s",
1160 needrecov ? "recov" : "first", file_name,
1161 rnode4info(VTOR4(dvp))));
1162
1163 t = gethrtime();
1164
1165 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1166
1167 if (!e.error && nfs4_need_to_bump_seqid(&res))
1168 nfs4_set_open_seqid(seqid, oop, args.ctag);
1169
1170 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1171
1172 if (e.error || needrecov) {
1173 bool_t abort = FALSE;
1174
1175 if (needrecov) {
1176 nfs4_bseqid_entry_t *bsep = NULL;
1177
1178 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1179 cred_otw, vpi, dvp, open_args);
1180
1181 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1182 bsep = nfs4_create_bseqid_entry(oop, NULL,
1183 vpi, 0, args.ctag, open_args->seqid);
1184 num_bseqid_retry--;
1185 }
1186
1187 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1188 NULL, lost_rqst.lr_op == OP_OPEN ?
1189 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1190
1191 if (bsep)
1192 kmem_free(bsep, sizeof (*bsep));
1193 /* give up if we keep getting BAD_SEQID */
1194 if (num_bseqid_retry == 0)
1195 abort = TRUE;
1196 if (abort == TRUE && e.error == 0)
1197 e.error = geterrno4(res.status);
1198 }
1199 nfs4_end_open_seqid_sync(oop);
1200 open_owner_rele(oop);
1201 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1202 nfs4args_copen_free(open_args);
1203 if (setgid_flag) {
1204 nfs4args_verify_free(&argop[8]);
1205 nfs4args_setattr_free(&argop[9]);
1206 }
1207 if (!e.error)
1208 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1209 if (ncr != NULL) {
1210 crfree(ncr);
1211 ncr = NULL;
1212 }
1213 if (!needrecov || abort == TRUE || e.error == EINTR ||
1214 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1215 kmem_free(argop, argoplist_size);
1216 return (e.error);
1217 }
1218 goto recov_retry;
1219 }
1220
1221 /*
1222 * Will check and update lease after checking the rflag for
1223 * OPEN_CONFIRM in the successful OPEN call.
1224 */
1225 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1226
1227 /*
1228 * XXX what if we're crossing mount points from server1:/drp
1229 * to server2:/drp/rp.
1230 */
1231
1232 /* Signal our end of use of the open seqid */
1233 nfs4_end_open_seqid_sync(oop);
1234
1235 /*
1236 * This will destroy the open owner if it was just created,
1237 * and no one else has put a reference on it.
1238 */
1239 open_owner_rele(oop);
1240 if (create_flag && (createmode != EXCLUSIVE4) &&
1241 res.status == NFS4ERR_BADOWNER)
1242 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1243
1244 e.error = geterrno4(res.status);
1245 nfs4args_copen_free(open_args);
1246 if (setgid_flag) {
1247 nfs4args_verify_free(&argop[8]);
1248 nfs4args_setattr_free(&argop[9]);
1249 }
1250 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1251 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1252 /*
1253 * If the reply is NFS4ERR_ACCESS, it may be because
1254 * we are root (no root net access). If the real uid
1255 * is not root, then retry with the real uid instead.
1256 */
1257 if (ncr != NULL) {
1258 crfree(ncr);
1259 ncr = NULL;
1260 }
1261 if (res.status == NFS4ERR_ACCESS &&
1262 (ncr = crnetadjust(cred_otw)) != NULL) {
1263 cred_otw = ncr;
1264 goto recov_retry;
1265 }
1266 kmem_free(argop, argoplist_size);
1267 return (e.error);
1268 }
1269
1270 resop = &res.array[idx_open]; /* open res */
1271 op_res = &resop->nfs_resop4_u.opopen;
1272
1273 #ifdef DEBUG
1274 /*
1275 * verify attrset bitmap
1276 */
1277 if (create_flag &&
1278 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1279 /* make sure attrset returned is what we asked for */
1280 /* XXX Ignore this 'error' for now */
1281 if (attr->attrmask != op_res->attrset)
1282 /* EMPTY */;
1283 }
1284 #endif
1285
1286 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1287 mutex_enter(&VTOMI4(dvp)->mi_lock);
1288 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1289 mutex_exit(&VTOMI4(dvp)->mi_lock);
1290 }
1291
1292 resop = &res.array[idx_open + 1]; /* getfh res */
1293 gf_res = &resop->nfs_resop4_u.opgetfh;
1294
1295 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1296
1297 /*
1298 * The open stateid has been updated on the server but not
1299 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1300 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1301 * WRITE call. That, however, will use the old stateid, so go ahead
1302 * and update the open stateid now, before any call to makenfs4node.
1303 */
1304 if (vpi) {
1305 nfs4_open_stream_t *tmp_osp;
1306 rnode4_t *tmp_rp = VTOR4(vpi);
1307
1308 tmp_osp = find_open_stream(oop, tmp_rp);
1309 if (tmp_osp) {
1310 tmp_osp->open_stateid = op_res->stateid;
1311 mutex_exit(&tmp_osp->os_sync_lock);
1312 open_stream_rele(tmp_osp, tmp_rp);
1313 }
1314
1315 /*
1316 * We must determine if the file handle given by the otw open
1317 * is the same as the file handle which was passed in with
1318 * *vpp. This case can be reached if the file we are trying
1319 * to open has been removed and another file has been created
1320 * having the same file name. The passed in vnode is released
1321 * later.
1322 */
1323 orig_sfh = VTOR4(vpi)->r_fh;
1324 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1325 }
1326
1327 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1328
1329 if (create_flag || fh_differs) {
1330 int rnode_err = 0;
1331
1332 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1333 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1334
1335 if (e.error)
1336 PURGE_ATTRCACHE4(vp);
1337 /*
1338 * For the newly created vp case, make sure the rnode
1339 * isn't bad before using it.
1340 */
1341 mutex_enter(&(VTOR4(vp))->r_statelock);
1342 if (VTOR4(vp)->r_flags & R4RECOVERR)
1343 rnode_err = EIO;
1344 mutex_exit(&(VTOR4(vp))->r_statelock);
1345
1346 if (rnode_err) {
1347 nfs4_end_open_seqid_sync(oop);
1348 nfs4args_copen_free(open_args);
1349 if (setgid_flag) {
1350 nfs4args_verify_free(&argop[8]);
1351 nfs4args_setattr_free(&argop[9]);
1352 }
1353 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1354 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1355 needrecov);
1356 open_owner_rele(oop);
1357 VN_RELE(vp);
1358 if (ncr != NULL)
1359 crfree(ncr);
1360 sfh4_rele(&otw_sfh);
1361 kmem_free(argop, argoplist_size);
1362 return (EIO);
1363 }
1364 } else {
1365 vp = vpi;
1366 }
1367 sfh4_rele(&otw_sfh);
1368
1369 /*
1370 * It seems odd to get a full set of attrs and then not update
1371 * the object's attrcache in the non-create case. Create case uses
1372 * the attrs since makenfs4node checks to see if the attrs need to
1373 * be updated (and then updates them). The non-create case should
1374 * update attrs also.
1375 */
1376 if (! create_flag && ! fh_differs && !e.error) {
1377 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1378 }
1379
1380 nfs4_error_zinit(&e);
1381 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1382 /* This does not do recovery for vp explicitly. */
1383 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1384 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1385
1386 if (e.error || e.stat) {
1387 nfs4_end_open_seqid_sync(oop);
1388 nfs4args_copen_free(open_args);
1389 if (setgid_flag) {
1390 nfs4args_verify_free(&argop[8]);
1391 nfs4args_setattr_free(&argop[9]);
1392 }
1393 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1394 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1395 needrecov);
1396 open_owner_rele(oop);
1397 if (create_flag || fh_differs) {
1398 /* rele the makenfs4node */
1399 VN_RELE(vp);
1400 }
1401 if (ncr != NULL) {
1402 crfree(ncr);
1403 ncr = NULL;
1404 }
1405 if (retry_open == TRUE) {
1406 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1407 "nfs4open_otw: retry the open since OPEN "
1408 "CONFIRM failed with error %d stat %d",
1409 e.error, e.stat));
1410 if (create_flag && createmode == GUARDED4) {
1411 NFS4_DEBUG(nfs4_client_recov_debug,
1412 (CE_NOTE, "nfs4open_otw: switch "
1413 "createmode from GUARDED4 to "
1414 "UNCHECKED4"));
1415 createmode = UNCHECKED4;
1416 }
1417 goto recov_retry;
1418 }
1419 if (!e.error) {
1420 if (create_flag && (createmode != EXCLUSIVE4) &&
1421 e.stat == NFS4ERR_BADOWNER)
1422 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1423
1424 e.error = geterrno4(e.stat);
1425 }
1426 kmem_free(argop, argoplist_size);
1427 return (e.error);
1428 }
1429 }
1430
1431 rp = VTOR4(vp);
1432
1433 mutex_enter(&rp->r_statev4_lock);
1434 if (create_flag)
1435 rp->created_v4 = 1;
1436 mutex_exit(&rp->r_statev4_lock);
1437
1438 mutex_enter(&oop->oo_lock);
1439 /* Doesn't matter if 'oo_just_created' already was set as this */
1440 oop->oo_just_created = NFS4_PERM_CREATED;
1441 if (oop->oo_cred_otw)
1442 crfree(oop->oo_cred_otw);
1443 oop->oo_cred_otw = cred_otw;
1444 crhold(oop->oo_cred_otw);
1445 mutex_exit(&oop->oo_lock);
1446
1447 /* returns with 'os_sync_lock' held */
1448 osp = find_or_create_open_stream(oop, rp, &created_osp);
1449 if (!osp) {
1450 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1451 "nfs4open_otw: failed to create an open stream"));
1452 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1453 "signal our end of use of the open seqid"));
1454
1455 nfs4_end_open_seqid_sync(oop);
1456 open_owner_rele(oop);
1457 nfs4args_copen_free(open_args);
1458 if (setgid_flag) {
1459 nfs4args_verify_free(&argop[8]);
1460 nfs4args_setattr_free(&argop[9]);
1461 }
1462 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1463 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1464 if (create_flag || fh_differs)
1465 VN_RELE(vp);
1466 if (ncr != NULL)
1467 crfree(ncr);
1468
1469 kmem_free(argop, argoplist_size);
1470 return (EINVAL);
1471
1472 }
1473
1474 osp->open_stateid = op_res->stateid;
1475
1476 if (open_flag & FREAD)
1477 osp->os_share_acc_read++;
1478 if (open_flag & FWRITE)
1479 osp->os_share_acc_write++;
1480 osp->os_share_deny_none++;
1481
1482 /*
1483 * Need to reset this bitfield for the possible case where we were
1484 * going to OTW CLOSE the file, got a non-recoverable error, and before
1485 * we could retry the CLOSE, OPENed the file again.
1486 */
1487 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1488 osp->os_final_close = 0;
1489 osp->os_force_close = 0;
1490 #ifdef DEBUG
1491 if (osp->os_failed_reopen)
1492 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1493 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1494 (void *)osp, (void *)cr, rnode4info(rp)));
1495 #endif
1496 osp->os_failed_reopen = 0;
1497
1498 mutex_exit(&osp->os_sync_lock);
1499
1500 nfs4_end_open_seqid_sync(oop);
1501
1502 if (created_osp && recov_state.rs_sp != NULL) {
1503 mutex_enter(&recov_state.rs_sp->s_lock);
1504 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1505 mutex_exit(&recov_state.rs_sp->s_lock);
1506 }
1507
1508 /* get rid of our reference to find oop */
1509 open_owner_rele(oop);
1510
1511 open_stream_rele(osp, rp);
1512
1513 /* accept delegation, if any */
1514 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1515
1516 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1517
1518 if (createmode == EXCLUSIVE4 &&
1519 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1520 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1521 " EXCLUSIVE4: sending a SETATTR"));
1522 /*
1523 * If doing an exclusive create, then generate
1524 * a SETATTR to set the initial attributes.
1525 * Try to set the mtime and the atime to the
1526 * server's current time. It is somewhat
1527 * expected that these fields will be used to
1528 * store the exclusive create cookie. If not,
1529 * server implementors will need to know that
1530 * a SETATTR will follow an exclusive create
1531 * and the cookie should be destroyed if
1532 * appropriate.
1533 *
1534 * The AT_GID and AT_SIZE bits are turned off
1535 * so that the SETATTR request will not attempt
1536 * to process these. The gid will be set
1537 * separately if appropriate. The size is turned
1538 * off because it is assumed that a new file will
1539 * be created empty and if the file wasn't empty,
1540 * then the exclusive create will have failed
1541 * because the file must have existed already.
1542 * Therefore, no truncate operation is needed.
1543 */
1544 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1545 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1546
1547 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1548 if (e.error) {
1549 /*
1550 * Couldn't correct the attributes of
1551 * the newly created file and the
1552 * attributes are wrong. Remove the
1553 * file and return an error to the
1554 * application.
1555 */
1556 /* XXX will this take care of client state ? */
1557 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1558 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1559 " remove file", e.error));
1560 VN_RELE(vp);
1561 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1562 /*
1563 * Since we've reled the vnode and removed
1564 * the file we now need to return the error.
1565 * At this point we don't want to update the
1566 * dircaches, call nfs4_waitfor_purge_complete
1567 * or set vpp to vp so we need to skip these
1568 * as well.
1569 */
1570 goto skip_update_dircaches;
1571 }
1572 }
1573
1574 /*
1575 * If we created or found the correct vnode, due to create_flag or
1576 * fh_differs being set, then update directory cache attribute, readdir
1577 * and dnlc caches.
1578 */
1579 if (create_flag || fh_differs) {
1580 dirattr_info_t dinfo, *dinfop;
1581
1582 /*
1583 * Make sure getattr succeeded before using results.
1584 * note: array index 6 (the 7th op) is getattr(dir) for both
1585 * flavors of open(create).
1586 */
1587 if (create_flag && res.status == NFS4_OK) {
1588 dinfo.di_time_call = t;
1589 dinfo.di_cred = cr;
1590 dinfo.di_garp =
1591 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1592 dinfop = &dinfo;
1593 } else {
1594 dinfop = NULL;
1595 }
1596
1597 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1598 dinfop);
1599 }
1600
1601 /*
1602 * If the page cache for this file was flushed from actions
1603 * above, it was done asynchronously and if that is true,
1604 * there is a need to wait here for it to complete. This must
1605 * be done outside of start_fop/end_fop.
1606 */
1607 (void) nfs4_waitfor_purge_complete(vp);
1608
1609 /*
1610 * It is implicit that we are in the open case (create_flag == 0) since
1611 * fh_differs can only be set to a non-zero value in the open case.
1612 */
1613 if (fh_differs != 0 && vpi != NULL)
1614 VN_RELE(vpi);
1615
1616 /*
1617 * Be sure to set *vpp to the correct value before returning.
1618 */
1619 *vpp = vp;
1620
1621 skip_update_dircaches:
1622
1623 nfs4args_copen_free(open_args);
1624 if (setgid_flag) {
1625 nfs4args_verify_free(&argop[8]);
1626 nfs4args_setattr_free(&argop[9]);
1627 }
1628 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1629
1630 if (ncr)
1631 crfree(ncr);
1632 kmem_free(argop, argoplist_size);
1633 return (e.error);
1634 }
1635
1636 /*
1637 * Reopen an open instance. cf. nfs4open_otw().
1638 *
1639 * Errors are returned by the nfs4_error_t parameter.
1640 * - ep->error contains an errno value or zero.
1641 * - if it is zero, ep->stat is set to an NFS status code, if any.
1642 * If the file could not be reopened, but the caller should continue, the
1643 * file is marked dead and no error values are returned. If the caller
1644 * should stop recovering open files and start over, either the ep->error
1645 * value or ep->stat will indicate an error (either something that requires
1646 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1647 * filehandles) may be handled silently by this routine.
1648 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1649 * will be started, so the caller should not do it.
1650 *
1651 * Gotos:
1652 * - kill_file : reopen failed in a way that warrants marking the
1653 * file dead and setting the open stream's 'os_failed_reopen' to 1. This
1654 * is for cases where recovery is not possible.
1655 * - failed_reopen : same as above, except that the file has already been
1656 * marked dead, so no need to do it again.
1657 * - bailout : reopen failed but we are able to recover and retry the reopen -
1658 * either within this function immediately or via the calling function.
1659 */
1660
1661 void
1662 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1663 open_claim_type4 claim, bool_t frc_use_claim_previous,
1664 bool_t is_recov)
1665 {
1666 COMPOUND4args_clnt args;
1667 COMPOUND4res_clnt res;
1668 nfs_argop4 argop[4];
1669 nfs_resop4 *resop;
1670 OPEN4res *op_res = NULL;
1671 OPEN4cargs *open_args;
1672 GETFH4res *gf_res;
1673 rnode4_t *rp = VTOR4(vp);
1674 int doqueue = 1;
1675 cred_t *cr = NULL, *cred_otw = NULL;
1676 nfs4_open_owner_t *oop = NULL;
1677 seqid4 seqid;
1678 nfs4_ga_res_t *garp;
1679 char fn[MAXNAMELEN];
1680 nfs4_recov_state_t recov = {NULL, 0};
1681 nfs4_lost_rqst_t lost_rqst;
1682 mntinfo4_t *mi = VTOMI4(vp);
1683 bool_t abort;
1684 char *failed_msg = "";
1685 int fh_different;
1686 hrtime_t t;
1687 nfs4_bseqid_entry_t *bsep = NULL;
1688
1689 ASSERT(nfs4_consistent_type(vp));
1690 ASSERT(nfs_zone() == mi->mi_zone);
1691
1692 nfs4_error_zinit(ep);
1693
1694 /* this is the cred used to find the open owner */
1695 cr = state_to_cred(osp);
1696 if (cr == NULL) {
1697 failed_msg = "Couldn't reopen: no cred";
1698 goto kill_file;
1699 }
1700 /* use this cred for OTW operations */
1701 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1702
1703 top:
1704 nfs4_error_zinit(ep);
1705
1706 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1707 /* File system has been unmounted, quit */
1708 ep->error = EIO;
1709 failed_msg = "Couldn't reopen: file system has been unmounted";
1710 goto kill_file;
1711 }
1712
1713 oop = osp->os_open_owner;
1714
1715 ASSERT(oop != NULL);
1716 if (oop == NULL) { /* be defensive in non-DEBUG */
1717 failed_msg = "can't reopen: no open owner";
1718 goto kill_file;
1719 }
1720 open_owner_hold(oop);
1721
1722 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1723 if (ep->error) {
1724 open_owner_rele(oop);
1725 oop = NULL;
1726 goto bailout;
1727 }
1728
1729 /*
1730 * If the rnode has a delegation and the delegation has been
1731 * recovered and the server didn't request a recall and the caller
1732 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1733 * recovery) and the rnode hasn't been marked dead, then install
1734 * the delegation stateid in the open stream. Otherwise, proceed
1735 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1736 */
1737 mutex_enter(&rp->r_statev4_lock);
1738 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1739 !rp->r_deleg_return_pending &&
1740 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1741 !rp->r_deleg_needs_recall &&
1742 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1743 !(rp->r_flags & R4RECOVERR)) {
1744 mutex_enter(&osp->os_sync_lock);
1745 osp->os_delegation = 1;
1746 osp->open_stateid = rp->r_deleg_stateid;
1747 mutex_exit(&osp->os_sync_lock);
1748 mutex_exit(&rp->r_statev4_lock);
1749 goto bailout;
1750 }
1751 mutex_exit(&rp->r_statev4_lock);
1752
1753 /*
1754 * If the file failed recovery, just quit. This failure need not
1755 * affect other reopens, so don't return an error.
1756 */
1757 mutex_enter(&rp->r_statelock);
1758 if (rp->r_flags & R4RECOVERR) {
1759 mutex_exit(&rp->r_statelock);
1760 ep->error = 0;
1761 goto failed_reopen;
1762 }
1763 mutex_exit(&rp->r_statelock);
1764
1765 /*
1766 * argop is empty here
1767 *
1768 * PUTFH, OPEN, GETFH, GETATTR
1769 */
1770 args.ctag = TAG_REOPEN;
1771 args.array_len = 4;
1772 args.array = argop;
1773
1774 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1775 "nfs4_reopen: file is type %d, id %s",
1776 vp->v_type, rnode4info(VTOR4(vp))));
1777
1778 argop[0].argop = OP_CPUTFH;
1779
1780 if (claim != CLAIM_PREVIOUS) {
1781 /*
1782 * if this is a file mount then
1783 * use the mntinfo parentfh
1784 */
1785 argop[0].nfs_argop4_u.opcputfh.sfh =
1786 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1787 VTOSV(vp)->sv_dfh;
1788 } else {
1789 /* putfh fh to reopen */
1790 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1791 }
1792
1793 argop[1].argop = OP_COPEN;
1794 open_args = &argop[1].nfs_argop4_u.opcopen;
1795 open_args->claim = claim;
1796
1797 if (claim == CLAIM_NULL) {
1798
1799 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1800 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1801 "failed for vp 0x%p for CLAIM_NULL with %m",
1802 (void *)vp);
1803 failed_msg = "Couldn't reopen: vtoname failed for "
1804 "CLAIM_NULL";
1805 /* nothing allocated yet */
1806 goto kill_file;
1807 }
1808
1809 open_args->open_claim4_u.cfile = fn;
1810 } else if (claim == CLAIM_PREVIOUS) {
1811
1812 /*
1813 * We have two cases to deal with here:
1814 * 1) We're being called to reopen files in order to satisfy
1815 * a lock operation request which requires us to explicitly
1816 * reopen files which were opened under a delegation. If
1817 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1818 * that case, frc_use_claim_previous is TRUE and we must
1819 * use the rnode's current delegation type (r_deleg_type).
1820 * 2) We're reopening files during some form of recovery.
1821 * In this case, frc_use_claim_previous is FALSE and we
1822 * use the delegation type appropriate for recovery
1823 * (r_deleg_needs_recovery).
1824 */
1825 mutex_enter(&rp->r_statev4_lock);
1826 open_args->open_claim4_u.delegate_type =
1827 frc_use_claim_previous ?
1828 rp->r_deleg_type :
1829 rp->r_deleg_needs_recovery;
1830 mutex_exit(&rp->r_statev4_lock);
1831
1832 } else if (claim == CLAIM_DELEGATE_CUR) {
1833
1834 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1835 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1836 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1837 "with %m", (void *)vp);
1838 failed_msg = "Couldn't reopen: vtoname failed for "
1839 "CLAIM_DELEGATE_CUR";
1840 /* nothing allocated yet */
1841 goto kill_file;
1842 }
1843
1844 mutex_enter(&rp->r_statev4_lock);
1845 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1846 rp->r_deleg_stateid;
1847 mutex_exit(&rp->r_statev4_lock);
1848
1849 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1850 }
1851 open_args->opentype = OPEN4_NOCREATE;
1852 open_args->owner.clientid = mi2clientid(mi);
1853 open_args->owner.owner_len = sizeof (oop->oo_name);
1854 open_args->owner.owner_val =
1855 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1856 bcopy(&oop->oo_name, open_args->owner.owner_val,
1857 open_args->owner.owner_len);
1858 open_args->share_access = 0;
1859 open_args->share_deny = 0;
1860
1861 mutex_enter(&osp->os_sync_lock);
1862 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1863 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1864 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1865 (void *)osp, (void *)rp, osp->os_share_acc_read,
1866 osp->os_share_acc_write, osp->os_open_ref_count,
1867 osp->os_mmap_read, osp->os_mmap_write, claim));
1868
1869 if (osp->os_share_acc_read || osp->os_mmap_read)
1870 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1871 if (osp->os_share_acc_write || osp->os_mmap_write)
1872 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1873 if (osp->os_share_deny_read)
1874 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1875 if (osp->os_share_deny_write)
1876 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1877 mutex_exit(&osp->os_sync_lock);
1878
1879 seqid = nfs4_get_open_seqid(oop) + 1;
1880 open_args->seqid = seqid;
1881
1882 /* Construct the getfh part of the compound */
1883 argop[2].argop = OP_GETFH;
1884
1885 /* Construct the getattr part of the compound */
1886 argop[3].argop = OP_GETATTR;
1887 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1888 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1889
1890 t = gethrtime();
1891
1892 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1893
1894 if (ep->error) {
1895 if (!is_recov && !frc_use_claim_previous &&
1896 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1897 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1898 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1899 cred_otw, vp, NULL, open_args);
1900 abort = nfs4_start_recovery(ep,
1901 VTOMI4(vp), vp, NULL, NULL,
1902 lost_rqst.lr_op == OP_OPEN ?
1903 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1904 nfs4args_copen_free(open_args);
1905 goto bailout;
1906 }
1907
1908 nfs4args_copen_free(open_args);
1909
1910 if (ep->error == EACCES && cred_otw != cr) {
1911 crfree(cred_otw);
1912 cred_otw = cr;
1913 crhold(cred_otw);
1914 nfs4_end_open_seqid_sync(oop);
1915 open_owner_rele(oop);
1916 oop = NULL;
1917 goto top;
1918 }
1919 if (ep->error == ETIMEDOUT)
1920 goto bailout;
1921 failed_msg = "Couldn't reopen: rpc error";
1922 goto kill_file;
1923 }
1924
1925 if (nfs4_need_to_bump_seqid(&res))
1926 nfs4_set_open_seqid(seqid, oop, args.ctag);
1927
1928 switch (res.status) {
1929 case NFS4_OK:
1930 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1931 mutex_enter(&rp->r_statelock);
1932 rp->r_delay_interval = 0;
1933 mutex_exit(&rp->r_statelock);
1934 }
1935 break;
1936 case NFS4ERR_BAD_SEQID:
1937 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1938 args.ctag, open_args->seqid);
1939
1940 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1941 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1942 NULL, OP_OPEN, bsep, NULL, NULL);
1943
1944 nfs4args_copen_free(open_args);
1945 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1946 nfs4_end_open_seqid_sync(oop);
1947 open_owner_rele(oop);
1948 oop = NULL;
1949 kmem_free(bsep, sizeof (*bsep));
1950
1951 goto kill_file;
1952 case NFS4ERR_NO_GRACE:
1953 nfs4args_copen_free(open_args);
1954 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1955 nfs4_end_open_seqid_sync(oop);
1956 open_owner_rele(oop);
1957 oop = NULL;
1958 if (claim == CLAIM_PREVIOUS) {
1959 /*
1960 * Retry as a plain open. We don't need to worry about
1961 * checking the changeinfo: it is acceptable for a
1962 * client to re-open a file and continue processing
1963 * (in the absence of locks).
1964 */
1965 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1966 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1967 "will retry as CLAIM_NULL"));
1968 claim = CLAIM_NULL;
1969 nfs4_mi_kstat_inc_no_grace(mi);
1970 goto top;
1971 }
1972 failed_msg =
1973 "Couldn't reopen: tried reclaim outside grace period. ";
1974 goto kill_file;
1975 case NFS4ERR_GRACE:
1976 nfs4_set_grace_wait(mi);
1977 nfs4args_copen_free(open_args);
1978 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1979 nfs4_end_open_seqid_sync(oop);
1980 open_owner_rele(oop);
1981 oop = NULL;
1982 ep->error = nfs4_wait_for_grace(mi, &recov);
1983 if (ep->error != 0)
1984 goto bailout;
1985 goto top;
1986 case NFS4ERR_DELAY:
1987 nfs4_set_delay_wait(vp);
1988 nfs4args_copen_free(open_args);
1989 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1990 nfs4_end_open_seqid_sync(oop);
1991 open_owner_rele(oop);
1992 oop = NULL;
1993 ep->error = nfs4_wait_for_delay(vp, &recov);
1994 nfs4_mi_kstat_inc_delay(mi);
1995 if (ep->error != 0)
1996 goto bailout;
1997 goto top;
1998 case NFS4ERR_FHEXPIRED:
1999 /* recover filehandle and retry */
2000 abort = nfs4_start_recovery(ep,
2001 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2002 nfs4args_copen_free(open_args);
2003 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2004 nfs4_end_open_seqid_sync(oop);
2005 open_owner_rele(oop);
2006 oop = NULL;
2007 if (abort == FALSE)
2008 goto top;
2009 failed_msg = "Couldn't reopen: recovery aborted";
2010 goto kill_file;
2011 case NFS4ERR_RESOURCE:
2012 case NFS4ERR_STALE_CLIENTID:
2013 case NFS4ERR_WRONGSEC:
2014 case NFS4ERR_EXPIRED:
2015 /*
2016 * Do not mark the file dead and let the calling
2017 * function initiate recovery.
2018 */
2019 nfs4args_copen_free(open_args);
2020 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2021 nfs4_end_open_seqid_sync(oop);
2022 open_owner_rele(oop);
2023 oop = NULL;
2024 goto bailout;
2025 case NFS4ERR_ACCESS:
2026 if (cred_otw != cr) {
2027 crfree(cred_otw);
2028 cred_otw = cr;
2029 crhold(cred_otw);
2030 nfs4args_copen_free(open_args);
2031 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2032 nfs4_end_open_seqid_sync(oop);
2033 open_owner_rele(oop);
2034 oop = NULL;
2035 goto top;
2036 }
2037 /* fall through */
2038 default:
2039 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2040 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2041 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2042 rnode4info(VTOR4(vp))));
2043 failed_msg = "Couldn't reopen: NFSv4 error";
2044 nfs4args_copen_free(open_args);
2045 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2046 goto kill_file;
2047 }
2048
2049 resop = &res.array[1]; /* open res */
2050 op_res = &resop->nfs_resop4_u.opopen;
2051
2052 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2053
2054 /*
2055 * Check if the path we reopened really is the same
2056 * file. We could end up in a situation where the file
2057 * was removed and a new file created with the same name.
2058 */
2059 resop = &res.array[2];
2060 gf_res = &resop->nfs_resop4_u.opgetfh;
2061 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2062 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2063 if (fh_different) {
2064 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2065 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2066 /* Oops, we don't have the same file */
2067 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2068 failed_msg = "Couldn't reopen: Persistent "
2069 "file handle changed";
2070 else
2071 failed_msg = "Couldn't reopen: Volatile "
2072 "(no expire on open) file handle changed";
2073
2074 nfs4args_copen_free(open_args);
2075 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2076 nfs_rw_exit(&mi->mi_fh_lock);
2077 goto kill_file;
2078
2079 } else {
2080 /*
2081 * We have volatile file handles that don't compare.
2082 * If the fids are the same then we assume that the
2083 * file handle expired but the rnode still refers to
2084 * the same file object.
2085 *
2086 * First check that we have fids or not.
2087 * If we don't we have a dumb server so we will
2088 * just assume every thing is ok for now.
2089 */
2090 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2091 rp->r_attr.va_mask & AT_NODEID &&
2092 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2093 /*
2094 * We have fids, but they don't
2095 * compare. So kill the file.
2096 */
2097 failed_msg =
2098 "Couldn't reopen: file handle changed"
2099 " due to mismatched fids";
2100 nfs4args_copen_free(open_args);
2101 (void) xdr_free(xdr_COMPOUND4res_clnt,
2102 (caddr_t)&res);
2103 nfs_rw_exit(&mi->mi_fh_lock);
2104 goto kill_file;
2105 } else {
2106 /*
2107 * We have volatile file handles that refers
2108 * to the same file (at least they have the
2109 * same fid) or we don't have fids so we
2110 * can't tell. :(. We'll be a kind and accepting
2111 * client so we'll update the rnode's file
2112 * handle with the otw handle.
2113 *
2114 * We need to drop mi->mi_fh_lock since
2115 * sh4_update acquires it. Since there is
2116 * only one recovery thread there is no
2117 * race.
2118 */
2119 nfs_rw_exit(&mi->mi_fh_lock);
2120 sfh4_update(rp->r_fh, &gf_res->object);
2121 }
2122 }
2123 } else {
2124 nfs_rw_exit(&mi->mi_fh_lock);
2125 }
2126
2127 ASSERT(nfs4_consistent_type(vp));
2128
2129 /*
2130 * If the server wanted an OPEN_CONFIRM but that fails, just start
2131 * over. Presumably if there is a persistent error it will show up
2132 * when we resend the OPEN.
2133 */
2134 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2135 bool_t retry_open = FALSE;
2136
2137 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2138 cred_otw, is_recov, &retry_open,
2139 oop, FALSE, ep, NULL);
2140 if (ep->error || ep->stat) {
2141 nfs4args_copen_free(open_args);
2142 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2143 nfs4_end_open_seqid_sync(oop);
2144 open_owner_rele(oop);
2145 oop = NULL;
2146 goto top;
2147 }
2148 }
2149
2150 mutex_enter(&osp->os_sync_lock);
2151 osp->open_stateid = op_res->stateid;
2152 osp->os_delegation = 0;
2153 /*
2154 * Need to reset this bitfield for the possible case where we were
2155 * going to OTW CLOSE the file, got a non-recoverable error, and before
2156 * we could retry the CLOSE, OPENed the file again.
2157 */
2158 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2159 osp->os_final_close = 0;
2160 osp->os_force_close = 0;
2161 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2162 osp->os_dc_openacc = open_args->share_access;
2163 mutex_exit(&osp->os_sync_lock);
2164
2165 nfs4_end_open_seqid_sync(oop);
2166
2167 /* accept delegation, if any */
2168 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2169
2170 nfs4args_copen_free(open_args);
2171
2172 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2173
2174 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2175
2176 ASSERT(nfs4_consistent_type(vp));
2177
2178 open_owner_rele(oop);
2179 crfree(cr);
2180 crfree(cred_otw);
2181 return;
2182
2183 kill_file:
2184 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2185 failed_reopen:
2186 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2187 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2188 (void *)osp, (void *)cr, rnode4info(rp)));
2189 mutex_enter(&osp->os_sync_lock);
2190 osp->os_failed_reopen = 1;
2191 mutex_exit(&osp->os_sync_lock);
2192 bailout:
2193 if (oop != NULL) {
2194 nfs4_end_open_seqid_sync(oop);
2195 open_owner_rele(oop);
2196 }
2197 if (cr != NULL)
2198 crfree(cr);
2199 if (cred_otw != NULL)
2200 crfree(cred_otw);
2201 }
2202
2203 /* for . and .. OPENs */
2204 /* ARGSUSED */
2205 static int
2206 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2207 {
2208 rnode4_t *rp;
2209 nfs4_ga_res_t gar;
2210
2211 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2212
2213 /*
2214 * If close-to-open consistency checking is turned off or
2215 * if there is no cached data, we can avoid
2216 * the over the wire getattr. Otherwise, force a
2217 * call to the server to get fresh attributes and to
2218 * check caches. This is required for close-to-open
2219 * consistency.
2220 */
2221 rp = VTOR4(*vpp);
2222 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2223 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2224 return (0);
2225
2226 gar.n4g_va.va_mask = AT_ALL;
2227 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2228 }
2229
2230 /*
2231 * CLOSE a file
2232 */
2233 /* ARGSUSED */
2234 static int
2235 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2236 caller_context_t *ct)
2237 {
2238 rnode4_t *rp;
2239 int error = 0;
2240 int r_error = 0;
2241 int n4error = 0;
2242 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2243
2244 /*
2245 * Remove client state for this (lockowner, file) pair.
2246 * Issue otw v4 call to have the server do the same.
2247 */
2248
2249 rp = VTOR4(vp);
2250
2251 /*
2252 * zone_enter(2) prevents processes from changing zones with NFS files
2253 * open; if we happen to get here from the wrong zone we can't do
2254 * anything over the wire.
2255 */
2256 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2257 /*
2258 * We could attempt to clean up locks, except we're sure
2259 * that the current process didn't acquire any locks on
2260 * the file: any attempt to lock a file belong to another zone
2261 * will fail, and one can't lock an NFS file and then change
2262 * zones, as that fails too.
2263 *
2264 * Returning an error here is the sane thing to do. A
2265 * subsequent call to VN_RELE() which translates to a
2266 * nfs4_inactive() will clean up state: if the zone of the
2267 * vnode's origin is still alive and kicking, the inactive
2268 * thread will handle the request (from the correct zone), and
2269 * everything (minus the OTW close call) should be OK. If the
2270 * zone is going away nfs4_async_inactive() will throw away
2271 * delegations, open streams and cached pages inline.
2272 */
2273 return (EIO);
2274 }
2275
2276 /*
2277 * If we are using local locking for this filesystem, then
2278 * release all of the SYSV style record locks. Otherwise,
2279 * we are doing network locking and we need to release all
2280 * of the network locks. All of the locks held by this
2281 * process on this file are released no matter what the
2282 * incoming reference count is.
2283 */
2284 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2285 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2286 cleanshares(vp, ttoproc(curthread)->p_pid);
2287 } else
2288 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2289
2290 if (e.error) {
2291 struct lm_sysid *lmsid;
2292 lmsid = nfs4_find_sysid(VTOMI4(vp));
2293 if (lmsid == NULL) {
2294 DTRACE_PROBE2(unknown__sysid, int, e.error,
2295 vnode_t *, vp);
2296 } else {
2297 cleanlocks(vp, ttoproc(curthread)->p_pid,
2298 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2299 }
2300 return (e.error);
2301 }
2302
2303 if (count > 1)
2304 return (0);
2305
2306 /*
2307 * If the file has been `unlinked', then purge the
2308 * DNLC so that this vnode will get reycled quicker
2309 * and the .nfs* file on the server will get removed.
2310 */
2311 if (rp->r_unldvp != NULL)
2312 dnlc_purge_vp(vp);
2313
2314 /*
2315 * If the file was open for write and there are pages,
2316 * do a synchronous flush and commit of all of the
2317 * dirty and uncommitted pages.
2318 */
2319 ASSERT(!e.error);
2320 if ((flag & FWRITE) && nfs4_has_pages(vp))
2321 error = nfs4_putpage_commit(vp, 0, 0, cr);
2322
2323 mutex_enter(&rp->r_statelock);
2324 r_error = rp->r_error;
2325 rp->r_error = 0;
2326 mutex_exit(&rp->r_statelock);
2327
2328 /*
2329 * If this file type is one for which no explicit 'open' was
2330 * done, then bail now (ie. no need for protocol 'close'). If
2331 * there was an error w/the vm subsystem, return _that_ error,
2332 * otherwise, return any errors that may've been reported via
2333 * the rnode.
2334 */
2335 if (vp->v_type != VREG)
2336 return (error ? error : r_error);
2337
2338 /*
2339 * The sync putpage commit may have failed above, but since
2340 * we're working w/a regular file, we need to do the protocol
2341 * 'close' (nfs4close_one will figure out if an otw close is
2342 * needed or not). Report any errors _after_ doing the protocol
2343 * 'close'.
2344 */
2345 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2346 n4error = e.error ? e.error : geterrno4(e.stat);
2347
2348 /*
2349 * Error reporting prio (Hi -> Lo)
2350 *
2351 * i) nfs4_putpage_commit (error)
2352 * ii) rnode's (r_error)
2353 * iii) nfs4close_one (n4error)
2354 */
2355 return (error ? error : (r_error ? r_error : n4error));
2356 }
2357
2358 /*
2359 * Initialize *lost_rqstp.
2360 */
2361
2362 static void
2363 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2364 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2365 vnode_t *vp)
2366 {
2367 if (error != ETIMEDOUT && error != EINTR &&
2368 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2369 lost_rqstp->lr_op = 0;
2370 return;
2371 }
2372
2373 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2374 "nfs4close_save_lost_rqst: error %d", error));
2375
2376 lost_rqstp->lr_op = OP_CLOSE;
2377 /*
2378 * The vp is held and rele'd via the recovery code.
2379 * See nfs4_save_lost_rqst.
2380 */
2381 lost_rqstp->lr_vp = vp;
2382 lost_rqstp->lr_dvp = NULL;
2383 lost_rqstp->lr_oop = oop;
2384 lost_rqstp->lr_osp = osp;
2385 ASSERT(osp != NULL);
2386 ASSERT(mutex_owned(&osp->os_sync_lock));
2387 osp->os_pending_close = 1;
2388 lost_rqstp->lr_lop = NULL;
2389 lost_rqstp->lr_cr = cr;
2390 lost_rqstp->lr_flk = NULL;
2391 lost_rqstp->lr_putfirst = FALSE;
2392 }
2393
2394 /*
2395 * Assumes you already have the open seqid sync grabbed as well as the
2396 * 'os_sync_lock'. Note: this will release the open seqid sync and
2397 * 'os_sync_lock' if client recovery starts. Calling functions have to
2398 * be prepared to handle this.
2399 *
2400 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2401 * was needed and was started, and that the calling function should retry
2402 * this function; otherwise it is returned as 0.
2403 *
2404 * Errors are returned via the nfs4_error_t parameter.
2405 */
2406 static void
2407 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2408 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2409 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2410 {
2411 COMPOUND4args_clnt args;
2412 COMPOUND4res_clnt res;
2413 CLOSE4args *close_args;
2414 nfs_resop4 *resop;
2415 nfs_argop4 argop[3];
2416 int doqueue = 1;
2417 mntinfo4_t *mi;
2418 seqid4 seqid;
2419 vnode_t *vp;
2420 bool_t needrecov = FALSE;
2421 nfs4_lost_rqst_t lost_rqst;
2422 hrtime_t t;
2423
2424 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2425
2426 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2427
2428 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2429
2430 /* Only set this to 1 if recovery is started */
2431 *recov = 0;
2432
2433 /* do the OTW call to close the file */
2434
2435 if (close_type == CLOSE_RESEND)
2436 args.ctag = TAG_CLOSE_LOST;
2437 else if (close_type == CLOSE_AFTER_RESEND)
2438 args.ctag = TAG_CLOSE_UNDO;
2439 else
2440 args.ctag = TAG_CLOSE;
2441
2442 args.array_len = 3;
2443 args.array = argop;
2444
2445 vp = RTOV4(rp);
2446
2447 mi = VTOMI4(vp);
2448
2449 /* putfh target fh */
2450 argop[0].argop = OP_CPUTFH;
2451 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2452
2453 argop[1].argop = OP_GETATTR;
2454 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2455 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2456
2457 argop[2].argop = OP_CLOSE;
2458 close_args = &argop[2].nfs_argop4_u.opclose;
2459
2460 seqid = nfs4_get_open_seqid(oop) + 1;
2461
2462 close_args->seqid = seqid;
2463 close_args->open_stateid = osp->open_stateid;
2464
2465 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2466 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2467 rnode4info(rp)));
2468
2469 t = gethrtime();
2470
2471 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2472
2473 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2474 nfs4_set_open_seqid(seqid, oop, args.ctag);
2475 }
2476
2477 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2478 if (ep->error && !needrecov) {
2479 /*
2480 * if there was an error and no recovery is to be done
2481 * then then set up the file to flush its cache if
2482 * needed for the next caller.
2483 */
2484 mutex_enter(&rp->r_statelock);
2485 PURGE_ATTRCACHE4_LOCKED(rp);
2486 rp->r_flags &= ~R4WRITEMODIFIED;
2487 mutex_exit(&rp->r_statelock);
2488 return;
2489 }
2490
2491 if (needrecov) {
2492 bool_t abort;
2493 nfs4_bseqid_entry_t *bsep = NULL;
2494
2495 if (close_type != CLOSE_RESEND)
2496 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2497 osp, cred_otw, vp);
2498
2499 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2500 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2501 0, args.ctag, close_args->seqid);
2502
2503 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2504 "nfs4close_otw: initiating recovery. error %d "
2505 "res.status %d", ep->error, res.status));
2506
2507 /*
2508 * Drop the 'os_sync_lock' here so we don't hit
2509 * a potential recursive mutex_enter via an
2510 * 'open_stream_hold()'.
2511 */
2512 mutex_exit(&osp->os_sync_lock);
2513 *have_sync_lockp = 0;
2514 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2515 (close_type != CLOSE_RESEND &&
2516 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2517 OP_CLOSE, bsep, NULL, NULL);
2518
2519 /* drop open seq sync, and let the calling function regrab it */
2520 nfs4_end_open_seqid_sync(oop);
2521 *did_start_seqid_syncp = 0;
2522
2523 if (bsep)
2524 kmem_free(bsep, sizeof (*bsep));
2525 /*
2526 * For signals, the caller wants to quit, so don't say to
2527 * retry. For forced unmount, if it's a user thread, it
2528 * wants to quit. If it's a recovery thread, the retry
2529 * will happen higher-up on the call stack. Either way,
2530 * don't say to retry.
2531 */
2532 if (abort == FALSE && ep->error != EINTR &&
2533 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2534 close_type != CLOSE_RESEND &&
2535 close_type != CLOSE_AFTER_RESEND)
2536 *recov = 1;
2537 else
2538 *recov = 0;
2539
2540 if (!ep->error)
2541 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2542 return;
2543 }
2544
2545 if (res.status) {
2546 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2547 return;
2548 }
2549
2550 mutex_enter(&rp->r_statev4_lock);
2551 rp->created_v4 = 0;
2552 mutex_exit(&rp->r_statev4_lock);
2553
2554 resop = &res.array[2];
2555 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2556 osp->os_valid = 0;
2557
2558 /*
2559 * This removes the reference obtained at OPEN; ie, when the
2560 * open stream structure was created.
2561 *
2562 * We don't have to worry about calling 'open_stream_rele'
2563 * since we our currently holding a reference to the open
2564 * stream which means the count cannot go to 0 with this
2565 * decrement.
2566 */
2567 ASSERT(osp->os_ref_count >= 2);
2568 osp->os_ref_count--;
2569
2570 if (!ep->error)
2571 nfs4_attr_cache(vp,
2572 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2573 t, cred_otw, TRUE, NULL);
2574
2575 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2576 " returning %d", ep->error));
2577
2578 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2579 }
2580
2581 /* ARGSUSED */
2582 static int
2583 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2584 caller_context_t *ct)
2585 {
2586 rnode4_t *rp;
2587 u_offset_t off;
2588 offset_t diff;
2589 uint_t on;
2590 uint_t n;
2591 caddr_t base;
2592 uint_t flags;
2593 int error;
2594 mntinfo4_t *mi;
2595
2596 rp = VTOR4(vp);
2597
2598 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2599
2600 if (IS_SHADOW(vp, rp))
2601 vp = RTOV4(rp);
2602
2603 if (vp->v_type != VREG)
2604 return (EISDIR);
2605
2606 mi = VTOMI4(vp);
2607
2608 if (nfs_zone() != mi->mi_zone)
2609 return (EIO);
2610
2611 if (uiop->uio_resid == 0)
2612 return (0);
2613
2614 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2615 return (EINVAL);
2616
2617 mutex_enter(&rp->r_statelock);
2618 if (rp->r_flags & R4RECOVERRP)
2619 error = (rp->r_error ? rp->r_error : EIO);
2620 else
2621 error = 0;
2622 mutex_exit(&rp->r_statelock);
2623 if (error)
2624 return (error);
2625
2626 /*
2627 * Bypass VM if caching has been disabled (e.g., locking) or if
2628 * using client-side direct I/O and the file is not mmap'd and
2629 * there are no cached pages.
2630 */
2631 if ((vp->v_flag & VNOCACHE) ||
2632 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2633 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2634 size_t resid = 0;
2635
2636 return (nfs4read(vp, NULL, uiop->uio_loffset,
2637 uiop->uio_resid, &resid, cr, FALSE, uiop));
2638 }
2639
2640 error = 0;
2641
2642 do {
2643 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2644 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2645 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2646
2647 if (error = nfs4_validate_caches(vp, cr))
2648 break;
2649
2650 mutex_enter(&rp->r_statelock);
2651 while (rp->r_flags & R4INCACHEPURGE) {
2652 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2653 mutex_exit(&rp->r_statelock);
2654 return (EINTR);
2655 }
2656 }
2657 diff = rp->r_size - uiop->uio_loffset;
2658 mutex_exit(&rp->r_statelock);
2659 if (diff <= 0)
2660 break;
2661 if (diff < n)
2662 n = (uint_t)diff;
2663
2664 if (vpm_enable) {
2665 /*
2666 * Copy data.
2667 */
2668 error = vpm_data_copy(vp, off + on, n, uiop,
2669 1, NULL, 0, S_READ);
2670 } else {
2671 base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2672 S_READ);
2673
2674 error = uiomove(base + on, n, UIO_READ, uiop);
2675 }
2676
2677 if (!error) {
2678 /*
2679 * If read a whole block or read to eof,
2680 * won't need this buffer again soon.
2681 */
2682 mutex_enter(&rp->r_statelock);
2683 if (n + on == MAXBSIZE ||
2684 uiop->uio_loffset == rp->r_size)
2685 flags = SM_DONTNEED;
2686 else
2687 flags = 0;
2688 mutex_exit(&rp->r_statelock);
2689 if (vpm_enable) {
2690 error = vpm_sync_pages(vp, off, n, flags);
2691 } else {
2692 error = segmap_release(segkmap, base, flags);
2693 }
2694 } else {
2695 if (vpm_enable) {
2696 (void) vpm_sync_pages(vp, off, n, 0);
2697 } else {
2698 (void) segmap_release(segkmap, base, 0);
2699 }
2700 }
2701 } while (!error && uiop->uio_resid > 0);
2702
2703 return (error);
2704 }
2705
2706 /* ARGSUSED */
2707 static int
2708 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2709 caller_context_t *ct)
2710 {
2711 rlim64_t limit = uiop->uio_llimit;
2712 rnode4_t *rp;
2713 u_offset_t off;
2714 caddr_t base;
2715 uint_t flags;
2716 int remainder;
2717 size_t n;
2718 int on;
2719 int error;
2720 int resid;
2721 u_offset_t offset;
2722 mntinfo4_t *mi;
2723 uint_t bsize;
2724
2725 rp = VTOR4(vp);
2726
2727 if (IS_SHADOW(vp, rp))
2728 vp = RTOV4(rp);
2729
2730 if (vp->v_type != VREG)
2731 return (EISDIR);
2732
2733 mi = VTOMI4(vp);
2734
2735 if (nfs_zone() != mi->mi_zone)
2736 return (EIO);
2737
2738 if (uiop->uio_resid == 0)
2739 return (0);
2740
2741 mutex_enter(&rp->r_statelock);
2742 if (rp->r_flags & R4RECOVERRP)
2743 error = (rp->r_error ? rp->r_error : EIO);
2744 else
2745 error = 0;
2746 mutex_exit(&rp->r_statelock);
2747 if (error)
2748 return (error);
2749
2750 if (ioflag & FAPPEND) {
2751 struct vattr va;
2752
2753 /*
2754 * Must serialize if appending.
2755 */
2756 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2757 nfs_rw_exit(&rp->r_rwlock);
2758 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2759 INTR4(vp)))
2760 return (EINTR);
2761 }
2762
2763 va.va_mask = AT_SIZE;
2764 error = nfs4getattr(vp, &va, cr);
2765 if (error)
2766 return (error);
2767 uiop->uio_loffset = va.va_size;
2768 }
2769
2770 offset = uiop->uio_loffset + uiop->uio_resid;
2771
2772 if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2773 return (EINVAL);
2774
2775 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2776 limit = MAXOFFSET_T;
2777
2778 /*
2779 * Check to make sure that the process will not exceed
2780 * its limit on file size. It is okay to write up to
2781 * the limit, but not beyond. Thus, the write which
2782 * reaches the limit will be short and the next write
2783 * will return an error.
2784 */
2785 remainder = 0;
2786 if (offset > uiop->uio_llimit) {
2787 remainder = offset - uiop->uio_llimit;
2788 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2789 if (uiop->uio_resid <= 0) {
2790 proc_t *p = ttoproc(curthread);
2791
2792 uiop->uio_resid += remainder;
2793 mutex_enter(&p->p_lock);
2794 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2795 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2796 mutex_exit(&p->p_lock);
2797 return (EFBIG);
2798 }
2799 }
2800
2801 /* update the change attribute, if we have a write delegation */
2802
2803 mutex_enter(&rp->r_statev4_lock);
2804 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2805 rp->r_deleg_change++;
2806
2807 mutex_exit(&rp->r_statev4_lock);
2808
2809 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2810 return (EINTR);
2811
2812 /*
2813 * Bypass VM if caching has been disabled (e.g., locking) or if
2814 * using client-side direct I/O and the file is not mmap'd and
2815 * there are no cached pages.
2816 */
2817 if ((vp->v_flag & VNOCACHE) ||
2818 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2819 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2820 size_t bufsize;
2821 int count;
2822 u_offset_t org_offset;
2823 stable_how4 stab_comm;
2824 nfs4_fwrite:
2825 if (rp->r_flags & R4STALE) {
2826 resid = uiop->uio_resid;
2827 offset = uiop->uio_loffset;
2828 error = rp->r_error;
2829 /*
2830 * A close may have cleared r_error, if so,
2831 * propagate ESTALE error return properly
2832 */
2833 if (error == 0)
2834 error = ESTALE;
2835 goto bottom;
2836 }
2837
2838 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2839 base = kmem_alloc(bufsize, KM_SLEEP);
2840 do {
2841 if (ioflag & FDSYNC)
2842 stab_comm = DATA_SYNC4;
2843 else
2844 stab_comm = FILE_SYNC4;
2845 resid = uiop->uio_resid;
2846 offset = uiop->uio_loffset;
2847 count = MIN(uiop->uio_resid, bufsize);
2848 org_offset = uiop->uio_loffset;
2849 error = uiomove(base, count, UIO_WRITE, uiop);
2850 if (!error) {
2851 error = nfs4write(vp, base, org_offset,
2852 count, cr, &stab_comm);
2853 if (!error) {
2854 mutex_enter(&rp->r_statelock);
2855 if (rp->r_size < uiop->uio_loffset)
2856 rp->r_size = uiop->uio_loffset;
2857 mutex_exit(&rp->r_statelock);
2858 }
2859 }
2860 } while (!error && uiop->uio_resid > 0);
2861 kmem_free(base, bufsize);
2862 goto bottom;
2863 }
2864
2865 bsize = vp->v_vfsp->vfs_bsize;
2866
2867 do {
2868 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2869 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2870 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2871
2872 resid = uiop->uio_resid;
2873 offset = uiop->uio_loffset;
2874
2875 if (rp->r_flags & R4STALE) {
2876 error = rp->r_error;
2877 /*
2878 * A close may have cleared r_error, if so,
2879 * propagate ESTALE error return properly
2880 */
2881 if (error == 0)
2882 error = ESTALE;
2883 break;
2884 }
2885
2886 /*
2887 * Don't create dirty pages faster than they
2888 * can be cleaned so that the system doesn't
2889 * get imbalanced. If the async queue is
2890 * maxed out, then wait for it to drain before
2891 * creating more dirty pages. Also, wait for
2892 * any threads doing pagewalks in the vop_getattr
2893 * entry points so that they don't block for
2894 * long periods.
2895 */
2896 mutex_enter(&rp->r_statelock);
2897 while ((mi->mi_max_threads != 0 &&
2898 rp->r_awcount > 2 * mi->mi_max_threads) ||
2899 rp->r_gcount > 0) {
2900 if (INTR4(vp)) {
2901 klwp_t *lwp = ttolwp(curthread);
2902
2903 if (lwp != NULL)
2904 lwp->lwp_nostop++;
2905 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2906 mutex_exit(&rp->r_statelock);
2907 if (lwp != NULL)
2908 lwp->lwp_nostop--;
2909 error = EINTR;
2910 goto bottom;
2911 }
2912 if (lwp != NULL)
2913 lwp->lwp_nostop--;
2914 } else
2915 cv_wait(&rp->r_cv, &rp->r_statelock);
2916 }
2917 mutex_exit(&rp->r_statelock);
2918
2919 /*
2920 * Touch the page and fault it in if it is not in core
2921 * before segmap_getmapflt or vpm_data_copy can lock it.
2922 * This is to avoid the deadlock if the buffer is mapped
2923 * to the same file through mmap which we want to write.
2924 */
2925 uio_prefaultpages((long)n, uiop);
2926
2927 if (vpm_enable) {
2928 /*
2929 * It will use kpm mappings, so no need to
2930 * pass an address.
2931 */
2932 error = writerp4(rp, NULL, n, uiop, 0);
2933 } else {
2934 if (segmap_kpm) {
2935 int pon = uiop->uio_loffset & PAGEOFFSET;
2936 size_t pn = MIN(PAGESIZE - pon,
2937 uiop->uio_resid);
2938 int pagecreate;
2939
2940 mutex_enter(&rp->r_statelock);
2941 pagecreate = (pon == 0) && (pn == PAGESIZE ||
2942 uiop->uio_loffset + pn >= rp->r_size);
2943 mutex_exit(&rp->r_statelock);
2944
2945 base = segmap_getmapflt(segkmap, vp, off + on,
2946 pn, !pagecreate, S_WRITE);
2947
2948 error = writerp4(rp, base + pon, n, uiop,
2949 pagecreate);
2950
2951 } else {
2952 base = segmap_getmapflt(segkmap, vp, off + on,
2953 n, 0, S_READ);
2954 error = writerp4(rp, base + on, n, uiop, 0);
2955 }
2956 }
2957
2958 if (!error) {
2959 if (mi->mi_flags & MI4_NOAC)
2960 flags = SM_WRITE;
2961 else if ((uiop->uio_loffset % bsize) == 0 ||
2962 IS_SWAPVP(vp)) {
2963 /*
2964 * Have written a whole block.
2965 * Start an asynchronous write
2966 * and mark the buffer to
2967 * indicate that it won't be
2968 * needed again soon.
2969 */
2970 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
2971 } else
2972 flags = 0;
2973 if ((ioflag & (FSYNC|FDSYNC)) ||
2974 (rp->r_flags & R4OUTOFSPACE)) {
2975 flags &= ~SM_ASYNC;
2976 flags |= SM_WRITE;
2977 }
2978 if (vpm_enable) {
2979 error = vpm_sync_pages(vp, off, n, flags);
2980 } else {
2981 error = segmap_release(segkmap, base, flags);
2982 }
2983 } else {
2984 if (vpm_enable) {
2985 (void) vpm_sync_pages(vp, off, n, 0);
2986 } else {
2987 (void) segmap_release(segkmap, base, 0);
2988 }
2989 /*
2990 * In the event that we got an access error while
2991 * faulting in a page for a write-only file just
2992 * force a write.
2993 */
2994 if (error == EACCES)
2995 goto nfs4_fwrite;
2996 }
2997 } while (!error && uiop->uio_resid > 0);
2998
2999 bottom:
3000 if (error) {
3001 uiop->uio_resid = resid + remainder;
3002 uiop->uio_loffset = offset;
3003 } else {
3004 uiop->uio_resid += remainder;
3005
3006 mutex_enter(&rp->r_statev4_lock);
3007 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3008 gethrestime(&rp->r_attr.va_mtime);
3009 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3010 }
3011 mutex_exit(&rp->r_statev4_lock);
3012 }
3013
3014 nfs_rw_exit(&rp->r_lkserlock);
3015
3016 return (error);
3017 }
3018
3019 /*
3020 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3021 */
3022 static int
3023 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3024 int flags, cred_t *cr)
3025 {
3026 struct buf *bp;
3027 int error;
3028 page_t *savepp;
3029 uchar_t fsdata;
3030 stable_how4 stab_comm;
3031
3032 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3033 bp = pageio_setup(pp, len, vp, flags);
3034 ASSERT(bp != NULL);
3035
3036 /*
3037 * pageio_setup should have set b_addr to 0. This
3038 * is correct since we want to do I/O on a page
3039 * boundary. bp_mapin will use this addr to calculate
3040 * an offset, and then set b_addr to the kernel virtual
3041 * address it allocated for us.
3042 */
3043 ASSERT(bp->b_un.b_addr == 0);
3044
3045 bp->b_edev = 0;
3046 bp->b_dev = 0;
3047 bp->b_lblkno = lbtodb(off);
3048 bp->b_file = vp;
3049 bp->b_offset = (offset_t)off;
3050 bp_mapin(bp);
3051
3052 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3053 freemem > desfree)
3054 stab_comm = UNSTABLE4;
3055 else
3056 stab_comm = FILE_SYNC4;
3057
3058 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3059
3060 bp_mapout(bp);
3061 pageio_done(bp);
3062
3063 if (stab_comm == UNSTABLE4)
3064 fsdata = C_DELAYCOMMIT;
3065 else
3066 fsdata = C_NOCOMMIT;
3067
3068 savepp = pp;
3069 do {
3070 pp->p_fsdata = fsdata;
3071 } while ((pp = pp->p_next) != savepp);
3072
3073 return (error);
3074 }
3075
3076 /*
3077 */
3078 static int
3079 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3080 {
3081 nfs4_open_owner_t *oop;
3082 nfs4_open_stream_t *osp;
3083 rnode4_t *rp = VTOR4(vp);
3084 mntinfo4_t *mi = VTOMI4(vp);
3085 int reopen_needed;
3086
3087 ASSERT(nfs_zone() == mi->mi_zone);
3088
3089
3090 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3091 if (!oop)
3092 return (EIO);
3093
3094 /* returns with 'os_sync_lock' held */
3095 osp = find_open_stream(oop, rp);
3096 if (!osp) {
3097 open_owner_rele(oop);
3098 return (EIO);
3099 }
3100
3101 if (osp->os_failed_reopen) {
3102 mutex_exit(&osp->os_sync_lock);
3103 open_stream_rele(osp, rp);
3104 open_owner_rele(oop);
3105 return (EIO);
3106 }
3107
3108 /*
3109 * Determine whether a reopen is needed. If this
3110 * is a delegation open stream, then the os_delegation bit
3111 * should be set.
3112 */
3113
3114 reopen_needed = osp->os_delegation;
3115
3116 mutex_exit(&osp->os_sync_lock);
3117 open_owner_rele(oop);
3118
3119 if (reopen_needed) {
3120 nfs4_error_zinit(ep);
3121 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3122 mutex_enter(&osp->os_sync_lock);
3123 if (ep->error || ep->stat || osp->os_failed_reopen) {
3124 mutex_exit(&osp->os_sync_lock);
3125 open_stream_rele(osp, rp);
3126 return (EIO);
3127 }
3128 mutex_exit(&osp->os_sync_lock);
3129 }
3130 open_stream_rele(osp, rp);
3131
3132 return (0);
3133 }
3134
3135 /*
3136 * Write to file. Writes to remote server in largest size
3137 * chunks that the server can handle. Write is synchronous.
3138 */
3139 static int
3140 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3141 stable_how4 *stab_comm)
3142 {
3143 mntinfo4_t *mi;
3144 COMPOUND4args_clnt args;
3145 COMPOUND4res_clnt res;
3146 WRITE4args *wargs;
3147 WRITE4res *wres;
3148 nfs_argop4 argop[2];
3149 nfs_resop4 *resop;
3150 int tsize;
3151 stable_how4 stable;
3152 rnode4_t *rp;
3153 int doqueue = 1;
3154 bool_t needrecov;
3155 nfs4_recov_state_t recov_state;
3156 nfs4_stateid_types_t sid_types;
3157 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3158 int recov;
3159
3160 rp = VTOR4(vp);
3161 mi = VTOMI4(vp);
3162
3163 ASSERT(nfs_zone() == mi->mi_zone);
3164
3165 stable = *stab_comm;
3166 *stab_comm = FILE_SYNC4;
3167
3168 needrecov = FALSE;
3169 recov_state.rs_flags = 0;
3170 recov_state.rs_num_retry_despite_err = 0;
3171 nfs4_init_stateid_types(&sid_types);
3172
3173 /* Is curthread the recovery thread? */
3174 mutex_enter(&mi->mi_lock);
3175 recov = (mi->mi_recovthread == curthread);
3176 mutex_exit(&mi->mi_lock);
3177
3178 recov_retry:
3179 args.ctag = TAG_WRITE;
3180 args.array_len = 2;
3181 args.array = argop;
3182
3183 if (!recov) {
3184 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3185 &recov_state, NULL);
3186 if (e.error)
3187 return (e.error);
3188 }
3189
3190 /* 0. putfh target fh */
3191 argop[0].argop = OP_CPUTFH;
3192 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3193
3194 /* 1. write */
3195 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3196
3197 do {
3198
3199 wargs->offset = (offset4)offset;
3200 wargs->data_val = base;
3201
3202 if (mi->mi_io_kstats) {
3203 mutex_enter(&mi->mi_lock);
3204 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3205 mutex_exit(&mi->mi_lock);
3206 }
3207
3208 if ((vp->v_flag & VNOCACHE) ||
3209 (rp->r_flags & R4DIRECTIO) ||
3210 (mi->mi_flags & MI4_DIRECTIO))
3211 tsize = MIN(mi->mi_stsize, count);
3212 else
3213 tsize = MIN(mi->mi_curwrite, count);
3214 wargs->data_len = (uint_t)tsize;
3215 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3216
3217 if (mi->mi_io_kstats) {
3218 mutex_enter(&mi->mi_lock);
3219 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3220 mutex_exit(&mi->mi_lock);
3221 }
3222
3223 if (!recov) {
3224 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3225 if (e.error && !needrecov) {
3226 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3227 &recov_state, needrecov);
3228 return (e.error);
3229 }
3230 } else {
3231 if (e.error)
3232 return (e.error);
3233 }
3234
3235 /*
3236 * Do handling of OLD_STATEID outside
3237 * of the normal recovery framework.
3238 *
3239 * If write receives a BAD stateid error while using a
3240 * delegation stateid, retry using the open stateid (if it
3241 * exists). If it doesn't have an open stateid, reopen the
3242 * file first, then retry.
3243 */
3244 if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3245 sid_types.cur_sid_type != SPEC_SID) {
3246 nfs4_save_stateid(&wargs->stateid, &sid_types);
3247 if (!recov)
3248 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3249 &recov_state, needrecov);
3250 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3251 goto recov_retry;
3252 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3253 sid_types.cur_sid_type == DEL_SID) {
3254 nfs4_save_stateid(&wargs->stateid, &sid_types);
3255 mutex_enter(&rp->r_statev4_lock);
3256 rp->r_deleg_return_pending = TRUE;
3257 mutex_exit(&rp->r_statev4_lock);
3258 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3259 if (!recov)
3260 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3261 &recov_state, needrecov);
3262 (void) xdr_free(xdr_COMPOUND4res_clnt,
3263 (caddr_t)&res);
3264 return (EIO);
3265 }
3266 if (!recov)
3267 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3268 &recov_state, needrecov);
3269 /* hold needed for nfs4delegreturn_thread */
3270 VN_HOLD(vp);
3271 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3272 NFS4_DR_DISCARD), FALSE);
3273 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3274 goto recov_retry;
3275 }
3276
3277 if (needrecov) {
3278 bool_t abort;
3279
3280 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3281 "nfs4write: client got error %d, res.status %d"
3282 ", so start recovery", e.error, res.status));
3283
3284 abort = nfs4_start_recovery(&e,
3285 VTOMI4(vp), vp, NULL, &wargs->stateid,
3286 NULL, OP_WRITE, NULL, NULL, NULL);
3287 if (!e.error) {
3288 e.error = geterrno4(res.status);
3289 (void) xdr_free(xdr_COMPOUND4res_clnt,
3290 (caddr_t)&res);
3291 }
3292 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3293 &recov_state, needrecov);
3294 if (abort == FALSE)
3295 goto recov_retry;
3296 return (e.error);
3297 }
3298
3299 if (res.status) {
3300 e.error = geterrno4(res.status);
3301 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3302 if (!recov)
3303 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3304 &recov_state, needrecov);
3305 return (e.error);
3306 }
3307
3308 resop = &res.array[1]; /* write res */
3309 wres = &resop->nfs_resop4_u.opwrite;
3310
3311 if ((int)wres->count > tsize) {
3312 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3313
3314 zcmn_err(getzoneid(), CE_WARN,
3315 "nfs4write: server wrote %u, requested was %u",
3316 (int)wres->count, tsize);
3317 if (!recov)
3318 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3319 &recov_state, needrecov);
3320 return (EIO);
3321 }
3322 if (wres->committed == UNSTABLE4) {
3323 *stab_comm = UNSTABLE4;
3324 if (wargs->stable == DATA_SYNC4 ||
3325 wargs->stable == FILE_SYNC4) {
3326 (void) xdr_free(xdr_COMPOUND4res_clnt,
3327 (caddr_t)&res);
3328 zcmn_err(getzoneid(), CE_WARN,
3329 "nfs4write: server %s did not commit "
3330 "to stable storage",
3331 rp->r_server->sv_hostname);
3332 if (!recov)
3333 nfs4_end_fop(VTOMI4(vp), vp, NULL,
3334 OH_WRITE, &recov_state, needrecov);
3335 return (EIO);
3336 }
3337 }
3338
3339 tsize = (int)wres->count;
3340 count -= tsize;
3341 base += tsize;
3342 offset += tsize;
3343 if (mi->mi_io_kstats) {
3344 mutex_enter(&mi->mi_lock);
3345 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3346 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3347 tsize;
3348 mutex_exit(&mi->mi_lock);
3349 }
3350 lwp_stat_update(LWP_STAT_OUBLK, 1);
3351 mutex_enter(&rp->r_statelock);
3352 if (rp->r_flags & R4HAVEVERF) {
3353 if (rp->r_writeverf != wres->writeverf) {
3354 nfs4_set_mod(vp);
3355 rp->r_writeverf = wres->writeverf;
3356 }
3357 } else {
3358 rp->r_writeverf = wres->writeverf;
3359 rp->r_flags |= R4HAVEVERF;
3360 }
3361 PURGE_ATTRCACHE4_LOCKED(rp);
3362 rp->r_flags |= R4WRITEMODIFIED;
3363 gethrestime(&rp->r_attr.va_mtime);
3364 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3365 mutex_exit(&rp->r_statelock);
3366 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3367 } while (count);
3368
3369 if (!recov)
3370 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3371 needrecov);
3372
3373 return (e.error);
3374 }
3375
/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 *
 * Arguments:
 *	vp	vnode of the file being read
 *	base	destination buffer, used only when uiop is NULL
 *	offset	file offset of the first byte to read
 *	count	number of bytes wanted
 *	residp	out: number of requested bytes that were NOT read
 *		(only written on the success path)
 *	cr	credentials used for the over-the-wire calls
 *	async	TRUE selects the TAG_READAHEAD tag and the async
 *		stateid-error handling described below
 *	uiop	if non-NULL, handed to the READ args as the destination
 *		instead of base
 *
 * Returns 0 on success or an errno value.  The loop stops when count
 * bytes have been read or the server reports EOF.
 */
static int
nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READ4args *rargs;
	nfs_argop4 argop[2];
	int tsize;
	int doqueue;
	rnode4_t *rp;
	int data_len;
	bool_t is_eof;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	doqueue = 1;

	ASSERT(nfs_zone() == mi->mi_zone);

	args.ctag = async ? TAG_READAHEAD : TAG_READ;

	/* the compound is always { PUTFH; READ } */
	args.array_len = 2;
	args.array = argop;

	nfs4_init_stateid_types(&sid_types);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
	    &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* read */
	argop[1].argop = OP_READ;
	rargs = &argop[1].nfs_argop4_u.opread;
	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
	    OP_READ, &sid_types, async);

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
		    "nfs4read: %s call, rp %s",
		    needrecov ? "recov" : "first",
		    rnode4info(rp)));

		/*
		 * Uncached/direct I/O is bounded by mi_tsize, cached I/O
		 * by mi_curread; never ask for more than is still wanted.
		 */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_tsize, count);
		else
			tsize = MIN(mi->mi_curread, count);

		/* reset every result-destination field before each call */
		rargs->offset = (offset4)offset;
		rargs->count = (count4)tsize;
		rargs->res_data_val_alt = NULL;
		rargs->res_mblk = NULL;
		rargs->res_uiop = NULL;
		rargs->res_maxsize = 0;
		rargs->wlist = NULL;

		/* exactly one destination is supplied: the uio or base */
		if (uiop)
			rargs->res_uiop = uiop;
		else
			rargs->res_data_val_alt = base;
		rargs->res_maxsize = tsize;

		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
#ifdef DEBUG
		/* one-shot fault injection: overrides the reply status once */
		if (nfs4read_error_inject) {
			res.status = nfs4read_error_inject;
			nfs4read_error_inject = 0;
		}
#endif

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
		if (e.error != 0 && !needrecov) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			return (e.error);
		}

		/*
		 * Do proper retry for OLD and BAD stateid errors outside
		 * of the normal recovery framework.  There are two differences
		 * between async and sync reads.  The first is that we allow
		 * retry on BAD_STATEID for async reads, but not sync reads.
		 * The second is that we mark the file dead for a failed
		 * attempt with a special stateid for sync reads, but just
		 * return EIO for async reads.
		 *
		 * If a sync read receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists).  If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
		    res.status == NFS4ERR_BAD_STATEID) && async) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* async read with a special stateid: give up */
			if (sid_types.cur_sid_type == SPEC_SID) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    !async && sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			/*
			 * Async reads with BAD_STATEID were consumed by the
			 * first branch, so only sync reads reach this one.
			 */
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				nfs4_end_fop(mi, vp, NULL, OH_READ,
				    &recov_state, needrecov);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4read: initiating recovery\n"));
			abort = nfs4_start_recovery(&e,
			    mi, vp, NULL, &rargs->stateid,
			    NULL, OP_READ, NULL, NULL, NULL);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/*
			 * Do not retry if we got OLD_STATEID using a special
			 * stateid.  This avoids looping with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    sid_types.cur_sid_type == SPEC_SID)
				abort = TRUE;

			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				goto recov_retry;
			}

			/*
			 * Aborting: if the call itself succeeded, turn the
			 * NFS status into an errno and release the results.
			 */
			if (!e.error) {
				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			return (e.error);
		}

		/* non-recoverable NFS-level failure */
		if (res.status) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			return (e.error);
		}

		/* advance by the number of bytes the server returned */
		data_len = res.array[1].nfs_resop4_u.opread.data_len;
		count -= data_len;
		/* base may be NULL (e.g. when a uio is the destination) */
		if (base)
			base += data_len;
		offset += data_len;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
		/* copy eof out of the results before they are freed */
		is_eof = res.array[1].nfs_resop4_u.opread.eof;
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	} while (count && !is_eof);

	*residp = count;

	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);

	return (e.error);
}
3609
3610 /* ARGSUSED */
3611 static int
3612 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3613 caller_context_t *ct)
3614 {
3615 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3616 return (EIO);
3617 switch (cmd) {
3618 case _FIODIRECTIO:
3619 return (nfs4_directio(vp, (int)arg, cr));
3620 default:
3621 return (ENOTTY);
3622 }
3623 }
3624
3625 /* ARGSUSED */
3626 int
3627 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3628 caller_context_t *ct)
3629 {
3630 int error;
3631 rnode4_t *rp = VTOR4(vp);
3632
3633 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3634 return (EIO);
3635 /*
3636 * If it has been specified that the return value will
3637 * just be used as a hint, and we are only being asked
3638 * for size, fsid or rdevid, then return the client's
3639 * notion of these values without checking to make sure
3640 * that the attribute cache is up to date.
3641 * The whole point is to avoid an over the wire GETATTR
3642 * call.
3643 */
3644 if (flags & ATTR_HINT) {
3645 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3646 mutex_enter(&rp->r_statelock);
3647 if (vap->va_mask & AT_SIZE)
3648 vap->va_size = rp->r_size;
3649 if (vap->va_mask & AT_FSID)
3650 vap->va_fsid = rp->r_attr.va_fsid;
3651 if (vap->va_mask & AT_RDEV)
3652 vap->va_rdev = rp->r_attr.va_rdev;
3653 mutex_exit(&rp->r_statelock);
3654 return (0);
3655 }
3656 }
3657
3658 /*
3659 * Only need to flush pages if asking for the mtime
3660 * and if there any dirty pages or any outstanding
3661 * asynchronous (write) requests for this file.
3662 */
3663 if (vap->va_mask & AT_MTIME) {
3664 rp = VTOR4(vp);
3665 if (nfs4_has_pages(vp)) {
3666 mutex_enter(&rp->r_statev4_lock);
3667 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3668 mutex_exit(&rp->r_statev4_lock);
3669 if (rp->r_flags & R4DIRTY ||
3670 rp->r_awcount > 0) {
3671 mutex_enter(&rp->r_statelock);
3672 rp->r_gcount++;
3673 mutex_exit(&rp->r_statelock);
3674 error =
3675 nfs4_putpage(vp, (u_offset_t)0,
3676 0, 0, cr, NULL);
3677 mutex_enter(&rp->r_statelock);
3678 if (error && (error == ENOSPC ||
3679 error == EDQUOT)) {
3680 if (!rp->r_error)
3681 rp->r_error = error;
3682 }
3683 if (--rp->r_gcount == 0)
3684 cv_broadcast(&rp->r_cv);
3685 mutex_exit(&rp->r_statelock);
3686 }
3687 } else {
3688 mutex_exit(&rp->r_statev4_lock);
3689 }
3690 }
3691 }
3692 return (nfs4getattr(vp, vap, cr));
3693 }
3694
/*
 * Compare the mode reported by the server with the client's mode after
 * dropping S_ISUID and S_ISGID from the client's copy.
 *
 * Returns 0 (OK) when the two then match, i.e. the only bits the server
 * could have cleared are setuid/setgid; returns 1 (BAD) otherwise.
 */
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	mode_t expected = on_client & ~(S_ISUID|S_ISGID);

	return ((expected != from_server) ? 1 : 0);
}
3709
3710 /*ARGSUSED4*/
3711 static int
3712 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3713 caller_context_t *ct)
3714 {
3715 if (vap->va_mask & AT_NOSET)
3716 return (EINVAL);
3717
3718 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3719 return (EIO);
3720
3721 /*
3722 * Don't call secpolicy_vnode_setattr, the client cannot
3723 * use its cached attributes to make security decisions
3724 * as the server may be faking mode bits or mapping uid/gid.
3725 * Always just let the server to the checking.
3726 * If we provide the ability to remove basic priviledges
3727 * to setattr (e.g. basic without chmod) then we will
3728 * need to add a check here before calling the server.
3729 */
3730
3731 return (nfs4setattr(vp, vap, flags, cr, NULL));
3732 }
3733
3734 /*
3735 * To replace the "guarded" version 3 setattr, we use two types of compound
3736 * setattr requests:
3737 * 1. The "normal" setattr, used when the size of the file isn't being
3738 * changed - { Putfh <fh>; Setattr; Getattr }/
3739 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3740 * with only ctime as the argument. If the server ctime differs from
3741 * what is cached on the client, the verify will fail, but we would
3742 * already have the ctime from the preceding getattr, so just set it
3743 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3744 * Setattr; Getattr }.
3745 *
3746 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3747 * this setattr and NULL if they are not.
3748 */
3749 static int
3750 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3751 vsecattr_t *vsap)
3752 {
3753 COMPOUND4args_clnt args;
3754 COMPOUND4res_clnt res, *resp = NULL;
3755 nfs4_ga_res_t *garp = NULL;
3756 int numops = 3; /* { Putfh; Setattr; Getattr } */
3757 nfs_argop4 argop[5];
3758 int verify_argop = -1;
3759 int setattr_argop = 1;
3760 nfs_resop4 *resop;
3761 vattr_t va;
3762 rnode4_t *rp;
3763 int doqueue = 1;
3764 uint_t mask = vap->va_mask;
3765 mode_t omode;
3766 vsecattr_t *vsp;
3767 timestruc_t ctime;
3768 bool_t needrecov = FALSE;
3769 nfs4_recov_state_t recov_state;
3770 nfs4_stateid_types_t sid_types;
3771 stateid4 stateid;
3772 hrtime_t t;
3773 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3774 servinfo4_t *svp;
3775 bitmap4 supp_attrs;
3776
3777 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3778 rp = VTOR4(vp);
3779 nfs4_init_stateid_types(&sid_types);
3780
3781 /*
3782 * Only need to flush pages if there are any pages and
3783 * if the file is marked as dirty in some fashion. The
3784 * file must be flushed so that we can accurately
3785 * determine the size of the file and the cached data
3786 * after the SETATTR returns. A file is considered to
3787 * be dirty if it is either marked with R4DIRTY, has
3788 * outstanding i/o's active, or is mmap'd. In this
3789 * last case, we can't tell whether there are dirty
3790 * pages, so we flush just to be sure.
3791 */
3792 if (nfs4_has_pages(vp) &&
3793 ((rp->r_flags & R4DIRTY) ||
3794 rp->r_count > 0 ||
3795 rp->r_mapcnt > 0)) {
3796 ASSERT(vp->v_type != VCHR);
3797 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3798 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3799 mutex_enter(&rp->r_statelock);
3800 if (!rp->r_error)
3801 rp->r_error = e.error;
3802 mutex_exit(&rp->r_statelock);
3803 }
3804 }
3805
3806 if (mask & AT_SIZE) {
3807 /*
3808 * Verification setattr compound for non-deleg AT_SIZE:
3809 * { Putfh; Getattr; Verify; Setattr; Getattr }
3810 * Set ctime local here (outside the do_again label)
3811 * so that subsequent retries (after failed VERIFY)
3812 * will use ctime from GETATTR results (from failed
3813 * verify compound) as VERIFY arg.
3814 * If file has delegation, then VERIFY(time_metadata)
3815 * is of little added value, so don't bother.
3816 */
3817 mutex_enter(&rp->r_statev4_lock);
3818 if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3819 rp->r_deleg_return_pending) {
3820 numops = 5;
3821 ctime = rp->r_attr.va_ctime;
3822 }
3823 mutex_exit(&rp->r_statev4_lock);
3824 }
3825
3826 recov_state.rs_flags = 0;
3827 recov_state.rs_num_retry_despite_err = 0;
3828
3829 args.ctag = TAG_SETATTR;
3830 do_again:
3831 recov_retry:
3832 setattr_argop = numops - 2;
3833
3834 args.array = argop;
3835 args.array_len = numops;
3836
3837 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3838 if (e.error)
3839 return (e.error);
3840
3841
3842 /* putfh target fh */
3843 argop[0].argop = OP_CPUTFH;
3844 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3845
3846 if (numops == 5) {
3847 /*
3848 * We only care about the ctime, but need to get mtime
3849 * and size for proper cache update.
3850 */
3851 /* getattr */
3852 argop[1].argop = OP_GETATTR;
3853 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3854 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3855
3856 /* verify - set later in loop */
3857 verify_argop = 2;
3858 }
3859
3860 /* setattr */
3861 svp = rp->r_server;
3862 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3863 supp_attrs = svp->sv_supp_attrs;
3864 nfs_rw_exit(&svp->sv_lock);
3865
3866 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3867 supp_attrs, &e.error, &sid_types);
3868 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3869 if (e.error) {
3870 /* req time field(s) overflow - return immediately */
3871 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3872 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3873 opsetattr.obj_attributes);
3874 return (e.error);
3875 }
3876 omode = rp->r_attr.va_mode;
3877
3878 /* getattr */
3879 argop[numops-1].argop = OP_GETATTR;
3880 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3881 /*
3882 * If we are setting the ACL (indicated only by vsap != NULL), request
3883 * the ACL in this getattr. The ACL returned from this getattr will be
3884 * used in updating the ACL cache.
3885 */
3886 if (vsap != NULL)
3887 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3888 FATTR4_ACL_MASK;
3889 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3890
3891 /*
3892 * setattr iterates if the object size is set and the cached ctime
3893 * does not match the file ctime. In that case, verify the ctime first.
3894 */
3895
3896 do {
3897 if (verify_argop != -1) {
3898 /*
3899 * Verify that the ctime match before doing setattr.
3900 */
3901 va.va_mask = AT_CTIME;
3902 va.va_ctime = ctime;
3903 svp = rp->r_server;
3904 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3905 supp_attrs = svp->sv_supp_attrs;
3906 nfs_rw_exit(&svp->sv_lock);
3907 e.error = nfs4args_verify(&argop[verify_argop], &va,
3908 OP_VERIFY, supp_attrs);
3909 if (e.error) {
3910 /* req time field(s) overflow - return */
3911 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3912 needrecov);
3913 break;
3914 }
3915 }
3916
3917 doqueue = 1;
3918
3919 t = gethrtime();
3920
3921 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3922
3923 /*
3924 * Purge the access cache and ACL cache if changing either the
3925 * owner of the file, the group owner, or the mode. These may
3926 * change the access permissions of the file, so purge old
3927 * information and start over again.
3928 */
3929 if (mask & (AT_UID | AT_GID | AT_MODE)) {
3930 (void) nfs4_access_purge_rp(rp);
3931 if (rp->r_secattr != NULL) {
3932 mutex_enter(&rp->r_statelock);
3933 vsp = rp->r_secattr;
3934 rp->r_secattr = NULL;
3935 mutex_exit(&rp->r_statelock);
3936 if (vsp != NULL)
3937 nfs4_acl_free_cache(vsp);
3938 }
3939 }
3940
3941 /*
3942 * If res.array_len == numops, then everything succeeded,
3943 * except for possibly the final getattr. If only the
3944 * last getattr failed, give up, and don't try recovery.
3945 */
3946 if (res.array_len == numops) {
3947 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3948 needrecov);
3949 if (! e.error)
3950 resp = &res;
3951 break;
3952 }
3953
3954 /*
3955 * if either rpc call failed or completely succeeded - done
3956 */
3957 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
3958 if (e.error) {
3959 PURGE_ATTRCACHE4(vp);
3960 if (!needrecov) {
3961 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3962 needrecov);
3963 break;
3964 }
3965 }
3966
3967 /*
3968 * Do proper retry for OLD_STATEID outside of the normal
3969 * recovery framework.
3970 */
3971 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3972 sid_types.cur_sid_type != SPEC_SID &&
3973 sid_types.cur_sid_type != NO_SID) {
3974 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3975 needrecov);
3976 nfs4_save_stateid(&stateid, &sid_types);
3977 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3978 opsetattr.obj_attributes);
3979 if (verify_argop != -1) {
3980 nfs4args_verify_free(&argop[verify_argop]);
3981 verify_argop = -1;
3982 }
3983 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3984 goto recov_retry;
3985 }
3986
3987 if (needrecov) {
3988 bool_t abort;
3989
3990 abort = nfs4_start_recovery(&e,
3991 VTOMI4(vp), vp, NULL, NULL, NULL,
3992 OP_SETATTR, NULL, NULL, NULL);
3993 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3994 needrecov);
3995 /*
3996 * Do not retry if we failed with OLD_STATEID using
3997 * a special stateid. This is done to avoid looping
3998 * with a broken server.
3999 */
4000 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4001 (sid_types.cur_sid_type == SPEC_SID ||
4002 sid_types.cur_sid_type == NO_SID))
4003 abort = TRUE;
4004 if (!e.error) {
4005 if (res.status == NFS4ERR_BADOWNER)
4006 nfs4_log_badowner(VTOMI4(vp),
4007 OP_SETATTR);
4008
4009 e.error = geterrno4(res.status);
4010 (void) xdr_free(xdr_COMPOUND4res_clnt,
4011 (caddr_t)&res);
4012 }
4013 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4014 opsetattr.obj_attributes);
4015 if (verify_argop != -1) {
4016 nfs4args_verify_free(&argop[verify_argop]);
4017 verify_argop = -1;
4018 }
4019 if (abort == FALSE) {
4020 /*
4021 * Need to retry all possible stateids in
4022 * case the recovery error wasn't stateid
4023 * related or the stateids have become
4024 * stale (server reboot).
4025 */
4026 nfs4_init_stateid_types(&sid_types);
4027 goto recov_retry;
4028 }
4029 return (e.error);
4030 }
4031
4032 /*
4033 * Need to call nfs4_end_op before nfs4getattr to
4034 * avoid potential nfs4_start_op deadlock. See RFE
4035 * 4777612. Calls to nfs4_invalidate_pages() and
4036 * nfs4_purge_stale_fh() might also generate over the
4037 * wire calls which my cause nfs4_start_op() deadlock.
4038 */
4039 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4040
4041 /*
4042 * Check to update lease.
4043 */
4044 resp = &res;
4045 if (res.status == NFS4_OK) {
4046 break;
4047 }
4048
4049 /*
4050 * Check if verify failed to see if try again
4051 */
4052 if ((verify_argop == -1) || (res.array_len != 3)) {
4053 /*
4054 * can't continue...
4055 */
4056 if (res.status == NFS4ERR_BADOWNER)
4057 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4058
4059 e.error = geterrno4(res.status);
4060 } else {
4061 /*
4062 * When the verify request fails, the client ctime is
4063 * not in sync with the server. This is the same as
4064 * the version 3 "not synchronized" error, and we
4065 * handle it in a similar manner (XXX do we need to???).
4066 * Use the ctime returned in the first getattr for
4067 * the input to the next verify.
4068 * If we couldn't get the attributes, then we give up
4069 * because we can't complete the operation as required.
4070 */
4071 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4072 }
4073 if (e.error) {
4074 PURGE_ATTRCACHE4(vp);
4075 nfs4_purge_stale_fh(e.error, vp, cr);
4076 } else {
4077 /*
4078 * retry with a new verify value
4079 */
4080 ctime = garp->n4g_va.va_ctime;
4081 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4082 resp = NULL;
4083 }
4084 if (!e.error) {
4085 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4086 opsetattr.obj_attributes);
4087 if (verify_argop != -1) {
4088 nfs4args_verify_free(&argop[verify_argop]);
4089 verify_argop = -1;
4090 }
4091 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4092 goto do_again;
4093 }
4094 } while (!e.error);
4095
4096 if (e.error) {
4097 /*
4098 * If we are here, rfs4call has an irrecoverable error - return
4099 */
4100 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4101 opsetattr.obj_attributes);
4102 if (verify_argop != -1) {
4103 nfs4args_verify_free(&argop[verify_argop]);
4104 verify_argop = -1;
4105 }
4106 if (resp)
4107 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4108 return (e.error);
4109 }
4110
4111
4112
4113 /*
4114 * If changing the size of the file, invalidate
4115 * any local cached data which is no longer part
4116 * of the file. We also possibly invalidate the
4117 * last page in the file. We could use
4118 * pvn_vpzero(), but this would mark the page as
4119 * modified and require it to be written back to
4120 * the server for no particularly good reason.
4121 * This way, if we access it, then we bring it
4122 * back in. A read should be cheaper than a
4123 * write.
4124 */
4125 if (mask & AT_SIZE) {
4126 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4127 }
4128
4129 /* either no error or one of the postop getattr failed */
4130
4131 /*
4132 * XXX Perform a simplified version of wcc checking. Instead of
4133 * have another getattr to get pre-op, just purge cache if
4134 * any of the ops prior to and including the getattr failed.
4135 * If the getattr succeeded then update the attrcache accordingly.
4136 */
4137
4138 garp = NULL;
4139 if (res.status == NFS4_OK) {
4140 /*
4141 * Last getattr
4142 */
4143 resop = &res.array[numops - 1];
4144 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4145 }
4146 /*
4147 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4148 * rather than filling it. See the function itself for details.
4149 */
4150 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4151 if (garp != NULL) {
4152 if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4153 nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4154 vs_ace4_destroy(&garp->n4g_vsa);
4155 } else {
4156 if (vsap != NULL) {
4157 /*
4158 * The ACL was supposed to be set and to be
4159 * returned in the last getattr of this
4160 * compound, but for some reason the getattr
4161 * result doesn't contain the ACL. In this
4162 * case, purge the ACL cache.
4163 */
4164 if (rp->r_secattr != NULL) {
4165 mutex_enter(&rp->r_statelock);
4166 vsp = rp->r_secattr;
4167 rp->r_secattr = NULL;
4168 mutex_exit(&rp->r_statelock);
4169 if (vsp != NULL)
4170 nfs4_acl_free_cache(vsp);
4171 }
4172 }
4173 }
4174 }
4175
4176 if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4177 /*
4178 * Set the size, rather than relying on getting it updated
4179 * via a GETATTR. With delegations the client tries to
4180 * suppress GETATTR calls.
4181 */
4182 mutex_enter(&rp->r_statelock);
4183 rp->r_size = vap->va_size;
4184 mutex_exit(&rp->r_statelock);
4185 }
4186
4187 /*
4188 * Can free up request args and res
4189 */
4190 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4191 opsetattr.obj_attributes);
4192 if (verify_argop != -1) {
4193 nfs4args_verify_free(&argop[verify_argop]);
4194 verify_argop = -1;
4195 }
4196 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4197
4198 /*
4199 * Some servers will change the mode to clear the setuid
4200 * and setgid bits when changing the uid or gid. The
4201 * client needs to compensate appropriately.
4202 */
4203 if (mask & (AT_UID | AT_GID)) {
4204 int terror, do_setattr;
4205
4206 do_setattr = 0;
4207 va.va_mask = AT_MODE;
4208 terror = nfs4getattr(vp, &va, cr);
4209 if (!terror &&
4210 (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4211 (!(mask & AT_MODE) && va.va_mode != omode))) {
4212 va.va_mask = AT_MODE;
4213 if (mask & AT_MODE) {
4214 /*
4215 * We asked the mode to be changed and what
4216 * we just got from the server in getattr is
4217 * not what we wanted it to be, so set it now.
4218 */
4219 va.va_mode = vap->va_mode;
4220 do_setattr = 1;
4221 } else {
4222 /*
4223 * We did not ask the mode to be changed,
4224 * Check to see that the server just cleared
4225 * I_SUID and I_GUID from it. If not then
4226 * set mode to omode with UID/GID cleared.
4227 */
4228 if (nfs4_compare_modes(va.va_mode, omode)) {
4229 omode &= ~(S_ISUID|S_ISGID);
4230 va.va_mode = omode;
4231 do_setattr = 1;
4232 }
4233 }
4234
4235 if (do_setattr)
4236 (void) nfs4setattr(vp, &va, 0, cr, NULL);
4237 }
4238 }
4239
4240 return (e.error);
4241 }
4242
/*
 * nfs4_access - check whether the caller may access vp in the ways named
 * by `mode' (any combination of VREAD, VWRITE and VEXEC).
 *
 * The mode bits are translated into NFSv4 ACCESS4_* bits and checked
 * against the rnode's access cache first.  If the cache cannot answer, a
 * PUTFH/ACCESS compound (followed by a GETATTR when no delegation is held)
 * is sent over the wire and the reply is cached.  If the check fails with
 * the caller's credentials and crnetadjust() supplied adjusted credentials,
 * the check is repeated once with those.
 *
 * Returns 0 when access is granted, EACCES when it is denied, EROFS for a
 * write request against a read-only file system (non-device vnodes only),
 * EIO when called from a zone other than the mount's zone, or an error from
 * cache validation, nfs4_start_fop(), or the RPC/NFS reply.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/*
	 * Translate the requested VREAD/VWRITE/VEXEC bits into the
	 * ACCESS4_* bits (acc) that must all be granted.
	 */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		/* Writes to a read-only fs are refused, except for devices */
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		/* "execute" on a directory means permission to search it */
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	/* Only bother validating caches if there is an access cache */
	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	rp = VTOR4(vp);
	/*
	 * The over-the-wire request (argacc) asks about every access bit
	 * that applies to this vnode type, not only the bits in acc; the
	 * complete answer is what gets cached below.
	 */
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	/* Consult the access cache for the credentials currently in use */
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

	/* The cache could not answer; ask the server. */
recov_retry:
	/*
	 * Don't bother taking r_statev4_lock here.  r_deleg_type could
	 * change as soon as the lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	/*
	 * Remember the RPC-level error: e.error may be overwritten below,
	 * but res only needs freeing at "out" if the call itself succeeded.
	 */
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		/* FALSE from nfs4_start_recovery() means retry the call */
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages.  That is why nfs4_end_fop() has
		 * already been called above: it avoids a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	/* res holds decoded results only if the RPC itself succeeded */
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4458
4459 /* ARGSUSED */
4460 static int
4461 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4462 {
4463 COMPOUND4args_clnt args;
4464 COMPOUND4res_clnt res;
4465 int doqueue;
4466 rnode4_t *rp;
4467 nfs_argop4 argop[3];
4468 nfs_resop4 *resop;
4469 READLINK4res *lr_res;
4470 nfs4_ga_res_t *garp;
4471 uint_t len;
4472 char *linkdata;
4473 bool_t needrecov = FALSE;
4474 nfs4_recov_state_t recov_state;
4475 hrtime_t t;
4476 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4477
4478 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4479 return (EIO);
4480 /*
4481 * Can't readlink anything other than a symbolic link.
4482 */
4483 if (vp->v_type != VLNK)
4484 return (EINVAL);
4485
4486 rp = VTOR4(vp);
4487 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4488 e.error = nfs4_validate_caches(vp, cr);
4489 if (e.error)
4490 return (e.error);
4491 mutex_enter(&rp->r_statelock);
4492 if (rp->r_symlink.contents != NULL) {
4493 e.error = uiomove(rp->r_symlink.contents,
4494 rp->r_symlink.len, UIO_READ, uiop);
4495 mutex_exit(&rp->r_statelock);
4496 return (e.error);
4497 }
4498 mutex_exit(&rp->r_statelock);
4499 }
4500 recov_state.rs_flags = 0;
4501 recov_state.rs_num_retry_despite_err = 0;
4502
4503 recov_retry:
4504 args.array_len = 3;
4505 args.array = argop;
4506 args.ctag = TAG_READLINK;
4507
4508 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4509 if (e.error) {
4510 return (e.error);
4511 }
4512
4513 /* 0. putfh symlink fh */
4514 argop[0].argop = OP_CPUTFH;
4515 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4516
4517 /* 1. readlink */
4518 argop[1].argop = OP_READLINK;
4519
4520 /* 2. getattr */
4521 argop[2].argop = OP_GETATTR;
4522 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4523 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4524
4525 doqueue = 1;
4526
4527 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4528 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4529 rnode4info(VTOR4(vp))));
4530
4531 t = gethrtime();
4532
4533 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4534
4535 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4536 if (needrecov) {
4537 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4538 "nfs4_readlink: initiating recovery\n"));
4539
4540 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4541 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4542 if (!e.error)
4543 (void) xdr_free(xdr_COMPOUND4res_clnt,
4544 (caddr_t)&res);
4545
4546 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4547 needrecov);
4548 goto recov_retry;
4549 }
4550 }
4551
4552 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4553
4554 if (e.error)
4555 return (e.error);
4556
4557 /*
4558 * There is an path in the code below which calls
4559 * nfs4_purge_stale_fh(), which may generate otw calls through
4560 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4561 * here to avoid nfs4_start_op() deadlock.
4562 */
4563
4564 if (res.status && (res.array_len < args.array_len)) {
4565 /*
4566 * either Putfh or Link failed
4567 */
4568 e.error = geterrno4(res.status);
4569 nfs4_purge_stale_fh(e.error, vp, cr);
4570 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4571 return (e.error);
4572 }
4573
4574 resop = &res.array[1]; /* readlink res */
4575 lr_res = &resop->nfs_resop4_u.opreadlink;
4576
4577 /*
4578 * treat symlink names as data
4579 */
4580 linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4581 if (linkdata != NULL) {
4582 int uio_len = len - 1;
4583 /* len includes null byte, which we won't uiomove */
4584 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4585 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4586 mutex_enter(&rp->r_statelock);
4587 if (rp->r_symlink.contents == NULL) {
4588 rp->r_symlink.contents = linkdata;
4589 rp->r_symlink.len = uio_len;
4590 rp->r_symlink.size = len;
4591 mutex_exit(&rp->r_statelock);
4592 } else {
4593 mutex_exit(&rp->r_statelock);
4594 kmem_free(linkdata, len);
4595 }
4596 } else {
4597 kmem_free(linkdata, len);
4598 }
4599 }
4600 if (res.status == NFS4_OK) {
4601 resop++; /* getattr res */
4602 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4603 }
4604 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4605
4606 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4607
4608 /*
4609 * The over the wire error for attempting to readlink something
4610 * other than a symbolic link is ENXIO. However, we need to
4611 * return EINVAL instead of ENXIO, so we map it here.
4612 */
4613 return (e.error == ENXIO ? EINVAL : e.error);
4614 }
4615
4616 /*
4617 * Flush local dirty pages to stable storage on the server.
4618 *
4619 * If FNODSYNC is specified, then there is nothing to do because
4620 * metadata changes are not cached on the client before being
4621 * sent to the server.
4622 */
4623 /* ARGSUSED */
4624 static int
4625 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4626 {
4627 int error;
4628
4629 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4630 return (0);
4631 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4632 return (EIO);
4633 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4634 if (!error)
4635 error = VTOR4(vp)->r_error;
4636 return (error);
4637 }
4638
4639 /*
4640 * Weirdness: if the file was removed or the target of a rename
4641 * operation while it was open, it got renamed instead. Here we
4642 * remove the renamed file.
4643 */
4644 /* ARGSUSED */
4645 void
4646 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4647 {
4648 rnode4_t *rp;
4649
4650 ASSERT(vp != DNLC_NO_VNODE);
4651
4652 rp = VTOR4(vp);
4653
4654 if (IS_SHADOW(vp, rp)) {
4655 sv_inactive(vp);
4656 return;
4657 }
4658
4659 /*
4660 * If this is coming from the wrong zone, we let someone in the right
4661 * zone take care of it asynchronously. We can get here due to
4662 * VN_RELE() being called from pageout() or fsflush(). This call may
4663 * potentially turn into an expensive no-op if, for instance, v_count
4664 * gets incremented in the meantime, but it's still correct.
4665 */
4666 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4667 nfs4_async_inactive(vp, cr);
4668 return;
4669 }
4670
4671 /*
4672 * Some of the cleanup steps might require over-the-wire
4673 * operations. Since VOP_INACTIVE can get called as a result of
4674 * other over-the-wire operations (e.g., an attribute cache update
4675 * can lead to a DNLC purge), doing those steps now would lead to a
4676 * nested call to the recovery framework, which can deadlock. So
4677 * do any over-the-wire cleanups asynchronously, in a separate
4678 * thread.
4679 */
4680
4681 mutex_enter(&rp->r_os_lock);
4682 mutex_enter(&rp->r_statelock);
4683 mutex_enter(&rp->r_statev4_lock);
4684
4685 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4686 mutex_exit(&rp->r_statev4_lock);
4687 mutex_exit(&rp->r_statelock);
4688 mutex_exit(&rp->r_os_lock);
4689 nfs4_async_inactive(vp, cr);
4690 return;
4691 }
4692
4693 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4694 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4695 mutex_exit(&rp->r_statev4_lock);
4696 mutex_exit(&rp->r_statelock);
4697 mutex_exit(&rp->r_os_lock);
4698 nfs4_async_inactive(vp, cr);
4699 return;
4700 }
4701
4702 if (rp->r_unldvp != NULL) {
4703 mutex_exit(&rp->r_statev4_lock);
4704 mutex_exit(&rp->r_statelock);
4705 mutex_exit(&rp->r_os_lock);
4706 nfs4_async_inactive(vp, cr);
4707 return;
4708 }
4709 mutex_exit(&rp->r_statev4_lock);
4710 mutex_exit(&rp->r_statelock);
4711 mutex_exit(&rp->r_os_lock);
4712
4713 rp4_addfree(rp, cr);
4714 }
4715
/*
 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
 * various bits of state.  The caller must not refer to vp after this call.
 *
 * For regular files, nfs4close_all() is called first.  Then, for as long
 * as the rnode records a pending "unlinked while open" rename
 * (r_unldvp != NULL), any dirty pages are flushed and a PUTFH/REMOVE
 * compound is sent to remove the renamed file from directory r_unldvp.
 * Once r_unldvp is NULL the rnode is handed to rp4_addfree().
 *
 * Must be called in the zone that owns the mount and never on a shadow
 * vnode; both conditions are ASSERTed.
 */

void
nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	vnode_t *unldvp;
	char *unlname;
	cred_t *unlcred;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp;
	nfs_argop4 argop[2];
	int doqueue;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(!IS_SHADOW(vp, rp));

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
	    "release vnode %s", name));
	kmem_free(name, MAXNAMELEN);
#endif

	if (vp->v_type == VREG) {
		bool_t recov_failed = FALSE;

		e.error = nfs4close_all(vp, cr);
		if (e.error) {
			/*
			 * The close failure is only reported (in a debug
			 * message) when it is due to failed recovery, either
			 * for the whole mount or for this rnode.
			 */
			/* Check to see if recovery failed */
			mutex_enter(&(VTOMI4(vp)->mi_lock));
			if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
				recov_failed = TRUE;
			mutex_exit(&(VTOMI4(vp)->mi_lock));
			if (!recov_failed) {
				mutex_enter(&rp->r_statelock);
				if (rp->r_flags & R4RECOVERR)
					recov_failed = TRUE;
				mutex_exit(&rp->r_statelock);
			}
			if (recov_failed) {
				NFS4_DEBUG(nfs4_client_recov_debug,
				    (CE_NOTE, "nfs4_inactive_otw: "
				    "close failed (recovery failure)"));
			}
		}
	}

redo:
	/* Unlocked fast path; re-checked under r_statelock below */
	if (rp->r_unldvp == NULL) {
		rp4_addfree(rp, cr);
		return;
	}

	/*
	 * Save the vnode pointer for the directory where the
	 * unlinked-open file got renamed, then set it to NULL
	 * to prevent another thread from getting here before
	 * we're done with the remove.  While we have the
	 * statelock, make local copies of the pertinent rnode
	 * fields.  If we weren't to do this in an atomic way,
	 * the unl* fields could become inconsistent with respect
	 * to each other due to a race condition between this
	 * code and nfs_remove().  See bug report 1034328.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp == NULL) {
		mutex_exit(&rp->r_statelock);
		rp4_addfree(rp, cr);
		return;
	}

	unldvp = rp->r_unldvp;
	rp->r_unldvp = NULL;
	unlname = rp->r_unlname;
	rp->r_unlname = NULL;
	unlcred = rp->r_unlcred;
	rp->r_unlcred = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * If there are any dirty pages left, then flush
	 * them.  This is unfortunate because they just
	 * may get thrown away during the remove operation,
	 * but we have to do this for correctness.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
		if (e.error) {
			/* Record only the first error seen on this rnode */
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
recov_retry_remove:
	/*
	 * Do the remove operation on the renamed file
	 */
	args.ctag = TAG_INACTIVE;

	/*
	 * Remove ops: putfh dir; remove
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
	if (e.error) {
		/* Give up on the remove: drop everything saved for it */
		kmem_free(unlname, MAXNAMELEN);
		crfree(unlcred);
		VN_RELE(unldvp);
		/*
		 * Try again; this time around r_unldvp will be NULL, so we'll
		 * just call rp4_addfree() and return.
		 */
		goto redo;
	}

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;

	doqueue = 1;
	resp = &res;

#if 0 /* notyet */
	/*
	 * Can't do this yet.  We may be being called from
	 * dnlc_purge_XXX while that routine is holding a
	 * mutex lock to the nc_rele list.  The calls to
	 * nfs3_cache_wcc_data may result in calls to
	 * dnlc_purge_XXX.  This will result in a deadlock.
	 */
	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
	if (e.error) {
		PURGE_ATTRCACHE4(unldvp);
		resp = NULL;
	} else if (res.status) {
		e.error = geterrno4(res.status);
		PURGE_ATTRCACHE4(unldvp);
		/*
		 * This code is inactive right now
		 * but if made active there should
		 * be a nfs4_end_op() call before
		 * nfs4_purge_stale_fh to avoid start_op()
		 * deadlock. See BugId: 4948726
		 */
		nfs4_purge_stale_fh(error, unldvp, cr);
	} else {
		nfs_resop4 *resop;
		REMOVE4res *rm_res;

		resop = &res.array[1];
		rm_res = &resop->nfs_resop4_u.opremove;
		/*
		 * Update directory cache attribute,
		 * readdir and dnlc caches.
		 */
		nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
	}
#else
	/*
	 * Active variant: issue the remove and simply purge the directory's
	 * attribute cache, whatever the outcome.
	 */
	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);

	PURGE_ATTRCACHE4(unldvp);
#endif

	if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
		/* FALSE from nfs4_start_recovery() means retry the remove */
		if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
		    NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
			    &recov_state, TRUE);
			goto recov_retry_remove;
		}
	}
	nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);

	/*
	 * Release stuff held for the remove
	 */
	VN_RELE(unldvp);
	if (!e.error && resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	kmem_free(unlname, MAXNAMELEN);
	crfree(unlcred);
	/* Loop in case another unlinked-open rename was recorded meanwhile */
	goto redo;
}
4924
4925 /*
4926 * Remote file system operations having to do with directory manipulation.
4927 */
4928 /* ARGSUSED3 */
4929 int
4930 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4931 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4932 int *direntflags, pathname_t *realpnp)
4933 {
4934 int error;
4935 vnode_t *vp, *avp = NULL;
4936 rnode4_t *drp;
4937
4938 *vpp = NULL;
4939 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4940 return (EPERM);
4941 /*
4942 * if LOOKUP_XATTR, must replace dvp (object) with
4943 * object's attrdir before continuing with lookup
4944 */
4945 if (flags & LOOKUP_XATTR) {
4946 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4947 if (error)
4948 return (error);
4949
4950 dvp = avp;
4951
4952 /*
4953 * If lookup is for "", just return dvp now. The attrdir
4954 * has already been activated (from nfs4lookup_xattr), and
4955 * the caller will RELE the original dvp -- not
4956 * the attrdir. So, set vpp and return.
4957 * Currently, when the LOOKUP_XATTR flag is
4958 * passed to VOP_LOOKUP, the name is always empty, and
4959 * shortcircuiting here avoids 3 unneeded lock/unlock
4960 * pairs.
4961 *
4962 * If a non-empty name was provided, then it is the
4963 * attribute name, and it will be looked up below.
4964 */
4965 if (*nm == '\0') {
4966 *vpp = dvp;
4967 return (0);
4968 }
4969
4970 /*
4971 * The vfs layer never sends a name when asking for the
4972 * attrdir, so we should never get here (unless of course
4973 * name is passed at some time in future -- at which time
4974 * we'll blow up here).
4975 */
4976 ASSERT(0);
4977 }
4978
4979 drp = VTOR4(dvp);
4980 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4981 return (EINTR);
4982
4983 error = nfs4lookup(dvp, nm, vpp, cr, 0);
4984 nfs_rw_exit(&drp->r_rwlock);
4985
4986 /*
4987 * If vnode is a device, create special vnode.
4988 */
4989 if (!error && ISVDEV((*vpp)->v_type)) {
4990 vp = *vpp;
4991 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
4992 VN_RELE(vp);
4993 }
4994
4995 return (error);
4996 }
4997
4998 /* ARGSUSED */
4999 static int
5000 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5001 {
5002 int error;
5003 rnode4_t *drp;
5004 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5005 mntinfo4_t *mi;
5006
5007 mi = VTOMI4(dvp);
5008 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5009 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5010 return (EINVAL);
5011
5012 drp = VTOR4(dvp);
5013 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5014 return (EINTR);
5015
5016 mutex_enter(&drp->r_statelock);
5017 /*
5018 * If the server doesn't support xattrs just return EINVAL
5019 */
5020 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5021 mutex_exit(&drp->r_statelock);
5022 nfs_rw_exit(&drp->r_rwlock);
5023 return (EINVAL);
5024 }
5025
5026 /*
5027 * If there is a cached xattr directory entry,
5028 * use it as long as the attributes are valid. If the
5029 * attributes are not valid, take the simple approach and
5030 * free the cached value and re-fetch a new value.
5031 *
5032 * We don't negative entry cache for now, if we did we
5033 * would need to check if the file has changed on every
5034 * lookup. But xattrs don't exist very often and failing
5035 * an openattr is not much more expensive than and NVERIFY or GETATTR
5036 * so do an openattr over the wire for now.
5037 */
5038 if (drp->r_xattr_dir != NULL) {
5039 if (ATTRCACHE4_VALID(dvp)) {
5040 VN_HOLD(drp->r_xattr_dir);
5041 *vpp = drp->r_xattr_dir;
5042 mutex_exit(&drp->r_statelock);
5043 nfs_rw_exit(&drp->r_rwlock);
5044 return (0);
5045 }
5046 VN_RELE(drp->r_xattr_dir);
5047 drp->r_xattr_dir = NULL;
5048 }
5049 mutex_exit(&drp->r_statelock);
5050
5051 error = nfs4openattr(dvp, vpp, cflag, cr);
5052
5053 nfs_rw_exit(&drp->r_rwlock);
5054
5055 return (error);
5056 }
5057
5058 static int
5059 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
5060 {
5061 int error;
5062 rnode4_t *drp;
5063
5064 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5065
5066 /*
5067 * If lookup is for "", just return dvp. Don't need
5068 * to send it over the wire, look it up in the dnlc,
5069 * or perform any access checks.
5070 */
5071 if (*nm == '\0') {
5072 VN_HOLD(dvp);
5073 *vpp = dvp;
5074 return (0);
5075 }
5076
5077 /*
5078 * Can't do lookups in non-directories.
5079 */
5080 if (dvp->v_type != VDIR)
5081 return (ENOTDIR);
5082
5083 /*
5084 * If lookup is for ".", just return dvp. Don't need
5085 * to send it over the wire or look it up in the dnlc,
5086 * just need to check access.
5087 */
5088 if (nm[0] == '.' && nm[1] == '\0') {
5089 error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5090 if (error)
5091 return (error);
5092 VN_HOLD(dvp);
5093 *vpp = dvp;
5094 return (0);
5095 }
5096
5097 drp = VTOR4(dvp);
5098 if (!(drp->r_flags & R4LOOKUP)) {
5099 mutex_enter(&drp->r_statelock);
5100 drp->r_flags |= R4LOOKUP;
5101 mutex_exit(&drp->r_statelock);
5102 }
5103
5104 *vpp = NULL;
5105 /*
5106 * Lookup this name in the DNLC. If there is no entry
5107 * lookup over the wire.
5108 */
5109 if (!skipdnlc)
5110 *vpp = dnlc_lookup(dvp, nm);
5111 if (*vpp == NULL) {
5112 /*
5113 * We need to go over the wire to lookup the name.
5114 */
5115 return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
5116 }
5117
5118 /*
5119 * We hit on the dnlc
5120 */
5121 if (*vpp != DNLC_NO_VNODE ||
5122 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
5123 /*
5124 * But our attrs may not be valid.
5125 */
5126 if (ATTRCACHE4_VALID(dvp)) {
5127 error = nfs4_waitfor_purge_complete(dvp);
5128 if (error) {
5129 VN_RELE(*vpp);
5130 *vpp = NULL;
5131 return (error);
5132 }
5133
5134 /*
5135 * If after the purge completes, check to make sure
5136 * our attrs are still valid.
5137 */
5138 if (ATTRCACHE4_VALID(dvp)) {
5139 /*
5140 * If we waited for a purge we may have
5141 * lost our vnode so look it up again.
5142 */
5143 VN_RELE(*vpp);
5144 *vpp = dnlc_lookup(dvp, nm);
5145 if (*vpp == NULL)
5146 return (nfs4lookupnew_otw(dvp,
5147 nm, vpp, cr));
5148
5149 /*
5150 * The access cache should almost always hit
5151 */
5152 error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5153
5154 if (error) {
5155 VN_RELE(*vpp);
5156 *vpp = NULL;
5157 return (error);
5158 }
5159 if (*vpp == DNLC_NO_VNODE) {
5160 VN_RELE(*vpp);
5161 *vpp = NULL;
5162 return (ENOENT);
5163 }
5164 return (0);
5165 }
5166 }
5167 }
5168
5169 ASSERT(*vpp != NULL);
5170
5171 /*
5172 * We may have gotten here we have one of the following cases:
5173 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5174 * need to validate them.
5175 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
5176 * must validate.
5177 *
5178 * Go to the server and check if the directory has changed, if
5179 * it hasn't we are done and can use the dnlc entry.
5180 */
5181 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5182 }
5183
5184 /*
5185 * Go to the server and check if the directory has changed, if
5186 * it hasn't we are done and can use the dnlc entry. If it
5187 * has changed we get a new copy of its attributes and check
5188 * the access for VEXEC, then relookup the filename and
5189 * get its filehandle and attributes.
5190 *
5191 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5192 * if the NVERIFY failed we must
5193 * purge the caches
5194 * cache new attributes (will set r_time_attr_inval)
5195 * cache new access
5196 * recheck VEXEC access
5197 * add name to dnlc, possibly negative
5198 * if LOOKUP succeeded
5199 * cache new attributes
5200 * else
5201 * set a new r_time_attr_inval for dvp
5202 * check to make sure we have access
5203 *
5204 * The vpp returned is the vnode passed in if the directory is valid,
5205 * a new vnode if successful lookup, or NULL on error.
5206 */
5207 static int
5208 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5209 {
5210 COMPOUND4args_clnt args;
5211 COMPOUND4res_clnt res;
5212 fattr4 *ver_fattr;
5213 fattr4_change dchange;
5214 int32_t *ptr;
5215 int argoplist_size = 7 * sizeof (nfs_argop4);
5216 nfs_argop4 *argop;
5217 int doqueue;
5218 mntinfo4_t *mi;
5219 nfs4_recov_state_t recov_state;
5220 hrtime_t t;
5221 int isdotdot;
5222 vnode_t *nvp;
5223 nfs_fh4 *fhp;
5224 nfs4_sharedfh_t *sfhp;
5225 nfs4_access_type_t cacc;
5226 rnode4_t *nrp;
5227 rnode4_t *drp = VTOR4(dvp);
5228 nfs4_ga_res_t *garp = NULL;
5229 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5230
5231 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5232 ASSERT(nm != NULL);
5233 ASSERT(nm[0] != '\0');
5234 ASSERT(dvp->v_type == VDIR);
5235 ASSERT(nm[0] != '.' || nm[1] != '\0');
5236 ASSERT(*vpp != NULL);
5237
5238 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5239 isdotdot = 1;
5240 args.ctag = TAG_LOOKUP_VPARENT;
5241 } else {
5242 /*
5243 * If dvp were a stub, it should have triggered and caused
5244 * a mount for us to get this far.
5245 */
5246 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5247
5248 isdotdot = 0;
5249 args.ctag = TAG_LOOKUP_VALID;
5250 }
5251
5252 mi = VTOMI4(dvp);
5253 recov_state.rs_flags = 0;
5254 recov_state.rs_num_retry_despite_err = 0;
5255
5256 nvp = NULL;
5257
5258 /* Save the original mount point security information */
5259 (void) save_mnt_secinfo(mi->mi_curr_serv);
5260
5261 recov_retry:
5262 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5263 &recov_state, NULL);
5264 if (e.error) {
5265 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5266 VN_RELE(*vpp);
5267 *vpp = NULL;
5268 return (e.error);
5269 }
5270
5271 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5272
5273 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5274 args.array_len = 7;
5275 args.array = argop;
5276
5277 /* 0. putfh file */
5278 argop[0].argop = OP_CPUTFH;
5279 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5280
5281 /* 1. nverify the change info */
5282 argop[1].argop = OP_NVERIFY;
5283 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5284 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5285 ver_fattr->attrlist4 = (char *)&dchange;
5286 ptr = (int32_t *)&dchange;
5287 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5288 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5289
5290 /* 2. getattr directory */
5291 argop[2].argop = OP_GETATTR;
5292 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5293 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5294
5295 /* 3. access directory */
5296 argop[3].argop = OP_ACCESS;
5297 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5298 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5299
5300 /* 4. lookup name */
5301 if (isdotdot) {
5302 argop[4].argop = OP_LOOKUPP;
5303 } else {
5304 argop[4].argop = OP_CLOOKUP;
5305 argop[4].nfs_argop4_u.opclookup.cname = nm;
5306 }
5307
5308 /* 5. resulting file handle */
5309 argop[5].argop = OP_GETFH;
5310
5311 /* 6. resulting file attributes */
5312 argop[6].argop = OP_GETATTR;
5313 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5314 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5315
5316 doqueue = 1;
5317 t = gethrtime();
5318
5319 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5320
5321 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5322 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5323 if (e.error != 0 && *vpp != NULL)
5324 VN_RELE(*vpp);
5325 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5326 &recov_state, FALSE);
5327 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5328 kmem_free(argop, argoplist_size);
5329 return (e.error);
5330 }
5331
5332 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5333 /*
5334 * For WRONGSEC of a non-dotdot case, send secinfo directly
5335 * from this thread, do not go thru the recovery thread since
5336 * we need the nm information.
5337 *
5338 * Not doing dotdot case because there is no specification
5339 * for (PUTFH, SECINFO "..") yet.
5340 */
5341 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5342 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5343 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5344 &recov_state, FALSE);
5345 else
5346 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5347 &recov_state, TRUE);
5348 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5349 kmem_free(argop, argoplist_size);
5350 if (!e.error)
5351 goto recov_retry;
5352 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5353 VN_RELE(*vpp);
5354 *vpp = NULL;
5355 return (e.error);
5356 }
5357
5358 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5359 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5360 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5361 &recov_state, TRUE);
5362
5363 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5364 kmem_free(argop, argoplist_size);
5365 goto recov_retry;
5366 }
5367 }
5368
5369 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5370
5371 if (e.error || res.array_len == 0) {
5372 /*
5373 * If e.error isn't set, then reply has no ops (or we couldn't
5374 * be here). The only legal way to reply without an op array
5375 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5376 * be in the reply for all other status values.
5377 *
5378 * For valid replies without an ops array, return ENOTSUP
5379 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5380 * return EIO -- don't trust status.
5381 */
5382 if (e.error == 0)
5383 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5384 ENOTSUP : EIO;
5385 VN_RELE(*vpp);
5386 *vpp = NULL;
5387 kmem_free(argop, argoplist_size);
5388 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5389 return (e.error);
5390 }
5391
5392 if (res.status != NFS4ERR_SAME) {
5393 e.error = geterrno4(res.status);
5394
5395 /*
5396 * The NVERIFY "failed" so the directory has changed
5397 * First make sure PUTFH succeeded and NVERIFY "failed"
5398 * cleanly.
5399 */
5400 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5401 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5402 nfs4_purge_stale_fh(e.error, dvp, cr);
5403 VN_RELE(*vpp);
5404 *vpp = NULL;
5405 goto exit;
5406 }
5407
5408 /*
5409 * We know the NVERIFY "failed" so we must:
5410 * purge the caches (access and indirectly dnlc if needed)
5411 */
5412 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5413
5414 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5415 nfs4_purge_stale_fh(e.error, dvp, cr);
5416 VN_RELE(*vpp);
5417 *vpp = NULL;
5418 goto exit;
5419 }
5420
5421 /*
5422 * Install new cached attributes for the directory
5423 */
5424 nfs4_attr_cache(dvp,
5425 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5426 t, cr, FALSE, NULL);
5427
5428 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5429 nfs4_purge_stale_fh(e.error, dvp, cr);
5430 VN_RELE(*vpp);
5431 *vpp = NULL;
5432 e.error = geterrno4(res.status);
5433 goto exit;
5434 }
5435
5436 /*
5437 * Now we know the directory is valid,
5438 * cache new directory access
5439 */
5440 nfs4_access_cache(drp,
5441 args.array[3].nfs_argop4_u.opaccess.access,
5442 res.array[3].nfs_resop4_u.opaccess.access, cr);
5443
5444 /*
5445 * recheck VEXEC access
5446 */
5447 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5448 if (cacc != NFS4_ACCESS_ALLOWED) {
5449 /*
5450 * Directory permissions might have been revoked
5451 */
5452 if (cacc == NFS4_ACCESS_DENIED) {
5453 e.error = EACCES;
5454 VN_RELE(*vpp);
5455 *vpp = NULL;
5456 goto exit;
5457 }
5458
5459 /*
5460 * Somehow we must not have asked for enough
5461 * so try a singleton ACCESS, should never happen.
5462 */
5463 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5464 if (e.error) {
5465 VN_RELE(*vpp);
5466 *vpp = NULL;
5467 goto exit;
5468 }
5469 }
5470
5471 e.error = geterrno4(res.status);
5472 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5473 /*
5474 * The lookup failed, probably no entry
5475 */
5476 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5477 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5478 } else {
5479 /*
5480 * Might be some other error, so remove
5481 * the dnlc entry to make sure we start all
5482 * over again, next time.
5483 */
5484 dnlc_remove(dvp, nm);
5485 }
5486 VN_RELE(*vpp);
5487 *vpp = NULL;
5488 goto exit;
5489 }
5490
5491 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5492 /*
5493 * The file exists but we can't get its fh for
5494 * some unknown reason. Remove it from the dnlc
5495 * and error out to be safe.
5496 */
5497 dnlc_remove(dvp, nm);
5498 VN_RELE(*vpp);
5499 *vpp = NULL;
5500 goto exit;
5501 }
5502 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5503 if (fhp->nfs_fh4_len == 0) {
5504 /*
5505 * The file exists but a bogus fh
5506 * some unknown reason. Remove it from the dnlc
5507 * and error out to be safe.
5508 */
5509 e.error = ENOENT;
5510 dnlc_remove(dvp, nm);
5511 VN_RELE(*vpp);
5512 *vpp = NULL;
5513 goto exit;
5514 }
5515 sfhp = sfh4_get(fhp, mi);
5516
5517 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5518 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5519
5520 /*
5521 * Make the new rnode
5522 */
5523 if (isdotdot) {
5524 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5525 if (e.error) {
5526 sfh4_rele(&sfhp);
5527 VN_RELE(*vpp);
5528 *vpp = NULL;
5529 goto exit;
5530 }
5531 /*
5532 * XXX if nfs4_make_dotdot uses an existing rnode
5533 * XXX it doesn't update the attributes.
5534 * XXX for now just save them again to save an OTW
5535 */
5536 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5537 } else {
5538 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5539 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5540 /*
5541 * If v_type == VNON, then garp was NULL because
5542 * the last op in the compound failed and makenfs4node
5543 * could not find the vnode for sfhp. It created
5544 * a new vnode, so we have nothing to purge here.
5545 */
5546 if (nvp->v_type == VNON) {
5547 vattr_t vattr;
5548
5549 vattr.va_mask = AT_TYPE;
5550 /*
5551 * N.B. We've already called nfs4_end_fop above.
5552 */
5553 e.error = nfs4getattr(nvp, &vattr, cr);
5554 if (e.error) {
5555 sfh4_rele(&sfhp);
5556 VN_RELE(*vpp);
5557 *vpp = NULL;
5558 VN_RELE(nvp);
5559 goto exit;
5560 }
5561 nvp->v_type = vattr.va_type;
5562 }
5563 }
5564 sfh4_rele(&sfhp);
5565
5566 nrp = VTOR4(nvp);
5567 mutex_enter(&nrp->r_statev4_lock);
5568 if (!nrp->created_v4) {
5569 mutex_exit(&nrp->r_statev4_lock);
5570 dnlc_update(dvp, nm, nvp);
5571 } else
5572 mutex_exit(&nrp->r_statev4_lock);
5573
5574 VN_RELE(*vpp);
5575 *vpp = nvp;
5576 } else {
5577 hrtime_t now;
5578 hrtime_t delta = 0;
5579
5580 e.error = 0;
5581
5582 /*
5583 * Because the NVERIFY "succeeded" we know that the
5584 * directory attributes are still valid
5585 * so update r_time_attr_inval
5586 */
5587 now = gethrtime();
5588 mutex_enter(&drp->r_statelock);
5589 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5590 delta = now - drp->r_time_attr_saved;
5591 if (delta < mi->mi_acdirmin)
5592 delta = mi->mi_acdirmin;
5593 else if (delta > mi->mi_acdirmax)
5594 delta = mi->mi_acdirmax;
5595 }
5596 drp->r_time_attr_inval = now + delta;
5597 mutex_exit(&drp->r_statelock);
5598 dnlc_update(dvp, nm, *vpp);
5599
5600 /*
5601 * Even though we have a valid directory attr cache
5602 * and dnlc entry, we may not have access.
5603 * This should almost always hit the cache.
5604 */
5605 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5606 if (e.error) {
5607 VN_RELE(*vpp);
5608 *vpp = NULL;
5609 }
5610
5611 if (*vpp == DNLC_NO_VNODE) {
5612 VN_RELE(*vpp);
5613 *vpp = NULL;
5614 e.error = ENOENT;
5615 }
5616 }
5617
5618 exit:
5619 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5620 kmem_free(argop, argoplist_size);
5621 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5622 return (e.error);
5623 }
5624
5625 /*
5626 * We need to go over the wire to lookup the name, but
5627 * while we are there verify the directory has not
5628 * changed but if it has, get new attributes and check access
5629 *
5630 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5631 * NVERIFY GETATTR ACCESS
5632 *
5633 * With the results:
5634 * if the NVERIFY failed we must purge the caches, add new attributes,
5635 * and cache new access.
5636 * set a new r_time_attr_inval
5637 * add name to dnlc, possibly negative
5638 * if LOOKUP succeeded
5639 * cache new attributes
5640 */
5641 static int
5642 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5643 {
5644 COMPOUND4args_clnt args;
5645 COMPOUND4res_clnt res;
5646 fattr4 *ver_fattr;
5647 fattr4_change dchange;
5648 int32_t *ptr;
5649 nfs4_ga_res_t *garp = NULL;
5650 int argoplist_size = 9 * sizeof (nfs_argop4);
5651 nfs_argop4 *argop;
5652 int doqueue;
5653 mntinfo4_t *mi;
5654 nfs4_recov_state_t recov_state;
5655 hrtime_t t;
5656 int isdotdot;
5657 vnode_t *nvp;
5658 nfs_fh4 *fhp;
5659 nfs4_sharedfh_t *sfhp;
5660 nfs4_access_type_t cacc;
5661 rnode4_t *nrp;
5662 rnode4_t *drp = VTOR4(dvp);
5663 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5664
5665 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5666 ASSERT(nm != NULL);
5667 ASSERT(nm[0] != '\0');
5668 ASSERT(dvp->v_type == VDIR);
5669 ASSERT(nm[0] != '.' || nm[1] != '\0');
5670 ASSERT(*vpp == NULL);
5671
5672 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5673 isdotdot = 1;
5674 args.ctag = TAG_LOOKUP_PARENT;
5675 } else {
5676 /*
5677 * If dvp were a stub, it should have triggered and caused
5678 * a mount for us to get this far.
5679 */
5680 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5681
5682 isdotdot = 0;
5683 args.ctag = TAG_LOOKUP;
5684 }
5685
5686 mi = VTOMI4(dvp);
5687 recov_state.rs_flags = 0;
5688 recov_state.rs_num_retry_despite_err = 0;
5689
5690 nvp = NULL;
5691
5692 /* Save the original mount point security information */
5693 (void) save_mnt_secinfo(mi->mi_curr_serv);
5694
5695 recov_retry:
5696 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5697 &recov_state, NULL);
5698 if (e.error) {
5699 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5700 return (e.error);
5701 }
5702
5703 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5704
5705 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
5706 args.array_len = 9;
5707 args.array = argop;
5708
5709 /* 0. putfh file */
5710 argop[0].argop = OP_CPUTFH;
5711 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5712
5713 /* 1. savefh for the nverify */
5714 argop[1].argop = OP_SAVEFH;
5715
5716 /* 2. lookup name */
5717 if (isdotdot) {
5718 argop[2].argop = OP_LOOKUPP;
5719 } else {
5720 argop[2].argop = OP_CLOOKUP;
5721 argop[2].nfs_argop4_u.opclookup.cname = nm;
5722 }
5723
5724 /* 3. resulting file handle */
5725 argop[3].argop = OP_GETFH;
5726
5727 /* 4. resulting file attributes */
5728 argop[4].argop = OP_GETATTR;
5729 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5730 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5731
5732 /* 5. restorefh back the directory for the nverify */
5733 argop[5].argop = OP_RESTOREFH;
5734
5735 /* 6. nverify the change info */
5736 argop[6].argop = OP_NVERIFY;
5737 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
5738 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5739 ver_fattr->attrlist4 = (char *)&dchange;
5740 ptr = (int32_t *)&dchange;
5741 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5742 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5743
5744 /* 7. getattr directory */
5745 argop[7].argop = OP_GETATTR;
5746 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5747 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5748
5749 /* 8. access directory */
5750 argop[8].argop = OP_ACCESS;
5751 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5752 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5753
5754 doqueue = 1;
5755 t = gethrtime();
5756
5757 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5758
5759 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5760 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5761 if (e.error != 0 && *vpp != NULL)
5762 VN_RELE(*vpp);
5763 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5764 &recov_state, FALSE);
5765 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5766 kmem_free(argop, argoplist_size);
5767 return (e.error);
5768 }
5769
5770 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5771 /*
5772 * For WRONGSEC of a non-dotdot case, send secinfo directly
5773 * from this thread, do not go thru the recovery thread since
5774 * we need the nm information.
5775 *
5776 * Not doing dotdot case because there is no specification
5777 * for (PUTFH, SECINFO "..") yet.
5778 */
5779 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5780 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5781 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5782 &recov_state, FALSE);
5783 else
5784 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5785 &recov_state, TRUE);
5786 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5787 kmem_free(argop, argoplist_size);
5788 if (!e.error)
5789 goto recov_retry;
5790 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5791 return (e.error);
5792 }
5793
5794 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5795 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5796 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5797 &recov_state, TRUE);
5798
5799 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5800 kmem_free(argop, argoplist_size);
5801 goto recov_retry;
5802 }
5803 }
5804
5805 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5806
5807 if (e.error || res.array_len == 0) {
5808 /*
5809 * If e.error isn't set, then reply has no ops (or we couldn't
5810 * be here). The only legal way to reply without an op array
5811 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5812 * be in the reply for all other status values.
5813 *
5814 * For valid replies without an ops array, return ENOTSUP
5815 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5816 * return EIO -- don't trust status.
5817 */
5818 if (e.error == 0)
5819 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5820 ENOTSUP : EIO;
5821
5822 kmem_free(argop, argoplist_size);
5823 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5824 return (e.error);
5825 }
5826
5827 e.error = geterrno4(res.status);
5828
5829 /*
5830 * The PUTFH and SAVEFH may have failed.
5831 */
5832 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5833 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
5834 nfs4_purge_stale_fh(e.error, dvp, cr);
5835 goto exit;
5836 }
5837
5838 /*
5839 * Check if the file exists, if it does delay entering
5840 * into the dnlc until after we update the directory
5841 * attributes so we don't cause it to get purged immediately.
5842 */
5843 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
5844 /*
5845 * The lookup failed, probably no entry
5846 */
5847 if (e.error == ENOENT && nfs4_lookup_neg_cache)
5848 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5849 goto exit;
5850 }
5851
5852 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5853 /*
5854 * The file exists but we can't get its fh for
5855 * some unknown reason. Error out to be safe.
5856 */
5857 goto exit;
5858 }
5859
5860 fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
5861 if (fhp->nfs_fh4_len == 0) {
5862 /*
5863 * The file exists but a bogus fh
5864 * some unknown reason. Error out to be safe.
5865 */
5866 e.error = EIO;
5867 goto exit;
5868 }
5869 sfhp = sfh4_get(fhp, mi);
5870
5871 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5872 sfh4_rele(&sfhp);
5873 goto exit;
5874 }
5875 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
5876
5877 /*
5878 * The RESTOREFH may have failed
5879 */
5880 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
5881 sfh4_rele(&sfhp);
5882 e.error = EIO;
5883 goto exit;
5884 }
5885
5886 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
5887 /*
5888 * First make sure the NVERIFY failed as we expected,
5889 * if it didn't then be conservative and error out
5890 * as we can't trust the directory.
5891 */
5892 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
5893 sfh4_rele(&sfhp);
5894 e.error = EIO;
5895 goto exit;
5896 }
5897
5898 /*
5899 * We know the NVERIFY "failed" so the directory has changed,
5900 * so we must:
5901 * purge the caches (access and indirectly dnlc if needed)
5902 */
5903 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5904
5905 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5906 sfh4_rele(&sfhp);
5907 goto exit;
5908 }
5909 nfs4_attr_cache(dvp,
5910 &res.array[7].nfs_resop4_u.opgetattr.ga_res,
5911 t, cr, FALSE, NULL);
5912
5913 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
5914 nfs4_purge_stale_fh(e.error, dvp, cr);
5915 sfh4_rele(&sfhp);
5916 e.error = geterrno4(res.status);
5917 goto exit;
5918 }
5919
5920 /*
5921 * Now we know the directory is valid,
5922 * cache new directory access
5923 */
5924 nfs4_access_cache(drp,
5925 args.array[8].nfs_argop4_u.opaccess.access,
5926 res.array[8].nfs_resop4_u.opaccess.access, cr);
5927
5928 /*
5929 * recheck VEXEC access
5930 */
5931 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5932 if (cacc != NFS4_ACCESS_ALLOWED) {
5933 /*
5934 * Directory permissions might have been revoked
5935 */
5936 if (cacc == NFS4_ACCESS_DENIED) {
5937 sfh4_rele(&sfhp);
5938 e.error = EACCES;
5939 goto exit;
5940 }
5941
5942 /*
5943 * Somehow we must not have asked for enough
5944 * so try a singleton ACCESS should never happen
5945 */
5946 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5947 if (e.error) {
5948 sfh4_rele(&sfhp);
5949 goto exit;
5950 }
5951 }
5952
5953 e.error = geterrno4(res.status);
5954 } else {
5955 hrtime_t now;
5956 hrtime_t delta = 0;
5957
5958 e.error = 0;
5959
5960 /*
5961 * Because the NVERIFY "succeeded" we know that the
5962 * directory attributes are still valid
5963 * so update r_time_attr_inval
5964 */
5965 now = gethrtime();
5966 mutex_enter(&drp->r_statelock);
5967 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5968 delta = now - drp->r_time_attr_saved;
5969 if (delta < mi->mi_acdirmin)
5970 delta = mi->mi_acdirmin;
5971 else if (delta > mi->mi_acdirmax)
5972 delta = mi->mi_acdirmax;
5973 }
5974 drp->r_time_attr_inval = now + delta;
5975 mutex_exit(&drp->r_statelock);
5976
5977 /*
5978 * Even though we have a valid directory attr cache,
5979 * we may not have access.
5980 * This should almost always hit the cache.
5981 */
5982 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5983 if (e.error) {
5984 sfh4_rele(&sfhp);
5985 goto exit;
5986 }
5987 }
5988
5989 /*
5990 * Now we have successfully completed the lookup, if the
5991 * directory has changed we now have the valid attributes.
5992 * We also know we have directory access.
5993 * Create the new rnode and insert it in the dnlc.
5994 */
5995 if (isdotdot) {
5996 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5997 if (e.error) {
5998 sfh4_rele(&sfhp);
5999 goto exit;
6000 }
6001 /*
6002 * XXX if nfs4_make_dotdot uses an existing rnode
6003 * XXX it doesn't update the attributes.
6004 * XXX for now just save them again to save an OTW
6005 */
6006 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
6007 } else {
6008 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
6009 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
6010 }
6011 sfh4_rele(&sfhp);
6012
6013 nrp = VTOR4(nvp);
6014 mutex_enter(&nrp->r_statev4_lock);
6015 if (!nrp->created_v4) {
6016 mutex_exit(&nrp->r_statev4_lock);
6017 dnlc_update(dvp, nm, nvp);
6018 } else
6019 mutex_exit(&nrp->r_statev4_lock);
6020
6021 *vpp = nvp;
6022
6023 exit:
6024 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6025 kmem_free(argop, argoplist_size);
6026 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
6027 return (e.error);
6028 }
6029
#ifdef DEBUG
/*
 * nfs4lookup_dump_compound - DEBUG-only helper that logs, via zcmn_err()
 * at CE_NOTE level in the caller's zone, one line per op in a compound
 * argument array.
 *
 *	where	- label printed in the heading line
 *	argbase	- first op of the compound
 *	argcnt	- number of ops to dump; nothing is dumped if it is <= 0
 *
 * Ops the lookup code builds (putfh, putrootfh, lookup, lookupp, getfh,
 * getattr, openattr) are printed by name; anything else is printed as a
 * raw opcode number.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	int i;		/* signed, to match argcnt and the %d format */
	uint_t len;
	zoneid_t zoneid = getzoneid();
	char *s;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (i = 0; i < argcnt; i++) {
		nfs_argop4 *op = &argbase[i];
		switch (op->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
			break;
		case OP_CLOOKUP:
			s = op->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			break;
		case OP_LOOKUP:
			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
			    &len, NULL);
			if (s == NULL) {
				/*
				 * NOTE(review): utf8_to_str() presumably
				 * returns NULL (and leaves len unset) for an
				 * empty name -- confirm.  Either way, never
				 * pass NULL to %s or kmem_free() with an
				 * uninitialized length.
				 */
				zcmn_err(zoneid, CE_NOTE,
				    "\t op %d, lookup <null>", i);
				break;
			}
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			kmem_free(s, len);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
			    op->argop);
			break;
		}
	}
}
#endif
6079
6080 /*
6081 * nfs4lookup_setup - constructs a multi-lookup compound request.
6082 *
6083 * Given the path "nm1/nm2/.../nmn", the following compound requests
6084 * may be created:
6085 *
6086 * Note: Getfh is not be needed because filehandle attr is mandatory, but it
6087 * is faster, for now.
6088 *
6089 * l4_getattrs indicates the type of compound requested.
6090 *
6091 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6092 *
6093 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6094 *
6095 * total number of ops is n + 1.
6096 *
6097 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6098 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6099 * before the last component, and only get attributes
6100 * for the last component. Note that the second-to-last
6101 * pathname component is XATTR_RPATH, which does NOT go
6102 * over-the-wire as a lookup.
6103 *
6104 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6105 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6106 *
6107 * and total number of ops is n + 5.
6108 *
6109 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6110 * attribute directory: create lookups plus an OPENATTR
6111 * replacing the last lookup. Note that the last pathname
6112 * component is XATTR_RPATH, which does NOT go over-the-wire
6113 * as a lookup.
6114 *
6115 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6116 * Openattr; Getfh; Getattr }
6117 *
6118 * and total number of ops is n + 5.
6119 *
6120 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6121 * nodes too.
6122 *
6123 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6124 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6125 *
6126 * and total number of ops is 3*n + 1.
6127 *
6128 * All cases: returns the index in the arg array of the final LOOKUP op, or
6129 * -1 if no LOOKUPs were used.
6130 */
6131 int
6132 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
6133 {
6134 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
6135 nfs_argop4 *argbase, *argop;
6136 int arglen, argcnt;
6137 int n = 1; /* number of components */
6138 int nga = 1; /* number of Getattr's in request */
6139 char c = '\0', *s, *p;
6140 int lookup_idx = -1;
6141 int argoplist_size;
6142
6143 /* set lookuparg response result to 0 */
6144 lookupargp->resp->status = NFS4_OK;
6145
6146 /* skip leading "/" or "." e.g. ".//./" if there is */
6147 for (; ; nm++) {
6148 if (*nm != '/' && *nm != '.')
6149 break;
6150
6151 /* ".." is counted as 1 component */
6152 if (*nm == '.' && *(nm + 1) != '/')
6153 break;
6154 }
6155
6156 /*
6157 * Find n = number of components - nm must be null terminated
6158 * Skip "." components.
6159 */
6160 if (*nm != '\0')
6161 for (n = 1, s = nm; *s != '\0'; s++) {
6162 if ((*s == '/') && (*(s + 1) != '/') &&
6163 (*(s + 1) != '\0') &&
6164 !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6165 *(s + 2) == '\0')))
6166 n++;
6167 }
6168 else
6169 n = 0;
6170
6171 /*
6172 * nga is number of components that need Getfh+Getattr
6173 */
6174 switch (l4_getattrs) {
6175 case LKP4_NO_ATTRIBUTES:
6176 nga = 0;
6177 break;
6178 case LKP4_ALL_ATTRIBUTES:
6179 nga = n;
6180 /*
6181 * Always have at least 1 getfh, getattr pair
6182 */
6183 if (nga == 0)
6184 nga++;
6185 break;
6186 case LKP4_LAST_ATTRDIR:
6187 case LKP4_LAST_NAMED_ATTR:
6188 nga = n+1;
6189 break;
6190 }
6191
6192 /*
6193 * If change to use the filehandle attr instead of getfh
6194 * the following line can be deleted.
6195 */
6196 nga *= 2;
6197
6198 /*
6199 * calculate number of ops in request as
6200 * header + trailer + lookups + getattrs
6201 */
6202 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6203
6204 argoplist_size = arglen * sizeof (nfs_argop4);
6205 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6206 lookupargp->argsp->array = argop;
6207
6208 argcnt = lookupargp->header_len;
6209 argop += argcnt;
6210
6211 /*
6212 * loop and create a lookup op and possibly getattr/getfh for
6213 * each component. Skip "." components.
6214 */
6215 for (s = nm; *s != '\0'; s = p) {
6216 /*
6217 * Set up a pathname struct for each component if needed
6218 */
6219 while (*s == '/')
6220 s++;
6221 if (*s == '\0')
6222 break;
6223
6224 for (p = s; (*p != '/') && (*p != '\0'); p++)
6225 ;
6226 c = *p;
6227 *p = '\0';
6228
6229 if (s[0] == '.' && s[1] == '\0') {
6230 *p = c;
6231 continue;
6232 }
6233 if (l4_getattrs == LKP4_LAST_ATTRDIR &&
6234 strcmp(s, XATTR_RPATH) == 0) {
6235 /* getfh XXX may not be needed in future */
6236 argop->argop = OP_GETFH;
6237 argop++;
6238 argcnt++;
6239
6240 /* getattr */
6241 argop->argop = OP_GETATTR;
6242 argop->nfs_argop4_u.opgetattr.attr_request =
6243 lookupargp->ga_bits;
6244 argop->nfs_argop4_u.opgetattr.mi =
6245 lookupargp->mi;
6246 argop++;
6247 argcnt++;
6248
6249 /* openattr */
6250 argop->argop = OP_OPENATTR;
6251 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
6252 strcmp(s, XATTR_RPATH) == 0) {
6253 /* openattr */
6254 argop->argop = OP_OPENATTR;
6255 argop++;
6256 argcnt++;
6257
6258 /* getfh XXX may not be needed in future */
6259 argop->argop = OP_GETFH;
6260 argop++;
6261 argcnt++;
6262
6263 /* getattr */
6264 argop->argop = OP_GETATTR;
6265 argop->nfs_argop4_u.opgetattr.attr_request =
6266 lookupargp->ga_bits;
6267 argop->nfs_argop4_u.opgetattr.mi =
6268 lookupargp->mi;
6269 argop++;
6270 argcnt++;
6271 *p = c;
6272 continue;
6273 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
6274 /* lookupp */
6275 argop->argop = OP_LOOKUPP;
6276 } else {
6277 /* lookup */
6278 argop->argop = OP_LOOKUP;
6279 (void) str_to_utf8(s,
6280 &argop->nfs_argop4_u.oplookup.objname);
6281 }
6282 lookup_idx = argcnt;
6283 argop++;
6284 argcnt++;
6285
6286 *p = c;
6287
6288 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
6289 /* getfh XXX may not be needed in future */
6290 argop->argop = OP_GETFH;
6291 argop++;
6292 argcnt++;
6293
6294 /* getattr */
6295 argop->argop = OP_GETATTR;
6296 argop->nfs_argop4_u.opgetattr.attr_request =
6297 lookupargp->ga_bits;
6298 argop->nfs_argop4_u.opgetattr.mi =
6299 lookupargp->mi;
6300 argop++;
6301 argcnt++;
6302 }
6303 }
6304
6305 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
6306 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
6307 if (needgetfh) {
6308 /* stick in a post-lookup getfh */
6309 argop->argop = OP_GETFH;
6310 argcnt++;
6311 argop++;
6312 }
6313 /* post-lookup getattr */
6314 argop->argop = OP_GETATTR;
6315 argop->nfs_argop4_u.opgetattr.attr_request =
6316 lookupargp->ga_bits;
6317 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
6318 argcnt++;
6319 }
6320 argcnt += lookupargp->trailer_len; /* actual op count */
6321 lookupargp->argsp->array_len = argcnt;
6322 lookupargp->arglen = arglen;
6323
6324 #ifdef DEBUG
6325 if (nfs4_client_lookup_debug)
6326 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
6327 #endif
6328
6329 return (lookup_idx);
6330 }
6331
6332 static int
6333 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6334 {
6335 COMPOUND4args_clnt args;
6336 COMPOUND4res_clnt res;
6337 GETFH4res *gf_res = NULL;
6338 nfs_argop4 argop[4];
6339 nfs_resop4 *resop = NULL;
6340 nfs4_sharedfh_t *sfhp;
6341 hrtime_t t;
6342 nfs4_error_t e;
6343
6344 rnode4_t *drp;
6345 int doqueue = 1;
6346 vnode_t *vp;
6347 int needrecov = 0;
6348 nfs4_recov_state_t recov_state;
6349
6350 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6351
6352 *avp = NULL;
6353 recov_state.rs_flags = 0;
6354 recov_state.rs_num_retry_despite_err = 0;
6355
6356 recov_retry:
6357 /* COMPOUND: putfh, openattr, getfh, getattr */
6358 args.array_len = 4;
6359 args.array = argop;
6360 args.ctag = TAG_OPENATTR;
6361
6362 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6363 if (e.error)
6364 return (e.error);
6365
6366 drp = VTOR4(dvp);
6367
6368 /* putfh */
6369 argop[0].argop = OP_CPUTFH;
6370 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6371
6372 /* openattr */
6373 argop[1].argop = OP_OPENATTR;
6374 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6375
6376 /* getfh */
6377 argop[2].argop = OP_GETFH;
6378
6379 /* getattr */
6380 argop[3].argop = OP_GETATTR;
6381 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6382 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6383
6384 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6385 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6386 rnode4info(drp)));
6387
6388 t = gethrtime();
6389
6390 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6391
6392 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6393 if (needrecov) {
6394 bool_t abort;
6395
6396 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6397 "nfs4openattr: initiating recovery\n"));
6398
6399 abort = nfs4_start_recovery(&e,
6400 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6401 OP_OPENATTR, NULL, NULL, NULL);
6402 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6403 if (!e.error) {
6404 e.error = geterrno4(res.status);
6405 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6406 }
6407 if (abort == FALSE)
6408 goto recov_retry;
6409 return (e.error);
6410 }
6411
6412 if (e.error) {
6413 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6414 return (e.error);
6415 }
6416
6417 if (res.status) {
6418 /*
6419 * If OTW errro is NOTSUPP, then it should be
6420 * translated to EINVAL. All Solaris file system
6421 * implementations return EINVAL to the syscall layer
6422 * when the attrdir cannot be created due to an
6423 * implementation restriction or noxattr mount option.
6424 */
6425 if (res.status == NFS4ERR_NOTSUPP) {
6426 mutex_enter(&drp->r_statelock);
6427 if (drp->r_xattr_dir)
6428 VN_RELE(drp->r_xattr_dir);
6429 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6430 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6431 mutex_exit(&drp->r_statelock);
6432
6433 e.error = EINVAL;
6434 } else {
6435 e.error = geterrno4(res.status);
6436 }
6437
6438 if (e.error) {
6439 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6440 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6441 needrecov);
6442 return (e.error);
6443 }
6444 }
6445
6446 resop = &res.array[0]; /* putfh res */
6447 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6448
6449 resop = &res.array[1]; /* openattr res */
6450 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6451
6452 resop = &res.array[2]; /* getfh res */
6453 gf_res = &resop->nfs_resop4_u.opgetfh;
6454 if (gf_res->object.nfs_fh4_len == 0) {
6455 *avp = NULL;
6456 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6457 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6458 return (ENOENT);
6459 }
6460
6461 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6462 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6463 dvp->v_vfsp, t, cr, dvp,
6464 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6465 sfh4_rele(&sfhp);
6466
6467 if (e.error)
6468 PURGE_ATTRCACHE4(vp);
6469
6470 mutex_enter(&vp->v_lock);
6471 vp->v_flag |= V_XATTRDIR;
6472 mutex_exit(&vp->v_lock);
6473
6474 *avp = vp;
6475
6476 mutex_enter(&drp->r_statelock);
6477 if (drp->r_xattr_dir)
6478 VN_RELE(drp->r_xattr_dir);
6479 VN_HOLD(vp);
6480 drp->r_xattr_dir = vp;
6481
6482 /*
6483 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6484 * NULL. xattrs could be created at any time, and we have no
6485 * way to update pc4_xattr_exists in the base object if/when
6486 * it happens.
6487 */
6488 drp->r_pathconf.pc4_xattr_valid = 0;
6489
6490 mutex_exit(&drp->r_statelock);
6491
6492 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6493
6494 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6495
6496 return (0);
6497 }
6498
6499 /* ARGSUSED */
6500 static int
6501 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6502 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
6503 vsecattr_t *vsecp)
6504 {
6505 int error;
6506 vnode_t *vp = NULL;
6507 rnode4_t *rp;
6508 struct vattr vattr;
6509 rnode4_t *drp;
6510 vnode_t *tempvp;
6511 enum createmode4 createmode;
6512 bool_t must_trunc = FALSE;
6513 int truncating = 0;
6514
6515 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6516 return (EPERM);
6517 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6518 return (EINVAL);
6519 }
6520
6521 /* . and .. have special meaning in the protocol, reject them. */
6522
6523 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6524 return (EISDIR);
6525
6526 drp = VTOR4(dvp);
6527
6528 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6529 return (EINTR);
6530
6531 top:
6532 /*
6533 * We make a copy of the attributes because the caller does not
6534 * expect us to change what va points to.
6535 */
6536 vattr = *va;
6537
6538 /*
6539 * If the pathname is "", then dvp is the root vnode of
6540 * a remote file mounted over a local directory.
6541 * All that needs to be done is access
6542 * checking and truncation. Note that we avoid doing
6543 * open w/ create because the parent directory might
6544 * be in pseudo-fs and the open would fail.
6545 */
6546 if (*nm == '\0') {
6547 error = 0;
6548 VN_HOLD(dvp);
6549 vp = dvp;
6550 must_trunc = TRUE;
6551 } else {
6552 /*
6553 * We need to go over the wire, just to be sure whether the
6554 * file exists or not. Using the DNLC can be dangerous in
6555 * this case when making a decision regarding existence.
6556 */
6557 error = nfs4lookup(dvp, nm, &vp, cr, 1);
6558 }
6559
6560 if (exclusive)
6561 createmode = EXCLUSIVE4;
6562 else
6563 createmode = GUARDED4;
6564
6565 /*
6566 * error would be set if the file does not exist on the
6567 * server, so lets go create it.
6568 */
6569 if (error) {
6570 goto create_otw;
6571 }
6572
6573 /*
6574 * File does exist on the server
6575 */
6576 if (exclusive == EXCL)
6577 error = EEXIST;
6578 else if (vp->v_type == VDIR && (mode & VWRITE))
6579 error = EISDIR;
6580 else {
6581 /*
6582 * If vnode is a device, create special vnode.
6583 */
6584 if (ISVDEV(vp->v_type)) {
6585 tempvp = vp;
6586 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6587 VN_RELE(tempvp);
6588 }
6589 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
6590 if ((vattr.va_mask & AT_SIZE) &&
6591 vp->v_type == VREG) {
6592 rp = VTOR4(vp);
6593 /*
6594 * Check here for large file handled
6595 * by LF-unaware process (as
6596 * ufs_create() does)
6597 */
6598 if (!(flags & FOFFMAX)) {
6599 mutex_enter(&rp->r_statelock);
6600 if (rp->r_size > MAXOFF32_T)
6601 error = EOVERFLOW;
6602 mutex_exit(&rp->r_statelock);
6603 }
6604
6605 /* if error is set then we need to return */
6606 if (error) {
6607 nfs_rw_exit(&drp->r_rwlock);
6608 VN_RELE(vp);
6609 return (error);
6610 }
6611
6612 if (must_trunc) {
6613 vattr.va_mask = AT_SIZE;
6614 error = nfs4setattr(vp, &vattr, 0, cr,
6615 NULL);
6616 } else {
6617 /*
6618 * we know we have a regular file that already
6619 * exists and we may end up truncating the file
6620 * as a result of the open_otw, so flush out
6621 * any dirty pages for this file first.
6622 */
6623 if (nfs4_has_pages(vp) &&
6624 ((rp->r_flags & R4DIRTY) ||
6625 rp->r_count > 0 ||
6626 rp->r_mapcnt > 0)) {
6627 error = nfs4_putpage(vp,
6628 (offset_t)0, 0, 0, cr, ct);
6629 if (error && (error == ENOSPC ||
6630 error == EDQUOT)) {
6631 mutex_enter(
6632 &rp->r_statelock);
6633 if (!rp->r_error)
6634 rp->r_error =
6635 error;
6636 mutex_exit(
6637 &rp->r_statelock);
6638 }
6639 }
6640 vattr.va_mask = (AT_SIZE |
6641 AT_TYPE | AT_MODE);
6642 vattr.va_type = VREG;
6643 createmode = UNCHECKED4;
6644 truncating = 1;
6645 goto create_otw;
6646 }
6647 }
6648 }
6649 }
6650 nfs_rw_exit(&drp->r_rwlock);
6651 if (error) {
6652 VN_RELE(vp);
6653 } else {
6654 vnode_t *tvp;
6655 rnode4_t *trp;
6656 /*
6657 * existing file got truncated, notify.
6658 */
6659 tvp = vp;
6660 if (vp->v_type == VREG) {
6661 trp = VTOR4(vp);
6662 if (IS_SHADOW(vp, trp))
6663 tvp = RTOV4(trp);
6664 }
6665 vnevent_create(tvp, ct);
6666 *vpp = vp;
6667 }
6668 return (error);
6669
6670 create_otw:
6671 dnlc_remove(dvp, nm);
6672
6673 ASSERT(vattr.va_mask & AT_TYPE);
6674
6675 /*
6676 * If not a regular file let nfs4mknod() handle it.
6677 */
6678 if (vattr.va_type != VREG) {
6679 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6680 nfs_rw_exit(&drp->r_rwlock);
6681 return (error);
6682 }
6683
6684 /*
6685 * It _is_ a regular file.
6686 */
6687 ASSERT(vattr.va_mask & AT_MODE);
6688 if (MANDMODE(vattr.va_mode)) {
6689 nfs_rw_exit(&drp->r_rwlock);
6690 return (EACCES);
6691 }
6692
6693 /*
6694 * If this happens to be a mknod of a regular file, then flags will
6695 * have neither FREAD or FWRITE. However, we must set at least one
6696 * for the call to nfs4open_otw. If it's open(O_CREAT) driving
6697 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6698 * set (based on openmode specified by app).
6699 */
6700 if ((flags & (FREAD|FWRITE)) == 0)
6701 flags |= (FREAD|FWRITE);
6702
6703 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6704
6705 if (vp != NULL) {
6706 /* if create was successful, throw away the file's pages */
6707 if (!error && (vattr.va_mask & AT_SIZE))
6708 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6709 cr);
6710 /* release the lookup hold */
6711 VN_RELE(vp);
6712 vp = NULL;
6713 }
6714
6715 /*
6716 * validate that we opened a regular file. This handles a misbehaving
6717 * server that returns an incorrect FH.
6718 */
6719 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6720 error = EISDIR;
6721 VN_RELE(*vpp);
6722 }
6723
6724 /*
6725 * If this is not an exclusive create, then the CREATE
6726 * request will be made with the GUARDED mode set. This
6727 * means that the server will return EEXIST if the file
6728 * exists. The file could exist because of a retransmitted
6729 * request. In this case, we recover by starting over and
6730 * checking to see whether the file exists. This second
6731 * time through it should and a CREATE request will not be
6732 * sent.
6733 *
6734 * This handles the problem of a dangling CREATE request
6735 * which contains attributes which indicate that the file
6736 * should be truncated. This retransmitted request could
6737 * possibly truncate valid data in the file if not caught
6738 * by the duplicate request mechanism on the server or if
6739 * not caught by other means. The scenario is:
6740 *
6741 * Client transmits CREATE request with size = 0
6742 * Client times out, retransmits request.
6743 * Response to the first request arrives from the server
6744 * and the client proceeds on.
6745 * Client writes data to the file.
6746 * The server now processes retransmitted CREATE request
6747 * and truncates file.
6748 *
6749 * The use of the GUARDED CREATE request prevents this from
6750 * happening because the retransmitted CREATE would fail
6751 * with EEXIST and would not truncate the file.
6752 */
6753 if (error == EEXIST && exclusive == NONEXCL) {
6754 #ifdef DEBUG
6755 nfs4_create_misses++;
6756 #endif
6757 goto top;
6758 }
6759 nfs_rw_exit(&drp->r_rwlock);
6760 if (truncating && !error && *vpp) {
6761 vnode_t *tvp;
6762 rnode4_t *trp;
6763 /*
6764 * existing file got truncated, notify.
6765 */
6766 tvp = *vpp;
6767 trp = VTOR4(tvp);
6768 if (IS_SHADOW(tvp, trp))
6769 tvp = RTOV4(trp);
6770 vnevent_create(tvp, ct);
6771 }
6772 return (error);
6773 }
6774
6775 /*
6776 * Create compound (for mkdir, mknod, symlink):
6777 * { Putfh <dfh>; Create; Getfh; Getattr }
6778 * It's okay if setattr failed to set gid - this is not considered
6779 * an error, but purge attrs in that case.
6780 */
6781 static int
6782 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6783 vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6784 {
6785 int need_end_op = FALSE;
6786 COMPOUND4args_clnt args;
6787 COMPOUND4res_clnt res, *resp = NULL;
6788 nfs_argop4 *argop;
6789 nfs_resop4 *resop;
6790 int doqueue;
6791 mntinfo4_t *mi;
6792 rnode4_t *drp = VTOR4(dvp);
6793 change_info4 *cinfo;
6794 GETFH4res *gf_res;
6795 struct vattr vattr;
6796 vnode_t *vp;
6797 fattr4 *crattr;
6798 bool_t needrecov = FALSE;
6799 nfs4_recov_state_t recov_state;
6800 nfs4_sharedfh_t *sfhp = NULL;
6801 hrtime_t t;
6802 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6803 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6804 dirattr_info_t dinfo, *dinfop;
6805 servinfo4_t *svp;
6806 bitmap4 supp_attrs;
6807
6808 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6809 type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6810
6811 mi = VTOMI4(dvp);
6812
6813 /*
6814 * Make sure we properly deal with setting the right gid
6815 * on a new directory to reflect the parent's setgid bit
6816 */
6817 setgid_flag = 0;
6818 if (type == NF4DIR) {
6819 struct vattr dva;
6820
6821 va->va_mode &= ~VSGID;
6822 dva.va_mask = AT_MODE | AT_GID;
6823 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6824
6825 /*
6826 * If the parent's directory has the setgid bit set
6827 * _and_ the client was able to get a valid mapping
6828 * for the parent dir's owner_group, we want to
6829 * append NVERIFY(owner_group == dva.va_gid) and
6830 * SETTATTR to the CREATE compound.
6831 */
6832 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6833 setgid_flag = 1;
6834 va->va_mode |= VSGID;
6835 if (dva.va_gid != GID_NOBODY) {
6836 va->va_mask |= AT_GID;
6837 va->va_gid = dva.va_gid;
6838 }
6839 }
6840 }
6841 }
6842
6843 /*
6844 * Create ops:
6845 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6846 * 5:restorefh(dir) 6:getattr(dir)
6847 *
6848 * if (setgid)
6849 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6850 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6851 * 8:nverify 9:setattr
6852 */
6853 if (setgid_flag) {
6854 numops = 10;
6855 idx_create = 1;
6856 idx_fattr = 3;
6857 } else {
6858 numops = 7;
6859 idx_create = 2;
6860 idx_fattr = 4;
6861 }
6862
6863 ASSERT(nfs_zone() == mi->mi_zone);
6864 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6865 return (EINTR);
6866 }
6867 recov_state.rs_flags = 0;
6868 recov_state.rs_num_retry_despite_err = 0;
6869
6870 argoplist_size = numops * sizeof (nfs_argop4);
6871 argop = kmem_alloc(argoplist_size, KM_SLEEP);
6872
6873 recov_retry:
6874 if (type == NF4LNK)
6875 args.ctag = TAG_SYMLINK;
6876 else if (type == NF4DIR)
6877 args.ctag = TAG_MKDIR;
6878 else
6879 args.ctag = TAG_MKNOD;
6880
6881 args.array_len = numops;
6882 args.array = argop;
6883
6884 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6885 nfs_rw_exit(&drp->r_rwlock);
6886 kmem_free(argop, argoplist_size);
6887 return (e.error);
6888 }
6889 need_end_op = TRUE;
6890
6891
6892 /* 0: putfh directory */
6893 argop[0].argop = OP_CPUTFH;
6894 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6895
6896 /* 1/2: Create object */
6897 argop[idx_create].argop = OP_CCREATE;
6898 argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6899 argop[idx_create].nfs_argop4_u.opccreate.type = type;
6900 if (type == NF4LNK) {
6901 /*
6902 * symlink, treat name as data
6903 */
6904 ASSERT(data != NULL);
6905 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6906 (char *)data;
6907 }
6908 if (type == NF4BLK || type == NF4CHR) {
6909 ASSERT(data != NULL);
6910 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6911 *((specdata4 *)data);
6912 }
6913
6914 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6915
6916 svp = drp->r_server;
6917 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6918 supp_attrs = svp->sv_supp_attrs;
6919 nfs_rw_exit(&svp->sv_lock);
6920
6921 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6922 nfs_rw_exit(&drp->r_rwlock);
6923 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6924 e.error = EINVAL;
6925 kmem_free(argop, argoplist_size);
6926 return (e.error);
6927 }
6928
6929 /* 2/3: getfh fh of created object */
6930 ASSERT(idx_create + 1 == idx_fattr - 1);
6931 argop[idx_create + 1].argop = OP_GETFH;
6932
6933 /* 3/4: getattr of new object */
6934 argop[idx_fattr].argop = OP_GETATTR;
6935 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6936 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6937
6938 if (setgid_flag) {
6939 vattr_t _v;
6940
6941 argop[4].argop = OP_SAVEFH;
6942
6943 argop[5].argop = OP_CPUTFH;
6944 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6945
6946 argop[6].argop = OP_GETATTR;
6947 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6948 argop[6].nfs_argop4_u.opgetattr.mi = mi;
6949
6950 argop[7].argop = OP_RESTOREFH;
6951
6952 /*
6953 * nverify
6954 *
6955 * XXX - Revisit the last argument to nfs4_end_op()
6956 * once 5020486 is fixed.
6957 */
6958 _v.va_mask = AT_GID;
6959 _v.va_gid = va->va_gid;
6960 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
6961 supp_attrs)) {
6962 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6963 nfs_rw_exit(&drp->r_rwlock);
6964 nfs4_fattr4_free(crattr);
6965 kmem_free(argop, argoplist_size);
6966 return (e.error);
6967 }
6968
6969 /*
6970 * setattr
6971 *
6972 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
6973 * so no need for stateid or flags. Also we specify NULL
6974 * rp since we're only interested in setting owner_group
6975 * attributes.
6976 */
6977 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
6978 &e.error, 0);
6979
6980 if (e.error) {
6981 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6982 nfs_rw_exit(&drp->r_rwlock);
6983 nfs4_fattr4_free(crattr);
6984 nfs4args_verify_free(&argop[8]);
6985 kmem_free(argop, argoplist_size);
6986 return (e.error);
6987 }
6988 } else {
6989 argop[1].argop = OP_SAVEFH;
6990
6991 argop[5].argop = OP_RESTOREFH;
6992
6993 argop[6].argop = OP_GETATTR;
6994 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6995 argop[6].nfs_argop4_u.opgetattr.mi = mi;
6996 }
6997
6998 dnlc_remove(dvp, nm);
6999
7000 doqueue = 1;
7001 t = gethrtime();
7002 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7003
7004 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7005 if (e.error) {
7006 PURGE_ATTRCACHE4(dvp);
7007 if (!needrecov)
7008 goto out;
7009 }
7010
7011 if (needrecov) {
7012 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7013 OP_CREATE, NULL, NULL, NULL) == FALSE) {
7014 nfs4_end_op(mi, dvp, NULL, &recov_state,
7015 needrecov);
7016 need_end_op = FALSE;
7017 nfs4_fattr4_free(crattr);
7018 if (setgid_flag) {
7019 nfs4args_verify_free(&argop[8]);
7020 nfs4args_setattr_free(&argop[9]);
7021 }
7022 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7023 goto recov_retry;
7024 }
7025 }
7026
7027 resp = &res;
7028
7029 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7030
7031 if (res.status == NFS4ERR_BADOWNER)
7032 nfs4_log_badowner(mi, OP_CREATE);
7033
7034 e.error = geterrno4(res.status);
7035
7036 /*
7037 * This check is left over from when create was implemented
7038 * using a setattr op (instead of createattrs). If the
7039 * putfh/create/getfh failed, the error was returned. If
7040 * setattr/getattr failed, we keep going.
7041 *
7042 * It might be better to get rid of the GETFH also, and just
7043 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7044 * Then if any of the operations failed, we could return the
7045 * error now, and remove much of the error code below.
7046 */
7047 if (res.array_len <= idx_fattr) {
7048 /*
7049 * Either Putfh, Create or Getfh failed.
7050 */
7051 PURGE_ATTRCACHE4(dvp);
7052 /*
7053 * nfs4_purge_stale_fh() may generate otw calls through
7054 * nfs4_invalidate_pages. Hence the need to call
7055 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7056 */
7057 nfs4_end_op(mi, dvp, NULL, &recov_state,
7058 needrecov);
7059 need_end_op = FALSE;
7060 nfs4_purge_stale_fh(e.error, dvp, cr);
7061 goto out;
7062 }
7063 }
7064
7065 resop = &res.array[idx_create]; /* create res */
7066 cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7067
7068 resop = &res.array[idx_create + 1]; /* getfh res */
7069 gf_res = &resop->nfs_resop4_u.opgetfh;
7070
7071 sfhp = sfh4_get(&gf_res->object, mi);
7072 if (e.error) {
7073 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7074 fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7075 if (vp->v_type == VNON) {
7076 vattr.va_mask = AT_TYPE;
7077 /*
7078 * Need to call nfs4_end_op before nfs4getattr to avoid
7079 * potential nfs4_start_op deadlock. See RFE 4777612.
7080 */
7081 nfs4_end_op(mi, dvp, NULL, &recov_state,
7082 needrecov);
7083 need_end_op = FALSE;
7084 e.error = nfs4getattr(vp, &vattr, cr);
7085 if (e.error) {
7086 VN_RELE(vp);
7087 *vpp = NULL;
7088 goto out;
7089 }
7090 vp->v_type = vattr.va_type;
7091 }
7092 e.error = 0;
7093 } else {
7094 *vpp = vp = makenfs4node(sfhp,
7095 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7096 dvp->v_vfsp, t, cr,
7097 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7098 }
7099
7100 /*
7101 * If compound succeeded, then update dir attrs
7102 */
7103 if (res.status == NFS4_OK) {
7104 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7105 dinfo.di_cred = cr;
7106 dinfo.di_time_call = t;
7107 dinfop = &dinfo;
7108 } else
7109 dinfop = NULL;
7110
7111 /* Update directory cache attribute, readdir and dnlc caches */
7112 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7113
7114 out:
7115 if (sfhp != NULL)
7116 sfh4_rele(&sfhp);
7117 nfs_rw_exit(&drp->r_rwlock);
7118 nfs4_fattr4_free(crattr);
7119 if (setgid_flag) {
7120 nfs4args_verify_free(&argop[8]);
7121 nfs4args_setattr_free(&argop[9]);
7122 }
7123 if (resp)
7124 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7125 if (need_end_op)
7126 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7127
7128 kmem_free(argop, argoplist_size);
7129 return (e.error);
7130 }
7131
7132 /* ARGSUSED */
7133 static int
7134 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7135 int mode, vnode_t **vpp, cred_t *cr)
7136 {
7137 int error;
7138 vnode_t *vp;
7139 nfs_ftype4 type;
7140 specdata4 spec, *specp = NULL;
7141
7142 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7143
7144 switch (va->va_type) {
7145 case VCHR:
7146 case VBLK:
7147 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7148 spec.specdata1 = getmajor(va->va_rdev);
7149 spec.specdata2 = getminor(va->va_rdev);
7150 specp = &spec;
7151 break;
7152
7153 case VFIFO:
7154 type = NF4FIFO;
7155 break;
7156 case VSOCK:
7157 type = NF4SOCK;
7158 break;
7159
7160 default:
7161 return (EINVAL);
7162 }
7163
7164 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7165 if (error) {
7166 return (error);
7167 }
7168
7169 /*
7170 * This might not be needed any more; special case to deal
7171 * with problematic v2/v3 servers. Since create was unable
7172 * to set group correctly, not sure what hope setattr has.
7173 */
7174 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7175 va->va_mask = AT_GID;
7176 (void) nfs4setattr(vp, va, 0, cr, NULL);
7177 }
7178
7179 /*
7180 * If vnode is a device create special vnode
7181 */
7182 if (ISVDEV(vp->v_type)) {
7183 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7184 VN_RELE(vp);
7185 } else {
7186 *vpp = vp;
7187 }
7188 return (error);
7189 }
7190
7191 /*
7192 * Remove requires that the current fh be the target directory.
7193 * After the operation, the current fh is unchanged.
7194 * The compound op structure is:
7195 * PUTFH(targetdir), REMOVE
7196 *
7197 * Weirdness: if the vnode to be removed is open
7198 * we rename it instead of removing it and nfs_inactive
7199 * will remove the new name.
7200 */
7201 /* ARGSUSED */
7202 static int
7203 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7204 {
7205 COMPOUND4args_clnt args;
7206 COMPOUND4res_clnt res, *resp = NULL;
7207 REMOVE4res *rm_res;
7208 nfs_argop4 argop[3];
7209 nfs_resop4 *resop;
7210 vnode_t *vp;
7211 char *tmpname;
7212 int doqueue;
7213 mntinfo4_t *mi;
7214 rnode4_t *rp;
7215 rnode4_t *drp;
7216 int needrecov = 0;
7217 nfs4_recov_state_t recov_state;
7218 int isopen;
7219 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7220 dirattr_info_t dinfo;
7221
7222 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7223 return (EPERM);
7224 drp = VTOR4(dvp);
7225 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7226 return (EINTR);
7227
7228 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7229 if (e.error) {
7230 nfs_rw_exit(&drp->r_rwlock);
7231 return (e.error);
7232 }
7233
7234 if (vp->v_type == VDIR) {
7235 VN_RELE(vp);
7236 nfs_rw_exit(&drp->r_rwlock);
7237 return (EISDIR);
7238 }
7239
7240 /*
7241 * First just remove the entry from the name cache, as it
7242 * is most likely the only entry for this vp.
7243 */
7244 dnlc_remove(dvp, nm);
7245
7246 rp = VTOR4(vp);
7247
7248 /*
7249 * For regular file types, check to see if the file is open by looking
7250 * at the open streams.
7251 * For all other types, check the reference count on the vnode. Since
7252 * they are not opened OTW they never have an open stream.
7253 *
7254 * If the file is open, rename it to .nfsXXXX.
7255 */
7256 if (vp->v_type != VREG) {
7257 /*
7258 * If the file has a v_count > 1 then there may be more than one
7259 * entry in the name cache due multiple links or an open file,
7260 * but we don't have the real reference count so flush all
7261 * possible entries.
7262 */
7263 if (vp->v_count > 1)
7264 dnlc_purge_vp(vp);
7265
7266 /*
7267 * Now we have the real reference count.
7268 */
7269 isopen = vp->v_count > 1;
7270 } else {
7271 mutex_enter(&rp->r_os_lock);
7272 isopen = list_head(&rp->r_open_streams) != NULL;
7273 mutex_exit(&rp->r_os_lock);
7274 }
7275
7276 mutex_enter(&rp->r_statelock);
7277 if (isopen &&
7278 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7279 mutex_exit(&rp->r_statelock);
7280 tmpname = newname();
7281 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7282 if (e.error)
7283 kmem_free(tmpname, MAXNAMELEN);
7284 else {
7285 mutex_enter(&rp->r_statelock);
7286 if (rp->r_unldvp == NULL) {
7287 VN_HOLD(dvp);
7288 rp->r_unldvp = dvp;
7289 if (rp->r_unlcred != NULL)
7290 crfree(rp->r_unlcred);
7291 crhold(cr);
7292 rp->r_unlcred = cr;
7293 rp->r_unlname = tmpname;
7294 } else {
7295 kmem_free(rp->r_unlname, MAXNAMELEN);
7296 rp->r_unlname = tmpname;
7297 }
7298 mutex_exit(&rp->r_statelock);
7299 }
7300 VN_RELE(vp);
7301 nfs_rw_exit(&drp->r_rwlock);
7302 return (e.error);
7303 }
7304 /*
7305 * Actually remove the file/dir
7306 */
7307 mutex_exit(&rp->r_statelock);
7308
7309 /*
7310 * We need to flush any dirty pages which happen to
7311 * be hanging around before removing the file.
7312 * This shouldn't happen very often since in NFSv4
7313 * we should be close to open consistent.
7314 */
7315 if (nfs4_has_pages(vp) &&
7316 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7317 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7318 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7319 mutex_enter(&rp->r_statelock);
7320 if (!rp->r_error)
7321 rp->r_error = e.error;
7322 mutex_exit(&rp->r_statelock);
7323 }
7324 }
7325
7326 mi = VTOMI4(dvp);
7327
7328 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7329 recov_state.rs_flags = 0;
7330 recov_state.rs_num_retry_despite_err = 0;
7331
7332 recov_retry:
7333 /*
7334 * Remove ops: putfh dir; remove
7335 */
7336 args.ctag = TAG_REMOVE;
7337 args.array_len = 3;
7338 args.array = argop;
7339
7340 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7341 if (e.error) {
7342 nfs_rw_exit(&drp->r_rwlock);
7343 VN_RELE(vp);
7344 return (e.error);
7345 }
7346
7347 /* putfh directory */
7348 argop[0].argop = OP_CPUTFH;
7349 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7350
7351 /* remove */
7352 argop[1].argop = OP_CREMOVE;
7353 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7354
7355 /* getattr dir */
7356 argop[2].argop = OP_GETATTR;
7357 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7358 argop[2].nfs_argop4_u.opgetattr.mi = mi;
7359
7360 doqueue = 1;
7361 dinfo.di_time_call = gethrtime();
7362 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7363
7364 PURGE_ATTRCACHE4(vp);
7365
7366 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7367 if (e.error)
7368 PURGE_ATTRCACHE4(dvp);
7369
7370 if (needrecov) {
7371 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7372 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7373 if (!e.error)
7374 (void) xdr_free(xdr_COMPOUND4res_clnt,
7375 (caddr_t)&res);
7376 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7377 needrecov);
7378 goto recov_retry;
7379 }
7380 }
7381
7382 /*
7383 * Matching nfs4_end_op() for start_op() above.
7384 * There is a path in the code below which calls
7385 * nfs4_purge_stale_fh(), which may generate otw calls through
7386 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7387 * here to avoid nfs4_start_op() deadlock.
7388 */
7389 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7390
7391 if (!e.error) {
7392 resp = &res;
7393
7394 if (res.status) {
7395 e.error = geterrno4(res.status);
7396 PURGE_ATTRCACHE4(dvp);
7397 nfs4_purge_stale_fh(e.error, dvp, cr);
7398 } else {
7399 resop = &res.array[1]; /* remove res */
7400 rm_res = &resop->nfs_resop4_u.opremove;
7401
7402 dinfo.di_garp =
7403 &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7404 dinfo.di_cred = cr;
7405
7406 /* Update directory attr, readdir and dnlc caches */
7407 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7408 &dinfo);
7409 }
7410 }
7411 nfs_rw_exit(&drp->r_rwlock);
7412 if (resp)
7413 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7414
7415 if (e.error == 0) {
7416 vnode_t *tvp;
7417 rnode4_t *trp;
7418 trp = VTOR4(vp);
7419 tvp = vp;
7420 if (IS_SHADOW(vp, trp))
7421 tvp = RTOV4(trp);
7422 vnevent_remove(tvp, dvp, nm, ct);
7423 }
7424 VN_RELE(vp);
7425 return (e.error);
7426 }
7427
7428 /*
7429 * Link requires that the current fh be the target directory and the
7430 * saved fh be the source fh. After the operation, the current fh is unchanged.
7431 * Thus the compound op structure is:
7432 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
7433 * GETATTR(file)
7434 */
7435 /* ARGSUSED */
7436 static int
7437 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
7438 caller_context_t *ct, int flags)
7439 {
7440 COMPOUND4args_clnt args;
7441 COMPOUND4res_clnt res, *resp = NULL;
7442 LINK4res *ln_res;
7443 int argoplist_size = 7 * sizeof (nfs_argop4);
7444 nfs_argop4 *argop;
7445 nfs_resop4 *resop;
7446 vnode_t *realvp, *nvp;
7447 int doqueue;
7448 mntinfo4_t *mi;
7449 rnode4_t *tdrp;
7450 bool_t needrecov = FALSE;
7451 nfs4_recov_state_t recov_state;
7452 hrtime_t t;
7453 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7454 dirattr_info_t dinfo;
7455
7456 ASSERT(*tnm != '\0');
7457 ASSERT(tdvp->v_type == VDIR);
7458 ASSERT(nfs4_consistent_type(tdvp));
7459 ASSERT(nfs4_consistent_type(svp));
7460
7461 if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7462 return (EPERM);
7463 if (VOP_REALVP(svp, &realvp, ct) == 0) {
7464 svp = realvp;
7465 ASSERT(nfs4_consistent_type(svp));
7466 }
7467
7468 tdrp = VTOR4(tdvp);
7469 mi = VTOMI4(svp);
7470
7471 if (!(mi->mi_flags & MI4_LINK)) {
7472 return (EOPNOTSUPP);
7473 }
7474 recov_state.rs_flags = 0;
7475 recov_state.rs_num_retry_despite_err = 0;
7476
7477 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7478 return (EINTR);
7479
7480 recov_retry:
7481 argop = kmem_alloc(argoplist_size, KM_SLEEP);
7482
7483 args.ctag = TAG_LINK;
7484
7485 /*
7486 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7487 * restorefh; getattr(fl)
7488 */
7489 args.array_len = 7;
7490 args.array = argop;
7491
7492 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7493 if (e.error) {
7494 kmem_free(argop, argoplist_size);
7495 nfs_rw_exit(&tdrp->r_rwlock);
7496 return (e.error);
7497 }
7498
7499 /* 0. putfh file */
7500 argop[0].argop = OP_CPUTFH;
7501 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7502
7503 /* 1. save current fh to free up the space for the dir */
7504 argop[1].argop = OP_SAVEFH;
7505
7506 /* 2. putfh targetdir */
7507 argop[2].argop = OP_CPUTFH;
7508 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7509
7510 /* 3. link: current_fh is targetdir, saved_fh is source */
7511 argop[3].argop = OP_CLINK;
7512 argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7513
7514 /* 4. Get attributes of dir */
7515 argop[4].argop = OP_GETATTR;
7516 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7517 argop[4].nfs_argop4_u.opgetattr.mi = mi;
7518
7519 /* 5. If link was successful, restore current vp to file */
7520 argop[5].argop = OP_RESTOREFH;
7521
7522 /* 6. Get attributes of linked object */
7523 argop[6].argop = OP_GETATTR;
7524 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7525 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7526
7527 dnlc_remove(tdvp, tnm);
7528
7529 doqueue = 1;
7530 t = gethrtime();
7531
7532 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7533
7534 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7535 if (e.error != 0 && !needrecov) {
7536 PURGE_ATTRCACHE4(tdvp);
7537 PURGE_ATTRCACHE4(svp);
7538 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7539 goto out;
7540 }
7541
7542 if (needrecov) {
7543 bool_t abort;
7544
7545 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7546 NULL, NULL, OP_LINK, NULL, NULL, NULL);
7547 if (abort == FALSE) {
7548 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7549 needrecov);
7550 kmem_free(argop, argoplist_size);
7551 if (!e.error)
7552 (void) xdr_free(xdr_COMPOUND4res_clnt,
7553 (caddr_t)&res);
7554 goto recov_retry;
7555 } else {
7556 if (e.error != 0) {
7557 PURGE_ATTRCACHE4(tdvp);
7558 PURGE_ATTRCACHE4(svp);
7559 nfs4_end_op(VTOMI4(svp), svp, tdvp,
7560 &recov_state, needrecov);
7561 goto out;
7562 }
7563 /* fall through for res.status case */
7564 }
7565 }
7566
7567 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7568
7569 resp = &res;
7570 if (res.status) {
7571 /* If link succeeded, then don't return error */
7572 e.error = geterrno4(res.status);
7573 if (res.array_len <= 4) {
7574 /*
7575 * Either Putfh, Savefh, Putfh dir, or Link failed
7576 */
7577 PURGE_ATTRCACHE4(svp);
7578 PURGE_ATTRCACHE4(tdvp);
7579 if (e.error == EOPNOTSUPP) {
7580 mutex_enter(&mi->mi_lock);
7581 mi->mi_flags &= ~MI4_LINK;
7582 mutex_exit(&mi->mi_lock);
7583 }
7584 /* Remap EISDIR to EPERM for non-root user for SVVS */
7585 /* XXX-LP */
7586 if (e.error == EISDIR && crgetuid(cr) != 0)
7587 e.error = EPERM;
7588 goto out;
7589 }
7590 }
7591
7592 /* either no error or one of the postop getattr failed */
7593
7594 /*
7595 * XXX - if LINK succeeded, but no attrs were returned for link
7596 * file, purge its cache.
7597 *
7598 * XXX Perform a simplified version of wcc checking. Instead of
7599 * have another getattr to get pre-op, just purge cache if
7600 * any of the ops prior to and including the getattr failed.
7601 * If the getattr succeeded then update the attrcache accordingly.
7602 */
7603
7604 /*
7605 * update cache with link file postattrs.
7606 * Note: at this point resop points to link res.
7607 */
7608 resop = &res.array[3]; /* link res */
7609 ln_res = &resop->nfs_resop4_u.oplink;
7610 if (res.status == NFS4_OK)
7611 e.error = nfs4_update_attrcache(res.status,
7612 &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7613 t, svp, cr);
7614
7615 /*
7616 * Call makenfs4node to create the new shadow vp for tnm.
7617 * We pass NULL attrs because we just cached attrs for
7618 * the src object. All we're trying to accomplish is to
7619 * to create the new shadow vnode.
7620 */
7621 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7622 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));
7623
7624 /* Update target cache attribute, readdir and dnlc caches */
7625 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7626 dinfo.di_time_call = t;
7627 dinfo.di_cred = cr;
7628
7629 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7630 ASSERT(nfs4_consistent_type(tdvp));
7631 ASSERT(nfs4_consistent_type(svp));
7632 ASSERT(nfs4_consistent_type(nvp));
7633 VN_RELE(nvp);
7634
7635 if (!e.error) {
7636 vnode_t *tvp;
7637 rnode4_t *trp;
7638 /*
7639 * Notify the source file of this link operation.
7640 */
7641 trp = VTOR4(svp);
7642 tvp = svp;
7643 if (IS_SHADOW(svp, trp))
7644 tvp = RTOV4(trp);
7645 vnevent_link(tvp, ct);
7646 }
7647 out:
7648 kmem_free(argop, argoplist_size);
7649 if (resp)
7650 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7651
7652 nfs_rw_exit(&tdrp->r_rwlock);
7653
7654 return (e.error);
7655 }
7656
7657 /* ARGSUSED */
7658 static int
7659 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7660 caller_context_t *ct, int flags)
7661 {
7662 vnode_t *realvp;
7663
7664 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7665 return (EPERM);
7666 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7667 ndvp = realvp;
7668
7669 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7670 }
7671
/*
 * nfs4rename does the real work of renaming in NFS Version 4.
 *
 * A file handle is considered volatile for renaming purposes if either
 * of the volatile bits are turned on. However, the compound may differ
 * based on the likelihood of the filehandle to change during rename.
 *
 * Arguments:
 *	odvp, onm	source directory and the name being renamed
 *	ndvp, nnm	target directory and the new name
 *	cr		credentials used for the lookups and the rename
 *	ct		caller context, passed to the vnevent notifications
 *
 * Outline:
 *	1. Refuse "." and ".." as either name (EINVAL).
 *	2. Take both directories' r_rwlock as writer, lower rnode address
 *	   first.
 *	3. If the target name exists: fail with EBUSY if it is a mount point;
 *	   if it is active and not a directory, preserve it under a temporary
 *	   name (via nfs4_link, or nfs4_rename when linking is not possible)
 *	   and record that name in the rnode's r_unl* fields.
 *	4. Issue the rename over the wire using the volatile or persistent
 *	   filehandle variant, depending on NFS4_VOLATILE_FH(mi).
 *	5. If the rename reported NFS4ERR_FILE_OPEN after a link was made,
 *	   remove the link, undo the r_unl* bookkeeping and retry step 3
 *	   with do_link cleared.
 *	6. On success fix up the dnlc/readdir caches and the r_unl* fields of
 *	   the renamed object and post the rename vnevents.
 *
 * Every return path after step 2 releases both r_rwlocks and any vnode
 * holds obtained from nfs4lookup.
 *
 * Returns 0 on success or an errno value.
 */
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp = NULL;		/* existing target object, if any */
	vnode_t *ovp = NULL;		/* object being renamed */
	char *tmpname = NULL;		/* temporary name for an active target */
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;		/* the active target was preserved by LINK */
	int do_link = 1;		/* cleared after NFS4ERR_FILE_OPEN */
	nfsstat4 stat = NFS4_OK;	/* NFS status from the otw rename */

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Neither "." nor ".." may be the source name ... */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	/* ... or the target name. */
	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Acquire both directory locks as writer.  The rnode with the lower
	 * address is always locked first so that two renames working on the
	 * same pair of directories take the locks in the same order.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file. If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/*
			 * For a regular file, "active" means the rnode still
			 * has at least one entry on its open stream list.
			 */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible. This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes. If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations. We already
			 * know that the target is not a directory. If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active. We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename. Some servers can
			 * not Rename over an Open file, so they return
			 * this error. The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 *
			 * This label is also the target of the backward
			 * goto taken on that NFS4ERR_FILE_OPEN retry, at
			 * which point do_link has been cleared.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr,
				    NULL, 0);
			}
			/*
			 * Fall back to renaming the target to the temporary
			 * name when the link is unsupported or was skipped.
			 */
			if (error == EOPNOTSUPP || !do_link) {
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the temporary name (and, if not already
			 * recorded, the directory and credentials) in the
			 * target's rnode.  From here on the rnode owns
			 * tmpname.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				/* Replace a previously recorded name. */
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file. This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself? The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave. A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi))
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	else
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);

	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp. After the nfs4_link
		 * call we call VN_RELE to match that hold. We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		/* Best effort: the result of removing the link is ignored. */
		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);

		/*
		 * Undo the unlinked file naming stuff we just did.
		 * rp still refers to the target's rnode here; it is only
		 * repointed at the source object further below.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlname points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		if (nvp) {
			VN_RELE(nvp);
		}
		/* Retry; with do_link cleared the rename fallback is used. */
		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			/* strncpy may not terminate; force a NUL at the end */
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				/* Move the directory hold to the new parent. */
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Notify the rename vnevents to source vnode, and to the target
	 * vnode if it already existed.
	 */
	if (error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the vnode. Each link is represented by
		 * a different vnode, in nfsv4.  When the vnode at hand is
		 * a shadow vnode, the event is posted on the vnode
		 * obtained from its rnode via RTOV4 instead.
		 */
		if (nvp) {
			trp = VTOR4(nvp);
			tvp = nvp;
			if (IS_SHADOW(nvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest(tvp, ndvp, nnm, ct);
		}

		/*
		 * if the source and destination directory are not the
		 * same notify the destination directory.
		 */
		if (VTOR4(odvp) != VTOR4(ndvp)) {
			trp = VTOR4(ndvp);
			tvp = ndvp;
			if (IS_SHADOW(ndvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest_dir(tvp, ct);
		}

		trp = VTOR4(ovp);
		tvp = ovp;
		if (IS_SHADOW(ovp, trp))
			tvp = RTOV4(trp);
		vnevent_rename_src(tvp, odvp, onm, ct);
	}

	/* Drop the lookup holds and both directory locks. */
	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
8091
8092 /*
8093 * When the parent directory has changed, sv_dfh must be updated
8094 */
8095 static void
8096 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8097 {
8098 svnode_t *sv = VTOSV(vp);
8099 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8100 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8101
8102 sfh4_hold(new_dfh);
8103 sv->sv_dfh = new_dfh;
8104 sfh4_rele(&old_dfh);
8105 }
8106
8107 /*
8108 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8109 * when it is known that the filehandle is persistent through rename.
8110 *
8111 * Rename requires that the current fh be the target directory and the
8112 * saved fh be the source directory. After the operation, the current fh
8113 * is unchanged.
8114 * The compound op structure for persistent fh rename is:
8115 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME
8116 * Rather than bother with the directory postop args, we'll simply
8117 * update that a change occurred in the cache, so no post-op getattrs.
8118 */
8119 static int
8120 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8121 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8122 {
8123 COMPOUND4args_clnt args;
8124 COMPOUND4res_clnt res, *resp = NULL;
8125 nfs_argop4 *argop;
8126 nfs_resop4 *resop;
8127 int doqueue, argoplist_size;
8128 mntinfo4_t *mi;
8129 rnode4_t *odrp = VTOR4(odvp);
8130 rnode4_t *ndrp = VTOR4(ndvp);
8131 RENAME4res *rn_res;
8132 bool_t needrecov;
8133 nfs4_recov_state_t recov_state;
8134 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8135 dirattr_info_t dinfo, *dinfop;
8136
8137 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8138
8139 recov_state.rs_flags = 0;
8140 recov_state.rs_num_retry_despite_err = 0;
8141
8142 /*
8143 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8144 *
8145 * If source/target are different dirs, then append putfh(src); getattr
8146 */
8147 args.array_len = (odvp == ndvp) ? 5 : 7;
8148 argoplist_size = args.array_len * sizeof (nfs_argop4);
8149 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8150
8151 recov_retry:
8152 *statp = NFS4_OK;
8153
8154 /* No need to Lookup the file, persistent fh */
8155 args.ctag = TAG_RENAME;
8156
8157 mi = VTOMI4(odvp);
8158 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8159 if (e.error) {
8160 kmem_free(argop, argoplist_size);
8161 return (e.error);
8162 }
8163
8164 /* 0: putfh source directory */
8165 argop[0].argop = OP_CPUTFH;
8166 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8167
8168 /* 1: Save source fh to free up current for target */
8169 argop[1].argop = OP_SAVEFH;
8170
8171 /* 2: putfh targetdir */
8172 argop[2].argop = OP_CPUTFH;
8173 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8174
8175 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8176 argop[3].argop = OP_CRENAME;
8177 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8178 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8179
8180 /* 4: getattr (targetdir) */
8181 argop[4].argop = OP_GETATTR;
8182 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8183 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8184
8185 if (ndvp != odvp) {
8186
8187 /* 5: putfh (sourcedir) */
8188 argop[5].argop = OP_CPUTFH;
8189 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8190
8191 /* 6: getattr (sourcedir) */
8192 argop[6].argop = OP_GETATTR;
8193 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8194 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8195 }
8196
8197 dnlc_remove(odvp, onm);
8198 dnlc_remove(ndvp, nnm);
8199
8200 doqueue = 1;
8201 dinfo.di_time_call = gethrtime();
8202 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8203
8204 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8205 if (e.error) {
8206 PURGE_ATTRCACHE4(odvp);
8207 PURGE_ATTRCACHE4(ndvp);
8208 } else {
8209 *statp = res.status;
8210 }
8211
8212 if (needrecov) {
8213 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8214 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8215 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8216 if (!e.error)
8217 (void) xdr_free(xdr_COMPOUND4res_clnt,
8218 (caddr_t)&res);
8219 goto recov_retry;
8220 }
8221 }
8222
8223 if (!e.error) {
8224 resp = &res;
8225 /*
8226 * as long as OP_RENAME
8227 */
8228 if (res.status != NFS4_OK && res.array_len <= 4) {
8229 e.error = geterrno4(res.status);
8230 PURGE_ATTRCACHE4(odvp);
8231 PURGE_ATTRCACHE4(ndvp);
8232 /*
8233 * System V defines rename to return EEXIST, not
8234 * ENOTEMPTY if the target directory is not empty.
8235 * Over the wire, the error is NFSERR_ENOTEMPTY
8236 * which geterrno4 maps to ENOTEMPTY.
8237 */
8238 if (e.error == ENOTEMPTY)
8239 e.error = EEXIST;
8240 } else {
8241
8242 resop = &res.array[3]; /* rename res */
8243 rn_res = &resop->nfs_resop4_u.oprename;
8244
8245 if (res.status == NFS4_OK) {
8246 /*
8247 * Update target attribute, readdir and dnlc
8248 * caches.
8249 */
8250 dinfo.di_garp =
8251 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8252 dinfo.di_cred = cr;
8253 dinfop = &dinfo;
8254 } else
8255 dinfop = NULL;
8256
8257 nfs4_update_dircaches(&rn_res->target_cinfo,
8258 ndvp, NULL, NULL, dinfop);
8259
8260 /*
8261 * Update source attribute, readdir and dnlc caches
8262 *
8263 */
8264 if (ndvp != odvp) {
8265 update_parentdir_sfh(renvp, ndvp);
8266
8267 if (dinfop)
8268 dinfo.di_garp =
8269 &(res.array[6].nfs_resop4_u.
8270 opgetattr.ga_res);
8271
8272 nfs4_update_dircaches(&rn_res->source_cinfo,
8273 odvp, NULL, NULL, dinfop);
8274 }
8275
8276 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8277 nnm);
8278 }
8279 }
8280
8281 if (resp)
8282 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8283 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8284 kmem_free(argop, argoplist_size);
8285
8286 return (e.error);
8287 }
8288
8289 /*
8290 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8291 * it is possible for the filehandle to change due to the rename.
8292 *
8293 * The compound req in this case includes a post-rename lookup and getattr
8294 * to ensure that we have the correct fh and attributes for the object.
8295 *
8296 * Rename requires that the current fh be the target directory and the
8297 * saved fh be the source directory. After the operation, the current fh
8298 * is unchanged.
8299 *
8300 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8301 * update the filehandle for the renamed object. We also get the old
8302 * filehandle for historical reasons; this should be taken out sometime.
8303 * This results in a rather cumbersome compound...
8304 *
8305 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8306 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8307 *
8308 */
8309 static int
8310 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8311 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8312 {
8313 COMPOUND4args_clnt args;
8314 COMPOUND4res_clnt res, *resp = NULL;
8315 int argoplist_size;
8316 nfs_argop4 *argop;
8317 nfs_resop4 *resop;
8318 int doqueue;
8319 mntinfo4_t *mi;
8320 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8321 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8322 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8323 RENAME4res *rn_res;
8324 GETFH4res *ngf_res;
8325 bool_t needrecov;
8326 nfs4_recov_state_t recov_state;
8327 hrtime_t t;
8328 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8329 dirattr_info_t dinfo, *dinfop = &dinfo;
8330
8331 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8332
8333 recov_state.rs_flags = 0;
8334 recov_state.rs_num_retry_despite_err = 0;
8335
8336 recov_retry:
8337 *statp = NFS4_OK;
8338
8339 /*
8340 * There is a window between the RPC and updating the path and
8341 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8342 * code, so that it doesn't try to use the old path during that
8343 * window.
8344 */
8345 mutex_enter(&orp->r_statelock);
8346 while (orp->r_flags & R4RECEXPFH) {
8347 klwp_t *lwp = ttolwp(curthread);
8348
8349 if (lwp != NULL)
8350 lwp->lwp_nostop++;
8351 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8352 mutex_exit(&orp->r_statelock);
8353 if (lwp != NULL)
8354 lwp->lwp_nostop--;
8355 return (EINTR);
8356 }
8357 if (lwp != NULL)
8358 lwp->lwp_nostop--;
8359 }
8360 orp->r_flags |= R4RECEXPFH;
8361 mutex_exit(&orp->r_statelock);
8362
8363 mi = VTOMI4(odvp);
8364
8365 args.ctag = TAG_RENAME_VFH;
8366 args.array_len = (odvp == ndvp) ? 10 : 12;
8367 argoplist_size = args.array_len * sizeof (nfs_argop4);
8368 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8369
8370 /*
8371 * Rename ops:
8372 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8373 * PUTFH(targetdir), RENAME, GETATTR(targetdir)
8374 * LOOKUP(trgt), GETFH(new), GETATTR,
8375 *
8376 * if (odvp != ndvp)
8377 * add putfh(sourcedir), getattr(sourcedir) }
8378 */
8379 args.array = argop;
8380
8381 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8382 &recov_state, NULL);
8383 if (e.error) {
8384 kmem_free(argop, argoplist_size);
8385 mutex_enter(&orp->r_statelock);
8386 orp->r_flags &= ~R4RECEXPFH;
8387 cv_broadcast(&orp->r_cv);
8388 mutex_exit(&orp->r_statelock);
8389 return (e.error);
8390 }
8391
8392 /* 0: putfh source directory */
8393 argop[0].argop = OP_CPUTFH;
8394 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8395
8396 /* 1: Save source fh to free up current for target */
8397 argop[1].argop = OP_SAVEFH;
8398
8399 /* 2: Lookup pre-rename fh of renamed object */
8400 argop[2].argop = OP_CLOOKUP;
8401 argop[2].nfs_argop4_u.opclookup.cname = onm;
8402
8403 /* 3: getfh fh of renamed object (before rename) */
8404 argop[3].argop = OP_GETFH;
8405
8406 /* 4: putfh targetdir */
8407 argop[4].argop = OP_CPUTFH;
8408 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8409
8410 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8411 argop[5].argop = OP_CRENAME;
8412 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8413 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8414
8415 /* 6: getattr of target dir (post op attrs) */
8416 argop[6].argop = OP_GETATTR;
8417 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8418 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8419
8420 /* 7: Lookup post-rename fh of renamed object */
8421 argop[7].argop = OP_CLOOKUP;
8422 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8423
8424 /* 8: getfh fh of renamed object (after rename) */
8425 argop[8].argop = OP_GETFH;
8426
8427 /* 9: getattr of renamed object */
8428 argop[9].argop = OP_GETATTR;
8429 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8430 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8431
8432 /*
8433 * If source/target dirs are different, then get new post-op
8434 * attrs for source dir also.
8435 */
8436 if (ndvp != odvp) {
8437 /* 10: putfh (sourcedir) */
8438 argop[10].argop = OP_CPUTFH;
8439 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8440
8441 /* 11: getattr (sourcedir) */
8442 argop[11].argop = OP_GETATTR;
8443 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8444 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8445 }
8446
8447 dnlc_remove(odvp, onm);
8448 dnlc_remove(ndvp, nnm);
8449
8450 doqueue = 1;
8451 t = gethrtime();
8452 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8453
8454 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8455 if (e.error) {
8456 PURGE_ATTRCACHE4(odvp);
8457 PURGE_ATTRCACHE4(ndvp);
8458 if (!needrecov) {
8459 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8460 &recov_state, needrecov);
8461 goto out;
8462 }
8463 } else {
8464 *statp = res.status;
8465 }
8466
8467 if (needrecov) {
8468 bool_t abort;
8469
8470 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8471 OP_RENAME, NULL, NULL, NULL);
8472 if (abort == FALSE) {
8473 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8474 &recov_state, needrecov);
8475 kmem_free(argop, argoplist_size);
8476 if (!e.error)
8477 (void) xdr_free(xdr_COMPOUND4res_clnt,
8478 (caddr_t)&res);
8479 mutex_enter(&orp->r_statelock);
8480 orp->r_flags &= ~R4RECEXPFH;
8481 cv_broadcast(&orp->r_cv);
8482 mutex_exit(&orp->r_statelock);
8483 goto recov_retry;
8484 } else {
8485 if (e.error != 0) {
8486 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8487 &recov_state, needrecov);
8488 goto out;
8489 }
8490 /* fall through for res.status case */
8491 }
8492 }
8493
8494 resp = &res;
8495 /*
8496 * If OP_RENAME (or any prev op) failed, then return an error.
8497 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8498 */
8499 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8500 /*
8501 * Error in an op other than last Getattr
8502 */
8503 e.error = geterrno4(res.status);
8504 PURGE_ATTRCACHE4(odvp);
8505 PURGE_ATTRCACHE4(ndvp);
8506 /*
8507 * System V defines rename to return EEXIST, not
8508 * ENOTEMPTY if the target directory is not empty.
8509 * Over the wire, the error is NFSERR_ENOTEMPTY
8510 * which geterrno4 maps to ENOTEMPTY.
8511 */
8512 if (e.error == ENOTEMPTY)
8513 e.error = EEXIST;
8514 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8515 needrecov);
8516 goto out;
8517 }
8518
8519 /* rename results */
8520 rn_res = &res.array[5].nfs_resop4_u.oprename;
8521
8522 if (res.status == NFS4_OK) {
8523 /* Update target attribute, readdir and dnlc caches */
8524 dinfo.di_garp =
8525 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8526 dinfo.di_cred = cr;
8527 dinfo.di_time_call = t;
8528 } else
8529 dinfop = NULL;
8530
8531 /* Update source cache attribute, readdir and dnlc caches */
8532 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8533
8534 /* Update source cache attribute, readdir and dnlc caches */
8535 if (ndvp != odvp) {
8536 update_parentdir_sfh(ovp, ndvp);
8537
8538 /*
8539 * If dinfop is non-NULL, then compound succeded, so
8540 * set di_garp to attrs for source dir. dinfop is only
8541 * set to NULL when compound fails.
8542 */
8543 if (dinfop)
8544 dinfo.di_garp =
8545 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8546 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8547 dinfop);
8548 }
8549
8550 /*
8551 * Update the rnode with the new component name and args,
8552 * and if the file handle changed, also update it with the new fh.
8553 * This is only necessary if the target object has an rnode
8554 * entry and there is no need to create one for it.
8555 */
8556 resop = &res.array[8]; /* getfh new res */
8557 ngf_res = &resop->nfs_resop4_u.opgetfh;
8558
8559 /*
8560 * Update the path and filehandle for the renamed object.
8561 */
8562 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8563
8564 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8565
8566 if (res.status == NFS4_OK) {
8567 resop++; /* getattr res */
8568 e.error = nfs4_update_attrcache(res.status,
8569 &resop->nfs_resop4_u.opgetattr.ga_res,
8570 t, ovp, cr);
8571 }
8572
8573 out:
8574 kmem_free(argop, argoplist_size);
8575 if (resp)
8576 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8577 mutex_enter(&orp->r_statelock);
8578 orp->r_flags &= ~R4RECEXPFH;
8579 cv_broadcast(&orp->r_cv);
8580 mutex_exit(&orp->r_statelock);
8581
8582 return (e.error);
8583 }
8584
8585 /* ARGSUSED */
8586 static int
8587 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8588 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8589 {
8590 int error;
8591 vnode_t *vp;
8592
8593 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8594 return (EPERM);
8595 /*
8596 * As ".." has special meaning and rather than send a mkdir
8597 * over the wire to just let the server freak out, we just
8598 * short circuit it here and return EEXIST
8599 */
8600 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8601 return (EEXIST);
8602
8603 /*
8604 * Decision to get the right gid and setgid bit of the
8605 * new directory is now made in call_nfs4_create_req.
8606 */
8607 va->va_mask |= AT_MODE;
8608 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8609 if (error)
8610 return (error);
8611
8612 *vpp = vp;
8613 return (0);
8614 }
8615
8616
8617 /*
8618 * rmdir is using the same remove v4 op as does remove.
8619 * Remove requires that the current fh be the target directory.
8620 * After the operation, the current fh is unchanged.
8621 * The compound op structure is:
8622 * PUTFH(targetdir), REMOVE
8623 */
8624 /*ARGSUSED4*/
8625 static int
8626 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
8627 caller_context_t *ct, int flags)
8628 {
8629 int need_end_op = FALSE;
8630 COMPOUND4args_clnt args;
8631 COMPOUND4res_clnt res, *resp = NULL;
8632 REMOVE4res *rm_res;
8633 nfs_argop4 argop[3];
8634 nfs_resop4 *resop;
8635 vnode_t *vp;
8636 int doqueue;
8637 mntinfo4_t *mi;
8638 rnode4_t *drp;
8639 bool_t needrecov = FALSE;
8640 nfs4_recov_state_t recov_state;
8641 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8642 dirattr_info_t dinfo, *dinfop;
8643
8644 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8645 return (EPERM);
8646 /*
8647 * As ".." has special meaning and rather than send a rmdir
8648 * over the wire to just let the server freak out, we just
8649 * short circuit it here and return EEXIST
8650 */
8651 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8652 return (EEXIST);
8653
8654 drp = VTOR4(dvp);
8655 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
8656 return (EINTR);
8657
8658 /*
8659 * Attempt to prevent a rmdir(".") from succeeding.
8660 */
8661 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
8662 if (e.error) {
8663 nfs_rw_exit(&drp->r_rwlock);
8664 return (e.error);
8665 }
8666 if (vp == cdir) {
8667 VN_RELE(vp);
8668 nfs_rw_exit(&drp->r_rwlock);
8669 return (EINVAL);
8670 }
8671
8672 /*
8673 * Since nfsv4 remove op works on both files and directories,
8674 * check that the removed object is indeed a directory.
8675 */
8676 if (vp->v_type != VDIR) {
8677 VN_RELE(vp);
8678 nfs_rw_exit(&drp->r_rwlock);
8679 return (ENOTDIR);
8680 }
8681
8682 /*
8683 * First just remove the entry from the name cache, as it
8684 * is most likely an entry for this vp.
8685 */
8686 dnlc_remove(dvp, nm);
8687
8688 /*
8689 * If there vnode reference count is greater than one, then
8690 * there may be additional references in the DNLC which will
8691 * need to be purged. First, trying removing the entry for
8692 * the parent directory and see if that removes the additional
8693 * reference(s). If that doesn't do it, then use dnlc_purge_vp
8694 * to completely remove any references to the directory which
8695 * might still exist in the DNLC.
8696 */
8697 if (vp->v_count > 1) {
8698 dnlc_remove(vp, "..");
8699 if (vp->v_count > 1)
8700 dnlc_purge_vp(vp);
8701 }
8702
8703 mi = VTOMI4(dvp);
8704 recov_state.rs_flags = 0;
8705 recov_state.rs_num_retry_despite_err = 0;
8706
8707 recov_retry:
8708 args.ctag = TAG_RMDIR;
8709
8710 /*
8711 * Rmdir ops: putfh dir; remove
8712 */
8713 args.array_len = 3;
8714 args.array = argop;
8715
8716 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
8717 if (e.error) {
8718 nfs_rw_exit(&drp->r_rwlock);
8719 return (e.error);
8720 }
8721 need_end_op = TRUE;
8722
8723 /* putfh directory */
8724 argop[0].argop = OP_CPUTFH;
8725 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
8726
8727 /* remove */
8728 argop[1].argop = OP_CREMOVE;
8729 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
8730
8731 /* getattr (postop attrs for dir that contained removed dir) */
8732 argop[2].argop = OP_GETATTR;
8733 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8734 argop[2].nfs_argop4_u.opgetattr.mi = mi;
8735
8736 dinfo.di_time_call = gethrtime();
8737 doqueue = 1;
8738 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8739
8740 PURGE_ATTRCACHE4(vp);
8741
8742 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8743 if (e.error) {
8744 PURGE_ATTRCACHE4(dvp);
8745 }
8746
8747 if (needrecov) {
8748 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
8749 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
8750 if (!e.error)
8751 (void) xdr_free(xdr_COMPOUND4res_clnt,
8752 (caddr_t)&res);
8753
8754 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
8755 needrecov);
8756 need_end_op = FALSE;
8757 goto recov_retry;
8758 }
8759 }
8760
8761 if (!e.error) {
8762 resp = &res;
8763
8764 /*
8765 * Only return error if first 2 ops (OP_REMOVE or earlier)
8766 * failed.
8767 */
8768 if (res.status != NFS4_OK && res.array_len <= 2) {
8769 e.error = geterrno4(res.status);
8770 PURGE_ATTRCACHE4(dvp);
8771 nfs4_end_op(VTOMI4(dvp), dvp, NULL,
8772 &recov_state, needrecov);
8773 need_end_op = FALSE;
8774 nfs4_purge_stale_fh(e.error, dvp, cr);
8775 /*
8776 * System V defines rmdir to return EEXIST, not
8777 * ENOTEMPTY if the directory is not empty. Over
8778 * the wire, the error is NFSERR_ENOTEMPTY which
8779 * geterrno4 maps to ENOTEMPTY.
8780 */
8781 if (e.error == ENOTEMPTY)
8782 e.error = EEXIST;
8783 } else {
8784 resop = &res.array[1]; /* remove res */
8785 rm_res = &resop->nfs_resop4_u.opremove;
8786
8787 if (res.status == NFS4_OK) {
8788 resop = &res.array[2]; /* dir attrs */
8789 dinfo.di_garp =
8790 &resop->nfs_resop4_u.opgetattr.ga_res;
8791 dinfo.di_cred = cr;
8792 dinfop = &dinfo;
8793 } else
8794 dinfop = NULL;
8795
8796 /* Update dir attribute, readdir and dnlc caches */
8797 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
8798 dinfop);
8799
8800 /* destroy rddir cache for dir that was removed */
8801 if (VTOR4(vp)->r_dir != NULL)
8802 nfs4_purge_rddir_cache(vp);
8803 }
8804 }
8805
8806 if (need_end_op)
8807 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
8808
8809 nfs_rw_exit(&drp->r_rwlock);
8810
8811 if (resp)
8812 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8813
8814 if (e.error == 0) {
8815 vnode_t *tvp;
8816 rnode4_t *trp;
8817 trp = VTOR4(vp);
8818 tvp = vp;
8819 if (IS_SHADOW(vp, trp))
8820 tvp = RTOV4(trp);
8821 vnevent_rmdir(tvp, dvp, nm, ct);
8822 }
8823
8824 VN_RELE(vp);
8825
8826 return (e.error);
8827 }
8828
8829 /* ARGSUSED */
8830 static int
8831 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8832 caller_context_t *ct, int flags)
8833 {
8834 int error;
8835 vnode_t *vp;
8836 rnode4_t *rp;
8837 char *contents;
8838 mntinfo4_t *mi = VTOMI4(dvp);
8839
8840 if (nfs_zone() != mi->mi_zone)
8841 return (EPERM);
8842 if (!(mi->mi_flags & MI4_SYMLINK))
8843 return (EOPNOTSUPP);
8844
8845 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8846 if (error)
8847 return (error);
8848
8849 ASSERT(nfs4_consistent_type(vp));
8850 rp = VTOR4(vp);
8851 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8852
8853 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8854
8855 if (contents != NULL) {
8856 mutex_enter(&rp->r_statelock);
8857 if (rp->r_symlink.contents == NULL) {
8858 rp->r_symlink.len = strlen(tnm);
8859 bcopy(tnm, contents, rp->r_symlink.len);
8860 rp->r_symlink.contents = contents;
8861 rp->r_symlink.size = MAXPATHLEN;
8862 mutex_exit(&rp->r_statelock);
8863 } else {
8864 mutex_exit(&rp->r_statelock);
8865 kmem_free((void *)contents, MAXPATHLEN);
8866 }
8867 }
8868 }
8869 VN_RELE(vp);
8870
8871 return (error);
8872 }
8873
8874
8875 /*
8876 * Read directory entries.
8877 * There are some weird things to look out for here. The uio_loffset
8878 * field is either 0 or it is the offset returned from a previous
8879 * readdir. It is an opaque value used by the server to find the
8880 * correct directory block to read. The count field is the number
8881 * of blocks to read on the server. This is advisory only, the server
8882 * may return only one block's worth of entries. Entries may be compressed
8883 * on the server.
8884 */
8885 /* ARGSUSED */
8886 static int
8887 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
8888 caller_context_t *ct, int flags)
8889 {
8890 int error;
8891 uint_t count;
8892 rnode4_t *rp;
8893 rddir4_cache *rdc;
8894 rddir4_cache *rrdc;
8895
8896 if (nfs_zone() != VTOMI4(vp)->mi_zone)
8897 return (EIO);
8898 rp = VTOR4(vp);
8899
8900 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
8901
8902 /*
8903 * Make sure that the directory cache is valid.
8904 */
8905 if (rp->r_dir != NULL) {
8906 if (nfs_disable_rddir_cache != 0) {
8907 /*
8908 * Setting nfs_disable_rddir_cache in /etc/system
8909 * allows interoperability with servers that do not
8910 * properly update the attributes of directories.
8911 * Any cached information gets purged before an
8912 * access is made to it.
8913 */
8914 nfs4_purge_rddir_cache(vp);
8915 }
8916
8917 error = nfs4_validate_caches(vp, cr);
8918 if (error)
8919 return (error);
8920 }
8921
8922 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);
8923
8924 /*
8925 * Short circuit last readdir which always returns 0 bytes.
8926 * This can be done after the directory has been read through
8927 * completely at least once. This will set r_direof which
8928 * can be used to find the value of the last cookie.
8929 */
8930 mutex_enter(&rp->r_statelock);
8931 if (rp->r_direof != NULL &&
8932 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
8933 mutex_exit(&rp->r_statelock);
8934 #ifdef DEBUG
8935 nfs4_readdir_cache_shorts++;
8936 #endif
8937 if (eofp)
8938 *eofp = 1;
8939 return (0);
8940 }
8941
8942 /*
8943 * Look for a cache entry. Cache entries are identified
8944 * by the NFS cookie value and the byte count requested.
8945 */
8946 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);
8947
8948 /*
8949 * If rdc is NULL then the lookup resulted in an unrecoverable error.
8950 */
8951 if (rdc == NULL) {
8952 mutex_exit(&rp->r_statelock);
8953 return (EINTR);
8954 }
8955
8956 /*
8957 * Check to see if we need to fill this entry in.
8958 */
8959 if (rdc->flags & RDDIRREQ) {
8960 rdc->flags &= ~RDDIRREQ;
8961 rdc->flags |= RDDIR;
8962 mutex_exit(&rp->r_statelock);
8963
8964 /*
8965 * Do the readdir.
8966 */
8967 nfs4readdir(vp, rdc, cr);
8968
8969 /*
8970 * Reacquire the lock, so that we can continue
8971 */
8972 mutex_enter(&rp->r_statelock);
8973 /*
8974 * The entry is now complete
8975 */
8976 rdc->flags &= ~RDDIR;
8977 }
8978
8979 ASSERT(!(rdc->flags & RDDIR));
8980
8981 /*
8982 * If an error occurred while attempting
8983 * to fill the cache entry, mark the entry invalid and
8984 * just return the error.
8985 */
8986 if (rdc->error) {
8987 error = rdc->error;
8988 rdc->flags |= RDDIRREQ;
8989 rddir4_cache_rele(rp, rdc);
8990 mutex_exit(&rp->r_statelock);
8991 return (error);
8992 }
8993
8994 /*
8995 * The cache entry is complete and good,
8996 * copyout the dirent structs to the calling
8997 * thread.
8998 */
8999 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);
9000
9001 /*
9002 * If no error occurred during the copyout,
9003 * update the offset in the uio struct to
9004 * contain the value of the next NFS 4 cookie
9005 * and set the eof value appropriately.
9006 */
9007 if (!error) {
9008 uiop->uio_loffset = rdc->nfs4_ncookie;
9009 if (eofp)
9010 *eofp = rdc->eof;
9011 }
9012
9013 /*
9014 * Decide whether to do readahead. Don't if we
9015 * have already read to the end of directory.
9016 */
9017 if (rdc->eof) {
9018 /*
9019 * Make the entry the direof only if it is cached
9020 */
9021 if (rdc->flags & RDDIRCACHED)
9022 rp->r_direof = rdc;
9023 rddir4_cache_rele(rp, rdc);
9024 mutex_exit(&rp->r_statelock);
9025 return (error);
9026 }
9027
9028 /* Determine if a readdir readahead should be done */
9029 if (!(rp->r_flags & R4LOOKUP)) {
9030 rddir4_cache_rele(rp, rdc);
9031 mutex_exit(&rp->r_statelock);
9032 return (error);
9033 }
9034
9035 /*
9036 * Now look for a readahead entry.
9037 *
9038 * Check to see whether we found an entry for the readahead.
9039 * If so, we don't need to do anything further, so free the new
9040 * entry if one was allocated. Otherwise, allocate a new entry, add
9041 * it to the cache, and then initiate an asynchronous readdir
9042 * operation to fill it.
9043 */
9044 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);
9045
9046 /*
9047 * A readdir cache entry could not be obtained for the readahead. In
9048 * this case we skip the readahead and return.
9049 */
9050 if (rrdc == NULL) {
9051 rddir4_cache_rele(rp, rdc);
9052 mutex_exit(&rp->r_statelock);
9053 return (error);
9054 }
9055
9056 /*
9057 * Check to see if we need to fill this entry in.
9058 */
9059 if (rrdc->flags & RDDIRREQ) {
9060 rrdc->flags &= ~RDDIRREQ;
9061 rrdc->flags |= RDDIR;
9062 rddir4_cache_rele(rp, rdc);
9063 mutex_exit(&rp->r_statelock);
9064 #ifdef DEBUG
9065 nfs4_readdir_readahead++;
9066 #endif
9067 /*
9068 * Do the readdir.
9069 */
9070 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
9071 return (error);
9072 }
9073
9074 rddir4_cache_rele(rp, rrdc);
9075 rddir4_cache_rele(rp, rdc);
9076 mutex_exit(&rp->r_statelock);
9077 return (error);
9078 }
9079
9080 static int
9081 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9082 {
9083 int error;
9084 rnode4_t *rp;
9085
9086 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9087
9088 rp = VTOR4(vp);
9089
9090 /*
9091 * Obtain the readdir results for the caller.
9092 */
9093 nfs4readdir(vp, rdc, cr);
9094
9095 mutex_enter(&rp->r_statelock);
9096 /*
9097 * The entry is now complete
9098 */
9099 rdc->flags &= ~RDDIR;
9100
9101 error = rdc->error;
9102 if (error)
9103 rdc->flags |= RDDIRREQ;
9104 rddir4_cache_rele(rp, rdc);
9105 mutex_exit(&rp->r_statelock);
9106
9107 return (error);
9108 }
9109
/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 *
 * Generates the following compound request:
 * 1. If readdir offset is zero and no dnlc entry for parent exists,
 *    must include a Lookupp as well. In this case, send:
 *    { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
 * 2. Otherwise just do: { Putfh <fh>; Readdir }
 *
 * Get complete attributes and filehandles for entries if this is the
 * first read of the directory.  Otherwise, just get fileid's.
 *
 * Arguments:
 *	vp	directory being read
 *	rdc	readdir cache entry to fill; must be flagged RDDIR and
 *		must not have an entries buffer yet (both asserted)
 *	cr	credentials for the over-the-wire call
 *
 * There is no return value; failures detected here are reported
 * through rdc->error.
 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	/* default compound: PUTFH + READDIR */
	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and "..",
		 * the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 *
		 * nodeid and pnodeid are only assigned in this branch and
		 * are only read further down under the same cookie tests.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0;	/* set later by getattr parent */
			}
			/* release whatever vnode dnlc_lookup() returned */
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		/* start_fop failed, so there is no end_fop to do */
		return;
	}

	/*
	 * Determine which attrs to request for dirents.  This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 *
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		/* R4READDIRWATTR is cleared once it has been acted on */
		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory. Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * request mounted on fileid if supported, else request
		 * fileid.  maybe we should verify that fileid is supported
		 * and request something else if not.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;


	/*
	 * If count < than the minimum required, we return no entries
	 * and fail with EINVAL
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL, NULL, NULL);
		if (abort == FALSE) {
			/*
			 * Retrying: end this fop, free any results and any
			 * entries buffer from this attempt, then start over.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If compound failed first 2 ops (PUTFH+READDIR), then return
	 * failure here.  Subsequent ops are for filling out dot-dot
	 * dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm.  We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR.  However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			/* end_fop already done above; skip the out: label */
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	/* remember the server's verifier for the next READDIR */
	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 *	seek(cookie=0) -> "." entry with d_off = 1
	 *	seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	/* account for the read in the mount's I/O kstats */
	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}
9497
9498
/*
 * Perform the read or write described by the buf "bp" against the
 * file bp->b_vp.
 *
 * For B_READ bufs nfs4read() is called; otherwise nfs4write() is
 * called unless the rnode is already marked R4STALE.  In both cases
 * the credential used over the wire comes from
 * nfs4_get_otw_cred_by_osp(), and an EACCES result is retried with
 * the next credential until that routine reports last_time.
 *
 * Arguments:
 *	bp		buffer describing the transfer
 *	stab_comm	passed through to nfs4write()
 *	cr		caller's credentials
 *	readahead	passed through to nfs4read()
 *
 * Returns 0, an errno value, or NFS_EOF when a read transferred
 * nothing and the offset is at or beyond the cached file size.
 * B_ERROR is set in bp->b_flags for any non-zero result other than
 * NFS_EOF.
 */
static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	/* convert the logical block number into a byte offset */
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF.  Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			/* try again with the next credential */
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			/* never write past the cached end of file */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				/* drop this cred and try the next one */
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status.  Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations.  Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			/* rnode already stale: fail without going otw */
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	/* NFS_EOF is an indicator for the caller, not an I/O error */
	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}
9648
/*
 * File identifier request for an NFSv4 vnode.  No identifier is
 * produced: fidp is never written and the call always fails with
 * EREMOTE.
 */
/* ARGSUSED */
int
nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	return (EREMOTE);
}
9655
9656 /* ARGSUSED2 */
9657 int
9658 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9659 {
9660 rnode4_t *rp = VTOR4(vp);
9661
9662 if (!write_lock) {
9663 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9664 return (V_WRITELOCK_FALSE);
9665 }
9666
9667 if ((rp->r_flags & R4DIRECTIO) ||
9668 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9669 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9670 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9671 return (V_WRITELOCK_FALSE);
9672 nfs_rw_exit(&rp->r_rwlock);
9673 }
9674
9675 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9676 return (V_WRITELOCK_TRUE);
9677 }
9678
9679 /* ARGSUSED */
9680 void
9681 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9682 {
9683 rnode4_t *rp = VTOR4(vp);
9684
9685 nfs_rw_exit(&rp->r_rwlock);
9686 }
9687
9688 /* ARGSUSED */
9689 static int
9690 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9691 {
9692 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9693 return (EIO);
9694
9695 /*
9696 * Because we stuff the readdir cookie into the offset field
9697 * someone may attempt to do an lseek with the cookie which
9698 * we want to succeed.
9699 */
9700 if (vp->v_type == VDIR)
9701 return (0);
9702 if (*noffp < 0)
9703 return (EINVAL);
9704 return (0);
9705 }
9706
9707
9708 /*
9709 * Return all the pages from [off..off+len) in file
9710 */
9711 /* ARGSUSED */
9712 static int
9713 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9714 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9715 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9716 {
9717 rnode4_t *rp;
9718 int error;
9719 mntinfo4_t *mi;
9720
9721 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9722 return (EIO);
9723 rp = VTOR4(vp);
9724 if (IS_SHADOW(vp, rp))
9725 vp = RTOV4(rp);
9726
9727 if (vp->v_flag & VNOMAP)
9728 return (ENOSYS);
9729
9730 if (protp != NULL)
9731 *protp = PROT_ALL;
9732
9733 /*
9734 * Now validate that the caches are up to date.
9735 */
9736 if (error = nfs4_validate_caches(vp, cr))
9737 return (error);
9738
9739 mi = VTOMI4(vp);
9740 retry:
9741 mutex_enter(&rp->r_statelock);
9742
9743 /*
9744 * Don't create dirty pages faster than they
9745 * can be cleaned so that the system doesn't
9746 * get imbalanced. If the async queue is
9747 * maxed out, then wait for it to drain before
9748 * creating more dirty pages. Also, wait for
9749 * any threads doing pagewalks in the vop_getattr
9750 * entry points so that they don't block for
9751 * long periods.
9752 */
9753 if (rw == S_CREATE) {
9754 while ((mi->mi_max_threads != 0 &&
9755 rp->r_awcount > 2 * mi->mi_max_threads) ||
9756 rp->r_gcount > 0)
9757 cv_wait(&rp->r_cv, &rp->r_statelock);
9758 }
9759
9760 /*
9761 * If we are getting called as a side effect of an nfs_write()
9762 * operation the local file size might not be extended yet.
9763 * In this case we want to be able to return pages of zeroes.
9764 */
9765 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9766 NFS4_DEBUG(nfs4_pageio_debug,
9767 (CE_NOTE, "getpage beyond EOF: off=%lld, "
9768 "len=%llu, size=%llu, attrsize =%llu", off,
9769 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9770 mutex_exit(&rp->r_statelock);
9771 return (EFAULT); /* beyond EOF */
9772 }
9773
9774 mutex_exit(&rp->r_statelock);
9775
9776 if (len <= PAGESIZE) {
9777 error = nfs4_getapage(vp, off, len, protp, pl, plsz,
9778 seg, addr, rw, cr);
9779 NFS4_DEBUG(nfs4_pageio_debug && error,
9780 (CE_NOTE, "getpage error %d; off=%lld, "
9781 "len=%lld", error, off, (u_longlong_t)len));
9782 } else {
9783 error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9784 pl, plsz, seg, addr, rw, cr);
9785 NFS4_DEBUG(nfs4_pageio_debug && error,
9786 (CE_NOTE, "getpages error %d; off=%lld, "
9787 "len=%lld", error, off, (u_longlong_t)len));
9788 }
9789
9790 switch (error) {
9791 case NFS_EOF:
9792 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9793 goto retry;
9794 case ESTALE:
9795 nfs4_purge_stale_fh(error, vp, cr);
9796 }
9797
9798 return (error);
9799 }
9800
9801 /*
9802 * Called from pvn_getpages or nfs4_getpage to get a particular page.
9803 */
9804 /* ARGSUSED */
9805 static int
9806 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9807 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9808 enum seg_rw rw, cred_t *cr)
9809 {
9810 rnode4_t *rp;
9811 uint_t bsize;
9812 struct buf *bp;
9813 page_t *pp;
9814 u_offset_t lbn;
9815 u_offset_t io_off;
9816 u_offset_t blkoff;
9817 u_offset_t rablkoff;
9818 size_t io_len;
9819 uint_t blksize;
9820 int error;
9821 int readahead;
9822 int readahead_issued = 0;
9823 int ra_window; /* readahead window */
9824 page_t *pagefound;
9825 page_t *savepp;
9826
9827 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9828 return (EIO);
9829
9830 rp = VTOR4(vp);
9831 ASSERT(!IS_SHADOW(vp, rp));
9832 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9833
9834 reread:
9835 bp = NULL;
9836 pp = NULL;
9837 pagefound = NULL;
9838
9839 if (pl != NULL)
9840 pl[0] = NULL;
9841
9842 error = 0;
9843 lbn = off / bsize;
9844 blkoff = lbn * bsize;
9845
9846 /*
9847 * Queueing up the readahead before doing the synchronous read
9848 * results in a significant increase in read throughput because
9849 * of the increased parallelism between the async threads and
9850 * the process context.
9851 */
9852 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9853 rw != S_CREATE &&
9854 !(vp->v_flag & VNOCACHE)) {
9855 mutex_enter(&rp->r_statelock);
9856
9857 /*
9858 * Calculate the number of readaheads to do.
9859 * a) No readaheads at offset = 0.
9860 * b) Do maximum(nfs4_nra) readaheads when the readahead
9861 * window is closed.
9862 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9863 * upon how far the readahead window is open or close.
9864 * d) No readaheads if rp->r_nextr is not within the scope
9865 * of the readahead window (random i/o).
9866 */
9867
9868 if (off == 0)
9869 readahead = 0;
9870 else if (blkoff == rp->r_nextr)
9871 readahead = nfs4_nra;
9872 else if (rp->r_nextr > blkoff &&
9873 ((ra_window = (rp->r_nextr - blkoff) / bsize)
9874 <= (nfs4_nra - 1)))
9875 readahead = nfs4_nra - ra_window;
9876 else
9877 readahead = 0;
9878
9879 rablkoff = rp->r_nextr;
9880 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9881 mutex_exit(&rp->r_statelock);
9882 if (nfs4_async_readahead(vp, rablkoff + bsize,
9883 addr + (rablkoff + bsize - off),
9884 seg, cr, nfs4_readahead) < 0) {
9885 mutex_enter(&rp->r_statelock);
9886 break;
9887 }
9888 readahead--;
9889 rablkoff += bsize;
9890 /*
9891 * Indicate that we did a readahead so
9892 * readahead offset is not updated
9893 * by the synchronous read below.
9894 */
9895 readahead_issued = 1;
9896 mutex_enter(&rp->r_statelock);
9897 /*
9898 * set readahead offset to
9899 * offset of last async readahead
9900 * request.
9901 */
9902 rp->r_nextr = rablkoff;
9903 }
9904 mutex_exit(&rp->r_statelock);
9905 }
9906
9907 again:
9908 if ((pagefound = page_exists(vp, off)) == NULL) {
9909 if (pl == NULL) {
9910 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9911 nfs4_readahead);
9912 } else if (rw == S_CREATE) {
9913 /*
9914 * Block for this page is not allocated, or the offset
9915 * is beyond the current allocation size, or we're
9916 * allocating a swap slot and the page was not found,
9917 * so allocate it and return a zero page.
9918 */
9919 if ((pp = page_create_va(vp, off,
9920 PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9921 cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9922 io_len = PAGESIZE;
9923 mutex_enter(&rp->r_statelock);
9924 rp->r_nextr = off + PAGESIZE;
9925 mutex_exit(&rp->r_statelock);
9926 } else {
9927 /*
9928 * Need to go to server to get a block
9929 */
9930 mutex_enter(&rp->r_statelock);
9931 if (blkoff < rp->r_size &&
9932 blkoff + bsize > rp->r_size) {
9933 /*
9934 * If less than a block left in
9935 * file read less than a block.
9936 */
9937 if (rp->r_size <= off) {
9938 /*
9939 * Trying to access beyond EOF,
9940 * set up to get at least one page.
9941 */
9942 blksize = off + PAGESIZE - blkoff;
9943 } else
9944 blksize = rp->r_size - blkoff;
9945 } else if ((off == 0) ||
9946 (off != rp->r_nextr && !readahead_issued)) {
9947 blksize = PAGESIZE;
9948 blkoff = off; /* block = page here */
9949 } else
9950 blksize = bsize;
9951 mutex_exit(&rp->r_statelock);
9952
9953 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9954 &io_len, blkoff, blksize, 0);
9955
9956 /*
9957 * Some other thread has entered the page,
9958 * so just use it.
9959 */
9960 if (pp == NULL)
9961 goto again;
9962
9963 /*
9964 * Now round the request size up to page boundaries.
9965 * This ensures that the entire page will be
9966 * initialized to zeroes if EOF is encountered.
9967 */
9968 io_len = ptob(btopr(io_len));
9969
9970 bp = pageio_setup(pp, io_len, vp, B_READ);
9971 ASSERT(bp != NULL);
9972
9973 /*
9974 * pageio_setup should have set b_addr to 0. This
9975 * is correct since we want to do I/O on a page
9976 * boundary. bp_mapin will use this addr to calculate
9977 * an offset, and then set b_addr to the kernel virtual
9978 * address it allocated for us.
9979 */
9980 ASSERT(bp->b_un.b_addr == 0);
9981
9982 bp->b_edev = 0;
9983 bp->b_dev = 0;
9984 bp->b_lblkno = lbtodb(io_off);
9985 bp->b_file = vp;
9986 bp->b_offset = (offset_t)off;
9987 bp_mapin(bp);
9988
9989 /*
9990 * If doing a write beyond what we believe is EOF,
9991 * don't bother trying to read the pages from the
9992 * server, we'll just zero the pages here. We
9993 * don't check that the rw flag is S_WRITE here
9994 * because some implementations may attempt a
9995 * read access to the buffer before copying data.
9996 */
9997 mutex_enter(&rp->r_statelock);
9998 if (io_off >= rp->r_size && seg == segkmap) {
9999 mutex_exit(&rp->r_statelock);
10000 bzero(bp->b_un.b_addr, io_len);
10001 } else {
10002 mutex_exit(&rp->r_statelock);
10003 error = nfs4_bio(bp, NULL, cr, FALSE);
10004 }
10005
10006 /*
10007 * Unmap the buffer before freeing it.
10008 */
10009 bp_mapout(bp);
10010 pageio_done(bp);
10011
10012 savepp = pp;
10013 do {
10014 pp->p_fsdata = C_NOCOMMIT;
10015 } while ((pp = pp->p_next) != savepp);
10016
10017 if (error == NFS_EOF) {
10018 /*
10019 * If doing a write system call just return
10020 * zeroed pages, else user tried to get pages
10021 * beyond EOF, return error. We don't check
10022 * that the rw flag is S_WRITE here because
10023 * some implementations may attempt a read
10024 * access to the buffer before copying data.
10025 */
10026 if (seg == segkmap)
10027 error = 0;
10028 else
10029 error = EFAULT;
10030 }
10031
10032 if (!readahead_issued && !error) {
10033 mutex_enter(&rp->r_statelock);
10034 rp->r_nextr = io_off + io_len;
10035 mutex_exit(&rp->r_statelock);
10036 }
10037 }
10038 }
10039
10040 out:
10041 if (pl == NULL)
10042 return (error);
10043
10044 if (error) {
10045 if (pp != NULL)
10046 pvn_read_done(pp, B_ERROR);
10047 return (error);
10048 }
10049
10050 if (pagefound) {
10051 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10052
10053 /*
10054 * Page exists in the cache, acquire the appropriate lock.
10055 * If this fails, start all over again.
10056 */
10057 if ((pp = page_lookup(vp, off, se)) == NULL) {
10058 #ifdef DEBUG
10059 nfs4_lostpage++;
10060 #endif
10061 goto reread;
10062 }
10063 pl[0] = pp;
10064 pl[1] = NULL;
10065 return (0);
10066 }
10067
10068 if (pp != NULL)
10069 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10070
10071 return (error);
10072 }
10073
10074 static void
10075 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
10076 cred_t *cr)
10077 {
10078 int error;
10079 page_t *pp;
10080 u_offset_t io_off;
10081 size_t io_len;
10082 struct buf *bp;
10083 uint_t bsize, blksize;
10084 rnode4_t *rp = VTOR4(vp);
10085 page_t *savepp;
10086
10087 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10088
10089 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10090
10091 mutex_enter(&rp->r_statelock);
10092 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
10093 /*
10094 * If less than a block left in file read less
10095 * than a block.
10096 */
10097 blksize = rp->r_size - blkoff;
10098 } else
10099 blksize = bsize;
10100 mutex_exit(&rp->r_statelock);
10101
10102 pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
10103 &io_off, &io_len, blkoff, blksize, 1);
10104 /*
10105 * The isra flag passed to the kluster function is 1, we may have
10106 * gotten a return value of NULL for a variety of reasons (# of free
10107 * pages < minfree, someone entered the page on the vnode etc). In all
10108 * cases, we want to punt on the readahead.
10109 */
10110 if (pp == NULL)
10111 return;
10112
10113 /*
10114 * Now round the request size up to page boundaries.
10115 * This ensures that the entire page will be
10116 * initialized to zeroes if EOF is encountered.
10117 */
10118 io_len = ptob(btopr(io_len));
10119
10120 bp = pageio_setup(pp, io_len, vp, B_READ);
10121 ASSERT(bp != NULL);
10122
10123 /*
10124 * pageio_setup should have set b_addr to 0. This is correct since
10125 * we want to do I/O on a page boundary. bp_mapin() will use this addr
10126 * to calculate an offset, and then set b_addr to the kernel virtual
10127 * address it allocated for us.
10128 */
10129 ASSERT(bp->b_un.b_addr == 0);
10130
10131 bp->b_edev = 0;
10132 bp->b_dev = 0;
10133 bp->b_lblkno = lbtodb(io_off);
10134 bp->b_file = vp;
10135 bp->b_offset = (offset_t)blkoff;
10136 bp_mapin(bp);
10137
10138 /*
10139 * If doing a write beyond what we believe is EOF, don't bother trying
10140 * to read the pages from the server, we'll just zero the pages here.
10141 * We don't check that the rw flag is S_WRITE here because some
10142 * implementations may attempt a read access to the buffer before
10143 * copying data.
10144 */
10145 mutex_enter(&rp->r_statelock);
10146 if (io_off >= rp->r_size && seg == segkmap) {
10147 mutex_exit(&rp->r_statelock);
10148 bzero(bp->b_un.b_addr, io_len);
10149 error = 0;
10150 } else {
10151 mutex_exit(&rp->r_statelock);
10152 error = nfs4_bio(bp, NULL, cr, TRUE);
10153 if (error == NFS_EOF)
10154 error = 0;
10155 }
10156
10157 /*
10158 * Unmap the buffer before freeing it.
10159 */
10160 bp_mapout(bp);
10161 pageio_done(bp);
10162
10163 savepp = pp;
10164 do {
10165 pp->p_fsdata = C_NOCOMMIT;
10166 } while ((pp = pp->p_next) != savepp);
10167
10168 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
10169
10170 /*
10171 * In case of error set readahead offset
10172 * to the lowest offset.
10173 * pvn_read_done() calls VN_DISPOSE to destroy the pages
10174 */
10175 if (error && rp->r_nextr > io_off) {
10176 mutex_enter(&rp->r_statelock);
10177 if (rp->r_nextr > io_off)
10178 rp->r_nextr = io_off;
10179 mutex_exit(&rp->r_statelock);
10180 }
10181 }
10182
10183 /*
10184 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10185 * If len == 0, do from off to EOF.
10186 *
10187 * The normal cases should be len == 0 && off == 0 (entire vp list) or
10188 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10189 * (from pageout).
10190 */
10191 /* ARGSUSED */
10192 static int
10193 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10194 caller_context_t *ct)
10195 {
10196 int error;
10197 rnode4_t *rp;
10198
10199 ASSERT(cr != NULL);
10200
10201 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10202 return (EIO);
10203
10204 rp = VTOR4(vp);
10205 if (IS_SHADOW(vp, rp))
10206 vp = RTOV4(rp);
10207
10208 /*
10209 * XXX - Why should this check be made here?
10210 */
10211 if (vp->v_flag & VNOMAP)
10212 return (ENOSYS);
10213
10214 if (len == 0 && !(flags & B_INVAL) &&
10215 (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10216 return (0);
10217
10218 mutex_enter(&rp->r_statelock);
10219 rp->r_count++;
10220 mutex_exit(&rp->r_statelock);
10221 error = nfs4_putpages(vp, off, len, flags, cr);
10222 mutex_enter(&rp->r_statelock);
10223 rp->r_count--;
10224 cv_broadcast(&rp->r_cv);
10225 mutex_exit(&rp->r_statelock);
10226
10227 return (error);
10228 }
10229
10230 /*
10231 * Write out a single page, possibly klustering adjacent dirty pages.
10232 */
10233 int
10234 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
10235 int flags, cred_t *cr)
10236 {
10237 u_offset_t io_off;
10238 u_offset_t lbn_off;
10239 u_offset_t lbn;
10240 size_t io_len;
10241 uint_t bsize;
10242 int error;
10243 rnode4_t *rp;
10244
10245 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
10246 ASSERT(pp != NULL);
10247 ASSERT(cr != NULL);
10248 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
10249
10250 rp = VTOR4(vp);
10251 ASSERT(rp->r_count > 0);
10252 ASSERT(!IS_SHADOW(vp, rp));
10253
10254 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10255 lbn = pp->p_offset / bsize;
10256 lbn_off = lbn * bsize;
10257
10258 /*
10259 * Find a kluster that fits in one block, or in
10260 * one page if pages are bigger than blocks. If
10261 * there is less file space allocated than a whole
10262 * page, we'll shorten the i/o request below.
10263 */
10264 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10265 roundup(bsize, PAGESIZE), flags);
10266
10267 /*
10268 * pvn_write_kluster shouldn't have returned a page with offset
10269 * behind the original page we were given. Verify that.
10270 */
10271 ASSERT((pp->p_offset / bsize) >= lbn);
10272
10273 /*
10274 * Now pp will have the list of kept dirty pages marked for
10275 * write back. It will also handle invalidation and freeing
10276 * of pages that are not dirty. Check for page length rounding
10277 * problems.
10278 */
10279 if (io_off + io_len > lbn_off + bsize) {
10280 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10281 io_len = lbn_off + bsize - io_off;
10282 }
10283 /*
10284 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10285 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10286 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10287 * progress and the r_size has not been made consistent with the
10288 * new size of the file. When the uiomove() completes the r_size is
10289 * updated and the R4MODINPROGRESS flag is cleared.
10290 *
10291 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10292 * consistent value of r_size. Without this handshaking, it is
10293 * possible that nfs4_bio() picks up the old value of r_size
10294 * before the uiomove() in writerp4() completes. This will result
10295 * in the write through nfs4_bio() being dropped.
10296 *
10297 * More precisely, there is a window between the time the uiomove()
10298 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10299 * operation intervenes in this window, the page will be picked up,
10300 * because it is dirty (it will be unlocked, unless it was
10301 * pagecreate'd). When the page is picked up as dirty, the dirty
10302 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10303 * checked. This will still be the old size. Therefore the page will
10304 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10305 * the page will be found to be clean and the write will be dropped.
10306 */
10307 if (rp->r_flags & R4MODINPROGRESS) {
10308 mutex_enter(&rp->r_statelock);
10309 if ((rp->r_flags & R4MODINPROGRESS) &&
10310 rp->r_modaddr + MAXBSIZE > io_off &&
10311 rp->r_modaddr < io_off + io_len) {
10312 page_t *plist;
10313 /*
10314 * A write is in progress for this region of the file.
10315 * If we did not detect R4MODINPROGRESS here then this
10316 * path through nfs_putapage() would eventually go to
10317 * nfs4_bio() and may not write out all of the data
10318 * in the pages. We end up losing data. So we decide
10319 * to set the modified bit on each page in the page
10320 * list and mark the rnode with R4DIRTY. This write
10321 * will be restarted at some later time.
10322 */
10323 plist = pp;
10324 while (plist != NULL) {
10325 pp = plist;
10326 page_sub(&plist, pp);
10327 hat_setmod(pp);
10328 page_io_unlock(pp);
10329 page_unlock(pp);
10330 }
10331 rp->r_flags |= R4DIRTY;
10332 mutex_exit(&rp->r_statelock);
10333 if (offp)
10334 *offp = io_off;
10335 if (lenp)
10336 *lenp = io_len;
10337 return (0);
10338 }
10339 mutex_exit(&rp->r_statelock);
10340 }
10341
10342 if (flags & B_ASYNC) {
10343 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10344 nfs4_sync_putapage);
10345 } else
10346 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10347
10348 if (offp)
10349 *offp = io_off;
10350 if (lenp)
10351 *lenp = io_len;
10352 return (error);
10353 }
10354
10355 static int
10356 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
10357 int flags, cred_t *cr)
10358 {
10359 int error;
10360 rnode4_t *rp;
10361
10362 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10363
10364 flags |= B_WRITE;
10365
10366 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
10367
10368 rp = VTOR4(vp);
10369
10370 if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
10371 error == EACCES) &&
10372 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
10373 if (!(rp->r_flags & R4OUTOFSPACE)) {
10374 mutex_enter(&rp->r_statelock);
10375 rp->r_flags |= R4OUTOFSPACE;
10376 mutex_exit(&rp->r_statelock);
10377 }
10378 flags |= B_ERROR;
10379 pvn_write_done(pp, flags);
10380 /*
10381 * If this was not an async thread, then try again to
10382 * write out the pages, but this time, also destroy
10383 * them whether or not the write is successful. This
10384 * will prevent memory from filling up with these
10385 * pages and destroying them is the only alternative
10386 * if they can't be written out.
10387 *
10388 * Don't do this if this is an async thread because
10389 * when the pages are unlocked in pvn_write_done,
10390 * some other thread could have come along, locked
10391 * them, and queued for an async thread. It would be
10392 * possible for all of the async threads to be tied
10393 * up waiting to lock the pages again and they would
10394 * all already be locked and waiting for an async
10395 * thread to handle them. Deadlock.
10396 */
10397 if (!(flags & B_ASYNC)) {
10398 error = nfs4_putpage(vp, io_off, io_len,
10399 B_INVAL | B_FORCE, cr, NULL);
10400 }
10401 } else {
10402 if (error)
10403 flags |= B_ERROR;
10404 else if (rp->r_flags & R4OUTOFSPACE) {
10405 mutex_enter(&rp->r_statelock);
10406 rp->r_flags &= ~R4OUTOFSPACE;
10407 mutex_exit(&rp->r_statelock);
10408 }
10409 pvn_write_done(pp, flags);
10410 if (freemem < desfree)
10411 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
10412 NFS4_WRITE_NOWAIT);
10413 }
10414
10415 return (error);
10416 }
10417
10418 #ifdef DEBUG
10419 int nfs4_force_open_before_mmap = 0;
10420 #endif
10421
10422 /* ARGSUSED */
10423 static int
10424 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10425 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10426 caller_context_t *ct)
10427 {
10428 struct segvn_crargs vn_a;
10429 int error = 0;
10430 rnode4_t *rp = VTOR4(vp);
10431 mntinfo4_t *mi = VTOMI4(vp);
10432
10433 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10434 return (EIO);
10435
10436 if (vp->v_flag & VNOMAP)
10437 return (ENOSYS);
10438
10439 if (off < 0 || (off + len) < 0)
10440 return (ENXIO);
10441
10442 if (vp->v_type != VREG)
10443 return (ENODEV);
10444
10445 /*
10446 * If the file is delegated to the client don't do anything.
10447 * If the file is not delegated, then validate the data cache.
10448 */
10449 mutex_enter(&rp->r_statev4_lock);
10450 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10451 mutex_exit(&rp->r_statev4_lock);
10452 error = nfs4_validate_caches(vp, cr);
10453 if (error)
10454 return (error);
10455 } else {
10456 mutex_exit(&rp->r_statev4_lock);
10457 }
10458
10459 /*
10460 * Check to see if the vnode is currently marked as not cachable.
10461 * This means portions of the file are locked (through VOP_FRLOCK).
10462 * In this case the map request must be refused. We use
10463 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10464 *
10465 * Atomically increment r_inmap after acquiring r_rwlock. The
10466 * idea here is to acquire r_rwlock to block read/write and
10467 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10468 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10469 * and we can prevent the deadlock that would have occurred
10470 * when nfs4_addmap() would have acquired it out of order.
10471 *
10472 * Since we are not protecting r_inmap by any lock, we do not
10473 * hold any lock when we decrement it. We atomically decrement
10474 * r_inmap after we release r_lkserlock.
10475 */
10476
10477 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10478 return (EINTR);
10479 atomic_add_int(&rp->r_inmap, 1);
10480 nfs_rw_exit(&rp->r_rwlock);
10481
10482 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10483 atomic_add_int(&rp->r_inmap, -1);
10484 return (EINTR);
10485 }
10486
10487
10488 if (vp->v_flag & VNOCACHE) {
10489 error = EAGAIN;
10490 goto done;
10491 }
10492
10493 /*
10494 * Don't allow concurrent locks and mapping if mandatory locking is
10495 * enabled.
10496 */
10497 if (flk_has_remote_locks(vp)) {
10498 struct vattr va;
10499 va.va_mask = AT_MODE;
10500 error = nfs4getattr(vp, &va, cr);
10501 if (error != 0)
10502 goto done;
10503 if (MANDLOCK(vp, va.va_mode)) {
10504 error = EAGAIN;
10505 goto done;
10506 }
10507 }
10508
10509 /*
10510 * It is possible that the rnode has a lost lock request that we
10511 * are still trying to recover, and that the request conflicts with
10512 * this map request.
10513 *
10514 * An alternative approach would be for nfs4_safemap() to consider
10515 * queued lock requests when deciding whether to set or clear
10516 * VNOCACHE. This would require the frlock code path to call
10517 * nfs4_safemap() after enqueing a lost request.
10518 */
10519 if (nfs4_map_lost_lock_conflict(vp)) {
10520 error = EAGAIN;
10521 goto done;
10522 }
10523
10524 as_rangelock(as);
10525 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10526 if (error != 0) {
10527 as_rangeunlock(as);
10528 goto done;
10529 }
10530
10531 if (vp->v_type == VREG) {
10532 /*
10533 * We need to retrieve the open stream
10534 */
10535 nfs4_open_stream_t *osp = NULL;
10536 nfs4_open_owner_t *oop = NULL;
10537
10538 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10539 if (oop != NULL) {
10540 /* returns with 'os_sync_lock' held */
10541 osp = find_open_stream(oop, rp);
10542 open_owner_rele(oop);
10543 }
10544 if (osp == NULL) {
10545 #ifdef DEBUG
10546 if (nfs4_force_open_before_mmap) {
10547 error = EIO;
10548 goto done;
10549 }
10550 #endif
10551 /* returns with 'os_sync_lock' held */
10552 error = open_and_get_osp(vp, cr, &osp);
10553 if (osp == NULL) {
10554 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10555 "nfs4_map: we tried to OPEN the file "
10556 "but again no osp, so fail with EIO"));
10557 goto done;
10558 }
10559 }
10560
10561 if (osp->os_failed_reopen) {
10562 mutex_exit(&osp->os_sync_lock);
10563 open_stream_rele(osp, rp);
10564 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10565 "nfs4_map: os_failed_reopen set on "
10566 "osp %p, cr %p, rp %s", (void *)osp,
10567 (void *)cr, rnode4info(rp)));
10568 error = EIO;
10569 goto done;
10570 }
10571 mutex_exit(&osp->os_sync_lock);
10572 open_stream_rele(osp, rp);
10573 }
10574
10575 vn_a.vp = vp;
10576 vn_a.offset = off;
10577 vn_a.type = (flags & MAP_TYPE);
10578 vn_a.prot = (uchar_t)prot;
10579 vn_a.maxprot = (uchar_t)maxprot;
10580 vn_a.flags = (flags & ~MAP_TYPE);
10581 vn_a.cred = cr;
10582 vn_a.amp = NULL;
10583 vn_a.szc = 0;
10584 vn_a.lgrp_mem_policy_flags = 0;
10585
10586 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10587 as_rangeunlock(as);
10588
10589 done:
10590 nfs_rw_exit(&rp->r_lkserlock);
10591 atomic_add_int(&rp->r_inmap, -1);
10592 return (error);
10593 }
10594
10595 /*
10596 * We're most likely dealing with a kernel module that likes to READ
10597 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
10598 * officially OPEN the file to create the necessary client state
10599 * for bookkeeping of os_mmap_read/write counts.
10600 *
10601 * Since VOP_MAP only passes in a pointer to the vnode rather than
10602 * a double pointer, we can't handle the case where nfs4open_otw()
10603 * returns a different vnode than the one passed into VOP_MAP (since
10604 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10605 * we return NULL and let nfs4_map() fail. Note: the only case where
10606 * this should happen is if the file got removed and replaced with the
10607 * same name on the server (in addition to the fact that we're trying
10608 * to VOP_MAP withouth VOP_OPENing the file in the first place).
10609 */
10610 static int
10611 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
10612 {
10613 rnode4_t *rp, *drp;
10614 vnode_t *dvp, *open_vp;
10615 char file_name[MAXNAMELEN];
10616 int just_created;
10617 nfs4_open_stream_t *osp;
10618 nfs4_open_owner_t *oop;
10619 int error;
10620
10621 *ospp = NULL;
10622 open_vp = map_vp;
10623
10624 rp = VTOR4(open_vp);
10625 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
10626 return (error);
10627 drp = VTOR4(dvp);
10628
10629 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
10630 VN_RELE(dvp);
10631 return (EINTR);
10632 }
10633
10634 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
10635 nfs_rw_exit(&drp->r_rwlock);
10636 VN_RELE(dvp);
10637 return (error);
10638 }
10639
10640 mutex_enter(&rp->r_statev4_lock);
10641 if (rp->created_v4) {
10642 rp->created_v4 = 0;
10643 mutex_exit(&rp->r_statev4_lock);
10644
10645 dnlc_update(dvp, file_name, open_vp);
10646 /* This is needed so we don't bump the open ref count */
10647 just_created = 1;
10648 } else {
10649 mutex_exit(&rp->r_statev4_lock);
10650 just_created = 0;
10651 }
10652
10653 VN_HOLD(map_vp);
10654
10655 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10656 just_created);
10657 if (error) {
10658 nfs_rw_exit(&drp->r_rwlock);
10659 VN_RELE(dvp);
10660 VN_RELE(map_vp);
10661 return (error);
10662 }
10663
10664 nfs_rw_exit(&drp->r_rwlock);
10665 VN_RELE(dvp);
10666
10667 /*
10668 * If nfs4open_otw() returned a different vnode then "undo"
10669 * the open and return failure to the caller.
10670 */
10671 if (!VN_CMP(open_vp, map_vp)) {
10672 nfs4_error_t e;
10673
10674 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10675 "open returned a different vnode"));
10676 /*
10677 * If there's an error, ignore it,
10678 * and let VOP_INACTIVE handle it.
10679 */
10680 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10681 CLOSE_NORM, 0, 0, 0);
10682 VN_RELE(map_vp);
10683 return (EIO);
10684 }
10685
10686 VN_RELE(map_vp);
10687
10688 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10689 if (!oop) {
10690 nfs4_error_t e;
10691
10692 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10693 "no open owner"));
10694 /*
10695 * If there's an error, ignore it,
10696 * and let VOP_INACTIVE handle it.
10697 */
10698 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10699 CLOSE_NORM, 0, 0, 0);
10700 return (EIO);
10701 }
10702 osp = find_open_stream(oop, rp);
10703 open_owner_rele(oop);
10704 *ospp = osp;
10705 return (0);
10706 }
10707
10708 /*
10709 * Please be aware that when this function is called, the address space write
10710 * a_lock is held. Do not put over the wire calls in this function.
10711 */
10712 /* ARGSUSED */
10713 static int
10714 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10715 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10716 caller_context_t *ct)
10717 {
10718 rnode4_t *rp;
10719 int error = 0;
10720 mntinfo4_t *mi;
10721
10722 mi = VTOMI4(vp);
10723 rp = VTOR4(vp);
10724
10725 if (nfs_zone() != mi->mi_zone)
10726 return (EIO);
10727 if (vp->v_flag & VNOMAP)
10728 return (ENOSYS);
10729
10730 /*
10731 * Don't need to update the open stream first, since this
10732 * mmap can't add any additional share access that isn't
10733 * already contained in the open stream (for the case where we
10734 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
10735 * take into account os_mmap_read[write] counts).
10736 */
10737 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10738
10739 if (vp->v_type == VREG) {
10740 /*
10741 * We need to retrieve the open stream and update the counts.
10742 * If there is no open stream here, something is wrong.
10743 */
10744 nfs4_open_stream_t *osp = NULL;
10745 nfs4_open_owner_t *oop = NULL;
10746
10747 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10748 if (oop != NULL) {
10749 /* returns with 'os_sync_lock' held */
10750 osp = find_open_stream(oop, rp);
10751 open_owner_rele(oop);
10752 }
10753 if (osp == NULL) {
10754 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10755 "nfs4_addmap: we should have an osp"
10756 "but we don't, so fail with EIO"));
10757 error = EIO;
10758 goto out;
10759 }
10760
10761 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10762 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10763
10764 /*
10765 * Update the map count in the open stream.
10766 * This is necessary in the case where we
10767 * open/mmap/close/, then the server reboots, and we
10768 * attempt to reopen. If the mmap doesn't add share
10769 * access then we send an invalid reopen with
10770 * access = NONE.
10771 *
10772 * We need to specifically check each PROT_* so a mmap
10773 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
10774 * read and write access. A simple comparison of prot
10775 * to ~PROT_WRITE to determine read access is insufficient
10776 * since prot can be |= with PROT_USER, etc.
10777 */
10778
10779 /*
10780 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10781 */
10782 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10783 osp->os_mmap_write += btopr(len);
10784 if (maxprot & PROT_READ)
10785 osp->os_mmap_read += btopr(len);
10786 if (maxprot & PROT_EXEC)
10787 osp->os_mmap_read += btopr(len);
10788 /*
10789 * Ensure that os_mmap_read gets incremented, even if
10790 * maxprot were to look like PROT_NONE.
10791 */
10792 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10793 !(maxprot & PROT_EXEC))
10794 osp->os_mmap_read += btopr(len);
10795 osp->os_mapcnt += btopr(len);
10796 mutex_exit(&osp->os_sync_lock);
10797 open_stream_rele(osp, rp);
10798 }
10799
10800 out:
10801 /*
10802 * If we got an error, then undo our
10803 * incrementing of 'r_mapcnt'.
10804 */
10805
10806 if (error) {
10807 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10808 ASSERT(rp->r_mapcnt >= 0);
10809 }
10810 return (error);
10811 }
10812
10813 /* ARGSUSED */
10814 static int
10815 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10816 {
10817
10818 return (VTOR4(vp1) == VTOR4(vp2));
10819 }
10820
10821 /* ARGSUSED */
10822 static int
10823 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10824 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
10825 caller_context_t *ct)
10826 {
10827 int rc;
10828 u_offset_t start, end;
10829 rnode4_t *rp;
10830 int error = 0, intr = INTR4(vp);
10831 nfs4_error_t e;
10832
10833 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10834 return (EIO);
10835
10836 /* check for valid cmd parameter */
10837 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
10838 return (EINVAL);
10839
10840 /* Verify l_type. */
10841 switch (bfp->l_type) {
10842 case F_RDLCK:
10843 if (cmd != F_GETLK && !(flag & FREAD))
10844 return (EBADF);
10845 break;
10846 case F_WRLCK:
10847 if (cmd != F_GETLK && !(flag & FWRITE))
10848 return (EBADF);
10849 break;
10850 case F_UNLCK:
10851 intr = 0;
10852 break;
10853
10854 default:
10855 return (EINVAL);
10856 }
10857
10858 /* check the validity of the lock range */
10859 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
10860 return (rc);
10861 if (rc = flk_check_lock_data(start, end, MAXEND))
10862 return (rc);
10863
10864 /*
10865 * If the filesystem is mounted using local locking, pass the
10866 * request off to the local locking code.
10867 */
10868 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
10869 if (cmd == F_SETLK || cmd == F_SETLKW) {
10870 /*
10871 * For complete safety, we should be holding
10872 * r_lkserlock. However, we can't call
10873 * nfs4_safelock and then fs_frlock while
10874 * holding r_lkserlock, so just invoke
10875 * nfs4_safelock and expect that this will
10876 * catch enough of the cases.
10877 */
10878 if (!nfs4_safelock(vp, bfp, cr))
10879 return (EAGAIN);
10880 }
10881 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
10882 }
10883
10884 rp = VTOR4(vp);
10885
10886 /*
10887 * Check whether the given lock request can proceed, given the
10888 * current file mappings.
10889 */
10890 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
10891 return (EINTR);
10892 if (cmd == F_SETLK || cmd == F_SETLKW) {
10893 if (!nfs4_safelock(vp, bfp, cr)) {
10894 rc = EAGAIN;
10895 goto done;
10896 }
10897 }
10898
10899 /*
10900 * Flush the cache after waiting for async I/O to finish. For new
10901 * locks, this is so that the process gets the latest bits from the
10902 * server. For unlocks, this is so that other clients see the
10903 * latest bits once the file has been unlocked. If currently dirty
10904 * pages can't be flushed, then don't allow a lock to be set. But
10905 * allow unlocks to succeed, to avoid having orphan locks on the
10906 * server.
10907 */
10908 if (cmd != F_GETLK) {
10909 mutex_enter(&rp->r_statelock);
10910 while (rp->r_count > 0) {
10911 if (intr) {
10912 klwp_t *lwp = ttolwp(curthread);
10913
10914 if (lwp != NULL)
10915 lwp->lwp_nostop++;
10916 if (cv_wait_sig(&rp->r_cv,
10917 &rp->r_statelock) == 0) {
10918 if (lwp != NULL)
10919 lwp->lwp_nostop--;
10920 rc = EINTR;
10921 break;
10922 }
10923 if (lwp != NULL)
10924 lwp->lwp_nostop--;
10925 } else
10926 cv_wait(&rp->r_cv, &rp->r_statelock);
10927 }
10928 mutex_exit(&rp->r_statelock);
10929 if (rc != 0)
10930 goto done;
10931 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
10932 if (error) {
10933 if (error == ENOSPC || error == EDQUOT) {
10934 mutex_enter(&rp->r_statelock);
10935 if (!rp->r_error)
10936 rp->r_error = error;
10937 mutex_exit(&rp->r_statelock);
10938 }
10939 if (bfp->l_type != F_UNLCK) {
10940 rc = ENOLCK;
10941 goto done;
10942 }
10943 }
10944 }
10945
10946 /*
10947 * Call the lock manager to do the real work of contacting
10948 * the server and obtaining the lock.
10949 */
10950 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
10951 cr, &e, NULL, NULL);
10952 rc = e.error;
10953
10954 if (rc == 0)
10955 nfs4_lockcompletion(vp, cmd);
10956
10957 done:
10958 nfs_rw_exit(&rp->r_lkserlock);
10959
10960 return (rc);
10961 }
10962
10963 /*
10964 * Free storage space associated with the specified vnode. The portion
10965 * to be freed is specified by bfp->l_start and bfp->l_len (already
10966 * normalized to a "whence" of 0).
10967 *
10968 * This is an experimental facility whose continued existence is not
10969 * guaranteed. Currently, we only support the special case
10970 * of l_len == 0, meaning free to end of file.
10971 */
10972 /* ARGSUSED */
10973 static int
10974 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10975 offset_t offset, cred_t *cr, caller_context_t *ct)
10976 {
10977 int error;
10978
10979 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10980 return (EIO);
10981 ASSERT(vp->v_type == VREG);
10982 if (cmd != F_FREESP)
10983 return (EINVAL);
10984
10985 error = convoff(vp, bfp, 0, offset);
10986 if (!error) {
10987 ASSERT(bfp->l_start >= 0);
10988 if (bfp->l_len == 0) {
10989 struct vattr va;
10990
10991 va.va_mask = AT_SIZE;
10992 va.va_size = bfp->l_start;
10993 error = nfs4setattr(vp, &va, 0, cr, NULL);
10994 } else
10995 error = EINVAL;
10996 }
10997
10998 return (error);
10999 }
11000
11001 /* ARGSUSED */
11002 int
11003 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11004 {
11005 rnode4_t *rp;
11006 rp = VTOR4(vp);
11007
11008 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11009 vp = RTOV4(rp);
11010 }
11011 *vpp = vp;
11012 return (0);
11013 }
11014
11015 /*
11016 * Setup and add an address space callback to do the work of the delmap call.
11017 * The callback will (and must be) deleted in the actual callback function.
11018 *
11019 * This is done in order to take care of the problem that we have with holding
11020 * the address space's a_lock for a long period of time (e.g. if the NFS server
11021 * is down). Callbacks will be executed in the address space code while the
11022 * a_lock is not held. Holding the address space's a_lock causes things such
11023 * as ps and fork to hang because they are trying to acquire this lock as well.
11024 */
11025 /* ARGSUSED */
11026 static int
11027 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
11028 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
11029 caller_context_t *ct)
11030 {
11031 int caller_found;
11032 int error;
11033 rnode4_t *rp;
11034 nfs4_delmap_args_t *dmapp;
11035 nfs4_delmapcall_t *delmap_call;
11036
11037 if (vp->v_flag & VNOMAP)
11038 return (ENOSYS);
11039
11040 /*
11041 * A process may not change zones if it has NFS pages mmap'ed
11042 * in, so we can't legitimately get here from the wrong zone.
11043 */
11044 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11045
11046 rp = VTOR4(vp);
11047
11048 /*
11049 * The way that the address space of this process deletes its mapping
11050 * of this file is via the following call chains:
11051 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11052 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11053 *
11054 * With the use of address space callbacks we are allowed to drop the
11055 * address space lock, a_lock, while executing the NFS operations that
11056 * need to go over the wire. Returning EAGAIN to the caller of this
11057 * function is what drives the execution of the callback that we add
11058 * below. The callback will be executed by the address space code
11059 * after dropping the a_lock. When the callback is finished, since
11060 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
11061 * is called again on the same segment to finish the rest of the work
11062 * that needs to happen during unmapping.
11063 *
11064 * This action of calling back into the segment driver causes
11065 * nfs4_delmap() to get called again, but since the callback was
11066 * already executed at this point, it already did the work and there
11067 * is nothing left for us to do.
11068 *
11069 * To Summarize:
11070 * - The first time nfs4_delmap is called by the current thread is when
11071 * we add the caller associated with this delmap to the delmap caller
11072 * list, add the callback, and return EAGAIN.
11073 * - The second time in this call chain when nfs4_delmap is called we
11074 * will find this caller in the delmap caller list and realize there
11075 * is no more work to do thus removing this caller from the list and
11076 * returning the error that was set in the callback execution.
11077 */
11078 caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
11079 if (caller_found) {
11080 /*
11081 * 'error' is from the actual delmap operations. To avoid
11082 * hangs, we need to handle the return of EAGAIN differently
11083 * since this is what drives the callback execution.
11084 * In this case, we don't want to return EAGAIN and do the
11085 * callback execution because there are none to execute.
11086 */
11087 if (error == EAGAIN)
11088 return (0);
11089 else
11090 return (error);
11091 }
11092
11093 /* current caller was not in the list */
11094 delmap_call = nfs4_init_delmapcall();
11095
11096 mutex_enter(&rp->r_statelock);
11097 list_insert_tail(&rp->r_indelmap, delmap_call);
11098 mutex_exit(&rp->r_statelock);
11099
11100 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);
11101
11102 dmapp->vp = vp;
11103 dmapp->off = off;
11104 dmapp->addr = addr;
11105 dmapp->len = len;
11106 dmapp->prot = prot;
11107 dmapp->maxprot = maxprot;
11108 dmapp->flags = flags;
11109 dmapp->cr = cr;
11110 dmapp->caller = delmap_call;
11111
11112 error = as_add_callback(as, nfs4_delmap_callback, dmapp,
11113 AS_UNMAP_EVENT, addr, len, KM_SLEEP);
11114
11115 return (error ? error : EAGAIN);
11116 }
11117
11118 static nfs4_delmapcall_t *
11119 nfs4_init_delmapcall()
11120 {
11121 nfs4_delmapcall_t *delmap_call;
11122
11123 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11124 delmap_call->call_id = curthread;
11125 delmap_call->error = 0;
11126
11127 return (delmap_call);
11128 }
11129
11130 static void
11131 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
11132 {
11133 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
11134 }
11135
11136 /*
11137 * Searches for the current delmap caller (based on curthread) in the list of
11138 * callers. If it is found, we remove it and free the delmap caller.
11139 * Returns:
11140 * 0 if the caller wasn't found
11141 * 1 if the caller was found, removed and freed. *errp will be set
11142 * to what the result of the delmap was.
11143 */
11144 static int
11145 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11146 {
11147 nfs4_delmapcall_t *delmap_call;
11148
11149 /*
11150 * If the list doesn't exist yet, we create it and return
11151 * that the caller wasn't found. No list = no callers.
11152 */
11153 mutex_enter(&rp->r_statelock);
11154 if (!(rp->r_flags & R4DELMAPLIST)) {
11155 /* The list does not exist */
11156 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11157 offsetof(nfs4_delmapcall_t, call_node));
11158 rp->r_flags |= R4DELMAPLIST;
11159 mutex_exit(&rp->r_statelock);
11160 return (0);
11161 } else {
11162 /* The list exists so search it */
11163 for (delmap_call = list_head(&rp->r_indelmap);
11164 delmap_call != NULL;
11165 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11166 if (delmap_call->call_id == curthread) {
11167 /* current caller is in the list */
11168 *errp = delmap_call->error;
11169 list_remove(&rp->r_indelmap, delmap_call);
11170 mutex_exit(&rp->r_statelock);
11171 nfs4_free_delmapcall(delmap_call);
11172 return (1);
11173 }
11174 }
11175 }
11176 mutex_exit(&rp->r_statelock);
11177 return (0);
11178 }
11179
11180 /*
11181 * Remove some pages from an mmap'd vnode. Just update the
11182 * count of pages. If doing close-to-open, then flush and
11183 * commit all of the pages associated with this file.
11184 * Otherwise, start an asynchronous page flush to write out
11185 * any dirty pages. This will also associate a credential
11186 * with the rnode which can be used to write the pages.
11187 */
11188 /* ARGSUSED */
11189 static void
11190 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
11191 {
11192 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11193 rnode4_t *rp;
11194 mntinfo4_t *mi;
11195 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;
11196
11197 rp = VTOR4(dmapp->vp);
11198 mi = VTOMI4(dmapp->vp);
11199
11200 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
11201 ASSERT(rp->r_mapcnt >= 0);
11202
11203 /*
11204 * Initiate a page flush and potential commit if there are
11205 * pages, the file system was not mounted readonly, the segment
11206 * was mapped shared, and the pages themselves were writeable.
11207 */
11208 if (nfs4_has_pages(dmapp->vp) &&
11209 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
11210 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
11211 mutex_enter(&rp->r_statelock);
11212 rp->r_flags |= R4DIRTY;
11213 mutex_exit(&rp->r_statelock);
11214 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
11215 dmapp->len, dmapp->cr);
11216 if (!e.error) {
11217 mutex_enter(&rp->r_statelock);
11218 e.error = rp->r_error;
11219 rp->r_error = 0;
11220 mutex_exit(&rp->r_statelock);
11221 }
11222 } else
11223 e.error = 0;
11224
11225 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
11226 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
11227 B_INVAL, dmapp->cr, NULL);
11228
11229 if (e.error) {
11230 e.stat = puterrno4(e.error);
11231 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11232 OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
11233 dmapp->caller->error = e.error;
11234 }
11235
11236 /* Check to see if we need to close the file */
11237
11238 if (dmapp->vp->v_type == VREG) {
11239 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
11240 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
11241
11242 if (e.error != 0 || e.stat != NFS4_OK) {
11243 /*
11244 * Since it is possible that e.error == 0 and
11245 * e.stat != NFS4_OK (and vice versa),
11246 * we do the proper checking in order to get both
11247 * e.error and e.stat reporting the correct info.
11248 */
11249 if (e.stat == NFS4_OK)
11250 e.stat = puterrno4(e.error);
11251 if (e.error == 0)
11252 e.error = geterrno4(e.stat);
11253
11254 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11255 OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11256 dmapp->caller->error = e.error;
11257 }
11258 }
11259
11260 (void) as_delete_callback(as, arg);
11261 kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11262 }
11263
11264
/*
 * Convert a maximum file size into the number of bits needed to hold
 * it: 0 for a size of 0, otherwise the 1-based position of the highest
 * set bit (1 -> 1, 0xff -> 8, 0x100 -> 9, 2^63 -> 64).
 */
static uint_t
fattr4_maxfilesize_to_bits(uint64_t ll)
{
	uint_t nbits;
	uint_t shift;

	if (ll == 0)
		return (0);

	/*
	 * Binary search for the top bit: whenever anything is set above
	 * the low 'shift' bits, account for those bits and discard them.
	 */
	nbits = 1;
	for (shift = 32; shift != 0; shift >>= 1) {
		if ((ll >> shift) != 0) {
			nbits += shift;
			ll >>= shift;
		}
	}
	return (nbits);
}
11294
11295 static int
11296 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11297 {
11298 vnode_t *avp = NULL;
11299 int error;
11300
11301 if ((error = nfs4lookup_xattr(vp, "", &avp,
11302 LOOKUP_XATTR, cr)) == 0)
11303 error = do_xattr_exists_check(avp, valp, cr);
11304 if (avp)
11305 VN_RELE(avp);
11306
11307 return (error);
11308 }
11309
11310 /* ARGSUSED */
11311 int
11312 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
11313 caller_context_t *ct)
11314 {
11315 int error;
11316 hrtime_t t;
11317 rnode4_t *rp;
11318 nfs4_ga_res_t gar;
11319 nfs4_ga_ext_res_t ger;
11320
11321 gar.n4g_ext_res = &ger;
11322
11323 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11324 return (EIO);
11325 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11326 *valp = MAXPATHLEN;
11327 return (0);
11328 }
11329 if (cmd == _PC_ACL_ENABLED) {
11330 *valp = _ACL_ACE_ENABLED;
11331 return (0);
11332 }
11333
11334 rp = VTOR4(vp);
11335 if (cmd == _PC_XATTR_EXISTS) {
11336 /*
11337 * The existence of the xattr directory is not sufficient
11338 * for determining whether generic user attributes exists.
11339 * The attribute directory could only be a transient directory
11340 * used for Solaris sysattr support. Do a small readdir
11341 * to verify if the only entries are sysattrs or not.
11342 *
11343 * pc4_xattr_valid can be only be trusted when r_xattr_dir
11344 * is NULL. Once the xadir vp exists, we can create xattrs,
11345 * and we don't have any way to update the "base" object's
11346 * pc4_xattr_exists from the xattr or xadir. Maybe FEM
11347 * could help out.
11348 */
11349 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
11350 rp->r_xattr_dir == NULL) {
11351 return (nfs4_have_xattrs(vp, valp, cr));
11352 }
11353 } else { /* OLD CODE */
11354 if (ATTRCACHE4_VALID(vp)) {
11355 mutex_enter(&rp->r_statelock);
11356 if (rp->r_pathconf.pc4_cache_valid) {
11357 error = 0;
11358 switch (cmd) {
11359 case _PC_FILESIZEBITS:
11360 *valp =
11361 rp->r_pathconf.pc4_filesizebits;
11362 break;
11363 case _PC_LINK_MAX:
11364 *valp =
11365 rp->r_pathconf.pc4_link_max;
11366 break;
11367 case _PC_NAME_MAX:
11368 *valp =
11369 rp->r_pathconf.pc4_name_max;
11370 break;
11371 case _PC_CHOWN_RESTRICTED:
11372 *valp =
11373 rp->r_pathconf.pc4_chown_restricted;
11374 break;
11375 case _PC_NO_TRUNC:
11376 *valp =
11377 rp->r_pathconf.pc4_no_trunc;
11378 break;
11379 default:
11380 error = EINVAL;
11381 break;
11382 }
11383 mutex_exit(&rp->r_statelock);
11384 #ifdef DEBUG
11385 nfs4_pathconf_cache_hits++;
11386 #endif
11387 return (error);
11388 }
11389 mutex_exit(&rp->r_statelock);
11390 }
11391 }
11392 #ifdef DEBUG
11393 nfs4_pathconf_cache_misses++;
11394 #endif
11395
11396 t = gethrtime();
11397
11398 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);
11399
11400 if (error) {
11401 mutex_enter(&rp->r_statelock);
11402 rp->r_pathconf.pc4_cache_valid = FALSE;
11403 rp->r_pathconf.pc4_xattr_valid = FALSE;
11404 mutex_exit(&rp->r_statelock);
11405 return (error);
11406 }
11407
11408 /* interpret the max filesize */
11409 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
11410 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);
11411
11412 /* Store the attributes we just received */
11413 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);
11414
11415 switch (cmd) {
11416 case _PC_FILESIZEBITS:
11417 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
11418 break;
11419 case _PC_LINK_MAX:
11420 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
11421 break;
11422 case _PC_NAME_MAX:
11423 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
11424 break;
11425 case _PC_CHOWN_RESTRICTED:
11426 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
11427 break;
11428 case _PC_NO_TRUNC:
11429 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
11430 break;
11431 case _PC_XATTR_EXISTS:
11432 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
11433 if (error = nfs4_have_xattrs(vp, valp, cr))
11434 return (error);
11435 }
11436 break;
11437 default:
11438 return (EINVAL);
11439 }
11440
11441 return (0);
11442 }
11443
11444 /*
11445 * Called by async thread to do synchronous pageio. Do the i/o, wait
11446 * for it to complete, and cleanup the page list when done.
11447 */
11448 static int
11449 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11450 int flags, cred_t *cr)
11451 {
11452 int error;
11453
11454 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11455
11456 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11457 if (flags & B_READ)
11458 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11459 else
11460 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11461 return (error);
11462 }
11463
11464 /* ARGSUSED */
11465 static int
11466 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11467 int flags, cred_t *cr, caller_context_t *ct)
11468 {
11469 int error;
11470 rnode4_t *rp;
11471
11472 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11473 return (EIO);
11474
11475 if (pp == NULL)
11476 return (EINVAL);
11477
11478 rp = VTOR4(vp);
11479 mutex_enter(&rp->r_statelock);
11480 rp->r_count++;
11481 mutex_exit(&rp->r_statelock);
11482
11483 if (flags & B_ASYNC) {
11484 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11485 nfs4_sync_pageio);
11486 } else
11487 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11488 mutex_enter(&rp->r_statelock);
11489 rp->r_count--;
11490 cv_broadcast(&rp->r_cv);
11491 mutex_exit(&rp->r_statelock);
11492 return (error);
11493 }
11494
/*
 * Dispose of a page: free it (fl == B_FREE) or destroy it
 * (fl == B_INVAL).
 *
 * A page whose p_fsdata says it still needs a commit cannot simply be
 * thrown away.  Unless one of the short cuts below applies, this
 * routine gathers every committable page of the vnode, issues a single
 * COMMIT for the whole range (asynchronously when running as pageout
 * or fsflush, or from a foreign zone) and only then frees or destroys
 * the page it was called with.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of committing
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.  This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want to do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			/* Promote to C_COMMIT and leave it for a later pass. */
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	/* r_commit is touched without r_statelock; R4COMMIT serializes it. */
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * pageout, fsflush and callers from a foreign zone do not issue
	 * the COMMIT themselves; the page list is handed to the async
	 * commit machinery instead.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are no longer
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, needs to
	 * be rewritten so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
11701
/*
 * Issue a COMMIT for 'count' bytes starting at 'offset' of the file
 * backing vp.
 *
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *	PUTFH(file), COMMIT
 *
 * The call is retried from recov_retry when recovery has to be started
 * and nfs4_start_recovery() returns FALSE, and retried from
 * get_commit_cred with the next over-the-wire credential when it fails
 * with EACCES and last_time is not yet set.
 *
 * Returns:
 *	0			the server's write verifier matches ours,
 *	NFS_VERF_MISMATCH	the verifier differs; the new verifier has
 *				been saved and nfs4_set_mod() was called,
 *	an errno		otherwise (recorded in r_error if none is
 *				pending; ESTALE also sets R4STALE and
 *				purges the attribute cache).
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		/* Hard failure: retry with another cred only for EACCES. */
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			/* Results are only freed when the call had no error. */
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			/* Keep the first error; don't overwrite a pending one. */
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* Verifier unchanged: the commit succeeded. */
			mutex_exit(&rp->r_statelock);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * The verifier changed: call nfs4_set_mod(), remember the
		 * new verifier and report the mismatch to the caller.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
11860
11861 static void
11862 nfs4_set_mod(vnode_t *vp)
11863 {
11864 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11865
11866 /* make sure we're looking at the master vnode, not a shadow */
11867 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11868 }
11869
11870 /*
11871 * This function is used to gather a page list of the pages which
11872 * can be committed on the server.
11873 *
11874 * The calling thread must have set R4COMMIT. This bit is used to
11875 * serialize access to the commit structure in the rnode. As long
11876 * as the thread has set R4COMMIT, then it can manipulate the commit
11877 * structure without requiring any other locks.
11878 *
11879 * When this function is called from nfs4_dispose() the page passed
11880 * into nfs4_dispose() will be SE_EXCL locked, and so this function
11881 * will skip it. This is not a problem since we initially add the
11882 * page to the r_commit page list.
11883 *
11884 */
11885 static void
11886 nfs4_get_commit(vnode_t *vp)
11887 {
11888 rnode4_t *rp;
11889 page_t *pp;
11890 kmutex_t *vphm;
11891
11892 rp = VTOR4(vp);
11893
11894 ASSERT(rp->r_flags & R4COMMIT);
11895
11896 /* make sure we're looking at the master vnode, not a shadow */
11897
11898 if (IS_SHADOW(vp, rp))
11899 vp = RTOV4(rp);
11900
11901 vphm = page_vnode_mutex(vp);
11902 mutex_enter(vphm);
11903
11904 /*
11905 * If there are no pages associated with this vnode, then
11906 * just return.
11907 */
11908 if ((pp = vp->v_pages) == NULL) {
11909 mutex_exit(vphm);
11910 return;
11911 }
11912
11913 /*
11914 * Step through all of the pages associated with this vnode
11915 * looking for pages which need to be committed.
11916 */
11917 do {
11918 /* Skip marker pages. */
11919 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
11920 continue;
11921
11922 /*
11923 * First short-cut everything (without the page_lock)
11924 * and see if this page does not need to be committed
11925 * or is modified if so then we'll just skip it.
11926 */
11927 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
11928 continue;
11929
11930 /*
11931 * Attempt to lock the page. If we can't, then
11932 * someone else is messing with it or we have been
11933 * called from nfs4_dispose and this is the page that
11934 * nfs4_dispose was called with.. anyway just skip it.
11935 */
11936 if (!page_trylock(pp, SE_EXCL))
11937 continue;
11938
11939 /*
11940 * Lets check again now that we have the page lock.
11941 */
11942 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11943 page_unlock(pp);
11944 continue;
11945 }
11946
11947 /* this had better not be a free page */
11948 ASSERT(PP_ISFREE(pp) == 0);
11949
11950 /*
11951 * The page needs to be committed and we locked it.
11952 * Update the base and length parameters and add it
11953 * to r_pages.
11954 */
11955 if (rp->r_commit.c_pages == NULL) {
11956 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11957 rp->r_commit.c_commlen = PAGESIZE;
11958 } else if (pp->p_offset < rp->r_commit.c_commbase) {
11959 rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11960 (offset3)pp->p_offset + rp->r_commit.c_commlen;
11961 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11962 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11963 <= pp->p_offset) {
11964 rp->r_commit.c_commlen = (offset3)pp->p_offset -
11965 rp->r_commit.c_commbase + PAGESIZE;
11966 }
11967 page_add(&rp->r_commit.c_pages, pp);
11968 } while ((pp = pp->p_vpnext) != vp->v_pages);
11969
11970 mutex_exit(vphm);
11971 }
11972
11973 /*
11974 * This routine is used to gather together a page list of the pages
11975 * which are to be committed on the server. This routine must not
11976 * be called if the calling thread holds any locked pages.
11977 *
11978 * The calling thread must have set R4COMMIT. This bit is used to
11979 * serialize access to the commit structure in the rnode. As long
11980 * as the thread has set R4COMMIT, then it can manipulate the commit
11981 * structure without requiring any other locks.
11982 */
11983 static void
11984 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
11985 {
11986
11987 rnode4_t *rp;
11988 page_t *pp;
11989 u_offset_t end;
11990 u_offset_t off;
11991 ASSERT(len != 0);
11992 rp = VTOR4(vp);
11993 ASSERT(rp->r_flags & R4COMMIT);
11994
11995 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11996
11997 /* make sure we're looking at the master vnode, not a shadow */
11998
11999 if (IS_SHADOW(vp, rp))
12000 vp = RTOV4(rp);
12001
12002 /*
12003 * If there are no pages associated with this vnode, then
12004 * just return.
12005 */
12006 if ((pp = vp->v_pages) == NULL)
12007 return;
12008 /*
12009 * Calculate the ending offset.
12010 */
12011 end = soff + len;
12012 for (off = soff; off < end; off += PAGESIZE) {
12013 /*
12014 * Lookup each page by vp, offset.
12015 */
12016 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
12017 continue;
12018 /*
12019 * If this page does not need to be committed or is
12020 * modified, then just skip it.
12021 */
12022 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
12023 page_unlock(pp);
12024 continue;
12025 }
12026
12027 ASSERT(PP_ISFREE(pp) == 0);
12028 /*
12029 * The page needs to be committed and we locked it.
12030 * Update the base and length parameters and add it
12031 * to r_pages.
12032 */
12033 if (rp->r_commit.c_pages == NULL) {
12034 rp->r_commit.c_commbase = (offset3)pp->p_offset;
12035 rp->r_commit.c_commlen = PAGESIZE;
12036 } else {
12037 rp->r_commit.c_commlen = (offset3)pp->p_offset -
12038 rp->r_commit.c_commbase + PAGESIZE;
12039 }
12040 page_add(&rp->r_commit.c_pages, pp);
12041 }
12042 }
12043
12044 /*
12045 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12046 * Flushes and commits data to the server.
12047 */
12048 static int
12049 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
12050 {
12051 int error;
12052 verifier4 write_verf;
12053 rnode4_t *rp = VTOR4(vp);
12054
12055 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12056
12057 /*
12058 * Flush the data portion of the file and then commit any
12059 * portions which need to be committed. This may need to
12060 * be done twice if the server has changed state since
12061 * data was last written. The data will need to be
12062 * rewritten to the server and then a new commit done.
12063 *
12064 * In fact, this may need to be done several times if the
12065 * server is having problems and crashing while we are
12066 * attempting to do this.
12067 */
12068
12069 top:
12070 /*
12071 * Do a flush based on the poff and plen arguments. This
12072 * will synchronously write out any modified pages in the
12073 * range specified by (poff, plen). This starts all of the
12074 * i/o operations which will be waited for in the next
12075 * call to nfs4_putpage
12076 */
12077
12078 mutex_enter(&rp->r_statelock);
12079 write_verf = rp->r_writeverf;
12080 mutex_exit(&rp->r_statelock);
12081
12082 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
12083 if (error == EAGAIN)
12084 error = 0;
12085
12086 /*
12087 * Do a flush based on the poff and plen arguments. This
12088 * will synchronously write out any modified pages in the
12089 * range specified by (poff, plen) and wait until all of
12090 * the asynchronous i/o's in that range are done as well.
12091 */
12092 if (!error)
12093 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);
12094
12095 if (error)
12096 return (error);
12097
12098 mutex_enter(&rp->r_statelock);
12099 if (rp->r_writeverf != write_verf) {
12100 mutex_exit(&rp->r_statelock);
12101 goto top;
12102 }
12103 mutex_exit(&rp->r_statelock);
12104
12105 /*
12106 * Now commit any pages which might need to be committed.
12107 * If the error, NFS_VERF_MISMATCH, is returned, then
12108 * start over with the flush operation.
12109 */
12110 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);
12111
12112 if (error == NFS_VERF_MISMATCH)
12113 goto top;
12114
12115 return (error);
12116 }
12117
12118 /*
12119 * nfs4_commit_vp() will wait for other pending commits and
12120 * will either commit the whole file or a range, plen dictates
12121 * if we commit whole file. a value of zero indicates the whole
12122 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage()
12123 */
12124 static int
12125 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
12126 cred_t *cr, int wait_on_writes)
12127 {
12128 rnode4_t *rp;
12129 page_t *plist;
12130 offset3 offset;
12131 count3 len;
12132
12133 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12134
12135 rp = VTOR4(vp);
12136
12137 /*
12138 * before we gather commitable pages make
12139 * sure there are no outstanding async writes
12140 */
12141 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
12142 mutex_enter(&rp->r_statelock);
12143 while (rp->r_count > 0) {
12144 cv_wait(&rp->r_cv, &rp->r_statelock);
12145 }
12146 mutex_exit(&rp->r_statelock);
12147 }
12148
12149 /*
12150 * Set the `commit inprogress' state bit. We must
12151 * first wait until any current one finishes.
12152 */
12153 mutex_enter(&rp->r_statelock);
12154 while (rp->r_flags & R4COMMIT) {
12155 rp->r_flags |= R4COMMITWAIT;
12156 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
12157 rp->r_flags &= ~R4COMMITWAIT;
12158 }
12159 rp->r_flags |= R4COMMIT;
12160 mutex_exit(&rp->r_statelock);
12161
12162 /*
12163 * Gather all of the pages which need to be
12164 * committed.
12165 */
12166 if (plen == 0)
12167 nfs4_get_commit(vp);
12168 else
12169 nfs4_get_commit_range(vp, poff, plen);
12170
12171 /*
12172 * Clear the `commit inprogress' bit and disconnect the
12173 * page list which was gathered by nfs4_get_commit.
12174 */
12175 plist = rp->r_commit.c_pages;
12176 rp->r_commit.c_pages = NULL;
12177 offset = rp->r_commit.c_commbase;
12178 len = rp->r_commit.c_commlen;
12179 mutex_enter(&rp->r_statelock);
12180 rp->r_flags &= ~R4COMMIT;
12181 cv_broadcast(&rp->r_commit.c_cv);
12182 mutex_exit(&rp->r_statelock);
12183
12184 /*
12185 * If any pages need to be committed, commit them and
12186 * then unlock them so that they can be freed some
12187 * time later.
12188 */
12189 if (plist == NULL)
12190 return (0);
12191
12192 /*
12193 * No error occurred during the flush portion
12194 * of this operation, so now attempt to commit
12195 * the data to stable storage on the server.
12196 *
12197 * This will unlock all of the pages on the list.
12198 */
12199 return (nfs4_sync_commit(vp, plist, offset, len, cr));
12200 }
12201
12202 static int
12203 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12204 cred_t *cr)
12205 {
12206 int error;
12207 page_t *pp;
12208
12209 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12210
12211 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12212
12213 /*
12214 * If we got an error, then just unlock all of the pages
12215 * on the list.
12216 */
12217 if (error) {
12218 while (plist != NULL) {
12219 pp = plist;
12220 page_sub(&plist, pp);
12221 page_unlock(pp);
12222 }
12223 return (error);
12224 }
12225 /*
12226 * We've tried as hard as we can to commit the data to stable
12227 * storage on the server. We just unlock the pages and clear
12228 * the commit required state. They will get freed later.
12229 */
12230 while (plist != NULL) {
12231 pp = plist;
12232 page_sub(&plist, pp);
12233 pp->p_fsdata = C_NOCOMMIT;
12234 page_unlock(pp);
12235 }
12236
12237 return (error);
12238 }
12239
12240 static void
12241 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12242 cred_t *cr)
12243 {
12244
12245 (void) nfs4_sync_commit(vp, plist, offset, count, cr);
12246 }
12247
12248 /*ARGSUSED*/
12249 static int
12250 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12251 caller_context_t *ct)
12252 {
12253 int error = 0;
12254 mntinfo4_t *mi;
12255 vattr_t va;
12256 vsecattr_t nfsace4_vsap;
12257
12258 mi = VTOMI4(vp);
12259 if (nfs_zone() != mi->mi_zone)
12260 return (EIO);
12261 if (mi->mi_flags & MI4_ACL) {
12262 /* if we have a delegation, return it */
12263 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12264 (void) nfs4delegreturn(VTOR4(vp),
12265 NFS4_DR_REOPEN|NFS4_DR_PUSH);
12266
12267 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12268 NFS4_ACL_SET);
12269 if (error) /* EINVAL */
12270 return (error);
12271
12272 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12273 /*
12274 * These are aclent_t type entries.
12275 */
12276 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12277 vp->v_type == VDIR, FALSE);
12278 if (error)
12279 return (error);
12280 } else {
12281 /*
12282 * These are ace_t type entries.
12283 */
12284 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12285 FALSE);
12286 if (error)
12287 return (error);
12288 }
12289 bzero(&va, sizeof (va));
12290 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12291 vs_ace4_destroy(&nfsace4_vsap);
12292 return (error);
12293 }
12294 return (ENOSYS);
12295 }
12296
12297 /* ARGSUSED */
12298 int
12299 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12300 caller_context_t *ct)
12301 {
12302 int error;
12303 mntinfo4_t *mi;
12304 nfs4_ga_res_t gar;
12305 rnode4_t *rp = VTOR4(vp);
12306
12307 mi = VTOMI4(vp);
12308 if (nfs_zone() != mi->mi_zone)
12309 return (EIO);
12310
12311 bzero(&gar, sizeof (gar));
12312 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;
12313
12314 /*
12315 * vsecattr->vsa_mask holds the original acl request mask.
12316 * This is needed when determining what to return.
12317 * (See: nfs4_create_getsecattr_return())
12318 */
12319 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
12320 if (error) /* EINVAL */
12321 return (error);
12322
12323 /*
12324 * If this is a referral stub, don't try to go OTW for an ACL
12325 */
12326 if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
12327 return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
12328
12329 if (mi->mi_flags & MI4_ACL) {
12330 /*
12331 * Check if the data is cached and the cache is valid. If it
12332 * is we don't go over the wire.
12333 */
12334 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
12335 mutex_enter(&rp->r_statelock);
12336 if (rp->r_secattr != NULL) {
12337 error = nfs4_create_getsecattr_return(
12338 rp->r_secattr, vsecattr, rp->r_attr.va_uid,
12339 rp->r_attr.va_gid,
12340 vp->v_type == VDIR);
12341 if (!error) { /* error == 0 - Success! */
12342 mutex_exit(&rp->r_statelock);
12343 return (error);
12344 }
12345 }
12346 mutex_exit(&rp->r_statelock);
12347 }
12348
12349 /*
12350 * The getattr otw call will always get both the acl, in
12351 * the form of a list of nfsace4's, and the number of acl
12352 * entries; independent of the value of gar.n4g_vsa.vsa_mask.
12353 */
12354 gar.n4g_va.va_mask = AT_ALL;
12355 error = nfs4_getattr_otw(vp, &gar, cr, 1);
12356 if (error) {
12357 vs_ace4_destroy(&gar.n4g_vsa);
12358 if (error == ENOTSUP || error == EOPNOTSUPP)
12359 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12360 return (error);
12361 }
12362
12363 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
12364 /*
12365 * No error was returned, but according to the response
12366 * bitmap, neither was an acl.
12367 */
12368 vs_ace4_destroy(&gar.n4g_vsa);
12369 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12370 return (error);
12371 }
12372
12373 /*
12374 * Update the cache with the ACL.
12375 */
12376 nfs4_acl_fill_cache(rp, &gar.n4g_vsa);
12377
12378 error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
12379 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
12380 vp->v_type == VDIR);
12381 vs_ace4_destroy(&gar.n4g_vsa);
12382 if ((error) && (vsecattr->vsa_mask &
12383 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
12384 (error != EACCES)) {
12385 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12386 }
12387 return (error);
12388 }
12389 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12390 return (error);
12391 }
12392
12393 /*
12394 * The function returns:
12395 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12396 * - EINVAL if the passed in "acl_mask" is an invalid request.
12397 *
12398 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12399 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12400 *
12401 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12402 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12403 * - We have a count field set without the corresponding acl field set. (e.g. -
12404 * VSA_ACECNT is set, but VSA_ACE is not)
12405 */
12406 static int
12407 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12408 {
12409 /* Shortcut the masks that are always valid. */
12410 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12411 return (0);
12412 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12413 return (0);
12414
12415 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12416 /*
12417 * We can't have any VSA_ACL type stuff in the mask now.
12418 */
12419 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12420 VSA_DFACLCNT))
12421 return (EINVAL);
12422
12423 if (op == NFS4_ACL_SET) {
12424 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12425 return (EINVAL);
12426 }
12427 }
12428
12429 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12430 /*
12431 * We can't have any VSA_ACE type stuff in the mask now.
12432 */
12433 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12434 return (EINVAL);
12435
12436 if (op == NFS4_ACL_SET) {
12437 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12438 return (EINVAL);
12439
12440 if ((acl_mask & VSA_DFACLCNT) &&
12441 !(acl_mask & VSA_DFACL))
12442 return (EINVAL);
12443 }
12444 }
12445 return (0);
12446 }
12447
12448 /*
12449 * The theory behind creating the correct getsecattr return is simply this:
12450 * "Don't return anything that the caller is not expecting to have to free."
12451 */
12452 static int
12453 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12454 uid_t uid, gid_t gid, int isdir)
12455 {
12456 int error = 0;
12457 /* Save the mask since the translators modify it. */
12458 uint_t orig_mask = vsap->vsa_mask;
12459
12460 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12461 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12462
12463 if (error)
12464 return (error);
12465
12466 /*
12467 * If the caller only asked for the ace count (VSA_ACECNT)
12468 * don't give them the full acl (VSA_ACE), free it.
12469 */
12470 if (!orig_mask & VSA_ACE) {
12471 if (vsap->vsa_aclentp != NULL) {
12472 kmem_free(vsap->vsa_aclentp,
12473 vsap->vsa_aclcnt * sizeof (ace_t));
12474 vsap->vsa_aclentp = NULL;
12475 }
12476 }
12477 vsap->vsa_mask = orig_mask;
12478
12479 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12480 VSA_DFACLCNT)) {
12481 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12482 isdir, FALSE);
12483
12484 if (error)
12485 return (error);
12486
12487 /*
12488 * If the caller only asked for the acl count (VSA_ACLCNT)
12489 * and/or the default acl count (VSA_DFACLCNT) don't give them
12490 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12491 */
12492 if (!orig_mask & VSA_ACL) {
12493 if (vsap->vsa_aclentp != NULL) {
12494 kmem_free(vsap->vsa_aclentp,
12495 vsap->vsa_aclcnt * sizeof (aclent_t));
12496 vsap->vsa_aclentp = NULL;
12497 }
12498 }
12499
12500 if (!orig_mask & VSA_DFACL) {
12501 if (vsap->vsa_dfaclentp != NULL) {
12502 kmem_free(vsap->vsa_dfaclentp,
12503 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12504 vsap->vsa_dfaclentp = NULL;
12505 }
12506 }
12507 vsap->vsa_mask = orig_mask;
12508 }
12509 return (0);
12510 }
12511
12512 /* ARGSUSED */
12513 int
12514 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
12515 caller_context_t *ct)
12516 {
12517 int error;
12518
12519 if (nfs_zone() != VTOMI4(vp)->mi_zone)
12520 return (EIO);
12521 /*
12522 * check for valid cmd parameter
12523 */
12524 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
12525 return (EINVAL);
12526
12527 /*
12528 * Check access permissions
12529 */
12530 if ((cmd & F_SHARE) &&
12531 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
12532 (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
12533 return (EBADF);
12534
12535 /*
12536 * If the filesystem is mounted using local locking, pass the
12537 * request off to the local share code.
12538 */
12539 if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
12540 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
12541
12542 switch (cmd) {
12543 case F_SHARE:
12544 case F_UNSHARE:
12545 /*
12546 * This will be properly implemented later,
12547 * see RFE: 4823948 .
12548 */
12549 error = EAGAIN;
12550 break;
12551
12552 case F_HASREMOTELOCKS:
12553 /*
12554 * NFS client can't store remote locks itself
12555 */
12556 shr->s_access = 0;
12557 error = 0;
12558 break;
12559
12560 default:
12561 error = EINVAL;
12562 break;
12563 }
12564
12565 return (error);
12566 }
12567
12568 /*
12569 * Common code called by directory ops to update the attrcache
12570 */
12571 static int
12572 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12573 hrtime_t t, vnode_t *vp, cred_t *cr)
12574 {
12575 int error = 0;
12576
12577 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12578
12579 if (status != NFS4_OK) {
12580 /* getattr not done or failed */
12581 PURGE_ATTRCACHE4(vp);
12582 return (error);
12583 }
12584
12585 if (garp) {
12586 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12587 } else {
12588 PURGE_ATTRCACHE4(vp);
12589 }
12590 return (error);
12591 }
12592
12593 /*
12594 * Update directory caches for directory modification ops (link, rename, etc.)
12595 * When dinfo is NULL, manage dircaches in the old way.
12596 */
12597 static void
12598 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
12599 dirattr_info_t *dinfo)
12600 {
12601 rnode4_t *drp = VTOR4(dvp);
12602
12603 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
12604
12605 /* Purge rddir cache for dir since it changed */
12606 if (drp->r_dir != NULL)
12607 nfs4_purge_rddir_cache(dvp);
12608
12609 /*
12610 * If caller provided dinfo, then use it to manage dir caches.
12611 */
12612 if (dinfo != NULL) {
12613 if (vp != NULL) {
12614 mutex_enter(&VTOR4(vp)->r_statev4_lock);
12615 if (!VTOR4(vp)->created_v4) {
12616 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12617 dnlc_update(dvp, nm, vp);
12618 } else {
12619 /*
12620 * XXX don't update if the created_v4 flag is
12621 * set
12622 */
12623 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12624 NFS4_DEBUG(nfs4_client_state_debug,
12625 (CE_NOTE, "nfs4_update_dircaches: "
12626 "don't update dnlc: created_v4 flag"));
12627 }
12628 }
12629
12630 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
12631 dinfo->di_cred, FALSE, cinfo);
12632
12633 return;
12634 }
12635
12636 /*
12637 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
12638 * Since caller modified dir but didn't receive post-dirmod-op dir
12639 * attrs, the dir's attrs must be purged.
12640 *
12641 * XXX this check and dnlc update/purge should really be atomic,
12642 * XXX but can't use rnode statelock because it'll deadlock in
12643 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
12644 * XXX does occur.
12645 *
12646 * XXX We also may want to check that atomic is true in the
12647 * XXX change_info struct. If it is not, the change_info may
12648 * XXX reflect changes by more than one clients which means that
12649 * XXX our cache may not be valid.
12650 */
12651 PURGE_ATTRCACHE4(dvp);
12652 if (drp->r_change == cinfo->before) {
12653 /* no changes took place in the directory prior to our link */
12654 if (vp != NULL) {
12655 mutex_enter(&VTOR4(vp)->r_statev4_lock);
12656 if (!VTOR4(vp)->created_v4) {
12657 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12658 dnlc_update(dvp, nm, vp);
12659 } else {
12660 /*
12661 * XXX dont' update if the created_v4 flag
12662 * is set
12663 */
12664 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12665 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
12666 "nfs4_update_dircaches: don't"
12667 " update dnlc: created_v4 flag"));
12668 }
12669 }
12670 } else {
12671 /* Another client modified directory - purge its dnlc cache */
12672 dnlc_purge_vp(dvp);
12673 }
12674 }
12675
/*
 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
 * file.  It is sent as a two-op compound: CPUTFH of the file's filehandle
 * followed by OPEN_CONFIRM.
 *
 * '*seqid' is incremented before each transmission (including each retry
 * from within this function) and the new value is what goes over the wire.
 *
 * '*stateid' is sent as the open stateid and, on success, overwritten with
 * the stateid returned by the server.
 *
 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
 * file (ie: client recovery) and otherwise set to FALSE.  When it is FALSE
 * and recovery is needed, recovery is started from here.
 *
 * '*retry_open' is set to TRUE when recovery was needed and not aborted,
 * telling the caller to retry the entire OPEN; otherwise it is FALSE.
 *
 * 'oop' is the open owner; in DEBUG builds it is asserted to have its seqid
 * marked in use.
 *
 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
 * initiated) calling functions.
 *
 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
 * of resending a 'lost' open request.
 *
 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
 * server that hands out BAD_SEQID on open confirm.  It may be NULL.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

	/* Re-entered (after a delay) on ETIMEDOUT / NFS4ERR_RESOURCE. */
recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	/* The tag records whether this is a resend of a lost request. */
	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* Advance the caller's seqid; this happens on every pass. */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	/* Record the seqid on the open owner when the reply calls for it. */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	/* A failure that recovery cannot help with is simply returned. */
	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		/* Recovery is only started here when not already reopening. */
		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/* Give up once the BAD_SEQID budget is spent. */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		/*
		 * Timeouts and NFS4ERR_RESOURCE are retried from here
		 * after a delay, unless recovery aborted or this is a
		 * resend.
		 *
		 * NOTE(review): res.status is read here whenever ep->error
		 * is not ETIMEDOUT, including when ep->error is some other
		 * non-zero value -- confirm that rfs4call() leaves res in
		 * a defined state in that case.
		 */
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			/* res is only freed when the call itself succeeded */
			if (!ep->error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* The call went through but the server reported a failure status. */
	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Success: hand the confirmed stateid back to the caller. */
	resop = &res.array[1];	/* open confirm res */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
12801
12802 /*
12803 * Return the credentials associated with a client state object. The
12804 * caller is responsible for freeing the credentials.
12805 */
12806
12807 static cred_t *
12808 state_to_cred(nfs4_open_stream_t *osp)
12809 {
12810 cred_t *cr;
12811
12812 /*
12813 * It's ok to not lock the open stream and open owner to get
12814 * the oo_cred since this is only written once (upon creation)
12815 * and will not change.
12816 */
12817 cr = osp->os_open_owner->oo_cred;
12818 crhold(cr);
12819
12820 return (cr);
12821 }
12822
12823 /*
12824 * nfs4_find_sysid
12825 *
12826 * Find the sysid for the knetconfig associated with the given mi.
12827 */
12828 static struct lm_sysid *
12829 nfs4_find_sysid(mntinfo4_t *mi)
12830 {
12831 ASSERT(nfs_zone() == mi->mi_zone);
12832
12833 /*
12834 * Switch from RDMA knconf to original mount knconf
12835 */
12836 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12837 mi->mi_curr_serv->sv_hostname, NULL));
12838 }
12839
#ifdef DEBUG
/*
 * Return a string version of the call type for easy reading.
 * An unknown call type is reported through cmn_err(CE_PANIC).
 */
static char *
nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
{
	char *label = "";

	switch (ctype) {
	case NFS4_LCK_CTYPE_NORM:
		label = "NORMAL";
		break;
	case NFS4_LCK_CTYPE_RECLAIM:
		label = "RECLAIM";
		break;
	case NFS4_LCK_CTYPE_RESEND:
		label = "RESEND";
		break;
	case NFS4_LCK_CTYPE_REINSTATE:
		label = "REINSTATE";
		break;
	default:
		cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
		    "type %d", ctype);
		break;
	}

	return (label);
}
#endif
12863
12864 /*
12865 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12866 * Unlock requests don't have an over-the-wire locktype, so we just return
12867 * something non-threatening.
12868 */
12869
12870 static nfs_lock_type4
12871 flk_to_locktype(int cmd, int l_type)
12872 {
12873 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12874
12875 switch (l_type) {
12876 case F_UNLCK:
12877 return (READ_LT);
12878 case F_RDLCK:
12879 if (cmd == F_SETLK)
12880 return (READ_LT);
12881 else
12882 return (READW_LT);
12883 case F_WRLCK:
12884 if (cmd == F_SETLK)
12885 return (WRITE_LT);
12886 else
12887 return (WRITEW_LT);
12888 }
12889 panic("flk_to_locktype");
12890 /*NOTREACHED*/
12891 }
12892
12893 /*
12894 * Do some preliminary checks for nfs4frlock.
12895 */
12896 static int
12897 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12898 u_offset_t offset)
12899 {
12900 int error = 0;
12901
12902 /*
12903 * If we are setting a lock, check that the file is opened
12904 * with the correct mode.
12905 */
12906 if (cmd == F_SETLK || cmd == F_SETLKW) {
12907 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12908 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12909 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12910 "nfs4frlock_validate_args: file was opened with "
12911 "incorrect mode"));
12912 return (EBADF);
12913 }
12914 }
12915
12916 /* Convert the offset. It may need to be restored before returning. */
12917 if (error = convoff(vp, flk, 0, offset)) {
12918 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12919 "nfs4frlock_validate_args: convoff => error= %d\n",
12920 error));
12921 return (error);
12922 }
12923
12924 return (error);
12925 }
12926
12927 /*
12928 * Set the flock64's lm_sysid for nfs4frlock.
12929 */
12930 static int
12931 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12932 {
12933 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12934
12935 /* Find the lm_sysid */
12936 *lspp = nfs4_find_sysid(VTOMI4(vp));
12937
12938 if (*lspp == NULL) {
12939 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12941 return (ENOLCK);
12942 }
12943
12944 flk->l_sysid = lm_sysidt(*lspp);
12945
12946 return (0);
12947 }
12948
12949 /*
12950 * Do the remaining preliminary setup for nfs4frlock.
12951 */
12952 static void
12953 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12954 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12955 cred_t **cred_otw)
12956 {
12957 /*
12958 * set tick_delay to the base delay time.
12959 * (NFS4_BASE_WAIT_TIME is in secs)
12960 */
12961
12962 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12963
12964 /*
12965 * If lock is relative to EOF, we need the newest length of the
12966 * file. Therefore invalidate the ATTR_CACHE.
12967 */
12968
12969 *whencep = flk->l_whence;
12970
12971 if (*whencep == 2) /* SEEK_END */
12972 PURGE_ATTRCACHE4(vp);
12973
12974 recov_statep->rs_flags = 0;
12975 recov_statep->rs_num_retry_despite_err = 0;
12976 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12977 }
12978
12979 /*
12980 * Initialize and allocate the data structures necessary for
12981 * the nfs4frlock call.
12982 * Allocates argsp's op array, frees up the saved_rqstpp if there is one.
12983 */
12984 static void
12985 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
12986 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
12987 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
12988 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
12989 {
12990 int argoplist_size;
12991 int num_ops = 2;
12992
12993 *retry = FALSE;
12994 *did_start_fop = FALSE;
12995 *skip_get_err = FALSE;
12996 lost_rqstp->lr_op = 0;
12997 argoplist_size = num_ops * sizeof (nfs_argop4);
12998 /* fill array with zero */
12999 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13000
13001 *argspp = argsp;
13002 *respp = NULL;
13003
13004 argsp->array_len = num_ops;
13005 argsp->array = *argopp;
13006
13007 /* initialize in case of error; will get real value down below */
13008 argsp->ctag = TAG_NONE;
13009
13010 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13011 *op_hintp = OH_LOCKU;
13012 else
13013 *op_hintp = OH_OTHER;
13014 }
13015
13016 /*
13017 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign
13018 * the proper nfs4_server_t for this instance of nfs4frlock.
13019 * Returns 0 (success) or an errno value.
13020 */
13021 static int
13022 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13023 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13024 bool_t *did_start_fop, bool_t *startrecovp)
13025 {
13026 int error = 0;
13027 rnode4_t *rp;
13028
13029 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13030
13031 if (ctype == NFS4_LCK_CTYPE_NORM) {
13032 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13033 recov_statep, startrecovp);
13034 if (error)
13035 return (error);
13036 *did_start_fop = TRUE;
13037 } else {
13038 *did_start_fop = FALSE;
13039 *startrecovp = FALSE;
13040 }
13041
13042 if (!error) {
13043 rp = VTOR4(vp);
13044
13045 /* If the file failed recovery, just quit. */
13046 mutex_enter(&rp->r_statelock);
13047 if (rp->r_flags & R4RECOVERR) {
13048 error = EIO;
13049 }
13050 mutex_exit(&rp->r_statelock);
13051 }
13052
13053 return (error);
13054 }
13055
13056 /*
13057 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
13058 * resend nfs4frlock call is initiated by the recovery framework.
13059 * Acquires the lop and oop seqid synchronization.
13060 */
13061 static void
13062 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
13063 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
13064 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13065 LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
13066 {
13067 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
13068 int error;
13069
13070 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
13071 (CE_NOTE,
13072 "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
13073 ASSERT(resend_rqstp != NULL);
13074 ASSERT(resend_rqstp->lr_op == OP_LOCK ||
13075 resend_rqstp->lr_op == OP_LOCKU);
13076
13077 *oopp = resend_rqstp->lr_oop;
13078 if (resend_rqstp->lr_oop) {
13079 open_owner_hold(resend_rqstp->lr_oop);
13080 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
13081 ASSERT(error == 0); /* recov thread always succeeds */
13082 }
13083
13084 /* Must resend this lost lock/locku request. */
13085 ASSERT(resend_rqstp->lr_lop != NULL);
13086 *lopp = resend_rqstp->lr_lop;
13087 lock_owner_hold(resend_rqstp->lr_lop);
13088 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
13089 ASSERT(error == 0); /* recov thread always succeeds */
13090
13091 *ospp = resend_rqstp->lr_osp;
13092 if (*ospp)
13093 open_stream_hold(resend_rqstp->lr_osp);
13094
13095 if (resend_rqstp->lr_op == OP_LOCK) {
13096 LOCK4args *lock_args;
13097
13098 argop->argop = OP_LOCK;
13099 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
13100 lock_args->locktype = resend_rqstp->lr_locktype;
13101 lock_args->reclaim =
13102 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
13103 lock_args->offset = resend_rqstp->lr_flk->l_start;
13104 lock_args->length = resend_rqstp->lr_flk->l_len;
13105 if (lock_args->length == 0)
13106 lock_args->length = ~lock_args->length;
13107 nfs4_setup_lock_args(*lopp, *oopp, *ospp,
13108 mi2clientid(mi), &lock_args->locker);
13109
13110 switch (resend_rqstp->lr_ctype) {
13111 case NFS4_LCK_CTYPE_RESEND:
13112 argsp->ctag = TAG_LOCK_RESEND;
13113 break;
13114 case NFS4_LCK_CTYPE_REINSTATE:
13115 argsp->ctag = TAG_LOCK_REINSTATE;
13116 break;
13117 case NFS4_LCK_CTYPE_RECLAIM:
13118 argsp->ctag = TAG_LOCK_RECLAIM;
13119 break;
13120 default:
13121 argsp->ctag = TAG_LOCK_UNKNOWN;
13122 break;
13123 }
13124 } else {
13125 LOCKU4args *locku_args;
13126 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;
13127
13128 argop->argop = OP_LOCKU;
13129 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
13130 locku_args->locktype = READ_LT;
13131 locku_args->seqid = lop->lock_seqid + 1;
13132 mutex_enter(&lop->lo_lock);
13133 locku_args->lock_stateid = lop->lock_stateid;
13134 mutex_exit(&lop->lo_lock);
13135 locku_args->offset = resend_rqstp->lr_flk->l_start;
13136 locku_args->length = resend_rqstp->lr_flk->l_len;
13137 if (locku_args->length == 0)
13138 locku_args->length = ~locku_args->length;
13139
13140 switch (resend_rqstp->lr_ctype) {
13141 case NFS4_LCK_CTYPE_RESEND:
13142 argsp->ctag = TAG_LOCKU_RESEND;
13143 break;
13144 case NFS4_LCK_CTYPE_REINSTATE:
13145 argsp->ctag = TAG_LOCKU_REINSTATE;
13146 break;
13147 default:
13148 argsp->ctag = TAG_LOCK_UNKNOWN;
13149 break;
13150 }
13151 }
13152 }
13153
13154 /*
13155 * Setup the LOCKT4 arguments.
13156 */
13157 static void
13158 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13159 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13160 rnode4_t *rp)
13161 {
13162 LOCKT4args *lockt_args;
13163
13164 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13165 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13166 argop->argop = OP_LOCKT;
13167 argsp->ctag = TAG_LOCKT;
13168 lockt_args = &argop->nfs_argop4_u.oplockt;
13169
13170 /*
13171 * The locktype will be READ_LT unless it's
13172 * a write lock. We do this because the Solaris
13173 * system call allows the combination of
13174 * F_UNLCK and F_GETLK* and so in that case the
13175 * unlock is mapped to a read.
13176 */
13177 if (flk->l_type == F_WRLCK)
13178 lockt_args->locktype = WRITE_LT;
13179 else
13180 lockt_args->locktype = READ_LT;
13181
13182 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13183 /* set the lock owner4 args */
13184 nfs4_setlockowner_args(&lockt_args->owner, rp,
13185 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13186 flk->l_pid);
13187 lockt_args->offset = flk->l_start;
13188 lockt_args->length = flk->l_len;
13189 if (flk->l_len == 0)
13190 lockt_args->length = ~lockt_args->length;
13191
13192 *lockt_argsp = lockt_args;
13193 }
13194
13195 /*
13196 * If the client is holding a delegation, and the open stream to be used
13197 * with this lock request is a delegation open stream, then re-open the stream.
13198 * Sets the nfs4_error_t to all zeros unless the open stream has already
13199 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
13200 * means the caller should retry (like a recovery retry).
13201 */
13202 static void
13203 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
13204 {
13205 open_delegation_type4 dt;
13206 bool_t reopen_needed, force;
13207 nfs4_open_stream_t *osp;
13208 open_claim_type4 oclaim;
13209 rnode4_t *rp = VTOR4(vp);
13210 mntinfo4_t *mi = VTOMI4(vp);
13211
13212 ASSERT(nfs_zone() == mi->mi_zone);
13213
13214 nfs4_error_zinit(ep);
13215
13216 mutex_enter(&rp->r_statev4_lock);
13217 dt = rp->r_deleg_type;
13218 mutex_exit(&rp->r_statev4_lock);
13219
13220 if (dt != OPEN_DELEGATE_NONE) {
13221 nfs4_open_owner_t *oop;
13222
13223 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
13224 if (!oop) {
13225 ep->stat = NFS4ERR_IO;
13226 return;
13227 }
13228 /* returns with 'os_sync_lock' held */
13229 osp = find_open_stream(oop, rp);
13230 if (!osp) {
13231 open_owner_rele(oop);
13232 ep->stat = NFS4ERR_IO;
13233 return;
13234 }
13235
13236 if (osp->os_failed_reopen) {
13237 NFS4_DEBUG((nfs4_open_stream_debug ||
13238 nfs4_client_lock_debug), (CE_NOTE,
13239 "nfs4frlock_check_deleg: os_failed_reopen set "
13240 "for osp %p, cr %p, rp %s", (void *)osp,
13241 (void *)cr, rnode4info(rp)));
13242 mutex_exit(&osp->os_sync_lock);
13243 open_stream_rele(osp, rp);
13244 open_owner_rele(oop);
13245 ep->stat = NFS4ERR_IO;
13246 return;
13247 }
13248
13249 /*
13250 * Determine whether a reopen is needed. If this
13251 * is a delegation open stream, then send the open
13252 * to the server to give visibility to the open owner.
13253 * Even if it isn't a delegation open stream, we need
13254 * to check if the previous open CLAIM_DELEGATE_CUR
13255 * was sufficient.
13256 */
13257
13258 reopen_needed = osp->os_delegation ||
13259 ((lt == F_RDLCK &&
13260 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
13261 (lt == F_WRLCK &&
13262 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));
13263
13264 mutex_exit(&osp->os_sync_lock);
13265 open_owner_rele(oop);
13266
13267 if (reopen_needed) {
13268 /*
13269 * Always use CLAIM_PREVIOUS after server reboot.
13270 * The server will reject CLAIM_DELEGATE_CUR if
13271 * it is used during the grace period.
13272 */
13273 mutex_enter(&mi->mi_lock);
13274 if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
13275 oclaim = CLAIM_PREVIOUS;
13276 force = TRUE;
13277 } else {
13278 oclaim = CLAIM_DELEGATE_CUR;
13279 force = FALSE;
13280 }
13281 mutex_exit(&mi->mi_lock);
13282
13283 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
13284 if (ep->error == EAGAIN) {
13285 nfs4_error_zinit(ep);
13286 ep->stat = NFS4ERR_DELAY;
13287 }
13288 }
13289 open_stream_rele(osp, rp);
13290 osp = NULL;
13291 }
13292 }
13293
13294 /*
13295 * Setup the LOCKU4 arguments.
13296 * Returns errors via the nfs4_error_t.
13297 * NFS4_OK no problems. *go_otwp is TRUE if call should go
13298 * over-the-wire. The caller must release the
13299 * reference on *lopp.
13300 * NFS4ERR_DELAY caller should retry (like recovery retry)
13301 * (other) unrecoverable error.
13302 */
13303 static void
13304 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13305 LOCKU4args **locku_argsp, flock64_t *flk,
13306 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
13307 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
13308 bool_t *skip_get_err, bool_t *go_otwp)
13309 {
13310 nfs4_lock_owner_t *lop = NULL;
13311 LOCKU4args *locku_args;
13312 pid_t pid;
13313 bool_t is_spec = FALSE;
13314 rnode4_t *rp = VTOR4(vp);
13315
13316 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13317 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13318
13319 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
13320 if (ep->error || ep->stat)
13321 return;
13322
13323 argop->argop = OP_LOCKU;
13324 if (ctype == NFS4_LCK_CTYPE_REINSTATE)
13325 argsp->ctag = TAG_LOCKU_REINSTATE;
13326 else
13327 argsp->ctag = TAG_LOCKU;
13328 locku_args = &argop->nfs_argop4_u.oplocku;
13329 *locku_argsp = locku_args;
13330
13331 /*
13332 * XXX what should locku_args->locktype be?
13333 * setting to ALWAYS be READ_LT so at least
13334 * it is a valid locktype.
13335 */
13336
13337 locku_args->locktype = READ_LT;
13338
13339 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13340 flk->l_pid;
13341
13342 /*
13343 * Get the lock owner stateid. If no lock owner
13344 * exists, return success.
13345 */
13346 lop = find_lock_owner(rp, pid, LOWN_ANY);
13347 *lopp = lop;
13348 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
13349 is_spec = TRUE;
13350 if (!lop || is_spec) {
13351 /*
13352 * No lock owner so no locks to unlock.
13353 * Return success. If there was a failed
13354 * reclaim earlier, the lock might still be
13355 * registered with the local locking code,
13356 * so notify it of the unlock.
13357 *
13358 * If the lockowner is using a special stateid,
13359 * then the original lock request (that created
13360 * this lockowner) was never successful, so we
13361 * have no lock to undo OTW.
13362 */
13363 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13364 "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
13365 "(%ld) so return success", (long)pid));
13366
13367 if (ctype == NFS4_LCK_CTYPE_NORM)
13368 flk->l_pid = curproc->p_pid;
13369 nfs4_register_lock_locally(vp, flk, flag, offset);
13370 /*
13371 * Release our hold and NULL out so final_cleanup
13372 * doesn't try to end a lock seqid sync we
13373 * never started.
13374 */
13375 if (is_spec) {
13376 lock_owner_rele(lop);
13377 *lopp = NULL;
13378 }
13379 *skip_get_err = TRUE;
13380 *go_otwp = FALSE;
13381 return;
13382 }
13383
13384 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
13385 if (ep->error == EAGAIN) {
13386 lock_owner_rele(lop);
13387 *lopp = NULL;
13388 return;
13389 }
13390
13391 mutex_enter(&lop->lo_lock);
13392 locku_args->lock_stateid = lop->lock_stateid;
13393 mutex_exit(&lop->lo_lock);
13394 locku_args->seqid = lop->lock_seqid + 1;
13395
13396 /* leave the ref count on lop, rele after RPC call */
13397
13398 locku_args->offset = flk->l_start;
13399 locku_args->length = flk->l_len;
13400 if (flk->l_len == 0)
13401 locku_args->length = ~locku_args->length;
13402
13403 *go_otwp = TRUE;
13404 }
13405
13406 /*
13407 * Setup the LOCK4 arguments.
13408 *
13409 * Returns errors via the nfs4_error_t.
13410 * NFS4_OK no problems
13411 * NFS4ERR_DELAY caller should retry (like recovery retry)
13412 * (other) unrecoverable error
13413 */
13414 static void
13415 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
13416 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13417 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
13418 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
13419 {
13420 LOCK4args *lock_args;
13421 nfs4_open_owner_t *oop = NULL;
13422 nfs4_open_stream_t *osp = NULL;
13423 nfs4_lock_owner_t *lop = NULL;
13424 pid_t pid;
13425 rnode4_t *rp = VTOR4(vp);
13426
13427 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13428
13429 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
13430 if (ep->error || ep->stat != NFS4_OK)
13431 return;
13432
13433 argop->argop = OP_LOCK;
13434 if (ctype == NFS4_LCK_CTYPE_NORM)
13435 argsp->ctag = TAG_LOCK;
13436 else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
13437 argsp->ctag = TAG_RELOCK;
13438 else
13439 argsp->ctag = TAG_LOCK_REINSTATE;
13440 lock_args = &argop->nfs_argop4_u.oplock;
13441 lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
13442 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
13443 /*
13444 * Get the lock owner. If no lock owner exists,
13445 * create a 'temporary' one and grab the open seqid
13446 * synchronization (which puts a hold on the open
13447 * owner and open stream).
13448 * This also grabs the lock seqid synchronization.
13449 */
13450 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
13451 ep->stat =
13452 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);
13453
13454 if (ep->stat != NFS4_OK)
13455 goto out;
13456
13457 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
13458 &lock_args->locker);
13459
13460 lock_args->offset = flk->l_start;
13461 lock_args->length = flk->l_len;
13462 if (flk->l_len == 0)
13463 lock_args->length = ~lock_args->length;
13464 *lock_argsp = lock_args;
13465 out:
13466 *oopp = oop;
13467 *ospp = osp;
13468 *lopp = lop;
13469 }
13470
13471 /*
13472 * After we get the reply from the server, record the proper information
13473 * for possible resend lock requests.
13474 *
13475 * Allocates memory for the saved_rqstp if we have a lost lock to save.
13476 */
13477 static void
13478 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
13479 nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
13480 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13481 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
13482 {
13483 bool_t unlock = (flk->l_type == F_UNLCK);
13484
13485 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13486 ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
13487 ctype == NFS4_LCK_CTYPE_REINSTATE);
13488
13489 if (error != 0 && !unlock) {
13490 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13491 nfs4_client_lock_debug), (CE_NOTE,
13492 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
13493 " for lop %p", (void *)lop));
13494 ASSERT(lop != NULL);
13495 mutex_enter(&lop->lo_lock);
13496 lop->lo_pending_rqsts = 1;
13497 mutex_exit(&lop->lo_lock);
13498 }
13499
13500 lost_rqstp->lr_putfirst = FALSE;
13501 lost_rqstp->lr_op = 0;
13502
13503 /*
13504 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
13505 * recovery purposes so that the lock request that was sent
13506 * can be saved and re-issued later. Ditto for EIO from a forced
13507 * unmount. This is done to have the client's local locking state
13508 * match the v4 server's state; that is, the request was
13509 * potentially received and accepted by the server but the client
13510 * thinks it was not.
13511 */
13512 if (error == ETIMEDOUT || error == EINTR ||
13513 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
13514 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13515 nfs4_client_lock_debug), (CE_NOTE,
13516 "nfs4frlock_save_lost_rqst: got a lost %s lock for "
13517 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
13518 (void *)lop, (void *)oop, (void *)osp));
13519 if (unlock)
13520 lost_rqstp->lr_op = OP_LOCKU;
13521 else {
13522 lost_rqstp->lr_op = OP_LOCK;
13523 lost_rqstp->lr_locktype = locktype;
13524 }
13525 /*
13526 * Objects are held and rele'd via the recovery code.
13527 * See nfs4_save_lost_rqst.
13528 */
13529 lost_rqstp->lr_vp = vp;
13530 lost_rqstp->lr_dvp = NULL;
13531 lost_rqstp->lr_oop = oop;
13532 lost_rqstp->lr_osp = osp;
13533 lost_rqstp->lr_lop = lop;
13534 lost_rqstp->lr_cr = cr;
13535 switch (ctype) {
13536 case NFS4_LCK_CTYPE_NORM:
13537 flk->l_pid = ttoproc(curthread)->p_pid;
13538 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
13539 break;
13540 case NFS4_LCK_CTYPE_REINSTATE:
13541 lost_rqstp->lr_putfirst = TRUE;
13542 lost_rqstp->lr_ctype = ctype;
13543 break;
13544 default:
13545 break;
13546 }
13547 lost_rqstp->lr_flk = flk;
13548 }
13549 }
13550
13551 /*
13552 * Update lop's seqid. Also update the seqid stored in a resend request,
13553 * if any. (Some recovery errors increment the seqid, and we may have to
13554 * send the resend request again.)
13555 */
13556
13557 static void
13558 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13559 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13560 {
13561 if (lock_args) {
13562 if (lock_args->locker.new_lock_owner == TRUE)
13563 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13564 else {
13565 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13566 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13567 }
13568 } else if (locku_args) {
13569 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13570 nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13571 }
13572 }
13573
13574 /*
13575 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13576 * COMPOUND4 args/res for calls that need to retry.
13577 * Switches the *cred_otwp to base_cr.
13578 */
13579 static void
13580 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
13581 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
13582 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
13583 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
13584 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
13585 {
13586 nfs4_open_owner_t *oop = *oopp;
13587 nfs4_open_stream_t *osp = *ospp;
13588 nfs4_lock_owner_t *lop = *lopp;
13589 nfs_argop4 *argop = (*argspp)->array;
13590
13591 if (*did_start_fop) {
13592 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13593 needrecov);
13594 *did_start_fop = FALSE;
13595 }
13596 ASSERT((*argspp)->array_len == 2);
13597 if (argop[1].argop == OP_LOCK)
13598 nfs4args_lock_free(&argop[1]);
13599 else if (argop[1].argop == OP_LOCKT)
13600 nfs4args_lockt_free(&argop[1]);
13601 kmem_free(argop, 2 * sizeof (nfs_argop4));
13602 if (!error)
13603 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13604 *argspp = NULL;
13605 *respp = NULL;
13606
13607 if (lop) {
13608 nfs4_end_lock_seqid_sync(lop);
13609 lock_owner_rele(lop);
13610 *lopp = NULL;
13611 }
13612
13613 /* need to free up the reference on osp for lock args */
13614 if (osp != NULL) {
13615 open_stream_rele(osp, VTOR4(vp));
13616 *ospp = NULL;
13617 }
13618
13619 /* need to free up the reference on oop for lock args */
13620 if (oop != NULL) {
13621 nfs4_end_open_seqid_sync(oop);
13622 open_owner_rele(oop);
13623 *oopp = NULL;
13624 }
13625
13626 crfree(*cred_otwp);
13627 *cred_otwp = base_cr;
13628 crhold(*cred_otwp);
13629 }
13630
13631 /*
13632 * Function to process the client's recovery for nfs4frlock.
13633 * Returns TRUE if we should retry the lock request; FALSE otherwise.
13634 *
13635 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13636 * COMPOUND4 args/res for calls that need to retry.
13637 *
13638 * Note: the rp's r_lkserlock is *not* dropped during this path.
13639 */
13640 static bool_t
13641 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
13642 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13643 LOCK4args *lock_args, LOCKU4args *locku_args,
13644 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13645 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
13646 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
13647 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
13648 {
13649 nfs4_open_owner_t *oop = *oopp;
13650 nfs4_open_stream_t *osp = *ospp;
13651 nfs4_lock_owner_t *lop = *lopp;
13652
13653 bool_t abort, retry;
13654
13655 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13656 ASSERT((*argspp) != NULL);
13657 ASSERT((*respp) != NULL);
13658 if (lock_args || locku_args)
13659 ASSERT(lop != NULL);
13660
13661 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
13662 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));
13663
13664 retry = TRUE;
13665 abort = FALSE;
13666 if (needrecov) {
13667 nfs4_bseqid_entry_t *bsep = NULL;
13668 nfs_opnum4 op;
13669
13670 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;
13671
13672 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
13673 seqid4 seqid;
13674
13675 if (lock_args) {
13676 if (lock_args->locker.new_lock_owner == TRUE)
13677 seqid = lock_args->locker.locker4_u.
13678 open_owner.open_seqid;
13679 else
13680 seqid = lock_args->locker.locker4_u.
13681 lock_owner.lock_seqid;
13682 } else if (locku_args) {
13683 seqid = locku_args->seqid;
13684 } else {
13685 seqid = 0;
13686 }
13687
13688 bsep = nfs4_create_bseqid_entry(oop, lop, vp,
13689 flk->l_pid, (*argspp)->ctag, seqid);
13690 }
13691
13692 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
13693 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
13694 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
13695 NULL, op, bsep, NULL, NULL);
13696
13697 if (bsep)
13698 kmem_free(bsep, sizeof (*bsep));
13699 }
13700
13701 /*
13702 * Return that we do not want to retry the request for 3 cases:
13703 * 1. If we received EINTR or are bailing out because of a forced
13704 * unmount, we came into this code path just for the sake of
13705 * initiating recovery, we now need to return the error.
13706 * 2. If we have aborted recovery.
13707 * 3. We received NFS4ERR_BAD_SEQID.
13708 */
13709 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
13710 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
13711 retry = FALSE;
13712
13713 if (*did_start_fop == TRUE) {
13714 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13715 needrecov);
13716 *did_start_fop = FALSE;
13717 }
13718
13719 if (retry == TRUE) {
13720 nfs_argop4 *argop;
13721
13722 argop = (*argspp)->array;
13723 ASSERT((*argspp)->array_len == 2);
13724
13725 if (argop[1].argop == OP_LOCK)
13726 nfs4args_lock_free(&argop[1]);
13727 else if (argop[1].argop == OP_LOCKT)
13728 nfs4args_lockt_free(&argop[1]);
13729 kmem_free(argop, 2 * sizeof (nfs_argop4));
13730 if (!ep->error)
13731 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13732 *respp = NULL;
13733 *argspp = NULL;
13734 }
13735
13736 if (lop != NULL) {
13737 nfs4_end_lock_seqid_sync(lop);
13738 lock_owner_rele(lop);
13739 }
13740
13741 *lopp = NULL;
13742
13743 /* need to free up the reference on osp for lock args */
13744 if (osp != NULL) {
13745 open_stream_rele(osp, rp);
13746 *ospp = NULL;
13747 }
13748
13749 /* need to free up the reference on oop for lock args */
13750 if (oop != NULL) {
13751 nfs4_end_open_seqid_sync(oop);
13752 open_owner_rele(oop);
13753 *oopp = NULL;
13754 }
13755
13756 return (retry);
13757 }
13758
13759 /*
13760 * Handles the successful reply from the server for nfs4frlock.
13761 */
13762 static void
13763 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13764 vnode_t *vp, int flag, u_offset_t offset,
13765 nfs4_lost_rqst_t *resend_rqstp)
13766 {
13767 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13768 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13769 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13770 if (ctype == NFS4_LCK_CTYPE_NORM) {
13771 flk->l_pid = ttoproc(curthread)->p_pid;
13772 /*
13773 * We do not register lost locks locally in
13774 * the 'resend' case since the user/application
13775 * doesn't think we have the lock.
13776 */
13777 ASSERT(!resend_rqstp);
13778 nfs4_register_lock_locally(vp, flk, flag, offset);
13779 }
13780 }
13781 }
13782
13783 /*
13784 * Handle the DENIED reply from the server for nfs4frlock.
13785 * Returns TRUE if we should retry the request; FALSE otherwise.
13786 *
13787 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13788 * COMPOUND4 args/res for calls that need to retry. Can also
13789 * drop and regrab the r_lkserlock.
13790 */
13791 static bool_t
13792 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13793 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13794 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13795 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13796 nfs4_recov_state_t *recov_statep, int needrecov,
13797 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13798 clock_t *tick_delayp, short *whencep, int *errorp,
13799 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13800 bool_t *skip_get_err)
13801 {
13802 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13803
13804 if (lock_args) {
13805 nfs4_open_owner_t *oop = *oopp;
13806 nfs4_open_stream_t *osp = *ospp;
13807 nfs4_lock_owner_t *lop = *lopp;
13808 int intr;
13809
13810 /*
13811 * Blocking lock needs to sleep and retry from the request.
13812 *
13813 * Do not block and wait for 'resend' or 'reinstate'
13814 * lock requests, just return the error.
13815 *
13816 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13817 */
13818 if (cmd == F_SETLKW) {
13819 rnode4_t *rp = VTOR4(vp);
13820 nfs_argop4 *argop = (*argspp)->array;
13821
13822 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13823
13824 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13825 recov_statep, needrecov);
13826 *did_start_fop = FALSE;
13827 ASSERT((*argspp)->array_len == 2);
13828 if (argop[1].argop == OP_LOCK)
13829 nfs4args_lock_free(&argop[1]);
13830 else if (argop[1].argop == OP_LOCKT)
13831 nfs4args_lockt_free(&argop[1]);
13832 kmem_free(argop, 2 * sizeof (nfs_argop4));
13833 if (*respp)
13834 (void) xdr_free(xdr_COMPOUND4res_clnt,
13835 (caddr_t)*respp);
13836 *argspp = NULL;
13837 *respp = NULL;
13838 nfs4_end_lock_seqid_sync(lop);
13839 lock_owner_rele(lop);
13840 *lopp = NULL;
13841 if (osp != NULL) {
13842 open_stream_rele(osp, rp);
13843 *ospp = NULL;
13844 }
13845 if (oop != NULL) {
13846 nfs4_end_open_seqid_sync(oop);
13847 open_owner_rele(oop);
13848 *oopp = NULL;
13849 }
13850
13851 nfs_rw_exit(&rp->r_lkserlock);
13852
13853 intr = nfs4_block_and_wait(tick_delayp, rp);
13854
13855 if (intr) {
13856 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13857 RW_WRITER, FALSE);
13858 *errorp = EINTR;
13859 return (FALSE);
13860 }
13861
13862 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13863 RW_WRITER, FALSE);
13864
13865 /*
13866 * Make sure we are still safe to lock with
13867 * regards to mmapping.
13868 */
13869 if (!nfs4_safelock(vp, flk, cr)) {
13870 *errorp = EAGAIN;
13871 return (FALSE);
13872 }
13873
13874 return (TRUE);
13875 }
13876 if (ctype == NFS4_LCK_CTYPE_NORM)
13877 *errorp = EAGAIN;
13878 *skip_get_err = TRUE;
13879 flk->l_whence = 0;
13880 *whencep = 0;
13881 return (FALSE);
13882 } else if (lockt_args) {
13883 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13884 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13885
13886 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13887 flk, lockt_args);
13888
13889 /* according to NLM code */
13890 *errorp = 0;
13891 *whencep = 0;
13892 *skip_get_err = TRUE;
13893 return (FALSE);
13894 }
13895 return (FALSE);
13896 }
13897
13898 /*
13899 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13900 */
13901 static void
13902 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13903 {
13904 switch (resp->status) {
13905 case NFS4ERR_ACCESS:
13906 case NFS4ERR_ADMIN_REVOKED:
13907 case NFS4ERR_BADHANDLE:
13908 case NFS4ERR_BAD_RANGE:
13909 case NFS4ERR_BAD_SEQID:
13910 case NFS4ERR_BAD_STATEID:
13911 case NFS4ERR_BADXDR:
13912 case NFS4ERR_DEADLOCK:
13913 case NFS4ERR_DELAY:
13914 case NFS4ERR_EXPIRED:
13915 case NFS4ERR_FHEXPIRED:
13916 case NFS4ERR_GRACE:
13917 case NFS4ERR_INVAL:
13918 case NFS4ERR_ISDIR:
13919 case NFS4ERR_LEASE_MOVED:
13920 case NFS4ERR_LOCK_NOTSUPP:
13921 case NFS4ERR_LOCK_RANGE:
13922 case NFS4ERR_MOVED:
13923 case NFS4ERR_NOFILEHANDLE:
13924 case NFS4ERR_NO_GRACE:
13925 case NFS4ERR_OLD_STATEID:
13926 case NFS4ERR_OPENMODE:
13927 case NFS4ERR_RECLAIM_BAD:
13928 case NFS4ERR_RECLAIM_CONFLICT:
13929 case NFS4ERR_RESOURCE:
13930 case NFS4ERR_SERVERFAULT:
13931 case NFS4ERR_STALE:
13932 case NFS4ERR_STALE_CLIENTID:
13933 case NFS4ERR_STALE_STATEID:
13934 return;
13935 default:
13936 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13937 "nfs4frlock_results_default: got unrecognizable "
13938 "res.status %d", resp->status));
13939 *errorp = NFS4ERR_INVAL;
13940 }
13941 }
13942
13943 /*
13944 * The lock request was successful, so update the client's state.
13945 */
13946 static void
13947 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13948 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13949 vnode_t *vp, flock64_t *flk, cred_t *cr,
13950 nfs4_lost_rqst_t *resend_rqstp)
13951 {
13952 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13953
13954 if (lock_args) {
13955 LOCK4res *lock_res;
13956
13957 lock_res = &resop->nfs_resop4_u.oplock;
13958 /* update the stateid with server's response */
13959
13960 if (lock_args->locker.new_lock_owner == TRUE) {
13961 mutex_enter(&lop->lo_lock);
13962 lop->lo_just_created = NFS4_PERM_CREATED;
13963 mutex_exit(&lop->lo_lock);
13964 }
13965
13966 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13967
13968 /*
13969 * If the lock was the result of a resending a lost
13970 * request, we've synched up the stateid and seqid
13971 * with the server, but now the server might be out of sync
13972 * with what the application thinks it has for locks.
13973 * Clean that up here. It's unclear whether we should do
13974 * this even if the filesystem has been forcibly unmounted.
13975 * For most servers, it's probably wasted effort, but
13976 * RFC3530 lets servers require that unlocks exactly match
13977 * the locks that are held.
13978 */
13979 if (resend_rqstp != NULL &&
13980 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13981 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13982 } else {
13983 flk->l_whence = 0;
13984 }
13985 } else if (locku_args) {
13986 LOCKU4res *locku_res;
13987
13988 locku_res = &resop->nfs_resop4_u.oplocku;
13989
13990 /* Update the stateid with the server's response */
13991 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
13992 } else if (lockt_args) {
13993 /* Switch the lock type to express success, see fcntl */
13994 flk->l_type = F_UNLCK;
13995 flk->l_whence = 0;
13996 }
13997 }
13998
13999 /*
14000 * Do final cleanup before exiting nfs4frlock.
14001 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14002 * COMPOUND4 args/res for calls that haven't already.
14003 */
14004 static void
14005 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14006 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14007 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14008 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14009 short whence, u_offset_t offset, struct lm_sysid *ls,
14010 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14011 bool_t did_start_fop, bool_t skip_get_err,
14012 cred_t *cred_otw, cred_t *cred)
14013 {
14014 mntinfo4_t *mi = VTOMI4(vp);
14015 rnode4_t *rp = VTOR4(vp);
14016 int error = *errorp;
14017 nfs_argop4 *argop;
14018 int do_flush_pages = 0;
14019
14020 ASSERT(nfs_zone() == mi->mi_zone);
14021 /*
14022 * The client recovery code wants the raw status information,
14023 * so don't map the NFS status code to an errno value for
14024 * non-normal call types.
14025 */
14026 if (ctype == NFS4_LCK_CTYPE_NORM) {
14027 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14028 *errorp = geterrno4(resp->status);
14029 if (did_start_fop == TRUE)
14030 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14031 needrecov);
14032
14033 /*
14034 * We've established a new lock on the server, so invalidate
14035 * the pages associated with the vnode to get the most up to
14036 * date pages from the server after acquiring the lock. We
14037 * want to be sure that the read operation gets the newest data.
14038 * N.B.
14039 * We used to do this in nfs4frlock_results_ok but that doesn't
14040 * work since VOP_PUTPAGE can call nfs4_commit which calls
14041 * nfs4_start_fop. We flush the pages below after calling
14042 * nfs4_end_fop above
14043 * The flush of the page cache must be done after
14044 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14045 */
14046 if (!error && resp && resp->status == NFS4_OK)
14047 do_flush_pages = 1;
14048 }
14049 if (argsp) {
14050 ASSERT(argsp->array_len == 2);
14051 argop = argsp->array;
14052 if (argop[1].argop == OP_LOCK)
14053 nfs4args_lock_free(&argop[1]);
14054 else if (argop[1].argop == OP_LOCKT)
14055 nfs4args_lockt_free(&argop[1]);
14056 kmem_free(argop, 2 * sizeof (nfs_argop4));
14057 if (resp)
14058 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14059 }
14060
14061 /* free the reference on the lock owner */
14062 if (lop != NULL) {
14063 nfs4_end_lock_seqid_sync(lop);
14064 lock_owner_rele(lop);
14065 }
14066
14067 /* need to free up the reference on osp for lock args */
14068 if (osp != NULL)
14069 open_stream_rele(osp, rp);
14070
14071 /* need to free up the reference on oop for lock args */
14072 if (oop != NULL) {
14073 nfs4_end_open_seqid_sync(oop);
14074 open_owner_rele(oop);
14075 }
14076
14077 if (do_flush_pages)
14078 nfs4_flush_pages(vp, cred);
14079
14080 (void) convoff(vp, flk, whence, offset);
14081
14082 lm_rel_sysid(ls);
14083
14084 /*
14085 * Record debug information in the event we get EINVAL.
14086 */
14087 mutex_enter(&mi->mi_lock);
14088 if (*errorp == EINVAL && (lock_args || locku_args) &&
14089 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14090 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14091 zcmn_err(getzoneid(), CE_NOTE,
14092 "%s operation failed with "
14093 "EINVAL probably since the server, %s,"
14094 " doesn't support POSIX style locking",
14095 lock_args ? "LOCK" : "LOCKU",
14096 mi->mi_curr_serv->sv_hostname);
14097 mi->mi_flags |= MI4_LOCK_DEBUG;
14098 }
14099 }
14100 mutex_exit(&mi->mi_lock);
14101
14102 if (cred_otw)
14103 crfree(cred_otw);
14104 }
14105
/*
 * This calls the server and the local locking code.
 *
 * Client locks are registered locally by oring the sysid with
 * LM_SYSID_CLIENT.  The server registers locks locally using just the sysid.
 * We need to distinguish between the two to avoid collision in case one
 * machine is used as both client and server.
 *
 * Blocking lock requests will continually retry to acquire the lock
 * forever.
 *
 * The ctype is defined as follows:
 * NFS4_LCK_CTYPE_NORM: normal lock request.
 *
 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
 * recovery, get the pid from flk instead of curproc, and don't reregister
 * the lock locally.
 *
 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
 * that we will use the information passed in via resend_rqstp to setup the
 * lock/locku request.  This resend is the exact same request as the 'lost
 * lock', and is initiated by the recovery framework. A successful resend
 * request can initiate one or more reinstate requests.
 *
 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
 * does not trigger additional reinstate requests.  This lock call type is
 * set for setting the v4 server's locking state back to match what the
 * client's local locking state is in the event of a received 'lost lock'.
 *
 * Other parameters:
 * resend_rqstp: only consulted for the RESEND/REINSTATE call types; it is
 * asserted to be NULL for the other call types.
 *
 * did_reclaimp: optional (may be NULL).  Set to 0 when a reclaim LOCK is
 * re-issued as a non-reclaim LOCK after the server answered
 * NFS4ERR_NO_GRACE.
 *
 * Errors are returned via the nfs4_error_t parameter.  An EINTR on an
 * unlock (F_SETLK/F_SETLKW with F_UNLCK) is reported as success.
 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay; /* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly; /* just queue request */
	int frc_no_reclaim = 0;	/* set once a reclaim got NFS4ERR_NO_GRACE */
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	/*
	 * Failures in these two steps return directly; the common cleanup
	 * at 'out' is only used once nfs4frlock_pre_setup() has run.
	 */
	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Every retry path (credential swap, recovery, denied blocking lock,
	 * NFS4ERR_DELAY during setup, NFS4ERR_NO_GRACE on reclaim) re-enters
	 * here, so the per-call state is re-initialized on each attempt.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		/*
		 * EINTR is converted to success at the bottom of this
		 * function for unlock requests.
		 */
		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			/* drop the reference here; 'out' must not see it */
			lock_owner_rele(lop);
			lop = NULL;
		}
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		/* cleared by the LOCKU setup when no OTW call is needed */
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * End the fop and free the argument array
				 * by hand before starting over; argsp is
				 * cleared because the array it refers to
				 * has just been freed.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A previous attempt at reclaiming got NFS4ERR_NO_GRACE: send this
	 * one as an ordinary (non-reclaim) LOCK and tell the caller.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* the references must already have been dropped */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set. Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		/* array[0] is the PUTFH result; array[1] is the lock op */
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14488
14489 /*
14490 * nfs4_safelock:
14491 *
14492 * Return non-zero if the given lock request can be handled without
14493 * violating the constraints on concurrent mapping and locking.
14494 */
14495
14496 static int
14497 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14498 {
14499 rnode4_t *rp = VTOR4(vp);
14500 struct vattr va;
14501 int error;
14502
14503 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14504 ASSERT(rp->r_mapcnt >= 0);
14505 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14506 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14507 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14508 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14509
14510 if (rp->r_mapcnt == 0)
14511 return (1); /* always safe if not mapped */
14512
14513 /*
14514 * If the file is already mapped and there are locks, then they
14515 * should be all safe locks. So adding or removing a lock is safe
14516 * as long as the new request is safe (i.e., whole-file, meaning
14517 * length and starting offset are both zero).
14518 */
14519
14520 if (bfp->l_start != 0 || bfp->l_len != 0) {
14521 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14522 "cannot lock a memory mapped file unless locking the "
14523 "entire file: start %"PRIx64", len %"PRIx64,
14524 bfp->l_start, bfp->l_len));
14525 return (0);
14526 }
14527
14528 /* mandatory locking and mapping don't mix */
14529 va.va_mask = AT_MODE;
14530 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14531 if (error != 0) {
14532 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14533 "getattr error %d", error));
14534 return (0); /* treat errors conservatively */
14535 }
14536 if (MANDLOCK(vp, va.va_mode)) {
14537 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14538 "cannot mandatory lock and mmap a file"));
14539 return (0);
14540 }
14541
14542 return (1);
14543 }
14544
14545
14546 /*
14547 * Register the lock locally within Solaris.
14548 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14549 * recording locks locally.
14550 *
14551 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14552 * are registered locally.
14553 */
14554 void
14555 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
14556 u_offset_t offset)
14557 {
14558 int oldsysid;
14559 int error;
14560 #ifdef DEBUG
14561 char *name;
14562 #endif
14563
14564 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14565
14566 #ifdef DEBUG
14567 name = fn_name(VTOSV(vp)->sv_name);
14568 NFS4_DEBUG(nfs4_client_lock_debug,
14569 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
14570 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
14571 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
14572 flk->l_sysid));
14573 kmem_free(name, MAXNAMELEN);
14574 #endif
14575
14576 /* register the lock with local locking */
14577 oldsysid = flk->l_sysid;
14578 flk->l_sysid |= LM_SYSID_CLIENT;
14579 error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
14580 #ifdef DEBUG
14581 if (error != 0) {
14582 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14583 "nfs4_register_lock_locally: could not register with"
14584 " local locking"));
14585 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14586 "error %d, vp 0x%p, pid %d, sysid 0x%x",
14587 error, (void *)vp, flk->l_pid, flk->l_sysid));
14588 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14589 "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
14590 flk->l_type, flk->l_start, flk->l_len));
14591 (void) reclock(vp, flk, 0, flag, offset, NULL);
14592 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14593 "blocked by pid %d sysid 0x%x type %d "
14594 "off 0x%" PRIx64 " len 0x%" PRIx64,
14595 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
14596 flk->l_len));
14597 }
14598 #endif
14599 flk->l_sysid = oldsysid;
14600 }
14601
14602 /*
14603 * nfs4_lockrelease:
14604 *
14605 * Release any locks on the given vnode that are held by the current
14606 * process. Also removes the lock owner (if one exists) from the rnode's
14607 * list.
14608 */
14609 static int
14610 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
14611 {
14612 flock64_t ld;
14613 int ret, error;
14614 rnode4_t *rp;
14615 nfs4_lock_owner_t *lop;
14616 nfs4_recov_state_t recov_state;
14617 mntinfo4_t *mi;
14618 bool_t possible_orphan = FALSE;
14619 bool_t recovonly;
14620
14621 ASSERT((uintptr_t)vp > KERNELBASE);
14622 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14623
14624 rp = VTOR4(vp);
14625 mi = VTOMI4(vp);
14626
14627 /*
14628 * If we have not locked anything then we can
14629 * just return since we have no work to do.
14630 */
14631 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
14632 return (0);
14633 }
14634
14635 /*
14636 * We need to comprehend that another thread may
14637 * kick off recovery and the lock_owner we have stashed
14638 * in lop might be invalid so we should NOT cache it
14639 * locally!
14640 */
14641 recov_state.rs_flags = 0;
14642 recov_state.rs_num_retry_despite_err = 0;
14643 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14644 &recovonly);
14645 if (error) {
14646 mutex_enter(&rp->r_statelock);
14647 rp->r_flags |= R4LODANGLERS;
14648 mutex_exit(&rp->r_statelock);
14649 return (error);
14650 }
14651
14652 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14653
14654 /*
14655 * Check if the lock owner might have a lock (request was sent but
14656 * no response was received). Also check if there are any remote
14657 * locks on the file. (In theory we shouldn't have to make this
14658 * second check if there's no lock owner, but for now we'll be
14659 * conservative and do it anyway.) If either condition is true,
14660 * send an unlock for the entire file to the server.
14661 *
14662 * Note that no explicit synchronization is needed here. At worst,
14663 * flk_has_remote_locks() will return a false positive, in which case
14664 * the unlock call wastes time but doesn't harm correctness.
14665 */
14666
14667 if (lop) {
14668 mutex_enter(&lop->lo_lock);
14669 possible_orphan = lop->lo_pending_rqsts;
14670 mutex_exit(&lop->lo_lock);
14671 lock_owner_rele(lop);
14672 }
14673
14674 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14675
14676 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14677 "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
14678 "lop %p.", possible_orphan, flk_has_remote_locks(vp),
14679 (void *)lop));
14680
14681 if (possible_orphan || flk_has_remote_locks(vp)) {
14682 ld.l_type = F_UNLCK; /* set to unlock entire file */
14683 ld.l_whence = 0; /* unlock from start of file */
14684 ld.l_start = 0;
14685 ld.l_len = 0; /* do entire file */
14686
14687 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
14688 cr, NULL);
14689
14690 if (ret != 0) {
14691 /*
14692 * If VOP_FRLOCK fails, make sure we unregister
14693 * local locks before we continue.
14694 */
14695 ld.l_pid = ttoproc(curthread)->p_pid;
14696 nfs4_register_lock_locally(vp, &ld, flag, offset);
14697 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14698 "nfs4_lockrelease: lock release error on vp"
14699 " %p: error %d.\n", (void *)vp, ret));
14700 }
14701 }
14702
14703 recov_state.rs_flags = 0;
14704 recov_state.rs_num_retry_despite_err = 0;
14705 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14706 &recovonly);
14707 if (error) {
14708 mutex_enter(&rp->r_statelock);
14709 rp->r_flags |= R4LODANGLERS;
14710 mutex_exit(&rp->r_statelock);
14711 return (error);
14712 }
14713
14714 /*
14715 * So, here we're going to need to retrieve the lock-owner
14716 * again (in case recovery has done a switch-a-roo) and
14717 * remove it because we can.
14718 */
14719 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14720
14721 if (lop) {
14722 nfs4_rnode_remove_lock_owner(rp, lop);
14723 lock_owner_rele(lop);
14724 }
14725
14726 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14727 return (0);
14728 }
14729
14730 /*
14731 * Wait for 'tick_delay' clock ticks.
14732 * Implement exponential backoff until hit the lease_time of this nfs4_server.
14733 * NOTE: lock_lease_time is in seconds.
14734 *
14735 * XXX For future improvements, should implement a waiting queue scheme.
14736 */
14737 static int
14738 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14739 {
14740 long milliseconds_delay;
14741 time_t lock_lease_time;
14742
14743 /* wait tick_delay clock ticks or siginteruptus */
14744 if (delay_sig(*tick_delay)) {
14745 return (EINTR);
14746 }
14747 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14748 "reissue the lock request: blocked for %ld clock ticks: %ld "
14749 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14750
14751 /* get the lease time */
14752 lock_lease_time = r2lease_time(rp);
14753
14754 /* drv_hztousec converts ticks to microseconds */
14755 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14756 if (milliseconds_delay < lock_lease_time * 1000) {
14757 *tick_delay = 2 * *tick_delay;
14758 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14759 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14760 }
14761 return (0);
14762 }
14763
14764
/*
 * Initialization entry point for this file.  There is currently nothing
 * to set up, so this is a no-op.
 */
void
nfs4_vnops_init(void)
{
	/* nothing to initialize */
}
14769
/*
 * Teardown entry point for this file.  There is currently nothing to
 * release, so this is a no-op.
 */
void
nfs4_vnops_fini(void)
{
	/* nothing to tear down */
}
14774
14775 /*
14776 * Return a reference to the directory (parent) vnode for a given vnode,
14777 * using the saved pathname information and the directory file handle. The
14778 * caller is responsible for disposing of the reference.
14779 * Returns zero or an errno value.
14780 *
14781 * Caller should set need_start_op to FALSE if it is the recovery
14782 * thread, or if a start_fop has already been done. Otherwise, TRUE.
14783 */
14784 int
14785 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
14786 {
14787 svnode_t *svnp;
14788 vnode_t *dvp = NULL;
14789 servinfo4_t *svp;
14790 nfs4_fname_t *mfname;
14791 int error;
14792
14793 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14794
14795 if (vp->v_flag & VROOT) {
14796 nfs4_sharedfh_t *sfh;
14797 nfs_fh4 fh;
14798 mntinfo4_t *mi;
14799
14800 ASSERT(vp->v_type == VREG);
14801
14802 mi = VTOMI4(vp);
14803 svp = mi->mi_curr_serv;
14804 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14805 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
14806 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
14807 sfh = sfh4_get(&fh, VTOMI4(vp));
14808 nfs_rw_exit(&svp->sv_lock);
14809 mfname = mi->mi_fname;
14810 fn_hold(mfname);
14811 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
14812 sfh4_rele(&sfh);
14813
14814 if (dvp->v_type == VNON)
14815 dvp->v_type = VDIR;
14816 *dvpp = dvp;
14817 return (0);
14818 }
14819
14820 svnp = VTOSV(vp);
14821
14822 if (svnp == NULL) {
14823 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14824 "shadow node is NULL"));
14825 return (EINVAL);
14826 }
14827
14828 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
14829 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14830 "shadow node name or dfh val == NULL"));
14831 return (EINVAL);
14832 }
14833
14834 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
14835 (int)need_start_op);
14836 if (error != 0) {
14837 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14838 "nfs4_make_dotdot returned %d", error));
14839 return (error);
14840 }
14841 if (!dvp) {
14842 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14843 "nfs4_make_dotdot returned a NULL dvp"));
14844 return (EIO);
14845 }
14846 if (dvp->v_type == VNON)
14847 dvp->v_type = VDIR;
14848 ASSERT(dvp->v_type == VDIR);
14849 if (VTOR4(vp)->r_flags & R4ISXATTR) {
14850 mutex_enter(&dvp->v_lock);
14851 dvp->v_flag |= V_XATTRDIR;
14852 mutex_exit(&dvp->v_lock);
14853 }
14854 *dvpp = dvp;
14855 return (0);
14856 }
14857
14858 /*
14859 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14860 * length that fnamep can accept, including the trailing null.
14861 * Returns 0 if okay, returns an errno value if there was a problem.
14862 */
14863
14864 int
14865 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14866 {
14867 char *fn;
14868 int err = 0;
14869 servinfo4_t *svp;
14870 svnode_t *shvp;
14871
14872 /*
14873 * If the file being opened has VROOT set, then this is
14874 * a "file" mount. sv_name will not be interesting, so
14875 * go back to the servinfo4 to get the original mount
14876 * path and strip off all but the final edge. Otherwise
14877 * just return the name from the shadow vnode.
14878 */
14879
14880 if (vp->v_flag & VROOT) {
14881
14882 svp = VTOMI4(vp)->mi_curr_serv;
14883 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14884
14885 fn = strrchr(svp->sv_path, '/');
14886 if (fn == NULL)
14887 err = EINVAL;
14888 else
14889 fn++;
14890 } else {
14891 shvp = VTOSV(vp);
14892 fn = fn_name(shvp->sv_name);
14893 }
14894
14895 if (err == 0)
14896 if (strlen(fn) < maxlen)
14897 (void) strcpy(fnamep, fn);
14898 else
14899 err = ENAMETOOLONG;
14900
14901 if (vp->v_flag & VROOT)
14902 nfs_rw_exit(&svp->sv_lock);
14903 else
14904 kmem_free(fn, MAXNAMELEN);
14905
14906 return (err);
14907 }
14908
14909 /*
14910 * Bookkeeping for a close that doesn't need to go over the wire.
14911 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14912 * it is left at 1.
14913 */
14914 void
14915 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
14916 {
14917 rnode4_t *rp;
14918 mntinfo4_t *mi;
14919
14920 mi = VTOMI4(vp);
14921 rp = VTOR4(vp);
14922
14923 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
14924 "rp=%p osp=%p", (void *)rp, (void *)osp));
14925 ASSERT(nfs_zone() == mi->mi_zone);
14926 ASSERT(mutex_owned(&osp->os_sync_lock));
14927 ASSERT(*have_lockp);
14928
14929 if (!osp->os_valid ||
14930 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
14931 return;
14932 }
14933
14934 /*
14935 * This removes the reference obtained at OPEN; ie,
14936 * when the open stream structure was created.
14937 *
14938 * We don't have to worry about calling 'open_stream_rele'
14939 * since we our currently holding a reference to this
14940 * open stream which means the count can not go to 0 with
14941 * this decrement.
14942 */
14943 ASSERT(osp->os_ref_count >= 2);
14944 osp->os_ref_count--;
14945 osp->os_valid = 0;
14946 mutex_exit(&osp->os_sync_lock);
14947 *have_lockp = 0;
14948
14949 nfs4_dec_state_ref_count(mi);
14950 }
14951
14952 /*
14953 * Close all remaining open streams on the rnode. These open streams
14954 * could be here because:
14955 * - The close attempted at either close or delmap failed
14956 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14957 * - Someone did mknod on a regular file but never opened it
14958 */
14959 int
14960 nfs4close_all(vnode_t *vp, cred_t *cr)
14961 {
14962 nfs4_open_stream_t *osp;
14963 int error;
14964 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14965 rnode4_t *rp;
14966
14967 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14968
14969 error = 0;
14970 rp = VTOR4(vp);
14971
14972 /*
14973 * At this point, all we know is that the last time
14974 * someone called vn_rele, the count was 1. Since then,
14975 * the vnode could have been re-activated. We want to
14976 * loop through the open streams and close each one, but
14977 * we have to be careful since once we release the rnode
14978 * hash bucket lock, someone else is free to come in and
14979 * re-activate the rnode and add new open streams. The
14980 * strategy is take the rnode hash bucket lock, verify that
14981 * the count is still 1, grab the open stream off the
14982 * head of the list and mark it invalid, then release the
14983 * rnode hash bucket lock and proceed with that open stream.
14984 * This is ok because nfs4close_one() will acquire the proper
14985 * open/create to close/destroy synchronization for open
14986 * streams, and will ensure that if someone has reopened
14987 * the open stream after we've dropped the hash bucket lock
14988 * then we'll just simply return without destroying the
14989 * open stream.
14990 * Repeat until the list is empty.
14991 */
14992
14993 for (;;) {
14994
14995 /* make sure vnode hasn't been reactivated */
14996 rw_enter(&rp->r_hashq->r_lock, RW_READER);
14997 mutex_enter(&vp->v_lock);
14998 if (vp->v_count > 1) {
14999 mutex_exit(&vp->v_lock);
15000 rw_exit(&rp->r_hashq->r_lock);
15001 break;
15002 }
15003 /*
15004 * Grabbing r_os_lock before releasing v_lock prevents
15005 * a window where the rnode/open stream could get
15006 * reactivated (and os_force_close set to 0) before we
15007 * had a chance to set os_force_close to 1.
15008 */
15009 mutex_enter(&rp->r_os_lock);
15010 mutex_exit(&vp->v_lock);
15011
15012 osp = list_head(&rp->r_open_streams);
15013 if (!osp) {
15014 /* nothing left to CLOSE OTW, so return */
15015 mutex_exit(&rp->r_os_lock);
15016 rw_exit(&rp->r_hashq->r_lock);
15017 break;
15018 }
15019
15020 mutex_enter(&rp->r_statev4_lock);
15021 /* the file can't still be mem mapped */
15022 ASSERT(rp->r_mapcnt == 0);
15023 if (rp->created_v4)
15024 rp->created_v4 = 0;
15025 mutex_exit(&rp->r_statev4_lock);
15026
15027 /*
15028 * Grab a ref on this open stream; nfs4close_one
15029 * will mark it as invalid
15030 */
15031 mutex_enter(&osp->os_sync_lock);
15032 osp->os_ref_count++;
15033 osp->os_force_close = 1;
15034 mutex_exit(&osp->os_sync_lock);
15035 mutex_exit(&rp->r_os_lock);
15036 rw_exit(&rp->r_hashq->r_lock);
15037
15038 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);
15039
15040 /* Update error if it isn't already non-zero */
15041 if (error == 0) {
15042 if (e.error)
15043 error = e.error;
15044 else if (e.stat)
15045 error = geterrno4(e.stat);
15046 }
15047
15048 #ifdef DEBUG
15049 nfs4close_all_cnt++;
15050 #endif
15051 /* Release the ref on osp acquired above. */
15052 open_stream_rele(osp, rp);
15053
15054 /* Proceed to the next open stream, if any */
15055 }
15056 return (error);
15057 }
15058
/*
 * nfs4close_one - close one open stream for a file if needed.
 *
 * "close_type" indicates which close path this is:
 *	CLOSE_NORM: close initiated via VOP_CLOSE.
 *	CLOSE_DELMAP: close initiated via VOP_DELMAP.
 *	CLOSE_FORCE: close initiated via VOP_INACTIVE.  This path forces
 *		the close and release of client state for this open stream
 *		(unless someone else has the open stream open).
 *	CLOSE_RESEND: indicates the request is a replay of an earlier request
 *		(e.g., due to abort because of a signal).
 *	CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
 *
 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
 * recovery.  Instead, the caller is expected to deal with retries.
 *
 * The caller can either pass in the osp ('provided_osp') or not.  When it
 * is NULL, the open owner is looked up from 'cr' and the open stream is
 * looked up from that open owner and the rnode; failure of either lookup
 * is reported as EIO.
 *
 * 'access_bits' represents the access we are closing/downgrading.
 *
 * 'lrp' is handed through to nfs4_open_downgrade() and is used to choose
 * the bad-seqid tag; it must be NULL whenever nfs4_start_fop() reports
 * 'recovonly'.
 *
 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP.  'len' is
 * the number of bytes we are unmapping, 'maxprot' is the mmap protection,
 * and 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
 *
 * Errors are returned via the nfs4_error_t ('ep'), which is zeroed on entry.
 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
    int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    nfs4_close_type_t close_type, size_t len, uint_t maxprot,
    uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	/* Not reset at 'recov_retry', so it bounds the total retries. */
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	/*
	 * Not reset at 'recov_retry': the open/mmap counts must be
	 * decremented only once no matter how many times we retry.
	 */
	int did_dec_count = 0;
	/*
	 * The remaining flags record which resources are currently held so
	 * that the retry paths and 'out' release exactly what was acquired.
	 */
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	/* Both resend flavors are treated as running in recovery context. */
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		/* Take our own hold; it is released at 'out'. */
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
recov_retry:
	/*
	 * Every retry starts over with fresh per-attempt state.  Whoever
	 * jumps here has already released the resources of the previous
	 * attempt.
	 */
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV).  If recovery is active,
	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes.  Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		/* Hold taken here is dropped at 'out' or before a retry. */
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
		 */
#ifdef NOTYET
		ASSERT(osp->os_mapcnt == 0);
#endif
		osp->os_mapcnt = 0;
	}

	if (close_type == CLOSE_DELMAP && !did_dec_count) {
		ASSERT(osp->os_mapcnt >= btopr(len));

		/* Counts are kept in pages: btopr(len) per mapping. */
		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write -= btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read -= btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read -= btopr(len);
		/* mirror the PROT_NONE check in nfs4_addmap() */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read -= btopr(len);
		osp->os_mapcnt -= btopr(len);
		did_dec_count = 1;
	}

	if (recovonly) {
		nfs4_lost_rqst_t lost_rqst;

		/* request should not already be in recovery queue */
		ASSERT(lrp == NULL);
		nfs4_error_init(ep, EINTR);
		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
		    osp, cred_otw, vp);
		mutex_exit(&osp->os_sync_lock);
		have_sync_lock = 0;
		/* Only hand over the lost request if it was filled in. */
		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
		    lost_rqst.lr_op == OP_CLOSE ?
		    &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
		close_failed = 1;
		force_close = 0;
		goto close_cleanup;
	}

	/*
	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
	 * we stopped operating on the open owner's <old oo_name, old seqid>
	 * space, which means we stopped operating on the open stream
	 * too.  So don't go OTW (as the seqid is likely bad, and the
	 * stateid could be stale, potentially triggering a false
	 * setclientid), and just clean up the client's internal state.
	 */
	if (osp->os_orig_oo_name != oop->oo_name) {
		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
		    "oo_name %" PRIx64")",
		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
		    oop->oo_name));
		close_failed = 1;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		close_failed = 1;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Skip the OTW close and go straight to state cleanup if anything
	 * above marked the close as failed: the force close path failing
	 * to obtain start_fop, a changed open owner name, or a file that
	 * failed recovery.
	 */
	if (close_failed)
		goto close_cleanup;

	/*
	 * Fifth, check to see if there are still mapped pages or other
	 * opens using this open stream.  If there are then we can't
	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
	 */
	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		nfs4_lost_rqst_t	new_lost_rqst;
		bool_t			needrecov = FALSE;
		cred_t			*odg_cred_otw = NULL;
		seqid4			open_dg_seqid = 0;

		if (osp->os_delegation) {
			/*
			 * If this open stream was never OPENed OTW then we
			 * surely can't DOWNGRADE it (especially since the
			 * osp->open_stateid is really a delegation stateid
			 * when os_delegation is 1).
			 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep, NULL, NULL);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			/*
			 * Release everything this attempt acquired before
			 * starting over at 'recov_retry'.
			 */
			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the results of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close.  Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery.  In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count can not go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		/* Swap our hold from the OTW cred to the caller's cred. */
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		/* Undo this attempt's acquisitions before trying again. */
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		/* os_pending_close is only touched under 'os_sync_lock'. */
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

out:
	/* Common exit: release whatever the flags say is still held. */
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}
15593
15594 /*
15595 * Convert information returned by the server in the LOCK4denied
15596 * structure to the form required by fcntl.
15597 */
15598 static void
15599 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15600 {
15601 nfs4_lo_name_t *lo;
15602
15603 #ifdef DEBUG
15604 if (denied_to_flk_debug) {
15605 lockt_denied_debug = lockt_denied;
15606 debug_enter("lockt_denied");
15607 }
15608 #endif
15609
15610 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15611 flk->l_whence = 0; /* aka SEEK_SET */
15612 flk->l_start = lockt_denied->offset;
15613 flk->l_len = lockt_denied->length;
15614
15615 /*
15616 * If the blocking clientid matches our client id, then we can
15617 * interpret the lockowner (since we built it). If not, then
15618 * fabricate a sysid and pid. Note that the l_sysid field
15619 * in *flk already has the local sysid.
15620 */
15621
15622 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15623
15624 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15625 lo = (nfs4_lo_name_t *)
15626 lockt_denied->owner.owner_val;
15627
15628 flk->l_pid = lo->ln_pid;
15629 } else {
15630 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15631 "denied_to_flk: bad lock owner length\n"));
15632
15633 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15634 }
15635 } else {
15636 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15637 "denied_to_flk: foreign clientid\n"));
15638
15639 /*
15640 * Construct a new sysid which should be different from
15641 * sysids of other systems.
15642 */
15643
15644 flk->l_sysid++;
15645 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15646 }
15647 }
15648
15649 static pid_t
15650 lo_to_pid(lock_owner4 *lop)
15651 {
15652 pid_t pid = 0;
15653 uchar_t *cp;
15654 int i;
15655
15656 cp = (uchar_t *)&lop->clientid;
15657
15658 for (i = 0; i < sizeof (lop->clientid); i++)
15659 pid += (pid_t)*cp++;
15660
15661 cp = (uchar_t *)lop->owner_val;
15662
15663 for (i = 0; i < lop->owner_len; i++)
15664 pid += (pid_t)*cp++;
15665
15666 return (pid);
15667 }
15668
15669 /*
15670 * Given a lock pointer, returns the length of that lock.
15671 * "end" is the last locked offset the "l_len" covers from
15672 * the start of the lock.
15673 */
15674 static off64_t
15675 lock_to_end(flock64_t *lock)
15676 {
15677 off64_t lock_end;
15678
15679 if (lock->l_len == 0)
15680 lock_end = (off64_t)MAXEND;
15681 else
15682 lock_end = lock->l_start + lock->l_len - 1;
15683
15684 return (lock_end);
15685 }
15686
15687 /*
15688 * Given the end of a lock, it will return you the length "l_len" for that lock.
15689 */
15690 static off64_t
15691 end_to_len(off64_t start, off64_t end)
15692 {
15693 off64_t lock_len;
15694
15695 ASSERT(end >= start);
15696 if (end == MAXEND)
15697 lock_len = 0;
15698 else
15699 lock_len = end - start + 1;
15700
15701 return (lock_len);
15702 }
15703
15704 /*
15705 * On given end for a lock it determines if it is the last locked offset
15706 * or not, if so keeps it as is, else adds one to return the length for
15707 * valid start.
15708 */
15709 static off64_t
15710 start_check(off64_t x)
15711 {
15712 if (x == MAXEND)
15713 return (x);
15714 else
15715 return (x + 1);
15716 }
15717
15718 /*
15719 * See if these two locks overlap, and if so return 1;
15720 * otherwise, return 0.
15721 */
15722 static int
15723 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15724 {
15725 off64_t llfp_end, curfp_end;
15726
15727 llfp_end = lock_to_end(llfp);
15728 curfp_end = lock_to_end(curfp);
15729
15730 if (((llfp_end >= curfp->l_start) &&
15731 (llfp->l_start <= curfp->l_start)) ||
15732 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15733 return (1);
15734 return (0);
15735 }
15736
15737 /*
15738 * Determine what the intersecting lock region is, and add that to the
15739 * 'nl_llpp' locklist in increasing order (by l_start).
15740 */
15741 static void
15742 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
15743 locklist_t **nl_llpp, vnode_t *vp)
15744 {
15745 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
15746 off64_t lost_flp_end, local_flp_end, len, start;
15747
15748 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));
15749
15750 if (!locks_intersect(lost_flp, local_flp))
15751 return;
15752
15753 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15754 "locks intersect"));
15755
15756 lost_flp_end = lock_to_end(lost_flp);
15757 local_flp_end = lock_to_end(local_flp);
15758
15759 /* Find the starting point of the intersecting region */
15760 if (local_flp->l_start > lost_flp->l_start)
15761 start = local_flp->l_start;
15762 else
15763 start = lost_flp->l_start;
15764
15765 /* Find the lenght of the intersecting region */
15766 if (lost_flp_end < local_flp_end)
15767 len = end_to_len(start, lost_flp_end);
15768 else
15769 len = end_to_len(start, local_flp_end);
15770
15771 /*
15772 * Prepare the flock structure for the intersection found and insert
15773 * it into the new list in increasing l_start order. This list contains
15774 * intersections of locks registered by the client with the local host
15775 * and the lost lock.
15776 * The lock type of this lock is the same as that of the local_flp.
15777 */
15778 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
15779 intersect_llp->ll_flock.l_start = start;
15780 intersect_llp->ll_flock.l_len = len;
15781 intersect_llp->ll_flock.l_type = local_flp->l_type;
15782 intersect_llp->ll_flock.l_pid = local_flp->l_pid;
15783 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
15784 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */
15785 intersect_llp->ll_vp = vp;
15786
15787 tmp_fllp = *nl_llpp;
15788 cur_fllp = NULL;
15789 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
15790 intersect_llp->ll_flock.l_start) {
15791 cur_fllp = tmp_fllp;
15792 tmp_fllp = tmp_fllp->ll_next;
15793 }
15794 if (cur_fllp == NULL) {
15795 /* first on the list */
15796 intersect_llp->ll_next = *nl_llpp;
15797 *nl_llpp = intersect_llp;
15798 } else {
15799 intersect_llp->ll_next = cur_fllp->ll_next;
15800 cur_fllp->ll_next = intersect_llp;
15801 }
15802
15803 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15804 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
15805 intersect_llp->ll_flock.l_start,
15806 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
15807 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
15808 }
15809
15810 /*
15811 * Our local locking current state is potentially different than
15812 * what the NFSv4 server thinks we have due to a lost lock that was
15813 * resent and then received. We need to reset our "NFSv4" locking
15814 * state to match the current local locking state for this pid since
15815 * that is what the user/application sees as what the world is.
15816 *
15817 * We cannot afford to drop the open/lock seqid sync since then we can
15818 * get confused about what the current local locking state "is" versus
15819 * "was".
15820 *
15821 * If we are unable to fix up the locks, we send SIGLOST to the affected
15822 * process. This is not done if the filesystem has been forcibly
15823 * unmounted, in case the process has already exited and a new process
15824 * exists with the same pid.
15825 */
15826 static void
15827 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
15828 nfs4_lock_owner_t *lop)
15829 {
15830 locklist_t *locks, *llp, *ri_llp, *tmp_llp;
15831 mntinfo4_t *mi = VTOMI4(vp);
15832 const int cmd = F_SETLK;
15833 off64_t cur_start, llp_ll_flock_end, lost_flp_end;
15834 flock64_t ul_fl;
15835
15836 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15837 "nfs4_reinstitute_local_lock_state"));
15838
15839 /*
15840 * Find active locks for this vp from the local locking code.
15841 * Scan through this list and find out the locks that intersect with
15842 * the lost lock. Once we find the lock that intersects, add the
15843 * intersection area as a new lock to a new list "ri_llp". The lock
15844 * type of the intersection region lock added to ri_llp is the same
15845 * as that found in the active lock list, "list". The intersecting
15846 * region locks are added to ri_llp in increasing l_start order.
15847 */
15848 ASSERT(nfs_zone() == mi->mi_zone);
15849
15850 locks = flk_active_locks_for_vp(vp);
15851 ri_llp = NULL;
15852
15853 for (llp = locks; llp != NULL; llp = llp->ll_next) {
15854 ASSERT(llp->ll_vp == vp);
15855 /*
15856 * Pick locks that belong to this pid/lockowner
15857 */
15858 if (llp->ll_flock.l_pid != lost_flp->l_pid)
15859 continue;
15860
15861 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
15862 }
15863
15864 /*
15865 * Now we have the list of intersections with the lost lock. These are
15866 * the locks that were/are active before the server replied to the
15867 * last/lost lock. Issue these locks to the server here. Playing these
15868 * locks to the server will re-establish aur current local locking state
15869 * with the v4 server.
15870 * If we get an error, send SIGLOST to the application for that lock.
15871 */
15872
15873 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15874 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15875 "nfs4_reinstitute_local_lock_state: need to issue "
15876 "flock: [%"PRIx64" - %"PRIx64"] : %s",
15877 llp->ll_flock.l_start,
15878 llp->ll_flock.l_start + llp->ll_flock.l_len,
15879 llp->ll_flock.l_type == F_RDLCK ? "READ" :
15880 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
15881 /*
15882 * No need to relock what we already have
15883 */
15884 if (llp->ll_flock.l_type == lost_flp->l_type)
15885 continue;
15886
15887 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
15888 }
15889
15890 /*
15891 * Now keeping the start of the lost lock as our reference parse the
15892 * newly created ri_llp locklist to find the ranges that we have locked
15893 * with the v4 server but not in the current local locking. We need
15894 * to unlock these ranges.
15895 * These ranges can also be reffered to as those ranges, where the lost
15896 * lock does not overlap with the locks in the ri_llp but are locked
15897 * since the server replied to the lost lock.
15898 */
15899 cur_start = lost_flp->l_start;
15900 lost_flp_end = lock_to_end(lost_flp);
15901
15902 ul_fl.l_type = F_UNLCK;
15903 ul_fl.l_whence = 0; /* aka SEEK_SET */
15904 ul_fl.l_sysid = lost_flp->l_sysid;
15905 ul_fl.l_pid = lost_flp->l_pid;
15906
15907 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15908 llp_ll_flock_end = lock_to_end(&llp->ll_flock);
15909
15910 if (llp->ll_flock.l_start <= cur_start) {
15911 cur_start = start_check(llp_ll_flock_end);
15912 continue;
15913 }
15914 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15915 "nfs4_reinstitute_local_lock_state: "
15916 "UNLOCK [%"PRIx64" - %"PRIx64"]",
15917 cur_start, llp->ll_flock.l_start));
15918
15919 ul_fl.l_start = cur_start;
15920 ul_fl.l_len = end_to_len(cur_start,
15921 (llp->ll_flock.l_start - 1));
15922
15923 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15924 cur_start = start_check(llp_ll_flock_end);
15925 }
15926
15927 /*
15928 * In the case where the lost lock ends after all intersecting locks,
15929 * unlock the last part of the lost lock range.
15930 */
15931 if (cur_start != start_check(lost_flp_end)) {
15932 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15933 "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
15934 "lost lock region [%"PRIx64" - %"PRIx64"]",
15935 cur_start, lost_flp->l_start + lost_flp->l_len));
15936
15937 ul_fl.l_start = cur_start;
15938 /*
15939 * Is it an to-EOF lock? if so unlock till the end
15940 */
15941 if (lost_flp->l_len == 0)
15942 ul_fl.l_len = 0;
15943 else
15944 ul_fl.l_len = start_check(lost_flp_end) - cur_start;
15945
15946 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15947 }
15948
15949 if (locks != NULL)
15950 flk_free_locklist(locks);
15951
15952 /* Free up our newly created locklist */
15953 for (llp = ri_llp; llp != NULL; ) {
15954 tmp_llp = llp->ll_next;
15955 kmem_free(llp, sizeof (locklist_t));
15956 llp = tmp_llp;
15957 }
15958
15959 /*
15960 * Now return back to the original calling nfs4frlock()
15961 * and let us naturally drop our seqid syncs.
15962 */
15963 }
15964
15965 /*
15966 * Create a lost state record for the given lock reinstantiation request
15967 * and push it onto the lost state queue.
15968 */
15969 static void
15970 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15971 nfs4_lock_owner_t *lop)
15972 {
15973 nfs4_lost_rqst_t req;
15974 nfs_lock_type4 locktype;
15975 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15976
15977 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15978
15979 locktype = flk_to_locktype(cmd, flk->l_type);
15980 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
15981 NULL, NULL, lop, flk, &req, cr, vp);
15982 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
15983 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
15984 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
15985 NULL, NULL, NULL);
15986 }