Print this page
4827 nfs4: slow file locking
4837 NFSv4 client lock retry delay upper limit should be shorter
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25 /*
26 26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 27 */
↓ open down ↓ |
27 lines elided |
↑ open up ↑ |
28 28
29 29 /*
30 30 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
31 31 * All Rights Reserved
32 32 */
33 33
34 34 /*
35 35 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
36 36 */
37 37
38 +/*
39 + * Copyright (c) 2014, STRATO AG. All rights reserved.
40 + */
41 +
38 42 #include <sys/param.h>
39 43 #include <sys/types.h>
40 44 #include <sys/systm.h>
41 45 #include <sys/cred.h>
42 46 #include <sys/time.h>
43 47 #include <sys/vnode.h>
44 48 #include <sys/vfs.h>
45 49 #include <sys/vfs_opreg.h>
46 50 #include <sys/file.h>
47 51 #include <sys/filio.h>
48 52 #include <sys/uio.h>
49 53 #include <sys/buf.h>
50 54 #include <sys/mman.h>
51 55 #include <sys/pathname.h>
52 56 #include <sys/dirent.h>
53 57 #include <sys/debug.h>
54 58 #include <sys/vmsystm.h>
55 59 #include <sys/fcntl.h>
56 60 #include <sys/flock.h>
57 61 #include <sys/swap.h>
58 62 #include <sys/errno.h>
59 63 #include <sys/strsubr.h>
60 64 #include <sys/sysmacros.h>
61 65 #include <sys/kmem.h>
62 66 #include <sys/cmn_err.h>
63 67 #include <sys/pathconf.h>
64 68 #include <sys/utsname.h>
65 69 #include <sys/dnlc.h>
66 70 #include <sys/acl.h>
67 71 #include <sys/systeminfo.h>
68 72 #include <sys/policy.h>
69 73 #include <sys/sdt.h>
70 74 #include <sys/list.h>
71 75 #include <sys/stat.h>
72 76 #include <sys/zone.h>
73 77
74 78 #include <rpc/types.h>
75 79 #include <rpc/auth.h>
76 80 #include <rpc/clnt.h>
77 81
78 82 #include <nfs/nfs.h>
79 83 #include <nfs/nfs_clnt.h>
80 84 #include <nfs/nfs_acl.h>
81 85 #include <nfs/lm.h>
82 86 #include <nfs/nfs4.h>
83 87 #include <nfs/nfs4_kprot.h>
84 88 #include <nfs/rnode4.h>
85 89 #include <nfs/nfs4_clnt.h>
86 90
87 91 #include <vm/hat.h>
88 92 #include <vm/as.h>
89 93 #include <vm/page.h>
90 94 #include <vm/pvn.h>
91 95 #include <vm/seg.h>
92 96 #include <vm/seg_map.h>
93 97 #include <vm/seg_kpm.h>
94 98 #include <vm/seg_vn.h>
95 99
96 100 #include <fs/fs_subr.h>
97 101
98 102 #include <sys/ddi.h>
99 103 #include <sys/int_fmtio.h>
100 104 #include <sys/fs/autofs.h>
101 105
102 106 typedef struct {
103 107 nfs4_ga_res_t *di_garp;
104 108 cred_t *di_cred;
105 109 hrtime_t di_time_call;
106 110 } dirattr_info_t;
107 111
108 112 typedef enum nfs4_acl_op {
109 113 NFS4_ACL_GET,
110 114 NFS4_ACL_SET
111 115 } nfs4_acl_op_t;
112 116
113 117 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);
114 118
115 119 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
116 120 char *, dirattr_info_t *);
117 121
118 122 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
119 123 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
120 124 nfs4_error_t *, int *);
121 125 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
122 126 cred_t *);
123 127 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
124 128 stable_how4 *);
125 129 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
126 130 cred_t *, bool_t, struct uio *);
127 131 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
128 132 vsecattr_t *);
129 133 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
130 134 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
131 135 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
132 136 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
133 137 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
134 138 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
135 139 int, vnode_t **, cred_t *);
136 140 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
137 141 cred_t *, int, int, enum createmode4, int);
138 142 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
139 143 caller_context_t *);
140 144 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
141 145 vnode_t *, char *, cred_t *, nfsstat4 *);
142 146 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
143 147 vnode_t *, char *, cred_t *, nfsstat4 *);
144 148 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
145 149 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
146 150 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
147 151 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
148 152 page_t *[], size_t, struct seg *, caddr_t,
149 153 enum seg_rw, cred_t *);
150 154 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
151 155 cred_t *);
152 156 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
153 157 int, cred_t *);
154 158 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
155 159 int, cred_t *);
156 160 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
157 161 static void nfs4_set_mod(vnode_t *);
158 162 static void nfs4_get_commit(vnode_t *);
159 163 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
160 164 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
161 165 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
162 166 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
163 167 cred_t *);
164 168 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
165 169 cred_t *);
166 170 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
167 171 hrtime_t, vnode_t *, cred_t *);
168 172 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
169 173 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
170 174 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
171 175 u_offset_t);
172 176 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
173 177 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
174 178 static cred_t *state_to_cred(nfs4_open_stream_t *);
175 179 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
176 180 static pid_t lo_to_pid(lock_owner4 *);
177 181 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
178 182 cred_t *, nfs4_lock_owner_t *);
179 183 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
180 184 nfs4_lock_owner_t *);
181 185 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
182 186 static void nfs4_delmap_callback(struct as *, void *, uint_t);
183 187 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
184 188 static nfs4_delmapcall_t *nfs4_init_delmapcall();
185 189 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
186 190 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
187 191 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
188 192 uid_t, gid_t, int);
189 193
190 194 /*
191 195 * Routines that implement the setting of v4 args for the misc. ops
192 196 */
193 197 static void nfs4args_lock_free(nfs_argop4 *);
194 198 static void nfs4args_lockt_free(nfs_argop4 *);
195 199 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
196 200 int, rnode4_t *, cred_t *, bitmap4, int *,
197 201 nfs4_stateid_types_t *);
198 202 static void nfs4args_setattr_free(nfs_argop4 *);
199 203 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
200 204 bitmap4);
201 205 static void nfs4args_verify_free(nfs_argop4 *);
202 206 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
203 207 WRITE4args **, nfs4_stateid_types_t *);
204 208
205 209 /*
206 210 * These are the vnode ops functions that implement the vnode interface to
207 211 * the networked file system. See more comments below at nfs4_vnodeops.
208 212 */
209 213 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
210 214 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
211 215 caller_context_t *);
212 216 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
213 217 caller_context_t *);
214 218 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
215 219 caller_context_t *);
216 220 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
217 221 caller_context_t *);
218 222 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
219 223 caller_context_t *);
220 224 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
221 225 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
222 226 caller_context_t *);
223 227 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
224 228 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
225 229 int, vnode_t **, cred_t *, int, caller_context_t *,
226 230 vsecattr_t *);
227 231 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
228 232 int);
229 233 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
230 234 caller_context_t *, int);
231 235 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
232 236 caller_context_t *, int);
233 237 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
234 238 cred_t *, caller_context_t *, int, vsecattr_t *);
235 239 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
236 240 caller_context_t *, int);
237 241 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
238 242 cred_t *, caller_context_t *, int);
239 243 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
240 244 caller_context_t *, int);
241 245 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
242 246 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
243 247 page_t *[], size_t, struct seg *, caddr_t,
244 248 enum seg_rw, cred_t *, caller_context_t *);
245 249 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
246 250 caller_context_t *);
247 251 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
248 252 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
249 253 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
250 254 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
251 255 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
252 256 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
253 257 struct flk_callback *, cred_t *, caller_context_t *);
254 258 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
255 259 cred_t *, caller_context_t *);
256 260 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
257 261 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
258 262 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
259 263 cred_t *, caller_context_t *);
260 264 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
261 265 caller_context_t *);
262 266 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
263 267 caller_context_t *);
264 268 /*
265 269 * These vnode ops are required to be called from outside this source file,
266 270 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
267 271 * as static.
268 272 */
269 273 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
270 274 caller_context_t *);
271 275 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
272 276 int nfs4_lookup(vnode_t *, char *, vnode_t **,
273 277 struct pathname *, int, vnode_t *, cred_t *,
274 278 caller_context_t *, int *, pathname_t *);
275 279 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
276 280 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
277 281 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
278 282 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
279 283 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
280 284 caller_context_t *);
281 285 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
282 286 caller_context_t *);
↓ open down ↓ |
235 lines elided |
↑ open up ↑ |
283 287 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
284 288 caller_context_t *);
285 289
286 290 /*
287 291 * Used for nfs4_commit_vp() to indicate if we should
288 292 * wait on pending writes.
289 293 */
290 294 #define NFS4_WRITE_NOWAIT 0
291 295 #define NFS4_WRITE_WAIT 1
292 296
293 -#define NFS4_BASE_WAIT_TIME 1 /* 1 second */
294 -
295 297 /*
296 298 * Error flags used to pass information about certain special errors
297 299 * which need to be handled specially.
298 300 */
299 301 #define NFS_EOF -98
300 302 #define NFS_VERF_MISMATCH -97
301 303
302 304 /*
303 305 * Flags used to differentiate between which operation drove the
304 306 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
305 307 */
306 308 #define NFS4_CLOSE_OP 0x1
307 309 #define NFS4_DELMAP_OP 0x2
308 310 #define NFS4_INACTIVE_OP 0x3
309 311
310 312 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
311 313
312 314 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
313 315 #define ALIGN64(x, ptr, sz) \
314 316 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \
315 317 if (x) { \
316 318 x = sizeof (uint64_t) - (x); \
317 319 sz -= (x); \
318 320 ptr += (x); \
319 321 }
320 322
321 323 #ifdef DEBUG
322 324 int nfs4_client_attr_debug = 0;
323 325 int nfs4_client_state_debug = 0;
324 326 int nfs4_client_shadow_debug = 0;
325 327 int nfs4_client_lock_debug = 0;
326 328 int nfs4_seqid_sync = 0;
327 329 int nfs4_client_map_debug = 0;
328 330 static int nfs4_pageio_debug = 0;
329 331 int nfs4_client_inactive_debug = 0;
330 332 int nfs4_client_recov_debug = 0;
331 333 int nfs4_client_failover_debug = 0;
332 334 int nfs4_client_call_debug = 0;
333 335 int nfs4_client_lookup_debug = 0;
334 336 int nfs4_client_zone_debug = 0;
335 337 int nfs4_lost_rqst_debug = 0;
336 338 int nfs4_rdattrerr_debug = 0;
337 339 int nfs4_open_stream_debug = 0;
338 340
339 341 int nfs4read_error_inject;
340 342
341 343 static int nfs4_create_misses = 0;
342 344
343 345 static int nfs4_readdir_cache_shorts = 0;
344 346 static int nfs4_readdir_readahead = 0;
345 347
346 348 static int nfs4_bio_do_stop = 0;
347 349
348 350 static int nfs4_lostpage = 0; /* number of times we lost original page */
349 351
350 352 int nfs4_mmap_debug = 0;
351 353
352 354 static int nfs4_pathconf_cache_hits = 0;
353 355 static int nfs4_pathconf_cache_misses = 0;
354 356
↓ open down ↓ |
50 lines elided |
↑ open up ↑ |
355 357 int nfs4close_all_cnt;
356 358 int nfs4close_one_debug = 0;
357 359 int nfs4close_notw_debug = 0;
358 360
359 361 int denied_to_flk_debug = 0;
360 362 void *lockt_denied_debug;
361 363
362 364 #endif
363 365
364 366 /*
367 + * In milliseconds. Should be less than half of the lease time or better,
368 + * less than one second.
369 + */
370 +int nfs4_base_wait_time = 20;
371 +
372 +/*
365 373 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
366 374 * or NFS4ERR_RESOURCE.
367 375 */
368 376 static int confirm_retry_sec = 30;
369 377
370 378 static int nfs4_lookup_neg_cache = 1;
371 379
372 380 /*
373 381 * number of pages to read ahead
374 382 * optimized for 100 base-T.
375 383 */
376 384 static int nfs4_nra = 4;
377 385
378 386 static int nfs4_do_symlink_cache = 1;
379 387
380 388 static int nfs4_pathconf_disable_cache = 0;
381 389
382 390 /*
383 391 * These are the vnode ops routines which implement the vnode interface to
384 392 * the networked file system. These routines just take their parameters,
385 393 * make them look networkish by putting the right info into interface structs,
386 394 * and then calling the appropriate remote routine(s) to do the work.
387 395 *
388 396 * Note on directory name lookup cacheing: If we detect a stale fhandle,
389 397 * we purge the directory cache relative to that vnode. This way, the
390 398 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
391 399 * more details on rnode locking.
392 400 */
393 401
/* The NFSv4 client vnodeops vector, filled in from the template below. */
struct vnodeops *nfs4_vnodeops;

/*
 * Name-to-function mapping used to construct nfs4_vnodeops.
 * Each entry pairs a VOPNAME_* with the nfs4_* implementation in this
 * file (or a generic fs_* fallback).  The table is NULL-terminated.
 */
const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
440 448
441 449 /*
442 450 * The following are subroutines and definitions to set args or get res
443 451 * for the different nfsv4 ops
444 452 */
445 453
446 454 void
447 455 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
448 456 {
449 457 int i;
450 458
451 459 for (i = 0; i < arglen; i++) {
452 460 if (argop[i].argop == OP_LOOKUP) {
453 461 kmem_free(
454 462 argop[i].nfs_argop4_u.oplookup.
455 463 objname.utf8string_val,
456 464 argop[i].nfs_argop4_u.oplookup.
457 465 objname.utf8string_len);
458 466 }
459 467 }
460 468 }
461 469
462 470 static void
463 471 nfs4args_lock_free(nfs_argop4 *argop)
464 472 {
465 473 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
466 474
467 475 if (locker->new_lock_owner == TRUE) {
468 476 open_to_lock_owner4 *open_owner;
469 477
470 478 open_owner = &locker->locker4_u.open_owner;
471 479 if (open_owner->lock_owner.owner_val != NULL) {
472 480 kmem_free(open_owner->lock_owner.owner_val,
473 481 open_owner->lock_owner.owner_len);
474 482 }
475 483 }
476 484 }
477 485
478 486 static void
479 487 nfs4args_lockt_free(nfs_argop4 *argop)
480 488 {
481 489 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
482 490
483 491 if (lowner->owner_val != NULL) {
484 492 kmem_free(lowner->owner_val, lowner->owner_len);
485 493 }
486 494 }
487 495
/*
 * Build an OP_SETATTR compound argument from the given attributes.
 *
 *	argop		- compound argument slot to fill in
 *	vap / vsap	- attributes (and optional ACL) to set
 *	flags		- passed through to vattr_to_fattr4()
 *	rp		- rnode of the target; must be non-NULL if AT_SIZE set
 *	cr		- credential used to locate open/lock state
 *	supp		- attribute bitmap the server supports
 *	error		- out: result of the vattr-to-fattr4 conversion
 *	sid_types	- stateid selection state for nfs4_get_stateid()
 *
 * On conversion failure the fattr4 is zeroed so that a later
 * nfs4args_setattr_free() is still safe to call.
 */
static void
nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
    rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
    nfs4_stateid_types_t *sid_types)
{
	fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
	mntinfo4_t *mi;

	argop->argop = OP_SETATTR;
	/*
	 * The stateid is set to 0 if client is not modifying the size
	 * and otherwise to whatever nfs4_get_stateid() returns.
	 *
	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
	 * state struct could be found for the process/file pair.  We may
	 * want to change this in the future (by OPENing the file).  See
	 * bug # 4474852.
	 */
	if (vap->va_mask & AT_SIZE) {

		ASSERT(rp != NULL);
		mi = VTOMI4(RTOV4(rp));

		argop->nfs_argop4_u.opsetattr.stateid =
		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
		    OP_SETATTR, sid_types, FALSE);
	} else {
		/* not changing the size: the special all-zero stateid */
		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
		    sizeof (stateid4));
	}

	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
	if (*error)
		bzero(attr, sizeof (*attr));
}
523 531
/* Release the fattr4 data built by nfs4args_setattr(). */
static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}
529 537
530 538 static int
531 539 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
532 540 bitmap4 supp)
533 541 {
534 542 fattr4 *attr;
535 543 int error = 0;
536 544
537 545 argop->argop = op;
538 546 switch (op) {
539 547 case OP_VERIFY:
540 548 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
541 549 break;
542 550 case OP_NVERIFY:
543 551 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
544 552 break;
545 553 default:
546 554 return (EINVAL);
547 555 }
548 556 if (!error)
549 557 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
550 558 if (error)
551 559 bzero(attr, sizeof (*attr));
552 560 return (error);
553 561 }
554 562
555 563 static void
556 564 nfs4args_verify_free(nfs_argop4 *argop)
557 565 {
558 566 switch (argop->argop) {
559 567 case OP_VERIFY:
560 568 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
561 569 break;
562 570 case OP_NVERIFY:
563 571 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
564 572 break;
565 573 default:
566 574 break;
567 575 }
568 576 }
569 577
570 578 static void
571 579 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
572 580 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
573 581 {
574 582 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
575 583 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
576 584
577 585 argop->argop = OP_WRITE;
578 586 wargs->stable = stable;
579 587 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
580 588 mi, OP_WRITE, sid_tp);
581 589 wargs->mblk = NULL;
582 590 *wargs_pp = wargs;
583 591 }
584 592
585 593 void
586 594 nfs4args_copen_free(OPEN4cargs *open_args)
587 595 {
588 596 if (open_args->owner.owner_val) {
589 597 kmem_free(open_args->owner.owner_val,
590 598 open_args->owner.owner_len);
591 599 }
592 600 if ((open_args->opentype == OPEN4_CREATE) &&
593 601 (open_args->mode != EXCLUSIVE4)) {
594 602 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
595 603 }
596 604 }
597 605
/*
 * Return the NFSv4 client vnodeops vector.
 *
 * XXX: This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}
606 614
/*
 * The OPEN operation opens a regular file.
 *
 * Non-regular files are handled locally via nfs4_open_non_reg_file();
 * everything else goes over the wire through nfs4open_otw().  Files
 * that were just CREATEd get their DNLC entry updated here, since
 * nfs4_create deliberately skipped it.  Returns 0 or an errno.
 */
/*ARGSUSED3*/
static int
nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	vnode_t *dvp = NULL;
	rnode4_t *rp, *drp;
	int error;
	int just_been_created;
	char fn[MAXNAMELEN];

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
	/* cross-zone access to this mount is not permitted */
	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
		return (EIO);
	rp = VTOR4(*vpp);

	/*
	 * Check to see if opening something besides a regular file;
	 * if so skip the OTW call
	 */
	if ((*vpp)->v_type != VREG) {
		error = nfs4_open_non_reg_file(vpp, flag, cr);
		return (error);
	}

	/*
	 * XXX - would like a check right here to know if the file is
	 * executable or not, so as to skip OTW
	 */

	/* find the parent directory vnode; held, released below */
	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
		return (error);

	drp = VTOR4(dvp);
	/* serialize against directory-level operations; interruptible */
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * See if this file has just been CREATEd.
	 * If so, clear the flag and update the dnlc, which was previously
	 * skipped in nfs4_create.
	 * XXX need better serilization on this.
	 * XXX move this into the nf4open_otw call, after we have
	 * XXX acquired the open owner seqid sync.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, fn, *vpp);
		/* This is needed so we don't bump the open ref count */
		just_been_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_been_created = 0;
	}

	/*
	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
	 * FWRITE (to drive successful setattr(size=0) after open)
	 */
	if (flag & FTRUNC)
		flag |= FWRITE;

	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
	    just_been_created);

	/* cache the name unless *vpp is the filesystem root */
	if (!error && !((*vpp)->v_flag & VROOT))
		dnlc_update(dvp, fn, *vpp);

	nfs_rw_exit(&drp->r_rwlock);

	/* release the hold from vtodv */
	VN_RELE(dvp);

	/* exchange the shadow for the master vnode, if needed */

	if (error == 0 && IS_SHADOW(*vpp, rp))
		sv_exchange(vpp);

	return (error);
}
697 705
/*
 * See if there's a "lost open" request to be saved and recovered.
 *
 * Only requests that failed with ETIMEDOUT, EINTR, or a forced-unmount
 * error are recoverable; anything else marks lr_op = 0 so the caller
 * knows there is nothing for the recovery framework to replay.  On
 * save, the OPEN arguments (share access/deny, claim, delegation
 * stateid if CLAIM_DELEGATE_CUR, and a copy of the file name) are
 * recorded in lost_rqstp.  vp/dvp are held and released by the
 * recovery code — see nfs4_save_lost_rqst.
 */
static void
nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
    nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
    vnode_t *dvp, OPEN4cargs *open_args)
{
	vfs_t *vfsp;
	char *srccfp;

	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);

	if (error != ETIMEDOUT && error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
		/* not a lost request; nothing to recover */
		lost_rqstp->lr_op = 0;
		return;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4open_save_lost_rqst: error %d", error));

	lost_rqstp->lr_op = OP_OPEN;

	/*
	 * The vp (if it is not NULL) and dvp are held and rele'd via
	 * the recovery code.  See nfs4_save_lost_rqst.
	 */
	lost_rqstp->lr_vp = vp;
	lost_rqstp->lr_dvp = dvp;
	lost_rqstp->lr_oop = oop;
	lost_rqstp->lr_osp = NULL;
	lost_rqstp->lr_lop = NULL;
	lost_rqstp->lr_cr = cr;
	lost_rqstp->lr_flk = NULL;
	lost_rqstp->lr_oacc = open_args->share_access;
	lost_rqstp->lr_odeny = open_args->share_deny;
	lost_rqstp->lr_oclaim = open_args->claim;
	if (open_args->claim == CLAIM_DELEGATE_CUR) {
		lost_rqstp->lr_ostateid =
		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
	} else {
		srccfp = open_args->open_claim4_u.cfile;
	}
	/* copy the file name so the original args can be freed */
	lost_rqstp->lr_ofile.utf8string_len = 0;
	lost_rqstp->lr_ofile.utf8string_val = NULL;
	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
	lost_rqstp->lr_putfirst = FALSE;
}
748 756
/*
 * Seconds/nanoseconds pair used to build the verifier for an
 * EXCLUSIVE4 create — presumably overlaid on the 8-byte createverf4;
 * see the `verf' local in nfs4open_otw() (confirm against its body).
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
753 761
754 762 /*
755 763 * The OPEN operation creates and/or opens a regular file
756 764 *
757 765 * ARGSUSED
758 766 */
759 767 static int
760 768 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
761 769 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
762 770 enum createmode4 createmode, int file_just_been_created)
763 771 {
764 772 rnode4_t *rp;
765 773 rnode4_t *drp = VTOR4(dvp);
766 774 vnode_t *vp = NULL;
767 775 vnode_t *vpi = *vpp;
768 776 bool_t needrecov = FALSE;
769 777
770 778 int doqueue = 1;
771 779
772 780 COMPOUND4args_clnt args;
773 781 COMPOUND4res_clnt res;
774 782 nfs_argop4 *argop;
775 783 nfs_resop4 *resop;
776 784 int argoplist_size;
777 785 int idx_open, idx_fattr;
778 786
779 787 GETFH4res *gf_res = NULL;
780 788 OPEN4res *op_res = NULL;
781 789 nfs4_ga_res_t *garp;
782 790 fattr4 *attr = NULL;
783 791 struct nfs4_excl_time verf;
784 792 bool_t did_excl_setup = FALSE;
785 793 int created_osp;
786 794
787 795 OPEN4cargs *open_args;
788 796 nfs4_open_owner_t *oop = NULL;
789 797 nfs4_open_stream_t *osp = NULL;
790 798 seqid4 seqid = 0;
791 799 bool_t retry_open = FALSE;
792 800 nfs4_recov_state_t recov_state;
793 801 nfs4_lost_rqst_t lost_rqst;
794 802 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
795 803 hrtime_t t;
796 804 int acc = 0;
797 805 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
798 806 cred_t *ncr = NULL;
799 807
800 808 nfs4_sharedfh_t *otw_sfh;
801 809 nfs4_sharedfh_t *orig_sfh;
802 810 int fh_differs = 0;
803 811 int numops, setgid_flag;
804 812 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
805 813
806 814 /*
807 815 * Make sure we properly deal with setting the right gid on
808 816 * a newly created file to reflect the parent's setgid bit
809 817 */
810 818 setgid_flag = 0;
811 819 if (create_flag && in_va) {
812 820
813 821 /*
814 822 * If there is grpid mount flag used or
815 823 * the parent's directory has the setgid bit set
816 824 * _and_ the client was able to get a valid mapping
817 825 * for the parent dir's owner_group, we want to
818 826 * append NVERIFY(owner_group == dva.va_gid) and
819 827 * SETATTR to the CREATE compound.
820 828 */
821 829 mutex_enter(&drp->r_statelock);
822 830 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
823 831 drp->r_attr.va_mode & VSGID) &&
824 832 drp->r_attr.va_gid != GID_NOBODY) {
825 833 in_va->va_mask |= AT_GID;
826 834 in_va->va_gid = drp->r_attr.va_gid;
827 835 setgid_flag = 1;
828 836 }
829 837 mutex_exit(&drp->r_statelock);
830 838 }
831 839
832 840 /*
833 841 * Normal/non-create compound:
834 842 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
835 843 *
836 844 * Open(create) compound no setgid:
837 845 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
838 846 * RESTOREFH + GETATTR
839 847 *
840 848 * Open(create) setgid:
841 849 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
842 850 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
843 851 * NVERIFY(grp) + SETATTR
844 852 */
845 853 if (setgid_flag) {
846 854 numops = 10;
847 855 idx_open = 1;
848 856 idx_fattr = 3;
849 857 } else if (create_flag) {
850 858 numops = 7;
851 859 idx_open = 2;
852 860 idx_fattr = 4;
853 861 } else {
854 862 numops = 4;
855 863 idx_open = 1;
856 864 idx_fattr = 3;
857 865 }
858 866
859 867 args.array_len = numops;
860 868 argoplist_size = numops * sizeof (nfs_argop4);
861 869 argop = kmem_alloc(argoplist_size, KM_SLEEP);
862 870
863 871 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
864 872 "open %s open flag 0x%x cred %p", file_name, open_flag,
865 873 (void *)cr));
866 874
867 875 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
868 876 if (create_flag) {
869 877 /*
870 878 * We are to create a file. Initialize the passed in vnode
871 879 * pointer.
872 880 */
873 881 vpi = NULL;
874 882 } else {
875 883 /*
876 884 * Check to see if the client owns a read delegation and is
877 885 * trying to open for write. If so, then return the delegation
878 886 * to avoid the server doing a cb_recall and returning DELAY.
879 887 * NB - we don't use the statev4_lock here because we'd have
880 888 * to drop the lock anyway and the result would be stale.
881 889 */
882 890 if ((open_flag & FWRITE) &&
883 891 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
884 892 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
885 893
886 894 /*
887 895 * If the file has a delegation, then do an access check up
888 896 	 * front. This avoids having to do an access check later after
889 897 * we've already done start_op, which could deadlock.
890 898 */
891 899 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
892 900 if (open_flag & FREAD &&
893 901 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
894 902 acc |= VREAD;
895 903 if (open_flag & FWRITE &&
896 904 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
897 905 acc |= VWRITE;
898 906 }
899 907 }
900 908
901 909 drp = VTOR4(dvp);
902 910
903 911 recov_state.rs_flags = 0;
904 912 recov_state.rs_num_retry_despite_err = 0;
905 913 cred_otw = cr;
906 914
907 915 recov_retry:
908 916 fh_differs = 0;
909 917 nfs4_error_zinit(&e);
910 918
911 919 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
912 920 if (e.error) {
913 921 if (ncr != NULL)
914 922 crfree(ncr);
915 923 kmem_free(argop, argoplist_size);
916 924 return (e.error);
917 925 }
918 926
919 927 args.ctag = TAG_OPEN;
920 928 args.array_len = numops;
921 929 args.array = argop;
922 930
923 931 /* putfh directory fh */
924 932 argop[0].argop = OP_CPUTFH;
925 933 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
926 934
927 935 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
928 936 argop[idx_open].argop = OP_COPEN;
929 937 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
930 938 open_args->claim = CLAIM_NULL;
931 939
932 940 /* name of file */
933 941 open_args->open_claim4_u.cfile = file_name;
934 942 open_args->owner.owner_len = 0;
935 943 open_args->owner.owner_val = NULL;
936 944
937 945 if (create_flag) {
938 946 /* CREATE a file */
939 947 open_args->opentype = OPEN4_CREATE;
940 948 open_args->mode = createmode;
941 949 if (createmode == EXCLUSIVE4) {
942 950 if (did_excl_setup == FALSE) {
943 951 verf.seconds = zone_get_hostid(NULL);
944 952 if (verf.seconds != 0)
945 953 verf.nseconds = newnum();
946 954 else {
947 955 timestruc_t now;
948 956
949 957 gethrestime(&now);
950 958 verf.seconds = now.tv_sec;
951 959 verf.nseconds = now.tv_nsec;
952 960 }
953 961 /*
954 962 * Since the server will use this value for the
955 963 * mtime, make sure that it can't overflow. Zero
956 964 * out the MSB. The actual value does not matter
957 965 				 * here, only its uniqueness.
958 966 */
959 967 verf.seconds &= INT32_MAX;
960 968 did_excl_setup = TRUE;
961 969 }
962 970
963 971 /* Now copy over verifier to OPEN4args. */
964 972 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
965 973 } else {
966 974 int v_error;
967 975 bitmap4 supp_attrs;
968 976 servinfo4_t *svp;
969 977
970 978 attr = &open_args->createhow4_u.createattrs;
971 979
972 980 svp = drp->r_server;
973 981 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
974 982 supp_attrs = svp->sv_supp_attrs;
975 983 nfs_rw_exit(&svp->sv_lock);
976 984
977 985 /* GUARDED4 or UNCHECKED4 */
978 986 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
979 987 supp_attrs);
980 988 if (v_error) {
981 989 bzero(attr, sizeof (*attr));
982 990 nfs4args_copen_free(open_args);
983 991 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
984 992 &recov_state, FALSE);
985 993 if (ncr != NULL)
986 994 crfree(ncr);
987 995 kmem_free(argop, argoplist_size);
988 996 return (v_error);
989 997 }
990 998 }
991 999 } else {
992 1000 /* NO CREATE */
993 1001 open_args->opentype = OPEN4_NOCREATE;
994 1002 }
995 1003
996 1004 if (recov_state.rs_sp != NULL) {
997 1005 mutex_enter(&recov_state.rs_sp->s_lock);
998 1006 open_args->owner.clientid = recov_state.rs_sp->clientid;
999 1007 mutex_exit(&recov_state.rs_sp->s_lock);
1000 1008 } else {
1001 1009 /* XXX should we just fail here? */
1002 1010 open_args->owner.clientid = 0;
1003 1011 }
1004 1012
1005 1013 /*
1006 1014 * This increments oop's ref count or creates a temporary 'just_created'
1007 1015 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1008 1016 * completes.
1009 1017 */
1010 1018 mutex_enter(&VTOMI4(dvp)->mi_lock);
1011 1019
1012 1020 /* See if a permanent or just created open owner exists */
1013 1021 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1014 1022 if (!oop) {
1015 1023 /*
1016 1024 * This open owner does not exist so create a temporary
1017 1025 * just created one.
1018 1026 */
1019 1027 oop = create_open_owner(cr, VTOMI4(dvp));
1020 1028 ASSERT(oop != NULL);
1021 1029 }
1022 1030 mutex_exit(&VTOMI4(dvp)->mi_lock);
1023 1031
1024 1032 /* this length never changes, do alloc before seqid sync */
1025 1033 open_args->owner.owner_len = sizeof (oop->oo_name);
1026 1034 open_args->owner.owner_val =
1027 1035 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1028 1036
1029 1037 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1030 1038 if (e.error == EAGAIN) {
1031 1039 open_owner_rele(oop);
1032 1040 nfs4args_copen_free(open_args);
1033 1041 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1034 1042 if (ncr != NULL) {
1035 1043 crfree(ncr);
1036 1044 ncr = NULL;
1037 1045 }
1038 1046 goto recov_retry;
1039 1047 }
1040 1048
1041 1049 /* Check to see if we need to do the OTW call */
1042 1050 if (!create_flag) {
1043 1051 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1044 1052 file_just_been_created, &e.error, acc, &recov_state)) {
1045 1053
1046 1054 /*
1047 1055 * The OTW open is not necessary. Either
1048 1056 * the open can succeed without it (eg.
1049 1057 * delegation, error == 0) or the open
1050 1058 * must fail due to an access failure
1051 1059 * (error != 0). In either case, tidy
1052 1060 * up and return.
1053 1061 */
1054 1062
1055 1063 nfs4_end_open_seqid_sync(oop);
1056 1064 open_owner_rele(oop);
1057 1065 nfs4args_copen_free(open_args);
1058 1066 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1059 1067 if (ncr != NULL)
1060 1068 crfree(ncr);
1061 1069 kmem_free(argop, argoplist_size);
1062 1070 return (e.error);
1063 1071 }
1064 1072 }
1065 1073
1066 1074 bcopy(&oop->oo_name, open_args->owner.owner_val,
1067 1075 open_args->owner.owner_len);
1068 1076
1069 1077 seqid = nfs4_get_open_seqid(oop) + 1;
1070 1078 open_args->seqid = seqid;
1071 1079 open_args->share_access = 0;
1072 1080 if (open_flag & FREAD)
1073 1081 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1074 1082 if (open_flag & FWRITE)
1075 1083 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1076 1084 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1077 1085
1078 1086
1079 1087
1080 1088 /*
1081 1089 * getfh w/sanity check for idx_open/idx_fattr
1082 1090 */
1083 1091 ASSERT((idx_open + 1) == (idx_fattr - 1));
1084 1092 argop[idx_open + 1].argop = OP_GETFH;
1085 1093
1086 1094 /* getattr */
1087 1095 argop[idx_fattr].argop = OP_GETATTR;
1088 1096 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1089 1097 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1090 1098
1091 1099 if (setgid_flag) {
1092 1100 vattr_t _v;
1093 1101 servinfo4_t *svp;
1094 1102 bitmap4 supp_attrs;
1095 1103
1096 1104 svp = drp->r_server;
1097 1105 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1098 1106 supp_attrs = svp->sv_supp_attrs;
1099 1107 nfs_rw_exit(&svp->sv_lock);
1100 1108
1101 1109 /*
1102 1110 * For setgid case, we need to:
1103 1111 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1104 1112 */
1105 1113 argop[4].argop = OP_SAVEFH;
1106 1114
1107 1115 argop[5].argop = OP_CPUTFH;
1108 1116 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1109 1117
1110 1118 argop[6].argop = OP_GETATTR;
1111 1119 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1112 1120 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1113 1121
1114 1122 argop[7].argop = OP_RESTOREFH;
1115 1123
1116 1124 /*
1117 1125 * nverify
1118 1126 */
1119 1127 _v.va_mask = AT_GID;
1120 1128 _v.va_gid = in_va->va_gid;
1121 1129 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1122 1130 supp_attrs))) {
1123 1131
1124 1132 /*
1125 1133 * setattr
1126 1134 *
1127 1135 * We _know_ we're not messing with AT_SIZE or
1128 1136 * AT_XTIME, so no need for stateid or flags.
1129 1137 * Also we specify NULL rp since we're only
1130 1138 * interested in setting owner_group attributes.
1131 1139 */
1132 1140 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1133 1141 supp_attrs, &e.error, 0);
1134 1142 if (e.error)
1135 1143 nfs4args_verify_free(&argop[8]);
1136 1144 }
1137 1145
1138 1146 if (e.error) {
1139 1147 /*
1140 1148 * XXX - Revisit the last argument to nfs4_end_op()
1141 1149 * once 5020486 is fixed.
1142 1150 */
1143 1151 nfs4_end_open_seqid_sync(oop);
1144 1152 open_owner_rele(oop);
1145 1153 nfs4args_copen_free(open_args);
1146 1154 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1147 1155 if (ncr != NULL)
1148 1156 crfree(ncr);
1149 1157 kmem_free(argop, argoplist_size);
1150 1158 return (e.error);
1151 1159 }
1152 1160 } else if (create_flag) {
1153 1161 argop[1].argop = OP_SAVEFH;
1154 1162
1155 1163 argop[5].argop = OP_RESTOREFH;
1156 1164
1157 1165 argop[6].argop = OP_GETATTR;
1158 1166 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1159 1167 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1160 1168 }
1161 1169
1162 1170 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1163 1171 "nfs4open_otw: %s call, nm %s, rp %s",
1164 1172 needrecov ? "recov" : "first", file_name,
1165 1173 rnode4info(VTOR4(dvp))));
1166 1174
1167 1175 t = gethrtime();
1168 1176
1169 1177 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1170 1178
1171 1179 if (!e.error && nfs4_need_to_bump_seqid(&res))
1172 1180 nfs4_set_open_seqid(seqid, oop, args.ctag);
1173 1181
1174 1182 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1175 1183
1176 1184 if (e.error || needrecov) {
1177 1185 bool_t abort = FALSE;
1178 1186
1179 1187 if (needrecov) {
1180 1188 nfs4_bseqid_entry_t *bsep = NULL;
1181 1189
1182 1190 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1183 1191 cred_otw, vpi, dvp, open_args);
1184 1192
1185 1193 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1186 1194 bsep = nfs4_create_bseqid_entry(oop, NULL,
1187 1195 vpi, 0, args.ctag, open_args->seqid);
1188 1196 num_bseqid_retry--;
1189 1197 }
1190 1198
1191 1199 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1192 1200 NULL, lost_rqst.lr_op == OP_OPEN ?
1193 1201 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1194 1202
1195 1203 if (bsep)
1196 1204 kmem_free(bsep, sizeof (*bsep));
1197 1205 /* give up if we keep getting BAD_SEQID */
1198 1206 if (num_bseqid_retry == 0)
1199 1207 abort = TRUE;
1200 1208 if (abort == TRUE && e.error == 0)
1201 1209 e.error = geterrno4(res.status);
1202 1210 }
1203 1211 nfs4_end_open_seqid_sync(oop);
1204 1212 open_owner_rele(oop);
1205 1213 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1206 1214 nfs4args_copen_free(open_args);
1207 1215 if (setgid_flag) {
1208 1216 nfs4args_verify_free(&argop[8]);
1209 1217 nfs4args_setattr_free(&argop[9]);
1210 1218 }
1211 1219 if (!e.error)
1212 1220 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1213 1221 if (ncr != NULL) {
1214 1222 crfree(ncr);
1215 1223 ncr = NULL;
1216 1224 }
1217 1225 if (!needrecov || abort == TRUE || e.error == EINTR ||
1218 1226 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1219 1227 kmem_free(argop, argoplist_size);
1220 1228 return (e.error);
1221 1229 }
1222 1230 goto recov_retry;
1223 1231 }
1224 1232
1225 1233 /*
1226 1234 * Will check and update lease after checking the rflag for
1227 1235 * OPEN_CONFIRM in the successful OPEN call.
1228 1236 */
1229 1237 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1230 1238
1231 1239 /*
1232 1240 * XXX what if we're crossing mount points from server1:/drp
1233 1241 * to server2:/drp/rp.
1234 1242 */
1235 1243
1236 1244 /* Signal our end of use of the open seqid */
1237 1245 nfs4_end_open_seqid_sync(oop);
1238 1246
1239 1247 /*
1240 1248 * This will destroy the open owner if it was just created,
1241 1249 * and no one else has put a reference on it.
1242 1250 */
1243 1251 open_owner_rele(oop);
1244 1252 if (create_flag && (createmode != EXCLUSIVE4) &&
1245 1253 res.status == NFS4ERR_BADOWNER)
1246 1254 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1247 1255
1248 1256 e.error = geterrno4(res.status);
1249 1257 nfs4args_copen_free(open_args);
1250 1258 if (setgid_flag) {
1251 1259 nfs4args_verify_free(&argop[8]);
1252 1260 nfs4args_setattr_free(&argop[9]);
1253 1261 }
1254 1262 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1255 1263 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1256 1264 /*
1257 1265 * If the reply is NFS4ERR_ACCESS, it may be because
1258 1266 * we are root (no root net access). If the real uid
1259 1267 * is not root, then retry with the real uid instead.
1260 1268 */
1261 1269 if (ncr != NULL) {
1262 1270 crfree(ncr);
1263 1271 ncr = NULL;
1264 1272 }
1265 1273 if (res.status == NFS4ERR_ACCESS &&
1266 1274 (ncr = crnetadjust(cred_otw)) != NULL) {
1267 1275 cred_otw = ncr;
1268 1276 goto recov_retry;
1269 1277 }
1270 1278 kmem_free(argop, argoplist_size);
1271 1279 return (e.error);
1272 1280 }
1273 1281
1274 1282 resop = &res.array[idx_open]; /* open res */
1275 1283 op_res = &resop->nfs_resop4_u.opopen;
1276 1284
1277 1285 #ifdef DEBUG
1278 1286 /*
1279 1287 * verify attrset bitmap
1280 1288 */
1281 1289 if (create_flag &&
1282 1290 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1283 1291 /* make sure attrset returned is what we asked for */
1284 1292 /* XXX Ignore this 'error' for now */
1285 1293 if (attr->attrmask != op_res->attrset)
1286 1294 /* EMPTY */;
1287 1295 }
1288 1296 #endif
1289 1297
1290 1298 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1291 1299 mutex_enter(&VTOMI4(dvp)->mi_lock);
1292 1300 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1293 1301 mutex_exit(&VTOMI4(dvp)->mi_lock);
1294 1302 }
1295 1303
1296 1304 resop = &res.array[idx_open + 1]; /* getfh res */
1297 1305 gf_res = &resop->nfs_resop4_u.opgetfh;
1298 1306
1299 1307 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1300 1308
1301 1309 /*
1302 1310 * The open stateid has been updated on the server but not
1303 1311 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1304 1312 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1305 1313 * WRITE call. That, however, will use the old stateid, so go ahead
1306 1314 	 * and update the open stateid now, before any call to makenfs4node.
1307 1315 */
1308 1316 if (vpi) {
1309 1317 nfs4_open_stream_t *tmp_osp;
1310 1318 rnode4_t *tmp_rp = VTOR4(vpi);
1311 1319
1312 1320 tmp_osp = find_open_stream(oop, tmp_rp);
1313 1321 if (tmp_osp) {
1314 1322 tmp_osp->open_stateid = op_res->stateid;
1315 1323 mutex_exit(&tmp_osp->os_sync_lock);
1316 1324 open_stream_rele(tmp_osp, tmp_rp);
1317 1325 }
1318 1326
1319 1327 /*
1320 1328 * We must determine if the file handle given by the otw open
1321 1329 * is the same as the file handle which was passed in with
1322 1330 * *vpp. This case can be reached if the file we are trying
1323 1331 * to open has been removed and another file has been created
1324 1332 * having the same file name. The passed in vnode is released
1325 1333 * later.
1326 1334 */
1327 1335 orig_sfh = VTOR4(vpi)->r_fh;
1328 1336 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1329 1337 }
1330 1338
1331 1339 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1332 1340
1333 1341 if (create_flag || fh_differs) {
1334 1342 int rnode_err = 0;
1335 1343
1336 1344 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1337 1345 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1338 1346
1339 1347 if (e.error)
1340 1348 PURGE_ATTRCACHE4(vp);
1341 1349 /*
1342 1350 * For the newly created vp case, make sure the rnode
1343 1351 * isn't bad before using it.
1344 1352 */
1345 1353 mutex_enter(&(VTOR4(vp))->r_statelock);
1346 1354 if (VTOR4(vp)->r_flags & R4RECOVERR)
1347 1355 rnode_err = EIO;
1348 1356 mutex_exit(&(VTOR4(vp))->r_statelock);
1349 1357
1350 1358 if (rnode_err) {
1351 1359 nfs4_end_open_seqid_sync(oop);
1352 1360 nfs4args_copen_free(open_args);
1353 1361 if (setgid_flag) {
1354 1362 nfs4args_verify_free(&argop[8]);
1355 1363 nfs4args_setattr_free(&argop[9]);
1356 1364 }
1357 1365 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1358 1366 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1359 1367 needrecov);
1360 1368 open_owner_rele(oop);
1361 1369 VN_RELE(vp);
1362 1370 if (ncr != NULL)
1363 1371 crfree(ncr);
1364 1372 sfh4_rele(&otw_sfh);
1365 1373 kmem_free(argop, argoplist_size);
1366 1374 return (EIO);
1367 1375 }
1368 1376 } else {
1369 1377 vp = vpi;
1370 1378 }
1371 1379 sfh4_rele(&otw_sfh);
1372 1380
1373 1381 /*
1374 1382 * It seems odd to get a full set of attrs and then not update
1375 1383 * the object's attrcache in the non-create case. Create case uses
1376 1384 * the attrs since makenfs4node checks to see if the attrs need to
1377 1385 * be updated (and then updates them). The non-create case should
1378 1386 * update attrs also.
1379 1387 */
1380 1388 if (! create_flag && ! fh_differs && !e.error) {
1381 1389 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1382 1390 }
1383 1391
1384 1392 nfs4_error_zinit(&e);
1385 1393 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1386 1394 /* This does not do recovery for vp explicitly. */
1387 1395 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1388 1396 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1389 1397
1390 1398 if (e.error || e.stat) {
1391 1399 nfs4_end_open_seqid_sync(oop);
1392 1400 nfs4args_copen_free(open_args);
1393 1401 if (setgid_flag) {
1394 1402 nfs4args_verify_free(&argop[8]);
1395 1403 nfs4args_setattr_free(&argop[9]);
1396 1404 }
1397 1405 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1398 1406 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1399 1407 needrecov);
1400 1408 open_owner_rele(oop);
1401 1409 if (create_flag || fh_differs) {
1402 1410 /* rele the makenfs4node */
1403 1411 VN_RELE(vp);
1404 1412 }
1405 1413 if (ncr != NULL) {
1406 1414 crfree(ncr);
1407 1415 ncr = NULL;
1408 1416 }
1409 1417 if (retry_open == TRUE) {
1410 1418 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1411 1419 "nfs4open_otw: retry the open since OPEN "
1412 1420 "CONFIRM failed with error %d stat %d",
1413 1421 e.error, e.stat));
1414 1422 if (create_flag && createmode == GUARDED4) {
1415 1423 NFS4_DEBUG(nfs4_client_recov_debug,
1416 1424 (CE_NOTE, "nfs4open_otw: switch "
1417 1425 "createmode from GUARDED4 to "
1418 1426 "UNCHECKED4"));
1419 1427 createmode = UNCHECKED4;
1420 1428 }
1421 1429 goto recov_retry;
1422 1430 }
1423 1431 if (!e.error) {
1424 1432 if (create_flag && (createmode != EXCLUSIVE4) &&
1425 1433 e.stat == NFS4ERR_BADOWNER)
1426 1434 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1427 1435
1428 1436 e.error = geterrno4(e.stat);
1429 1437 }
1430 1438 kmem_free(argop, argoplist_size);
1431 1439 return (e.error);
1432 1440 }
1433 1441 }
1434 1442
1435 1443 rp = VTOR4(vp);
1436 1444
1437 1445 mutex_enter(&rp->r_statev4_lock);
1438 1446 if (create_flag)
1439 1447 rp->created_v4 = 1;
1440 1448 mutex_exit(&rp->r_statev4_lock);
1441 1449
1442 1450 mutex_enter(&oop->oo_lock);
1443 1451 /* Doesn't matter if 'oo_just_created' already was set as this */
1444 1452 oop->oo_just_created = NFS4_PERM_CREATED;
1445 1453 if (oop->oo_cred_otw)
1446 1454 crfree(oop->oo_cred_otw);
1447 1455 oop->oo_cred_otw = cred_otw;
1448 1456 crhold(oop->oo_cred_otw);
1449 1457 mutex_exit(&oop->oo_lock);
1450 1458
1451 1459 /* returns with 'os_sync_lock' held */
1452 1460 osp = find_or_create_open_stream(oop, rp, &created_osp);
1453 1461 if (!osp) {
1454 1462 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1455 1463 "nfs4open_otw: failed to create an open stream"));
1456 1464 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1457 1465 "signal our end of use of the open seqid"));
1458 1466
1459 1467 nfs4_end_open_seqid_sync(oop);
1460 1468 open_owner_rele(oop);
1461 1469 nfs4args_copen_free(open_args);
1462 1470 if (setgid_flag) {
1463 1471 nfs4args_verify_free(&argop[8]);
1464 1472 nfs4args_setattr_free(&argop[9]);
1465 1473 }
1466 1474 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1467 1475 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1468 1476 if (create_flag || fh_differs)
1469 1477 VN_RELE(vp);
1470 1478 if (ncr != NULL)
1471 1479 crfree(ncr);
1472 1480
1473 1481 kmem_free(argop, argoplist_size);
1474 1482 return (EINVAL);
1475 1483
1476 1484 }
1477 1485
1478 1486 osp->open_stateid = op_res->stateid;
1479 1487
1480 1488 if (open_flag & FREAD)
1481 1489 osp->os_share_acc_read++;
1482 1490 if (open_flag & FWRITE)
1483 1491 osp->os_share_acc_write++;
1484 1492 osp->os_share_deny_none++;
1485 1493
1486 1494 /*
1487 1495 * Need to reset this bitfield for the possible case where we were
1488 1496 * going to OTW CLOSE the file, got a non-recoverable error, and before
1489 1497 * we could retry the CLOSE, OPENed the file again.
1490 1498 */
1491 1499 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1492 1500 osp->os_final_close = 0;
1493 1501 osp->os_force_close = 0;
1494 1502 #ifdef DEBUG
1495 1503 if (osp->os_failed_reopen)
1496 1504 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1497 1505 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1498 1506 (void *)osp, (void *)cr, rnode4info(rp)));
1499 1507 #endif
1500 1508 osp->os_failed_reopen = 0;
1501 1509
1502 1510 mutex_exit(&osp->os_sync_lock);
1503 1511
1504 1512 nfs4_end_open_seqid_sync(oop);
1505 1513
1506 1514 if (created_osp && recov_state.rs_sp != NULL) {
1507 1515 mutex_enter(&recov_state.rs_sp->s_lock);
1508 1516 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1509 1517 mutex_exit(&recov_state.rs_sp->s_lock);
1510 1518 }
1511 1519
1512 1520 /* get rid of our reference to find oop */
1513 1521 open_owner_rele(oop);
1514 1522
1515 1523 open_stream_rele(osp, rp);
1516 1524
1517 1525 /* accept delegation, if any */
1518 1526 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1519 1527
1520 1528 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1521 1529
1522 1530 if (createmode == EXCLUSIVE4 &&
1523 1531 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1524 1532 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1525 1533 " EXCLUSIVE4: sending a SETATTR"));
1526 1534 /*
1527 1535 * If doing an exclusive create, then generate
1528 1536 * a SETATTR to set the initial attributes.
1529 1537 * Try to set the mtime and the atime to the
1530 1538 * server's current time. It is somewhat
1531 1539 * expected that these fields will be used to
1532 1540 * store the exclusive create cookie. If not,
1533 1541 * server implementors will need to know that
1534 1542 * a SETATTR will follow an exclusive create
1535 1543 * and the cookie should be destroyed if
1536 1544 * appropriate.
1537 1545 *
1538 1546 * The AT_GID and AT_SIZE bits are turned off
1539 1547 * so that the SETATTR request will not attempt
1540 1548 * to process these. The gid will be set
1541 1549 * separately if appropriate. The size is turned
1542 1550 * off because it is assumed that a new file will
1543 1551 * be created empty and if the file wasn't empty,
1544 1552 * then the exclusive create will have failed
1545 1553 * because the file must have existed already.
1546 1554 * Therefore, no truncate operation is needed.
1547 1555 */
1548 1556 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1549 1557 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1550 1558
1551 1559 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1552 1560 if (e.error) {
1553 1561 /*
1554 1562 * Couldn't correct the attributes of
1555 1563 * the newly created file and the
1556 1564 * attributes are wrong. Remove the
1557 1565 * file and return an error to the
1558 1566 * application.
1559 1567 */
1560 1568 /* XXX will this take care of client state ? */
1561 1569 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1562 1570 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1563 1571 " remove file", e.error));
1564 1572 VN_RELE(vp);
1565 1573 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1566 1574 /*
1567 1575 * Since we've reled the vnode and removed
1568 1576 * the file we now need to return the error.
1569 1577 * At this point we don't want to update the
1570 1578 * dircaches, call nfs4_waitfor_purge_complete
1571 1579 * or set vpp to vp so we need to skip these
1572 1580 * as well.
1573 1581 */
1574 1582 goto skip_update_dircaches;
1575 1583 }
1576 1584 }
1577 1585
1578 1586 /*
1579 1587 * If we created or found the correct vnode, due to create_flag or
1580 1588 * fh_differs being set, then update directory cache attribute, readdir
1581 1589 * and dnlc caches.
1582 1590 */
1583 1591 if (create_flag || fh_differs) {
1584 1592 dirattr_info_t dinfo, *dinfop;
1585 1593
1586 1594 /*
1587 1595 * Make sure getattr succeeded before using results.
1588 1596 * note: op 7 is getattr(dir) for both flavors of
1589 1597 * open(create).
1590 1598 */
1591 1599 if (create_flag && res.status == NFS4_OK) {
1592 1600 dinfo.di_time_call = t;
1593 1601 dinfo.di_cred = cr;
1594 1602 dinfo.di_garp =
1595 1603 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1596 1604 dinfop = &dinfo;
1597 1605 } else {
1598 1606 dinfop = NULL;
1599 1607 }
1600 1608
1601 1609 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1602 1610 dinfop);
1603 1611 }
1604 1612
1605 1613 /*
1606 1614 * If the page cache for this file was flushed from actions
1607 1615 * above, it was done asynchronously and if that is true,
1608 1616 * there is a need to wait here for it to complete. This must
1609 1617 * be done outside of start_fop/end_fop.
1610 1618 */
1611 1619 (void) nfs4_waitfor_purge_complete(vp);
1612 1620
1613 1621 /*
1614 1622 * It is implicit that we are in the open case (create_flag == 0) since
1615 1623 * fh_differs can only be set to a non-zero value in the open case.
1616 1624 */
1617 1625 if (fh_differs != 0 && vpi != NULL)
1618 1626 VN_RELE(vpi);
1619 1627
1620 1628 /*
1621 1629 * Be sure to set *vpp to the correct value before returning.
1622 1630 */
1623 1631 *vpp = vp;
1624 1632
1625 1633 skip_update_dircaches:
1626 1634
1627 1635 nfs4args_copen_free(open_args);
1628 1636 if (setgid_flag) {
1629 1637 nfs4args_verify_free(&argop[8]);
1630 1638 nfs4args_setattr_free(&argop[9]);
1631 1639 }
1632 1640 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1633 1641
1634 1642 if (ncr)
1635 1643 crfree(ncr);
1636 1644 kmem_free(argop, argoplist_size);
1637 1645 return (e.error);
1638 1646 }
1639 1647
1640 1648 /*
1641 1649 * Reopen an open instance. cf. nfs4open_otw().
1642 1650 *
1643 1651 * Errors are returned by the nfs4_error_t parameter.
1644 1652 * - ep->error contains an errno value or zero.
1645 1653 * - if it is zero, ep->stat is set to an NFS status code, if any.
1646 1654 * If the file could not be reopened, but the caller should continue, the
1647 1655 * file is marked dead and no error values are returned. If the caller
1648 1656 * should stop recovering open files and start over, either the ep->error
1649 1657 * value or ep->stat will indicate an error (either something that requires
1650 1658 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1651 1659 * filehandles) may be handled silently by this routine.
1652 1660 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1653 1661 * will be started, so the caller should not do it.
1654 1662 *
1655 1663 * Gotos:
1656 1664 * - kill_file : reopen failed in such a fashion to constitute marking the
1657 1665 * file dead and setting the open stream's 'os_failed_reopen' as 1. This
1658 1666 * is for cases where recovery is not possible.
1659 1667 * - failed_reopen : same as above, except that the file has already been
1660 1668 * marked dead, so no need to do it again.
1661 1669 * - bailout : reopen failed but we are able to recover and retry the reopen -
1662 1670 * either within this function immediately or via the calling function.
1663 1671 */
1664 1672
1665 1673 void
1666 1674 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1667 1675 open_claim_type4 claim, bool_t frc_use_claim_previous,
1668 1676 bool_t is_recov)
1669 1677 {
1670 1678 COMPOUND4args_clnt args;
1671 1679 COMPOUND4res_clnt res;
1672 1680 nfs_argop4 argop[4];
1673 1681 nfs_resop4 *resop;
1674 1682 OPEN4res *op_res = NULL;
1675 1683 OPEN4cargs *open_args;
1676 1684 GETFH4res *gf_res;
1677 1685 rnode4_t *rp = VTOR4(vp);
1678 1686 int doqueue = 1;
1679 1687 cred_t *cr = NULL, *cred_otw = NULL;
1680 1688 nfs4_open_owner_t *oop = NULL;
1681 1689 seqid4 seqid;
1682 1690 nfs4_ga_res_t *garp;
1683 1691 char fn[MAXNAMELEN];
1684 1692 nfs4_recov_state_t recov = {NULL, 0};
1685 1693 nfs4_lost_rqst_t lost_rqst;
1686 1694 mntinfo4_t *mi = VTOMI4(vp);
1687 1695 bool_t abort;
1688 1696 char *failed_msg = "";
1689 1697 int fh_different;
1690 1698 hrtime_t t;
1691 1699 nfs4_bseqid_entry_t *bsep = NULL;
1692 1700
1693 1701 ASSERT(nfs4_consistent_type(vp));
1694 1702 ASSERT(nfs_zone() == mi->mi_zone);
1695 1703
1696 1704 nfs4_error_zinit(ep);
1697 1705
1698 1706 /* this is the cred used to find the open owner */
1699 1707 cr = state_to_cred(osp);
1700 1708 if (cr == NULL) {
1701 1709 failed_msg = "Couldn't reopen: no cred";
1702 1710 goto kill_file;
1703 1711 }
1704 1712 /* use this cred for OTW operations */
1705 1713 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1706 1714
1707 1715 top:
1708 1716 nfs4_error_zinit(ep);
1709 1717
1710 1718 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1711 1719 /* File system has been unmounted, quit */
1712 1720 ep->error = EIO;
1713 1721 failed_msg = "Couldn't reopen: file system has been unmounted";
1714 1722 goto kill_file;
1715 1723 }
1716 1724
1717 1725 oop = osp->os_open_owner;
1718 1726
1719 1727 ASSERT(oop != NULL);
1720 1728 if (oop == NULL) { /* be defensive in non-DEBUG */
1721 1729 failed_msg = "can't reopen: no open owner";
1722 1730 goto kill_file;
1723 1731 }
1724 1732 open_owner_hold(oop);
1725 1733
1726 1734 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1727 1735 if (ep->error) {
1728 1736 open_owner_rele(oop);
1729 1737 oop = NULL;
1730 1738 goto bailout;
1731 1739 }
1732 1740
1733 1741 /*
1734 1742 * If the rnode has a delegation and the delegation has been
1735 1743 * recovered and the server didn't request a recall and the caller
1736 1744 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1737 1745 * recovery) and the rnode hasn't been marked dead, then install
1738 1746 * the delegation stateid in the open stream. Otherwise, proceed
1739 1747 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1740 1748 */
1741 1749 mutex_enter(&rp->r_statev4_lock);
1742 1750 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1743 1751 !rp->r_deleg_return_pending &&
1744 1752 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1745 1753 !rp->r_deleg_needs_recall &&
1746 1754 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1747 1755 !(rp->r_flags & R4RECOVERR)) {
1748 1756 mutex_enter(&osp->os_sync_lock);
1749 1757 osp->os_delegation = 1;
1750 1758 osp->open_stateid = rp->r_deleg_stateid;
1751 1759 mutex_exit(&osp->os_sync_lock);
1752 1760 mutex_exit(&rp->r_statev4_lock);
1753 1761 goto bailout;
1754 1762 }
1755 1763 mutex_exit(&rp->r_statev4_lock);
1756 1764
1757 1765 /*
1758 1766 * If the file failed recovery, just quit. This failure need not
1759 1767 * affect other reopens, so don't return an error.
1760 1768 */
1761 1769 mutex_enter(&rp->r_statelock);
1762 1770 if (rp->r_flags & R4RECOVERR) {
1763 1771 mutex_exit(&rp->r_statelock);
1764 1772 ep->error = 0;
1765 1773 goto failed_reopen;
1766 1774 }
1767 1775 mutex_exit(&rp->r_statelock);
1768 1776
1769 1777 /*
1770 1778 * argop is empty here
1771 1779 *
1772 1780 * PUTFH, OPEN, GETATTR
1773 1781 */
1774 1782 args.ctag = TAG_REOPEN;
1775 1783 args.array_len = 4;
1776 1784 args.array = argop;
1777 1785
1778 1786 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1779 1787 "nfs4_reopen: file is type %d, id %s",
1780 1788 vp->v_type, rnode4info(VTOR4(vp))));
1781 1789
1782 1790 argop[0].argop = OP_CPUTFH;
1783 1791
1784 1792 if (claim != CLAIM_PREVIOUS) {
1785 1793 /*
1786 1794 * if this is a file mount then
1787 1795 * use the mntinfo parentfh
1788 1796 */
1789 1797 argop[0].nfs_argop4_u.opcputfh.sfh =
1790 1798 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1791 1799 VTOSV(vp)->sv_dfh;
1792 1800 } else {
1793 1801 /* putfh fh to reopen */
1794 1802 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1795 1803 }
1796 1804
1797 1805 argop[1].argop = OP_COPEN;
1798 1806 open_args = &argop[1].nfs_argop4_u.opcopen;
1799 1807 open_args->claim = claim;
1800 1808
1801 1809 if (claim == CLAIM_NULL) {
1802 1810
1803 1811 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1804 1812 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1805 1813 "failed for vp 0x%p for CLAIM_NULL with %m",
1806 1814 (void *)vp);
1807 1815 failed_msg = "Couldn't reopen: vtoname failed for "
1808 1816 "CLAIM_NULL";
1809 1817 /* nothing allocated yet */
1810 1818 goto kill_file;
1811 1819 }
1812 1820
1813 1821 open_args->open_claim4_u.cfile = fn;
1814 1822 } else if (claim == CLAIM_PREVIOUS) {
1815 1823
1816 1824 /*
1817 1825 * We have two cases to deal with here:
1818 1826 * 1) We're being called to reopen files in order to satisfy
1819 1827 * a lock operation request which requires us to explicitly
1820 1828 * reopen files which were opened under a delegation. If
1821 1829 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1822 1830 * that case, frc_use_claim_previous is TRUE and we must
1823 1831 * use the rnode's current delegation type (r_deleg_type).
1824 1832 * 2) We're reopening files during some form of recovery.
1825 1833 * In this case, frc_use_claim_previous is FALSE and we
1826 1834 * use the delegation type appropriate for recovery
1827 1835 * (r_deleg_needs_recovery).
1828 1836 */
1829 1837 mutex_enter(&rp->r_statev4_lock);
1830 1838 open_args->open_claim4_u.delegate_type =
1831 1839 frc_use_claim_previous ?
1832 1840 rp->r_deleg_type :
1833 1841 rp->r_deleg_needs_recovery;
1834 1842 mutex_exit(&rp->r_statev4_lock);
1835 1843
1836 1844 } else if (claim == CLAIM_DELEGATE_CUR) {
1837 1845
1838 1846 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1839 1847 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1840 1848 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1841 1849 "with %m", (void *)vp);
1842 1850 failed_msg = "Couldn't reopen: vtoname failed for "
1843 1851 "CLAIM_DELEGATE_CUR";
1844 1852 /* nothing allocated yet */
1845 1853 goto kill_file;
1846 1854 }
1847 1855
1848 1856 mutex_enter(&rp->r_statev4_lock);
1849 1857 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1850 1858 rp->r_deleg_stateid;
1851 1859 mutex_exit(&rp->r_statev4_lock);
1852 1860
1853 1861 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1854 1862 }
1855 1863 open_args->opentype = OPEN4_NOCREATE;
1856 1864 open_args->owner.clientid = mi2clientid(mi);
1857 1865 open_args->owner.owner_len = sizeof (oop->oo_name);
1858 1866 open_args->owner.owner_val =
1859 1867 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1860 1868 bcopy(&oop->oo_name, open_args->owner.owner_val,
1861 1869 open_args->owner.owner_len);
1862 1870 open_args->share_access = 0;
1863 1871 open_args->share_deny = 0;
1864 1872
1865 1873 mutex_enter(&osp->os_sync_lock);
1866 1874 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1867 1875 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1868 1876 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1869 1877 (void *)osp, (void *)rp, osp->os_share_acc_read,
1870 1878 osp->os_share_acc_write, osp->os_open_ref_count,
1871 1879 osp->os_mmap_read, osp->os_mmap_write, claim));
1872 1880
1873 1881 if (osp->os_share_acc_read || osp->os_mmap_read)
1874 1882 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1875 1883 if (osp->os_share_acc_write || osp->os_mmap_write)
1876 1884 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1877 1885 if (osp->os_share_deny_read)
1878 1886 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1879 1887 if (osp->os_share_deny_write)
1880 1888 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1881 1889 mutex_exit(&osp->os_sync_lock);
1882 1890
1883 1891 seqid = nfs4_get_open_seqid(oop) + 1;
1884 1892 open_args->seqid = seqid;
1885 1893
1886 1894 /* Construct the getfh part of the compound */
1887 1895 argop[2].argop = OP_GETFH;
1888 1896
1889 1897 /* Construct the getattr part of the compound */
1890 1898 argop[3].argop = OP_GETATTR;
1891 1899 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1892 1900 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1893 1901
1894 1902 t = gethrtime();
1895 1903
1896 1904 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1897 1905
1898 1906 if (ep->error) {
1899 1907 if (!is_recov && !frc_use_claim_previous &&
1900 1908 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1901 1909 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1902 1910 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1903 1911 cred_otw, vp, NULL, open_args);
1904 1912 abort = nfs4_start_recovery(ep,
1905 1913 VTOMI4(vp), vp, NULL, NULL,
1906 1914 lost_rqst.lr_op == OP_OPEN ?
1907 1915 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1908 1916 nfs4args_copen_free(open_args);
1909 1917 goto bailout;
1910 1918 }
1911 1919
1912 1920 nfs4args_copen_free(open_args);
1913 1921
1914 1922 if (ep->error == EACCES && cred_otw != cr) {
1915 1923 crfree(cred_otw);
1916 1924 cred_otw = cr;
1917 1925 crhold(cred_otw);
1918 1926 nfs4_end_open_seqid_sync(oop);
1919 1927 open_owner_rele(oop);
1920 1928 oop = NULL;
1921 1929 goto top;
1922 1930 }
1923 1931 if (ep->error == ETIMEDOUT)
1924 1932 goto bailout;
1925 1933 failed_msg = "Couldn't reopen: rpc error";
1926 1934 goto kill_file;
1927 1935 }
1928 1936
1929 1937 if (nfs4_need_to_bump_seqid(&res))
1930 1938 nfs4_set_open_seqid(seqid, oop, args.ctag);
1931 1939
1932 1940 switch (res.status) {
1933 1941 case NFS4_OK:
1934 1942 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1935 1943 mutex_enter(&rp->r_statelock);
1936 1944 rp->r_delay_interval = 0;
1937 1945 mutex_exit(&rp->r_statelock);
1938 1946 }
1939 1947 break;
1940 1948 case NFS4ERR_BAD_SEQID:
1941 1949 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1942 1950 args.ctag, open_args->seqid);
1943 1951
1944 1952 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1945 1953 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1946 1954 NULL, OP_OPEN, bsep, NULL, NULL);
1947 1955
1948 1956 nfs4args_copen_free(open_args);
1949 1957 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1950 1958 nfs4_end_open_seqid_sync(oop);
1951 1959 open_owner_rele(oop);
1952 1960 oop = NULL;
1953 1961 kmem_free(bsep, sizeof (*bsep));
1954 1962
1955 1963 goto kill_file;
1956 1964 case NFS4ERR_NO_GRACE:
1957 1965 nfs4args_copen_free(open_args);
1958 1966 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1959 1967 nfs4_end_open_seqid_sync(oop);
1960 1968 open_owner_rele(oop);
1961 1969 oop = NULL;
1962 1970 if (claim == CLAIM_PREVIOUS) {
1963 1971 /*
1964 1972 * Retry as a plain open. We don't need to worry about
1965 1973 * checking the changeinfo: it is acceptable for a
1966 1974 * client to re-open a file and continue processing
1967 1975 * (in the absence of locks).
1968 1976 */
1969 1977 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1970 1978 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1971 1979 "will retry as CLAIM_NULL"));
1972 1980 claim = CLAIM_NULL;
1973 1981 nfs4_mi_kstat_inc_no_grace(mi);
1974 1982 goto top;
1975 1983 }
1976 1984 failed_msg =
1977 1985 "Couldn't reopen: tried reclaim outside grace period. ";
1978 1986 goto kill_file;
1979 1987 case NFS4ERR_GRACE:
1980 1988 nfs4_set_grace_wait(mi);
1981 1989 nfs4args_copen_free(open_args);
1982 1990 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1983 1991 nfs4_end_open_seqid_sync(oop);
1984 1992 open_owner_rele(oop);
1985 1993 oop = NULL;
1986 1994 ep->error = nfs4_wait_for_grace(mi, &recov);
1987 1995 if (ep->error != 0)
1988 1996 goto bailout;
1989 1997 goto top;
1990 1998 case NFS4ERR_DELAY:
1991 1999 nfs4_set_delay_wait(vp);
1992 2000 nfs4args_copen_free(open_args);
1993 2001 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1994 2002 nfs4_end_open_seqid_sync(oop);
1995 2003 open_owner_rele(oop);
1996 2004 oop = NULL;
1997 2005 ep->error = nfs4_wait_for_delay(vp, &recov);
1998 2006 nfs4_mi_kstat_inc_delay(mi);
1999 2007 if (ep->error != 0)
2000 2008 goto bailout;
2001 2009 goto top;
2002 2010 case NFS4ERR_FHEXPIRED:
2003 2011 /* recover filehandle and retry */
2004 2012 abort = nfs4_start_recovery(ep,
2005 2013 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2006 2014 nfs4args_copen_free(open_args);
2007 2015 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2008 2016 nfs4_end_open_seqid_sync(oop);
2009 2017 open_owner_rele(oop);
2010 2018 oop = NULL;
2011 2019 if (abort == FALSE)
2012 2020 goto top;
2013 2021 failed_msg = "Couldn't reopen: recovery aborted";
2014 2022 goto kill_file;
2015 2023 case NFS4ERR_RESOURCE:
2016 2024 case NFS4ERR_STALE_CLIENTID:
2017 2025 case NFS4ERR_WRONGSEC:
2018 2026 case NFS4ERR_EXPIRED:
2019 2027 /*
2020 2028 * Do not mark the file dead and let the calling
2021 2029 * function initiate recovery.
2022 2030 */
2023 2031 nfs4args_copen_free(open_args);
2024 2032 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2025 2033 nfs4_end_open_seqid_sync(oop);
2026 2034 open_owner_rele(oop);
2027 2035 oop = NULL;
2028 2036 goto bailout;
2029 2037 case NFS4ERR_ACCESS:
2030 2038 if (cred_otw != cr) {
2031 2039 crfree(cred_otw);
2032 2040 cred_otw = cr;
2033 2041 crhold(cred_otw);
2034 2042 nfs4args_copen_free(open_args);
2035 2043 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2036 2044 nfs4_end_open_seqid_sync(oop);
2037 2045 open_owner_rele(oop);
2038 2046 oop = NULL;
2039 2047 goto top;
2040 2048 }
2041 2049 /* fall through */
2042 2050 default:
2043 2051 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2044 2052 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2045 2053 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2046 2054 rnode4info(VTOR4(vp))));
2047 2055 failed_msg = "Couldn't reopen: NFSv4 error";
2048 2056 nfs4args_copen_free(open_args);
2049 2057 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2050 2058 goto kill_file;
2051 2059 }
2052 2060
2053 2061 resop = &res.array[1]; /* open res */
2054 2062 op_res = &resop->nfs_resop4_u.opopen;
2055 2063
2056 2064 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2057 2065
2058 2066 /*
2059 2067 * Check if the path we reopened really is the same
2060 2068 * file. We could end up in a situation where the file
2061 2069 * was removed and a new file created with the same name.
2062 2070 */
2063 2071 resop = &res.array[2];
2064 2072 gf_res = &resop->nfs_resop4_u.opgetfh;
2065 2073 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2066 2074 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2067 2075 if (fh_different) {
2068 2076 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2069 2077 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2070 2078 /* Oops, we don't have the same file */
2071 2079 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2072 2080 failed_msg = "Couldn't reopen: Persistent "
2073 2081 "file handle changed";
2074 2082 else
2075 2083 failed_msg = "Couldn't reopen: Volatile "
2076 2084 "(no expire on open) file handle changed";
2077 2085
2078 2086 nfs4args_copen_free(open_args);
2079 2087 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2080 2088 nfs_rw_exit(&mi->mi_fh_lock);
2081 2089 goto kill_file;
2082 2090
2083 2091 } else {
2084 2092 /*
2085 2093 * We have volatile file handles that don't compare.
2086 2094 * If the fids are the same then we assume that the
2087 2095 * file handle expired but the rnode still refers to
2088 2096 * the same file object.
2089 2097 *
2090 2098 * First check that we have fids or not.
2091 2099 * If we don't we have a dumb server so we will
2092 2100 * just assume every thing is ok for now.
2093 2101 */
2094 2102 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2095 2103 rp->r_attr.va_mask & AT_NODEID &&
2096 2104 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2097 2105 /*
2098 2106 * We have fids, but they don't
2099 2107 * compare. So kill the file.
2100 2108 */
2101 2109 failed_msg =
2102 2110 "Couldn't reopen: file handle changed"
2103 2111 " due to mismatched fids";
2104 2112 nfs4args_copen_free(open_args);
2105 2113 (void) xdr_free(xdr_COMPOUND4res_clnt,
2106 2114 (caddr_t)&res);
2107 2115 nfs_rw_exit(&mi->mi_fh_lock);
2108 2116 goto kill_file;
2109 2117 } else {
2110 2118 /*
2111 2119 * We have volatile file handles that refers
2112 2120 * to the same file (at least they have the
2113 2121 * same fid) or we don't have fids so we
2114 2122 * can't tell. :(. We'll be a kind and accepting
2115 2123 * client so we'll update the rnode's file
2116 2124 * handle with the otw handle.
2117 2125 *
2118 2126 * We need to drop mi->mi_fh_lock since
2119 2127 * sh4_update acquires it. Since there is
2120 2128 * only one recovery thread there is no
2121 2129 * race.
2122 2130 */
2123 2131 nfs_rw_exit(&mi->mi_fh_lock);
2124 2132 sfh4_update(rp->r_fh, &gf_res->object);
2125 2133 }
2126 2134 }
2127 2135 } else {
2128 2136 nfs_rw_exit(&mi->mi_fh_lock);
2129 2137 }
2130 2138
2131 2139 ASSERT(nfs4_consistent_type(vp));
2132 2140
2133 2141 /*
2134 2142 * If the server wanted an OPEN_CONFIRM but that fails, just start
2135 2143 * over. Presumably if there is a persistent error it will show up
2136 2144 * when we resend the OPEN.
2137 2145 */
2138 2146 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2139 2147 bool_t retry_open = FALSE;
2140 2148
2141 2149 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2142 2150 cred_otw, is_recov, &retry_open,
2143 2151 oop, FALSE, ep, NULL);
2144 2152 if (ep->error || ep->stat) {
2145 2153 nfs4args_copen_free(open_args);
2146 2154 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2147 2155 nfs4_end_open_seqid_sync(oop);
2148 2156 open_owner_rele(oop);
2149 2157 oop = NULL;
2150 2158 goto top;
2151 2159 }
2152 2160 }
2153 2161
2154 2162 mutex_enter(&osp->os_sync_lock);
2155 2163 osp->open_stateid = op_res->stateid;
2156 2164 osp->os_delegation = 0;
2157 2165 /*
2158 2166 * Need to reset this bitfield for the possible case where we were
2159 2167 * going to OTW CLOSE the file, got a non-recoverable error, and before
2160 2168 * we could retry the CLOSE, OPENed the file again.
2161 2169 */
2162 2170 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2163 2171 osp->os_final_close = 0;
2164 2172 osp->os_force_close = 0;
2165 2173 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2166 2174 osp->os_dc_openacc = open_args->share_access;
2167 2175 mutex_exit(&osp->os_sync_lock);
2168 2176
2169 2177 nfs4_end_open_seqid_sync(oop);
2170 2178
2171 2179 /* accept delegation, if any */
2172 2180 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2173 2181
2174 2182 nfs4args_copen_free(open_args);
2175 2183
2176 2184 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2177 2185
2178 2186 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2179 2187
2180 2188 ASSERT(nfs4_consistent_type(vp));
2181 2189
2182 2190 open_owner_rele(oop);
2183 2191 crfree(cr);
2184 2192 crfree(cred_otw);
2185 2193 return;
2186 2194
2187 2195 kill_file:
2188 2196 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2189 2197 failed_reopen:
2190 2198 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2191 2199 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2192 2200 (void *)osp, (void *)cr, rnode4info(rp)));
2193 2201 mutex_enter(&osp->os_sync_lock);
2194 2202 osp->os_failed_reopen = 1;
2195 2203 mutex_exit(&osp->os_sync_lock);
2196 2204 bailout:
2197 2205 if (oop != NULL) {
2198 2206 nfs4_end_open_seqid_sync(oop);
2199 2207 open_owner_rele(oop);
2200 2208 }
2201 2209 if (cr != NULL)
2202 2210 crfree(cr);
2203 2211 if (cred_otw != NULL)
2204 2212 crfree(cred_otw);
2205 2213 }
2206 2214
2207 2215 /* for . and .. OPENs */
2208 2216 /* ARGSUSED */
2209 2217 static int
2210 2218 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2211 2219 {
2212 2220 rnode4_t *rp;
2213 2221 nfs4_ga_res_t gar;
2214 2222
2215 2223 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2216 2224
2217 2225 /*
2218 2226 * If close-to-open consistency checking is turned off or
2219 2227 * if there is no cached data, we can avoid
2220 2228 * the over the wire getattr. Otherwise, force a
2221 2229 * call to the server to get fresh attributes and to
2222 2230 * check caches. This is required for close-to-open
2223 2231 * consistency.
2224 2232 */
2225 2233 rp = VTOR4(*vpp);
2226 2234 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2227 2235 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2228 2236 return (0);
2229 2237
2230 2238 gar.n4g_va.va_mask = AT_ALL;
2231 2239 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2232 2240 }
2233 2241
2234 2242 /*
2235 2243 * CLOSE a file
2236 2244 */
2237 2245 /* ARGSUSED */
2238 2246 static int
2239 2247 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2240 2248 caller_context_t *ct)
2241 2249 {
2242 2250 rnode4_t *rp;
2243 2251 int error = 0;
2244 2252 int r_error = 0;
2245 2253 int n4error = 0;
2246 2254 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2247 2255
2248 2256 /*
2249 2257 * Remove client state for this (lockowner, file) pair.
2250 2258 * Issue otw v4 call to have the server do the same.
2251 2259 */
2252 2260
2253 2261 rp = VTOR4(vp);
2254 2262
2255 2263 /*
2256 2264 * zone_enter(2) prevents processes from changing zones with NFS files
2257 2265 * open; if we happen to get here from the wrong zone we can't do
2258 2266 * anything over the wire.
2259 2267 */
2260 2268 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2261 2269 /*
2262 2270 * We could attempt to clean up locks, except we're sure
2263 2271 * that the current process didn't acquire any locks on
2264 2272 * the file: any attempt to lock a file belong to another zone
2265 2273 * will fail, and one can't lock an NFS file and then change
2266 2274 * zones, as that fails too.
2267 2275 *
2268 2276 * Returning an error here is the sane thing to do. A
2269 2277 * subsequent call to VN_RELE() which translates to a
2270 2278 * nfs4_inactive() will clean up state: if the zone of the
2271 2279 * vnode's origin is still alive and kicking, the inactive
2272 2280 * thread will handle the request (from the correct zone), and
2273 2281 * everything (minus the OTW close call) should be OK. If the
2274 2282 * zone is going away nfs4_async_inactive() will throw away
2275 2283 * delegations, open streams and cached pages inline.
2276 2284 */
2277 2285 return (EIO);
2278 2286 }
2279 2287
2280 2288 /*
2281 2289 * If we are using local locking for this filesystem, then
2282 2290 * release all of the SYSV style record locks. Otherwise,
2283 2291 * we are doing network locking and we need to release all
2284 2292 * of the network locks. All of the locks held by this
2285 2293 * process on this file are released no matter what the
2286 2294 * incoming reference count is.
2287 2295 */
2288 2296 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2289 2297 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2290 2298 cleanshares(vp, ttoproc(curthread)->p_pid);
2291 2299 } else
2292 2300 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2293 2301
2294 2302 if (e.error) {
2295 2303 struct lm_sysid *lmsid;
2296 2304 lmsid = nfs4_find_sysid(VTOMI4(vp));
2297 2305 if (lmsid == NULL) {
2298 2306 DTRACE_PROBE2(unknown__sysid, int, e.error,
2299 2307 vnode_t *, vp);
2300 2308 } else {
2301 2309 cleanlocks(vp, ttoproc(curthread)->p_pid,
2302 2310 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2303 2311 }
2304 2312 return (e.error);
2305 2313 }
2306 2314
2307 2315 if (count > 1)
2308 2316 return (0);
2309 2317
2310 2318 /*
2311 2319 * If the file has been `unlinked', then purge the
2312 2320 * DNLC so that this vnode will get reycled quicker
2313 2321 * and the .nfs* file on the server will get removed.
2314 2322 */
2315 2323 if (rp->r_unldvp != NULL)
2316 2324 dnlc_purge_vp(vp);
2317 2325
2318 2326 /*
2319 2327 * If the file was open for write and there are pages,
2320 2328 * do a synchronous flush and commit of all of the
2321 2329 * dirty and uncommitted pages.
2322 2330 */
2323 2331 ASSERT(!e.error);
2324 2332 if ((flag & FWRITE) && nfs4_has_pages(vp))
2325 2333 error = nfs4_putpage_commit(vp, 0, 0, cr);
2326 2334
2327 2335 mutex_enter(&rp->r_statelock);
2328 2336 r_error = rp->r_error;
2329 2337 rp->r_error = 0;
2330 2338 mutex_exit(&rp->r_statelock);
2331 2339
2332 2340 /*
2333 2341 * If this file type is one for which no explicit 'open' was
2334 2342 * done, then bail now (ie. no need for protocol 'close'). If
2335 2343 * there was an error w/the vm subsystem, return _that_ error,
2336 2344 * otherwise, return any errors that may've been reported via
2337 2345 * the rnode.
2338 2346 */
2339 2347 if (vp->v_type != VREG)
2340 2348 return (error ? error : r_error);
2341 2349
2342 2350 /*
2343 2351 * The sync putpage commit may have failed above, but since
2344 2352 * we're working w/a regular file, we need to do the protocol
2345 2353 * 'close' (nfs4close_one will figure out if an otw close is
2346 2354 * needed or not). Report any errors _after_ doing the protocol
2347 2355 * 'close'.
2348 2356 */
2349 2357 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2350 2358 n4error = e.error ? e.error : geterrno4(e.stat);
2351 2359
2352 2360 /*
2353 2361 * Error reporting prio (Hi -> Lo)
2354 2362 *
2355 2363 * i) nfs4_putpage_commit (error)
2356 2364 * ii) rnode's (r_error)
2357 2365 * iii) nfs4close_one (n4error)
2358 2366 */
2359 2367 return (error ? error : (r_error ? r_error : n4error));
2360 2368 }
2361 2369
2362 2370 /*
2363 2371 * Initialize *lost_rqstp.
2364 2372 */
2365 2373
2366 2374 static void
2367 2375 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2368 2376 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2369 2377 vnode_t *vp)
2370 2378 {
2371 2379 if (error != ETIMEDOUT && error != EINTR &&
2372 2380 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2373 2381 lost_rqstp->lr_op = 0;
2374 2382 return;
2375 2383 }
2376 2384
2377 2385 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2378 2386 "nfs4close_save_lost_rqst: error %d", error));
2379 2387
2380 2388 lost_rqstp->lr_op = OP_CLOSE;
2381 2389 /*
2382 2390 * The vp is held and rele'd via the recovery code.
2383 2391 * See nfs4_save_lost_rqst.
2384 2392 */
2385 2393 lost_rqstp->lr_vp = vp;
2386 2394 lost_rqstp->lr_dvp = NULL;
2387 2395 lost_rqstp->lr_oop = oop;
2388 2396 lost_rqstp->lr_osp = osp;
2389 2397 ASSERT(osp != NULL);
2390 2398 ASSERT(mutex_owned(&osp->os_sync_lock));
2391 2399 osp->os_pending_close = 1;
2392 2400 lost_rqstp->lr_lop = NULL;
2393 2401 lost_rqstp->lr_cr = cr;
2394 2402 lost_rqstp->lr_flk = NULL;
2395 2403 lost_rqstp->lr_putfirst = FALSE;
2396 2404 }
2397 2405
2398 2406 /*
2399 2407 * Assumes you already have the open seqid sync grabbed as well as the
2400 2408 * 'os_sync_lock'. Note: this will release the open seqid sync and
2401 2409 * 'os_sync_lock' if client recovery starts. Calling functions have to
2402 2410 * be prepared to handle this.
2403 2411 *
2404 2412 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2405 2413 * was needed and was started, and that the calling function should retry
2406 2414 * this function; otherwise it is returned as 0.
2407 2415 *
2408 2416 * Errors are returned via the nfs4_error_t parameter.
2409 2417 */
2410 2418 static void
2411 2419 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2412 2420 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2413 2421 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2414 2422 {
2415 2423 COMPOUND4args_clnt args;
2416 2424 COMPOUND4res_clnt res;
2417 2425 CLOSE4args *close_args;
2418 2426 nfs_resop4 *resop;
2419 2427 nfs_argop4 argop[3];
2420 2428 int doqueue = 1;
2421 2429 mntinfo4_t *mi;
2422 2430 seqid4 seqid;
2423 2431 vnode_t *vp;
2424 2432 bool_t needrecov = FALSE;
2425 2433 nfs4_lost_rqst_t lost_rqst;
2426 2434 hrtime_t t;
2427 2435
2428 2436 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2429 2437
2430 2438 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2431 2439
2432 2440 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2433 2441
2434 2442 /* Only set this to 1 if recovery is started */
2435 2443 *recov = 0;
2436 2444
2437 2445 /* do the OTW call to close the file */
2438 2446
2439 2447 if (close_type == CLOSE_RESEND)
2440 2448 args.ctag = TAG_CLOSE_LOST;
2441 2449 else if (close_type == CLOSE_AFTER_RESEND)
2442 2450 args.ctag = TAG_CLOSE_UNDO;
2443 2451 else
2444 2452 args.ctag = TAG_CLOSE;
2445 2453
2446 2454 args.array_len = 3;
2447 2455 args.array = argop;
2448 2456
2449 2457 vp = RTOV4(rp);
2450 2458
2451 2459 mi = VTOMI4(vp);
2452 2460
2453 2461 /* putfh target fh */
2454 2462 argop[0].argop = OP_CPUTFH;
2455 2463 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2456 2464
2457 2465 argop[1].argop = OP_GETATTR;
2458 2466 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2459 2467 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2460 2468
2461 2469 argop[2].argop = OP_CLOSE;
2462 2470 close_args = &argop[2].nfs_argop4_u.opclose;
2463 2471
2464 2472 seqid = nfs4_get_open_seqid(oop) + 1;
2465 2473
2466 2474 close_args->seqid = seqid;
2467 2475 close_args->open_stateid = osp->open_stateid;
2468 2476
2469 2477 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2470 2478 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2471 2479 rnode4info(rp)));
2472 2480
2473 2481 t = gethrtime();
2474 2482
2475 2483 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2476 2484
2477 2485 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2478 2486 nfs4_set_open_seqid(seqid, oop, args.ctag);
2479 2487 }
2480 2488
2481 2489 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2482 2490 if (ep->error && !needrecov) {
2483 2491 /*
2484 2492 * if there was an error and no recovery is to be done
2485 2493 * then then set up the file to flush its cache if
2486 2494 * needed for the next caller.
2487 2495 */
2488 2496 mutex_enter(&rp->r_statelock);
2489 2497 PURGE_ATTRCACHE4_LOCKED(rp);
2490 2498 rp->r_flags &= ~R4WRITEMODIFIED;
2491 2499 mutex_exit(&rp->r_statelock);
2492 2500 return;
2493 2501 }
2494 2502
2495 2503 if (needrecov) {
2496 2504 bool_t abort;
2497 2505 nfs4_bseqid_entry_t *bsep = NULL;
2498 2506
2499 2507 if (close_type != CLOSE_RESEND)
2500 2508 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2501 2509 osp, cred_otw, vp);
2502 2510
2503 2511 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2504 2512 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2505 2513 0, args.ctag, close_args->seqid);
2506 2514
2507 2515 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2508 2516 "nfs4close_otw: initiating recovery. error %d "
2509 2517 "res.status %d", ep->error, res.status));
2510 2518
2511 2519 /*
2512 2520 * Drop the 'os_sync_lock' here so we don't hit
2513 2521 * a potential recursive mutex_enter via an
2514 2522 * 'open_stream_hold()'.
2515 2523 */
2516 2524 mutex_exit(&osp->os_sync_lock);
2517 2525 *have_sync_lockp = 0;
2518 2526 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2519 2527 (close_type != CLOSE_RESEND &&
2520 2528 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2521 2529 OP_CLOSE, bsep, NULL, NULL);
2522 2530
2523 2531 /* drop open seq sync, and let the calling function regrab it */
2524 2532 nfs4_end_open_seqid_sync(oop);
2525 2533 *did_start_seqid_syncp = 0;
2526 2534
2527 2535 if (bsep)
2528 2536 kmem_free(bsep, sizeof (*bsep));
2529 2537 /*
2530 2538 * For signals, the caller wants to quit, so don't say to
2531 2539 * retry. For forced unmount, if it's a user thread, it
2532 2540 * wants to quit. If it's a recovery thread, the retry
2533 2541 * will happen higher-up on the call stack. Either way,
2534 2542 * don't say to retry.
2535 2543 */
2536 2544 if (abort == FALSE && ep->error != EINTR &&
2537 2545 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2538 2546 close_type != CLOSE_RESEND &&
2539 2547 close_type != CLOSE_AFTER_RESEND)
2540 2548 *recov = 1;
2541 2549 else
2542 2550 *recov = 0;
2543 2551
2544 2552 if (!ep->error)
2545 2553 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2546 2554 return;
2547 2555 }
2548 2556
2549 2557 if (res.status) {
2550 2558 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2551 2559 return;
2552 2560 }
2553 2561
2554 2562 mutex_enter(&rp->r_statev4_lock);
2555 2563 rp->created_v4 = 0;
2556 2564 mutex_exit(&rp->r_statev4_lock);
2557 2565
2558 2566 resop = &res.array[2];
2559 2567 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2560 2568 osp->os_valid = 0;
2561 2569
2562 2570 /*
2563 2571 * This removes the reference obtained at OPEN; ie, when the
2564 2572 * open stream structure was created.
2565 2573 *
2566 2574 * We don't have to worry about calling 'open_stream_rele'
2567 2575 * since we our currently holding a reference to the open
2568 2576 * stream which means the count cannot go to 0 with this
2569 2577 * decrement.
2570 2578 */
2571 2579 ASSERT(osp->os_ref_count >= 2);
2572 2580 osp->os_ref_count--;
2573 2581
2574 2582 if (!ep->error)
2575 2583 nfs4_attr_cache(vp,
2576 2584 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2577 2585 t, cred_otw, TRUE, NULL);
2578 2586
2579 2587 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2580 2588 " returning %d", ep->error));
2581 2589
2582 2590 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2583 2591 }
2584 2592
/* ARGSUSED */
/*
 * VOP_READ for NFSv4.
 *
 * Normally reads through the VM page cache in MAXBSIZE windows
 * (segmap or vpm, depending on vpm_enable).  If caching is disabled
 * (VNOCACHE) or client-side direct I/O is in effect and the file is
 * neither mmap'd nor has cached pages, the read bypasses VM and goes
 * over the wire via nfs4read().
 *
 *	vp	- vnode to read from (shadow vnodes are mapped to the
 *		  real vnode up front)
 *	uiop	- describes the user buffer, offset and residual count
 *	ioflag	- unused here
 *	cr	- credentials for the over-the-wire operations
 *	ct	- caller context (unused)
 *
 * The caller must already hold r_rwlock as READER (asserted below).
 * Returns 0 on success or an errno value; on partial success uiop
 * reflects how much was transferred.
 */
static int
nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	u_offset_t off;
	offset_t diff;
	uint_t on;
	uint_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo4_t *mi;

	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	/* NFS is not supported across zones. */
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	/* Reject negative offsets and offset+resid overflow. */
	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/*
	 * If recovery is pending on this rnode, fail early with the
	 * saved error rather than queueing more I/O.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t resid = 0;

		return (nfs4read(vp, NULL, uiop->uio_loffset,
		    uiop->uio_resid, &resid, cr, FALSE, uiop));
	}

	error = 0;

	/* Cached path: copy one MAXBSIZE-aligned window per iteration. */
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		if (error = nfs4_validate_caches(vp, cr))
			break;

		/* Wait out any in-progress cache purge before using r_size. */
		mutex_enter(&rp->r_statelock);
		while (rp->r_flags & R4INCACHEPURGE) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;		/* at or past EOF */
		if (diff < n)
			n = (uint_t)diff;	/* clamp to EOF */

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			/* Copy failed: release the mapping, ignore result. */
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
2709 2717
/* ARGSUSED */
/*
 * VOP_WRITE for NFSv4.
 *
 * Handles FAPPEND serialization (upgrading r_rwlock to WRITER and
 * fetching the current size), enforces the process file-size rlimit,
 * then writes either directly over the wire (VNOCACHE / direct I/O
 * with no mapped or cached pages) or through the VM page cache via
 * writerp4() with segmap/kpm/vpm mappings.
 *
 *	vp	- vnode to write (shadow vnodes mapped to the real vnode)
 *	uiop	- user buffer, offset and residual count
 *	ioflag	- FAPPEND/FSYNC/FDSYNC influence serialization and
 *		  write stability
 *	cr	- credentials
 *	ct	- caller context (unused)
 *
 * On error, uio_resid/uio_loffset are rewound to the values at the
 * start of the failing transfer (plus any rlimit remainder).
 * Returns 0 or an errno value.
 */
static int
nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rlim64_t limit = uiop->uio_llimit;
	rnode4_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	u_offset_t offset;
	mntinfo4_t *mi;
	uint_t bsize;

	rp = VTOR4(vp);

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	/* NFS is not supported across zones. */
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	/* Fail early if recovery is pending on this rnode. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR4(vp)))
				return (EINTR);
		}

		/* Append offset is the server's idea of the file size. */
		va.va_mask = AT_SIZE;
		error = nfs4getattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
		return (EINVAL);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	/*
	 * NOTE(review): this compares against the raw uiop->uio_llimit
	 * rather than the local 'limit' that was just clamped to
	 * MAXOFFSET_T above — confirm whether 'limit' was intended here.
	 */
	if (offset > uiop->uio_llimit) {
		remainder = offset - uiop->uio_llimit;
		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			/* Entirely beyond the limit: signal and fail. */
			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	/* update the change attribute, if we have a write delegation */

	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
		rp->r_deleg_change++;

	mutex_exit(&rp->r_statev4_lock);

	/* Serialize against file locking; released at 'bottom'/return. */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t bufsize;
		int count;
		u_offset_t org_offset;
		stable_how4 stab_comm;
nfs4_fwrite:
		if (rp->r_flags & R4STALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			goto bottom;
		}

		/* Stage user data in a kernel buffer, then write OTW. */
		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			if (ioflag & FDSYNC)
				stab_comm = DATA_SYNC4;
			else
				stab_comm = FILE_SYNC4;
			/* Remember position for rewind on error. */
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_loffset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfs4write(vp, base, org_offset,
				    count, cr, &stab_comm);
				if (!error) {
					/* Grow cached size if we extended. */
					mutex_enter(&rp->r_statelock);
					if (rp->r_size < uiop->uio_loffset)
						rp->r_size = uiop->uio_loffset;
					mutex_exit(&rp->r_statelock);
				}
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	bsize = vp->v_vfsp->vfs_bsize;

	/* Cached path: dirty one MAXBSIZE-aligned window per iteration. */
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & R4STALE) {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0) {
			if (INTR4(vp)) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps /proc from stopping us */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
					mutex_exit(&rp->r_statelock);
					if (lwp != NULL)
						lwp->lwp_nostop--;
					error = EINTR;
					goto bottom;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)n, uiop);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp4(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				/*
				 * Page creation is safe only when the
				 * whole page will be written or the
				 * write extends to/past EOF.
				 */
				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp4(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp4(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI4_NOAC)
				flags = SM_WRITE;
			else if ((uiop->uio_loffset % bsize) == 0 ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			/* Synchronous semantics override async flush. */
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & R4OUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs4_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		/* Rewind uio to the start of the failed transfer. */
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else {
		uiop->uio_resid += remainder;

		/* With a write delegation, mtime/ctime are kept locally. */
		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
			gethrestime(&rp->r_attr.va_mtime);
			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}
3022 3030
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
/*
 * Perform page-list I/O on 'pp' covering [off, off + len) of 'vp'
 * through nfs4_bio().  Async writes may go out UNSTABLE4 (when memory
 * pressure is low); in that case each page's p_fsdata is marked
 * C_DELAYCOMMIT so a later COMMIT is issued, otherwise C_NOCOMMIT.
 * Returns the errno from nfs4_bio().
 */
static int
nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr)
{
	struct buf *bp;
	int error;
	page_t *savepp;
	uchar_t fsdata;
	stable_how4 stab_comm;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	/*
	 * Async writes are allowed to be UNSTABLE4 while memory is
	 * plentiful; otherwise demand FILE_SYNC4 from the server.
	 */
	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
	    freemem > desfree)
		stab_comm = UNSTABLE4;
	else
		stab_comm = FILE_SYNC4;

	error = nfs4_bio(bp, &stab_comm, cr, FALSE);

	bp_mapout(bp);
	pageio_done(bp);

	/* nfs4_bio may downgrade stab_comm to what the server did. */
	if (stab_comm == UNSTABLE4)
		fsdata = C_DELAYCOMMIT;
	else
		fsdata = C_NOCOMMIT;

	/* Tag every page in the circular page list. */
	savepp = pp;
	do {
		pp->p_fsdata = fsdata;
	} while ((pp = pp->p_next) != savepp);

	return (error);
}
3079 3087
/*
 * Helper for nfs4read()/nfs4write() when a BAD_STATEID is received
 * while using a delegation stateid: look up the open stream for this
 * (cred, rnode) pair and, if it is a delegation-created stream, reopen
 * the file with CLAIM_NULL so the caller can retry with a real open
 * stateid.  Returns 0 if the caller may retry, EIO if no usable open
 * stream exists or the reopen failed (ep holds the reopen error).
 */
static int
nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int reopen_needed;

	ASSERT(nfs_zone() == mi->mi_zone);


	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	if (!oop)
		return (EIO);

	/* returns with 'os_sync_lock' held */
	osp = find_open_stream(oop, rp);
	if (!osp) {
		open_owner_rele(oop);
		return (EIO);
	}

	/* A previously failed reopen means the stream is unusable. */
	if (osp->os_failed_reopen) {
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
		open_owner_rele(oop);
		return (EIO);
	}

	/*
	 * Determine whether a reopen is needed.  If this
	 * is a delegation open stream, then the os_delegation bit
	 * should be set.
	 */

	reopen_needed = osp->os_delegation;

	mutex_exit(&osp->os_sync_lock);
	open_owner_rele(oop);

	if (reopen_needed) {
		nfs4_error_zinit(ep);
		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
		/* Re-check under the lock that the reopen took effect. */
		mutex_enter(&osp->os_sync_lock);
		if (ep->error || ep->stat || osp->os_failed_reopen) {
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			return (EIO);
		}
		mutex_exit(&osp->os_sync_lock);
	}
	open_stream_rele(osp, rp);

	return (0);
}
3138 3146
/*
 * Write to file.  Writes to remote server in largest size
 * chunks that the server can handle.  Write is synchronous.
 */
/*
 * Over-the-wire synchronous WRITE loop for NFSv4.
 *
 *	vp	  - target vnode
 *	base	  - kernel buffer holding the data
 *	offset	  - starting file offset
 *	count	  - number of bytes to write
 *	cr	  - credentials
 *	stab_comm - in: requested stability (FILE_SYNC4/DATA_SYNC4);
 *		    out: weakest stability the server actually used
 *		    (UNSTABLE4 if any chunk was unstable)
 *
 * Issues PUTFH+WRITE compounds, chunked by mi_stsize (direct I/O) or
 * mi_curwrite.  OLD_STATEID and BAD_STATEID-on-delegation errors are
 * retried outside the normal recovery framework by switching stateids
 * (and reopening via nfs4rdwr_check_osid if needed).  When called from
 * the recovery thread itself, start_fop/end_fop and recovery initiation
 * are skipped to avoid deadlocking recovery.  Tracks the server's write
 * verifier to detect server reboots (forcing redirty via nfs4_set_mod).
 * Returns 0 or an errno value.
 */
static int
nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
    stable_how4 *stab_comm)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	WRITE4args *wargs;
	WRITE4res *wres;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int tsize;
	stable_how4 stable;
	rnode4_t *rp;
	int doqueue = 1;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int recov;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	/* Optimistically report FILE_SYNC4; downgraded below if needed. */
	stable = *stab_comm;
	*stab_comm = FILE_SYNC4;

	needrecov = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	nfs4_init_stateid_types(&sid_types);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (mi->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

recov_retry:
	args.ctag = TAG_WRITE;
	args.array_len = 2;
	args.array = argop;

	/* The recovery thread must not re-enter the recovery framework. */
	if (!recov) {
		e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
		    &recov_state, NULL);
		if (e.error)
			return (e.error);
	}

	/* 0. putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* 1. write */
	nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);

	do {

		wargs->offset = (offset4)offset;
		wargs->data_val = base;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/* Direct I/O may use the full stream size per chunk. */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_stsize, count);
		else
			tsize = MIN(mi->mi_curwrite, count);
		wargs->data_len = (uint_t)tsize;
		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!recov) {
			needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
			if (e.error && !needrecov) {
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
				return (e.error);
			}
		} else {
			if (e.error)
				return (e.error);
		}

		/*
		 * Do handling of OLD_STATEID outside
		 * of the normal recovery framework.
		 *
		 * If write receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists).  If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&wargs->stateid, &sid_types);
			if (!recov)
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			nfs4_save_stateid(&wargs->stateid, &sid_types);
			/* The delegation is bad; arrange for its return. */
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				if (!recov)
					nfs4_end_fop(mi, vp, NULL, OH_WRITE,
					    &recov_state, needrecov);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			if (!recov)
				nfs4_end_fop(mi, vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4write: client got error %d, res.status %d"
			    ", so start recovery", e.error, res.status));

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, &wargs->stateid,
			    NULL, OP_WRITE, NULL, NULL, NULL);
			if (!e.error) {
				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
			    &recov_state, needrecov);
			if (abort == FALSE)
				goto recov_retry;
			return (e.error);
		}

		if (res.status) {
			e.error = geterrno4(res.status);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (!recov)
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			return (e.error);
		}

		resop = &res.array[1];	/* write res */
		wres = &resop->nfs_resop4_u.opwrite;

		/* A server writing more than asked for is broken. */
		if ((int)wres->count > tsize) {
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

			zcmn_err(getzoneid(), CE_WARN,
			    "nfs4write: server wrote %u, requested was %u",
			    (int)wres->count, tsize);
			if (!recov)
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			return (EIO);
		}
		if (wres->committed == UNSTABLE4) {
			*stab_comm = UNSTABLE4;
			/* Server must honor a sync stability request. */
			if (wargs->stable == DATA_SYNC4 ||
			    wargs->stable == FILE_SYNC4) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				zcmn_err(getzoneid(), CE_WARN,
				    "nfs4write: server %s did not commit "
				    "to stable storage",
				    rp->r_server->sv_hostname);
				if (!recov)
					nfs4_end_fop(VTOMI4(vp), vp, NULL,
					    OH_WRITE, &recov_state, needrecov);
				return (EIO);
			}
		}

		/* Advance by what the server accepted (may be short). */
		tsize = (int)wres->count;
		count -= tsize;
		base += tsize;
		offset += tsize;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
			    tsize;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_OUBLK, 1);
		mutex_enter(&rp->r_statelock);
		/*
		 * A changed write verifier means the server rebooted;
		 * nfs4_set_mod() redirties pages so unstable data is
		 * retransmitted.
		 */
		if (rp->r_flags & R4HAVEVERF) {
			if (rp->r_writeverf != wres->writeverf) {
				nfs4_set_mod(vp);
				rp->r_writeverf = wres->writeverf;
			}
		} else {
			rp->r_writeverf = wres->writeverf;
			rp->r_flags |= R4HAVEVERF;
		}
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags |= R4WRITEMODIFIED;
		gethrestime(&rp->r_attr.va_mtime);
		rp->r_attr.va_ctime = rp->r_attr.va_mtime;
		mutex_exit(&rp->r_statelock);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	} while (count);

	if (!recov)
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
		    needrecov);

	return (e.error);
}
3379 3387
/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 */
/*
 * Over-the-wire READ loop for NFSv4.
 *
 *	vp	- vnode to read
 *	base	- kernel buffer to fill, or NULL when 'uiop' is supplied
 *		  (data then goes straight to the uio)
 *	offset	- starting file offset
 *	count	- bytes requested
 *	residp	- out: bytes NOT read (nonzero on EOF or short read)
 *	cr	- credentials
 *	async	- TRUE for readahead (affects ctag and stateid retry
 *		  policy)
 *	uiop	- optional uio for direct-to-user transfer
 *
 * Issues PUTFH+READ compounds, chunked by mi_tsize (direct I/O) or
 * mi_curread, until 'count' is satisfied or the server reports EOF.
 * OLD/BAD stateid errors are retried outside the normal recovery
 * framework: async reads retry both (or fail with EIO on a special
 * stateid); sync reads retry OLD_STATEID, and BAD_STATEID on a
 * delegation stateid triggers a reopen check plus delegation return.
 * Returns 0 or an errno value.
 */
static int
nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READ4args *rargs;
	nfs_argop4 argop[2];
	int tsize;
	int doqueue;
	rnode4_t *rp;
	int data_len;
	bool_t is_eof;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	doqueue = 1;

	ASSERT(nfs_zone() == mi->mi_zone);

	args.ctag = async ? TAG_READAHEAD : TAG_READ;

	args.array_len = 2;
	args.array = argop;

	nfs4_init_stateid_types(&sid_types);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
	    &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* read */
	argop[1].argop = OP_READ;
	rargs = &argop[1].nfs_argop4_u.opread;
	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
	    OP_READ, &sid_types, async);

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
		    "nfs4read: %s call, rp %s",
		    needrecov ? "recov" : "first",
		    rnode4info(rp)));

		/* Direct I/O may use the full transfer size per chunk. */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_tsize, count);
		else
			tsize = MIN(mi->mi_curread, count);

		rargs->offset = (offset4)offset;
		rargs->count = (count4)tsize;
		rargs->res_data_val_alt = NULL;
		rargs->res_mblk = NULL;
		rargs->res_uiop = NULL;
		rargs->res_maxsize = 0;
		rargs->wlist = NULL;

		/* Decode target: straight to the uio, or into 'base'. */
		if (uiop)
			rargs->res_uiop = uiop;
		else
			rargs->res_data_val_alt = base;
		rargs->res_maxsize = tsize;

		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
#ifdef	DEBUG
		if (nfs4read_error_inject) {
			res.status = nfs4read_error_inject;
			nfs4read_error_inject = 0;
		}
#endif

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
		if (e.error != 0 && !needrecov) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			return (e.error);
		}

		/*
		 * Do proper retry for OLD and BAD stateid errors outside
		 * of the normal recovery framework.  There are two differences
		 * between async and sync reads.  The first is that we allow
		 * retry on BAD_STATEID for async reads, but not sync reads.
		 * The second is that we mark the file dead for a failed
		 * attempt with a special stateid for sync reads, but just
		 * return EIO for async reads.
		 *
		 * If a sync read receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists).  If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
		    res.status == NFS4ERR_BAD_STATEID) && async) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			if (sid_types.cur_sid_type == SPEC_SID) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    !async && sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			/* The delegation is bad; arrange for its return. */
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				nfs4_end_fop(mi, vp, NULL, OH_READ,
				    &recov_state, needrecov);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4read: initiating recovery\n"));
			abort = nfs4_start_recovery(&e,
			    mi, vp, NULL, &rargs->stateid,
			    NULL, OP_READ, NULL, NULL, NULL);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/*
			 * Do not retry if we got OLD_STATEID using a special
			 * stateid.  This avoids looping with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    sid_types.cur_sid_type == SPEC_SID)
				abort = TRUE;

			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				goto recov_retry;
			}

			if (!e.error) {
				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			return (e.error);
		}

		if (res.status) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			return (e.error);
		}

		/* Advance by what the server returned (may be short). */
		data_len = res.array[1].nfs_resop4_u.opread.data_len;
		count -= data_len;
		if (base)
			base += data_len;
		offset += data_len;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
		is_eof = res.array[1].nfs_resop4_u.opread.eof;
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	} while (count && !is_eof);

	*residp = count;

	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);

	return (e.error);
}
3613 3621
3614 3622 /* ARGSUSED */
3615 3623 static int
3616 3624 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3617 3625 caller_context_t *ct)
3618 3626 {
3619 3627 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3620 3628 return (EIO);
3621 3629 switch (cmd) {
3622 3630 case _FIODIRECTIO:
3623 3631 return (nfs4_directio(vp, (int)arg, cr));
3624 3632 default:
3625 3633 return (ENOTTY);
3626 3634 }
3627 3635 }
3628 3636
/* ARGSUSED */
int
nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	rnode4_t *rp = VTOR4(vp);

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	if (flags & ATTR_HINT) {
		if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
			/* r_statelock protects r_size and the cached attrs */
			mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
				vap->va_rdev = rp->r_attr.va_rdev;
			mutex_exit(&rp->r_statelock);
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
	if (vap->va_mask & AT_MTIME) {
		rp = VTOR4(vp);
		if (nfs4_has_pages(vp)) {
			mutex_enter(&rp->r_statev4_lock);
			/*
			 * Skip the flush entirely when this client holds a
			 * write delegation; only flush otherwise.
			 */
			if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
				mutex_exit(&rp->r_statev4_lock);
				if (rp->r_flags & R4DIRTY ||
				    rp->r_awcount > 0) {
					/*
					 * Announce the flush via r_gcount so
					 * other threads can wait for it;
					 * wake them through r_cv when done.
					 */
					mutex_enter(&rp->r_statelock);
					rp->r_gcount++;
					mutex_exit(&rp->r_statelock);
					error =
					    nfs4_putpage(vp, (u_offset_t)0,
					    0, 0, cr, NULL);
					mutex_enter(&rp->r_statelock);
					/*
					 * Latch only the first ENOSPC/EDQUOT
					 * into r_error; other errors are
					 * ignored here.
					 */
					if (error && (error == ENOSPC ||
					    error == EDQUOT)) {
						if (!rp->r_error)
							rp->r_error = error;
					}
					if (--rp->r_gcount == 0)
						cv_broadcast(&rp->r_cv);
					mutex_exit(&rp->r_statelock);
				}
			} else {
				mutex_exit(&rp->r_statev4_lock);
			}
		}
	}
	return (nfs4getattr(vp, vap, cr));
}
3698 3706
/*
 * Compare the mode received from the server against the mode cached
 * on the client, ignoring the setuid and setgid bits (some servers
 * clear those as a side effect of an owner change).
 * Returns 0 (OK) when the modes agree apart from those two bits,
 * 1 (BAD) otherwise.
 */
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	mode_t masked = on_client & ~(mode_t)(S_ISUID | S_ISGID);

	return ((masked == from_server) ? 0 : 1);
}
3713 3721
3714 3722 /*ARGSUSED4*/
3715 3723 static int
3716 3724 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3717 3725 caller_context_t *ct)
3718 3726 {
3719 3727 int error;
3720 3728
3721 3729 if (vap->va_mask & AT_NOSET)
3722 3730 return (EINVAL);
3723 3731
3724 3732 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3725 3733 return (EIO);
3726 3734
3727 3735 /*
3728 3736 * Don't call secpolicy_vnode_setattr, the client cannot
3729 3737 * use its cached attributes to make security decisions
3730 3738 * as the server may be faking mode bits or mapping uid/gid.
3731 3739 * Always just let the server to the checking.
3732 3740 * If we provide the ability to remove basic priviledges
3733 3741 * to setattr (e.g. basic without chmod) then we will
3734 3742 * need to add a check here before calling the server.
3735 3743 */
3736 3744 error = nfs4setattr(vp, vap, flags, cr, NULL);
3737 3745
3738 3746 if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
3739 3747 vnevent_truncate(vp, ct);
3740 3748
3741 3749 return (error);
3742 3750 }
3743 3751
3744 3752 /*
3745 3753 * To replace the "guarded" version 3 setattr, we use two types of compound
3746 3754 * setattr requests:
3747 3755 * 1. The "normal" setattr, used when the size of the file isn't being
3748 3756 * changed - { Putfh <fh>; Setattr; Getattr }/
3749 3757 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3750 3758 * with only ctime as the argument. If the server ctime differs from
3751 3759 * what is cached on the client, the verify will fail, but we would
3752 3760 * already have the ctime from the preceding getattr, so just set it
3753 3761 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3754 3762 * Setattr; Getattr }.
3755 3763 *
3756 3764 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3757 3765 * this setattr and NULL if they are not.
3758 3766 */
static int
nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    vsecattr_t *vsap)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs4_ga_res_t *garp = NULL;
	int numops = 3;			/* { Putfh; Setattr; Getattr } */
	nfs_argop4 argop[5];
	int verify_argop = -1;
	int setattr_argop = 1;
	nfs_resop4 *resop;
	vattr_t va;
	rnode4_t *rp;
	int doqueue = 1;
	uint_t mask = vap->va_mask;
	mode_t omode;
	vsecattr_t *vsp;
	timestruc_t ctime;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	stateid4 stateid;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	servinfo4_t *svp;
	bitmap4 supp_attrs;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	rp = VTOR4(vp);
	nfs4_init_stateid_types(&sid_types);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with R4DIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	if (mask & AT_SIZE) {
		/*
		 * Verification setattr compound for non-deleg AT_SIZE:
		 * { Putfh; Getattr; Verify; Setattr; Getattr }
		 * Set ctime local here (outside the do_again label)
		 * so that subsequent retries (after failed VERIFY)
		 * will use ctime from GETATTR results (from failed
		 * verify compound) as VERIFY arg.
		 * If file has delegation, then VERIFY(time_metadata)
		 * is of little added value, so don't bother.
		 */
		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
		    rp->r_deleg_return_pending) {
			numops = 5;
			ctime = rp->r_attr.va_ctime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	args.ctag = TAG_SETATTR;
	/*
	 * do_again: restart after a failed VERIFY with a fresh ctime.
	 * recov_retry: restart after recovery or an OLD_STATEID retry.
	 */
do_again:
recov_retry:
	setattr_argop = numops - 2;

	args.array = argop;
	args.array_len = numops;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error)
		return (e.error);


	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	if (numops == 5) {
		/*
		 * We only care about the ctime, but need to get mtime
		 * and size for proper cache update.
		 */
		/* getattr */
		argop[1].argop = OP_GETATTR;
		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

		/* verify - set later in loop */
		verify_argop = 2;
	}

	/* setattr */
	svp = rp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
	    supp_attrs, &e.error, &sid_types);
	/* remember the stateid used for possible OLD_STATEID retry below */
	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
	if (e.error) {
		/* req time field(s) overflow - return immediately */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		return (e.error);
	}
	/* pre-setattr mode, used for setuid/setgid compensation below */
	omode = rp->r_attr.va_mode;

	/* getattr */
	argop[numops-1].argop = OP_GETATTR;
	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	/*
	 * If we are setting the ACL (indicated only by vsap != NULL), request
	 * the ACL in this getattr.  The ACL returned from this getattr will be
	 * used in updating the ACL cache.
	 */
	if (vsap != NULL)
		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
		    FATTR4_ACL_MASK;
	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	/*
	 * setattr iterates if the object size is set and the cached ctime
	 * does not match the file ctime. In that case, verify the ctime first.
	 */

	do {
		if (verify_argop != -1) {
			/*
			 * Verify that the ctime match before doing setattr.
			 */
			va.va_mask = AT_CTIME;
			va.va_ctime = ctime;
			svp = rp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);
			e.error = nfs4args_verify(&argop[verify_argop], &va,
			    OP_VERIFY, supp_attrs);
			if (e.error) {
				/* req time field(s) overflow - return */
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		doqueue = 1;

		t = gethrtime();

		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

		/*
		 * Purge the access cache and ACL cache if changing either the
		 * owner of the file, the group owner, or the mode.  These may
		 * change the access permissions of the file, so purge old
		 * information and start over again.
		 */
		if (mask & (AT_UID | AT_GID | AT_MODE)) {
			(void) nfs4_access_purge_rp(rp);
			if (rp->r_secattr != NULL) {
				mutex_enter(&rp->r_statelock);
				vsp = rp->r_secattr;
				rp->r_secattr = NULL;
				mutex_exit(&rp->r_statelock);
				if (vsp != NULL)
					nfs4_acl_free_cache(vsp);
			}
		}

		/*
		 * If res.array_len == numops, then everything succeeded,
		 * except for possibly the final getattr.  If only the
		 * last getattr failed, give up, and don't try recovery.
		 */
		if (res.array_len == numops) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			if (! e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL, NULL, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid.  This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock. See RFE
		 * 4777612.  Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which may cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check if verify failed to see if try again
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server. This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
			 */
			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		}
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			nfs4_purge_stale_fh(e.error, vp, cr);
		} else {
			/*
			 * retry with a new verify value
			 */
			ctime = garp->n4g_va.va_ctime;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			resp = NULL;
		}
		if (!e.error) {
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto do_again;
		}
	} while (!e.error);

	if (e.error) {
		/*
		 * If we are here, rfs4call has an irrecoverable error - return
		 */
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		if (verify_argop != -1) {
			nfs4args_verify_free(&argop[verify_argop]);
			verify_argop = -1;
		}
		if (resp)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
		return (e.error);
	}



	/*
	 * If changing the size of the file, invalidate
	 * any local cached data which is no longer part
	 * of the file.  We also possibly invalidate the
	 * last page in the file.  We could use
	 * pvn_vpzero(), but this would mark the page as
	 * modified and require it to be written back to
	 * the server for no particularly good reason.
	 * This way, if we access it, then we bring it
	 * back in.  A read should be cheaper than a
	 * write.
	 */
	if (mask & AT_SIZE) {
		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	garp = NULL;
	if (res.status == NFS4_OK) {
		/*
		 * Last getattr
		 */
		resop = &res.array[numops - 1];
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	/*
	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
	 * rather than filling it.  See the function itself for details.
	 */
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
	if (garp != NULL) {
		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
			vs_ace4_destroy(&garp->n4g_vsa);
		} else {
			if (vsap != NULL) {
				/*
				 * The ACL was supposed to be set and to be
				 * returned in the last getattr of this
				 * compound, but for some reason the getattr
				 * result doesn't contain the ACL.  In this
				 * case, purge the ACL cache.
				 */
				if (rp->r_secattr != NULL) {
					mutex_enter(&rp->r_statelock);
					vsp = rp->r_secattr;
					rp->r_secattr = NULL;
					mutex_exit(&rp->r_statelock);
					if (vsp != NULL)
						nfs4_acl_free_cache(vsp);
				}
			}
		}
	}

	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
		/*
		 * Set the size, rather than relying on getting it updated
		 * via a GETATTR.  With delegations the client tries to
		 * suppress GETATTR calls.
		 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid.  The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked the mode to be changed and what
				 * we just got from the server in getattr is
				 * not what we wanted it to be, so set it now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask the mode to be changed,
				 * Check to see that the server just cleared
				 * I_SUID and I_GUID from it. If not then
				 * set mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}
4252 4260
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/* Translate the VREAD/VWRITE/VEXEC request into ACCESS4 bits. */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	/*
	 * Ask the server for every access bit applicable to this vnode
	 * type, not just those requested, so one OTW call can satisfy
	 * later cache lookups too.
	 */
	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	/* Consult the access cache first to avoid the OTW call. */
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take with r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	/* remember the RPC status: if it failed, res was never filled in */
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4468 4476
4469 4477 /* ARGSUSED */
4470 4478 static int
4471 4479 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4472 4480 {
4473 4481 COMPOUND4args_clnt args;
4474 4482 COMPOUND4res_clnt res;
4475 4483 int doqueue;
4476 4484 rnode4_t *rp;
4477 4485 nfs_argop4 argop[3];
4478 4486 nfs_resop4 *resop;
4479 4487 READLINK4res *lr_res;
4480 4488 nfs4_ga_res_t *garp;
4481 4489 uint_t len;
4482 4490 char *linkdata;
4483 4491 bool_t needrecov = FALSE;
4484 4492 nfs4_recov_state_t recov_state;
4485 4493 hrtime_t t;
4486 4494 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4487 4495
4488 4496 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4489 4497 return (EIO);
4490 4498 /*
4491 4499 * Can't readlink anything other than a symbolic link.
4492 4500 */
4493 4501 if (vp->v_type != VLNK)
4494 4502 return (EINVAL);
4495 4503
4496 4504 rp = VTOR4(vp);
4497 4505 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4498 4506 e.error = nfs4_validate_caches(vp, cr);
4499 4507 if (e.error)
4500 4508 return (e.error);
4501 4509 mutex_enter(&rp->r_statelock);
4502 4510 if (rp->r_symlink.contents != NULL) {
4503 4511 e.error = uiomove(rp->r_symlink.contents,
4504 4512 rp->r_symlink.len, UIO_READ, uiop);
4505 4513 mutex_exit(&rp->r_statelock);
4506 4514 return (e.error);
4507 4515 }
4508 4516 mutex_exit(&rp->r_statelock);
4509 4517 }
4510 4518 recov_state.rs_flags = 0;
4511 4519 recov_state.rs_num_retry_despite_err = 0;
4512 4520
4513 4521 recov_retry:
4514 4522 args.array_len = 3;
4515 4523 args.array = argop;
4516 4524 args.ctag = TAG_READLINK;
4517 4525
4518 4526 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4519 4527 if (e.error) {
4520 4528 return (e.error);
4521 4529 }
4522 4530
4523 4531 /* 0. putfh symlink fh */
4524 4532 argop[0].argop = OP_CPUTFH;
4525 4533 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4526 4534
4527 4535 /* 1. readlink */
4528 4536 argop[1].argop = OP_READLINK;
4529 4537
4530 4538 /* 2. getattr */
4531 4539 argop[2].argop = OP_GETATTR;
4532 4540 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4533 4541 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4534 4542
4535 4543 doqueue = 1;
4536 4544
4537 4545 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4538 4546 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4539 4547 rnode4info(VTOR4(vp))));
4540 4548
4541 4549 t = gethrtime();
4542 4550
4543 4551 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4544 4552
4545 4553 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4546 4554 if (needrecov) {
4547 4555 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4548 4556 "nfs4_readlink: initiating recovery\n"));
4549 4557
4550 4558 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4551 4559 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4552 4560 if (!e.error)
4553 4561 (void) xdr_free(xdr_COMPOUND4res_clnt,
4554 4562 (caddr_t)&res);
4555 4563
4556 4564 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4557 4565 needrecov);
4558 4566 goto recov_retry;
4559 4567 }
4560 4568 }
4561 4569
4562 4570 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4563 4571
4564 4572 if (e.error)
4565 4573 return (e.error);
4566 4574
4567 4575 /*
4568 4576 * There is an path in the code below which calls
4569 4577 * nfs4_purge_stale_fh(), which may generate otw calls through
4570 4578 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4571 4579 * here to avoid nfs4_start_op() deadlock.
4572 4580 */
4573 4581
4574 4582 if (res.status && (res.array_len < args.array_len)) {
4575 4583 /*
4576 4584 * either Putfh or Link failed
4577 4585 */
4578 4586 e.error = geterrno4(res.status);
4579 4587 nfs4_purge_stale_fh(e.error, vp, cr);
4580 4588 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4581 4589 return (e.error);
4582 4590 }
4583 4591
4584 4592 resop = &res.array[1]; /* readlink res */
4585 4593 lr_res = &resop->nfs_resop4_u.opreadlink;
4586 4594
4587 4595 /*
4588 4596 * treat symlink names as data
4589 4597 */
4590 4598 linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4591 4599 if (linkdata != NULL) {
4592 4600 int uio_len = len - 1;
4593 4601 /* len includes null byte, which we won't uiomove */
4594 4602 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4595 4603 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4596 4604 mutex_enter(&rp->r_statelock);
4597 4605 if (rp->r_symlink.contents == NULL) {
4598 4606 rp->r_symlink.contents = linkdata;
4599 4607 rp->r_symlink.len = uio_len;
4600 4608 rp->r_symlink.size = len;
4601 4609 mutex_exit(&rp->r_statelock);
4602 4610 } else {
4603 4611 mutex_exit(&rp->r_statelock);
4604 4612 kmem_free(linkdata, len);
4605 4613 }
4606 4614 } else {
4607 4615 kmem_free(linkdata, len);
4608 4616 }
4609 4617 }
4610 4618 if (res.status == NFS4_OK) {
4611 4619 resop++; /* getattr res */
4612 4620 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4613 4621 }
4614 4622 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4615 4623
4616 4624 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4617 4625
4618 4626 /*
4619 4627 * The over the wire error for attempting to readlink something
4620 4628 * other than a symbolic link is ENXIO. However, we need to
4621 4629 * return EINVAL instead of ENXIO, so we map it here.
4622 4630 */
4623 4631 return (e.error == ENXIO ? EINVAL : e.error);
4624 4632 }
4625 4633
4626 4634 /*
4627 4635 * Flush local dirty pages to stable storage on the server.
4628 4636 *
4629 4637 * If FNODSYNC is specified, then there is nothing to do because
4630 4638 * metadata changes are not cached on the client before being
4631 4639 * sent to the server.
4632 4640 */
4633 4641 /* ARGSUSED */
4634 4642 static int
4635 4643 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4636 4644 {
4637 4645 int error;
4638 4646
4639 4647 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4640 4648 return (0);
4641 4649 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4642 4650 return (EIO);
4643 4651 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4644 4652 if (!error)
4645 4653 error = VTOR4(vp)->r_error;
4646 4654 return (error);
4647 4655 }
4648 4656
/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead. Here we
 * remove the renamed file.
 */
/* ARGSUSED */
void
nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	rp = VTOR4(vp);

	/* Shadow vnodes have their own, purely local, inactivation path. */
	if (IS_SHADOW(vp, rp)) {
		sv_inactive(vp);
		return;
	}

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously. We can get here due to
	 * VN_RELE() being called from pageout() or fsflush(). This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_inactive(vp, cr);
		return;
	}

	/*
	 * Some of the cleanup steps might require over-the-wire
	 * operations. Since VOP_INACTIVE can get called as a result of
	 * other over-the-wire operations (e.g., an attribute cache update
	 * can lead to a DNLC purge), doing those steps now would lead to a
	 * nested call to the recovery framework, which can deadlock. So
	 * do any over-the-wire cleanups asynchronously, in a separate
	 * thread.
	 */

	/* Lock-order: r_os_lock, then r_statelock, then r_statev4_lock. */
	mutex_enter(&rp->r_os_lock);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);

	/*
	 * Open streams still exist: closing them requires over-the-wire
	 * CLOSE operations, so hand off to the async thread.
	 */
	if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_os_lock);
		nfs4_async_inactive(vp, cr);
		return;
	}

	/*
	 * A delegation is held: returning it (DELEGRETURN) is an
	 * over-the-wire operation, so hand off to the async thread.
	 */
	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
	    rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_os_lock);
		nfs4_async_inactive(vp, cr);
		return;
	}

	/*
	 * An unlinked-open ("silly renamed") file remains: removing it
	 * requires an over-the-wire REMOVE, so hand off as well.
	 */
	if (rp->r_unldvp != NULL) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_os_lock);
		nfs4_async_inactive(vp, cr);
		return;
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	mutex_exit(&rp->r_os_lock);

	/* No over-the-wire cleanup needed; free the rnode locally. */
	rp4_addfree(rp, cr);
}
4725 4733
4726 4734 /*
4727 4735 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4728 4736 * various bits of state. The caller must not refer to vp after this call.
4729 4737 */
4730 4738
4731 4739 void
4732 4740 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4733 4741 {
4734 4742 rnode4_t *rp = VTOR4(vp);
4735 4743 nfs4_recov_state_t recov_state;
4736 4744 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4737 4745 vnode_t *unldvp;
4738 4746 char *unlname;
4739 4747 cred_t *unlcred;
4740 4748 COMPOUND4args_clnt args;
4741 4749 COMPOUND4res_clnt res, *resp;
4742 4750 nfs_argop4 argop[2];
4743 4751 int doqueue;
4744 4752 #ifdef DEBUG
4745 4753 char *name;
4746 4754 #endif
4747 4755
4748 4756 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4749 4757 ASSERT(!IS_SHADOW(vp, rp));
4750 4758
4751 4759 #ifdef DEBUG
4752 4760 name = fn_name(VTOSV(vp)->sv_name);
4753 4761 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4754 4762 "release vnode %s", name));
4755 4763 kmem_free(name, MAXNAMELEN);
4756 4764 #endif
4757 4765
4758 4766 if (vp->v_type == VREG) {
4759 4767 bool_t recov_failed = FALSE;
4760 4768
4761 4769 e.error = nfs4close_all(vp, cr);
4762 4770 if (e.error) {
4763 4771 /* Check to see if recovery failed */
4764 4772 mutex_enter(&(VTOMI4(vp)->mi_lock));
4765 4773 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4766 4774 recov_failed = TRUE;
4767 4775 mutex_exit(&(VTOMI4(vp)->mi_lock));
4768 4776 if (!recov_failed) {
4769 4777 mutex_enter(&rp->r_statelock);
4770 4778 if (rp->r_flags & R4RECOVERR)
4771 4779 recov_failed = TRUE;
4772 4780 mutex_exit(&rp->r_statelock);
4773 4781 }
4774 4782 if (recov_failed) {
4775 4783 NFS4_DEBUG(nfs4_client_recov_debug,
4776 4784 (CE_NOTE, "nfs4_inactive_otw: "
4777 4785 "close failed (recovery failure)"));
4778 4786 }
4779 4787 }
4780 4788 }
4781 4789
4782 4790 redo:
4783 4791 if (rp->r_unldvp == NULL) {
4784 4792 rp4_addfree(rp, cr);
4785 4793 return;
4786 4794 }
4787 4795
4788 4796 /*
4789 4797 * Save the vnode pointer for the directory where the
4790 4798 * unlinked-open file got renamed, then set it to NULL
4791 4799 * to prevent another thread from getting here before
4792 4800 * we're done with the remove. While we have the
4793 4801 * statelock, make local copies of the pertinent rnode
4794 4802 * fields. If we weren't to do this in an atomic way, the
4795 4803 * the unl* fields could become inconsistent with respect
4796 4804 * to each other due to a race condition between this
4797 4805 * code and nfs_remove(). See bug report 1034328.
4798 4806 */
4799 4807 mutex_enter(&rp->r_statelock);
4800 4808 if (rp->r_unldvp == NULL) {
4801 4809 mutex_exit(&rp->r_statelock);
4802 4810 rp4_addfree(rp, cr);
4803 4811 return;
4804 4812 }
4805 4813
4806 4814 unldvp = rp->r_unldvp;
4807 4815 rp->r_unldvp = NULL;
4808 4816 unlname = rp->r_unlname;
4809 4817 rp->r_unlname = NULL;
4810 4818 unlcred = rp->r_unlcred;
4811 4819 rp->r_unlcred = NULL;
4812 4820 mutex_exit(&rp->r_statelock);
4813 4821
4814 4822 /*
4815 4823 * If there are any dirty pages left, then flush
4816 4824 * them. This is unfortunate because they just
4817 4825 * may get thrown away during the remove operation,
4818 4826 * but we have to do this for correctness.
4819 4827 */
4820 4828 if (nfs4_has_pages(vp) &&
4821 4829 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4822 4830 ASSERT(vp->v_type != VCHR);
4823 4831 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4824 4832 if (e.error) {
4825 4833 mutex_enter(&rp->r_statelock);
4826 4834 if (!rp->r_error)
4827 4835 rp->r_error = e.error;
4828 4836 mutex_exit(&rp->r_statelock);
4829 4837 }
4830 4838 }
4831 4839
4832 4840 recov_state.rs_flags = 0;
4833 4841 recov_state.rs_num_retry_despite_err = 0;
4834 4842 recov_retry_remove:
4835 4843 /*
4836 4844 * Do the remove operation on the renamed file
4837 4845 */
4838 4846 args.ctag = TAG_INACTIVE;
4839 4847
4840 4848 /*
4841 4849 * Remove ops: putfh dir; remove
4842 4850 */
4843 4851 args.array_len = 2;
4844 4852 args.array = argop;
4845 4853
4846 4854 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4847 4855 if (e.error) {
4848 4856 kmem_free(unlname, MAXNAMELEN);
4849 4857 crfree(unlcred);
4850 4858 VN_RELE(unldvp);
4851 4859 /*
4852 4860 * Try again; this time around r_unldvp will be NULL, so we'll
4853 4861 * just call rp4_addfree() and return.
4854 4862 */
4855 4863 goto redo;
4856 4864 }
4857 4865
4858 4866 /* putfh directory */
4859 4867 argop[0].argop = OP_CPUTFH;
4860 4868 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4861 4869
4862 4870 /* remove */
4863 4871 argop[1].argop = OP_CREMOVE;
4864 4872 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4865 4873
4866 4874 doqueue = 1;
4867 4875 resp = &res;
4868 4876
4869 4877 #if 0 /* notyet */
4870 4878 /*
4871 4879 * Can't do this yet. We may be being called from
4872 4880 * dnlc_purge_XXX while that routine is holding a
4873 4881 * mutex lock to the nc_rele list. The calls to
4874 4882 * nfs3_cache_wcc_data may result in calls to
4875 4883 * dnlc_purge_XXX. This will result in a deadlock.
4876 4884 */
4877 4885 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4878 4886 if (e.error) {
4879 4887 PURGE_ATTRCACHE4(unldvp);
4880 4888 resp = NULL;
4881 4889 } else if (res.status) {
4882 4890 e.error = geterrno4(res.status);
4883 4891 PURGE_ATTRCACHE4(unldvp);
4884 4892 /*
4885 4893 * This code is inactive right now
4886 4894 * but if made active there should
4887 4895 * be a nfs4_end_op() call before
4888 4896 * nfs4_purge_stale_fh to avoid start_op()
4889 4897 * deadlock. See BugId: 4948726
4890 4898 */
4891 4899 nfs4_purge_stale_fh(error, unldvp, cr);
4892 4900 } else {
4893 4901 nfs_resop4 *resop;
4894 4902 REMOVE4res *rm_res;
4895 4903
4896 4904 resop = &res.array[1];
4897 4905 rm_res = &resop->nfs_resop4_u.opremove;
4898 4906 /*
4899 4907 * Update directory cache attribute,
4900 4908 * readdir and dnlc caches.
4901 4909 */
4902 4910 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4903 4911 }
4904 4912 #else
4905 4913 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4906 4914
4907 4915 PURGE_ATTRCACHE4(unldvp);
4908 4916 #endif
4909 4917
4910 4918 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4911 4919 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4912 4920 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4913 4921 if (!e.error)
4914 4922 (void) xdr_free(xdr_COMPOUND4res_clnt,
4915 4923 (caddr_t)&res);
4916 4924 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4917 4925 &recov_state, TRUE);
4918 4926 goto recov_retry_remove;
4919 4927 }
4920 4928 }
4921 4929 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4922 4930
4923 4931 /*
4924 4932 * Release stuff held for the remove
4925 4933 */
4926 4934 VN_RELE(unldvp);
4927 4935 if (!e.error && resp)
4928 4936 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4929 4937
4930 4938 kmem_free(unlname, MAXNAMELEN);
4931 4939 crfree(unlcred);
4932 4940 goto redo;
4933 4941 }
4934 4942
4935 4943 /*
4936 4944 * Remote file system operations having to do with directory manipulation.
4937 4945 */
4938 4946 /* ARGSUSED3 */
4939 4947 int
4940 4948 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4941 4949 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4942 4950 int *direntflags, pathname_t *realpnp)
4943 4951 {
4944 4952 int error;
4945 4953 vnode_t *vp, *avp = NULL;
4946 4954 rnode4_t *drp;
4947 4955
4948 4956 *vpp = NULL;
4949 4957 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4950 4958 return (EPERM);
4951 4959 /*
4952 4960 * if LOOKUP_XATTR, must replace dvp (object) with
4953 4961 * object's attrdir before continuing with lookup
4954 4962 */
4955 4963 if (flags & LOOKUP_XATTR) {
4956 4964 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4957 4965 if (error)
4958 4966 return (error);
4959 4967
4960 4968 dvp = avp;
4961 4969
4962 4970 /*
4963 4971 * If lookup is for "", just return dvp now. The attrdir
4964 4972 * has already been activated (from nfs4lookup_xattr), and
4965 4973 * the caller will RELE the original dvp -- not
4966 4974 * the attrdir. So, set vpp and return.
4967 4975 * Currently, when the LOOKUP_XATTR flag is
4968 4976 * passed to VOP_LOOKUP, the name is always empty, and
4969 4977 * shortcircuiting here avoids 3 unneeded lock/unlock
4970 4978 * pairs.
4971 4979 *
4972 4980 * If a non-empty name was provided, then it is the
4973 4981 * attribute name, and it will be looked up below.
4974 4982 */
4975 4983 if (*nm == '\0') {
4976 4984 *vpp = dvp;
4977 4985 return (0);
4978 4986 }
4979 4987
4980 4988 /*
4981 4989 * The vfs layer never sends a name when asking for the
4982 4990 * attrdir, so we should never get here (unless of course
4983 4991 * name is passed at some time in future -- at which time
4984 4992 * we'll blow up here).
4985 4993 */
4986 4994 ASSERT(0);
4987 4995 }
4988 4996
4989 4997 drp = VTOR4(dvp);
4990 4998 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4991 4999 return (EINTR);
4992 5000
4993 5001 error = nfs4lookup(dvp, nm, vpp, cr, 0);
4994 5002 nfs_rw_exit(&drp->r_rwlock);
4995 5003
4996 5004 /*
4997 5005 * If vnode is a device, create special vnode.
4998 5006 */
4999 5007 if (!error && ISVDEV((*vpp)->v_type)) {
5000 5008 vp = *vpp;
5001 5009 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
5002 5010 VN_RELE(vp);
5003 5011 }
5004 5012
5005 5013 return (error);
5006 5014 }
5007 5015
5008 5016 /* ARGSUSED */
5009 5017 static int
5010 5018 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5011 5019 {
5012 5020 int error;
5013 5021 rnode4_t *drp;
5014 5022 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5015 5023 mntinfo4_t *mi;
5016 5024
5017 5025 mi = VTOMI4(dvp);
5018 5026 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5019 5027 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5020 5028 return (EINVAL);
5021 5029
5022 5030 drp = VTOR4(dvp);
5023 5031 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5024 5032 return (EINTR);
5025 5033
5026 5034 mutex_enter(&drp->r_statelock);
5027 5035 /*
5028 5036 * If the server doesn't support xattrs just return EINVAL
5029 5037 */
5030 5038 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5031 5039 mutex_exit(&drp->r_statelock);
5032 5040 nfs_rw_exit(&drp->r_rwlock);
5033 5041 return (EINVAL);
5034 5042 }
5035 5043
5036 5044 /*
5037 5045 * If there is a cached xattr directory entry,
5038 5046 * use it as long as the attributes are valid. If the
5039 5047 * attributes are not valid, take the simple approach and
5040 5048 * free the cached value and re-fetch a new value.
5041 5049 *
5042 5050 * We don't negative entry cache for now, if we did we
5043 5051 * would need to check if the file has changed on every
5044 5052 * lookup. But xattrs don't exist very often and failing
5045 5053 * an openattr is not much more expensive than and NVERIFY or GETATTR
5046 5054 * so do an openattr over the wire for now.
5047 5055 */
5048 5056 if (drp->r_xattr_dir != NULL) {
5049 5057 if (ATTRCACHE4_VALID(dvp)) {
5050 5058 VN_HOLD(drp->r_xattr_dir);
5051 5059 *vpp = drp->r_xattr_dir;
5052 5060 mutex_exit(&drp->r_statelock);
5053 5061 nfs_rw_exit(&drp->r_rwlock);
5054 5062 return (0);
5055 5063 }
5056 5064 VN_RELE(drp->r_xattr_dir);
5057 5065 drp->r_xattr_dir = NULL;
5058 5066 }
5059 5067 mutex_exit(&drp->r_statelock);
5060 5068
5061 5069 error = nfs4openattr(dvp, vpp, cflag, cr);
5062 5070
5063 5071 nfs_rw_exit(&drp->r_rwlock);
5064 5072
5065 5073 return (error);
5066 5074 }
5067 5075
/*
 * Look up the name nm in directory dvp, going over the wire only when
 * necessary:
 *	- "" and "." are handled locally ("." with an access check).
 *	- Otherwise the DNLC is consulted (unless skipdnlc is set, which
 *	  forces a fresh over-the-wire lookup).
 *	- A DNLC miss goes over the wire (nfs4lookupnew_otw).
 *	- A DNLC hit with valid directory attributes is trusted, including
 *	  the negative entry DNLC_NO_VNODE (which yields ENOENT).
 *	- A DNLC hit with stale attributes must be validated against the
 *	  server (nfs4lookupvalidate_otw).
 *
 * On success *vpp holds a held vnode which the caller must VN_RELE.
 */
static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp. Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp. Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	drp = VTOR4(dvp);
	/* Record that this directory has been the subject of a lookup. */
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC. If there is no entry
	 * lookup over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
		 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc.  A positive entry is usable if the
	 * directory attributes are still valid; negative entries are
	 * additionally trusted on read-only filesystems (they cannot
	 * have changed underneath us).
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * If after the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				if (*vpp == DNLC_NO_VNODE) {
					/* Valid negative entry: no name. */
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
			/*
			 * Attrs went stale while waiting for the purge:
			 * fall through to the validate path below.
			 */
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * We may have gotten here we have one of the following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 *    need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 *    must validate.
	 *
	 * Go to the server and check if the directory has changed, if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}
5193 5201
5194 5202 /*
5195 5203 * Go to the server and check if the directory has changed, if
5196 5204 * it hasn't we are done and can use the dnlc entry. If it
5197 5205 * has changed we get a new copy of its attributes and check
5198 5206 * the access for VEXEC, then relookup the filename and
5199 5207 * get its filehandle and attributes.
5200 5208 *
5201 5209 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5202 5210 * if the NVERIFY failed we must
5203 5211 * purge the caches
5204 5212 * cache new attributes (will set r_time_attr_inval)
5205 5213 * cache new access
5206 5214 * recheck VEXEC access
5207 5215 * add name to dnlc, possibly negative
5208 5216 * if LOOKUP succeeded
5209 5217 * cache new attributes
5210 5218 * else
5211 5219 * set a new r_time_attr_inval for dvp
5212 5220 * check to make sure we have access
5213 5221 *
5214 5222 * The vpp returned is the vnode passed in if the directory is valid,
5215 5223 * a new vnode if successful lookup, or NULL on error.
5216 5224 */
5217 5225 static int
5218 5226 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5219 5227 {
5220 5228 COMPOUND4args_clnt args;
5221 5229 COMPOUND4res_clnt res;
5222 5230 fattr4 *ver_fattr;
5223 5231 fattr4_change dchange;
5224 5232 int32_t *ptr;
5225 5233 int argoplist_size = 7 * sizeof (nfs_argop4);
5226 5234 nfs_argop4 *argop;
5227 5235 int doqueue;
5228 5236 mntinfo4_t *mi;
5229 5237 nfs4_recov_state_t recov_state;
5230 5238 hrtime_t t;
5231 5239 int isdotdot;
5232 5240 vnode_t *nvp;
5233 5241 nfs_fh4 *fhp;
5234 5242 nfs4_sharedfh_t *sfhp;
5235 5243 nfs4_access_type_t cacc;
5236 5244 rnode4_t *nrp;
5237 5245 rnode4_t *drp = VTOR4(dvp);
5238 5246 nfs4_ga_res_t *garp = NULL;
5239 5247 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5240 5248
5241 5249 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5242 5250 ASSERT(nm != NULL);
5243 5251 ASSERT(nm[0] != '\0');
5244 5252 ASSERT(dvp->v_type == VDIR);
5245 5253 ASSERT(nm[0] != '.' || nm[1] != '\0');
5246 5254 ASSERT(*vpp != NULL);
5247 5255
5248 5256 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5249 5257 isdotdot = 1;
5250 5258 args.ctag = TAG_LOOKUP_VPARENT;
5251 5259 } else {
5252 5260 /*
5253 5261 * If dvp were a stub, it should have triggered and caused
5254 5262 * a mount for us to get this far.
5255 5263 */
5256 5264 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5257 5265
5258 5266 isdotdot = 0;
5259 5267 args.ctag = TAG_LOOKUP_VALID;
5260 5268 }
5261 5269
5262 5270 mi = VTOMI4(dvp);
5263 5271 recov_state.rs_flags = 0;
5264 5272 recov_state.rs_num_retry_despite_err = 0;
5265 5273
5266 5274 nvp = NULL;
5267 5275
5268 5276 /* Save the original mount point security information */
5269 5277 (void) save_mnt_secinfo(mi->mi_curr_serv);
5270 5278
5271 5279 recov_retry:
5272 5280 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5273 5281 &recov_state, NULL);
5274 5282 if (e.error) {
5275 5283 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5276 5284 VN_RELE(*vpp);
5277 5285 *vpp = NULL;
5278 5286 return (e.error);
5279 5287 }
5280 5288
5281 5289 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5282 5290
5283 5291 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5284 5292 args.array_len = 7;
5285 5293 args.array = argop;
5286 5294
5287 5295 /* 0. putfh file */
5288 5296 argop[0].argop = OP_CPUTFH;
5289 5297 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5290 5298
5291 5299 /* 1. nverify the change info */
5292 5300 argop[1].argop = OP_NVERIFY;
5293 5301 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5294 5302 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5295 5303 ver_fattr->attrlist4 = (char *)&dchange;
5296 5304 ptr = (int32_t *)&dchange;
5297 5305 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5298 5306 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5299 5307
5300 5308 /* 2. getattr directory */
5301 5309 argop[2].argop = OP_GETATTR;
5302 5310 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5303 5311 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5304 5312
5305 5313 /* 3. access directory */
5306 5314 argop[3].argop = OP_ACCESS;
5307 5315 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5308 5316 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5309 5317
5310 5318 /* 4. lookup name */
5311 5319 if (isdotdot) {
5312 5320 argop[4].argop = OP_LOOKUPP;
5313 5321 } else {
5314 5322 argop[4].argop = OP_CLOOKUP;
5315 5323 argop[4].nfs_argop4_u.opclookup.cname = nm;
5316 5324 }
5317 5325
5318 5326 /* 5. resulting file handle */
5319 5327 argop[5].argop = OP_GETFH;
5320 5328
5321 5329 /* 6. resulting file attributes */
5322 5330 argop[6].argop = OP_GETATTR;
5323 5331 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5324 5332 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5325 5333
5326 5334 doqueue = 1;
5327 5335 t = gethrtime();
5328 5336
5329 5337 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5330 5338
5331 5339 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5332 5340 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5333 5341 if (e.error != 0 && *vpp != NULL)
5334 5342 VN_RELE(*vpp);
5335 5343 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5336 5344 &recov_state, FALSE);
5337 5345 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5338 5346 kmem_free(argop, argoplist_size);
5339 5347 return (e.error);
5340 5348 }
5341 5349
5342 5350 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5343 5351 /*
5344 5352 * For WRONGSEC of a non-dotdot case, send secinfo directly
5345 5353 * from this thread, do not go thru the recovery thread since
5346 5354 * we need the nm information.
5347 5355 *
5348 5356 * Not doing dotdot case because there is no specification
5349 5357 * for (PUTFH, SECINFO "..") yet.
5350 5358 */
5351 5359 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5352 5360 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5353 5361 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5354 5362 &recov_state, FALSE);
5355 5363 else
5356 5364 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5357 5365 &recov_state, TRUE);
5358 5366 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5359 5367 kmem_free(argop, argoplist_size);
5360 5368 if (!e.error)
5361 5369 goto recov_retry;
5362 5370 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5363 5371 VN_RELE(*vpp);
5364 5372 *vpp = NULL;
5365 5373 return (e.error);
5366 5374 }
5367 5375
5368 5376 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5369 5377 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5370 5378 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5371 5379 &recov_state, TRUE);
5372 5380
5373 5381 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5374 5382 kmem_free(argop, argoplist_size);
5375 5383 goto recov_retry;
5376 5384 }
5377 5385 }
5378 5386
5379 5387 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5380 5388
5381 5389 if (e.error || res.array_len == 0) {
5382 5390 /*
5383 5391 * If e.error isn't set, then reply has no ops (or we couldn't
5384 5392 * be here). The only legal way to reply without an op array
5385 5393 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5386 5394 * be in the reply for all other status values.
5387 5395 *
5388 5396 * For valid replies without an ops array, return ENOTSUP
5389 5397 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5390 5398 * return EIO -- don't trust status.
5391 5399 */
5392 5400 if (e.error == 0)
5393 5401 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5394 5402 ENOTSUP : EIO;
5395 5403 VN_RELE(*vpp);
5396 5404 *vpp = NULL;
5397 5405 kmem_free(argop, argoplist_size);
5398 5406 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5399 5407 return (e.error);
5400 5408 }
5401 5409
5402 5410 if (res.status != NFS4ERR_SAME) {
5403 5411 e.error = geterrno4(res.status);
5404 5412
5405 5413 /*
5406 5414 * The NVERIFY "failed" so the directory has changed
5407 5415 * First make sure PUTFH succeeded and NVERIFY "failed"
5408 5416 * cleanly.
5409 5417 */
5410 5418 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5411 5419 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5412 5420 nfs4_purge_stale_fh(e.error, dvp, cr);
5413 5421 VN_RELE(*vpp);
5414 5422 *vpp = NULL;
5415 5423 goto exit;
5416 5424 }
5417 5425
5418 5426 /*
5419 5427 * We know the NVERIFY "failed" so we must:
5420 5428 * purge the caches (access and indirectly dnlc if needed)
5421 5429 */
5422 5430 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5423 5431
5424 5432 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5425 5433 nfs4_purge_stale_fh(e.error, dvp, cr);
5426 5434 VN_RELE(*vpp);
5427 5435 *vpp = NULL;
5428 5436 goto exit;
5429 5437 }
5430 5438
5431 5439 /*
5432 5440 * Install new cached attributes for the directory
5433 5441 */
5434 5442 nfs4_attr_cache(dvp,
5435 5443 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5436 5444 t, cr, FALSE, NULL);
5437 5445
5438 5446 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5439 5447 nfs4_purge_stale_fh(e.error, dvp, cr);
5440 5448 VN_RELE(*vpp);
5441 5449 *vpp = NULL;
5442 5450 e.error = geterrno4(res.status);
5443 5451 goto exit;
5444 5452 }
5445 5453
5446 5454 /*
5447 5455 * Now we know the directory is valid,
5448 5456 * cache new directory access
5449 5457 */
5450 5458 nfs4_access_cache(drp,
5451 5459 args.array[3].nfs_argop4_u.opaccess.access,
5452 5460 res.array[3].nfs_resop4_u.opaccess.access, cr);
5453 5461
5454 5462 /*
5455 5463 * recheck VEXEC access
5456 5464 */
5457 5465 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5458 5466 if (cacc != NFS4_ACCESS_ALLOWED) {
5459 5467 /*
5460 5468 * Directory permissions might have been revoked
5461 5469 */
5462 5470 if (cacc == NFS4_ACCESS_DENIED) {
5463 5471 e.error = EACCES;
5464 5472 VN_RELE(*vpp);
5465 5473 *vpp = NULL;
5466 5474 goto exit;
5467 5475 }
5468 5476
5469 5477 /*
5470 5478 * Somehow we must not have asked for enough
5471 5479 * so try a singleton ACCESS, should never happen.
5472 5480 */
5473 5481 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5474 5482 if (e.error) {
5475 5483 VN_RELE(*vpp);
5476 5484 *vpp = NULL;
5477 5485 goto exit;
5478 5486 }
5479 5487 }
5480 5488
5481 5489 e.error = geterrno4(res.status);
5482 5490 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5483 5491 /*
5484 5492 * The lookup failed, probably no entry
5485 5493 */
5486 5494 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5487 5495 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5488 5496 } else {
5489 5497 /*
5490 5498 * Might be some other error, so remove
5491 5499 * the dnlc entry to make sure we start all
5492 5500 * over again, next time.
5493 5501 */
5494 5502 dnlc_remove(dvp, nm);
5495 5503 }
5496 5504 VN_RELE(*vpp);
5497 5505 *vpp = NULL;
5498 5506 goto exit;
5499 5507 }
5500 5508
5501 5509 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5502 5510 /*
5503 5511 * The file exists but we can't get its fh for
5504 5512 * some unknown reason. Remove it from the dnlc
5505 5513 * and error out to be safe.
5506 5514 */
5507 5515 dnlc_remove(dvp, nm);
5508 5516 VN_RELE(*vpp);
5509 5517 *vpp = NULL;
5510 5518 goto exit;
5511 5519 }
5512 5520 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5513 5521 if (fhp->nfs_fh4_len == 0) {
5514 5522 /*
5515 5523 * The file exists but a bogus fh
5516 5524 * some unknown reason. Remove it from the dnlc
5517 5525 * and error out to be safe.
5518 5526 */
5519 5527 e.error = ENOENT;
5520 5528 dnlc_remove(dvp, nm);
5521 5529 VN_RELE(*vpp);
5522 5530 *vpp = NULL;
5523 5531 goto exit;
5524 5532 }
5525 5533 sfhp = sfh4_get(fhp, mi);
5526 5534
5527 5535 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5528 5536 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5529 5537
5530 5538 /*
5531 5539 * Make the new rnode
5532 5540 */
5533 5541 if (isdotdot) {
5534 5542 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5535 5543 if (e.error) {
5536 5544 sfh4_rele(&sfhp);
5537 5545 VN_RELE(*vpp);
5538 5546 *vpp = NULL;
5539 5547 goto exit;
5540 5548 }
5541 5549 /*
5542 5550 * XXX if nfs4_make_dotdot uses an existing rnode
5543 5551 * XXX it doesn't update the attributes.
5544 5552 * XXX for now just save them again to save an OTW
5545 5553 */
5546 5554 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5547 5555 } else {
5548 5556 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5549 5557 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5550 5558 /*
5551 5559 * If v_type == VNON, then garp was NULL because
5552 5560 * the last op in the compound failed and makenfs4node
5553 5561 * could not find the vnode for sfhp. It created
5554 5562 * a new vnode, so we have nothing to purge here.
5555 5563 */
5556 5564 if (nvp->v_type == VNON) {
5557 5565 vattr_t vattr;
5558 5566
5559 5567 vattr.va_mask = AT_TYPE;
5560 5568 /*
5561 5569 * N.B. We've already called nfs4_end_fop above.
5562 5570 */
5563 5571 e.error = nfs4getattr(nvp, &vattr, cr);
5564 5572 if (e.error) {
5565 5573 sfh4_rele(&sfhp);
5566 5574 VN_RELE(*vpp);
5567 5575 *vpp = NULL;
5568 5576 VN_RELE(nvp);
5569 5577 goto exit;
5570 5578 }
5571 5579 nvp->v_type = vattr.va_type;
5572 5580 }
5573 5581 }
5574 5582 sfh4_rele(&sfhp);
5575 5583
5576 5584 nrp = VTOR4(nvp);
5577 5585 mutex_enter(&nrp->r_statev4_lock);
5578 5586 if (!nrp->created_v4) {
5579 5587 mutex_exit(&nrp->r_statev4_lock);
5580 5588 dnlc_update(dvp, nm, nvp);
5581 5589 } else
5582 5590 mutex_exit(&nrp->r_statev4_lock);
5583 5591
5584 5592 VN_RELE(*vpp);
5585 5593 *vpp = nvp;
5586 5594 } else {
5587 5595 hrtime_t now;
5588 5596 hrtime_t delta = 0;
5589 5597
5590 5598 e.error = 0;
5591 5599
5592 5600 /*
5593 5601 * Because the NVERIFY "succeeded" we know that the
5594 5602 * directory attributes are still valid
5595 5603 * so update r_time_attr_inval
5596 5604 */
5597 5605 now = gethrtime();
5598 5606 mutex_enter(&drp->r_statelock);
5599 5607 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5600 5608 delta = now - drp->r_time_attr_saved;
5601 5609 if (delta < mi->mi_acdirmin)
5602 5610 delta = mi->mi_acdirmin;
5603 5611 else if (delta > mi->mi_acdirmax)
5604 5612 delta = mi->mi_acdirmax;
5605 5613 }
5606 5614 drp->r_time_attr_inval = now + delta;
5607 5615 mutex_exit(&drp->r_statelock);
5608 5616 dnlc_update(dvp, nm, *vpp);
5609 5617
5610 5618 /*
5611 5619 * Even though we have a valid directory attr cache
5612 5620 * and dnlc entry, we may not have access.
5613 5621 * This should almost always hit the cache.
5614 5622 */
5615 5623 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5616 5624 if (e.error) {
5617 5625 VN_RELE(*vpp);
5618 5626 *vpp = NULL;
5619 5627 }
5620 5628
5621 5629 if (*vpp == DNLC_NO_VNODE) {
5622 5630 VN_RELE(*vpp);
5623 5631 *vpp = NULL;
5624 5632 e.error = ENOENT;
5625 5633 }
5626 5634 }
5627 5635
5628 5636 exit:
5629 5637 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5630 5638 kmem_free(argop, argoplist_size);
5631 5639 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5632 5640 return (e.error);
5633 5641 }
5634 5642
/*
 * We need to go over the wire to lookup the name, but
 * while we are there verify the directory has not
 * changed; if it has, get new attributes and check access.
 *
 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
 *	NVERIFY GETATTR ACCESS
 *
 * With the results:
 *	if the NVERIFY failed we must purge the caches, add new attributes,
 *		and cache new access.
 *	set a new r_time_attr_inval
 *	add name to dnlc, possibly negative
 *	if LOOKUP succeeded
 *		cache new attributes
 *
 * Returns 0 on success or an errno; on success *vpp holds a held vnode
 * for the looked-up name.  *vpp must be NULL on entry.
 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	/* ".." uses LOOKUPP and a distinct compound tag */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/* 6. nverify the change info */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/* A migrated filesystem: follow the referral instead */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason.  Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but the server returned a bogus (empty)
		 * filehandle for some unknown reason.  Error out to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	sfhp = sfh4_get(fhp, mi);

	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/* Install the fresh directory attributes */
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough, so
			 * retry with a singleton ACCESS; should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			/* Clamp the new validity window to [acdirmin, acdirmax] */
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * Don't enter the name in the dnlc while an OTW OPEN that
	 * created the file is still pending (created_v4 set).
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

exit:
	/* Common cleanup: free the decoded results and the request array */
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
6039 6047
#ifdef DEBUG
/*
 * Debug aid: log a one-line description of each operation in a lookup
 * compound request via zcmn_err().  "where" identifies the caller.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	zoneid_t zoneid = getzoneid();
	uint_t i;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (i = 0; i < argcnt; i++) {
		nfs_argop4 *argp = &argbase[i];
		char *name;
		uint_t namelen;

		switch (argp->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
			break;
		case OP_CLOOKUP:
			/* cname is caller-owned; nothing to free */
			name = argp->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s",
			    i, name);
			break;
		case OP_LOOKUP:
			/* utf8_to_str allocates; free after logging */
			name = utf8_to_str(&argp->nfs_argop4_u.oplookup.objname,
			    &namelen, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s",
			    i, name);
			kmem_free(name, namelen);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
			    argp->argop);
			break;
		}
	}
}
#endif
6089 6097
6090 6098 /*
6091 6099 * nfs4lookup_setup - constructs a multi-lookup compound request.
6092 6100 *
6093 6101 * Given the path "nm1/nm2/.../nmn", the following compound requests
6094 6102 * may be created:
6095 6103 *
 * Note: Getfh is not needed because the filehandle attr is mandatory, but
 * getfh is faster, for now.
6098 6106 *
6099 6107 * l4_getattrs indicates the type of compound requested.
6100 6108 *
6101 6109 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6102 6110 *
6103 6111 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6104 6112 *
6105 6113 * total number of ops is n + 1.
6106 6114 *
6107 6115 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6108 6116 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6109 6117 * before the last component, and only get attributes
6110 6118 * for the last component. Note that the second-to-last
6111 6119 * pathname component is XATTR_RPATH, which does NOT go
6112 6120 * over-the-wire as a lookup.
6113 6121 *
6114 6122 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6115 6123 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6116 6124 *
6117 6125 * and total number of ops is n + 5.
6118 6126 *
6119 6127 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6120 6128 * attribute directory: create lookups plus an OPENATTR
6121 6129 * replacing the last lookup. Note that the last pathname
6122 6130 * component is XATTR_RPATH, which does NOT go over-the-wire
6123 6131 * as a lookup.
6124 6132 *
6125 6133 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6126 6134 * Openattr; Getfh; Getattr }
6127 6135 *
6128 6136 * and total number of ops is n + 5.
6129 6137 *
6130 6138 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6131 6139 * nodes too.
6132 6140 *
6133 6141 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6134 6142 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6135 6143 *
6136 6144 * and total number of ops is 3*n + 1.
6137 6145 *
6138 6146 * All cases: returns the index in the arg array of the final LOOKUP op, or
6139 6147 * -1 if no LOOKUPs were used.
6140 6148 */
int
nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
{
	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
	nfs_argop4 *argbase, *argop;
	int arglen, argcnt;
	int n = 1;		/* number of components */
	int nga = 1;		/* number of Getattr's in request */
	char c = '\0', *s, *p;
	int lookup_idx = -1;	/* arg index of the final LOOKUP, -1 if none */
	int argoplist_size;

	/* set lookuparg response result to 0 */
	lookupargp->resp->status = NFS4_OK;

	/* skip leading "/" or "." e.g. ".//./" if there is */
	for (; ; nm++) {
		if (*nm != '/' && *nm != '.')
			break;

		/* ".." is counted as 1 component */
		if (*nm == '.' && *(nm + 1) != '/')
			break;
	}

	/*
	 * Find n = number of components - nm must be null terminated
	 * Skip "." components.
	 */
	if (*nm != '\0')
		for (n = 1, s = nm; *s != '\0'; s++) {
			/* count a separator unless it starts "//", "/." or ends the path */
			if ((*s == '/') && (*(s + 1) != '/') &&
			    (*(s + 1) != '\0') &&
			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
			    *(s + 2) == '\0')))
				n++;
		}
	else
		n = 0;

	/*
	 * nga is number of components that need Getfh+Getattr
	 */
	switch (l4_getattrs) {
	case LKP4_NO_ATTRIBUTES:
		nga = 0;
		break;
	case LKP4_ALL_ATTRIBUTES:
		nga = n;
		/*
		 * Always have at least 1 getfh, getattr pair
		 */
		if (nga == 0)
			nga++;
		break;
	case LKP4_LAST_ATTRDIR:
	case LKP4_LAST_NAMED_ATTR:
		nga = n+1;
		break;
	}

	/*
	 * If change to use the filehandle attr instead of getfh
	 * the following line can be deleted.
	 */
	nga *= 2;

	/*
	 * calculate number of ops in request as
	 * header + trailer + lookups + getattrs
	 */
	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;

	argoplist_size = arglen * sizeof (nfs_argop4);
	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
	lookupargp->argsp->array = argop;

	/* leave room for the caller-supplied header ops */
	argcnt = lookupargp->header_len;
	argop += argcnt;

	/*
	 * loop and create a lookup op and possibly getattr/getfh for
	 * each component. Skip "." components.
	 */
	for (s = nm; *s != '\0'; s = p) {
		/*
		 * Set up a pathname struct for each component if needed
		 */
		while (*s == '/')
			s++;
		if (*s == '\0')
			break;

		/*
		 * NUL-terminate the component in place; the saved
		 * character c is restored (*p = c) before continuing.
		 */
		for (p = s; (*p != '/') && (*p != '\0'); p++)
			;
		c = *p;
		*p = '\0';

		if (s[0] == '.' && s[1] == '\0') {
			*p = c;
			continue;
		}
		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;

			/* openattr */
			argop->argop = OP_OPENATTR;
		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* openattr */
			argop->argop = OP_OPENATTR;
			argop++;
			argcnt++;

			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
			*p = c;
			continue;
		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
			/* lookupp */
			argop->argop = OP_LOOKUPP;
		} else {
			/* lookup */
			argop->argop = OP_LOOKUP;
			(void) str_to_utf8(s,
			    &argop->nfs_argop4_u.oplookup.objname);
		}
		lookup_idx = argcnt;
		argop++;
		argcnt++;

		*p = c;

		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
		}
	}

	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
		if (needgetfh) {
			/* stick in a post-lookup getfh */
			argop->argop = OP_GETFH;
			argcnt++;
			argop++;
		}
		/* post-lookup getattr */
		argop->argop = OP_GETATTR;
		argop->nfs_argop4_u.opgetattr.attr_request =
		    lookupargp->ga_bits;
		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
		argcnt++;
	}
	argcnt += lookupargp->trailer_len;	/* actual op count */
	lookupargp->argsp->array_len = argcnt;
	lookupargp->arglen = arglen;

#ifdef DEBUG
	if (nfs4_client_lookup_debug)
		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
#endif

	return (lookup_idx);
}
6341 6349
6342 6350 static int
6343 6351 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6344 6352 {
6345 6353 COMPOUND4args_clnt args;
6346 6354 COMPOUND4res_clnt res;
6347 6355 GETFH4res *gf_res = NULL;
6348 6356 nfs_argop4 argop[4];
6349 6357 nfs_resop4 *resop = NULL;
6350 6358 nfs4_sharedfh_t *sfhp;
6351 6359 hrtime_t t;
6352 6360 nfs4_error_t e;
6353 6361
6354 6362 rnode4_t *drp;
6355 6363 int doqueue = 1;
6356 6364 vnode_t *vp;
6357 6365 int needrecov = 0;
6358 6366 nfs4_recov_state_t recov_state;
6359 6367
6360 6368 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6361 6369
6362 6370 *avp = NULL;
6363 6371 recov_state.rs_flags = 0;
6364 6372 recov_state.rs_num_retry_despite_err = 0;
6365 6373
6366 6374 recov_retry:
6367 6375 /* COMPOUND: putfh, openattr, getfh, getattr */
6368 6376 args.array_len = 4;
6369 6377 args.array = argop;
6370 6378 args.ctag = TAG_OPENATTR;
6371 6379
6372 6380 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6373 6381 if (e.error)
6374 6382 return (e.error);
6375 6383
6376 6384 drp = VTOR4(dvp);
6377 6385
6378 6386 /* putfh */
6379 6387 argop[0].argop = OP_CPUTFH;
6380 6388 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6381 6389
6382 6390 /* openattr */
6383 6391 argop[1].argop = OP_OPENATTR;
6384 6392 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6385 6393
6386 6394 /* getfh */
6387 6395 argop[2].argop = OP_GETFH;
6388 6396
6389 6397 /* getattr */
6390 6398 argop[3].argop = OP_GETATTR;
6391 6399 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6392 6400 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6393 6401
6394 6402 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6395 6403 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6396 6404 rnode4info(drp)));
6397 6405
6398 6406 t = gethrtime();
6399 6407
6400 6408 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6401 6409
6402 6410 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6403 6411 if (needrecov) {
6404 6412 bool_t abort;
6405 6413
6406 6414 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6407 6415 "nfs4openattr: initiating recovery\n"));
6408 6416
6409 6417 abort = nfs4_start_recovery(&e,
6410 6418 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6411 6419 OP_OPENATTR, NULL, NULL, NULL);
6412 6420 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6413 6421 if (!e.error) {
6414 6422 e.error = geterrno4(res.status);
6415 6423 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6416 6424 }
6417 6425 if (abort == FALSE)
6418 6426 goto recov_retry;
6419 6427 return (e.error);
6420 6428 }
6421 6429
6422 6430 if (e.error) {
6423 6431 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6424 6432 return (e.error);
6425 6433 }
6426 6434
6427 6435 if (res.status) {
		/*
		 * If the OTW error is NOTSUPP, then it should be
		 * translated to EINVAL.  All Solaris file system
		 * implementations return EINVAL to the syscall layer
		 * when the attrdir cannot be created due to an
		 * implementation restriction or noxattr mount option.
		 */
6435 6443 if (res.status == NFS4ERR_NOTSUPP) {
6436 6444 mutex_enter(&drp->r_statelock);
6437 6445 if (drp->r_xattr_dir)
6438 6446 VN_RELE(drp->r_xattr_dir);
6439 6447 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6440 6448 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6441 6449 mutex_exit(&drp->r_statelock);
6442 6450
6443 6451 e.error = EINVAL;
6444 6452 } else {
6445 6453 e.error = geterrno4(res.status);
6446 6454 }
6447 6455
6448 6456 if (e.error) {
6449 6457 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6450 6458 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6451 6459 needrecov);
6452 6460 return (e.error);
6453 6461 }
6454 6462 }
6455 6463
6456 6464 resop = &res.array[0]; /* putfh res */
6457 6465 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6458 6466
6459 6467 resop = &res.array[1]; /* openattr res */
6460 6468 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6461 6469
6462 6470 resop = &res.array[2]; /* getfh res */
6463 6471 gf_res = &resop->nfs_resop4_u.opgetfh;
6464 6472 if (gf_res->object.nfs_fh4_len == 0) {
6465 6473 *avp = NULL;
6466 6474 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6467 6475 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6468 6476 return (ENOENT);
6469 6477 }
6470 6478
6471 6479 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6472 6480 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6473 6481 dvp->v_vfsp, t, cr, dvp,
6474 6482 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6475 6483 sfh4_rele(&sfhp);
6476 6484
6477 6485 if (e.error)
6478 6486 PURGE_ATTRCACHE4(vp);
6479 6487
6480 6488 mutex_enter(&vp->v_lock);
6481 6489 vp->v_flag |= V_XATTRDIR;
6482 6490 mutex_exit(&vp->v_lock);
6483 6491
6484 6492 *avp = vp;
6485 6493
6486 6494 mutex_enter(&drp->r_statelock);
6487 6495 if (drp->r_xattr_dir)
6488 6496 VN_RELE(drp->r_xattr_dir);
6489 6497 VN_HOLD(vp);
6490 6498 drp->r_xattr_dir = vp;
6491 6499
6492 6500 /*
6493 6501 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6494 6502 * NULL. xattrs could be created at any time, and we have no
6495 6503 * way to update pc4_xattr_exists in the base object if/when
6496 6504 * it happens.
6497 6505 */
6498 6506 drp->r_pathconf.pc4_xattr_valid = 0;
6499 6507
6500 6508 mutex_exit(&drp->r_statelock);
6501 6509
6502 6510 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6503 6511
6504 6512 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6505 6513
6506 6514 return (0);
6507 6515 }
6508 6516
/*
 * nfs4_create: VOP_CREATE entry point for the NFSv4 client.
 *
 * Create (or truncate) the object 'nm' in directory 'dvp' with the
 * attributes in 'va'.  An over-the-wire lookup (never the DNLC) decides
 * whether the object already exists:
 *  - exists and exclusive == EXCL	-> EEXIST
 *  - exists, non-exclusive		-> access check, optional truncate
 *  - does not exist			-> create_otw: regular files via
 *	nfs4open_otw() (OPEN with create), everything else via nfs4mknod().
 *
 * The directory rwlock (drp->r_rwlock) is held as writer for the
 * duration; on success *vpp holds the (held) result vnode.
 * Returns 0 on success, otherwise an errno.
 */
/* ARGSUSED */
static int
nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
    int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	int error;
	vnode_t *vp = NULL;
	rnode4_t *rp;
	struct vattr vattr;
	rnode4_t *drp;
	vnode_t *tempvp;
	enum createmode4 createmode;
	bool_t must_trunc = FALSE;
	int truncating = 0;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/* Exclusive create in an extended-attribute directory is rejected. */
	if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
		return (EINVAL);
	}

	/* . and .. have special meaning in the protocol, reject them. */

	if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
		return (EISDIR);

	drp = VTOR4(dvp);

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

top:
	/*
	 * We make a copy of the attributes because the caller does not
	 * expect us to change what va points to.
	 */
	vattr = *va;

	/*
	 * If the pathname is "", then dvp is the root vnode of
	 * a remote file mounted over a local directory.
	 * All that needs to be done is access
	 * checking and truncation.  Note that we avoid doing
	 * open w/ create because the parent directory might
	 * be in pseudo-fs and the open would fail.
	 */
	if (*nm == '\0') {
		error = 0;
		VN_HOLD(dvp);
		vp = dvp;
		must_trunc = TRUE;
	} else {
		/*
		 * We need to go over the wire, just to be sure whether the
		 * file exists or not.  Using the DNLC can be dangerous in
		 * this case when making a decision regarding existence.
		 */
		error = nfs4lookup(dvp, nm, &vp, cr, 1);
	}

	/*
	 * GUARDED4 (rather than UNCHECKED4) makes a retransmitted CREATE
	 * fail with EEXIST instead of silently truncating the file; see
	 * the detailed explanation near the end of this function.
	 */
	if (exclusive)
		createmode = EXCLUSIVE4;
	else
		createmode = GUARDED4;

	/*
	 * error would be set if the file does not exist on the
	 * server, so lets go create it.
	 */
	if (error) {
		goto create_otw;
	}

	/*
	 * File does exist on the server
	 */
	if (exclusive == EXCL)
		error = EEXIST;
	else if (vp->v_type == VDIR && (mode & VWRITE))
		error = EISDIR;
	else {
		/*
		 * If vnode is a device, create special vnode.
		 */
		if (ISVDEV(vp->v_type)) {
			tempvp = vp;
			vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
			VN_RELE(tempvp);
		}
		if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
			if ((vattr.va_mask & AT_SIZE) &&
			    vp->v_type == VREG) {
				rp = VTOR4(vp);
				/*
				 * Check here for large file handled
				 * by LF-unaware process (as
				 * ufs_create() does)
				 */
				if (!(flags & FOFFMAX)) {
					mutex_enter(&rp->r_statelock);
					if (rp->r_size > MAXOFF32_T)
						error = EOVERFLOW;
					mutex_exit(&rp->r_statelock);
				}

				/* if error is set then we need to return */
				if (error) {
					nfs_rw_exit(&drp->r_rwlock);
					VN_RELE(vp);
					return (error);
				}

				if (must_trunc) {
					vattr.va_mask = AT_SIZE;
					error = nfs4setattr(vp, &vattr, 0, cr,
					    NULL);
				} else {
					/*
					 * we know we have a regular file that already
					 * exists and we may end up truncating the file
					 * as a result of the open_otw, so flush out
					 * any dirty pages for this file first.
					 */
					if (nfs4_has_pages(vp) &&
					    ((rp->r_flags & R4DIRTY) ||
					    rp->r_count > 0 ||
					    rp->r_mapcnt > 0)) {
						error = nfs4_putpage(vp,
						    (offset_t)0, 0, 0, cr, ct);
						if (error && (error == ENOSPC ||
						    error == EDQUOT)) {
							mutex_enter(
							    &rp->r_statelock);
							if (!rp->r_error)
								rp->r_error =
								    error;
							mutex_exit(
							    &rp->r_statelock);
						}
					}
					vattr.va_mask = (AT_SIZE |
					    AT_TYPE | AT_MODE);
					vattr.va_type = VREG;
					createmode = UNCHECKED4;
					truncating = 1;
					goto create_otw;
				}
			}
		}
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (error) {
		VN_RELE(vp);
	} else {
		vnode_t *tvp;
		rnode4_t *trp;
		tvp = vp;
		if (vp->v_type == VREG) {
			trp = VTOR4(vp);
			if (IS_SHADOW(vp, trp))
				tvp = RTOV4(trp);
		}

		if (must_trunc) {
			/*
			 * existing file got truncated, notify.
			 */
			vnevent_create(tvp, ct);
		}

		*vpp = vp;
	}
	return (error);

create_otw:
	dnlc_remove(dvp, nm);

	ASSERT(vattr.va_mask & AT_TYPE);

	/*
	 * If not a regular file let nfs4mknod() handle it.
	 */
	if (vattr.va_type != VREG) {
		error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * It _is_ a regular file.
	 */
	ASSERT(vattr.va_mask & AT_MODE);
	if (MANDMODE(vattr.va_mode)) {
		nfs_rw_exit(&drp->r_rwlock);
		return (EACCES);
	}

	/*
	 * If this happens to be a mknod of a regular file, then flags will
	 * have neither FREAD or FWRITE.  However, we must set at least one
	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
	 * set (based on openmode specified by app).
	 */
	if ((flags & (FREAD|FWRITE)) == 0)
		flags |= (FREAD|FWRITE);

	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);

	if (vp != NULL) {
		/* if create was successful, throw away the file's pages */
		if (!error && (vattr.va_mask & AT_SIZE))
			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
			    cr);
		/* release the lookup hold */
		VN_RELE(vp);
		vp = NULL;
	}

	/*
	 * validate that we opened a regular file. This handles a misbehaving
	 * server that returns an incorrect FH.
	 */
	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
		error = EISDIR;
		VN_RELE(*vpp);
	}

	/*
	 * If this is not an exclusive create, then the CREATE
	 * request will be made with the GUARDED mode set.  This
	 * means that the server will return EEXIST if the file
	 * exists.  The file could exist because of a retransmitted
	 * request.  In this case, we recover by starting over and
	 * checking to see whether the file exists.  This second
	 * time through it should and a CREATE request will not be
	 * sent.
	 *
	 * This handles the problem of a dangling CREATE request
	 * which contains attributes which indicate that the file
	 * should be truncated.  This retransmitted request could
	 * possibly truncate valid data in the file if not caught
	 * by the duplicate request mechanism on the server or if
	 * not caught by other means.  The scenario is:
	 *
	 * Client transmits CREATE request with size = 0
	 * Client times out, retransmits request.
	 * Response to the first request arrives from the server
	 *  and the client proceeds on.
	 * Client writes data to the file.
	 * The server now processes retransmitted CREATE request
	 *  and truncates file.
	 *
	 * The use of the GUARDED CREATE request prevents this from
	 * happening because the retransmitted CREATE would fail
	 * with EEXIST and would not truncate the file.
	 */
	if (error == EEXIST && exclusive == NONEXCL) {
#ifdef DEBUG
		nfs4_create_misses++;
#endif
		goto top;
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (truncating && !error && *vpp) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * existing file got truncated, notify.
		 */
		tvp = *vpp;
		trp = VTOR4(tvp);
		if (IS_SHADOW(tvp, trp))
			tvp = RTOV4(trp);
		vnevent_create(tvp, ct);
	}
	return (error);
}
6788 6796
/*
 * Create compound (for mkdir, mknod, symlink):
 * { Putfh <dfh>; Create; Getfh; Getattr }
 * It's okay if setattr failed to set gid - this is not considered
 * an error, but purge attrs in that case.
 *
 * dvp/nm: parent directory and name of the new object.
 * data:   type-specific payload — the symlink target (char *) for
 *         NF4LNK, a specdata4 with device numbers for NF4BLK/NF4CHR,
 *         unused for the other types.
 * va:     attributes to create with; may be modified here to inherit
 *         the parent directory's setgid group.
 * vpp:    on success, holds the new (held) vnode.
 * type:   one of NF4DIR/NF4LNK/NF4BLK/NF4CHR/NF4SOCK/NF4FIFO (see the
 *         ASSERT below) — regular files never come through here.
 *
 * Returns 0 on success, otherwise an errno.
 */
static int
call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
    vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
{
	int need_end_op = FALSE;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp = VTOR4(dvp);
	change_info4 *cinfo;
	GETFH4res *gf_res;
	struct vattr vattr;
	vnode_t *vp;
	fattr4 *crattr;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_sharedfh_t *sfhp = NULL;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
	dirattr_info_t dinfo, *dinfop;
	servinfo4_t *svp;
	bitmap4 supp_attrs;

	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
	    type == NF4CHR || type == NF4SOCK || type == NF4FIFO);

	mi = VTOMI4(dvp);

	/*
	 * Make sure we properly deal with setting the right gid
	 * on a new directory to reflect the parent's setgid bit
	 */
	setgid_flag = 0;
	if (type == NF4DIR) {
		struct vattr dva;

		va->va_mode &= ~VSGID;
		dva.va_mask = AT_MODE | AT_GID;
		if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {

			/*
			 * If the parent's directory has the setgid bit set
			 * _and_ the client was able to get a valid mapping
			 * for the parent dir's owner_group, we want to
			 * append NVERIFY(owner_group == dva.va_gid) and
			 * SETTATTR to the CREATE compound.
			 */
			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
				setgid_flag = 1;
				va->va_mode |= VSGID;
				if (dva.va_gid != GID_NOBODY) {
					va->va_mask |= AT_GID;
					va->va_gid = dva.va_gid;
				}
			}
		}
	}

	/*
	 * Create ops:
	 *	0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
	 *	5:restorefh(dir) 6:getattr(dir)
	 *
	 * if (setgid)
	 *	0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
	 *	4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
	 *	8:nverify 9:setattr
	 */
	if (setgid_flag) {
		numops = 10;
		idx_create = 1;
		idx_fattr = 3;
	} else {
		numops = 7;
		idx_create = 2;
		idx_fattr = 4;
	}

	ASSERT(nfs_zone() == mi->mi_zone);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
		return (EINTR);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	argoplist_size = numops * sizeof (nfs_argop4);
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

recov_retry:
	/* The compound tag labels the request by the high-level operation. */
	if (type == NF4LNK)
		args.ctag = TAG_SYMLINK;
	else if (type == NF4DIR)
		args.ctag = TAG_MKDIR;
	else
		args.ctag = TAG_MKNOD;

	args.array_len = numops;
	args.array = argop;

	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
		nfs_rw_exit(&drp->r_rwlock);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}
	/* from here on, error paths must balance with nfs4_end_op() */
	need_end_op = TRUE;


	/* 0: putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* 1/2: Create object */
	argop[idx_create].argop = OP_CCREATE;
	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
	argop[idx_create].nfs_argop4_u.opccreate.type = type;
	if (type == NF4LNK) {
		/*
		 * symlink, treat name as data
		 */
		ASSERT(data != NULL);
		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
		    (char *)data;
	}
	if (type == NF4BLK || type == NF4CHR) {
		ASSERT(data != NULL);
		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
		    *((specdata4 *)data);
	}

	crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;

	/* snapshot the server's supported-attribute mask under sv_lock */
	svp = drp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
		nfs_rw_exit(&drp->r_rwlock);
		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
		e.error = EINVAL;
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	/* 2/3: getfh fh of created object */
	ASSERT(idx_create + 1 == idx_fattr - 1);
	argop[idx_create + 1].argop = OP_GETFH;

	/* 3/4: getattr of new object */
	argop[idx_fattr].argop = OP_GETATTR;
	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;

	if (setgid_flag) {
		vattr_t _v;

		argop[4].argop = OP_SAVEFH;

		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = mi;

		argop[7].argop = OP_RESTOREFH;

		/*
		 * nverify
		 *
		 * XXX - Revisit the last argument to nfs4_end_op()
		 *	 once 5020486 is fixed.
		 */
		_v.va_mask = AT_GID;
		_v.va_gid = va->va_gid;
		if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
		    supp_attrs)) {
			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
			nfs_rw_exit(&drp->r_rwlock);
			nfs4_fattr4_free(crattr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}

		/*
		 * setattr
		 *
		 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
		 * so no need for stateid or flags. Also we specify NULL
		 * rp since we're only interested in setting owner_group
		 * attributes.
		 */
		nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
		    &e.error, 0);

		if (e.error) {
			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
			nfs_rw_exit(&drp->r_rwlock);
			nfs4_fattr4_free(crattr);
			nfs4args_verify_free(&argop[8]);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	} else {
		argop[1].argop = OP_SAVEFH;

		argop[5].argop = OP_RESTOREFH;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = mi;
	}

	dnlc_remove(dvp, nm);

	doqueue = 1;
	t = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(dvp);
		if (!needrecov)
			goto out;
	}

	if (needrecov) {
		/*
		 * If recovery can proceed, free this attempt's resources
		 * and resend the whole compound from recov_retry.
		 */
		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_CREATE, NULL, NULL, NULL) == FALSE) {
			nfs4_end_op(mi, dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			nfs4_fattr4_free(crattr);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
	}

	resp = &res;

	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {

		if (res.status == NFS4ERR_BADOWNER)
			nfs4_log_badowner(mi, OP_CREATE);

		e.error = geterrno4(res.status);

		/*
		 * This check is left over from when create was implemented
		 * using a setattr op (instead of createattrs).  If the
		 * putfh/create/getfh failed, the error was returned.  If
		 * setattr/getattr failed, we keep going.
		 *
		 * It might be better to get rid of the GETFH also, and just
		 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
		 * Then if any of the operations failed, we could return the
		 * error now, and remove much of the error code below.
		 */
		if (res.array_len <= idx_fattr) {
			/*
			 * Either Putfh, Create or Getfh failed.
			 */
			PURGE_ATTRCACHE4(dvp);
			/*
			 * nfs4_purge_stale_fh() may generate otw calls through
			 * nfs4_invalidate_pages. Hence the need to call
			 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
			 */
			nfs4_end_op(mi, dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			goto out;
		}
	}

	resop = &res.array[idx_create];	/* create res */
	cinfo = &resop->nfs_resop4_u.opcreate.cinfo;

	resop = &res.array[idx_create + 1]; /* getfh res */
	gf_res = &resop->nfs_resop4_u.opgetfh;

	sfhp = sfh4_get(&gf_res->object, mi);
	if (e.error) {
		/*
		 * The compound partially failed: build the node without
		 * attributes and fetch the type separately if unknown.
		 */
		*vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
		    fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
		if (vp->v_type == VNON) {
			vattr.va_mask = AT_TYPE;
			/*
			 * Need to call nfs4_end_op before nfs4getattr to avoid
			 * potential nfs4_start_op deadlock. See RFE 4777612.
			 */
			nfs4_end_op(mi, dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			e.error = nfs4getattr(vp, &vattr, cr);
			if (e.error) {
				VN_RELE(vp);
				*vpp = NULL;
				goto out;
			}
			vp->v_type = vattr.va_type;
		}
		e.error = 0;
	} else {
		*vpp = vp = makenfs4node(sfhp,
		    &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
		    dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}

	/*
	 * If compound succeeded, then update dir attrs
	 */
	if (res.status == NFS4_OK) {
		dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
		dinfo.di_cred = cr;
		dinfo.di_time_call = t;
		dinfop = &dinfo;
	} else
		dinfop = NULL;

	/* Update directory cache attribute, readdir and dnlc caches */
	nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);

out:
	/* common exit: release shared fh, locks, compound resources */
	if (sfhp != NULL)
		sfh4_rele(&sfhp);
	nfs_rw_exit(&drp->r_rwlock);
	nfs4_fattr4_free(crattr);
	if (setgid_flag) {
		nfs4args_verify_free(&argop[8]);
		nfs4args_setattr_free(&argop[9]);
	}
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	if (need_end_op)
		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);

	kmem_free(argop, argoplist_size);
	return (e.error);
}
7145 7153
7146 7154 /* ARGSUSED */
7147 7155 static int
7148 7156 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7149 7157 int mode, vnode_t **vpp, cred_t *cr)
7150 7158 {
7151 7159 int error;
7152 7160 vnode_t *vp;
7153 7161 nfs_ftype4 type;
7154 7162 specdata4 spec, *specp = NULL;
7155 7163
7156 7164 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7157 7165
7158 7166 switch (va->va_type) {
7159 7167 case VCHR:
7160 7168 case VBLK:
7161 7169 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7162 7170 spec.specdata1 = getmajor(va->va_rdev);
7163 7171 spec.specdata2 = getminor(va->va_rdev);
7164 7172 specp = &spec;
7165 7173 break;
7166 7174
7167 7175 case VFIFO:
7168 7176 type = NF4FIFO;
7169 7177 break;
7170 7178 case VSOCK:
7171 7179 type = NF4SOCK;
7172 7180 break;
7173 7181
7174 7182 default:
7175 7183 return (EINVAL);
7176 7184 }
7177 7185
7178 7186 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7179 7187 if (error) {
7180 7188 return (error);
7181 7189 }
7182 7190
7183 7191 /*
7184 7192 * This might not be needed any more; special case to deal
7185 7193 * with problematic v2/v3 servers. Since create was unable
7186 7194 * to set group correctly, not sure what hope setattr has.
7187 7195 */
7188 7196 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7189 7197 va->va_mask = AT_GID;
7190 7198 (void) nfs4setattr(vp, va, 0, cr, NULL);
7191 7199 }
7192 7200
7193 7201 /*
7194 7202 * If vnode is a device create special vnode
7195 7203 */
7196 7204 if (ISVDEV(vp->v_type)) {
7197 7205 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7198 7206 VN_RELE(vp);
7199 7207 } else {
7200 7208 *vpp = vp;
7201 7209 }
7202 7210 return (error);
7203 7211 }
7204 7212
/*
 * nfs4_remove: VOP_REMOVE entry point for the NFSv4 client.
 *
 * Remove requires that the current fh be the target directory.
 * After the operation, the current fh is unchanged.
 * The compound op structure is:
 *      PUTFH(targetdir), REMOVE, GETATTR(targetdir)
 *
 * Weirdness: if the vnode to be removed is open
 * we rename it instead of removing it and nfs_inactive
 * will remove the new name.
 *
 * Returns 0 on success, otherwise an errno (EPERM for cross-zone
 * access, EINTR if the directory rwlock wait was interrupted, EISDIR
 * for directories — rmdir must be used instead).
 */
/* ARGSUSED */
static int
nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	char *tmpname;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	rnode4_t *drp;
	int needrecov = 0;
	nfs4_recov_state_t recov_state;
	int isopen;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}

	if (vp->v_type == VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EISDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely the only entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	rp = VTOR4(vp);

	/*
	 * For regular file types, check to see if the file is open by looking
	 * at the open streams.
	 * For all other types, check the reference count on the vnode.  Since
	 * they are not opened OTW they never have an open stream.
	 *
	 * If the file is open, rename it to .nfsXXXX.
	 */
	if (vp->v_type != VREG) {
		/*
		 * If the file has a v_count > 1 then there may be more than
		 * one entry in the name cache due to multiple links or an
		 * open file, but we don't have the real reference count so
		 * flush all possible entries.
		 */
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);

		/*
		 * Now we have the real reference count.
		 */
		isopen = vp->v_count > 1;
	} else {
		mutex_enter(&rp->r_os_lock);
		isopen = list_head(&rp->r_open_streams) != NULL;
		mutex_exit(&rp->r_os_lock);
	}

	mutex_enter(&rp->r_statelock);
	if (isopen &&
	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
		/*
		 * The file is open (or is the pending-unlink name itself):
		 * rename it to a .nfsXXXX temporary and record the unlink
		 * state on the rnode so nfs_inactive can finish the remove.
		 */
		mutex_exit(&rp->r_statelock);
		tmpname = newname();
		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
		if (e.error)
			kmem_free(tmpname, MAXNAMELEN);
		else {
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(dvp);
				rp->r_unldvp = dvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	/*
	 * Actually remove the file/dir
	 */
	mutex_exit(&rp->r_statelock);

	/*
	 * We need to flush any dirty pages which happen to
	 * be hanging around before removing the file.
	 * This shouldn't happen very often since in NFSv4
	 * we should be close to open consistent.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	mi = VTOMI4(dvp);

	/* return any delegation before removing the file */
	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	/*
	 * Remove ops: putfh dir; remove; getattr dir
	 */
	args.ctag = TAG_REMOVE;
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(vp);
		return (e.error);
	}

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr dir */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;
	dinfo.di_time_call = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error)
		PURGE_ATTRCACHE4(dvp);

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
		    NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			goto recov_retry;
		}
	}

	/*
	 * Matching nfs4_end_op() for start_op() above.
	 * There is a path in the code below which calls
	 * nfs4_purge_stale_fh(), which may generate otw calls through
	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
	 * here to avoid nfs4_start_op() deadlock.
	 */
	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	if (!e.error) {
		resp = &res;

		if (res.status) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_purge_stale_fh(e.error, dvp, cr);
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			dinfo.di_garp =
			    &res.array[2].nfs_resop4_u.opgetattr.ga_res;
			dinfo.di_cred = cr;

			/* Update directory attr, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    &dinfo);
		}
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_remove(tvp, dvp, nm, ct);
	}
	VN_RELE(vp);
	return (e.error);
}
7441 7449
/*
 * Link requires that the current fh be the target directory and the
 * saved fh be the source fh. After the operation, the current fh is unchanged.
 * Thus the compound op structure is:
 *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
 *	GETATTR(file)
 *
 * VOP_LINK entry point for NFSv4: create the hard link tdvp/tnm to the
 * existing file svp.  Returns 0 or an errno: EPERM for cross-zone access,
 * EOPNOTSUPP if the server does not support LINK (MI4_LINK clear), EINTR
 * if taking the target directory's r_rwlock was interrupted.
 */
/* ARGSUSED */
static int
nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	LINK4res *ln_res;
	int argoplist_size  = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	vnode_t *realvp, *nvp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *tdrp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo;

	ASSERT(*tnm != '\0');
	ASSERT(tdvp->v_type == VDIR);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));

	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
		return (EPERM);
	/* If svp is a shadow/cover vnode, operate on the real one. */
	if (VOP_REALVP(svp, &realvp, ct) == 0) {
		svp = realvp;
		ASSERT(nfs4_consistent_type(svp));
	}

	tdrp = VTOR4(tdvp);
	mi = VTOMI4(svp);

	/* Server previously reported no LINK support; fail fast. */
	if (!(mi->mi_flags & MI4_LINK)) {
		return (EOPNOTSUPP);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Serialize with other operations on the target directory. */
	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
		return (EINTR);

	/*
	 * The whole compound is rebuilt and resent from here whenever
	 * nfs4_start_recovery() indicates the error is recoverable.
	 */
recov_retry:
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_LINK;

	/*
	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
	 * restorefh; getattr(fl)
	 */
	args.array_len = 7;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		nfs_rw_exit(&tdrp->r_rwlock);
		return (e.error);
	}

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;

	/* 1. save current fh to free up the space for the dir */
	argop[1].argop = OP_SAVEFH;

	/* 2. putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;

	/* 3. link: current_fh is targetdir, saved_fh is source */
	argop[3].argop = OP_CLINK;
	argop[3].nfs_argop4_u.opclink.cnewname = tnm;

	/* 4. Get attributes of dir */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	/* 5. If link was successful, restore current vp to file */
	argop[5].argop = OP_RESTOREFH;

	/* 6. Get attributes of linked object */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = mi;

	/* Drop any stale name-cache entry for the new name. */
	dnlc_remove(tdvp, tnm);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
	if (e.error != 0 && !needrecov) {
		/* Hard RPC failure: attributes may be stale on both ends. */
		PURGE_ATTRCACHE4(tdvp);
		PURGE_ATTRCACHE4(svp);
		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
		    NULL, NULL, OP_LINK, NULL, NULL, NULL);
		if (abort == FALSE) {
			/* Recovery initiated; free and resend the compound. */
			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
			    needrecov);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		} else {
			if (e.error != 0) {
				PURGE_ATTRCACHE4(tdvp);
				PURGE_ATTRCACHE4(svp);
				nfs4_end_op(VTOMI4(svp), svp, tdvp,
				    &recov_state, needrecov);
				goto out;
			}
			/* fall through for res.status case */
		}
	}

	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);

	resp = &res;
	if (res.status) {
		/* If link succeeded, then don't return error */
		e.error = geterrno4(res.status);
		if (res.array_len <= 4) {
			/*
			 * Either Putfh, Savefh, Putfh dir, or Link failed
			 */
			PURGE_ATTRCACHE4(svp);
			PURGE_ATTRCACHE4(tdvp);
			if (e.error == EOPNOTSUPP) {
				/*
				 * Server can't do LINK; clear MI4_LINK so we
				 * don't try again on this mount.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_flags &= ~MI4_LINK;
				mutex_exit(&mi->mi_lock);
			}
			/* Remap EISDIR to EPERM for non-root user for SVVS */
			/* XXX-LP */
			if (e.error == EISDIR && crgetuid(cr) != 0)
				e.error = EPERM;
			goto out;
		}
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX - if LINK succeeded, but no attrs were returned for link
	 * file, purge its cache.
	 *
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	/*
	 * update cache with link file postattrs.
	 * Note: at this point resop points to link res.
	 */
	resop = &res.array[3];	/* link res */
	ln_res = &resop->nfs_resop4_u.oplink;
	if (res.status == NFS4_OK)
		e.error = nfs4_update_attrcache(res.status,
		    &res.array[6].nfs_resop4_u.opgetattr.ga_res,
		    t, svp, cr);

	/*
	 * Call makenfs4node to create the new shadow vp for tnm.
	 * We pass NULL attrs because we just cached attrs for
	 * the src object. All we're trying to accomplish is to
	 * to create the new shadow vnode.
	 */
	nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
	    tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));

	/* Update target cache attribute, readdir and dnlc caches */
	dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
	dinfo.di_time_call = t;
	dinfo.di_cred = cr;

	nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));
	ASSERT(nfs4_consistent_type(nvp));
	VN_RELE(nvp);

	if (!e.error) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the source file of this link operation.
		 */
		trp = VTOR4(svp);
		tvp = svp;
		if (IS_SHADOW(svp, trp))
			tvp = RTOV4(trp);
		vnevent_link(tvp, ct);
	}
out:
	/* Common exit: free the op array, the reply, and the dir lock. */
	kmem_free(argop, argoplist_size);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	nfs_rw_exit(&tdrp->r_rwlock);

	return (e.error);
}
7670 7678
7671 7679 /* ARGSUSED */
7672 7680 static int
7673 7681 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7674 7682 caller_context_t *ct, int flags)
7675 7683 {
7676 7684 vnode_t *realvp;
7677 7685
7678 7686 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7679 7687 return (EPERM);
7680 7688 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7681 7689 ndvp = realvp;
7682 7690
7683 7691 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7684 7692 }
7685 7693
/*
 * nfs4rename does the real work of renaming in NFS Version 4.
 *
 * A file handle is considered volatile for renaming purposes if either
 * of the volatile bits are turned on. However, the compound may differ
 * based on the likelihood of the filehandle to change during rename.
 *
 * Both directory r_rwlocks are held for the duration; an existing,
 * active, non-directory target is first linked (or renamed) to a
 * temporary name so its data stays reachable if the rename fails
 * ("unlink-open-file" semantics).  Returns 0 or an errno.
 */
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp = NULL;		/* pre-existing target, if any */
	vnode_t *ovp = NULL;		/* source object being renamed */
	char *tmpname = NULL;		/* temporary name for active target */
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;		/* temp name was made via LINK */
	int do_link = 1;		/* cleared after NFS4ERR_FILE_OPEN */
	nfsstat4 stat = NFS4_OK;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Renaming "." or ".." is invalid, as is renaming onto them. */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Take both directories' r_rwlocks as writer, always in ascending
	 * rnode address order, so concurrent renames between the same pair
	 * of directories acquire them in a consistent order.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/* Regular file: active iff it has open streams. */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible. This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename.  Some servers can
			 * not Rename over an Open file, so they return
			 * this error.  The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr,
				    NULL, 0);
			}
			if (error == EOPNOTSUPP || !do_link) {
				/* No LINK support: rename target aside. */
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the temporary name on the target's rnode so
			 * the file is removed when its last reference goes.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself?  The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave.  A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration.  The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi))
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	else
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);

	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	/*
	 * Server refused to rename over the open target even after our
	 * LINK aside: undo the link bookkeeping and retry using the
	 * two-rename strategy (do_link = 0).
	 */
	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp.  After the nfs4_link
		 * call we call VN_RELE to match that hold.  We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);

		/* Undo the unlinked file naming stuff we just did */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlanme points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		if (nvp) {
			VN_RELE(nvp);
		}
		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Notify the rename vnevents to source vnode, and to the target
	 * vnode if it already existed.
	 */
	if (error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the vnode.  Each links is represented by
		 * a different vnode, in nfsv4.
		 */
		if (nvp) {
			trp = VTOR4(nvp);
			tvp = nvp;
			if (IS_SHADOW(nvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest(tvp, ndvp, nnm, ct);
		}

		/*
		 * if the source and destination directory are not the
		 * same notify the destination directory.
		 */
		if (VTOR4(odvp) != VTOR4(ndvp)) {
			trp = VTOR4(ndvp);
			tvp = ndvp;
			if (IS_SHADOW(ndvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest_dir(tvp, ct);
		}

		trp = VTOR4(ovp);
		tvp = ovp;
		if (IS_SHADOW(ovp, trp))
			tvp = RTOV4(trp);
		vnevent_rename_src(tvp, odvp, onm, ct);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
8105 8113
8106 8114 /*
8107 8115 * When the parent directory has changed, sv_dfh must be updated
8108 8116 */
8109 8117 static void
8110 8118 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8111 8119 {
8112 8120 svnode_t *sv = VTOSV(vp);
8113 8121 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8114 8122 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8115 8123
8116 8124 sfh4_hold(new_dfh);
8117 8125 sv->sv_dfh = new_dfh;
8118 8126 sfh4_rele(&old_dfh);
8119 8127 }
8120 8128
/*
 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
 * when it is known that the filehandle is persistent through rename.
 *
 * Rename requires that the current fh be the target directory and the
 * saved fh be the source directory. After the operation, the current fh
 * is unchanged.
 * The compound op structure for persistent fh rename is:
 *	PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME
 * Rather than bother with the directory postop args, we'll simply
 * update that a change occurred in the cache, so no post-op getattrs.
 *
 * On return, *statp holds the NFSv4 status of the compound (NFS4_OK when
 * the RPC itself failed); the function returns 0 or an errno value.
 */
static int
nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
    vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int doqueue, argoplist_size;
	mntinfo4_t *mi;
	rnode4_t *odrp = VTOR4(odvp);
	rnode4_t *ndrp = VTOR4(ndvp);
	RENAME4res *rn_res;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
	 *
	 * If source/target are different dirs, then append putfh(src); getattr
	 */
	args.array_len = (odvp == ndvp) ? 5 : 7;
	argoplist_size = args.array_len * sizeof (nfs_argop4);
	args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* Compound is rebuilt and resent from here after recovery. */
recov_retry:
	*statp = NFS4_OK;

	/* No need to Lookup the file, persistent fh */
	args.ctag = TAG_RENAME;

	mi = VTOMI4(odvp);
	e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	/* 0: putfh source directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

	/* 1: Save source fh to free up current for target */
	argop[1].argop = OP_SAVEFH;

	/* 2: putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;

	/* 3: current_fh is targetdir, saved_fh is sourcedir */
	argop[3].argop = OP_CRENAME;
	argop[3].nfs_argop4_u.opcrename.coldname = onm;
	argop[3].nfs_argop4_u.opcrename.cnewname = nnm;

	/* 4: getattr (targetdir) */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	if (ndvp != odvp) {

		/* 5: putfh (sourcedir) */
		/*
		 * NOTE(review): the comment says sourcedir, but ndrp (the
		 * target directory) filehandle is used here, so getattr #6
		 * fetches the target dir again while its result is applied
		 * to odvp below.  Looks like this should be odrp->r_fh --
		 * confirm against protocol intent before changing.
		 */
		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;

		/* 6: getattr (sourcedir) */
		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = mi;
	}

	/* Invalidate name-cache entries for both names before the call. */
	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	doqueue = 1;
	dinfo.di_time_call = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
	} else {
		*statp = res.status;
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
		    OP_RENAME, NULL, NULL, NULL) == FALSE) {
			nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;
		/*
		 * as long as OP_RENAME
		 */
		if (res.status != NFS4_OK && res.array_len <= 4) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(odvp);
			PURGE_ATTRCACHE4(ndvp);
			/*
			 * System V defines rename to return EEXIST, not
			 * ENOTEMPTY if the target directory is not empty.
			 * Over the wire, the error is NFSERR_ENOTEMPTY
			 * which geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {

			resop = &res.array[3];	/* rename res */
			rn_res = &resop->nfs_resop4_u.oprename;

			if (res.status == NFS4_OK) {
				/*
				 * Update target attribute, readdir and dnlc
				 * caches.
				 */
				dinfo.di_garp =
				    &res.array[4].nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			nfs4_update_dircaches(&rn_res->target_cinfo,
			    ndvp, NULL, NULL, dinfop);

			/*
			 * Update source attribute, readdir and dnlc caches
			 *
			 */
			if (ndvp != odvp) {
				update_parentdir_sfh(renvp, ndvp);

				if (dinfop)
					dinfo.di_garp =
					    &(res.array[6].nfs_resop4_u.
					    opgetattr.ga_res);

				nfs4_update_dircaches(&rn_res->source_cinfo,
				    odvp, NULL, NULL, dinfop);
			}

			/* Move the renamed object's fname to its new name. */
			fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
			    nnm);
		}
	}

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
	kmem_free(argop, argoplist_size);

	return (e.error);
}
8302 8310
8303 8311 /*
8304 8312 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8305 8313 * it is possible for the filehandle to change due to the rename.
8306 8314 *
8307 8315 * The compound req in this case includes a post-rename lookup and getattr
8308 8316 * to ensure that we have the correct fh and attributes for the object.
8309 8317 *
8310 8318 * Rename requires that the current fh be the target directory and the
8311 8319 * saved fh be the source directory. After the operation, the current fh
8312 8320 * is unchanged.
8313 8321 *
8314 8322 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8315 8323 * update the filehandle for the renamed object. We also get the old
8316 8324 * filehandle for historical reasons; this should be taken out sometime.
8317 8325 * This results in a rather cumbersome compound...
8318 8326 *
8319 8327 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8320 8328 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8321 8329 *
8322 8330 */
8323 8331 static int
8324 8332 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8325 8333 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8326 8334 {
8327 8335 COMPOUND4args_clnt args;
8328 8336 COMPOUND4res_clnt res, *resp = NULL;
8329 8337 int argoplist_size;
8330 8338 nfs_argop4 *argop;
8331 8339 nfs_resop4 *resop;
8332 8340 int doqueue;
8333 8341 mntinfo4_t *mi;
8334 8342 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8335 8343 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8336 8344 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8337 8345 RENAME4res *rn_res;
8338 8346 GETFH4res *ngf_res;
8339 8347 bool_t needrecov;
8340 8348 nfs4_recov_state_t recov_state;
8341 8349 hrtime_t t;
8342 8350 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8343 8351 dirattr_info_t dinfo, *dinfop = &dinfo;
8344 8352
8345 8353 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8346 8354
8347 8355 recov_state.rs_flags = 0;
8348 8356 recov_state.rs_num_retry_despite_err = 0;
8349 8357
8350 8358 recov_retry:
8351 8359 *statp = NFS4_OK;
8352 8360
8353 8361 /*
8354 8362 * There is a window between the RPC and updating the path and
8355 8363 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8356 8364 * code, so that it doesn't try to use the old path during that
8357 8365 * window.
8358 8366 */
8359 8367 mutex_enter(&orp->r_statelock);
8360 8368 while (orp->r_flags & R4RECEXPFH) {
8361 8369 klwp_t *lwp = ttolwp(curthread);
8362 8370
8363 8371 if (lwp != NULL)
8364 8372 lwp->lwp_nostop++;
8365 8373 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8366 8374 mutex_exit(&orp->r_statelock);
8367 8375 if (lwp != NULL)
8368 8376 lwp->lwp_nostop--;
8369 8377 return (EINTR);
8370 8378 }
8371 8379 if (lwp != NULL)
8372 8380 lwp->lwp_nostop--;
8373 8381 }
8374 8382 orp->r_flags |= R4RECEXPFH;
8375 8383 mutex_exit(&orp->r_statelock);
8376 8384
8377 8385 mi = VTOMI4(odvp);
8378 8386
8379 8387 args.ctag = TAG_RENAME_VFH;
8380 8388 args.array_len = (odvp == ndvp) ? 10 : 12;
8381 8389 argoplist_size = args.array_len * sizeof (nfs_argop4);
8382 8390 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8383 8391
8384 8392 /*
8385 8393 * Rename ops:
8386 8394 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8387 8395 * PUTFH(targetdir), RENAME, GETATTR(targetdir)
8388 8396 * LOOKUP(trgt), GETFH(new), GETATTR,
8389 8397 *
8390 8398 * if (odvp != ndvp)
8391 8399 * add putfh(sourcedir), getattr(sourcedir) }
8392 8400 */
8393 8401 args.array = argop;
8394 8402
8395 8403 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8396 8404 &recov_state, NULL);
8397 8405 if (e.error) {
8398 8406 kmem_free(argop, argoplist_size);
8399 8407 mutex_enter(&orp->r_statelock);
8400 8408 orp->r_flags &= ~R4RECEXPFH;
8401 8409 cv_broadcast(&orp->r_cv);
8402 8410 mutex_exit(&orp->r_statelock);
8403 8411 return (e.error);
8404 8412 }
8405 8413
8406 8414 /* 0: putfh source directory */
8407 8415 argop[0].argop = OP_CPUTFH;
8408 8416 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8409 8417
8410 8418 /* 1: Save source fh to free up current for target */
8411 8419 argop[1].argop = OP_SAVEFH;
8412 8420
8413 8421 /* 2: Lookup pre-rename fh of renamed object */
8414 8422 argop[2].argop = OP_CLOOKUP;
8415 8423 argop[2].nfs_argop4_u.opclookup.cname = onm;
8416 8424
8417 8425 /* 3: getfh fh of renamed object (before rename) */
8418 8426 argop[3].argop = OP_GETFH;
8419 8427
8420 8428 /* 4: putfh targetdir */
8421 8429 argop[4].argop = OP_CPUTFH;
8422 8430 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8423 8431
8424 8432 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8425 8433 argop[5].argop = OP_CRENAME;
8426 8434 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8427 8435 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8428 8436
8429 8437 /* 6: getattr of target dir (post op attrs) */
8430 8438 argop[6].argop = OP_GETATTR;
8431 8439 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8432 8440 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8433 8441
8434 8442 /* 7: Lookup post-rename fh of renamed object */
8435 8443 argop[7].argop = OP_CLOOKUP;
8436 8444 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8437 8445
8438 8446 /* 8: getfh fh of renamed object (after rename) */
8439 8447 argop[8].argop = OP_GETFH;
8440 8448
8441 8449 /* 9: getattr of renamed object */
8442 8450 argop[9].argop = OP_GETATTR;
8443 8451 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8444 8452 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8445 8453
8446 8454 /*
8447 8455 * If source/target dirs are different, then get new post-op
8448 8456 * attrs for source dir also.
8449 8457 */
8450 8458 if (ndvp != odvp) {
8451 8459 /* 10: putfh (sourcedir) */
8452 8460 argop[10].argop = OP_CPUTFH;
8453 8461 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8454 8462
8455 8463 /* 11: getattr (sourcedir) */
8456 8464 argop[11].argop = OP_GETATTR;
8457 8465 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8458 8466 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8459 8467 }
8460 8468
8461 8469 dnlc_remove(odvp, onm);
8462 8470 dnlc_remove(ndvp, nnm);
8463 8471
8464 8472 doqueue = 1;
8465 8473 t = gethrtime();
8466 8474 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8467 8475
8468 8476 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8469 8477 if (e.error) {
8470 8478 PURGE_ATTRCACHE4(odvp);
8471 8479 PURGE_ATTRCACHE4(ndvp);
8472 8480 if (!needrecov) {
8473 8481 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8474 8482 &recov_state, needrecov);
8475 8483 goto out;
8476 8484 }
8477 8485 } else {
8478 8486 *statp = res.status;
8479 8487 }
8480 8488
8481 8489 if (needrecov) {
8482 8490 bool_t abort;
8483 8491
8484 8492 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8485 8493 OP_RENAME, NULL, NULL, NULL);
8486 8494 if (abort == FALSE) {
8487 8495 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8488 8496 &recov_state, needrecov);
8489 8497 kmem_free(argop, argoplist_size);
8490 8498 if (!e.error)
8491 8499 (void) xdr_free(xdr_COMPOUND4res_clnt,
8492 8500 (caddr_t)&res);
8493 8501 mutex_enter(&orp->r_statelock);
8494 8502 orp->r_flags &= ~R4RECEXPFH;
8495 8503 cv_broadcast(&orp->r_cv);
8496 8504 mutex_exit(&orp->r_statelock);
8497 8505 goto recov_retry;
8498 8506 } else {
8499 8507 if (e.error != 0) {
8500 8508 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8501 8509 &recov_state, needrecov);
8502 8510 goto out;
8503 8511 }
8504 8512 /* fall through for res.status case */
8505 8513 }
8506 8514 }
8507 8515
8508 8516 resp = &res;
8509 8517 /*
8510 8518 * If OP_RENAME (or any prev op) failed, then return an error.
8511 8519 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8512 8520 */
8513 8521 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8514 8522 /*
8515 8523 * Error in an op other than last Getattr
8516 8524 */
8517 8525 e.error = geterrno4(res.status);
8518 8526 PURGE_ATTRCACHE4(odvp);
8519 8527 PURGE_ATTRCACHE4(ndvp);
8520 8528 /*
8521 8529 * System V defines rename to return EEXIST, not
8522 8530 * ENOTEMPTY if the target directory is not empty.
8523 8531 * Over the wire, the error is NFSERR_ENOTEMPTY
8524 8532 * which geterrno4 maps to ENOTEMPTY.
8525 8533 */
8526 8534 if (e.error == ENOTEMPTY)
8527 8535 e.error = EEXIST;
8528 8536 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8529 8537 needrecov);
8530 8538 goto out;
8531 8539 }
8532 8540
8533 8541 /* rename results */
8534 8542 rn_res = &res.array[5].nfs_resop4_u.oprename;
8535 8543
8536 8544 if (res.status == NFS4_OK) {
8537 8545 /* Update target attribute, readdir and dnlc caches */
8538 8546 dinfo.di_garp =
8539 8547 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8540 8548 dinfo.di_cred = cr;
8541 8549 dinfo.di_time_call = t;
8542 8550 } else
8543 8551 dinfop = NULL;
8544 8552
8545 8553 /* Update source cache attribute, readdir and dnlc caches */
8546 8554 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8547 8555
8548 8556 /* Update source cache attribute, readdir and dnlc caches */
8549 8557 if (ndvp != odvp) {
8550 8558 update_parentdir_sfh(ovp, ndvp);
8551 8559
8552 8560 /*
8553 8561 * If dinfop is non-NULL, then compound succeded, so
8554 8562 * set di_garp to attrs for source dir. dinfop is only
8555 8563 * set to NULL when compound fails.
8556 8564 */
8557 8565 if (dinfop)
8558 8566 dinfo.di_garp =
8559 8567 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8560 8568 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8561 8569 dinfop);
8562 8570 }
8563 8571
8564 8572 /*
8565 8573 * Update the rnode with the new component name and args,
8566 8574 * and if the file handle changed, also update it with the new fh.
8567 8575 * This is only necessary if the target object has an rnode
8568 8576 * entry and there is no need to create one for it.
8569 8577 */
8570 8578 resop = &res.array[8]; /* getfh new res */
8571 8579 ngf_res = &resop->nfs_resop4_u.opgetfh;
8572 8580
8573 8581 /*
8574 8582 * Update the path and filehandle for the renamed object.
8575 8583 */
8576 8584 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8577 8585
8578 8586 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8579 8587
8580 8588 if (res.status == NFS4_OK) {
8581 8589 resop++; /* getattr res */
8582 8590 e.error = nfs4_update_attrcache(res.status,
8583 8591 &resop->nfs_resop4_u.opgetattr.ga_res,
8584 8592 t, ovp, cr);
8585 8593 }
8586 8594
8587 8595 out:
8588 8596 kmem_free(argop, argoplist_size);
8589 8597 if (resp)
8590 8598 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8591 8599 mutex_enter(&orp->r_statelock);
8592 8600 orp->r_flags &= ~R4RECEXPFH;
8593 8601 cv_broadcast(&orp->r_cv);
8594 8602 mutex_exit(&orp->r_statelock);
8595 8603
8596 8604 return (e.error);
8597 8605 }
8598 8606
8599 8607 /* ARGSUSED */
8600 8608 static int
8601 8609 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8602 8610 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8603 8611 {
8604 8612 int error;
8605 8613 vnode_t *vp;
8606 8614
8607 8615 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8608 8616 return (EPERM);
8609 8617 /*
8610 8618 * As ".." has special meaning and rather than send a mkdir
8611 8619 * over the wire to just let the server freak out, we just
8612 8620 * short circuit it here and return EEXIST
8613 8621 */
8614 8622 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8615 8623 return (EEXIST);
8616 8624
8617 8625 /*
8618 8626 * Decision to get the right gid and setgid bit of the
8619 8627 * new directory is now made in call_nfs4_create_req.
8620 8628 */
8621 8629 va->va_mask |= AT_MODE;
8622 8630 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8623 8631 if (error)
8624 8632 return (error);
8625 8633
8626 8634 *vpp = vp;
8627 8635 return (0);
8628 8636 }
8629 8637
8630 8638
8631 8639 /*
8632 8640 * rmdir is using the same remove v4 op as does remove.
8633 8641 * Remove requires that the current fh be the target directory.
8634 8642 * After the operation, the current fh is unchanged.
8635 8643 * The compound op structure is:
 *	PUTFH(targetdir), REMOVE, GETATTR(targetdir)
8637 8645 */
/*ARGSUSED4*/
static int
nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int need_end_op = FALSE;	/* whether nfs4_end_op is still owed */
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;			/* the directory being removed */
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * As ".." has special meaning and rather than send a rmdir
	 * over the wire to just let the server freak out, we just
	 * short circuit it here and return EEXIST
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	/* serialize with other directory modifications */
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * Since nfsv4 remove op works on both files and directories,
	 * check that the removed object is indeed a directory.
	 */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (ENOTDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If there vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, trying removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = TAG_RMDIR;

	/*
	 * Rmdir ops: putfh dir; remove
	 */
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	need_end_op = TRUE;

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr (postop attrs for dir that contained removed dir) */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	dinfo.di_time_call = gethrtime();
	doqueue = 1;
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	/* the removed directory's cached attrs are stale either way */
	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(dvp);
	}

	if (needrecov) {
		/*
		 * If recovery was started (not aborted), free this
		 * attempt's results and redrive the compound.
		 */
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
		    NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return error if first 2 ops (OP_REMOVE or earlier)
		 * failed.
		 */
		if (res.status != NFS4_OK && res.array_len <= 2) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
			    &recov_state, needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			/*
			 * System V defines rmdir to return EEXIST, not
			 * ENOTEMPTY if the directory is not empty.  Over
			 * the wire, the error is NFSERR_ENOTEMPTY which
			 * geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			if (res.status == NFS4_OK) {
				resop = &res.array[2];	/* dir attrs */
				dinfo.di_garp =
				    &resop->nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			/* Update dir attribute, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    dinfop);

			/* destroy rddir cache for dir that was removed */
			if (VTOR4(vp)->r_dir != NULL)
				nfs4_purge_rddir_cache(vp);
		}
	}

	if (need_end_op)
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	nfs_rw_exit(&drp->r_rwlock);

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	/* on success, deliver the rmdir vnode event to any watchers */
	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_rmdir(tvp, dvp, nm, ct);
	}

	VN_RELE(vp);

	return (e.error);
}
8842 8850
8843 8851 /* ARGSUSED */
8844 8852 static int
8845 8853 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8846 8854 caller_context_t *ct, int flags)
8847 8855 {
8848 8856 int error;
8849 8857 vnode_t *vp;
8850 8858 rnode4_t *rp;
8851 8859 char *contents;
8852 8860 mntinfo4_t *mi = VTOMI4(dvp);
8853 8861
8854 8862 if (nfs_zone() != mi->mi_zone)
8855 8863 return (EPERM);
8856 8864 if (!(mi->mi_flags & MI4_SYMLINK))
8857 8865 return (EOPNOTSUPP);
8858 8866
8859 8867 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8860 8868 if (error)
8861 8869 return (error);
8862 8870
8863 8871 ASSERT(nfs4_consistent_type(vp));
8864 8872 rp = VTOR4(vp);
8865 8873 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8866 8874
8867 8875 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8868 8876
8869 8877 if (contents != NULL) {
8870 8878 mutex_enter(&rp->r_statelock);
8871 8879 if (rp->r_symlink.contents == NULL) {
8872 8880 rp->r_symlink.len = strlen(tnm);
8873 8881 bcopy(tnm, contents, rp->r_symlink.len);
8874 8882 rp->r_symlink.contents = contents;
8875 8883 rp->r_symlink.size = MAXPATHLEN;
8876 8884 mutex_exit(&rp->r_statelock);
8877 8885 } else {
8878 8886 mutex_exit(&rp->r_statelock);
8879 8887 kmem_free((void *)contents, MAXPATHLEN);
8880 8888 }
8881 8889 }
8882 8890 }
8883 8891 VN_RELE(vp);
8884 8892
8885 8893 return (error);
8886 8894 }
8887 8895
8888 8896
8889 8897 /*
8890 8898 * Read directory entries.
8891 8899 * There are some weird things to look out for here. The uio_loffset
8892 8900 * field is either 0 or it is the offset returned from a previous
8893 8901 * readdir. It is an opaque value used by the server to find the
8894 8902 * correct directory block to read. The count field is the number
8895 8903 * of blocks to read on the server. This is advisory only, the server
8896 8904 * may return only one block's worth of entries. Entries may be compressed
8897 8905 * on the server.
8898 8906 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;		/* bytes requested, capped at MAXBSIZE */
	rnode4_t *rp;
	rddir4_cache *rdc;	/* cache entry for this read */
	rddir4_cache *rrdc;	/* cache entry for the readahead */

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	/* the caller must already hold r_rwlock as reader */
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 * (r_statelock is held across the lookup.)
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 * RDDIRREQ means "needs filling"; RDDIR marks it in progress.
	 */
	if (rdc->flags & RDDIRREQ) {
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir (over the wire), without holding
		 * r_statelock.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		/* re-arm so a later caller retries the fill */
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead.  Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.  Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead.  In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir asynchronously; do_nfs4readdir completes
		 * the entry.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}
9093 9101
9094 9102 static int
9095 9103 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9096 9104 {
9097 9105 int error;
9098 9106 rnode4_t *rp;
9099 9107
9100 9108 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9101 9109
9102 9110 rp = VTOR4(vp);
9103 9111
9104 9112 /*
9105 9113 * Obtain the readdir results for the caller.
9106 9114 */
9107 9115 nfs4readdir(vp, rdc, cr);
9108 9116
9109 9117 mutex_enter(&rp->r_statelock);
9110 9118 /*
9111 9119 * The entry is now complete
9112 9120 */
9113 9121 rdc->flags &= ~RDDIR;
9114 9122
9115 9123 error = rdc->error;
9116 9124 if (error)
9117 9125 rdc->flags |= RDDIRREQ;
9118 9126 rddir4_cache_rele(rp, rdc);
9119 9127 mutex_exit(&rp->r_statelock);
9120 9128
9121 9129 return (error);
9122 9130 }
9123 9131
9124 9132 /*
9125 9133 * Read directory entries.
9126 9134 * There are some weird things to look out for here. The uio_loffset
9127 9135 * field is either 0 or it is the offset returned from a previous
9128 9136 * readdir. It is an opaque value used by the server to find the
9129 9137 * correct directory block to read. The count field is the number
9130 9138 * of blocks to read on the server. This is advisory only, the server
9131 9139 * may return only one block's worth of entries. Entries may be compressed
9132 9140 * on the server.
9133 9141 *
9134 9142 * Generates the following compound request:
9135 9143 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9136 9144 * must include a Lookupp as well. In this case, send:
9137 9145 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9138 9146 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9139 9147 *
9140 9148 * Get complete attributes and filehandles for entries if this is the
9141 9149 * first read of the directory. Otherwise, just get fileid's.
9142 9150 */
9143 9151 static void
9144 9152 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9145 9153 {
9146 9154 COMPOUND4args_clnt args;
9147 9155 COMPOUND4res_clnt res;
9148 9156 READDIR4args *rargs;
9149 9157 READDIR4res_clnt *rd_res;
9150 9158 bitmap4 rd_bitsval;
9151 9159 nfs_argop4 argop[5];
9152 9160 nfs_resop4 *resop;
9153 9161 rnode4_t *rp = VTOR4(vp);
9154 9162 mntinfo4_t *mi = VTOMI4(vp);
9155 9163 int doqueue;
9156 9164 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
9157 9165 vnode_t *dvp;
9158 9166 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9159 9167 int num_ops, res_opcnt;
9160 9168 bool_t needrecov = FALSE;
9161 9169 nfs4_recov_state_t recov_state;
9162 9170 hrtime_t t;
9163 9171 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9164 9172
9165 9173 ASSERT(nfs_zone() == mi->mi_zone);
9166 9174 ASSERT(rdc->flags & RDDIR);
9167 9175 ASSERT(rdc->entries == NULL);
9168 9176
9169 9177 /*
9170 9178 * If rp were a stub, it should have triggered and caused
9171 9179 * a mount for us to get this far.
9172 9180 */
9173 9181 ASSERT(!RP_ISSTUB(rp));
9174 9182
9175 9183 num_ops = 2;
9176 9184 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9177 9185 /*
9178 9186 * Since nfsv4 readdir may not return entries for "." and "..",
9179 9187 * the client must recreate them:
9180 9188 * To find the correct nodeid, do the following:
9181 9189 * For current node, get nodeid from dnlc.
9182 9190 * - if current node is rootvp, set pnodeid to nodeid.
9183 9191 * - else if parent is in the dnlc, get its nodeid from there.
9184 9192 * - else add LOOKUPP+GETATTR to compound.
9185 9193 */
9186 9194 nodeid = rp->r_attr.va_nodeid;
9187 9195 if (vp->v_flag & VROOT) {
9188 9196 pnodeid = nodeid; /* root of mount point */
9189 9197 } else {
9190 9198 dvp = dnlc_lookup(vp, "..");
9191 9199 if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9192 9200 /* parent in dnlc cache - no need for otw */
9193 9201 pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9194 9202 } else {
9195 9203 /*
9196 9204 * parent not in dnlc cache,
9197 9205 * do lookupp to get its id
9198 9206 */
9199 9207 num_ops = 5;
9200 9208 pnodeid = 0; /* set later by getattr parent */
9201 9209 }
9202 9210 if (dvp)
9203 9211 VN_RELE(dvp);
9204 9212 }
9205 9213 }
9206 9214 recov_state.rs_flags = 0;
9207 9215 recov_state.rs_num_retry_despite_err = 0;
9208 9216
9209 9217 /* Save the original mount point security flavor */
9210 9218 (void) save_mnt_secinfo(mi->mi_curr_serv);
9211 9219
9212 9220 recov_retry:
9213 9221 args.ctag = TAG_READDIR;
9214 9222
9215 9223 args.array = argop;
9216 9224 args.array_len = num_ops;
9217 9225
9218 9226 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9219 9227 &recov_state, NULL)) {
9220 9228 /*
9221 9229 * If readdir a node that is a stub for a crossed mount point,
9222 9230 * keep the original secinfo flavor for the current file
9223 9231 * system, not the crossed one.
9224 9232 */
9225 9233 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9226 9234 rdc->error = e.error;
9227 9235 return;
9228 9236 }
9229 9237
9230 9238 /*
9231 9239 * Determine which attrs to request for dirents. This code
9232 9240 * must be protected by nfs4_start/end_fop because of r_server
9233 9241 * (which will change during failover recovery).
9234 9242 *
9235 9243 */
9236 9244 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9237 9245 /*
9238 9246 * Get all vattr attrs plus filehandle and rdattr_error
9239 9247 */
9240 9248 rd_bitsval = NFS4_VATTR_MASK |
9241 9249 FATTR4_RDATTR_ERROR_MASK |
9242 9250 FATTR4_FILEHANDLE_MASK;
9243 9251
9244 9252 if (rp->r_flags & R4READDIRWATTR) {
9245 9253 mutex_enter(&rp->r_statelock);
9246 9254 rp->r_flags &= ~R4READDIRWATTR;
9247 9255 mutex_exit(&rp->r_statelock);
9248 9256 }
9249 9257 } else {
9250 9258 servinfo4_t *svp = rp->r_server;
9251 9259
9252 9260 /*
9253 9261 * Already read directory. Use readdir with
9254 9262 * no attrs (except for mounted_on_fileid) for updates.
9255 9263 */
9256 9264 rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9257 9265
9258 9266 /*
9259 9267 * request mounted on fileid if supported, else request
9260 9268 * fileid. maybe we should verify that fileid is supported
9261 9269 * and request something else if not.
9262 9270 */
9263 9271 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9264 9272 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9265 9273 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9266 9274 nfs_rw_exit(&svp->sv_lock);
9267 9275 }
9268 9276
9269 9277 /* putfh directory fh */
9270 9278 argop[0].argop = OP_CPUTFH;
9271 9279 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9272 9280
9273 9281 argop[1].argop = OP_READDIR;
9274 9282 rargs = &argop[1].nfs_argop4_u.opreaddir;
9275 9283 /*
9276 9284 * 1 and 2 are reserved for client "." and ".." entry offset.
9277 9285 * cookie 0 should be used over-the-wire to start reading at
9278 9286 * the beginning of the directory excluding "." and "..".
9279 9287 */
9280 9288 if (rdc->nfs4_cookie == 0 ||
9281 9289 rdc->nfs4_cookie == 1 ||
9282 9290 rdc->nfs4_cookie == 2) {
9283 9291 rargs->cookie = (nfs_cookie4)0;
9284 9292 rargs->cookieverf = 0;
9285 9293 } else {
9286 9294 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9287 9295 mutex_enter(&rp->r_statelock);
9288 9296 rargs->cookieverf = rp->r_cookieverf4;
9289 9297 mutex_exit(&rp->r_statelock);
9290 9298 }
9291 9299 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9292 9300 rargs->maxcount = mi->mi_tsize;
9293 9301 rargs->attr_request = rd_bitsval;
9294 9302 rargs->rdc = rdc;
9295 9303 rargs->dvp = vp;
9296 9304 rargs->mi = mi;
9297 9305 rargs->cr = cr;
9298 9306
9299 9307
9300 9308 /*
9301 9309 * If count < than the minimum required, we return no entries
9302 9310 * and fail with EINVAL
9303 9311 */
9304 9312 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9305 9313 rdc->error = EINVAL;
9306 9314 goto out;
9307 9315 }
9308 9316
9309 9317 if (args.array_len == 5) {
9310 9318 /*
9311 9319 * Add lookupp and getattr for parent nodeid.
9312 9320 */
9313 9321 argop[2].argop = OP_LOOKUPP;
9314 9322
9315 9323 argop[3].argop = OP_GETFH;
9316 9324
9317 9325 /* getattr parent */
9318 9326 argop[4].argop = OP_GETATTR;
9319 9327 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9320 9328 argop[4].nfs_argop4_u.opgetattr.mi = mi;
9321 9329 }
9322 9330
9323 9331 doqueue = 1;
9324 9332
9325 9333 if (mi->mi_io_kstats) {
9326 9334 mutex_enter(&mi->mi_lock);
9327 9335 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9328 9336 mutex_exit(&mi->mi_lock);
9329 9337 }
9330 9338
9331 9339 /* capture the time of this call */
9332 9340 rargs->t = t = gethrtime();
9333 9341
9334 9342 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9335 9343
9336 9344 if (mi->mi_io_kstats) {
9337 9345 mutex_enter(&mi->mi_lock);
9338 9346 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9339 9347 mutex_exit(&mi->mi_lock);
9340 9348 }
9341 9349
9342 9350 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9343 9351
9344 9352 /*
9345 9353 * If RPC error occurred and it isn't an error that
9346 9354 * triggers recovery, then go ahead and fail now.
9347 9355 */
9348 9356 if (e.error != 0 && !needrecov) {
9349 9357 rdc->error = e.error;
9350 9358 goto out;
9351 9359 }
9352 9360
9353 9361 if (needrecov) {
9354 9362 bool_t abort;
9355 9363
9356 9364 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9357 9365 "nfs4readdir: initiating recovery.\n"));
9358 9366
9359 9367 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9360 9368 NULL, OP_READDIR, NULL, NULL, NULL);
9361 9369 if (abort == FALSE) {
9362 9370 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9363 9371 &recov_state, needrecov);
9364 9372 if (!e.error)
9365 9373 (void) xdr_free(xdr_COMPOUND4res_clnt,
9366 9374 (caddr_t)&res);
9367 9375 if (rdc->entries != NULL) {
9368 9376 kmem_free(rdc->entries, rdc->entlen);
9369 9377 rdc->entries = NULL;
9370 9378 }
9371 9379 goto recov_retry;
9372 9380 }
9373 9381
9374 9382 if (e.error != 0) {
9375 9383 rdc->error = e.error;
9376 9384 goto out;
9377 9385 }
9378 9386
9379 9387 /* fall through for res.status case */
9380 9388 }
9381 9389
9382 9390 res_opcnt = res.array_len;
9383 9391
9384 9392 /*
9385 9393 * If compound failed first 2 ops (PUTFH+READDIR), then return
9386 9394 * failure here. Subsequent ops are for filling out dot-dot
9387 9395 * dirent, and if they fail, we still want to give the caller
9388 9396 * the dirents returned by (the successful) READDIR op, so we need
9389 9397 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9390 9398 *
9391 9399 * One example where PUTFH+READDIR ops would succeed but
9392 9400 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9393 9401 * but lacks x. In this case, a POSIX server's VOP_READDIR
9394 9402 * would succeed; however, VOP_LOOKUP(..) would fail since no
9395 9403 * x perm. We need to come up with a non-vendor-specific way
9396 9404 * for a POSIX server to return d_ino from dotdot's dirent if
9397 9405 * client only requests mounted_on_fileid, and just say the
9398 9406 * LOOKUPP succeeded and fill out the GETATTR. However, if
9399 9407 * client requested any mandatory attrs, server would be required
9400 9408 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9401 9409 * for dotdot.
9402 9410 */
9403 9411
9404 9412 if (res.status) {
9405 9413 if (res_opcnt <= 2) {
9406 9414 e.error = geterrno4(res.status);
9407 9415 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9408 9416 &recov_state, needrecov);
9409 9417 nfs4_purge_stale_fh(e.error, vp, cr);
9410 9418 rdc->error = e.error;
9411 9419 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9412 9420 if (rdc->entries != NULL) {
9413 9421 kmem_free(rdc->entries, rdc->entlen);
9414 9422 rdc->entries = NULL;
9415 9423 }
9416 9424 /*
9417 9425 * If readdir a node that is a stub for a
9418 9426 * crossed mount point, keep the original
9419 9427 * secinfo flavor for the current file system,
9420 9428 * not the crossed one.
9421 9429 */
9422 9430 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9423 9431 return;
9424 9432 }
9425 9433 }
9426 9434
9427 9435 resop = &res.array[1]; /* readdir res */
9428 9436 rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9429 9437
9430 9438 mutex_enter(&rp->r_statelock);
9431 9439 rp->r_cookieverf4 = rd_res->cookieverf;
9432 9440 mutex_exit(&rp->r_statelock);
9433 9441
9434 9442 /*
9435 9443 * For "." and ".." entries
9436 9444 * e.g.
9437 9445 * seek(cookie=0) -> "." entry with d_off = 1
9438 9446 * seek(cookie=1) -> ".." entry with d_off = 2
9439 9447 */
9440 9448 if (cookie == (nfs_cookie4) 0) {
9441 9449 if (rd_res->dotp)
9442 9450 rd_res->dotp->d_ino = nodeid;
9443 9451 if (rd_res->dotdotp)
9444 9452 rd_res->dotdotp->d_ino = pnodeid;
9445 9453 }
9446 9454 if (cookie == (nfs_cookie4) 1) {
9447 9455 if (rd_res->dotdotp)
9448 9456 rd_res->dotdotp->d_ino = pnodeid;
9449 9457 }
9450 9458
9451 9459
9452 9460 /* LOOKUPP+GETATTR attemped */
9453 9461 if (args.array_len == 5 && rd_res->dotdotp) {
9454 9462 if (res.status == NFS4_OK && res_opcnt == 5) {
9455 9463 nfs_fh4 *fhp;
9456 9464 nfs4_sharedfh_t *sfhp;
9457 9465 vnode_t *pvp;
9458 9466 nfs4_ga_res_t *garp;
9459 9467
9460 9468 resop++; /* lookupp */
9461 9469 resop++; /* getfh */
9462 9470 fhp = &resop->nfs_resop4_u.opgetfh.object;
9463 9471
9464 9472 resop++; /* getattr of parent */
9465 9473
9466 9474 /*
9467 9475 * First, take care of finishing the
9468 9476 * readdir results.
9469 9477 */
9470 9478 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9471 9479 /*
9472 9480 * The d_ino of .. must be the inode number
9473 9481 * of the mounted filesystem.
9474 9482 */
9475 9483 if (garp->n4g_va.va_mask & AT_NODEID)
9476 9484 rd_res->dotdotp->d_ino =
9477 9485 garp->n4g_va.va_nodeid;
9478 9486
9479 9487
9480 9488 /*
9481 9489 * Next, create the ".." dnlc entry
9482 9490 */
9483 9491 sfhp = sfh4_get(fhp, mi);
9484 9492 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9485 9493 dnlc_update(vp, "..", pvp);
9486 9494 VN_RELE(pvp);
9487 9495 }
9488 9496 sfh4_rele(&sfhp);
9489 9497 }
9490 9498 }
9491 9499
9492 9500 if (mi->mi_io_kstats) {
9493 9501 mutex_enter(&mi->mi_lock);
9494 9502 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9495 9503 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9496 9504 mutex_exit(&mi->mi_lock);
9497 9505 }
9498 9506
9499 9507 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9500 9508
9501 9509 out:
9502 9510 /*
9503 9511 * If readdir a node that is a stub for a crossed mount point,
9504 9512 * keep the original secinfo flavor for the current file system,
9505 9513 * not the crossed one.
9506 9514 */
9507 9515 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9508 9516
9509 9517 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9510 9518 }
9511 9519
9512 9520
/*
 * Perform the buffered I/O described by bp against the server: reads go
 * through nfs4read(), writes through nfs4write().  Over-the-wire
 * credentials come from nfs4_get_otw_cred_by_osp(); on EACCES the
 * operation is retried with the next candidate credential until
 * last_time is set.  Returns 0, NFS_EOF (read entirely past EOF), or an
 * errno; B_ERROR is set in bp->b_flags for anything but 0/NFS_EOF.
 */
static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
	read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				/* bzero(rdaddr + */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF.  Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			/* Try again with the next candidate credential. */
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			/*
			 * Clamp the write to the current file size; a
			 * negative count indicates corrupted state.
			 */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status.  Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations.  Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			/* rnode already marked stale; fail the write. */
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}
9662 9670
9663 9671 /* ARGSUSED */
9664 9672 int
9665 9673 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9666 9674 {
9667 9675 return (EREMOTE);
9668 9676 }
9669 9677
/*
 * Acquire rp->r_rwlock for a VOP caller.  Readers always take the lock
 * as RW_READER.  Writers normally take RW_WRITER, but for direct-I/O
 * files (per-rnode R4DIRECTIO or per-mount MI4_DIRECTIO) with no mmap
 * references and no cached pages, a reader lock suffices, allowing
 * concurrent writers.  The return value (V_WRITELOCK_TRUE/FALSE) tells
 * the VOP layer which mode was granted so nfs4_rwunlock pairs up.
 */
/* ARGSUSED2 */
int
nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	if (!write_lock) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		return (V_WRITELOCK_FALSE);
	}

	if ((rp->r_flags & R4DIRECTIO) ||
	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
		/*
		 * Check mapcnt/pages while holding the reader lock; if
		 * either is present, drop it and fall through to take
		 * the writer lock instead.
		 */
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
			return (V_WRITELOCK_FALSE);
		nfs_rw_exit(&rp->r_rwlock);
	}

	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
	return (V_WRITELOCK_TRUE);
}
9692 9700
9693 9701 /* ARGSUSED */
9694 9702 void
9695 9703 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9696 9704 {
9697 9705 rnode4_t *rp = VTOR4(vp);
9698 9706
9699 9707 nfs_rw_exit(&rp->r_rwlock);
9700 9708 }
9701 9709
9702 9710 /* ARGSUSED */
9703 9711 static int
9704 9712 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9705 9713 {
9706 9714 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9707 9715 return (EIO);
9708 9716
9709 9717 /*
9710 9718 * Because we stuff the readdir cookie into the offset field
9711 9719 * someone may attempt to do an lseek with the cookie which
9712 9720 * we want to succeed.
9713 9721 */
9714 9722 if (vp->v_type == VDIR)
9715 9723 return (0);
9716 9724 if (*noffp < 0)
9717 9725 return (EINVAL);
9718 9726 return (0);
9719 9727 }
9720 9728
9721 9729
9722 9730 /*
9723 9731 * Return all the pages from [off..off+len) in file
9724 9732 */
/*
 * VOP_GETPAGE entry point: validate caches, throttle S_CREATE callers
 * against the async-write backlog, reject requests beyond EOF (except
 * for segkmap-driven writes), then hand off to nfs4_getapage (single
 * page) or pvn_getpages (multi page).  NFS_EOF from below triggers a
 * cache purge and a retry; ESTALE purges the stale file handle.
 */
/* ARGSUSED */
static int
nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;
	int error;
	mntinfo4_t *mi;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);
	/* Page operations always work on the real vnode, not a shadow. */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (protp != NULL)
		*protp = PROT_ALL;

	/*
	 * Now validate that the caches are up to date.
	 */
	if (error = nfs4_validate_caches(vp, cr))
		return (error);

	mi = VTOMI4(vp);
retry:
	mutex_enter(&rp->r_statelock);

	/*
	 * Don't create dirty pages faster than they
	 * can be cleaned so that the system doesn't
	 * get imbalanced.  If the async queue is
	 * maxed out, then wait for it to drain before
	 * creating more dirty pages.  Also, wait for
	 * any threads doing pagewalks in the vop_getattr
	 * entry points so that they don't block for
	 * long periods.
	 */
	if (rw == S_CREATE) {
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
	}

	/*
	 * If we are getting called as a side effect of an nfs_write()
	 * operation the local file size might not be extended yet.
	 * In this case we want to be able to return pages of zeroes.
	 */
	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
		NFS4_DEBUG(nfs4_pageio_debug,
		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
		    "len=%llu, size=%llu, attrsize =%llu", off,
		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
		mutex_exit(&rp->r_statelock);
		return (EFAULT);	/* beyond EOF */
	}

	mutex_exit(&rp->r_statelock);

	if (len <= PAGESIZE) {
		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
		    seg, addr, rw, cr);
		NFS4_DEBUG(nfs4_pageio_debug && error,
		    (CE_NOTE, "getpage error %d; off=%lld, "
		    "len=%lld", error, off, (u_longlong_t)len));
	} else {
		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
		    pl, plsz, seg, addr, rw, cr);
		NFS4_DEBUG(nfs4_pageio_debug && error,
		    (CE_NOTE, "getpages error %d; off=%lld, "
		    "len=%lld", error, off, (u_longlong_t)len));
	}

	switch (error) {
	case NFS_EOF:
		/* Attributes may be stale; purge caches and retry. */
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
		goto retry;
	case ESTALE:
		nfs4_purge_stale_fh(error, vp, cr);
	}

	return (error);
}
9814 9822
9815 9823 /*
9816 9824 * Called from pvn_getpages or nfs4_getpage to get a particular page.
9817 9825 */
/*
 * Fetch the page at 'off', first queueing sequential readahead (window
 * computed from rp->r_nextr), then either reusing a cached page,
 * creating a zero page (S_CREATE), or klustering a read from the
 * server via nfs4_bio().  Returns 0 with pl[] filled (when pl != NULL),
 * or an errno; EFAULT for reads beyond EOF that are not segkmap writes.
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window; /* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

reread:
	/* Restart point when a cached page vanishes under us (see below). */
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or close.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			/* Drop the lock around the async dispatch. */
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			/* Caller wants no page list: async readahead only. */
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0.  This
			 * is correct since we want to do I/O on a page
			 * boundary.  bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here.  We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			/* Mark every page in the kluster as not committed. */
			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error.  We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

out:
	/* NOTE(review): no goto out is visible in this chunk; label may be vestigial. */
	if (pl == NULL)
		return (error);

	if (error) {
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs4_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}
10087 10095
/*
 * Async-thread worker that reads one block's worth of pages ahead of
 * the application, starting at blkoff.  Klusters pages, issues the read
 * through nfs4_bio(), marks the pages C_NOCOMMIT, and completes them
 * with pvn_read_done().  Called via nfs4_async_readahead().
 */
static void
nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
	cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode4_t *rp = VTOR4(vp);
	page_t *savepp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc). In all
	 * cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This is correct since
	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
	 * to calculate an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(io_off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)blkoff;
	bp_mapin(bp);

	/*
	 * If doing a write beyond what we believe is EOF, don't bother trying
	 * to read the pages from the server, we'll just zero the pages here.
	 * We don't check that the rw flag is S_WRITE here because some
	 * implementations may attempt a read access to the buffer before
	 * copying data.
	 */
	mutex_enter(&rp->r_statelock);
	if (io_off >= rp->r_size && seg == segkmap) {
		mutex_exit(&rp->r_statelock);
		bzero(bp->b_un.b_addr, io_len);
		error = 0;
	} else {
		mutex_exit(&rp->r_statelock);
		error = nfs4_bio(bp, NULL, cr, TRUE);
		/* EOF on readahead is not an error; pages are zeroed. */
		if (error == NFS_EOF)
			error = 0;
	}

	/*
	 * Unmap the buffer before freeing it.
	 */
	bp_mapout(bp);
	pageio_done(bp);

	/* Mark every page in the kluster as not committed. */
	savepp = pp;
	do {
		pp->p_fsdata = C_NOCOMMIT;
	} while ((pp = pp->p_next) != savepp);

	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);

	/*
	 * In case of error set readahead offset
	 * to the lowest offset.
	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
	 */
	if (error && rp->r_nextr > io_off) {
		/* Unlocked pre-check above; recheck under r_statelock. */
		mutex_enter(&rp->r_statelock);
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}
10196 10204
10197 10205 /*
10198 10206 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10199 10207 * If len == 0, do from off to EOF.
10200 10208 *
10201 10209 * The normal cases should be len == 0 && off == 0 (entire vp list) or
10202 10210 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10203 10211 * (from pageout).
10204 10212 */
10205 10213 /* ARGSUSED */
10206 10214 static int
10207 10215 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10208 10216 caller_context_t *ct)
10209 10217 {
10210 10218 int error;
10211 10219 rnode4_t *rp;
10212 10220
10213 10221 ASSERT(cr != NULL);
10214 10222
10215 10223 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10216 10224 return (EIO);
10217 10225
10218 10226 rp = VTOR4(vp);
10219 10227 if (IS_SHADOW(vp, rp))
10220 10228 vp = RTOV4(rp);
10221 10229
10222 10230 /*
10223 10231 * XXX - Why should this check be made here?
10224 10232 */
10225 10233 if (vp->v_flag & VNOMAP)
10226 10234 return (ENOSYS);
10227 10235
10228 10236 if (len == 0 && !(flags & B_INVAL) &&
10229 10237 (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10230 10238 return (0);
10231 10239
10232 10240 mutex_enter(&rp->r_statelock);
10233 10241 rp->r_count++;
10234 10242 mutex_exit(&rp->r_statelock);
10235 10243 error = nfs4_putpages(vp, off, len, flags, cr);
10236 10244 mutex_enter(&rp->r_statelock);
10237 10245 rp->r_count--;
10238 10246 cv_broadcast(&rp->r_cv);
10239 10247 mutex_exit(&rp->r_statelock);
10240 10248
10241 10249 return (error);
10242 10250 }
10243 10251
/*
 * Write out a single page, possibly klustering adjacent dirty pages.
 *
 * 'pp' is the (locked) target page.  The kluster that is actually kept
 * for write back is pushed either synchronously or via an async worker
 * depending on B_ASYNC in 'flags'.  The i/o range actually issued is
 * returned through *offp / *lenp when those pointers are non-NULL.
 */
int
nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	u_offset_t io_off;
	u_offset_t lbn_off;
	u_offset_t lbn;
	size_t io_len;
	uint_t bsize;
	int error;
	rnode4_t *rp;

	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);
	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);
	ASSERT(!IS_SHADOW(vp, rp));

	/* Logical block (of size bsize) holding the page, and its offset. */
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks. If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */
	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * pvn_write_kluster shouldn't have returned a page with offset
	 * behind the original page we were given. Verify that.
	 */
	ASSERT((pp->p_offset / bsize) >= lbn);

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back. It will also handle invalidation and freeing
	 * of pages that are not dirty. Check for page length rounding
	 * problems.
	 */
	if (io_off + io_len > lbn_off + bsize) {
		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
		io_len = lbn_off + bsize - io_off;
	}
	/*
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
	 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file. When the uiomove() completes the r_size is
	 * updated and the R4MODINPROGRESS flag is cleared.
	 *
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size. Without this handshaking, it is
	 * possible that nfs4_bio() picks up the old value of r_size
	 * before the uiomove() in writerp4() completes. This will result
	 * in the write through nfs4_bio() being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd). When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
	 * checked. This will still be the old size. Therefore the page will
	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
	 * the page will be found to be clean and the write will be dropped.
	 */
	if (rp->r_flags & R4MODINPROGRESS) {
		/* Unlocked peek above; recheck under r_statelock. */
		mutex_enter(&rp->r_statelock);
		if ((rp->r_flags & R4MODINPROGRESS) &&
		    rp->r_modaddr + MAXBSIZE > io_off &&
		    rp->r_modaddr < io_off + io_len) {
			page_t *plist;
			/*
			 * A write is in progress for this region of the file.
			 * If we did not detect R4MODINPROGRESS here then this
			 * path through nfs_putapage() would eventually go to
			 * nfs4_bio() and may not write out all of the data
			 * in the pages. We end up losing data. So we decide
			 * to set the modified bit on each page in the page
			 * list and mark the rnode with R4DIRTY. This write
			 * will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= R4DIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	/* Push the kluster, either via an async worker or inline. */
	if (flags & B_ASYNC) {
		error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_putapage);
	} else
		error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}
10368 10376
/*
 * Synchronously write the page list [io_off, io_off + io_len) to the
 * server via nfs4_rdwrlbn().  Used directly for synchronous putpage
 * and as the worker routine for async putpage requests.
 *
 * On "space"-type failures (ENOSPC/EDQUOT/EFBIG/EACCES) the pages are
 * completed with B_ERROR and, for synchronous callers only, rewritten
 * with B_INVAL | B_FORCE so they are destroyed rather than left to
 * fill up memory.
 */
static int
nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	flags |= B_WRITE;

	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR4(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
	    error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		/* Remember that the server reported an out-of-space state. */
		if (!(rp->r_flags & R4OUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful. This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread. It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them. Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs4_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & R4OUTOFSPACE) {
			/* The write succeeded, so space is available again. */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
		/* When memory is tight, push committable pages now. */
		if (freemem < desfree)
			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
			    NFS4_WRITE_NOWAIT);
	}

	return (error);
}
10431 10439
10432 10440 #ifdef DEBUG
10433 10441 int nfs4_force_open_before_mmap = 0;
10434 10442 #endif
10435 10443
10436 10444 /* ARGSUSED */
10437 10445 static int
10438 10446 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10439 10447 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10440 10448 caller_context_t *ct)
10441 10449 {
10442 10450 struct segvn_crargs vn_a;
10443 10451 int error = 0;
10444 10452 rnode4_t *rp = VTOR4(vp);
10445 10453 mntinfo4_t *mi = VTOMI4(vp);
10446 10454
10447 10455 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10448 10456 return (EIO);
10449 10457
10450 10458 if (vp->v_flag & VNOMAP)
10451 10459 return (ENOSYS);
10452 10460
10453 10461 if (off < 0 || (off + len) < 0)
10454 10462 return (ENXIO);
10455 10463
10456 10464 if (vp->v_type != VREG)
10457 10465 return (ENODEV);
10458 10466
10459 10467 /*
10460 10468 * If the file is delegated to the client don't do anything.
10461 10469 * If the file is not delegated, then validate the data cache.
10462 10470 */
10463 10471 mutex_enter(&rp->r_statev4_lock);
10464 10472 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10465 10473 mutex_exit(&rp->r_statev4_lock);
10466 10474 error = nfs4_validate_caches(vp, cr);
10467 10475 if (error)
10468 10476 return (error);
10469 10477 } else {
10470 10478 mutex_exit(&rp->r_statev4_lock);
10471 10479 }
10472 10480
10473 10481 /*
10474 10482 * Check to see if the vnode is currently marked as not cachable.
10475 10483 * This means portions of the file are locked (through VOP_FRLOCK).
10476 10484 * In this case the map request must be refused. We use
10477 10485 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10478 10486 *
10479 10487 * Atomically increment r_inmap after acquiring r_rwlock. The
10480 10488 * idea here is to acquire r_rwlock to block read/write and
10481 10489 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10482 10490 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10483 10491 * and we can prevent the deadlock that would have occurred
10484 10492 * when nfs4_addmap() would have acquired it out of order.
10485 10493 *
10486 10494 * Since we are not protecting r_inmap by any lock, we do not
10487 10495 * hold any lock when we decrement it. We atomically decrement
10488 10496 * r_inmap after we release r_lkserlock.
10489 10497 */
10490 10498
10491 10499 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10492 10500 return (EINTR);
10493 10501 atomic_add_int(&rp->r_inmap, 1);
10494 10502 nfs_rw_exit(&rp->r_rwlock);
10495 10503
10496 10504 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10497 10505 atomic_add_int(&rp->r_inmap, -1);
10498 10506 return (EINTR);
10499 10507 }
10500 10508
10501 10509
10502 10510 if (vp->v_flag & VNOCACHE) {
10503 10511 error = EAGAIN;
10504 10512 goto done;
10505 10513 }
10506 10514
10507 10515 /*
10508 10516 * Don't allow concurrent locks and mapping if mandatory locking is
10509 10517 * enabled.
10510 10518 */
10511 10519 if (flk_has_remote_locks(vp)) {
10512 10520 struct vattr va;
10513 10521 va.va_mask = AT_MODE;
10514 10522 error = nfs4getattr(vp, &va, cr);
10515 10523 if (error != 0)
10516 10524 goto done;
10517 10525 if (MANDLOCK(vp, va.va_mode)) {
10518 10526 error = EAGAIN;
10519 10527 goto done;
10520 10528 }
10521 10529 }
10522 10530
10523 10531 /*
10524 10532 * It is possible that the rnode has a lost lock request that we
10525 10533 * are still trying to recover, and that the request conflicts with
10526 10534 * this map request.
10527 10535 *
10528 10536 * An alternative approach would be for nfs4_safemap() to consider
10529 10537 * queued lock requests when deciding whether to set or clear
10530 10538 * VNOCACHE. This would require the frlock code path to call
10531 10539 * nfs4_safemap() after enqueing a lost request.
10532 10540 */
10533 10541 if (nfs4_map_lost_lock_conflict(vp)) {
10534 10542 error = EAGAIN;
10535 10543 goto done;
10536 10544 }
10537 10545
10538 10546 as_rangelock(as);
10539 10547 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10540 10548 if (error != 0) {
10541 10549 as_rangeunlock(as);
10542 10550 goto done;
10543 10551 }
10544 10552
10545 10553 if (vp->v_type == VREG) {
10546 10554 /*
10547 10555 * We need to retrieve the open stream
10548 10556 */
10549 10557 nfs4_open_stream_t *osp = NULL;
10550 10558 nfs4_open_owner_t *oop = NULL;
10551 10559
10552 10560 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10553 10561 if (oop != NULL) {
10554 10562 /* returns with 'os_sync_lock' held */
10555 10563 osp = find_open_stream(oop, rp);
10556 10564 open_owner_rele(oop);
10557 10565 }
10558 10566 if (osp == NULL) {
10559 10567 #ifdef DEBUG
10560 10568 if (nfs4_force_open_before_mmap) {
10561 10569 error = EIO;
10562 10570 goto done;
10563 10571 }
10564 10572 #endif
10565 10573 /* returns with 'os_sync_lock' held */
10566 10574 error = open_and_get_osp(vp, cr, &osp);
10567 10575 if (osp == NULL) {
10568 10576 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10569 10577 "nfs4_map: we tried to OPEN the file "
10570 10578 "but again no osp, so fail with EIO"));
10571 10579 goto done;
10572 10580 }
10573 10581 }
10574 10582
10575 10583 if (osp->os_failed_reopen) {
10576 10584 mutex_exit(&osp->os_sync_lock);
10577 10585 open_stream_rele(osp, rp);
10578 10586 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10579 10587 "nfs4_map: os_failed_reopen set on "
10580 10588 "osp %p, cr %p, rp %s", (void *)osp,
10581 10589 (void *)cr, rnode4info(rp)));
10582 10590 error = EIO;
10583 10591 goto done;
10584 10592 }
10585 10593 mutex_exit(&osp->os_sync_lock);
10586 10594 open_stream_rele(osp, rp);
10587 10595 }
10588 10596
10589 10597 vn_a.vp = vp;
10590 10598 vn_a.offset = off;
10591 10599 vn_a.type = (flags & MAP_TYPE);
10592 10600 vn_a.prot = (uchar_t)prot;
10593 10601 vn_a.maxprot = (uchar_t)maxprot;
10594 10602 vn_a.flags = (flags & ~MAP_TYPE);
10595 10603 vn_a.cred = cr;
10596 10604 vn_a.amp = NULL;
10597 10605 vn_a.szc = 0;
10598 10606 vn_a.lgrp_mem_policy_flags = 0;
10599 10607
10600 10608 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10601 10609 as_rangeunlock(as);
10602 10610
10603 10611 done:
10604 10612 nfs_rw_exit(&rp->r_lkserlock);
10605 10613 atomic_add_int(&rp->r_inmap, -1);
10606 10614 return (error);
10607 10615 }
10608 10616
10609 10617 /*
10610 10618 * We're most likely dealing with a kernel module that likes to READ
10611 10619 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
10612 10620 * officially OPEN the file to create the necessary client state
10613 10621 * for bookkeeping of os_mmap_read/write counts.
10614 10622 *
10615 10623 * Since VOP_MAP only passes in a pointer to the vnode rather than
10616 10624 * a double pointer, we can't handle the case where nfs4open_otw()
10617 10625 * returns a different vnode than the one passed into VOP_MAP (since
10618 10626 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10619 10627 * we return NULL and let nfs4_map() fail. Note: the only case where
10620 10628 * this should happen is if the file got removed and replaced with the
10621 10629 * same name on the server (in addition to the fact that we're trying
10622 10630 * to VOP_MAP withouth VOP_OPENing the file in the first place).
10623 10631 */
static int
open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
{
	rnode4_t *rp, *drp;
	vnode_t *dvp, *open_vp;
	char file_name[MAXNAMELEN];
	int just_created;
	nfs4_open_stream_t *osp;
	nfs4_open_owner_t *oop;
	int error;

	*ospp = NULL;
	open_vp = map_vp;

	rp = VTOR4(open_vp);
	/* Find the parent directory so we can do the OPEN by name. */
	if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
		return (error);
	drp = VTOR4(dvp);

	/* Serialize against other operations on the directory. */
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
		VN_RELE(dvp);
		return (EINTR);
	}

	if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		return (error);
	}

	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		/*
		 * The file was just created by us; clear the flag, make
		 * sure the DNLC knows about the name, and tell
		 * nfs4open_otw() this is a just-created file.
		 */
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, file_name, open_vp);
		/* This is needed so we don't bump the open ref count */
		just_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_created = 0;
	}

	/* Keep map_vp alive across the OTW open; open_vp may be replaced. */
	VN_HOLD(map_vp);

	error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
	    just_created);
	if (error) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		VN_RELE(map_vp);
		return (error);
	}

	nfs_rw_exit(&drp->r_rwlock);
	VN_RELE(dvp);

	/*
	 * If nfs4open_otw() returned a different vnode then "undo"
	 * the open and return failure to the caller.
	 */
	if (!VN_CMP(open_vp, map_vp)) {
		nfs4_error_t e;

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
		    "open returned a different vnode"));
		/*
		 * If there's an error, ignore it,
		 * and let VOP_INACTIVE handle it.
		 */
		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
		    CLOSE_NORM, 0, 0, 0);
		VN_RELE(map_vp);
		return (EIO);
	}

	VN_RELE(map_vp);

	oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
	if (!oop) {
		nfs4_error_t e;

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
		    "no open owner"));
		/*
		 * If there's an error, ignore it,
		 * and let VOP_INACTIVE handle it.
		 */
		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
		    CLOSE_NORM, 0, 0, 0);
		return (EIO);
	}
	/* find_open_stream() returns with 'os_sync_lock' held on success */
	osp = find_open_stream(oop, rp);
	open_owner_rele(oop);
	/*
	 * NOTE(review): if find_open_stream() fails here we return 0 with
	 * *ospp == NULL, so callers must check *ospp and not rely on the
	 * return value alone — TODO confirm whether returning EIO (and
	 * closing the open) would be more appropriate.
	 */
	*ospp = osp;
	return (0);
}
10721 10729
/*
 * Please be aware that when this function is called, the address space write
 * a_lock is held. Do not put over the wire calls in this function.
 *
 * VOP_ADDMAP: account for 'len' bytes being mapped.  Bumps r_mapcnt on
 * the rnode and the os_mmap_read/os_mmap_write/os_mapcnt counters on the
 * file's open stream so a later reopen requests the right share access.
 */
/* ARGSUSED */
static int
nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	int error = 0;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * Don't need to update the open stream first, since this
	 * mmap can't add any additional share access that isn't
	 * already contained in the open stream (for the case where we
	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
	 * take into account os_mmap_read[write] counts).
	 */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));

	if (vp->v_type == VREG) {
		/*
		 * We need to retrieve the open stream and update the counts.
		 * If there is no open stream here, something is wrong.
		 */
		nfs4_open_stream_t *osp = NULL;
		nfs4_open_owner_t *oop = NULL;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (oop != NULL) {
			/* returns with 'os_sync_lock' held */
			osp = find_open_stream(oop, rp);
			open_owner_rele(oop);
		}
		if (osp == NULL) {
			NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
			    "nfs4_addmap: we should have an osp"
			    "but we don't, so fail with EIO"));
			error = EIO;
			goto out;
		}

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
		    " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));

		/*
		 * Update the map count in the open stream.
		 * This is necessary in the case where we
		 * open/mmap/close/, then the server reboots, and we
		 * attempt to reopen. If the mmap doesn't add share
		 * access then we send an invalid reopen with
		 * access = NONE.
		 *
		 * We need to specifically check each PROT_* so a mmap
		 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
		 * read and write access. A simple comparison of prot
		 * to ~PROT_WRITE to determine read access is insufficient
		 * since prot can be |= with PROT_USER, etc.
		 */

		/*
		 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
		 */
		if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write += btopr(len);
		/* READ and EXEC each contribute to os_mmap_read. */
		if (maxprot & PROT_READ)
			osp->os_mmap_read += btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read += btopr(len);
		/*
		 * Ensure that os_mmap_read gets incremented, even if
		 * maxprot were to look like PROT_NONE.
		 */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read += btopr(len);
		osp->os_mapcnt += btopr(len);
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
	}

out:
	/*
	 * If we got an error, then undo our
	 * incrementing of 'r_mapcnt'.
	 */

	if (error) {
		atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
		ASSERT(rp->r_mapcnt >= 0);
	}
	return (error);
}
10826 10834
10827 10835 /* ARGSUSED */
10828 10836 static int
10829 10837 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10830 10838 {
10831 10839
10832 10840 return (VTOR4(vp1) == VTOR4(vp2));
10833 10841 }
10834 10842
/*
 * VOP_FRLOCK: advisory byte-range locking.
 *
 * For filesystems mounted with local locking (MI4_LLOCK) or for
 * non-regular files, the request is handled entirely by the local lock
 * manager via fs_frlock().  Otherwise, dirty pages are flushed first
 * (so lockers see current data and other clients see our writes after
 * an unlock) and the request is sent to the server via nfs4frlock().
 */
/* ARGSUSED */
static int
nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
	caller_context_t *ct)
{
	int rc;
	u_offset_t start, end;
	rnode4_t *rp;
	int error = 0, intr = INTR4(vp);
	nfs4_error_t e;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		/* Never interrupt an unlock, to avoid orphan server locks. */
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXEND))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock. However, we can't call
			 * nfs4_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * nfs4_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!nfs4_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR4(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!nfs4_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish. For new
	 * locks, this is so that the process gets the latest bits from the
	 * server. For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked. If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set. But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		/* Wait for outstanding page i/o (r_count) to drain. */
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/* Keep /proc from stopping us mid-wait. */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv,
				    &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				/* Record the error for a later close/write. */
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
	    cr, &e, NULL, NULL);
	rc = e.error;

	if (rc == 0)
		nfs4_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);

	return (rc);
}
10976 10984
10977 10985 /*
10978 10986 * Free storage space associated with the specified vnode. The portion
10979 10987 * to be freed is specified by bfp->l_start and bfp->l_len (already
10980 10988 * normalized to a "whence" of 0).
10981 10989 *
10982 10990 * This is an experimental facility whose continued existence is not
10983 10991 * guaranteed. Currently, we only support the special case
10984 10992 * of l_len == 0, meaning free to end of file.
10985 10993 */
10986 10994 /* ARGSUSED */
10987 10995 static int
10988 10996 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10989 10997 offset_t offset, cred_t *cr, caller_context_t *ct)
10990 10998 {
10991 10999 int error;
10992 11000
10993 11001 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10994 11002 return (EIO);
10995 11003 ASSERT(vp->v_type == VREG);
10996 11004 if (cmd != F_FREESP)
10997 11005 return (EINVAL);
10998 11006
10999 11007 error = convoff(vp, bfp, 0, offset);
11000 11008 if (!error) {
11001 11009 ASSERT(bfp->l_start >= 0);
11002 11010 if (bfp->l_len == 0) {
11003 11011 struct vattr va;
11004 11012
11005 11013 va.va_mask = AT_SIZE;
11006 11014 va.va_size = bfp->l_start;
11007 11015 error = nfs4setattr(vp, &va, 0, cr, NULL);
11008 11016
11009 11017 if (error == 0 && bfp->l_start == 0)
11010 11018 vnevent_truncate(vp, ct);
11011 11019 } else
11012 11020 error = EINVAL;
11013 11021 }
11014 11022
11015 11023 return (error);
11016 11024 }
11017 11025
11018 11026 /* ARGSUSED */
11019 11027 int
11020 11028 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11021 11029 {
11022 11030 rnode4_t *rp;
11023 11031 rp = VTOR4(vp);
11024 11032
11025 11033 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11026 11034 vp = RTOV4(rp);
11027 11035 }
11028 11036 *vpp = vp;
11029 11037 return (0);
11030 11038 }
11031 11039
11032 11040 /*
11033 11041 * Setup and add an address space callback to do the work of the delmap call.
11034 11042 * The callback will (and must be) deleted in the actual callback function.
11035 11043 *
11036 11044 * This is done in order to take care of the problem that we have with holding
11037 11045 * the address space's a_lock for a long period of time (e.g. if the NFS server
11038 11046 * is down). Callbacks will be executed in the address space code while the
11039 11047 * a_lock is not held. Holding the address space's a_lock causes things such
11040 11048 * as ps and fork to hang because they are trying to acquire this lock as well.
11041 11049 */
11042 11050 /* ARGSUSED */
11043 11051 static int
11044 11052 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
11045 11053 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
11046 11054 caller_context_t *ct)
11047 11055 {
11048 11056 int caller_found;
11049 11057 int error;
11050 11058 rnode4_t *rp;
11051 11059 nfs4_delmap_args_t *dmapp;
11052 11060 nfs4_delmapcall_t *delmap_call;
11053 11061
11054 11062 if (vp->v_flag & VNOMAP)
11055 11063 return (ENOSYS);
11056 11064
11057 11065 /*
11058 11066 * A process may not change zones if it has NFS pages mmap'ed
11059 11067 * in, so we can't legitimately get here from the wrong zone.
11060 11068 */
11061 11069 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11062 11070
11063 11071 rp = VTOR4(vp);
11064 11072
11065 11073 /*
11066 11074 * The way that the address space of this process deletes its mapping
11067 11075 * of this file is via the following call chains:
11068 11076 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11069 11077 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11070 11078 *
11071 11079 * With the use of address space callbacks we are allowed to drop the
11072 11080 * address space lock, a_lock, while executing the NFS operations that
11073 11081 * need to go over the wire. Returning EAGAIN to the caller of this
11074 11082 * function is what drives the execution of the callback that we add
11075 11083 * below. The callback will be executed by the address space code
11076 11084 * after dropping the a_lock. When the callback is finished, since
11077 11085 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
11078 11086 * is called again on the same segment to finish the rest of the work
11079 11087 * that needs to happen during unmapping.
11080 11088 *
11081 11089 * This action of calling back into the segment driver causes
11082 11090 * nfs4_delmap() to get called again, but since the callback was
11083 11091 * already executed at this point, it already did the work and there
11084 11092 * is nothing left for us to do.
11085 11093 *
11086 11094 * To Summarize:
11087 11095 * - The first time nfs4_delmap is called by the current thread is when
11088 11096 * we add the caller associated with this delmap to the delmap caller
11089 11097 * list, add the callback, and return EAGAIN.
11090 11098 * - The second time in this call chain when nfs4_delmap is called we
11091 11099 * will find this caller in the delmap caller list and realize there
11092 11100 * is no more work to do thus removing this caller from the list and
11093 11101 * returning the error that was set in the callback execution.
11094 11102 */
11095 11103 caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
11096 11104 if (caller_found) {
11097 11105 /*
11098 11106 * 'error' is from the actual delmap operations. To avoid
11099 11107 * hangs, we need to handle the return of EAGAIN differently
11100 11108 * since this is what drives the callback execution.
11101 11109 * In this case, we don't want to return EAGAIN and do the
11102 11110 * callback execution because there are none to execute.
11103 11111 */
11104 11112 if (error == EAGAIN)
11105 11113 return (0);
11106 11114 else
11107 11115 return (error);
11108 11116 }
11109 11117
11110 11118 /* current caller was not in the list */
11111 11119 delmap_call = nfs4_init_delmapcall();
11112 11120
11113 11121 mutex_enter(&rp->r_statelock);
11114 11122 list_insert_tail(&rp->r_indelmap, delmap_call);
11115 11123 mutex_exit(&rp->r_statelock);
11116 11124
/*
 * Package the delmap parameters so nfs4_delmap_callback() can do the
 * over-the-wire work after the address space code drops a_lock.
 * The callback frees this structure when it is done.
 */
11117 11125 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);
11118 11126
11119 11127 dmapp->vp = vp;
11120 11128 dmapp->off = off;
11121 11129 dmapp->addr = addr;
11122 11130 dmapp->len = len;
11123 11131 dmapp->prot = prot;
11124 11132 dmapp->maxprot = maxprot;
11125 11133 dmapp->flags = flags;
11126 11134 dmapp->cr = cr;
11127 11135 dmapp->caller = delmap_call;
11128 11136
11129 11137 error = as_add_callback(as, nfs4_delmap_callback, dmapp,
11130 11138 AS_UNMAP_EVENT, addr, len, KM_SLEEP);
11131 11139
/* EAGAIN drives execution of the callback we just registered. */
11132 11140 return (error ? error : EAGAIN);
11133 11141 }
11134 11142
/*
 * Allocate and initialize a delmap caller entry keyed by curthread.
 * The caller is responsible for inserting it on the rnode's r_indelmap
 * list (under r_statelock) and eventually freeing it via
 * nfs4_free_delmapcall().
 */
11135 11143 static nfs4_delmapcall_t *
11136 11144 nfs4_init_delmapcall()
11137 11145 {
11138 11146 nfs4_delmapcall_t *delmap_call;
11139 11147
11140 11148 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11141 11149 delmap_call->call_id = curthread;
11142 11150 delmap_call->error = 0;
11143 11151
11144 11152 return (delmap_call);
11145 11153 }
11146 11154
/* Release a delmap caller entry allocated by nfs4_init_delmapcall(). */
11147 11155 static void
11148 11156 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
11149 11157 {
11150 11158 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
11151 11159 }
11152 11160
11153 11161 /*
11154 11162 * Searches for the current delmap caller (based on curthread) in the list of
11155 11163 * callers. If it is found, we remove it and free the delmap caller.
11156 11164 * Returns:
11157 11165 * 0 if the caller wasn't found
11158 11166 * 1 if the caller was found, removed and freed. *errp will be set
11159 11167 * to what the result of the delmap was.
11160 11168 */
11161 11169 static int
11162 11170 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11163 11171 {
11164 11172 nfs4_delmapcall_t *delmap_call;
11165 11173
11166 11174 /*
11167 11175 * If the list doesn't exist yet, we create it and return
11168 11176 * that the caller wasn't found. No list = no callers.
11169 11177 */
/* r_statelock serializes both list creation and the search below. */
11170 11178 mutex_enter(&rp->r_statelock);
11171 11179 if (!(rp->r_flags & R4DELMAPLIST)) {
11172 11180 /* The list does not exist */
11173 11181 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11174 11182 offsetof(nfs4_delmapcall_t, call_node));
11175 11183 rp->r_flags |= R4DELMAPLIST;
11176 11184 mutex_exit(&rp->r_statelock);
11177 11185 return (0);
11178 11186 } else {
11179 11187 /* The list exists so search it */
11180 11188 for (delmap_call = list_head(&rp->r_indelmap);
11181 11189 delmap_call != NULL;
11182 11190 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11183 11191 if (delmap_call->call_id == curthread) {
11184 11192 /* current caller is in the list */
11185 11193 *errp = delmap_call->error;
11186 11194 list_remove(&rp->r_indelmap, delmap_call);
11187 11195 mutex_exit(&rp->r_statelock);
11188 11196 nfs4_free_delmapcall(delmap_call);
11189 11197 return (1);
11190 11198 }
11191 11199 }
11192 11200 }
11193 11201 mutex_exit(&rp->r_statelock);
11194 11202 return (0);
11195 11203 }
11196 11204
11197 11205 /*
11198 11206 * Remove some pages from an mmap'd vnode. Just update the
11199 11207 * count of pages. If doing close-to-open, then flush and
11200 11208 * commit all of the pages associated with this file.
11201 11209 * Otherwise, start an asynchronous page flush to write out
11202 11210 * any dirty pages. This will also associate a credential
11203 11211 * with the rnode which can be used to write the pages.
11204 11212 */
11205 11213 /* ARGSUSED */
11206 11214 static void
11207 11215 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
11208 11216 {
11209 11217 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11210 11218 rnode4_t *rp;
11211 11219 mntinfo4_t *mi;
11212 11220 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;
11213 11221
11214 11222 rp = VTOR4(dmapp->vp);
11215 11223 mi = VTOMI4(dmapp->vp);
11216 11224
/* Drop the mapped-page count for the pages being unmapped. */
11217 11225 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
11218 11226 ASSERT(rp->r_mapcnt >= 0);
11219 11227
11220 11228 /*
11221 11229 * Initiate a page flush and potential commit if there are
11222 11230 * pages, the file system was not mounted readonly, the segment
11223 11231 * was mapped shared, and the pages themselves were writeable.
11224 11232 */
11225 11233 if (nfs4_has_pages(dmapp->vp) &&
11226 11234 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
11227 11235 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
11228 11236 mutex_enter(&rp->r_statelock);
11229 11237 rp->r_flags |= R4DIRTY;
11230 11238 mutex_exit(&rp->r_statelock);
11231 11239 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
11232 11240 dmapp->len, dmapp->cr);
/* Pick up any error recorded against the rnode by the flush. */
11233 11241 if (!e.error) {
11234 11242 mutex_enter(&rp->r_statelock);
11235 11243 e.error = rp->r_error;
11236 11244 rp->r_error = 0;
11237 11245 mutex_exit(&rp->r_statelock);
11238 11246 }
11239 11247 } else
11240 11248 e.error = 0;
11241 11249
11242 11250 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
11243 11251 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
11244 11252 B_INVAL, dmapp->cr, NULL);
11245 11253
11246 11254 if (e.error) {
11247 11255 e.stat = puterrno4(e.error);
11248 11256 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11249 11257 OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
/* Propagate the error to the thread that initiated the delmap. */
11250 11258 dmapp->caller->error = e.error;
11251 11259 }
11252 11260
11253 11261 /* Check to see if we need to close the file */
11254 11262
11255 11263 if (dmapp->vp->v_type == VREG) {
11256 11264 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
11257 11265 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
11258 11266
11259 11267 if (e.error != 0 || e.stat != NFS4_OK) {
11260 11268 /*
11261 11269 * Since it is possible that e.error == 0 and
11262 11270 * e.stat != NFS4_OK (and vice versa),
11263 11271 * we do the proper checking in order to get both
11264 11272 * e.error and e.stat reporting the correct info.
11265 11273 */
11266 11274 if (e.stat == NFS4_OK)
11267 11275 e.stat = puterrno4(e.error);
11268 11276 if (e.error == 0)
11269 11277 e.error = geterrno4(e.stat);
11270 11278
11271 11279 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11272 11280 OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11273 11281 dmapp->caller->error = e.error;
11274 11282 }
11275 11283 }
11276 11284
/* The callback has run; unregister it and free the argument bundle. */
11277 11285 (void) as_delete_callback(as, arg);
11278 11286 kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11279 11287 }
11280 11288
11281 11289
11282 11290 static uint_t
11283 11291 fattr4_maxfilesize_to_bits(uint64_t ll)
11284 11292 {
11285 11293 uint_t l = 1;
11286 11294
11287 11295 if (ll == 0) {
11288 11296 return (0);
11289 11297 }
11290 11298
11291 11299 if (ll & 0xffffffff00000000) {
11292 11300 l += 32; ll >>= 32;
11293 11301 }
11294 11302 if (ll & 0xffff0000) {
11295 11303 l += 16; ll >>= 16;
11296 11304 }
11297 11305 if (ll & 0xff00) {
11298 11306 l += 8; ll >>= 8;
11299 11307 }
11300 11308 if (ll & 0xf0) {
11301 11309 l += 4; ll >>= 4;
11302 11310 }
11303 11311 if (ll & 0xc) {
11304 11312 l += 2; ll >>= 2;
11305 11313 }
11306 11314 if (ll & 0x2) {
11307 11315 l += 1;
11308 11316 }
11309 11317 return (l);
11310 11318 }
11311 11319
/*
 * Determine whether the file has extended attributes by looking up its
 * xattr directory and letting do_xattr_exists_check() examine it; the
 * result is returned through *valp.  Returns 0 on success or an errno
 * value from the lookup/check.
 */
11312 11320 static int
11313 11321 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11314 11322 {
11315 11323 vnode_t *avp = NULL;
11316 11324 int error;
11317 11325
11318 11326 if ((error = nfs4lookup_xattr(vp, "", &avp,
11319 11327 LOOKUP_XATTR, cr)) == 0)
11320 11328 error = do_xattr_exists_check(avp, valp, cr);
/* Drop the hold the lookup may have put on the xattr dir vnode. */
11321 11329 if (avp)
11322 11330 VN_RELE(avp);
11323 11331
11324 11332 return (error);
11325 11333 }
11326 11334
/*
 * VOP_PATHCONF for NFSv4.  Answers constant queries (_PC_PATH_MAX,
 * _PC_SYMLINK_MAX, _PC_ACL_ENABLED) locally, serves the rest from the
 * cached pathconf info when the attribute cache is valid, and otherwise
 * fetches the pathconf attributes over the wire and caches them.
 */
11327 11335 /* ARGSUSED */
11328 11336 int
11329 11337 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
11330 11338 caller_context_t *ct)
11331 11339 {
11332 11340 int error;
11333 11341 hrtime_t t;
11334 11342 rnode4_t *rp;
11335 11343 nfs4_ga_res_t gar;
11336 11344 nfs4_ga_ext_res_t ger;
11337 11345
11338 11346 gar.n4g_ext_res = &ger;
11339 11347
11340 11348 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11341 11349 return (EIO);
11342 11350 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11343 11351 *valp = MAXPATHLEN;
11344 11352 return (0);
11345 11353 }
11346 11354 if (cmd == _PC_ACL_ENABLED) {
11347 11355 *valp = _ACL_ACE_ENABLED;
11348 11356 return (0);
11349 11357 }
11350 11358
11351 11359 rp = VTOR4(vp);
11352 11360 if (cmd == _PC_XATTR_EXISTS) {
11353 11361 /*
11354 11362 * The existence of the xattr directory is not sufficient
11355 11363 * for determining whether generic user attributes exists.
11356 11364 * The attribute directory could only be a transient directory
11357 11365 * used for Solaris sysattr support. Do a small readdir
11358 11366 * to verify if the only entries are sysattrs or not.
11359 11367 *
11360 11368 * pc4_xattr_valid can be only be trusted when r_xattr_dir
11361 11369 * is NULL. Once the xadir vp exists, we can create xattrs,
11362 11370 * and we don't have any way to update the "base" object's
11363 11371 * pc4_xattr_exists from the xattr or xadir. Maybe FEM
11364 11372 * could help out.
11365 11373 */
11366 11374 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
11367 11375 rp->r_xattr_dir == NULL) {
11368 11376 return (nfs4_have_xattrs(vp, valp, cr));
11369 11377 }
11370 11378 } else { /* OLD CODE */
11371 11379 if (ATTRCACHE4_VALID(vp)) {
11372 11380 mutex_enter(&rp->r_statelock);
11373 11381 if (rp->r_pathconf.pc4_cache_valid) {
11374 11382 error = 0;
11375 11383 switch (cmd) {
11376 11384 case _PC_FILESIZEBITS:
11377 11385 *valp =
11378 11386 rp->r_pathconf.pc4_filesizebits;
11379 11387 break;
11380 11388 case _PC_LINK_MAX:
11381 11389 *valp =
11382 11390 rp->r_pathconf.pc4_link_max;
11383 11391 break;
11384 11392 case _PC_NAME_MAX:
11385 11393 *valp =
11386 11394 rp->r_pathconf.pc4_name_max;
11387 11395 break;
11388 11396 case _PC_CHOWN_RESTRICTED:
11389 11397 *valp =
11390 11398 rp->r_pathconf.pc4_chown_restricted;
11391 11399 break;
11392 11400 case _PC_NO_TRUNC:
11393 11401 *valp =
11394 11402 rp->r_pathconf.pc4_no_trunc;
11395 11403 break;
11396 11404 default:
11397 11405 error = EINVAL;
11398 11406 break;
11399 11407 }
11400 11408 mutex_exit(&rp->r_statelock);
11401 11409 #ifdef DEBUG
11402 11410 nfs4_pathconf_cache_hits++;
11403 11411 #endif
11404 11412 return (error);
11405 11413 }
11406 11414 mutex_exit(&rp->r_statelock);
11407 11415 }
11408 11416 }
11409 11417 #ifdef DEBUG
11410 11418 nfs4_pathconf_cache_misses++;
11411 11419 #endif
11412 11420
/* Cache miss: fetch the pathconf attributes over the wire. */
11413 11421 t = gethrtime();
11414 11422
11415 11423 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);
11416 11424
11417 11425 if (error) {
/* OTW fetch failed; don't trust any previously cached values. */
11418 11426 mutex_enter(&rp->r_statelock);
11419 11427 rp->r_pathconf.pc4_cache_valid = FALSE;
11420 11428 rp->r_pathconf.pc4_xattr_valid = FALSE;
11421 11429 mutex_exit(&rp->r_statelock);
11422 11430 return (error);
11423 11431 }
11424 11432
11425 11433 /* interpret the max filesize */
11426 11434 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
11427 11435 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);
11428 11436
11429 11437 /* Store the attributes we just received */
11430 11438 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);
11431 11439
11432 11440 switch (cmd) {
11433 11441 case _PC_FILESIZEBITS:
11434 11442 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
11435 11443 break;
11436 11444 case _PC_LINK_MAX:
11437 11445 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
11438 11446 break;
11439 11447 case _PC_NAME_MAX:
11440 11448 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
11441 11449 break;
11442 11450 case _PC_CHOWN_RESTRICTED:
11443 11451 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
11444 11452 break;
11445 11453 case _PC_NO_TRUNC:
11446 11454 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
11447 11455 break;
11448 11456 case _PC_XATTR_EXISTS:
11449 11457 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
11450 11458 if (error = nfs4_have_xattrs(vp, valp, cr))
11451 11459 return (error);
11452 11460 }
11453 11461 break;
11454 11462 default:
11455 11463 return (EINVAL);
11456 11464 }
11457 11465
11458 11466 return (0);
11459 11467 }
11460 11468
11461 11469 /*
11462 11470 * Called by async thread to do synchronous pageio. Do the i/o, wait
11463 11471 * for it to complete, and cleanup the page list when done.
11464 11472 */
11465 11473 static int
11466 11474 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11467 11475 int flags, cred_t *cr)
11468 11476 {
11469 11477 int error;
11470 11478
11471 11479 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11472 11480
11473 11481 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
/* Complete the pageio, marking the pages with B_ERROR on failure. */
11474 11482 if (flags & B_READ)
11475 11483 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11476 11484 else
11477 11485 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11478 11486 return (error);
11479 11487 }
11480 11488
/*
 * VOP_PAGEIO: perform i/o on the given page list.  r_count is raised
 * while the i/o is in flight so other operations can wait for it;
 * B_ASYNC requests are queued via nfs4_async_pageio() with
 * nfs4_sync_pageio() as the worker.
 */
11481 11489 /* ARGSUSED */
11482 11490 static int
11483 11491 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11484 11492 int flags, cred_t *cr, caller_context_t *ct)
11485 11493 {
11486 11494 int error;
11487 11495 rnode4_t *rp;
11488 11496
11489 11497 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11490 11498 return (EIO);
11491 11499
11492 11500 if (pp == NULL)
11493 11501 return (EINVAL);
11494 11502
11495 11503 rp = VTOR4(vp);
11496 11504 mutex_enter(&rp->r_statelock);
11497 11505 rp->r_count++;
11498 11506 mutex_exit(&rp->r_statelock);
11499 11507
11500 11508 if (flags & B_ASYNC) {
11501 11509 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11502 11510 nfs4_sync_pageio);
11503 11511 } else
11504 11512 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
/* Drop r_count and wake any threads waiting for i/o to drain. */
11505 11513 mutex_enter(&rp->r_statelock);
11506 11514 rp->r_count--;
11507 11515 cv_broadcast(&rp->r_cv);
11508 11516 mutex_exit(&rp->r_statelock);
11509 11517 return (error);
11510 11518 }
11511 11519
/*
 * VOP_DISPOSE: free (fl == B_FREE) or destroy (fl == B_INVAL) page 'pp',
 * first committing it -- and any other committable pages of the file
 * gathered by nfs4_get_commit() -- to stable storage on the server.
 */
11512 11520 /* ARGSUSED */
11513 11521 static void
11514 11522 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
11515 11523 caller_context_t *ct)
11516 11524 {
11517 11525 int error;
11518 11526 rnode4_t *rp;
11519 11527 page_t *plist;
11520 11528 page_t *pptr;
11521 11529 offset3 offset;
11522 11530 count3 len;
11523 11531 k_sigset_t smask;
11524 11532
11525 11533 /*
11526 11534 * We should get called with fl equal to either B_FREE or
11527 11535 * B_INVAL. Any other value is illegal.
11528 11536 *
11529 11537 * The page that we are either supposed to free or destroy
11530 11538 * should be exclusive locked and its io lock should not
11531 11539 * be held.
11532 11540 */
11533 11541 ASSERT(fl == B_FREE || fl == B_INVAL);
11534 11542 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
11535 11543
11536 11544 rp = VTOR4(vp);
11537 11545
11538 11546 /*
11539 11547 * If the page doesn't need to be committed or we shouldn't
11540 11548 * even bother attempting to commit it, then just make sure
11541 11549 * that the p_fsdata byte is clear and then either free or
11542 11550 * destroy the page as appropriate.
11543 11551 */
11544 11552 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
11545 11553 pp->p_fsdata = C_NOCOMMIT;
11546 11554 if (fl == B_FREE)
11547 11555 page_free(pp, dn);
11548 11556 else
11549 11557 page_destroy(pp, dn);
11550 11558 return;
11551 11559 }
11552 11560
11553 11561 /*
11554 11562 * If there is a page invalidation operation going on, then
11555 11563 * if this is one of the pages being destroyed, then just
11556 11564 * clear the p_fsdata byte and then either free or destroy
11557 11565 * the page as appropriate.
11558 11566 */
11559 11567 mutex_enter(&rp->r_statelock);
11560 11568 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
11561 11569 mutex_exit(&rp->r_statelock);
11562 11570 pp->p_fsdata = C_NOCOMMIT;
11563 11571 if (fl == B_FREE)
11564 11572 page_free(pp, dn);
11565 11573 else
11566 11574 page_destroy(pp, dn);
11567 11575 return;
11568 11576 }
11569 11577
11570 11578 /*
11571 11579 * If we are freeing this page and someone else is already
11572 11580 * waiting to do a commit, then just unlock the page and
11573 11581 * return. That other thread will take care of commiting
11574 11582 * this page. The page can be freed sometime after the
11575 11583 * commit has finished. Otherwise, if the page is marked
11576 11584 * as delay commit, then we may be getting called from
11577 11585 * pvn_write_done, one page at a time. This could result
11578 11586 * in one commit per page, so we end up doing lots of small
11579 11587 * commits instead of fewer larger commits. This is bad,
11580 11588 * we want do as few commits as possible.
11581 11589 */
11582 11590 if (fl == B_FREE) {
11583 11591 if (rp->r_flags & R4COMMITWAIT) {
11584 11592 page_unlock(pp);
11585 11593 mutex_exit(&rp->r_statelock);
11586 11594 return;
11587 11595 }
11588 11596 if (pp->p_fsdata == C_DELAYCOMMIT) {
11589 11597 pp->p_fsdata = C_COMMIT;
11590 11598 page_unlock(pp);
11591 11599 mutex_exit(&rp->r_statelock);
11592 11600 return;
11593 11601 }
11594 11602 }
11595 11603
11596 11604 /*
11597 11605 * Check to see if there is a signal which would prevent an
11598 11606 * attempt to commit the pages from being successful. If so,
11599 11607 * then don't bother with all of the work to gather pages and
11600 11608 * generate the unsuccessful RPC. Just return from here and
11601 11609 * let the page be committed at some later time.
11602 11610 */
11603 11611 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
11604 11612 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
11605 11613 sigunintr(&smask);
11606 11614 page_unlock(pp);
11607 11615 mutex_exit(&rp->r_statelock);
11608 11616 return;
11609 11617 }
11610 11618 sigunintr(&smask);
11611 11619
11612 11620 /*
11613 11621 * We are starting to need to commit pages, so let's try
11614 11622 * to commit as many as possible at once to reduce the
11615 11623 * overhead.
11616 11624 *
11617 11625 * Set the `commit inprogress' state bit. We must
11618 11626 * first wait until any current one finishes. Then
11619 11627 * we initialize the c_pages list with this page.
11620 11628 */
11621 11629 while (rp->r_flags & R4COMMIT) {
11622 11630 rp->r_flags |= R4COMMITWAIT;
11623 11631 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11624 11632 rp->r_flags &= ~R4COMMITWAIT;
11625 11633 }
11626 11634 rp->r_flags |= R4COMMIT;
11627 11635 mutex_exit(&rp->r_statelock);
/* R4COMMIT is ours: the commit structure can be used without locks. */
11628 11636 ASSERT(rp->r_commit.c_pages == NULL);
11629 11637 rp->r_commit.c_pages = pp;
11630 11638 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11631 11639 rp->r_commit.c_commlen = PAGESIZE;
11632 11640
11633 11641 /*
11634 11642 * Gather together all other pages which can be committed.
11635 11643 * They will all be chained off r_commit.c_pages.
11636 11644 */
11637 11645 nfs4_get_commit(vp);
11638 11646
11639 11647 /*
11640 11648 * Clear the `commit inprogress' status and disconnect
11641 11649 * the list of pages to be committed from the rnode.
11642 11650 * At this same time, we also save the starting offset
11643 11651 * and length of data to be committed on the server.
11644 11652 */
11645 11653 plist = rp->r_commit.c_pages;
11646 11654 rp->r_commit.c_pages = NULL;
11647 11655 offset = rp->r_commit.c_commbase;
11648 11656 len = rp->r_commit.c_commlen;
11649 11657 mutex_enter(&rp->r_statelock);
11650 11658 rp->r_flags &= ~R4COMMIT;
11651 11659 cv_broadcast(&rp->r_commit.c_cv);
11652 11660 mutex_exit(&rp->r_statelock);
11653 11661
/* Hand off to an async thread when we must not go OTW from here. */
11654 11662 if (curproc == proc_pageout || curproc == proc_fsflush ||
11655 11663 nfs_zone() != VTOMI4(vp)->mi_zone) {
11656 11664 nfs4_async_commit(vp, plist, offset, len,
11657 11665 cr, do_nfs4_async_commit);
11658 11666 return;
11659 11667 }
11660 11668
11661 11669 /*
11662 11670 * Actually generate the COMMIT op over the wire operation.
11663 11671 */
11664 11672 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);
11665 11673
11666 11674 /*
11667 11675 * If we got an error during the commit, just unlock all
11668 11676 * of the pages. The pages will get retransmitted to the
11669 11677 * server during a putpage operation.
11670 11678 */
11671 11679 if (error) {
11672 11680 while (plist != NULL) {
11673 11681 pptr = plist;
11674 11682 page_sub(&plist, pptr);
11675 11683 page_unlock(pptr);
11676 11684 }
11677 11685 return;
11678 11686 }
11679 11687
11680 11688 /*
11681 11689 * We've tried as hard as we can to commit the data to stable
11682 11690 * storage on the server. We just unlock the rest of the pages
11683 11691 * and clear the commit required state. They will be put
11684 11692 * onto the tail of the cachelist if they are nolonger
11685 11693 * mapped.
11686 11694 */
11687 11695 while (plist != pp) {
11688 11696 pptr = plist;
11689 11697 page_sub(&plist, pptr);
11690 11698 pptr->p_fsdata = C_NOCOMMIT;
11691 11699 page_unlock(pptr);
11692 11700 }
11693 11701
11694 11702 /*
11695 11703 * It is possible that nfs4_commit didn't return error but
11696 11704 * some other thread has modified the page we are going
11697 11705 * to free/destroy.
11698 11706 * In this case we need to rewrite the page. Do an explicit check
11699 11707 * before attempting to free/destroy the page. If modified, needs to
11700 11708 * be rewritten so unlock the page and return.
11701 11709 */
11702 11710 if (hat_ismod(pp)) {
11703 11711 pp->p_fsdata = C_NOCOMMIT;
11704 11712 page_unlock(pp);
11705 11713 return;
11706 11714 }
11707 11715
11708 11716 /*
11709 11717 * Now, as appropriate, either free or destroy the page
11710 11718 * that we were called with.
11711 11719 */
11712 11720 pp->p_fsdata = C_NOCOMMIT;
11713 11721 if (fl == B_FREE)
11714 11722 page_free(pp, dn);
11715 11723 else
11716 11724 page_destroy(pp, dn);
11717 11725 }
11718 11726
11719 11727 /*
11720 11728 * Commit requires that the current fh be the file written to.
11721 11729 * The compound op structure is:
11722 11730 * PUTFH(file), COMMIT
11723 11731 */
11724 11732 static int
11725 11733 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
11726 11734 {
11727 11735 COMPOUND4args_clnt args;
11728 11736 COMPOUND4res_clnt res;
11729 11737 COMMIT4res *cm_res;
11730 11738 nfs_argop4 argop[2];
11731 11739 nfs_resop4 *resop;
11732 11740 int doqueue;
11733 11741 mntinfo4_t *mi;
11734 11742 rnode4_t *rp;
11735 11743 cred_t *cred_otw = NULL;
11736 11744 bool_t needrecov = FALSE;
11737 11745 nfs4_recov_state_t recov_state;
11738 11746 nfs4_open_stream_t *osp = NULL;
11739 11747 bool_t first_time = TRUE; /* first time getting OTW cred */
11740 11748 bool_t last_time = FALSE; /* last time getting OTW cred */
11741 11749 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11742 11750
11743 11751 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11744 11752
11745 11753 rp = VTOR4(vp);
11746 11754
11747 11755 mi = VTOMI4(vp);
11748 11756 recov_state.rs_flags = 0;
11749 11757 recov_state.rs_num_retry_despite_err = 0;
11750 11758 get_commit_cred:
11751 11759 /*
11752 11760 * Releases the osp, if a valid open stream is provided.
11753 11761 * Puts a hold on the cred_otw and the new osp (if found).
11754 11762 */
11755 11763 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11756 11764 &first_time, &last_time);
11757 11765 args.ctag = TAG_COMMIT;
11758 11766 recov_retry:
11759 11767 /*
11760 11768 * Commit ops: putfh file; commit
11761 11769 */
11762 11770 args.array_len = 2;
11763 11771 args.array = argop;
11764 11772
11765 11773 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11766 11774 &recov_state, NULL);
11767 11775 if (e.error) {
11768 11776 crfree(cred_otw);
11769 11777 if (osp != NULL)
11770 11778 open_stream_rele(osp, rp);
11771 11779 return (e.error);
11772 11780 }
11773 11781
11774 11782 /* putfh directory */
11775 11783 argop[0].argop = OP_CPUTFH;
11776 11784 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11777 11785
11778 11786 /* commit */
11779 11787 argop[1].argop = OP_COMMIT;
11780 11788 argop[1].nfs_argop4_u.opcommit.offset = offset;
11781 11789 argop[1].nfs_argop4_u.opcommit.count = count;
11782 11790
11783 11791 doqueue = 1;
11784 11792 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11785 11793
11786 11794 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
11787 11795 if (!needrecov && e.error) {
11788 11796 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11789 11797 needrecov);
11790 11798 crfree(cred_otw);
/* EACCES may just mean the wrong cred; retry with the next one. */
11791 11799 if (e.error == EACCES && last_time == FALSE)
11792 11800 goto get_commit_cred;
11793 11801 if (osp != NULL)
11794 11802 open_stream_rele(osp, rp);
11795 11803 return (e.error);
11796 11804 }
11797 11805
11798 11806 if (needrecov) {
11799 11807 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11800 11808 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
11801 11809 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11802 11810 &recov_state, needrecov);
11803 11811 if (!e.error)
11804 11812 (void) xdr_free(xdr_COMPOUND4res_clnt,
11805 11813 (caddr_t)&res);
11806 11814 goto recov_retry;
11807 11815 }
11808 11816 if (e.error) {
11809 11817 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11810 11818 &recov_state, needrecov);
11811 11819 crfree(cred_otw);
11812 11820 if (osp != NULL)
11813 11821 open_stream_rele(osp, rp);
11814 11822 return (e.error);
11815 11823 }
11816 11824 /* fall through for res.status case */
11817 11825 }
11818 11826
11819 11827 if (res.status) {
11820 11828 e.error = geterrno4(res.status);
11821 11829 if (e.error == EACCES && last_time == FALSE) {
11822 11830 crfree(cred_otw);
11823 11831 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11824 11832 &recov_state, needrecov);
11825 11833 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11826 11834 goto get_commit_cred;
11827 11835 }
11828 11836 /*
11829 11837 * Can't do a nfs4_purge_stale_fh here because this
11830 11838 * can cause a deadlock. nfs4_commit can
11831 11839 * be called from nfs4_dispose which can be called
11832 11840 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh
11833 11841 * can call back to pvn_vplist_dirty.
11834 11842 */
11835 11843 if (e.error == ESTALE) {
11836 11844 mutex_enter(&rp->r_statelock);
11837 11845 rp->r_flags |= R4STALE;
11838 11846 if (!rp->r_error)
11839 11847 rp->r_error = e.error;
11840 11848 mutex_exit(&rp->r_statelock);
11841 11849 PURGE_ATTRCACHE4(vp);
11842 11850 } else {
11843 11851 mutex_enter(&rp->r_statelock);
11844 11852 if (!rp->r_error)
11845 11853 rp->r_error = e.error;
11846 11854 mutex_exit(&rp->r_statelock);
11847 11855 }
11848 11856 } else {
11849 11857 ASSERT(rp->r_flags & R4HAVEVERF);
11850 11858 resop = &res.array[1]; /* commit res */
11851 11859 cm_res = &resop->nfs_resop4_u.opcommit;
11852 11860 mutex_enter(&rp->r_statelock);
11853 11861 if (cm_res->writeverf == rp->r_writeverf) {
11854 11862 mutex_exit(&rp->r_statelock);
11855 11863 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11856 11864 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11857 11865 &recov_state, needrecov);
11858 11866 crfree(cred_otw);
11859 11867 if (osp != NULL)
11860 11868 open_stream_rele(osp, rp);
11861 11869 return (0);
11862 11870 }
/*
 * Write verifier changed: the server may have lost uncommitted
 * writes (e.g. across a reboot), so redirty the cached pages and
 * remember the new verifier.
 */
11863 11871 nfs4_set_mod(vp);
11864 11872 rp->r_writeverf = cm_res->writeverf;
11865 11873 mutex_exit(&rp->r_statelock);
11866 11874 e.error = NFS_VERF_MISMATCH;
11867 11875 }
11868 11876
11869 11877 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11870 11878 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
11871 11879 crfree(cred_otw);
11872 11880 if (osp != NULL)
11873 11881 open_stream_rele(osp, rp);
11874 11882
11875 11883 return (e.error);
11876 11884 }
11877 11885
/*
 * Redirty all cached pages of the file (used by nfs4_commit() after a
 * write verifier mismatch) so they will be written to the server again.
 */
11878 11886 static void
11879 11887 nfs4_set_mod(vnode_t *vp)
11880 11888 {
11881 11889 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11882 11890
11883 11891 /* make sure we're looking at the master vnode, not a shadow */
11884 11892 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11885 11893 }
11886 11894
11887 11895 /*
11888 11896 * This function is used to gather a page list of the pages which
11889 11897 * can be committed on the server.
11890 11898 *
11891 11899 * The calling thread must have set R4COMMIT. This bit is used to
11892 11900 * serialize access to the commit structure in the rnode. As long
11893 11901 * as the thread has set R4COMMIT, then it can manipulate the commit
11894 11902 * structure without requiring any other locks.
11895 11903 *
11896 11904 * When this function is called from nfs4_dispose() the page passed
11897 11905 * into nfs4_dispose() will be SE_EXCL locked, and so this function
11898 11906 * will skip it. This is not a problem since we initially add the
11899 11907 * page to the r_commit page list.
11900 11908 *
11901 11909 */
11902 11910 static void
11903 11911 nfs4_get_commit(vnode_t *vp)
11904 11912 {
11905 11913 rnode4_t *rp;
11906 11914 page_t *pp;
11907 11915 kmutex_t *vphm;
11908 11916
11909 11917 rp = VTOR4(vp);
11910 11918
11911 11919 ASSERT(rp->r_flags & R4COMMIT);
11912 11920
11913 11921 /* make sure we're looking at the master vnode, not a shadow */
11914 11922
11915 11923 if (IS_SHADOW(vp, rp))
11916 11924 vp = RTOV4(rp);
11917 11925
11918 11926 vphm = page_vnode_mutex(vp);
11919 11927 mutex_enter(vphm);
11920 11928
11921 11929 /*
11922 11930 * If there are no pages associated with this vnode, then
11923 11931 * just return.
11924 11932 */
11925 11933 if ((pp = vp->v_pages) == NULL) {
11926 11934 mutex_exit(vphm);
11927 11935 return;
11928 11936 }
11929 11937
11930 11938 /*
11931 11939 * Step through all of the pages associated with this vnode
11932 11940 * looking for pages which need to be committed.
11933 11941 */
11934 11942 do {
11935 11943 /* Skip marker pages. */
11936 11944 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
11937 11945 continue;
11938 11946
11939 11947 /*
11940 11948 * First short-cut everything (without the page_lock)
11941 11949 * and see if this page does not need to be committed
11942 11950 * or is modified if so then we'll just skip it.
11943 11951 */
11944 11952 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
11945 11953 continue;
11946 11954
11947 11955 /*
11948 11956 * Attempt to lock the page. If we can't, then
11949 11957 * someone else is messing with it or we have been
11950 11958 * called from nfs4_dispose and this is the page that
11951 11959 * nfs4_dispose was called with.. anyway just skip it.
11952 11960 */
11953 11961 if (!page_trylock(pp, SE_EXCL))
11954 11962 continue;
11955 11963
11956 11964 /*
11957 11965 * Lets check again now that we have the page lock.
11958 11966 */
11959 11967 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11960 11968 page_unlock(pp);
11961 11969 continue;
11962 11970 }
11963 11971
11964 11972 /* this had better not be a free page */
11965 11973 ASSERT(PP_ISFREE(pp) == 0);
11966 11974
11967 11975 /*
11968 11976 * The page needs to be committed and we locked it.
11969 11977 * Update the base and length parameters and add it
11970 11978 * to r_pages.
11971 11979 */
/* Extend [c_commbase, c_commbase + c_commlen) to cover this page. */
11972 11980 if (rp->r_commit.c_pages == NULL) {
11973 11981 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11974 11982 rp->r_commit.c_commlen = PAGESIZE;
11975 11983 } else if (pp->p_offset < rp->r_commit.c_commbase) {
11976 11984 rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11977 11985 (offset3)pp->p_offset + rp->r_commit.c_commlen;
11978 11986 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11979 11987 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11980 11988 <= pp->p_offset) {
11981 11989 rp->r_commit.c_commlen = (offset3)pp->p_offset -
11982 11990 rp->r_commit.c_commbase + PAGESIZE;
11983 11991 }
11984 11992 page_add(&rp->r_commit.c_pages, pp);
11985 11993 } while ((pp = pp->p_vpnext) != vp->v_pages);
11986 11994
11987 11995 mutex_exit(vphm);
11988 11996 }
11989 11997
11990 11998 /*
11991 11999 * This routine is used to gather together a page list of the pages
11992 12000 * which are to be committed on the server. This routine must not
11993 12001 * be called if the calling thread holds any locked pages.
11994 12002 *
11995 12003 * The calling thread must have set R4COMMIT. This bit is used to
11996 12004 * serialize access to the commit structure in the rnode. As long
11997 12005 * as the thread has set R4COMMIT, then it can manipulate the commit
11998 12006 * structure without requiring any other locks.
11999 12007 */
static void
nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{

	rnode4_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;
	ASSERT(len != 0);
	rp = VTOR4(vp);
	/* Caller must hold R4COMMIT; it serializes use of rp->r_commit. */
	ASSERT(rp->r_flags & R4COMMIT);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;
	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;
	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;
		/*
		 * If this page does not need to be committed or is
		 * modified, then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);
		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.
		 */
		if (rp->r_commit.c_pages == NULL) {
			/* First page found: the range starts here. */
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			/* Offsets ascend, so only extend the range upward. */
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}
12060 12068
12061 12069 /*
12062 12070 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12063 12071 * Flushes and commits data to the server.
12064 12072 */
static int
nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	verifier4 write_verf;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed. This may need to
	 * be done twice if the server has changed state since
	 * data was last written. The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen). This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs4_putpage
	 */

	/*
	 * Snapshot the write verifier before flushing; if it differs
	 * afterwards, the server changed state during the flush and
	 * the whole sequence must be redone.
	 */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_writeverf;
	mutex_exit(&rp->r_statelock);

	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
	if (error == EAGAIN)
		error = 0;

	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);

	if (error)
		return (error);

	/* Verifier changed while flushing: restart the whole sequence. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_writeverf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If the error, NFS_VERF_MISMATCH, is returned, then
	 * start over with the flush operation.
	 */
	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
12134 12142
12135 12143 /*
12136 12144 * nfs4_commit_vp() will wait for other pending commits and
12137 12145 * will either commit the whole file or a range, plen dictates
12138 12146 * if we commit whole file. a value of zero indicates the whole
12139 12147 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage()
12140 12148 */
static int
nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
    cred_t *cr, int wait_on_writes)
{
	rnode4_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * before we gather commitable pages make
	 * sure there are no outstanding async writes
	 */
	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Set the `commit inprogress' state bit. We must
	 * first wait until any current one finishes.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather all of the pages which need to be
	 * committed.  plen == 0 means the whole file.
	 */
	if (plen == 0)
		nfs4_get_commit(vp);
	else
		nfs4_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered by nfs4_get_commit.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist == NULL)
		return (0);

	/*
	 * No error occurred during the flush portion
	 * of this operation, so now attempt to commit
	 * the data to stable storage on the server.
	 *
	 * This will unlock all of the pages on the list.
	 */
	return (nfs4_sync_commit(vp, plist, offset, len, cr));
}
12218 12226
12219 12227 static int
12220 12228 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12221 12229 cred_t *cr)
12222 12230 {
12223 12231 int error;
12224 12232 page_t *pp;
12225 12233
12226 12234 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12227 12235
12228 12236 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12229 12237
12230 12238 /*
12231 12239 * If we got an error, then just unlock all of the pages
12232 12240 * on the list.
12233 12241 */
12234 12242 if (error) {
12235 12243 while (plist != NULL) {
12236 12244 pp = plist;
12237 12245 page_sub(&plist, pp);
12238 12246 page_unlock(pp);
12239 12247 }
12240 12248 return (error);
12241 12249 }
12242 12250 /*
12243 12251 * We've tried as hard as we can to commit the data to stable
12244 12252 * storage on the server. We just unlock the pages and clear
12245 12253 * the commit required state. They will get freed later.
12246 12254 */
12247 12255 while (plist != NULL) {
12248 12256 pp = plist;
12249 12257 page_sub(&plist, pp);
12250 12258 pp->p_fsdata = C_NOCOMMIT;
12251 12259 page_unlock(pp);
12252 12260 }
12253 12261
12254 12262 return (error);
12255 12263 }
12256 12264
12257 12265 static void
12258 12266 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12259 12267 cred_t *cr)
12260 12268 {
12261 12269
12262 12270 (void) nfs4_sync_commit(vp, plist, offset, count, cr);
12263 12271 }
12264 12272
12265 12273 /*ARGSUSED*/
12266 12274 static int
12267 12275 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12268 12276 caller_context_t *ct)
12269 12277 {
12270 12278 int error = 0;
12271 12279 mntinfo4_t *mi;
12272 12280 vattr_t va;
12273 12281 vsecattr_t nfsace4_vsap;
12274 12282
12275 12283 mi = VTOMI4(vp);
12276 12284 if (nfs_zone() != mi->mi_zone)
12277 12285 return (EIO);
12278 12286 if (mi->mi_flags & MI4_ACL) {
12279 12287 /* if we have a delegation, return it */
12280 12288 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12281 12289 (void) nfs4delegreturn(VTOR4(vp),
12282 12290 NFS4_DR_REOPEN|NFS4_DR_PUSH);
12283 12291
12284 12292 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12285 12293 NFS4_ACL_SET);
12286 12294 if (error) /* EINVAL */
12287 12295 return (error);
12288 12296
12289 12297 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12290 12298 /*
12291 12299 * These are aclent_t type entries.
12292 12300 */
12293 12301 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12294 12302 vp->v_type == VDIR, FALSE);
12295 12303 if (error)
12296 12304 return (error);
12297 12305 } else {
12298 12306 /*
12299 12307 * These are ace_t type entries.
12300 12308 */
12301 12309 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12302 12310 FALSE);
12303 12311 if (error)
12304 12312 return (error);
12305 12313 }
12306 12314 bzero(&va, sizeof (va));
12307 12315 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12308 12316 vs_ace4_destroy(&nfsace4_vsap);
12309 12317 return (error);
12310 12318 }
12311 12319 return (ENOSYS);
12312 12320 }
12313 12321
/* ARGSUSED */
int
nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	nfs4_ga_res_t gar;
	rnode4_t *rp = VTOR4(vp);

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	bzero(&gar, sizeof (gar));
	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;

	/*
	 * vsecattr->vsa_mask holds the original acl request mask.
	 * This is needed when determining what to return.
	 * (See: nfs4_create_getsecattr_return())
	 */
	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
	if (error) /* EINVAL */
		return (error);

	/*
	 * If this is a referral stub, don't try to go OTW for an ACL
	 */
	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));

	if (mi->mi_flags & MI4_ACL) {
		/*
		 * Check if the data is cached and the cache is valid.  If it
		 * is we don't go over the wire.
		 */
		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			/* Re-check under the lock; it may have been purged. */
			if (rp->r_secattr != NULL) {
				error = nfs4_create_getsecattr_return(
				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
				    rp->r_attr.va_gid,
				    vp->v_type == VDIR);
				if (!error) { /* error == 0 - Success! */
					mutex_exit(&rp->r_statelock);
					return (error);
				}
			}
			mutex_exit(&rp->r_statelock);
		}

		/*
		 * The getattr otw call will always get both the acl, in
		 * the form of a list of nfsace4's, and the number of acl
		 * entries; independent of the value of gar.n4g_vsa.vsa_mask.
		 */
		gar.n4g_va.va_mask = AT_ALL;
		error = nfs4_getattr_otw(vp, &gar, cr, 1);
		if (error) {
			vs_ace4_destroy(&gar.n4g_vsa);
			/* ACL unsupported: fabricate one from the mode bits. */
			if (error == ENOTSUP || error == EOPNOTSUPP)
				error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
			/*
			 * No error was returned, but according to the response
			 * bitmap, neither was an acl.
			 */
			vs_ace4_destroy(&gar.n4g_vsa);
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		/*
		 * Update the cache with the ACL.
		 */
		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);

		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
		    vp->v_type == VDIR);
		vs_ace4_destroy(&gar.n4g_vsa);
		/*
		 * An aclent-style request that failed (other than with
		 * EACCES) falls back to a fabricated ACL.
		 */
		if ((error) && (vsecattr->vsa_mask &
		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
		    (error != EACCES)) {
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
		}
		return (error);
	}
	/* No server ACL support: fabricate an ACL from the mode bits. */
	error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
	return (error);
}
12409 12417
12410 12418 /*
12411 12419 * The function returns:
12412 12420 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12413 12421 * - EINVAL if the passed in "acl_mask" is an invalid request.
12414 12422 *
12415 12423 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12416 12424 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12417 12425 *
12418 12426 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12419 12427 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12420 12428 * - We have a count field set without the corresponding acl field set. (e.g. -
12421 12429 * VSA_ACECNT is set, but VSA_ACE is not)
12422 12430 */
12423 12431 static int
12424 12432 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12425 12433 {
12426 12434 /* Shortcut the masks that are always valid. */
12427 12435 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12428 12436 return (0);
12429 12437 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12430 12438 return (0);
12431 12439
12432 12440 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12433 12441 /*
12434 12442 * We can't have any VSA_ACL type stuff in the mask now.
12435 12443 */
12436 12444 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12437 12445 VSA_DFACLCNT))
12438 12446 return (EINVAL);
12439 12447
12440 12448 if (op == NFS4_ACL_SET) {
12441 12449 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12442 12450 return (EINVAL);
12443 12451 }
12444 12452 }
12445 12453
12446 12454 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12447 12455 /*
12448 12456 * We can't have any VSA_ACE type stuff in the mask now.
12449 12457 */
12450 12458 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12451 12459 return (EINVAL);
12452 12460
12453 12461 if (op == NFS4_ACL_SET) {
12454 12462 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12455 12463 return (EINVAL);
12456 12464
12457 12465 if ((acl_mask & VSA_DFACLCNT) &&
12458 12466 !(acl_mask & VSA_DFACL))
12459 12467 return (EINVAL);
12460 12468 }
12461 12469 }
12462 12470 return (0);
12463 12471 }
12464 12472
12465 12473 /*
12466 12474 * The theory behind creating the correct getsecattr return is simply this:
12467 12475 * "Don't return anything that the caller is not expecting to have to free."
12468 12476 */
12469 12477 static int
12470 12478 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12471 12479 uid_t uid, gid_t gid, int isdir)
12472 12480 {
12473 12481 int error = 0;
12474 12482 /* Save the mask since the translators modify it. */
12475 12483 uint_t orig_mask = vsap->vsa_mask;
12476 12484
12477 12485 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12478 12486 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12479 12487
12480 12488 if (error)
12481 12489 return (error);
12482 12490
12483 12491 /*
12484 12492 * If the caller only asked for the ace count (VSA_ACECNT)
12485 12493 * don't give them the full acl (VSA_ACE), free it.
12486 12494 */
12487 12495 if (!orig_mask & VSA_ACE) {
12488 12496 if (vsap->vsa_aclentp != NULL) {
12489 12497 kmem_free(vsap->vsa_aclentp,
12490 12498 vsap->vsa_aclcnt * sizeof (ace_t));
12491 12499 vsap->vsa_aclentp = NULL;
12492 12500 }
12493 12501 }
12494 12502 vsap->vsa_mask = orig_mask;
12495 12503
12496 12504 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12497 12505 VSA_DFACLCNT)) {
12498 12506 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12499 12507 isdir, FALSE);
12500 12508
12501 12509 if (error)
12502 12510 return (error);
12503 12511
12504 12512 /*
12505 12513 * If the caller only asked for the acl count (VSA_ACLCNT)
12506 12514 * and/or the default acl count (VSA_DFACLCNT) don't give them
12507 12515 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12508 12516 */
12509 12517 if (!orig_mask & VSA_ACL) {
12510 12518 if (vsap->vsa_aclentp != NULL) {
12511 12519 kmem_free(vsap->vsa_aclentp,
12512 12520 vsap->vsa_aclcnt * sizeof (aclent_t));
12513 12521 vsap->vsa_aclentp = NULL;
12514 12522 }
12515 12523 }
12516 12524
12517 12525 if (!orig_mask & VSA_DFACL) {
12518 12526 if (vsap->vsa_dfaclentp != NULL) {
12519 12527 kmem_free(vsap->vsa_dfaclentp,
12520 12528 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12521 12529 vsap->vsa_dfaclentp = NULL;
12522 12530 }
12523 12531 }
12524 12532 vsap->vsa_mask = orig_mask;
12525 12533 }
12526 12534 return (0);
12527 12535 }
12528 12536
/* ARGSUSED */
int
nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 *
	 * NOTE(review): `cmd & F_SHARE' is a bitwise test, not an equality
	 * test; depending on the numeric command encodings this check may
	 * also fire for F_UNSHARE -- confirm that is intended.
	 */
	if ((cmd & F_SHARE) &&
	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		/*
		 * This will be properly implemented later,
		 * see RFE: 4823948 .
		 */
		error = EAGAIN;
		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
12584 12592
12585 12593 /*
12586 12594 * Common code called by directory ops to update the attrcache
12587 12595 */
12588 12596 static int
12589 12597 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12590 12598 hrtime_t t, vnode_t *vp, cred_t *cr)
12591 12599 {
12592 12600 int error = 0;
12593 12601
12594 12602 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12595 12603
12596 12604 if (status != NFS4_OK) {
12597 12605 /* getattr not done or failed */
12598 12606 PURGE_ATTRCACHE4(vp);
12599 12607 return (error);
12600 12608 }
12601 12609
12602 12610 if (garp) {
12603 12611 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12604 12612 } else {
12605 12613 PURGE_ATTRCACHE4(vp);
12606 12614 }
12607 12615 return (error);
12608 12616 }
12609 12617
12610 12618 /*
12611 12619 * Update directory caches for directory modification ops (link, rename, etc.)
12612 12620 * When dinfo is NULL, manage dircaches in the old way.
12613 12621 */
static void
nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
    dirattr_info_t *dinfo)
{
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/* Purge rddir cache for dir since it changed */
	if (drp->r_dir != NULL)
		nfs4_purge_rddir_cache(dvp);

	/*
	 * If caller provided dinfo, then use it to manage dir caches.
	 */
	if (dinfo != NULL) {
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag is
				 * set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug,
				    (CE_NOTE, "nfs4_update_dircaches: "
				    "don't update dnlc: created_v4 flag"));
			}
		}

		/* Caller supplied post-op dir attributes; cache them. */
		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
		    dinfo->di_cred, FALSE, cinfo);

		return;
	}

	/*
	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
	 * Since caller modified dir but didn't receive post-dirmod-op dir
	 * attrs, the dir's attrs must be purged.
	 *
	 * XXX this check and dnlc update/purge should really be atomic,
	 * XXX but can't use rnode statelock because it'll deadlock in
	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
	 * XXX does occur.
	 *
	 * XXX We also may want to check that atomic is true in the
	 * XXX change_info struct. If it is not, the change_info may
	 * XXX reflect changes by more than one clients which means that
	 * XXX our cache may not be valid.
	 */
	PURGE_ATTRCACHE4(dvp);
	if (drp->r_change == cinfo->before) {
		/* no changes took place in the directory prior to our link */
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag
				 * is set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
				    "nfs4_update_dircaches: don't"
				    " update dnlc: created_v4 flag"));
			}
		}
	} else {
		/* Another client modified directory - purge its dnlc cache */
		dnlc_purge_vp(dvp);
	}
}
12692 12700
12693 12701 /*
12694 12702 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12695 12703 * file.
12696 12704 *
12697 12705 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12698 12706 * file (ie: client recovery) and otherwise set to FALSE.
12699 12707 *
12700 12708 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12701 12709 * initiated) calling functions.
12702 12710 *
12703 12711 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12704 12712 * of resending a 'lost' open request.
12705 12713 *
12706 12714 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12707 12715 * server that hands out BAD_SEQID on open confirm.
12708 12716 *
12709 12717 * Errors are returned via the nfs4_error_t parameter.
12710 12718 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	/* Caller must hold the open owner's seqid for the duration. */
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	/* Two-op compound: PUTFH + OPEN_CONFIRM. */
	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* The confirm uses the open owner's next sequence number. */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/*
				 * Stop retrying if the server keeps handing
				 * out BAD_SEQID and the budget is exhausted.
				 */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			if (!ep->error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	resop = &res.array[1];	/* open confirm res */
	/* Hand the confirmed open stateid back to the caller. */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
12818 12826
12819 12827 /*
12820 12828 * Return the credentials associated with a client state object. The
12821 12829 * caller is responsible for freeing the credentials.
12822 12830 */
12823 12831
12824 12832 static cred_t *
12825 12833 state_to_cred(nfs4_open_stream_t *osp)
12826 12834 {
12827 12835 cred_t *cr;
12828 12836
12829 12837 /*
12830 12838 * It's ok to not lock the open stream and open owner to get
12831 12839 * the oo_cred since this is only written once (upon creation)
12832 12840 * and will not change.
12833 12841 */
12834 12842 cr = osp->os_open_owner->oo_cred;
12835 12843 crhold(cr);
12836 12844
12837 12845 return (cr);
12838 12846 }
12839 12847
12840 12848 /*
12841 12849 * nfs4_find_sysid
12842 12850 *
12843 12851 * Find the sysid for the knetconfig associated with the given mi.
12844 12852 */
12845 12853 static struct lm_sysid *
12846 12854 nfs4_find_sysid(mntinfo4_t *mi)
12847 12855 {
12848 12856 ASSERT(nfs_zone() == mi->mi_zone);
12849 12857
12850 12858 /*
12851 12859 * Switch from RDMA knconf to original mount knconf
12852 12860 */
12853 12861 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12854 12862 mi->mi_curr_serv->sv_hostname, NULL));
12855 12863 }
12856 12864
#ifdef DEBUG
/*
 * Return a string version of the call type for easy reading.
 */
static char *
nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
{
	char *str;

	switch (ctype) {
	case NFS4_LCK_CTYPE_NORM:
		str = "NORMAL";
		break;
	case NFS4_LCK_CTYPE_RECLAIM:
		str = "RECLAIM";
		break;
	case NFS4_LCK_CTYPE_RESEND:
		str = "RESEND";
		break;
	case NFS4_LCK_CTYPE_REINSTATE:
		str = "REINSTATE";
		break;
	default:
		/* An unknown call type is a programming error. */
		cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
		    "type %d", ctype);
		str = "";
		break;
	}

	return (str);
}
#endif
12880 12888
12881 12889 /*
12882 12890 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12883 12891 * Unlock requests don't have an over-the-wire locktype, so we just return
12884 12892 * something non-threatening.
12885 12893 */
12886 12894
12887 12895 static nfs_lock_type4
12888 12896 flk_to_locktype(int cmd, int l_type)
12889 12897 {
12890 12898 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12891 12899
12892 12900 switch (l_type) {
12893 12901 case F_UNLCK:
12894 12902 return (READ_LT);
12895 12903 case F_RDLCK:
12896 12904 if (cmd == F_SETLK)
12897 12905 return (READ_LT);
12898 12906 else
12899 12907 return (READW_LT);
12900 12908 case F_WRLCK:
12901 12909 if (cmd == F_SETLK)
12902 12910 return (WRITE_LT);
12903 12911 else
12904 12912 return (WRITEW_LT);
12905 12913 }
12906 12914 panic("flk_to_locktype");
12907 12915 /*NOTREACHED*/
12908 12916 }
12909 12917
12910 12918 /*
12911 12919 * Do some preliminary checks for nfs4frlock.
12912 12920 */
12913 12921 static int
12914 12922 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12915 12923 u_offset_t offset)
12916 12924 {
12917 12925 int error = 0;
12918 12926
12919 12927 /*
12920 12928 * If we are setting a lock, check that the file is opened
12921 12929 * with the correct mode.
12922 12930 */
12923 12931 if (cmd == F_SETLK || cmd == F_SETLKW) {
12924 12932 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12925 12933 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12926 12934 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12927 12935 "nfs4frlock_validate_args: file was opened with "
12928 12936 "incorrect mode"));
12929 12937 return (EBADF);
12930 12938 }
12931 12939 }
12932 12940
12933 12941 /* Convert the offset. It may need to be restored before returning. */
12934 12942 if (error = convoff(vp, flk, 0, offset)) {
12935 12943 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12936 12944 "nfs4frlock_validate_args: convoff => error= %d\n",
12937 12945 error));
12938 12946 return (error);
12939 12947 }
12940 12948
12941 12949 return (error);
12942 12950 }
12943 12951
12944 12952 /*
12945 12953 * Set the flock64's lm_sysid for nfs4frlock.
12946 12954 */
12947 12955 static int
12948 12956 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12949 12957 {
12950 12958 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12951 12959
12952 12960 /* Find the lm_sysid */
12953 12961 *lspp = nfs4_find_sysid(VTOMI4(vp));
12954 12962
12955 12963 if (*lspp == NULL) {
12956 12964 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12957 12965 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12958 12966 return (ENOLCK);
12959 12967 }
12960 12968
12961 12969 flk->l_sysid = lm_sysidt(*lspp);
12962 12970
12963 12971 return (0);
12964 12972 }
12965 12973
↓ open down ↓ |
12591 lines elided |
↑ open up ↑ |
12966 12974 /*
12967 12975 * Do the remaining preliminary setup for nfs4frlock.
12968 12976 */
12969 12977 static void
12970 12978 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12971 12979 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12972 12980 cred_t **cred_otw)
12973 12981 {
12974 12982 /*
12975 12983 * set tick_delay to the base delay time.
12976 - * (NFS4_BASE_WAIT_TIME is in secs)
12984 + * (nfs4_base_wait_time is in msecs)
12977 12985 */
12978 12986
12979 - *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12987 + *tick_delayp = drv_usectohz(nfs4_base_wait_time * 1000);
12980 12988
12981 12989 /*
12982 12990 * If lock is relative to EOF, we need the newest length of the
12983 12991 * file. Therefore invalidate the ATTR_CACHE.
12984 12992 */
12985 12993
12986 12994 *whencep = flk->l_whence;
12987 12995
12988 12996 if (*whencep == 2) /* SEEK_END */
12989 12997 PURGE_ATTRCACHE4(vp);
12990 12998
12991 12999 recov_statep->rs_flags = 0;
12992 13000 recov_statep->rs_num_retry_despite_err = 0;
12993 13001 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12994 13002 }
12995 13003
12996 13004 /*
12997 13005 * Initialize and allocate the data structures necessary for
12998 13006 * the nfs4frlock call.
12999 13007 * Allocates argsp's op array, frees up the saved_rqstpp if there is one.
13000 13008 */
13001 13009 static void
13002 13010 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
13003 13011 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
13004 13012 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
13005 13013 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
13006 13014 {
13007 13015 int argoplist_size;
13008 13016 int num_ops = 2;
13009 13017
13010 13018 *retry = FALSE;
13011 13019 *did_start_fop = FALSE;
13012 13020 *skip_get_err = FALSE;
13013 13021 lost_rqstp->lr_op = 0;
13014 13022 argoplist_size = num_ops * sizeof (nfs_argop4);
13015 13023 /* fill array with zero */
13016 13024 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13017 13025
13018 13026 *argspp = argsp;
13019 13027 *respp = NULL;
13020 13028
13021 13029 argsp->array_len = num_ops;
13022 13030 argsp->array = *argopp;
13023 13031
13024 13032 /* initialize in case of error; will get real value down below */
13025 13033 argsp->ctag = TAG_NONE;
13026 13034
13027 13035 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13028 13036 *op_hintp = OH_LOCKU;
13029 13037 else
13030 13038 *op_hintp = OH_OTHER;
13031 13039 }
13032 13040
13033 13041 /*
13034 13042 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign
13035 13043 * the proper nfs4_server_t for this instance of nfs4frlock.
13036 13044 * Returns 0 (success) or an errno value.
13037 13045 */
13038 13046 static int
13039 13047 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13040 13048 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13041 13049 bool_t *did_start_fop, bool_t *startrecovp)
13042 13050 {
13043 13051 int error = 0;
13044 13052 rnode4_t *rp;
13045 13053
13046 13054 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13047 13055
13048 13056 if (ctype == NFS4_LCK_CTYPE_NORM) {
13049 13057 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13050 13058 recov_statep, startrecovp);
13051 13059 if (error)
13052 13060 return (error);
13053 13061 *did_start_fop = TRUE;
13054 13062 } else {
13055 13063 *did_start_fop = FALSE;
13056 13064 *startrecovp = FALSE;
13057 13065 }
13058 13066
13059 13067 if (!error) {
13060 13068 rp = VTOR4(vp);
13061 13069
13062 13070 /* If the file failed recovery, just quit. */
13063 13071 mutex_enter(&rp->r_statelock);
13064 13072 if (rp->r_flags & R4RECOVERR) {
13065 13073 error = EIO;
13066 13074 }
13067 13075 mutex_exit(&rp->r_statelock);
13068 13076 }
13069 13077
13070 13078 return (error);
13071 13079 }
13072 13080
/*
 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
 * resend nfs4frlock call is initiated by the recovery framework.
 * Acquires the lop and oop seqid synchronization.
 *
 * On return, *lopp/*oopp/*ospp carry held references (the lop hold is
 * unconditional; oop/osp only if present in the lost request), and either
 * *lock_argsp or *locku_argsp points at the filled-in operation args,
 * depending on whether the lost request was an OP_LOCK or an OP_LOCKU.
 */
static void
nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
    COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
{
	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
	int error;

	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
	ASSERT(resend_rqstp != NULL);
	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
	    resend_rqstp->lr_op == OP_LOCKU);

	/* Hold the open owner and grab its open seqid sync, if present. */
	*oopp = resend_rqstp->lr_oop;
	if (resend_rqstp->lr_oop) {
		open_owner_hold(resend_rqstp->lr_oop);
		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
		ASSERT(error == 0);	/* recov thread always succeeds */
	}

	/* Must resend this lost lock/locku request. */
	ASSERT(resend_rqstp->lr_lop != NULL);
	*lopp = resend_rqstp->lr_lop;
	lock_owner_hold(resend_rqstp->lr_lop);
	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
	ASSERT(error == 0);	/* recov thread always succeeds */

	*ospp = resend_rqstp->lr_osp;
	if (*ospp)
		open_stream_hold(resend_rqstp->lr_osp);

	if (resend_rqstp->lr_op == OP_LOCK) {
		LOCK4args *lock_args;

		argop->argop = OP_LOCK;
		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
		lock_args->locktype = resend_rqstp->lr_locktype;
		lock_args->reclaim =
		    (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
		lock_args->offset = resend_rqstp->lr_flk->l_start;
		lock_args->length = resend_rqstp->lr_flk->l_len;
		/*
		 * l_len == 0 means "to end of file"; NFSv4 expresses that
		 * as an all-ones length.
		 */
		if (lock_args->length == 0)
			lock_args->length = ~lock_args->length;
		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
		    mi2clientid(mi), &lock_args->locker);

		/* Tag the compound by the kind of resend for observability. */
		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCK_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCK_REINSTATE;
			break;
		case NFS4_LCK_CTYPE_RECLAIM:
			argsp->ctag = TAG_LOCK_RECLAIM;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	} else {
		LOCKU4args *locku_args;
		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;

		argop->argop = OP_LOCKU;
		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
		/* LOCKU has no meaningful locktype; READ_LT is valid filler */
		locku_args->locktype = READ_LT;
		locku_args->seqid = lop->lock_seqid + 1;
		/* snapshot the lock stateid under lo_lock */
		mutex_enter(&lop->lo_lock);
		locku_args->lock_stateid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		locku_args->offset = resend_rqstp->lr_flk->l_start;
		locku_args->length = resend_rqstp->lr_flk->l_len;
		/* as above: zero length maps to the all-ones "to EOF" form */
		if (locku_args->length == 0)
			locku_args->length = ~locku_args->length;

		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCKU_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCKU_REINSTATE;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	}
}
13170 13178
13171 13179 /*
13172 13180 * Setup the LOCKT4 arguments.
13173 13181 */
13174 13182 static void
13175 13183 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13176 13184 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13177 13185 rnode4_t *rp)
13178 13186 {
13179 13187 LOCKT4args *lockt_args;
13180 13188
13181 13189 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13182 13190 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13183 13191 argop->argop = OP_LOCKT;
13184 13192 argsp->ctag = TAG_LOCKT;
13185 13193 lockt_args = &argop->nfs_argop4_u.oplockt;
13186 13194
13187 13195 /*
13188 13196 * The locktype will be READ_LT unless it's
13189 13197 * a write lock. We do this because the Solaris
13190 13198 * system call allows the combination of
13191 13199 * F_UNLCK and F_GETLK* and so in that case the
13192 13200 * unlock is mapped to a read.
13193 13201 */
13194 13202 if (flk->l_type == F_WRLCK)
13195 13203 lockt_args->locktype = WRITE_LT;
13196 13204 else
13197 13205 lockt_args->locktype = READ_LT;
13198 13206
13199 13207 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13200 13208 /* set the lock owner4 args */
13201 13209 nfs4_setlockowner_args(&lockt_args->owner, rp,
13202 13210 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13203 13211 flk->l_pid);
13204 13212 lockt_args->offset = flk->l_start;
13205 13213 lockt_args->length = flk->l_len;
13206 13214 if (flk->l_len == 0)
13207 13215 lockt_args->length = ~lockt_args->length;
13208 13216
13209 13217 *lockt_argsp = lockt_args;
13210 13218 }
13211 13219
/*
 * If the client is holding a delegation, and the open stream to be used
 * with this lock request is a delegation open stream, then re-open the stream.
 * Sets the nfs4_error_t to all zeros unless the open stream has already
 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
 * means the caller should retry (like a recovery retry).
 *
 * 'lt' is the flock lock type (F_RDLCK/F_WRLCK/F_UNLCK) the caller is
 * about to request; it determines which share-access bits must already
 * be visible to the server for the open to be sufficient.
 */
static void
nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
{
	open_delegation_type4 dt;
	bool_t reopen_needed, force;
	nfs4_open_stream_t *osp;
	open_claim_type4 oclaim;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* Snapshot the current delegation type under the state lock. */
	mutex_enter(&rp->r_statev4_lock);
	dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	if (dt != OPEN_DELEGATE_NONE) {
		nfs4_open_owner_t *oop;

		/* No open owner for this cred means no usable open state. */
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (!oop) {
			ep->stat = NFS4ERR_IO;
			return;
		}
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/* A stream that already failed a reopen is unusable. */
		if (osp->os_failed_reopen) {
			NFS4_DEBUG((nfs4_open_stream_debug ||
			    nfs4_client_lock_debug), (CE_NOTE,
			    "nfs4frlock_check_deleg: os_failed_reopen set "
			    "for osp %p, cr %p, rp %s", (void *)osp,
			    (void *)cr, rnode4info(rp)));
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/*
		 * Determine whether a reopen is needed. If this
		 * is a delegation open stream, then send the open
		 * to the server to give visibility to the open owner.
		 * Even if it isn't a delegation open stream, we need
		 * to check if the previous open CLAIM_DELEGATE_CUR
		 * was sufficient.
		 */

		reopen_needed = osp->os_delegation ||
		    ((lt == F_RDLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
		    (lt == F_WRLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));

		mutex_exit(&osp->os_sync_lock);
		open_owner_rele(oop);

		if (reopen_needed) {
			/*
			 * Always use CLAIM_PREVIOUS after server reboot.
			 * The server will reject CLAIM_DELEGATE_CUR if
			 * it is used during the grace period.
			 */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
				oclaim = CLAIM_PREVIOUS;
				force = TRUE;
			} else {
				oclaim = CLAIM_DELEGATE_CUR;
				force = FALSE;
			}
			mutex_exit(&mi->mi_lock);

			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
			/* EAGAIN from reopen maps to a retryable DELAY */
			if (ep->error == EAGAIN) {
				nfs4_error_zinit(ep);
				ep->stat = NFS4ERR_DELAY;
			}
		}
		open_stream_rele(osp, rp);
		osp = NULL;
	}
}
13310 13318
/*
 * Setup the LOCKU4 arguments.
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
 *			over-the-wire.  The caller must release the
 *			reference on *lopp.
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error.
 */
static void
nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKU4args **locku_argsp, flock64_t *flk,
    nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
    vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
    bool_t *skip_get_err, bool_t *go_otwp)
{
	nfs4_lock_owner_t	*lop = NULL;
	LOCKU4args		*locku_args;
	pid_t			pid;
	bool_t			is_spec = FALSE;
	rnode4_t		*rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

	/* A held delegation may require a reopen before unlocking. */
	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
	if (ep->error || ep->stat)
		return;

	argop->argop = OP_LOCKU;
	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
		argsp->ctag = TAG_LOCKU_REINSTATE;
	else
		argsp->ctag = TAG_LOCKU;
	locku_args = &argop->nfs_argop4_u.oplocku;
	*locku_argsp = locku_args;

	/*
	 * XXX what should locku_args->locktype be?
	 * setting to ALWAYS be READ_LT so at least
	 * it is a valid locktype.
	 */

	locku_args->locktype = READ_LT;

	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid;

	/*
	 * Get the lock owner stateid.  If no lock owner
	 * exists, return success.
	 */
	lop = find_lock_owner(rp, pid, LOWN_ANY);
	*lopp = lop;
	/* a "special" stateid means the original LOCK never succeeded OTW */
	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
		is_spec = TRUE;
	if (!lop || is_spec) {
		/*
		 * No lock owner so no locks to unlock.
		 * Return success.  If there was a failed
		 * reclaim earlier, the lock might still be
		 * registered with the local locking code,
		 * so notify it of the unlock.
		 *
		 * If the lockowner is using a special stateid,
		 * then the original lock request (that created
		 * this lockowner) was never successful, so we
		 * have no lock to undo OTW.
		 */
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
		    "(%ld) so return success", (long)pid));

		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		/*
		 * Release our hold and NULL out so final_cleanup
		 * doesn't try to end a lock seqid sync we
		 * never started.
		 */
		if (is_spec) {
			lock_owner_rele(lop);
			*lopp = NULL;
		}
		*skip_get_err = TRUE;
		*go_otwp = FALSE;
		return;
	}

	/* Serialize on the lock owner's seqid before building the request. */
	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
	if (ep->error == EAGAIN) {
		lock_owner_rele(lop);
		*lopp = NULL;
		return;
	}

	/* Snapshot the stateid under lo_lock; next seqid is current + 1. */
	mutex_enter(&lop->lo_lock);
	locku_args->lock_stateid = lop->lock_stateid;
	mutex_exit(&lop->lo_lock);
	locku_args->seqid = lop->lock_seqid + 1;

	/* leave the ref count on lop, rele after RPC call */

	locku_args->offset = flk->l_start;
	locku_args->length = flk->l_len;
	/* zero l_len means "to EOF"; NFSv4 encodes that as all-ones */
	if (flk->l_len == 0)
		locku_args->length = ~locku_args->length;

	*go_otwp = TRUE;
}
13422 13430
/*
 * Setup the LOCK4 arguments.
 *
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error
 *
 * On NFS4_OK, *oopp/*ospp/*lopp carry held references (with open and
 * lock seqid syncs acquired) that the caller must release.
 */
static void
nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
    flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
{
	LOCK4args		*lock_args;
	nfs4_open_owner_t	*oop = NULL;
	nfs4_open_stream_t	*osp = NULL;
	nfs4_lock_owner_t	*lop = NULL;
	pid_t			pid;
	rnode4_t		*rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* A held delegation may require a reopen before locking. */
	nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
	if (ep->error || ep->stat != NFS4_OK)
		return;

	argop->argop = OP_LOCK;
	if (ctype == NFS4_LCK_CTYPE_NORM)
		argsp->ctag = TAG_LOCK;
	else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
		argsp->ctag = TAG_RELOCK;
	else
		argsp->ctag = TAG_LOCK_REINSTATE;
	lock_args = &argop->nfs_argop4_u.oplock;
	lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
	lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
	/*
	 * Get the lock owner.  If no lock owner exists,
	 * create a 'temporary' one and grab the open seqid
	 * synchronization (which puts a hold on the open
	 * owner and open stream).
	 * This also grabs the lock seqid synchronization.
	 */
	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
	ep->stat =
	    nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);

	if (ep->stat != NFS4_OK)
		goto out;

	nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
	    &lock_args->locker);

	lock_args->offset = flk->l_start;
	lock_args->length = flk->l_len;
	/* zero l_len means "to EOF"; NFSv4 encodes that as all-ones */
	if (flk->l_len == 0)
		lock_args->length = ~lock_args->length;
	*lock_argsp = lock_args;
out:
	/* hand any acquired state back to the caller, even on error */
	*oopp = oop;
	*ospp = osp;
	*lopp = lop;
}
13487 13495
/*
 * After we get the reply from the server, record the proper information
 * for possible resend lock requests.
 *
 * Allocates memory for the saved_rqstp if we have a lost lock to save.
 *
 * 'error' is the RPC-level error from the lock/locku call; only
 * ETIMEDOUT, EINTR, or a forced-unmount error cause the request to be
 * recorded as "lost" (lost_rqstp->lr_op set non-zero) for later resend
 * by the recovery framework.
 */
static void
nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
    nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
    nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
{
	bool_t unlock = (flk->l_type == F_UNLCK);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE);

	/* A failed LOCK (not LOCKU) leaves a pending request on the owner. */
	if (error != 0 && !unlock) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
		    " for lop %p", (void *)lop));
		ASSERT(lop != NULL);
		mutex_enter(&lop->lo_lock);
		lop->lo_pending_rqsts = 1;
		mutex_exit(&lop->lo_lock);
	}

	lost_rqstp->lr_putfirst = FALSE;
	lost_rqstp->lr_op = 0;

	/*
	 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
	 * recovery purposes so that the lock request that was sent
	 * can be saved and re-issued later. Ditto for EIO from a forced
	 * unmount. This is done to have the client's local locking state
	 * match the v4 server's state; that is, the request was
	 * potentially received and accepted by the server but the client
	 * thinks it was not.
	 */
	if (error == ETIMEDOUT || error == EINTR ||
	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: got a lost %s lock for "
		    "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
		    (void *)lop, (void *)oop, (void *)osp));
		if (unlock)
			lost_rqstp->lr_op = OP_LOCKU;
		else {
			lost_rqstp->lr_op = OP_LOCK;
			lost_rqstp->lr_locktype = locktype;
		}
		/*
		 * Objects are held and rele'd via the recovery code.
		 * See nfs4_save_lost_rqst.
		 */
		lost_rqstp->lr_vp = vp;
		lost_rqstp->lr_dvp = NULL;
		lost_rqstp->lr_oop = oop;
		lost_rqstp->lr_osp = osp;
		lost_rqstp->lr_lop = lop;
		lost_rqstp->lr_cr = cr;
		switch (ctype) {
		case NFS4_LCK_CTYPE_NORM:
			flk->l_pid = ttoproc(curthread)->p_pid;
			lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			/* reinstate requests go to the front of the queue */
			lost_rqstp->lr_putfirst = TRUE;
			lost_rqstp->lr_ctype = ctype;
			break;
		default:
			break;
		}
		lost_rqstp->lr_flk = flk;
	}
}
13567 13575
13568 13576 /*
13569 13577 * Update lop's seqid. Also update the seqid stored in a resend request,
13570 13578 * if any. (Some recovery errors increment the seqid, and we may have to
13571 13579 * send the resend request again.)
13572 13580 */
13573 13581
13574 13582 static void
13575 13583 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13576 13584 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13577 13585 {
13578 13586 if (lock_args) {
13579 13587 if (lock_args->locker.new_lock_owner == TRUE)
13580 13588 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13581 13589 else {
13582 13590 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13583 13591 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13584 13592 }
13585 13593 } else if (locku_args) {
13586 13594 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13587 13595 nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13588 13596 }
13589 13597 }
13590 13598
/*
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 * Switches the *cred_otwp to base_cr.
 *
 * On return *argspp, *respp, *lopp, *ospp, and *oopp are all NULL and
 * their underlying resources released, so the caller can rebuild the
 * compound from scratch with the new credential.
 */
static void
nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
    nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
    nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
{
	nfs4_open_owner_t	*oop = *oopp;
	nfs4_open_stream_t	*osp = *ospp;
	nfs4_lock_owner_t	*lop = *lopp;
	nfs_argop4		*argop = (*argspp)->array;

	if (*did_start_fop) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}
	/* free the second (LOCK/LOCKT) op, then the op array itself */
	ASSERT((*argspp)->array_len == 2);
	if (argop[1].argop == OP_LOCK)
		nfs4args_lock_free(&argop[1]);
	else if (argop[1].argop == OP_LOCKT)
		nfs4args_lockt_free(&argop[1]);
	kmem_free(argop, 2 * sizeof (nfs_argop4));
	/* results were only decoded if the RPC itself succeeded */
	if (!error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
	*argspp = NULL;
	*respp = NULL;

	if (lop) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
		*lopp = NULL;
	}

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, VTOR4(vp));
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	/* retry will go out with the caller-supplied base credential */
	crfree(*cred_otwp);
	*cred_otwp = base_cr;
	crhold(*cred_otwp);
}
13647 13655
13648 13656 /*
13649 13657 * Function to process the client's recovery for nfs4frlock.
13650 13658 * Returns TRUE if we should retry the lock request; FALSE otherwise.
13651 13659 *
13652 13660 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13653 13661 * COMPOUND4 args/res for calls that need to retry.
13654 13662 *
13655 13663 * Note: the rp's r_lkserlock is *not* dropped during this path.
13656 13664 */
static bool_t
nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
    LOCK4args *lock_args, LOCKU4args *locku_args,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
    nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
    bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;

	bool_t abort, retry;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT((*argspp) != NULL);
	ASSERT((*respp) != NULL);
	if (lock_args || locku_args)
		ASSERT(lop != NULL);

	NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
	    (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));

	/* Default: tell the caller to redrive the lock request. */
	retry = TRUE;
	abort = FALSE;
	if (needrecov) {
		nfs4_bseqid_entry_t *bsep = NULL;
		nfs_opnum4 op;

		/* Which lock op failed determines what we hand to recovery. */
		op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;

		if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
			seqid4 seqid;

			/*
			 * Pull out the seqid the server rejected so the
			 * recovery framework can resynchronize the owner.
			 * A new lock owner carries the open owner's seqid;
			 * an existing one carries the lock owner's seqid.
			 */
			if (lock_args) {
				if (lock_args->locker.new_lock_owner == TRUE)
					seqid = lock_args->locker.locker4_u.
					    open_owner.open_seqid;
				else
					seqid = lock_args->locker.locker4_u.
					    lock_owner.lock_seqid;
			} else if (locku_args) {
				seqid = locku_args->seqid;
			} else {
				/* LOCKT carries no seqid. */
				seqid = 0;
			}

			bsep = nfs4_create_bseqid_entry(oop, lop, vp,
			    flk->l_pid, (*argspp)->ctag, seqid);
		}

		/*
		 * Only LOCK/LOCKU can leave lost-request state behind;
		 * pass the lost request only in those cases.
		 */
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
		    lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
		    NULL, op, bsep, NULL, NULL);

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
	}

	/*
	 * Return that we do not want to retry the request for 3 cases:
	 * 1. If we received EINTR or are bailing out because of a forced
	 * unmount, we came into this code path just for the sake of
	 * initiating recovery, we now need to return the error.
	 * 2. If we have aborted recovery.
	 * 3. We received NFS4ERR_BAD_SEQID.
	 */
	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
	    abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
		retry = FALSE;

	if (*did_start_fop == TRUE) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}

	/*
	 * On retry the caller rebuilds the compound from scratch, so free
	 * the current args (and res, if the call got far enough to decode
	 * one) here.
	 */
	if (retry == TRUE) {
		nfs_argop4 *argop;

		argop = (*argspp)->array;
		ASSERT((*argspp)->array_len == 2);

		if (argop[1].argop == OP_LOCK)
			nfs4args_lock_free(&argop[1]);
		else if (argop[1].argop == OP_LOCKT)
			nfs4args_lockt_free(&argop[1]);
		kmem_free(argop, 2 * sizeof (nfs_argop4));
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
		*respp = NULL;
		*argspp = NULL;
	}

	/* Drop the lock-owner seqid sync and reference in all cases. */
	if (lop != NULL) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
	}

	*lopp = NULL;

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, rp);
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	return (retry);
}
13775 13783
13776 13784 /*
13777 13785 * Handles the successful reply from the server for nfs4frlock.
13778 13786 */
13779 13787 static void
13780 13788 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13781 13789 vnode_t *vp, int flag, u_offset_t offset,
13782 13790 nfs4_lost_rqst_t *resend_rqstp)
13783 13791 {
13784 13792 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13785 13793 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13786 13794 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13787 13795 if (ctype == NFS4_LCK_CTYPE_NORM) {
13788 13796 flk->l_pid = ttoproc(curthread)->p_pid;
13789 13797 /*
13790 13798 * We do not register lost locks locally in
13791 13799 * the 'resend' case since the user/application
13792 13800 * doesn't think we have the lock.
13793 13801 */
13794 13802 ASSERT(!resend_rqstp);
13795 13803 nfs4_register_lock_locally(vp, flk, flag, offset);
13796 13804 }
13797 13805 }
13798 13806 }
13799 13807
13800 13808 /*
13801 13809 * Handle the DENIED reply from the server for nfs4frlock.
13802 13810 * Returns TRUE if we should retry the request; FALSE otherwise.
13803 13811 *
13804 13812 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13805 13813 * COMPOUND4 args/res for calls that need to retry. Can also
13806 13814 * drop and regrab the r_lkserlock.
13807 13815 */
13808 13816 static bool_t
13809 13817 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13810 13818 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13811 13819 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13812 13820 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13813 13821 nfs4_recov_state_t *recov_statep, int needrecov,
13814 13822 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13815 13823 clock_t *tick_delayp, short *whencep, int *errorp,
13816 13824 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13817 13825 bool_t *skip_get_err)
13818 13826 {
13819 13827 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13820 13828
13821 13829 if (lock_args) {
13822 13830 nfs4_open_owner_t *oop = *oopp;
13823 13831 nfs4_open_stream_t *osp = *ospp;
13824 13832 nfs4_lock_owner_t *lop = *lopp;
13825 13833 int intr;
13826 13834
13827 13835 /*
13828 13836 * Blocking lock needs to sleep and retry from the request.
13829 13837 *
13830 13838 * Do not block and wait for 'resend' or 'reinstate'
13831 13839 * lock requests, just return the error.
13832 13840 *
13833 13841 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13834 13842 */
13835 13843 if (cmd == F_SETLKW) {
13836 13844 rnode4_t *rp = VTOR4(vp);
13837 13845 nfs_argop4 *argop = (*argspp)->array;
13838 13846
13839 13847 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13840 13848
13841 13849 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13842 13850 recov_statep, needrecov);
13843 13851 *did_start_fop = FALSE;
13844 13852 ASSERT((*argspp)->array_len == 2);
13845 13853 if (argop[1].argop == OP_LOCK)
13846 13854 nfs4args_lock_free(&argop[1]);
13847 13855 else if (argop[1].argop == OP_LOCKT)
13848 13856 nfs4args_lockt_free(&argop[1]);
13849 13857 kmem_free(argop, 2 * sizeof (nfs_argop4));
13850 13858 if (*respp)
13851 13859 (void) xdr_free(xdr_COMPOUND4res_clnt,
13852 13860 (caddr_t)*respp);
13853 13861 *argspp = NULL;
13854 13862 *respp = NULL;
13855 13863 nfs4_end_lock_seqid_sync(lop);
13856 13864 lock_owner_rele(lop);
13857 13865 *lopp = NULL;
13858 13866 if (osp != NULL) {
13859 13867 open_stream_rele(osp, rp);
13860 13868 *ospp = NULL;
13861 13869 }
13862 13870 if (oop != NULL) {
13863 13871 nfs4_end_open_seqid_sync(oop);
13864 13872 open_owner_rele(oop);
13865 13873 *oopp = NULL;
13866 13874 }
13867 13875
13868 13876 nfs_rw_exit(&rp->r_lkserlock);
13869 13877
13870 13878 intr = nfs4_block_and_wait(tick_delayp, rp);
13871 13879
13872 13880 if (intr) {
13873 13881 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13874 13882 RW_WRITER, FALSE);
13875 13883 *errorp = EINTR;
13876 13884 return (FALSE);
13877 13885 }
13878 13886
13879 13887 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13880 13888 RW_WRITER, FALSE);
13881 13889
13882 13890 /*
13883 13891 * Make sure we are still safe to lock with
13884 13892 * regards to mmapping.
13885 13893 */
13886 13894 if (!nfs4_safelock(vp, flk, cr)) {
13887 13895 *errorp = EAGAIN;
13888 13896 return (FALSE);
13889 13897 }
13890 13898
13891 13899 return (TRUE);
13892 13900 }
13893 13901 if (ctype == NFS4_LCK_CTYPE_NORM)
13894 13902 *errorp = EAGAIN;
13895 13903 *skip_get_err = TRUE;
13896 13904 flk->l_whence = 0;
13897 13905 *whencep = 0;
13898 13906 return (FALSE);
13899 13907 } else if (lockt_args) {
13900 13908 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13901 13909 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13902 13910
13903 13911 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13904 13912 flk, lockt_args);
13905 13913
13906 13914 /* according to NLM code */
13907 13915 *errorp = 0;
13908 13916 *whencep = 0;
13909 13917 *skip_get_err = TRUE;
13910 13918 return (FALSE);
13911 13919 }
13912 13920 return (FALSE);
13913 13921 }
13914 13922
13915 13923 /*
13916 13924 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13917 13925 */
13918 13926 static void
13919 13927 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13920 13928 {
13921 13929 switch (resp->status) {
13922 13930 case NFS4ERR_ACCESS:
13923 13931 case NFS4ERR_ADMIN_REVOKED:
13924 13932 case NFS4ERR_BADHANDLE:
13925 13933 case NFS4ERR_BAD_RANGE:
13926 13934 case NFS4ERR_BAD_SEQID:
13927 13935 case NFS4ERR_BAD_STATEID:
13928 13936 case NFS4ERR_BADXDR:
13929 13937 case NFS4ERR_DEADLOCK:
13930 13938 case NFS4ERR_DELAY:
13931 13939 case NFS4ERR_EXPIRED:
13932 13940 case NFS4ERR_FHEXPIRED:
13933 13941 case NFS4ERR_GRACE:
13934 13942 case NFS4ERR_INVAL:
13935 13943 case NFS4ERR_ISDIR:
13936 13944 case NFS4ERR_LEASE_MOVED:
13937 13945 case NFS4ERR_LOCK_NOTSUPP:
13938 13946 case NFS4ERR_LOCK_RANGE:
13939 13947 case NFS4ERR_MOVED:
13940 13948 case NFS4ERR_NOFILEHANDLE:
13941 13949 case NFS4ERR_NO_GRACE:
13942 13950 case NFS4ERR_OLD_STATEID:
13943 13951 case NFS4ERR_OPENMODE:
13944 13952 case NFS4ERR_RECLAIM_BAD:
13945 13953 case NFS4ERR_RECLAIM_CONFLICT:
13946 13954 case NFS4ERR_RESOURCE:
13947 13955 case NFS4ERR_SERVERFAULT:
13948 13956 case NFS4ERR_STALE:
13949 13957 case NFS4ERR_STALE_CLIENTID:
13950 13958 case NFS4ERR_STALE_STATEID:
13951 13959 return;
13952 13960 default:
13953 13961 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13954 13962 "nfs4frlock_results_default: got unrecognizable "
13955 13963 "res.status %d", resp->status));
13956 13964 *errorp = NFS4ERR_INVAL;
13957 13965 }
13958 13966 }
13959 13967
13960 13968 /*
13961 13969 * The lock request was successful, so update the client's state.
13962 13970 */
13963 13971 static void
13964 13972 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13965 13973 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13966 13974 vnode_t *vp, flock64_t *flk, cred_t *cr,
13967 13975 nfs4_lost_rqst_t *resend_rqstp)
13968 13976 {
13969 13977 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13970 13978
13971 13979 if (lock_args) {
13972 13980 LOCK4res *lock_res;
13973 13981
13974 13982 lock_res = &resop->nfs_resop4_u.oplock;
13975 13983 /* update the stateid with server's response */
13976 13984
13977 13985 if (lock_args->locker.new_lock_owner == TRUE) {
13978 13986 mutex_enter(&lop->lo_lock);
13979 13987 lop->lo_just_created = NFS4_PERM_CREATED;
13980 13988 mutex_exit(&lop->lo_lock);
13981 13989 }
13982 13990
13983 13991 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13984 13992
13985 13993 /*
13986 13994 * If the lock was the result of a resending a lost
13987 13995 * request, we've synched up the stateid and seqid
13988 13996 * with the server, but now the server might be out of sync
13989 13997 * with what the application thinks it has for locks.
13990 13998 * Clean that up here. It's unclear whether we should do
13991 13999 * this even if the filesystem has been forcibly unmounted.
13992 14000 * For most servers, it's probably wasted effort, but
13993 14001 * RFC3530 lets servers require that unlocks exactly match
13994 14002 * the locks that are held.
13995 14003 */
13996 14004 if (resend_rqstp != NULL &&
13997 14005 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13998 14006 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13999 14007 } else {
14000 14008 flk->l_whence = 0;
14001 14009 }
14002 14010 } else if (locku_args) {
14003 14011 LOCKU4res *locku_res;
14004 14012
14005 14013 locku_res = &resop->nfs_resop4_u.oplocku;
14006 14014
14007 14015 /* Update the stateid with the server's response */
14008 14016 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
14009 14017 } else if (lockt_args) {
14010 14018 /* Switch the lock type to express success, see fcntl */
14011 14019 flk->l_type = F_UNLCK;
14012 14020 flk->l_whence = 0;
14013 14021 }
14014 14022 }
14015 14023
14016 14024 /*
14017 14025 * Do final cleanup before exiting nfs4frlock.
14018 14026 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14019 14027 * COMPOUND4 args/res for calls that haven't already.
14020 14028 */
14021 14029 static void
14022 14030 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14023 14031 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14024 14032 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14025 14033 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14026 14034 short whence, u_offset_t offset, struct lm_sysid *ls,
14027 14035 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14028 14036 bool_t did_start_fop, bool_t skip_get_err,
14029 14037 cred_t *cred_otw, cred_t *cred)
14030 14038 {
14031 14039 mntinfo4_t *mi = VTOMI4(vp);
14032 14040 rnode4_t *rp = VTOR4(vp);
14033 14041 int error = *errorp;
14034 14042 nfs_argop4 *argop;
14035 14043 int do_flush_pages = 0;
14036 14044
14037 14045 ASSERT(nfs_zone() == mi->mi_zone);
14038 14046 /*
14039 14047 * The client recovery code wants the raw status information,
14040 14048 * so don't map the NFS status code to an errno value for
14041 14049 * non-normal call types.
14042 14050 */
14043 14051 if (ctype == NFS4_LCK_CTYPE_NORM) {
14044 14052 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14045 14053 *errorp = geterrno4(resp->status);
14046 14054 if (did_start_fop == TRUE)
14047 14055 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14048 14056 needrecov);
14049 14057
14050 14058 /*
14051 14059 * We've established a new lock on the server, so invalidate
14052 14060 * the pages associated with the vnode to get the most up to
14053 14061 * date pages from the server after acquiring the lock. We
14054 14062 * want to be sure that the read operation gets the newest data.
14055 14063 * N.B.
14056 14064 * We used to do this in nfs4frlock_results_ok but that doesn't
14057 14065 * work since VOP_PUTPAGE can call nfs4_commit which calls
14058 14066 * nfs4_start_fop. We flush the pages below after calling
14059 14067 * nfs4_end_fop above
14060 14068 * The flush of the page cache must be done after
14061 14069 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14062 14070 */
14063 14071 if (!error && resp && resp->status == NFS4_OK)
14064 14072 do_flush_pages = 1;
14065 14073 }
14066 14074 if (argsp) {
14067 14075 ASSERT(argsp->array_len == 2);
14068 14076 argop = argsp->array;
14069 14077 if (argop[1].argop == OP_LOCK)
14070 14078 nfs4args_lock_free(&argop[1]);
14071 14079 else if (argop[1].argop == OP_LOCKT)
14072 14080 nfs4args_lockt_free(&argop[1]);
14073 14081 kmem_free(argop, 2 * sizeof (nfs_argop4));
14074 14082 if (resp)
14075 14083 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14076 14084 }
14077 14085
14078 14086 /* free the reference on the lock owner */
14079 14087 if (lop != NULL) {
14080 14088 nfs4_end_lock_seqid_sync(lop);
14081 14089 lock_owner_rele(lop);
14082 14090 }
14083 14091
14084 14092 /* need to free up the reference on osp for lock args */
14085 14093 if (osp != NULL)
14086 14094 open_stream_rele(osp, rp);
14087 14095
14088 14096 /* need to free up the reference on oop for lock args */
14089 14097 if (oop != NULL) {
14090 14098 nfs4_end_open_seqid_sync(oop);
14091 14099 open_owner_rele(oop);
14092 14100 }
14093 14101
14094 14102 if (do_flush_pages)
14095 14103 nfs4_flush_pages(vp, cred);
14096 14104
14097 14105 (void) convoff(vp, flk, whence, offset);
14098 14106
14099 14107 lm_rel_sysid(ls);
14100 14108
14101 14109 /*
14102 14110 * Record debug information in the event we get EINVAL.
14103 14111 */
14104 14112 mutex_enter(&mi->mi_lock);
14105 14113 if (*errorp == EINVAL && (lock_args || locku_args) &&
14106 14114 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14107 14115 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14108 14116 zcmn_err(getzoneid(), CE_NOTE,
14109 14117 "%s operation failed with "
14110 14118 "EINVAL probably since the server, %s,"
14111 14119 " doesn't support POSIX style locking",
14112 14120 lock_args ? "LOCK" : "LOCKU",
14113 14121 mi->mi_curr_serv->sv_hostname);
14114 14122 mi->mi_flags |= MI4_LOCK_DEBUG;
14115 14123 }
14116 14124 }
14117 14125 mutex_exit(&mi->mi_lock);
14118 14126
14119 14127 if (cred_otw)
14120 14128 crfree(cred_otw);
14121 14129 }
14122 14130
14123 14131 /*
14124 14132 * This calls the server and the local locking code.
14125 14133 *
14126 14134 * Client locks are registerred locally by oring the sysid with
14127 14135 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14128 14136 * We need to distinguish between the two to avoid collision in case one
14129 14137 * machine is used as both client and server.
14130 14138 *
14131 14139 * Blocking lock requests will continually retry to acquire the lock
14132 14140 * forever.
14133 14141 *
14134 14142 * The ctype is defined as follows:
14135 14143 * NFS4_LCK_CTYPE_NORM: normal lock request.
14136 14144 *
14137 14145 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14138 14146 * recovery, get the pid from flk instead of curproc, and don't reregister
14139 14147 * the lock locally.
14140 14148 *
14141 14149 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14142 14150 * that we will use the information passed in via resend_rqstp to setup the
14143 14151 * lock/locku request. This resend is the exact same request as the 'lost
14144 14152 * lock', and is initiated by the recovery framework. A successful resend
14145 14153 * request can initiate one or more reinstate requests.
14146 14154 *
14147 14155 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14148 14156 * does not trigger additional reinstate requests. This lock call type is
14149 14157 * set for setting the v4 server's locking state back to match what the
14150 14158 * client's local locking state is in the event of a received 'lost lock'.
14151 14159 *
14152 14160 * Errors are returned via the nfs4_error_t parameter.
14153 14161 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay; /* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly;	/* just queue request */
	int frc_no_reclaim = 0;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	/* Validate arguments and set up the sysid/whence/cred state. */
	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Retry entry point: each pass builds a fresh two-op compound
	 * (PUTFH + LOCK/LOCKU/LOCKT) and sends it over the wire.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		/* Record the unlock locally so the app sees it succeed. */
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		/* Choose LOCKT, LOCKU, or LOCK based on cmd and l_type. */
		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/* Tear down this attempt and start over. */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	if (lock_args && frc_no_reclaim) {
		/* Server denied reclaim earlier; retry as a fresh lock. */
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		/* Retry once with the caller's own credential. */
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* Recovery dropped all our references. */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set.  Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			/* Blocking lock: state was torn down before sleep. */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch.  Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14505 14513
14506 14514 /*
14507 14515 * nfs4_safelock:
14508 14516 *
14509 14517 * Return non-zero if the given lock request can be handled without
14510 14518 * violating the constraints on concurrent mapping and locking.
14511 14519 */
14512 14520
14513 14521 static int
14514 14522 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14515 14523 {
14516 14524 rnode4_t *rp = VTOR4(vp);
14517 14525 struct vattr va;
14518 14526 int error;
14519 14527
14520 14528 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14521 14529 ASSERT(rp->r_mapcnt >= 0);
14522 14530 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14523 14531 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14524 14532 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14525 14533 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14526 14534
14527 14535 if (rp->r_mapcnt == 0)
14528 14536 return (1); /* always safe if not mapped */
14529 14537
14530 14538 /*
14531 14539 * If the file is already mapped and there are locks, then they
14532 14540 * should be all safe locks. So adding or removing a lock is safe
14533 14541 * as long as the new request is safe (i.e., whole-file, meaning
14534 14542 * length and starting offset are both zero).
14535 14543 */
14536 14544
14537 14545 if (bfp->l_start != 0 || bfp->l_len != 0) {
14538 14546 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14539 14547 "cannot lock a memory mapped file unless locking the "
14540 14548 "entire file: start %"PRIx64", len %"PRIx64,
14541 14549 bfp->l_start, bfp->l_len));
14542 14550 return (0);
14543 14551 }
14544 14552
14545 14553 /* mandatory locking and mapping don't mix */
14546 14554 va.va_mask = AT_MODE;
14547 14555 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14548 14556 if (error != 0) {
14549 14557 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14550 14558 "getattr error %d", error));
14551 14559 return (0); /* treat errors conservatively */
14552 14560 }
14553 14561 if (MANDLOCK(vp, va.va_mode)) {
14554 14562 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14555 14563 "cannot mandatory lock and mmap a file"));
14556 14564 return (0);
14557 14565 }
14558 14566
14559 14567 return (1);
14560 14568 }
14561 14569
14562 14570
14563 14571 /*
14564 14572 * Register the lock locally within Solaris.
14565 14573 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14566 14574 * recording locks locally.
14567 14575 *
14568 14576 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14569 14577 * are registered locally.
14570 14578 */
14571 14579 void
14572 14580 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
14573 14581 u_offset_t offset)
14574 14582 {
14575 14583 int oldsysid;
14576 14584 int error;
14577 14585 #ifdef DEBUG
14578 14586 char *name;
14579 14587 #endif
14580 14588
14581 14589 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14582 14590
14583 14591 #ifdef DEBUG
14584 14592 name = fn_name(VTOSV(vp)->sv_name);
14585 14593 NFS4_DEBUG(nfs4_client_lock_debug,
14586 14594 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
14587 14595 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
14588 14596 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
14589 14597 flk->l_sysid));
14590 14598 kmem_free(name, MAXNAMELEN);
14591 14599 #endif
14592 14600
14593 14601 /* register the lock with local locking */
14594 14602 oldsysid = flk->l_sysid;
14595 14603 flk->l_sysid |= LM_SYSID_CLIENT;
14596 14604 error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
14597 14605 #ifdef DEBUG
14598 14606 if (error != 0) {
14599 14607 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14600 14608 "nfs4_register_lock_locally: could not register with"
14601 14609 " local locking"));
14602 14610 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14603 14611 "error %d, vp 0x%p, pid %d, sysid 0x%x",
14604 14612 error, (void *)vp, flk->l_pid, flk->l_sysid));
14605 14613 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14606 14614 "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
14607 14615 flk->l_type, flk->l_start, flk->l_len));
14608 14616 (void) reclock(vp, flk, 0, flag, offset, NULL);
14609 14617 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14610 14618 "blocked by pid %d sysid 0x%x type %d "
14611 14619 "off 0x%" PRIx64 " len 0x%" PRIx64,
14612 14620 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
14613 14621 flk->l_len));
14614 14622 }
14615 14623 #endif
14616 14624 flk->l_sysid = oldsysid;
14617 14625 }
14618 14626
/*
 * nfs4_lockrelease:
 *
 * Release any locks on the given vnode that are held by the current
 * process.  Also removes the lock owner (if one exists) from the rnode's
 * list.  Returns 0 on success or an errno value.
 */
static int
nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
{
	flock64_t ld;
	int ret, error;
	rnode4_t *rp;
	nfs4_lock_owner_t *lop;
	nfs4_recov_state_t recov_state;
	mntinfo4_t *mi;
	bool_t possible_orphan = FALSE;
	bool_t recovonly;

	ASSERT((uintptr_t)vp > KERNELBASE);
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we have not locked anything then we can
	 * just return since we have no work to do.
	 */
	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
		return (0);
	}

	/*
	 * We need to comprehend that another thread may
	 * kick off recovery and the lock_owner we have stashed
	 * in lop might be invalid so we should NOT cache it
	 * locally!
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		/* Flag dangling lock owners so recovery can clean them up. */
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	/*
	 * Check if the lock owner might have a lock (request was sent but
	 * no response was received).  Also check if there are any remote
	 * locks on the file.  (In theory we shouldn't have to make this
	 * second check if there's no lock owner, but for now we'll be
	 * conservative and do it anyway.)  If either condition is true,
	 * send an unlock for the entire file to the server.
	 *
	 * Note that no explicit synchronization is needed here.  At worst,
	 * flk_has_remote_locks() will return a false positive, in which case
	 * the unlock call wastes time but doesn't harm correctness.
	 */

	if (lop) {
		mutex_enter(&lop->lo_lock);
		possible_orphan = lop->lo_pending_rqsts;
		mutex_exit(&lop->lo_lock);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
	    (void *)lop));

	if (possible_orphan || flk_has_remote_locks(vp)) {
		ld.l_type = F_UNLCK;	/* set to unlock entire file */
		ld.l_whence = 0;	/* unlock from start of file */
		ld.l_start = 0;
		ld.l_len = 0;		/* do entire file */

		/* Full-file unlock OTW; goes through the normal frlock path. */
		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
		    cr, NULL);

		if (ret != 0) {
			/*
			 * If VOP_FRLOCK fails, make sure we unregister
			 * local locks before we continue.
			 */
			ld.l_pid = ttoproc(curthread)->p_pid;
			nfs4_register_lock_locally(vp, &ld, flag, offset);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_lockrelease: lock release error on vp"
			    " %p: error %d.\n", (void *)vp, ret));
		}
	}

	/*
	 * Re-enter the fop; recovery may have run while we were OTW above,
	 * so the lock owner must be looked up again below.
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * So, here we're going to need to retrieve the lock-owner
	 * again (in case recovery has done a switch-a-roo) and
	 * remove it because we can.
	 */
	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	if (lop) {
		nfs4_rnode_remove_lock_owner(rp, lop);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
	return (0);
}
14746 14754
/*
 * Wait for 'tick_delay' clock ticks.
 * Implement exponential backoff until we hit the lease_time of this
 * nfs4_server.
 *
 * The client should retry to acquire the lock faster than the lease period.
 * We use roughly half of the lease time to use a similar calculation as it is
 * used in nfs4_renew_lease_thread().
 *
 * XXX For future improvements, should implement a waiting queue scheme.
 */
static int
nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
{
	/* Fallback ceiling if the server info can't be consulted below. */
	long max_msec_delay = 1 * 1000; /* 1 sec */
	nfs4_server_t *sp;
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));

	/* wait tick_delay clock ticks, or return early on a signal */
	if (delay_sig(*tick_delay)) {
		return (EINTR);
	}

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
	    "reissue the lock request: blocked for %ld clock ticks: %ld "
	    "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));

	/*
	 * Get the current lease time and propagation time for the server
	 * associated with the given file.  Note that both times could
	 * change immediately after this section.
	 */
	nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	/* on success, sp is returned with s_lock held — released below */
	if (sp != NULL) {
		if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) {
			/*
			 * Cap the retry delay at half the lease time minus
			 * three propagation delays, mirroring the margin
			 * used by nfs4_renew_lease_thread().
			 */
			max_msec_delay = sp->s_lease_time * 1000 / 2 -
			    (3 * sp->propagation_delay.tv_sec *
			    1000);
		}
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	}
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * The computed cap can be small (or negative) for short leases;
	 * never let it drop below the tunable base wait time.
	 */
	max_msec_delay = MAX(max_msec_delay, nfs4_base_wait_time);
	/* exponential backoff, clamped to the ceiling computed above */
	*tick_delay = MIN(drv_usectohz(max_msec_delay * 1000), *tick_delay * 2);
	return (0);
}
14780 14803
14781 -
/* Module initialization hook for the NFSv4 vnode ops; nothing to set up. */
void
nfs4_vnops_init(void)
{
}
14786 14808
/* Module teardown hook for the NFSv4 vnode ops; nothing to tear down. */
void
nfs4_vnops_fini(void)
{
}
14791 14813
/*
 * Return a reference to the directory (parent) vnode for a given vnode,
 * using the saved pathname information and the directory file handle.  The
 * caller is responsible for disposing of the reference.
 * Returns zero or an errno value.
 *
 * Caller should set need_start_op to FALSE if it is the recovery
 * thread, or if a start_fop has already been done.  Otherwise, TRUE.
 */
int
vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
{
	svnode_t *svnp;
	vnode_t *dvp = NULL;
	servinfo4_t *svp;
	nfs4_fname_t *mfname;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (vp->v_flag & VROOT) {
		/*
		 * "File" mount: build the parent node directly from the
		 * parent file handle saved in the servinfo4.
		 */
		nfs4_sharedfh_t *sfh;
		nfs_fh4 fh;
		mntinfo4_t *mi;

		ASSERT(vp->v_type == VREG);

		mi = VTOMI4(vp);
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh = sfh4_get(&fh, VTOMI4(vp));
		nfs_rw_exit(&svp->sv_lock);
		/* hold mi_fname; makenfs4node_by_fh may consume the hold */
		mfname = mi->mi_fname;
		fn_hold(mfname);
		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
		sfh4_rele(&sfh);

		if (dvp->v_type == VNON)
			dvp->v_type = VDIR;
		*dvpp = dvp;
		return (0);
	}

	svnp = VTOSV(vp);

	if (svnp == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node is NULL"));
		return (EINVAL);
	}

	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node name or dfh val == NULL"));
		return (EINVAL);
	}

	/* normal case: look the parent up via the saved directory fh */
	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
	    (int)need_start_op);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned %d", error));
		return (error);
	}
	if (!dvp) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned a NULL dvp"));
		return (EIO);
	}
	if (dvp->v_type == VNON)
		dvp->v_type = VDIR;
	ASSERT(dvp->v_type == VDIR);
	/* parent of an extended-attribute file is the xattr directory */
	if (VTOR4(vp)->r_flags & R4ISXATTR) {
		mutex_enter(&dvp->v_lock);
		dvp->v_flag |= V_XATTRDIR;
		mutex_exit(&dvp->v_lock);
	}
	*dvpp = dvp;
	return (0);
}
14874 14896
14875 14897 /*
14876 14898 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14877 14899 * length that fnamep can accept, including the trailing null.
14878 14900 * Returns 0 if okay, returns an errno value if there was a problem.
14879 14901 */
14880 14902
14881 14903 int
14882 14904 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14883 14905 {
14884 14906 char *fn;
14885 14907 int err = 0;
14886 14908 servinfo4_t *svp;
14887 14909 svnode_t *shvp;
14888 14910
14889 14911 /*
14890 14912 * If the file being opened has VROOT set, then this is
14891 14913 * a "file" mount. sv_name will not be interesting, so
14892 14914 * go back to the servinfo4 to get the original mount
14893 14915 * path and strip off all but the final edge. Otherwise
14894 14916 * just return the name from the shadow vnode.
14895 14917 */
14896 14918
14897 14919 if (vp->v_flag & VROOT) {
14898 14920
14899 14921 svp = VTOMI4(vp)->mi_curr_serv;
14900 14922 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14901 14923
14902 14924 fn = strrchr(svp->sv_path, '/');
14903 14925 if (fn == NULL)
14904 14926 err = EINVAL;
14905 14927 else
14906 14928 fn++;
14907 14929 } else {
14908 14930 shvp = VTOSV(vp);
14909 14931 fn = fn_name(shvp->sv_name);
14910 14932 }
14911 14933
14912 14934 if (err == 0)
14913 14935 if (strlen(fn) < maxlen)
14914 14936 (void) strcpy(fnamep, fn);
14915 14937 else
14916 14938 err = ENAMETOOLONG;
14917 14939
14918 14940 if (vp->v_flag & VROOT)
14919 14941 nfs_rw_exit(&svp->sv_lock);
14920 14942 else
14921 14943 kmem_free(fn, MAXNAMELEN);
14922 14944
14923 14945 return (err);
14924 14946 }
14925 14947
/*
 * Bookkeeping for a close that doesn't need to go over the wire.
 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
 * it is left at 1.
 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	/* caller must hold os_sync_lock and say so via *have_lockp */
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/* nothing to do if the stream is invalid or still in use */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count can not go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}
14968 14990
/*
 * Close all remaining open streams on the rnode.  These open streams
 * could be here because:
 * - The close attempted at either close or delmap failed
 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
 * - Someone did mknod on a regular file but never opened it
 * Returns 0, or the first error encountered while closing streams.
 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1.  Since then,
	 * the vnode could have been re-activated.  We want to
	 * loop through the open streams and close each one, but
	 * we have to be careful since once we release the rnode
	 * hash bucket lock, someone else is free to come in and
	 * re-activate the rnode and add new open streams.  The
	 * strategy is take the rnode hash bucket lock, verify that
	 * the count is still 1, grab the open stream off the
	 * head of the list and mark it invalid, then release the
	 * rnode hash bucket lock and proceed with that open stream.
	 * This is ok because nfs4close_one() will acquire the proper
	 * open/create to close/destroy synchronization for open
	 * streams, and will ensure that if someone has reopened
	 * the open stream after we've dropped the hash bucket lock
	 * then we'll just simply return without destroying the
	 * open stream.
	 * Repeat until the list is empty.
	 */

	for (;;) {

		/* make sure vnode hasn't been reactivated */
		rw_enter(&rp->r_hashq->r_lock, RW_READER);
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			mutex_exit(&vp->v_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}
		/*
		 * Grabbing r_os_lock before releasing v_lock prevents
		 * a window where the rnode/open stream could get
		 * reactivated (and os_force_close set to 0) before we
		 * had a chance to set os_force_close to 1.
		 */
		mutex_enter(&rp->r_os_lock);
		mutex_exit(&vp->v_lock);

		osp = list_head(&rp->r_open_streams);
		if (!osp) {
			/* nothing left to CLOSE OTW, so return */
			mutex_exit(&rp->r_os_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}

		mutex_enter(&rp->r_statev4_lock);
		/* the file can't still be mem mapped */
		ASSERT(rp->r_mapcnt == 0);
		if (rp->created_v4)
			rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		/*
		 * Grab a ref on this open stream; nfs4close_one
		 * will mark it as invalid
		 */
		mutex_enter(&osp->os_sync_lock);
		osp->os_ref_count++;
		osp->os_force_close = 1;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);

		/* all locks dropped: safe to do the (possibly OTW) close */
		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);

		/* Update error if it isn't already non-zero */
		if (error == 0) {
			if (e.error)
				error = e.error;
			else if (e.stat)
				error = geterrno4(e.stat);
		}

#ifdef DEBUG
		nfs4close_all_cnt++;
#endif
		/* Release the ref on osp acquired above. */
		open_stream_rele(osp, rp);

		/* Proceed to the next open stream, if any */
	}
	return (error);
}
15075 15097
15076 15098 /*
15077 15099 * nfs4close_one - close one open stream for a file if needed.
15078 15100 *
15079 15101 * "close_type" indicates which close path this is:
15080 15102 * CLOSE_NORM: close initiated via VOP_CLOSE.
15081 15103 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15082 15104 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15083 15105 * the close and release of client state for this open stream
15084 15106 * (unless someone else has the open stream open).
15085 15107 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15086 15108 * (e.g., due to abort because of a signal).
15087 15109 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15088 15110 *
15089 15111 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15090 15112 * recovery. Instead, the caller is expected to deal with retries.
15091 15113 *
15092 15114 * The caller can either pass in the osp ('provided_osp') or not.
15093 15115 *
15094 15116 * 'access_bits' represents the access we are closing/downgrading.
15095 15117 *
15096 15118 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15097 15119 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15098 15120 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15099 15121 *
15100 15122 * Errors are returned via the nfs4_error_t.
15101 15123 */
15102 15124 void
15103 15125 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15104 15126 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15105 15127 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15106 15128 uint_t mmap_flags)
15107 15129 {
15108 15130 nfs4_open_owner_t *oop;
15109 15131 nfs4_open_stream_t *osp = NULL;
15110 15132 int retry = 0;
15111 15133 int num_retries = NFS4_NUM_RECOV_RETRIES;
15112 15134 rnode4_t *rp;
15113 15135 mntinfo4_t *mi;
15114 15136 nfs4_recov_state_t recov_state;
15115 15137 cred_t *cred_otw = NULL;
15116 15138 bool_t recovonly = FALSE;
15117 15139 int isrecov;
15118 15140 int force_close;
15119 15141 int close_failed = 0;
15120 15142 int did_dec_count = 0;
15121 15143 int did_start_op = 0;
15122 15144 int did_force_recovlock = 0;
15123 15145 int did_start_seqid_sync = 0;
15124 15146 int have_sync_lock = 0;
15125 15147
15126 15148 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15127 15149
15128 15150 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15129 15151 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15130 15152 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15131 15153 len, maxprot, mmap_flags, access_bits));
15132 15154
15133 15155 nfs4_error_zinit(ep);
15134 15156 rp = VTOR4(vp);
15135 15157 mi = VTOMI4(vp);
15136 15158 isrecov = (close_type == CLOSE_RESEND ||
15137 15159 close_type == CLOSE_AFTER_RESEND);
15138 15160
15139 15161 /*
15140 15162 * First get the open owner.
15141 15163 */
15142 15164 if (!provided_osp) {
15143 15165 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15144 15166 } else {
15145 15167 oop = provided_osp->os_open_owner;
15146 15168 ASSERT(oop != NULL);
15147 15169 open_owner_hold(oop);
15148 15170 }
15149 15171
15150 15172 if (!oop) {
15151 15173 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15152 15174 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15153 15175 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15154 15176 (void *)provided_osp, close_type));
15155 15177 ep->error = EIO;
15156 15178 goto out;
15157 15179 }
15158 15180
15159 15181 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15160 15182 recov_retry:
15161 15183 osp = NULL;
15162 15184 close_failed = 0;
15163 15185 force_close = (close_type == CLOSE_FORCE);
15164 15186 retry = 0;
15165 15187 did_start_op = 0;
15166 15188 did_force_recovlock = 0;
15167 15189 did_start_seqid_sync = 0;
15168 15190 have_sync_lock = 0;
15169 15191 recovonly = FALSE;
15170 15192 recov_state.rs_flags = 0;
15171 15193 recov_state.rs_num_retry_despite_err = 0;
15172 15194
15173 15195 /*
15174 15196 * Second synchronize with recovery.
15175 15197 */
15176 15198 if (!isrecov) {
15177 15199 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15178 15200 &recov_state, &recovonly);
15179 15201 if (!ep->error) {
15180 15202 did_start_op = 1;
15181 15203 } else {
15182 15204 close_failed = 1;
15183 15205 /*
15184 15206 * If we couldn't get start_fop, but have to
15185 15207 * cleanup state, then at least acquire the
15186 15208 * mi_recovlock so we can synchronize with
15187 15209 * recovery.
15188 15210 */
15189 15211 if (close_type == CLOSE_FORCE) {
15190 15212 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15191 15213 RW_READER, FALSE);
15192 15214 did_force_recovlock = 1;
15193 15215 } else
15194 15216 goto out;
15195 15217 }
15196 15218 }
15197 15219
15198 15220 /*
15199 15221 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15200 15222 * set 'recovonly' to TRUE since most likely this is due to
15201 15223 * reovery being active (MI4_RECOV_ACTIV). If recovery is active,
15202 15224 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15203 15225 * to retry, causing us to loop until recovery finishes. Plus we
15204 15226 * don't need protection over the open seqid since we're not going
15205 15227 * OTW, hence don't need to use the seqid.
15206 15228 */
15207 15229 if (recovonly == FALSE) {
15208 15230 /* need to grab the open owner sync before 'os_sync_lock' */
15209 15231 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15210 15232 if (ep->error == EAGAIN) {
15211 15233 ASSERT(!isrecov);
15212 15234 if (did_start_op)
15213 15235 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15214 15236 &recov_state, TRUE);
15215 15237 if (did_force_recovlock)
15216 15238 nfs_rw_exit(&mi->mi_recovlock);
15217 15239 goto recov_retry;
15218 15240 }
15219 15241 did_start_seqid_sync = 1;
15220 15242 }
15221 15243
15222 15244 /*
15223 15245 * Third get an open stream and acquire 'os_sync_lock' to
15224 15246 * sychronize the opening/creating of an open stream with the
15225 15247 * closing/destroying of an open stream.
15226 15248 */
15227 15249 if (!provided_osp) {
15228 15250 /* returns with 'os_sync_lock' held */
15229 15251 osp = find_open_stream(oop, rp);
15230 15252 if (!osp) {
15231 15253 ep->error = EIO;
15232 15254 goto out;
15233 15255 }
15234 15256 } else {
15235 15257 osp = provided_osp;
15236 15258 open_stream_hold(osp);
15237 15259 mutex_enter(&osp->os_sync_lock);
15238 15260 }
15239 15261 have_sync_lock = 1;
15240 15262
15241 15263 ASSERT(oop == osp->os_open_owner);
15242 15264
15243 15265 /*
15244 15266 * Fourth, do any special pre-OTW CLOSE processing
15245 15267 * based on the specific close type.
15246 15268 */
15247 15269 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15248 15270 !did_dec_count) {
15249 15271 ASSERT(osp->os_open_ref_count > 0);
15250 15272 osp->os_open_ref_count--;
15251 15273 did_dec_count = 1;
15252 15274 if (osp->os_open_ref_count == 0)
15253 15275 osp->os_final_close = 1;
15254 15276 }
15255 15277
15256 15278 if (close_type == CLOSE_FORCE) {
15257 15279 /* see if somebody reopened the open stream. */
15258 15280 if (!osp->os_force_close) {
15259 15281 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15260 15282 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15261 15283 "was reopened, vp %p", (void *)osp, (void *)vp));
15262 15284 ep->error = 0;
15263 15285 ep->stat = NFS4_OK;
15264 15286 goto out;
15265 15287 }
15266 15288
15267 15289 if (!osp->os_final_close && !did_dec_count) {
15268 15290 osp->os_open_ref_count--;
15269 15291 did_dec_count = 1;
15270 15292 }
15271 15293
15272 15294 /*
15273 15295 * We can't depend on os_open_ref_count being 0 due to the
15274 15296 * way executables are opened (VN_RELE to match a VOP_OPEN).
15275 15297 */
15276 15298 #ifdef NOTYET
15277 15299 ASSERT(osp->os_open_ref_count == 0);
15278 15300 #endif
15279 15301 if (osp->os_open_ref_count != 0) {
15280 15302 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15281 15303 "nfs4close_one: should panic here on an "
15282 15304 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15283 15305 "since this is probably the exec problem."));
15284 15306
15285 15307 osp->os_open_ref_count = 0;
15286 15308 }
15287 15309
15288 15310 /*
15289 15311 * There is the possibility that nfs4close_one()
15290 15312 * for close_type == CLOSE_DELMAP couldn't find the
15291 15313 * open stream, thus couldn't decrement its os_mapcnt;
15292 15314 * therefore we can't use this ASSERT yet.
15293 15315 */
15294 15316 #ifdef NOTYET
15295 15317 ASSERT(osp->os_mapcnt == 0);
15296 15318 #endif
15297 15319 osp->os_mapcnt = 0;
15298 15320 }
15299 15321
15300 15322 if (close_type == CLOSE_DELMAP && !did_dec_count) {
15301 15323 ASSERT(osp->os_mapcnt >= btopr(len));
15302 15324
15303 15325 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15304 15326 osp->os_mmap_write -= btopr(len);
15305 15327 if (maxprot & PROT_READ)
15306 15328 osp->os_mmap_read -= btopr(len);
15307 15329 if (maxprot & PROT_EXEC)
15308 15330 osp->os_mmap_read -= btopr(len);
15309 15331 /* mirror the PROT_NONE check in nfs4_addmap() */
15310 15332 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15311 15333 !(maxprot & PROT_EXEC))
15312 15334 osp->os_mmap_read -= btopr(len);
15313 15335 osp->os_mapcnt -= btopr(len);
15314 15336 did_dec_count = 1;
15315 15337 }
15316 15338
15317 15339 if (recovonly) {
15318 15340 nfs4_lost_rqst_t lost_rqst;
15319 15341
15320 15342 /* request should not already be in recovery queue */
15321 15343 ASSERT(lrp == NULL);
15322 15344 nfs4_error_init(ep, EINTR);
15323 15345 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15324 15346 osp, cred_otw, vp);
15325 15347 mutex_exit(&osp->os_sync_lock);
15326 15348 have_sync_lock = 0;
15327 15349 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15328 15350 lost_rqst.lr_op == OP_CLOSE ?
15329 15351 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
15330 15352 close_failed = 1;
15331 15353 force_close = 0;
15332 15354 goto close_cleanup;
15333 15355 }
15334 15356
15335 15357 /*
15336 15358 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15337 15359 * we stopped operating on the open owner's <old oo_name, old seqid>
15338 15360 * space, which means we stopped operating on the open stream
15339 15361 * too. So don't go OTW (as the seqid is likely bad, and the
15340 15362 * stateid could be stale, potentially triggering a false
15341 15363 * setclientid), and just clean up the client's internal state.
15342 15364 */
15343 15365 if (osp->os_orig_oo_name != oop->oo_name) {
15344 15366 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15345 15367 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15346 15368 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15347 15369 "oo_name %" PRIx64")",
15348 15370 (void *)osp, (void *)oop, osp->os_orig_oo_name,
15349 15371 oop->oo_name));
15350 15372 close_failed = 1;
15351 15373 }
15352 15374
15353 15375 /* If the file failed recovery, just quit. */
15354 15376 mutex_enter(&rp->r_statelock);
15355 15377 if (rp->r_flags & R4RECOVERR) {
15356 15378 close_failed = 1;
15357 15379 }
15358 15380 mutex_exit(&rp->r_statelock);
15359 15381
15360 15382 /*
15361 15383 * If the force close path failed to obtain start_fop
15362 15384 * then skip the OTW close and just remove the state.
15363 15385 */
15364 15386 if (close_failed)
15365 15387 goto close_cleanup;
15366 15388
15367 15389 /*
15368 15390 * Fifth, check to see if there are still mapped pages or other
15369 15391 * opens using this open stream. If there are then we can't
15370 15392 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15371 15393 */
15372 15394 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15373 15395 nfs4_lost_rqst_t new_lost_rqst;
15374 15396 bool_t needrecov = FALSE;
15375 15397 cred_t *odg_cred_otw = NULL;
15376 15398 seqid4 open_dg_seqid = 0;
15377 15399
15378 15400 if (osp->os_delegation) {
15379 15401 /*
15380 15402 * If this open stream was never OPENed OTW then we
15381 15403 * surely can't DOWNGRADE it (especially since the
15382 15404 * osp->open_stateid is really a delegation stateid
15383 15405 * when os_delegation is 1).
15384 15406 */
15385 15407 if (access_bits & FREAD)
15386 15408 osp->os_share_acc_read--;
15387 15409 if (access_bits & FWRITE)
15388 15410 osp->os_share_acc_write--;
15389 15411 osp->os_share_deny_none--;
15390 15412 nfs4_error_zinit(ep);
15391 15413 goto out;
15392 15414 }
15393 15415 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15394 15416 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15395 15417 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15396 15418 if (needrecov && !isrecov) {
15397 15419 bool_t abort;
15398 15420 nfs4_bseqid_entry_t *bsep = NULL;
15399 15421
15400 15422 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15401 15423 bsep = nfs4_create_bseqid_entry(oop, NULL,
15402 15424 vp, 0,
15403 15425 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15404 15426 open_dg_seqid);
15405 15427
15406 15428 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15407 15429 oop, osp, odg_cred_otw, vp, access_bits, 0);
15408 15430 mutex_exit(&osp->os_sync_lock);
15409 15431 have_sync_lock = 0;
15410 15432 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15411 15433 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15412 15434 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15413 15435 bsep, NULL, NULL);
15414 15436 if (odg_cred_otw)
15415 15437 crfree(odg_cred_otw);
15416 15438 if (bsep)
15417 15439 kmem_free(bsep, sizeof (*bsep));
15418 15440
15419 15441 if (abort == TRUE)
15420 15442 goto out;
15421 15443
15422 15444 if (did_start_seqid_sync) {
15423 15445 nfs4_end_open_seqid_sync(oop);
15424 15446 did_start_seqid_sync = 0;
15425 15447 }
15426 15448 open_stream_rele(osp, rp);
15427 15449
15428 15450 if (did_start_op)
15429 15451 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15430 15452 &recov_state, FALSE);
15431 15453 if (did_force_recovlock)
15432 15454 nfs_rw_exit(&mi->mi_recovlock);
15433 15455
15434 15456 goto recov_retry;
15435 15457 } else {
15436 15458 if (odg_cred_otw)
15437 15459 crfree(odg_cred_otw);
15438 15460 }
15439 15461 goto out;
15440 15462 }
15441 15463
15442 15464 /*
15443 15465 * If this open stream was created as the results of an open
15444 15466 * while holding a delegation, then just release it; no need
15445 15467 * to do an OTW close. Otherwise do a "normal" OTW close.
15446 15468 */
15447 15469 if (osp->os_delegation) {
15448 15470 nfs4close_notw(vp, osp, &have_sync_lock);
15449 15471 nfs4_error_zinit(ep);
15450 15472 goto out;
15451 15473 }
15452 15474
15453 15475 /*
15454 15476 * If this stream is not valid, we're done.
15455 15477 */
15456 15478 if (!osp->os_valid) {
15457 15479 nfs4_error_zinit(ep);
15458 15480 goto out;
15459 15481 }
15460 15482
15461 15483 /*
15462 15484 * Last open or mmap ref has vanished, need to do an OTW close.
15463 15485 * First check to see if a close is still necessary.
15464 15486 */
15465 15487 if (osp->os_failed_reopen) {
15466 15488 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15467 15489 "don't close OTW osp %p since reopen failed.",
15468 15490 (void *)osp));
15469 15491 /*
15470 15492 * Reopen of the open stream failed, hence the
15471 15493 * stateid of the open stream is invalid/stale, and
15472 15494 * sending this OTW would incorrectly cause another
15473 15495 * round of recovery. In this case, we need to set
15474 15496 * the 'os_valid' bit to 0 so another thread doesn't
15475 15497 * come in and re-open this open stream before
15476 15498 * this "closing" thread cleans up state (decrementing
15477 15499 * the nfs4_server_t's state_ref_count and decrementing
15478 15500 * the os_ref_count).
15479 15501 */
15480 15502 osp->os_valid = 0;
15481 15503 /*
15482 15504 * This removes the reference obtained at OPEN; ie,
15483 15505 * when the open stream structure was created.
15484 15506 *
15485 15507 * We don't have to worry about calling 'open_stream_rele'
15486 15508 * since we our currently holding a reference to this
15487 15509 * open stream which means the count can not go to 0 with
15488 15510 * this decrement.
15489 15511 */
15490 15512 ASSERT(osp->os_ref_count >= 2);
15491 15513 osp->os_ref_count--;
15492 15514 nfs4_error_zinit(ep);
15493 15515 close_failed = 0;
15494 15516 goto close_cleanup;
15495 15517 }
15496 15518
15497 15519 ASSERT(osp->os_ref_count > 1);
15498 15520
15499 15521 /*
15500 15522 * Sixth, try the CLOSE OTW.
15501 15523 */
15502 15524 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15503 15525 close_type, ep, &have_sync_lock);
15504 15526
15505 15527 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15506 15528 /*
15507 15529 * Let the recovery thread be responsible for
15508 15530 * removing the state for CLOSE.
15509 15531 */
15510 15532 close_failed = 1;
15511 15533 force_close = 0;
15512 15534 retry = 0;
15513 15535 }
15514 15536
15515 15537 /* See if we need to retry with a different cred */
15516 15538 if ((ep->error == EACCES ||
15517 15539 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15518 15540 cred_otw != cr) {
15519 15541 crfree(cred_otw);
15520 15542 cred_otw = cr;
15521 15543 crhold(cred_otw);
15522 15544 retry = 1;
15523 15545 }
15524 15546
15525 15547 if (ep->error || ep->stat)
15526 15548 close_failed = 1;
15527 15549
15528 15550 if (retry && !isrecov && num_retries-- > 0) {
15529 15551 if (have_sync_lock) {
15530 15552 mutex_exit(&osp->os_sync_lock);
15531 15553 have_sync_lock = 0;
15532 15554 }
15533 15555 if (did_start_seqid_sync) {
15534 15556 nfs4_end_open_seqid_sync(oop);
15535 15557 did_start_seqid_sync = 0;
15536 15558 }
15537 15559 open_stream_rele(osp, rp);
15538 15560
15539 15561 if (did_start_op)
15540 15562 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15541 15563 &recov_state, FALSE);
15542 15564 if (did_force_recovlock)
15543 15565 nfs_rw_exit(&mi->mi_recovlock);
15544 15566 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15545 15567 "nfs4close_one: need to retry the close "
15546 15568 "operation"));
15547 15569 goto recov_retry;
15548 15570 }
15549 15571 close_cleanup:
15550 15572 /*
15551 15573 * Seventh and lastly, process our results.
15552 15574 */
15553 15575 if (close_failed && force_close) {
15554 15576 /*
15555 15577 * It's ok to drop and regrab the 'os_sync_lock' since
15556 15578 * nfs4close_notw() will recheck to make sure the
15557 15579 * "close"/removal of state should happen.
15558 15580 */
15559 15581 if (!have_sync_lock) {
15560 15582 mutex_enter(&osp->os_sync_lock);
15561 15583 have_sync_lock = 1;
15562 15584 }
15563 15585 /*
15564 15586 * This is last call, remove the ref on the open
15565 15587 * stream created by open and clean everything up.
15566 15588 */
15567 15589 osp->os_pending_close = 0;
15568 15590 nfs4close_notw(vp, osp, &have_sync_lock);
15569 15591 nfs4_error_zinit(ep);
15570 15592 }
15571 15593
15572 15594 if (!close_failed) {
15573 15595 if (have_sync_lock) {
15574 15596 osp->os_pending_close = 0;
15575 15597 mutex_exit(&osp->os_sync_lock);
15576 15598 have_sync_lock = 0;
15577 15599 } else {
15578 15600 mutex_enter(&osp->os_sync_lock);
15579 15601 osp->os_pending_close = 0;
15580 15602 mutex_exit(&osp->os_sync_lock);
15581 15603 }
15582 15604 if (did_start_op && recov_state.rs_sp != NULL) {
15583 15605 mutex_enter(&recov_state.rs_sp->s_lock);
15584 15606 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15585 15607 mutex_exit(&recov_state.rs_sp->s_lock);
15586 15608 } else {
15587 15609 nfs4_dec_state_ref_count(mi);
15588 15610 }
15589 15611 nfs4_error_zinit(ep);
15590 15612 }
15591 15613
15592 15614 out:
15593 15615 if (have_sync_lock)
15594 15616 mutex_exit(&osp->os_sync_lock);
15595 15617 if (did_start_op)
15596 15618 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15597 15619 recovonly ? TRUE : FALSE);
15598 15620 if (did_force_recovlock)
15599 15621 nfs_rw_exit(&mi->mi_recovlock);
15600 15622 if (cred_otw)
15601 15623 crfree(cred_otw);
15602 15624 if (osp)
15603 15625 open_stream_rele(osp, rp);
15604 15626 if (oop) {
15605 15627 if (did_start_seqid_sync)
15606 15628 nfs4_end_open_seqid_sync(oop);
15607 15629 open_owner_rele(oop);
15608 15630 }
15609 15631 }
15610 15632
15611 15633 /*
15612 15634 * Convert information returned by the server in the LOCK4denied
15613 15635 * structure to the form required by fcntl.
15614 15636 */
15615 15637 static void
15616 15638 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15617 15639 {
15618 15640 nfs4_lo_name_t *lo;
15619 15641
15620 15642 #ifdef DEBUG
15621 15643 if (denied_to_flk_debug) {
15622 15644 lockt_denied_debug = lockt_denied;
15623 15645 debug_enter("lockt_denied");
15624 15646 }
15625 15647 #endif
15626 15648
15627 15649 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15628 15650 flk->l_whence = 0; /* aka SEEK_SET */
15629 15651 flk->l_start = lockt_denied->offset;
15630 15652 flk->l_len = lockt_denied->length;
15631 15653
15632 15654 /*
15633 15655 * If the blocking clientid matches our client id, then we can
15634 15656 * interpret the lockowner (since we built it). If not, then
15635 15657 * fabricate a sysid and pid. Note that the l_sysid field
15636 15658 * in *flk already has the local sysid.
15637 15659 */
15638 15660
15639 15661 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15640 15662
15641 15663 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15642 15664 lo = (nfs4_lo_name_t *)
15643 15665 lockt_denied->owner.owner_val;
15644 15666
15645 15667 flk->l_pid = lo->ln_pid;
15646 15668 } else {
15647 15669 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15648 15670 "denied_to_flk: bad lock owner length\n"));
15649 15671
15650 15672 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15651 15673 }
15652 15674 } else {
15653 15675 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15654 15676 "denied_to_flk: foreign clientid\n"));
15655 15677
15656 15678 /*
15657 15679 * Construct a new sysid which should be different from
15658 15680 * sysids of other systems.
15659 15681 */
15660 15682
15661 15683 flk->l_sysid++;
15662 15684 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15663 15685 }
15664 15686 }
15665 15687
15666 15688 static pid_t
15667 15689 lo_to_pid(lock_owner4 *lop)
15668 15690 {
15669 15691 pid_t pid = 0;
15670 15692 uchar_t *cp;
15671 15693 int i;
15672 15694
15673 15695 cp = (uchar_t *)&lop->clientid;
15674 15696
15675 15697 for (i = 0; i < sizeof (lop->clientid); i++)
15676 15698 pid += (pid_t)*cp++;
15677 15699
15678 15700 cp = (uchar_t *)lop->owner_val;
15679 15701
15680 15702 for (i = 0; i < lop->owner_len; i++)
15681 15703 pid += (pid_t)*cp++;
15682 15704
15683 15705 return (pid);
15684 15706 }
15685 15707
15686 15708 /*
15687 15709 * Given a lock pointer, returns the length of that lock.
15688 15710 * "end" is the last locked offset the "l_len" covers from
15689 15711 * the start of the lock.
15690 15712 */
15691 15713 static off64_t
15692 15714 lock_to_end(flock64_t *lock)
15693 15715 {
15694 15716 off64_t lock_end;
15695 15717
15696 15718 if (lock->l_len == 0)
15697 15719 lock_end = (off64_t)MAXEND;
15698 15720 else
15699 15721 lock_end = lock->l_start + lock->l_len - 1;
15700 15722
15701 15723 return (lock_end);
15702 15724 }
15703 15725
15704 15726 /*
15705 15727 * Given the end of a lock, it will return you the length "l_len" for that lock.
15706 15728 */
15707 15729 static off64_t
15708 15730 end_to_len(off64_t start, off64_t end)
15709 15731 {
15710 15732 off64_t lock_len;
15711 15733
15712 15734 ASSERT(end >= start);
15713 15735 if (end == MAXEND)
15714 15736 lock_len = 0;
15715 15737 else
15716 15738 lock_len = end - start + 1;
15717 15739
15718 15740 return (lock_len);
15719 15741 }
15720 15742
15721 15743 /*
15722 15744 * On given end for a lock it determines if it is the last locked offset
15723 15745 * or not, if so keeps it as is, else adds one to return the length for
15724 15746 * valid start.
15725 15747 */
15726 15748 static off64_t
15727 15749 start_check(off64_t x)
15728 15750 {
15729 15751 if (x == MAXEND)
15730 15752 return (x);
15731 15753 else
15732 15754 return (x + 1);
15733 15755 }
15734 15756
15735 15757 /*
15736 15758 * See if these two locks overlap, and if so return 1;
15737 15759 * otherwise, return 0.
15738 15760 */
15739 15761 static int
15740 15762 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15741 15763 {
15742 15764 off64_t llfp_end, curfp_end;
15743 15765
15744 15766 llfp_end = lock_to_end(llfp);
15745 15767 curfp_end = lock_to_end(curfp);
15746 15768
15747 15769 if (((llfp_end >= curfp->l_start) &&
15748 15770 (llfp->l_start <= curfp->l_start)) ||
15749 15771 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15750 15772 return (1);
15751 15773 return (0);
15752 15774 }
15753 15775
/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).
 *
 * lost_flp is the lock whose over-the-wire reply was lost; local_flp is
 * one lock currently registered with the local lock manager.  If the two
 * ranges overlap, a new locklist_t entry describing just the overlap
 * (carrying the type/pid/sysid of the local lock) is inserted into
 * *nl_llpp, which is kept sorted by ascending l_start.
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	/* Disjoint ranges contribute nothing. */
	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and insert
	 * it into the new list in increasing l_start order. This list contains
	 * intersections of locks registered by the client with the local host
	 * and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	/* Walk to the first entry with a larger l_start (sorted insert). */
	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
15826 15848
/*
 * Our local locking current state is potentially different than
 * what the NFSv4 server thinks we have due to a lost lock that was
 * resent and then received. We need to reset our "NFSv4" locking
 * state to match the current local locking state for this pid since
 * that is what the user/application sees as what the world is.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process. This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Step 1: Find active locks for this vp from the local locking code.
	 * Scan through this list and find out the locks that intersect with
	 * the lost lock. Once we find the lock that intersects, add the
	 * intersection area as a new lock to a new list "ri_llp". The lock
	 * type of the intersection region lock added to ri_llp is the same
	 * as that found in the active lock list, "list". The intersecting
	 * region locks are added to ri_llp in increasing l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Step 2: Now we have the list of intersections with the lost lock.
	 * These are the locks that were/are active before the server replied
	 * to the last/lost lock. Issue these locks to the server here.
	 * Playing these locks to the server will re-establish our current
	 * local locking state with the v4 server.
	 * If we get an error, send SIGLOST to the application for that lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}

	/*
	 * Step 3: Now keeping the start of the lost lock as our reference
	 * parse the newly created ri_llp locklist to find the ranges that we
	 * have locked with the v4 server but not in the current local locking.
	 * We need to unlock these ranges.
	 * These ranges can also be referred to as those ranges, where the lost
	 * lock does not overlap with the locks in the ri_llp but are locked
	 * since the server replied to the lost lock.
	 */
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	/* Template unlock request; only l_start/l_len vary per gap below. */
	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		/* No gap before this entry; advance past it. */
		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		/* Unlock the gap [cur_start, llp->ll_flock.l_start - 1]. */
		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting locks,
	 * unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it an to-EOF lock? if so unlock till the end
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return back to the original calling nfs4frlock()
	 * and let us naturally drop our seqid syncs.
	 */
}
15981 16003
/*
 * Create a lost state record for the given lock reinstantiation request
 * and push it onto the lost state queue.
 *
 * The record is saved with error EINTR so the recovery framework treats
 * it as an interrupted (lost) lock/unlock and replays it over the wire;
 * recovery is started for OP_LOCKU when flk describes an unlock, and
 * OP_LOCK otherwise.
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	nfs4_lost_rqst_t req;
	nfs_lock_type4 locktype;
	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

	/* Must run in the zone that mounted this filesystem. */
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	locktype = flk_to_locktype(cmd, flk->l_type);
	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
	    NULL, NULL, lop, flk, &req, cr, vp);
	/* Only hand recovery the lost request if a lock op was recorded. */
	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
	    NULL, NULL, NULL);
}
↓ open down ↓ |
1212 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX