Print this page
3484 enhance and document tail follow support
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
↓ open down ↓ |
23 lines elided |
↑ open up ↑ |
24 24 */
25 25 /*
26 26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 27 */
28 28
29 29 /*
30 30 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
31 31 * All Rights Reserved
32 32 */
33 33
34 +/*
35 + * Copyright (c) 2013, Joyent, Inc. All rights reserved.
36 + */
37 +
34 38 #include <sys/param.h>
35 39 #include <sys/types.h>
36 40 #include <sys/systm.h>
37 41 #include <sys/cred.h>
38 42 #include <sys/time.h>
39 43 #include <sys/vnode.h>
40 44 #include <sys/vfs.h>
41 45 #include <sys/vfs_opreg.h>
42 46 #include <sys/file.h>
43 47 #include <sys/filio.h>
44 48 #include <sys/uio.h>
45 49 #include <sys/buf.h>
46 50 #include <sys/mman.h>
47 51 #include <sys/pathname.h>
48 52 #include <sys/dirent.h>
49 53 #include <sys/debug.h>
50 54 #include <sys/vmsystm.h>
51 55 #include <sys/fcntl.h>
52 56 #include <sys/flock.h>
53 57 #include <sys/swap.h>
54 58 #include <sys/errno.h>
55 59 #include <sys/strsubr.h>
56 60 #include <sys/sysmacros.h>
57 61 #include <sys/kmem.h>
58 62 #include <sys/cmn_err.h>
59 63 #include <sys/pathconf.h>
60 64 #include <sys/utsname.h>
61 65 #include <sys/dnlc.h>
62 66 #include <sys/acl.h>
63 67 #include <sys/systeminfo.h>
64 68 #include <sys/policy.h>
65 69 #include <sys/sdt.h>
66 70 #include <sys/list.h>
67 71 #include <sys/stat.h>
68 72 #include <sys/zone.h>
69 73
70 74 #include <rpc/types.h>
71 75 #include <rpc/auth.h>
72 76 #include <rpc/clnt.h>
73 77
74 78 #include <nfs/nfs.h>
75 79 #include <nfs/nfs_clnt.h>
76 80 #include <nfs/nfs_acl.h>
77 81 #include <nfs/lm.h>
78 82 #include <nfs/nfs4.h>
79 83 #include <nfs/nfs4_kprot.h>
80 84 #include <nfs/rnode4.h>
81 85 #include <nfs/nfs4_clnt.h>
82 86
83 87 #include <vm/hat.h>
84 88 #include <vm/as.h>
85 89 #include <vm/page.h>
86 90 #include <vm/pvn.h>
87 91 #include <vm/seg.h>
88 92 #include <vm/seg_map.h>
89 93 #include <vm/seg_kpm.h>
90 94 #include <vm/seg_vn.h>
91 95
92 96 #include <fs/fs_subr.h>
93 97
94 98 #include <sys/ddi.h>
95 99 #include <sys/int_fmtio.h>
96 100 #include <sys/fs/autofs.h>
97 101
98 102 typedef struct {
99 103 nfs4_ga_res_t *di_garp;
100 104 cred_t *di_cred;
101 105 hrtime_t di_time_call;
102 106 } dirattr_info_t;
103 107
104 108 typedef enum nfs4_acl_op {
105 109 NFS4_ACL_GET,
106 110 NFS4_ACL_SET
107 111 } nfs4_acl_op_t;
108 112
109 113 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);
110 114
111 115 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
112 116 char *, dirattr_info_t *);
113 117
114 118 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
115 119 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
116 120 nfs4_error_t *, int *);
117 121 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
118 122 cred_t *);
119 123 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
120 124 stable_how4 *);
121 125 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
122 126 cred_t *, bool_t, struct uio *);
123 127 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
124 128 vsecattr_t *);
125 129 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
126 130 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
127 131 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
128 132 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
129 133 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
130 134 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
131 135 int, vnode_t **, cred_t *);
132 136 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
133 137 cred_t *, int, int, enum createmode4, int);
134 138 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
135 139 caller_context_t *);
136 140 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
137 141 vnode_t *, char *, cred_t *, nfsstat4 *);
138 142 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
139 143 vnode_t *, char *, cred_t *, nfsstat4 *);
140 144 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
141 145 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
142 146 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
143 147 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
144 148 page_t *[], size_t, struct seg *, caddr_t,
145 149 enum seg_rw, cred_t *);
146 150 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
147 151 cred_t *);
148 152 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
149 153 int, cred_t *);
150 154 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
151 155 int, cred_t *);
152 156 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
153 157 static void nfs4_set_mod(vnode_t *);
154 158 static void nfs4_get_commit(vnode_t *);
155 159 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
156 160 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
157 161 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
158 162 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
159 163 cred_t *);
160 164 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
161 165 cred_t *);
162 166 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
163 167 hrtime_t, vnode_t *, cred_t *);
164 168 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
165 169 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
166 170 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
167 171 u_offset_t);
168 172 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
169 173 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
170 174 static cred_t *state_to_cred(nfs4_open_stream_t *);
171 175 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
172 176 static pid_t lo_to_pid(lock_owner4 *);
173 177 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
174 178 cred_t *, nfs4_lock_owner_t *);
175 179 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
176 180 nfs4_lock_owner_t *);
177 181 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
178 182 static void nfs4_delmap_callback(struct as *, void *, uint_t);
179 183 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
180 184 static nfs4_delmapcall_t *nfs4_init_delmapcall();
181 185 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
182 186 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
183 187 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
184 188 uid_t, gid_t, int);
185 189
186 190 /*
187 191 * Routines that implement the setting of v4 args for the misc. ops
188 192 */
189 193 static void nfs4args_lock_free(nfs_argop4 *);
190 194 static void nfs4args_lockt_free(nfs_argop4 *);
191 195 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
192 196 int, rnode4_t *, cred_t *, bitmap4, int *,
193 197 nfs4_stateid_types_t *);
194 198 static void nfs4args_setattr_free(nfs_argop4 *);
195 199 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
196 200 bitmap4);
197 201 static void nfs4args_verify_free(nfs_argop4 *);
198 202 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
199 203 WRITE4args **, nfs4_stateid_types_t *);
200 204
201 205 /*
202 206 * These are the vnode ops functions that implement the vnode interface to
203 207 * the networked file system. See more comments below at nfs4_vnodeops.
204 208 */
205 209 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
206 210 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
207 211 caller_context_t *);
208 212 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
209 213 caller_context_t *);
210 214 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
211 215 caller_context_t *);
212 216 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
213 217 caller_context_t *);
214 218 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
215 219 caller_context_t *);
216 220 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
217 221 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
218 222 caller_context_t *);
219 223 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
220 224 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
221 225 int, vnode_t **, cred_t *, int, caller_context_t *,
222 226 vsecattr_t *);
223 227 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
224 228 int);
225 229 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
226 230 caller_context_t *, int);
227 231 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
228 232 caller_context_t *, int);
229 233 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
230 234 cred_t *, caller_context_t *, int, vsecattr_t *);
231 235 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
232 236 caller_context_t *, int);
233 237 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
234 238 cred_t *, caller_context_t *, int);
235 239 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
236 240 caller_context_t *, int);
237 241 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
238 242 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
239 243 page_t *[], size_t, struct seg *, caddr_t,
240 244 enum seg_rw, cred_t *, caller_context_t *);
241 245 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
242 246 caller_context_t *);
243 247 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
244 248 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
245 249 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
246 250 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
247 251 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
248 252 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
249 253 struct flk_callback *, cred_t *, caller_context_t *);
250 254 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
251 255 cred_t *, caller_context_t *);
252 256 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
253 257 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
254 258 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
255 259 cred_t *, caller_context_t *);
256 260 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
257 261 caller_context_t *);
258 262 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
259 263 caller_context_t *);
260 264 /*
261 265 * These vnode ops are required to be called from outside this source file,
262 266 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
263 267 * as static.
264 268 */
265 269 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
266 270 caller_context_t *);
267 271 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
268 272 int nfs4_lookup(vnode_t *, char *, vnode_t **,
269 273 struct pathname *, int, vnode_t *, cred_t *,
270 274 caller_context_t *, int *, pathname_t *);
271 275 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
272 276 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
273 277 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
274 278 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
275 279 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
276 280 caller_context_t *);
277 281 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
278 282 caller_context_t *);
279 283 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
280 284 caller_context_t *);
281 285
282 286 /*
283 287 * Used for nfs4_commit_vp() to indicate if we should
284 288 * wait on pending writes.
285 289 */
286 290 #define NFS4_WRITE_NOWAIT 0
287 291 #define NFS4_WRITE_WAIT 1
288 292
289 293 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */
290 294
291 295 /*
292 296 * Error flags used to pass information about certain special errors
293 297 * which need to be handled specially.
294 298 */
295 299 #define NFS_EOF -98
296 300 #define NFS_VERF_MISMATCH -97
297 301
298 302 /*
299 303 * Flags used to differentiate between which operation drove the
300 304 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
301 305 */
302 306 #define NFS4_CLOSE_OP 0x1
303 307 #define NFS4_DELMAP_OP 0x2
304 308 #define NFS4_INACTIVE_OP 0x3
305 309
306 310 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
307 311
308 312 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
309 313 #define ALIGN64(x, ptr, sz) \
310 314 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \
311 315 if (x) { \
312 316 x = sizeof (uint64_t) - (x); \
313 317 sz -= (x); \
314 318 ptr += (x); \
315 319 }
316 320
317 321 #ifdef DEBUG
318 322 int nfs4_client_attr_debug = 0;
319 323 int nfs4_client_state_debug = 0;
320 324 int nfs4_client_shadow_debug = 0;
321 325 int nfs4_client_lock_debug = 0;
322 326 int nfs4_seqid_sync = 0;
323 327 int nfs4_client_map_debug = 0;
324 328 static int nfs4_pageio_debug = 0;
325 329 int nfs4_client_inactive_debug = 0;
326 330 int nfs4_client_recov_debug = 0;
327 331 int nfs4_client_failover_debug = 0;
328 332 int nfs4_client_call_debug = 0;
329 333 int nfs4_client_lookup_debug = 0;
330 334 int nfs4_client_zone_debug = 0;
331 335 int nfs4_lost_rqst_debug = 0;
332 336 int nfs4_rdattrerr_debug = 0;
333 337 int nfs4_open_stream_debug = 0;
334 338
335 339 int nfs4read_error_inject;
336 340
337 341 static int nfs4_create_misses = 0;
338 342
339 343 static int nfs4_readdir_cache_shorts = 0;
340 344 static int nfs4_readdir_readahead = 0;
341 345
342 346 static int nfs4_bio_do_stop = 0;
343 347
344 348 static int nfs4_lostpage = 0; /* number of times we lost original page */
345 349
346 350 int nfs4_mmap_debug = 0;
347 351
348 352 static int nfs4_pathconf_cache_hits = 0;
349 353 static int nfs4_pathconf_cache_misses = 0;
350 354
351 355 int nfs4close_all_cnt;
352 356 int nfs4close_one_debug = 0;
353 357 int nfs4close_notw_debug = 0;
354 358
355 359 int denied_to_flk_debug = 0;
356 360 void *lockt_denied_debug;
357 361
358 362 #endif
359 363
360 364 /*
361 365 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
362 366 * or NFS4ERR_RESOURCE.
363 367 */
364 368 static int confirm_retry_sec = 30;
365 369
366 370 static int nfs4_lookup_neg_cache = 1;
367 371
368 372 /*
369 373 * number of pages to read ahead
370 374 * optimized for 100 base-T.
371 375 */
372 376 static int nfs4_nra = 4;
373 377
374 378 static int nfs4_do_symlink_cache = 1;
375 379
376 380 static int nfs4_pathconf_disable_cache = 0;
377 381
378 382 /*
379 383 * These are the vnode ops routines which implement the vnode interface to
380 384 * the networked file system. These routines just take their parameters,
381 385 * make them look networkish by putting the right info into interface structs,
382 386 * and then calling the appropriate remote routine(s) to do the work.
383 387 *
384 388 * Note on directory name lookup cacheing: If we detect a stale fhandle,
385 389 * we purge the directory cache relative to that vnode. This way, the
386 390 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
387 391 * more details on rnode locking.
388 392 */
389 393
390 394 struct vnodeops *nfs4_vnodeops;
391 395
392 396 const fs_operation_def_t nfs4_vnodeops_template[] = {
393 397 VOPNAME_OPEN, { .vop_open = nfs4_open },
394 398 VOPNAME_CLOSE, { .vop_close = nfs4_close },
395 399 VOPNAME_READ, { .vop_read = nfs4_read },
396 400 VOPNAME_WRITE, { .vop_write = nfs4_write },
397 401 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl },
398 402 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr },
399 403 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr },
400 404 VOPNAME_ACCESS, { .vop_access = nfs4_access },
401 405 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup },
402 406 VOPNAME_CREATE, { .vop_create = nfs4_create },
403 407 VOPNAME_REMOVE, { .vop_remove = nfs4_remove },
404 408 VOPNAME_LINK, { .vop_link = nfs4_link },
405 409 VOPNAME_RENAME, { .vop_rename = nfs4_rename },
406 410 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir },
407 411 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir },
408 412 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir },
409 413 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink },
410 414 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink },
411 415 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync },
412 416 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive },
413 417 VOPNAME_FID, { .vop_fid = nfs4_fid },
414 418 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock },
415 419 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock },
416 420 VOPNAME_SEEK, { .vop_seek = nfs4_seek },
417 421 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock },
418 422 VOPNAME_SPACE, { .vop_space = nfs4_space },
419 423 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp },
420 424 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage },
421 425 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage },
422 426 VOPNAME_MAP, { .vop_map = nfs4_map },
423 427 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap },
424 428 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap },
425 429 /* no separate nfs4_dump */
426 430 VOPNAME_DUMP, { .vop_dump = nfs_dump },
427 431 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf },
428 432 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio },
429 433 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose },
430 434 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr },
431 435 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr },
432 436 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock },
433 437 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
434 438 NULL, NULL
435 439 };
436 440
437 441 /*
438 442 * The following are subroutines and definitions to set args or get res
439 443 * for the different nfsv4 ops
440 444 */
441 445
442 446 void
443 447 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
444 448 {
445 449 int i;
446 450
447 451 for (i = 0; i < arglen; i++) {
448 452 if (argop[i].argop == OP_LOOKUP) {
449 453 kmem_free(
450 454 argop[i].nfs_argop4_u.oplookup.
451 455 objname.utf8string_val,
452 456 argop[i].nfs_argop4_u.oplookup.
453 457 objname.utf8string_len);
454 458 }
455 459 }
456 460 }
457 461
458 462 static void
459 463 nfs4args_lock_free(nfs_argop4 *argop)
460 464 {
461 465 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
462 466
463 467 if (locker->new_lock_owner == TRUE) {
464 468 open_to_lock_owner4 *open_owner;
465 469
466 470 open_owner = &locker->locker4_u.open_owner;
467 471 if (open_owner->lock_owner.owner_val != NULL) {
468 472 kmem_free(open_owner->lock_owner.owner_val,
469 473 open_owner->lock_owner.owner_len);
470 474 }
471 475 }
472 476 }
473 477
474 478 static void
475 479 nfs4args_lockt_free(nfs_argop4 *argop)
476 480 {
477 481 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
478 482
479 483 if (lowner->owner_val != NULL) {
480 484 kmem_free(lowner->owner_val, lowner->owner_len);
481 485 }
482 486 }
483 487
484 488 static void
485 489 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
486 490 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
487 491 nfs4_stateid_types_t *sid_types)
488 492 {
489 493 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
490 494 mntinfo4_t *mi;
491 495
492 496 argop->argop = OP_SETATTR;
493 497 /*
494 498 * The stateid is set to 0 if client is not modifying the size
495 499 * and otherwise to whatever nfs4_get_stateid() returns.
496 500 *
497 501 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
498 502 * state struct could be found for the process/file pair. We may
499 503 * want to change this in the future (by OPENing the file). See
500 504 * bug # 4474852.
501 505 */
502 506 if (vap->va_mask & AT_SIZE) {
503 507
504 508 ASSERT(rp != NULL);
505 509 mi = VTOMI4(RTOV4(rp));
506 510
507 511 argop->nfs_argop4_u.opsetattr.stateid =
508 512 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
509 513 OP_SETATTR, sid_types, FALSE);
510 514 } else {
511 515 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
512 516 sizeof (stateid4));
513 517 }
514 518
515 519 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
516 520 if (*error)
517 521 bzero(attr, sizeof (*attr));
518 522 }
519 523
520 524 static void
521 525 nfs4args_setattr_free(nfs_argop4 *argop)
522 526 {
523 527 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
524 528 }
525 529
526 530 static int
527 531 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
528 532 bitmap4 supp)
529 533 {
530 534 fattr4 *attr;
531 535 int error = 0;
532 536
533 537 argop->argop = op;
534 538 switch (op) {
535 539 case OP_VERIFY:
536 540 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
537 541 break;
538 542 case OP_NVERIFY:
539 543 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
540 544 break;
541 545 default:
542 546 return (EINVAL);
543 547 }
544 548 if (!error)
545 549 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
546 550 if (error)
547 551 bzero(attr, sizeof (*attr));
548 552 return (error);
549 553 }
550 554
551 555 static void
552 556 nfs4args_verify_free(nfs_argop4 *argop)
553 557 {
554 558 switch (argop->argop) {
555 559 case OP_VERIFY:
556 560 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
557 561 break;
558 562 case OP_NVERIFY:
559 563 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
560 564 break;
561 565 default:
562 566 break;
563 567 }
564 568 }
565 569
566 570 static void
567 571 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
568 572 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
569 573 {
570 574 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
571 575 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
572 576
573 577 argop->argop = OP_WRITE;
574 578 wargs->stable = stable;
575 579 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
576 580 mi, OP_WRITE, sid_tp);
577 581 wargs->mblk = NULL;
578 582 *wargs_pp = wargs;
579 583 }
580 584
581 585 void
582 586 nfs4args_copen_free(OPEN4cargs *open_args)
583 587 {
584 588 if (open_args->owner.owner_val) {
585 589 kmem_free(open_args->owner.owner_val,
586 590 open_args->owner.owner_len);
587 591 }
588 592 if ((open_args->opentype == OPEN4_CREATE) &&
589 593 (open_args->mode != EXCLUSIVE4)) {
590 594 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
591 595 }
592 596 }
593 597
594 598 /*
595 599 * XXX: This is referenced in modstubs.s
596 600 */
597 601 struct vnodeops *
598 602 nfs4_getvnodeops(void)
599 603 {
600 604 return (nfs4_vnodeops);
601 605 }
602 606
603 607 /*
604 608 * The OPEN operation opens a regular file.
605 609 */
606 610 /*ARGSUSED3*/
607 611 static int
608 612 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
609 613 {
610 614 vnode_t *dvp = NULL;
611 615 rnode4_t *rp, *drp;
612 616 int error;
613 617 int just_been_created;
614 618 char fn[MAXNAMELEN];
615 619
616 620 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
617 621 if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
618 622 return (EIO);
619 623 rp = VTOR4(*vpp);
620 624
621 625 /*
622 626 * Check to see if opening something besides a regular file;
623 627 * if so skip the OTW call
624 628 */
625 629 if ((*vpp)->v_type != VREG) {
626 630 error = nfs4_open_non_reg_file(vpp, flag, cr);
627 631 return (error);
628 632 }
629 633
630 634 /*
631 635 * XXX - would like a check right here to know if the file is
632 636 * executable or not, so as to skip OTW
633 637 */
634 638
635 639 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
636 640 return (error);
637 641
638 642 drp = VTOR4(dvp);
639 643 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
640 644 return (EINTR);
641 645
642 646 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
643 647 nfs_rw_exit(&drp->r_rwlock);
644 648 return (error);
645 649 }
646 650
647 651 /*
648 652 * See if this file has just been CREATEd.
649 653 * If so, clear the flag and update the dnlc, which was previously
650 654 * skipped in nfs4_create.
 651 655 	 * XXX need better serialization on this.
 652 656 	 * XXX move this into the nfs4open_otw call, after we have
653 657 * XXX acquired the open owner seqid sync.
654 658 */
655 659 mutex_enter(&rp->r_statev4_lock);
656 660 if (rp->created_v4) {
657 661 rp->created_v4 = 0;
658 662 mutex_exit(&rp->r_statev4_lock);
659 663
660 664 dnlc_update(dvp, fn, *vpp);
661 665 /* This is needed so we don't bump the open ref count */
662 666 just_been_created = 1;
663 667 } else {
664 668 mutex_exit(&rp->r_statev4_lock);
665 669 just_been_created = 0;
666 670 }
667 671
668 672 /*
669 673 * If caller specified O_TRUNC/FTRUNC, then be sure to set
670 674 * FWRITE (to drive successful setattr(size=0) after open)
671 675 */
672 676 if (flag & FTRUNC)
673 677 flag |= FWRITE;
674 678
675 679 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
676 680 just_been_created);
677 681
678 682 if (!error && !((*vpp)->v_flag & VROOT))
679 683 dnlc_update(dvp, fn, *vpp);
680 684
681 685 nfs_rw_exit(&drp->r_rwlock);
682 686
683 687 /* release the hold from vtodv */
684 688 VN_RELE(dvp);
685 689
686 690 /* exchange the shadow for the master vnode, if needed */
687 691
688 692 if (error == 0 && IS_SHADOW(*vpp, rp))
689 693 sv_exchange(vpp);
690 694
691 695 return (error);
692 696 }
693 697
694 698 /*
695 699 * See if there's a "lost open" request to be saved and recovered.
696 700 */
697 701 static void
698 702 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
699 703 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
700 704 vnode_t *dvp, OPEN4cargs *open_args)
701 705 {
702 706 vfs_t *vfsp;
703 707 char *srccfp;
704 708
705 709 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
706 710
707 711 if (error != ETIMEDOUT && error != EINTR &&
708 712 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
709 713 lost_rqstp->lr_op = 0;
710 714 return;
711 715 }
712 716
713 717 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
714 718 "nfs4open_save_lost_rqst: error %d", error));
715 719
716 720 lost_rqstp->lr_op = OP_OPEN;
717 721
718 722 /*
719 723 * The vp (if it is not NULL) and dvp are held and rele'd via
720 724 * the recovery code. See nfs4_save_lost_rqst.
721 725 */
722 726 lost_rqstp->lr_vp = vp;
723 727 lost_rqstp->lr_dvp = dvp;
724 728 lost_rqstp->lr_oop = oop;
725 729 lost_rqstp->lr_osp = NULL;
726 730 lost_rqstp->lr_lop = NULL;
727 731 lost_rqstp->lr_cr = cr;
728 732 lost_rqstp->lr_flk = NULL;
729 733 lost_rqstp->lr_oacc = open_args->share_access;
730 734 lost_rqstp->lr_odeny = open_args->share_deny;
731 735 lost_rqstp->lr_oclaim = open_args->claim;
732 736 if (open_args->claim == CLAIM_DELEGATE_CUR) {
733 737 lost_rqstp->lr_ostateid =
734 738 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
735 739 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
736 740 } else {
737 741 srccfp = open_args->open_claim4_u.cfile;
738 742 }
739 743 lost_rqstp->lr_ofile.utf8string_len = 0;
740 744 lost_rqstp->lr_ofile.utf8string_val = NULL;
741 745 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
742 746 lost_rqstp->lr_putfirst = FALSE;
743 747 }
744 748
745 749 struct nfs4_excl_time {
746 750 uint32 seconds;
747 751 uint32 nseconds;
748 752 };
749 753
750 754 /*
751 755 * The OPEN operation creates and/or opens a regular file
752 756 *
753 757 * ARGSUSED
754 758 */
755 759 static int
756 760 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
757 761 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
758 762 enum createmode4 createmode, int file_just_been_created)
759 763 {
760 764 rnode4_t *rp;
761 765 rnode4_t *drp = VTOR4(dvp);
762 766 vnode_t *vp = NULL;
763 767 vnode_t *vpi = *vpp;
764 768 bool_t needrecov = FALSE;
765 769
766 770 int doqueue = 1;
767 771
768 772 COMPOUND4args_clnt args;
769 773 COMPOUND4res_clnt res;
770 774 nfs_argop4 *argop;
771 775 nfs_resop4 *resop;
772 776 int argoplist_size;
773 777 int idx_open, idx_fattr;
774 778
775 779 GETFH4res *gf_res = NULL;
776 780 OPEN4res *op_res = NULL;
777 781 nfs4_ga_res_t *garp;
778 782 fattr4 *attr = NULL;
779 783 struct nfs4_excl_time verf;
780 784 bool_t did_excl_setup = FALSE;
781 785 int created_osp;
782 786
783 787 OPEN4cargs *open_args;
784 788 nfs4_open_owner_t *oop = NULL;
785 789 nfs4_open_stream_t *osp = NULL;
786 790 seqid4 seqid = 0;
787 791 bool_t retry_open = FALSE;
788 792 nfs4_recov_state_t recov_state;
789 793 nfs4_lost_rqst_t lost_rqst;
790 794 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
791 795 hrtime_t t;
792 796 int acc = 0;
793 797 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
794 798 cred_t *ncr = NULL;
795 799
796 800 nfs4_sharedfh_t *otw_sfh;
797 801 nfs4_sharedfh_t *orig_sfh;
798 802 int fh_differs = 0;
799 803 int numops, setgid_flag;
800 804 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
801 805
802 806 /*
803 807 * Make sure we properly deal with setting the right gid on
804 808 * a newly created file to reflect the parent's setgid bit
805 809 */
806 810 setgid_flag = 0;
807 811 if (create_flag && in_va) {
808 812
809 813 /*
810 814 * If there is grpid mount flag used or
811 815 * the parent's directory has the setgid bit set
812 816 * _and_ the client was able to get a valid mapping
813 817 * for the parent dir's owner_group, we want to
814 818 * append NVERIFY(owner_group == dva.va_gid) and
815 819 * SETATTR to the CREATE compound.
816 820 */
817 821 mutex_enter(&drp->r_statelock);
818 822 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
819 823 drp->r_attr.va_mode & VSGID) &&
820 824 drp->r_attr.va_gid != GID_NOBODY) {
821 825 in_va->va_mask |= AT_GID;
822 826 in_va->va_gid = drp->r_attr.va_gid;
823 827 setgid_flag = 1;
824 828 }
825 829 mutex_exit(&drp->r_statelock);
826 830 }
827 831
828 832 /*
829 833 * Normal/non-create compound:
830 834 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
831 835 *
832 836 * Open(create) compound no setgid:
833 837 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
834 838 * RESTOREFH + GETATTR
835 839 *
836 840 * Open(create) setgid:
837 841 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
838 842 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
839 843 * NVERIFY(grp) + SETATTR
840 844 */
841 845 if (setgid_flag) {
842 846 numops = 10;
843 847 idx_open = 1;
844 848 idx_fattr = 3;
845 849 } else if (create_flag) {
846 850 numops = 7;
847 851 idx_open = 2;
848 852 idx_fattr = 4;
849 853 } else {
850 854 numops = 4;
851 855 idx_open = 1;
852 856 idx_fattr = 3;
853 857 }
854 858
855 859 args.array_len = numops;
856 860 argoplist_size = numops * sizeof (nfs_argop4);
857 861 argop = kmem_alloc(argoplist_size, KM_SLEEP);
858 862
859 863 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
860 864 "open %s open flag 0x%x cred %p", file_name, open_flag,
861 865 (void *)cr));
862 866
863 867 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
864 868 if (create_flag) {
865 869 /*
866 870 * We are to create a file. Initialize the passed in vnode
867 871 * pointer.
868 872 */
869 873 vpi = NULL;
870 874 } else {
871 875 /*
872 876 * Check to see if the client owns a read delegation and is
873 877 * trying to open for write. If so, then return the delegation
874 878 * to avoid the server doing a cb_recall and returning DELAY.
875 879 * NB - we don't use the statev4_lock here because we'd have
876 880 * to drop the lock anyway and the result would be stale.
877 881 */
878 882 if ((open_flag & FWRITE) &&
879 883 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
880 884 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
881 885
882 886 /*
883 887 * If the file has a delegation, then do an access check up
 884  888  		 * front. This avoids having to do an access check later after
885 889 * we've already done start_op, which could deadlock.
886 890 */
887 891 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
888 892 if (open_flag & FREAD &&
889 893 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
890 894 acc |= VREAD;
891 895 if (open_flag & FWRITE &&
892 896 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
893 897 acc |= VWRITE;
894 898 }
895 899 }
896 900
897 901 drp = VTOR4(dvp);
898 902
899 903 recov_state.rs_flags = 0;
900 904 recov_state.rs_num_retry_despite_err = 0;
901 905 cred_otw = cr;
902 906
903 907 recov_retry:
904 908 fh_differs = 0;
905 909 nfs4_error_zinit(&e);
906 910
907 911 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
908 912 if (e.error) {
909 913 if (ncr != NULL)
910 914 crfree(ncr);
911 915 kmem_free(argop, argoplist_size);
912 916 return (e.error);
913 917 }
914 918
915 919 args.ctag = TAG_OPEN;
916 920 args.array_len = numops;
917 921 args.array = argop;
918 922
919 923 /* putfh directory fh */
920 924 argop[0].argop = OP_CPUTFH;
921 925 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
922 926
923 927 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
924 928 argop[idx_open].argop = OP_COPEN;
925 929 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
926 930 open_args->claim = CLAIM_NULL;
927 931
928 932 /* name of file */
929 933 open_args->open_claim4_u.cfile = file_name;
930 934 open_args->owner.owner_len = 0;
931 935 open_args->owner.owner_val = NULL;
932 936
933 937 if (create_flag) {
934 938 /* CREATE a file */
935 939 open_args->opentype = OPEN4_CREATE;
936 940 open_args->mode = createmode;
937 941 if (createmode == EXCLUSIVE4) {
938 942 if (did_excl_setup == FALSE) {
939 943 verf.seconds = zone_get_hostid(NULL);
940 944 if (verf.seconds != 0)
941 945 verf.nseconds = newnum();
942 946 else {
943 947 timestruc_t now;
944 948
945 949 gethrestime(&now);
946 950 verf.seconds = now.tv_sec;
947 951 verf.nseconds = now.tv_nsec;
948 952 }
949 953 /*
950 954 * Since the server will use this value for the
951 955 * mtime, make sure that it can't overflow. Zero
952 956 * out the MSB. The actual value does not matter
 953  957  				 * here, only its uniqueness.
954 958 */
955 959 verf.seconds &= INT32_MAX;
956 960 did_excl_setup = TRUE;
957 961 }
958 962
959 963 /* Now copy over verifier to OPEN4args. */
960 964 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
961 965 } else {
962 966 int v_error;
963 967 bitmap4 supp_attrs;
964 968 servinfo4_t *svp;
965 969
966 970 attr = &open_args->createhow4_u.createattrs;
967 971
968 972 svp = drp->r_server;
969 973 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
970 974 supp_attrs = svp->sv_supp_attrs;
971 975 nfs_rw_exit(&svp->sv_lock);
972 976
973 977 /* GUARDED4 or UNCHECKED4 */
974 978 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
975 979 supp_attrs);
976 980 if (v_error) {
977 981 bzero(attr, sizeof (*attr));
978 982 nfs4args_copen_free(open_args);
979 983 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
980 984 &recov_state, FALSE);
981 985 if (ncr != NULL)
982 986 crfree(ncr);
983 987 kmem_free(argop, argoplist_size);
984 988 return (v_error);
985 989 }
986 990 }
987 991 } else {
988 992 /* NO CREATE */
989 993 open_args->opentype = OPEN4_NOCREATE;
990 994 }
991 995
992 996 if (recov_state.rs_sp != NULL) {
993 997 mutex_enter(&recov_state.rs_sp->s_lock);
994 998 open_args->owner.clientid = recov_state.rs_sp->clientid;
995 999 mutex_exit(&recov_state.rs_sp->s_lock);
996 1000 } else {
997 1001 /* XXX should we just fail here? */
998 1002 open_args->owner.clientid = 0;
999 1003 }
1000 1004
1001 1005 /*
1002 1006 * This increments oop's ref count or creates a temporary 'just_created'
1003 1007 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1004 1008 * completes.
1005 1009 */
1006 1010 mutex_enter(&VTOMI4(dvp)->mi_lock);
1007 1011
1008 1012 /* See if a permanent or just created open owner exists */
1009 1013 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1010 1014 if (!oop) {
1011 1015 /*
1012 1016 * This open owner does not exist so create a temporary
1013 1017 * just created one.
1014 1018 */
1015 1019 oop = create_open_owner(cr, VTOMI4(dvp));
1016 1020 ASSERT(oop != NULL);
1017 1021 }
1018 1022 mutex_exit(&VTOMI4(dvp)->mi_lock);
1019 1023
1020 1024 /* this length never changes, do alloc before seqid sync */
1021 1025 open_args->owner.owner_len = sizeof (oop->oo_name);
1022 1026 open_args->owner.owner_val =
1023 1027 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1024 1028
1025 1029 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1026 1030 if (e.error == EAGAIN) {
1027 1031 open_owner_rele(oop);
1028 1032 nfs4args_copen_free(open_args);
1029 1033 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1030 1034 if (ncr != NULL) {
1031 1035 crfree(ncr);
1032 1036 ncr = NULL;
1033 1037 }
1034 1038 goto recov_retry;
1035 1039 }
1036 1040
1037 1041 /* Check to see if we need to do the OTW call */
1038 1042 if (!create_flag) {
1039 1043 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1040 1044 file_just_been_created, &e.error, acc, &recov_state)) {
1041 1045
1042 1046 /*
1043 1047 * The OTW open is not necessary. Either
1044 1048 * the open can succeed without it (eg.
1045 1049 * delegation, error == 0) or the open
1046 1050 * must fail due to an access failure
1047 1051 * (error != 0). In either case, tidy
1048 1052 * up and return.
1049 1053 */
1050 1054
1051 1055 nfs4_end_open_seqid_sync(oop);
1052 1056 open_owner_rele(oop);
1053 1057 nfs4args_copen_free(open_args);
1054 1058 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1055 1059 if (ncr != NULL)
1056 1060 crfree(ncr);
1057 1061 kmem_free(argop, argoplist_size);
1058 1062 return (e.error);
1059 1063 }
1060 1064 }
1061 1065
1062 1066 bcopy(&oop->oo_name, open_args->owner.owner_val,
1063 1067 open_args->owner.owner_len);
1064 1068
1065 1069 seqid = nfs4_get_open_seqid(oop) + 1;
1066 1070 open_args->seqid = seqid;
1067 1071 open_args->share_access = 0;
1068 1072 if (open_flag & FREAD)
1069 1073 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1070 1074 if (open_flag & FWRITE)
1071 1075 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1072 1076 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1073 1077
1074 1078
1075 1079
1076 1080 /*
1077 1081 * getfh w/sanity check for idx_open/idx_fattr
1078 1082 */
1079 1083 ASSERT((idx_open + 1) == (idx_fattr - 1));
1080 1084 argop[idx_open + 1].argop = OP_GETFH;
1081 1085
1082 1086 /* getattr */
1083 1087 argop[idx_fattr].argop = OP_GETATTR;
1084 1088 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1085 1089 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1086 1090
1087 1091 if (setgid_flag) {
1088 1092 vattr_t _v;
1089 1093 servinfo4_t *svp;
1090 1094 bitmap4 supp_attrs;
1091 1095
1092 1096 svp = drp->r_server;
1093 1097 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1094 1098 supp_attrs = svp->sv_supp_attrs;
1095 1099 nfs_rw_exit(&svp->sv_lock);
1096 1100
1097 1101 /*
1098 1102 * For setgid case, we need to:
1099 1103 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1100 1104 */
1101 1105 argop[4].argop = OP_SAVEFH;
1102 1106
1103 1107 argop[5].argop = OP_CPUTFH;
1104 1108 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1105 1109
1106 1110 argop[6].argop = OP_GETATTR;
1107 1111 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1108 1112 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1109 1113
1110 1114 argop[7].argop = OP_RESTOREFH;
1111 1115
1112 1116 /*
1113 1117 * nverify
1114 1118 */
1115 1119 _v.va_mask = AT_GID;
1116 1120 _v.va_gid = in_va->va_gid;
1117 1121 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1118 1122 supp_attrs))) {
1119 1123
1120 1124 /*
1121 1125 * setattr
1122 1126 *
1123 1127 * We _know_ we're not messing with AT_SIZE or
1124 1128 * AT_XTIME, so no need for stateid or flags.
1125 1129 * Also we specify NULL rp since we're only
1126 1130 * interested in setting owner_group attributes.
1127 1131 */
1128 1132 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1129 1133 supp_attrs, &e.error, 0);
1130 1134 if (e.error)
1131 1135 nfs4args_verify_free(&argop[8]);
1132 1136 }
1133 1137
1134 1138 if (e.error) {
1135 1139 /*
1136 1140 * XXX - Revisit the last argument to nfs4_end_op()
1137 1141 * once 5020486 is fixed.
1138 1142 */
1139 1143 nfs4_end_open_seqid_sync(oop);
1140 1144 open_owner_rele(oop);
1141 1145 nfs4args_copen_free(open_args);
1142 1146 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1143 1147 if (ncr != NULL)
1144 1148 crfree(ncr);
1145 1149 kmem_free(argop, argoplist_size);
1146 1150 return (e.error);
1147 1151 }
1148 1152 } else if (create_flag) {
1149 1153 argop[1].argop = OP_SAVEFH;
1150 1154
1151 1155 argop[5].argop = OP_RESTOREFH;
1152 1156
1153 1157 argop[6].argop = OP_GETATTR;
1154 1158 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1155 1159 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1156 1160 }
1157 1161
1158 1162 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1159 1163 "nfs4open_otw: %s call, nm %s, rp %s",
1160 1164 needrecov ? "recov" : "first", file_name,
1161 1165 rnode4info(VTOR4(dvp))));
1162 1166
1163 1167 t = gethrtime();
1164 1168
1165 1169 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1166 1170
1167 1171 if (!e.error && nfs4_need_to_bump_seqid(&res))
1168 1172 nfs4_set_open_seqid(seqid, oop, args.ctag);
1169 1173
1170 1174 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1171 1175
1172 1176 if (e.error || needrecov) {
1173 1177 bool_t abort = FALSE;
1174 1178
1175 1179 if (needrecov) {
1176 1180 nfs4_bseqid_entry_t *bsep = NULL;
1177 1181
1178 1182 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1179 1183 cred_otw, vpi, dvp, open_args);
1180 1184
1181 1185 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1182 1186 bsep = nfs4_create_bseqid_entry(oop, NULL,
1183 1187 vpi, 0, args.ctag, open_args->seqid);
1184 1188 num_bseqid_retry--;
1185 1189 }
1186 1190
1187 1191 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1188 1192 NULL, lost_rqst.lr_op == OP_OPEN ?
1189 1193 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1190 1194
1191 1195 if (bsep)
1192 1196 kmem_free(bsep, sizeof (*bsep));
1193 1197 /* give up if we keep getting BAD_SEQID */
1194 1198 if (num_bseqid_retry == 0)
1195 1199 abort = TRUE;
1196 1200 if (abort == TRUE && e.error == 0)
1197 1201 e.error = geterrno4(res.status);
1198 1202 }
1199 1203 nfs4_end_open_seqid_sync(oop);
1200 1204 open_owner_rele(oop);
1201 1205 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1202 1206 nfs4args_copen_free(open_args);
1203 1207 if (setgid_flag) {
1204 1208 nfs4args_verify_free(&argop[8]);
1205 1209 nfs4args_setattr_free(&argop[9]);
1206 1210 }
1207 1211 if (!e.error)
1208 1212 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1209 1213 if (ncr != NULL) {
1210 1214 crfree(ncr);
1211 1215 ncr = NULL;
1212 1216 }
1213 1217 if (!needrecov || abort == TRUE || e.error == EINTR ||
1214 1218 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1215 1219 kmem_free(argop, argoplist_size);
1216 1220 return (e.error);
1217 1221 }
1218 1222 goto recov_retry;
1219 1223 }
1220 1224
1221 1225 /*
1222 1226 * Will check and update lease after checking the rflag for
1223 1227 * OPEN_CONFIRM in the successful OPEN call.
1224 1228 */
1225 1229 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1226 1230
1227 1231 /*
1228 1232 * XXX what if we're crossing mount points from server1:/drp
1229 1233 * to server2:/drp/rp.
1230 1234 */
1231 1235
1232 1236 /* Signal our end of use of the open seqid */
1233 1237 nfs4_end_open_seqid_sync(oop);
1234 1238
1235 1239 /*
1236 1240 * This will destroy the open owner if it was just created,
1237 1241 * and no one else has put a reference on it.
1238 1242 */
1239 1243 open_owner_rele(oop);
1240 1244 if (create_flag && (createmode != EXCLUSIVE4) &&
1241 1245 res.status == NFS4ERR_BADOWNER)
1242 1246 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1243 1247
1244 1248 e.error = geterrno4(res.status);
1245 1249 nfs4args_copen_free(open_args);
1246 1250 if (setgid_flag) {
1247 1251 nfs4args_verify_free(&argop[8]);
1248 1252 nfs4args_setattr_free(&argop[9]);
1249 1253 }
1250 1254 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1251 1255 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1252 1256 /*
1253 1257 * If the reply is NFS4ERR_ACCESS, it may be because
1254 1258 * we are root (no root net access). If the real uid
1255 1259 * is not root, then retry with the real uid instead.
1256 1260 */
1257 1261 if (ncr != NULL) {
1258 1262 crfree(ncr);
1259 1263 ncr = NULL;
1260 1264 }
1261 1265 if (res.status == NFS4ERR_ACCESS &&
1262 1266 (ncr = crnetadjust(cred_otw)) != NULL) {
1263 1267 cred_otw = ncr;
1264 1268 goto recov_retry;
1265 1269 }
1266 1270 kmem_free(argop, argoplist_size);
1267 1271 return (e.error);
1268 1272 }
1269 1273
1270 1274 resop = &res.array[idx_open]; /* open res */
1271 1275 op_res = &resop->nfs_resop4_u.opopen;
1272 1276
1273 1277 #ifdef DEBUG
1274 1278 /*
1275 1279 * verify attrset bitmap
1276 1280 */
1277 1281 if (create_flag &&
1278 1282 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1279 1283 /* make sure attrset returned is what we asked for */
1280 1284 /* XXX Ignore this 'error' for now */
1281 1285 if (attr->attrmask != op_res->attrset)
1282 1286 /* EMPTY */;
1283 1287 }
1284 1288 #endif
1285 1289
1286 1290 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1287 1291 mutex_enter(&VTOMI4(dvp)->mi_lock);
1288 1292 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1289 1293 mutex_exit(&VTOMI4(dvp)->mi_lock);
1290 1294 }
1291 1295
1292 1296 resop = &res.array[idx_open + 1]; /* getfh res */
1293 1297 gf_res = &resop->nfs_resop4_u.opgetfh;
1294 1298
1295 1299 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1296 1300
1297 1301 /*
1298 1302 * The open stateid has been updated on the server but not
1299 1303 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1300 1304 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1301 1305 * WRITE call. That, however, will use the old stateid, so go ahead
1302 1306  	 * and update the open stateid now, before any call to makenfs4node.
1303 1307 */
1304 1308 if (vpi) {
1305 1309 nfs4_open_stream_t *tmp_osp;
1306 1310 rnode4_t *tmp_rp = VTOR4(vpi);
1307 1311
1308 1312 tmp_osp = find_open_stream(oop, tmp_rp);
1309 1313 if (tmp_osp) {
1310 1314 tmp_osp->open_stateid = op_res->stateid;
1311 1315 mutex_exit(&tmp_osp->os_sync_lock);
1312 1316 open_stream_rele(tmp_osp, tmp_rp);
1313 1317 }
1314 1318
1315 1319 /*
1316 1320 * We must determine if the file handle given by the otw open
1317 1321 * is the same as the file handle which was passed in with
1318 1322 * *vpp. This case can be reached if the file we are trying
1319 1323 * to open has been removed and another file has been created
1320 1324 * having the same file name. The passed in vnode is released
1321 1325 * later.
1322 1326 */
1323 1327 orig_sfh = VTOR4(vpi)->r_fh;
1324 1328 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1325 1329 }
1326 1330
1327 1331 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1328 1332
1329 1333 if (create_flag || fh_differs) {
1330 1334 int rnode_err = 0;
1331 1335
1332 1336 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1333 1337 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1334 1338
1335 1339 if (e.error)
1336 1340 PURGE_ATTRCACHE4(vp);
1337 1341 /*
1338 1342 * For the newly created vp case, make sure the rnode
1339 1343 * isn't bad before using it.
1340 1344 */
1341 1345 mutex_enter(&(VTOR4(vp))->r_statelock);
1342 1346 if (VTOR4(vp)->r_flags & R4RECOVERR)
1343 1347 rnode_err = EIO;
1344 1348 mutex_exit(&(VTOR4(vp))->r_statelock);
1345 1349
1346 1350 if (rnode_err) {
1347 1351 nfs4_end_open_seqid_sync(oop);
1348 1352 nfs4args_copen_free(open_args);
1349 1353 if (setgid_flag) {
1350 1354 nfs4args_verify_free(&argop[8]);
1351 1355 nfs4args_setattr_free(&argop[9]);
1352 1356 }
1353 1357 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1354 1358 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1355 1359 needrecov);
1356 1360 open_owner_rele(oop);
1357 1361 VN_RELE(vp);
1358 1362 if (ncr != NULL)
1359 1363 crfree(ncr);
1360 1364 sfh4_rele(&otw_sfh);
1361 1365 kmem_free(argop, argoplist_size);
1362 1366 return (EIO);
1363 1367 }
1364 1368 } else {
1365 1369 vp = vpi;
1366 1370 }
1367 1371 sfh4_rele(&otw_sfh);
1368 1372
1369 1373 /*
1370 1374 * It seems odd to get a full set of attrs and then not update
1371 1375 * the object's attrcache in the non-create case. Create case uses
1372 1376 * the attrs since makenfs4node checks to see if the attrs need to
1373 1377 * be updated (and then updates them). The non-create case should
1374 1378 * update attrs also.
1375 1379 */
1376 1380 if (! create_flag && ! fh_differs && !e.error) {
1377 1381 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1378 1382 }
1379 1383
1380 1384 nfs4_error_zinit(&e);
1381 1385 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1382 1386 /* This does not do recovery for vp explicitly. */
1383 1387 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1384 1388 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1385 1389
1386 1390 if (e.error || e.stat) {
1387 1391 nfs4_end_open_seqid_sync(oop);
1388 1392 nfs4args_copen_free(open_args);
1389 1393 if (setgid_flag) {
1390 1394 nfs4args_verify_free(&argop[8]);
1391 1395 nfs4args_setattr_free(&argop[9]);
1392 1396 }
1393 1397 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1394 1398 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1395 1399 needrecov);
1396 1400 open_owner_rele(oop);
1397 1401 if (create_flag || fh_differs) {
1398 1402 /* rele the makenfs4node */
1399 1403 VN_RELE(vp);
1400 1404 }
1401 1405 if (ncr != NULL) {
1402 1406 crfree(ncr);
1403 1407 ncr = NULL;
1404 1408 }
1405 1409 if (retry_open == TRUE) {
1406 1410 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1407 1411 "nfs4open_otw: retry the open since OPEN "
1408 1412 "CONFIRM failed with error %d stat %d",
1409 1413 e.error, e.stat));
1410 1414 if (create_flag && createmode == GUARDED4) {
1411 1415 NFS4_DEBUG(nfs4_client_recov_debug,
1412 1416 (CE_NOTE, "nfs4open_otw: switch "
1413 1417 "createmode from GUARDED4 to "
1414 1418 "UNCHECKED4"));
1415 1419 createmode = UNCHECKED4;
1416 1420 }
1417 1421 goto recov_retry;
1418 1422 }
1419 1423 if (!e.error) {
1420 1424 if (create_flag && (createmode != EXCLUSIVE4) &&
1421 1425 e.stat == NFS4ERR_BADOWNER)
1422 1426 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1423 1427
1424 1428 e.error = geterrno4(e.stat);
1425 1429 }
1426 1430 kmem_free(argop, argoplist_size);
1427 1431 return (e.error);
1428 1432 }
1429 1433 }
1430 1434
1431 1435 rp = VTOR4(vp);
1432 1436
1433 1437 mutex_enter(&rp->r_statev4_lock);
1434 1438 if (create_flag)
1435 1439 rp->created_v4 = 1;
1436 1440 mutex_exit(&rp->r_statev4_lock);
1437 1441
1438 1442 mutex_enter(&oop->oo_lock);
1439 1443 /* Doesn't matter if 'oo_just_created' already was set as this */
1440 1444 oop->oo_just_created = NFS4_PERM_CREATED;
1441 1445 if (oop->oo_cred_otw)
1442 1446 crfree(oop->oo_cred_otw);
1443 1447 oop->oo_cred_otw = cred_otw;
1444 1448 crhold(oop->oo_cred_otw);
1445 1449 mutex_exit(&oop->oo_lock);
1446 1450
1447 1451 /* returns with 'os_sync_lock' held */
1448 1452 osp = find_or_create_open_stream(oop, rp, &created_osp);
1449 1453 if (!osp) {
1450 1454 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1451 1455 "nfs4open_otw: failed to create an open stream"));
1452 1456 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1453 1457 "signal our end of use of the open seqid"));
1454 1458
1455 1459 nfs4_end_open_seqid_sync(oop);
1456 1460 open_owner_rele(oop);
1457 1461 nfs4args_copen_free(open_args);
1458 1462 if (setgid_flag) {
1459 1463 nfs4args_verify_free(&argop[8]);
1460 1464 nfs4args_setattr_free(&argop[9]);
1461 1465 }
1462 1466 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1463 1467 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1464 1468 if (create_flag || fh_differs)
1465 1469 VN_RELE(vp);
1466 1470 if (ncr != NULL)
1467 1471 crfree(ncr);
1468 1472
1469 1473 kmem_free(argop, argoplist_size);
1470 1474 return (EINVAL);
1471 1475
1472 1476 }
1473 1477
1474 1478 osp->open_stateid = op_res->stateid;
1475 1479
1476 1480 if (open_flag & FREAD)
1477 1481 osp->os_share_acc_read++;
1478 1482 if (open_flag & FWRITE)
1479 1483 osp->os_share_acc_write++;
1480 1484 osp->os_share_deny_none++;
1481 1485
1482 1486 /*
1483 1487 * Need to reset this bitfield for the possible case where we were
1484 1488 * going to OTW CLOSE the file, got a non-recoverable error, and before
1485 1489 * we could retry the CLOSE, OPENed the file again.
1486 1490 */
1487 1491 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1488 1492 osp->os_final_close = 0;
1489 1493 osp->os_force_close = 0;
1490 1494 #ifdef DEBUG
1491 1495 if (osp->os_failed_reopen)
1492 1496 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1493 1497 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1494 1498 (void *)osp, (void *)cr, rnode4info(rp)));
1495 1499 #endif
1496 1500 osp->os_failed_reopen = 0;
1497 1501
1498 1502 mutex_exit(&osp->os_sync_lock);
1499 1503
1500 1504 nfs4_end_open_seqid_sync(oop);
1501 1505
1502 1506 if (created_osp && recov_state.rs_sp != NULL) {
1503 1507 mutex_enter(&recov_state.rs_sp->s_lock);
1504 1508 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1505 1509 mutex_exit(&recov_state.rs_sp->s_lock);
1506 1510 }
1507 1511
1508 1512 /* get rid of our reference to find oop */
1509 1513 open_owner_rele(oop);
1510 1514
1511 1515 open_stream_rele(osp, rp);
1512 1516
1513 1517 /* accept delegation, if any */
1514 1518 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1515 1519
1516 1520 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1517 1521
1518 1522 if (createmode == EXCLUSIVE4 &&
1519 1523 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1520 1524 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1521 1525 " EXCLUSIVE4: sending a SETATTR"));
1522 1526 /*
1523 1527 * If doing an exclusive create, then generate
1524 1528 * a SETATTR to set the initial attributes.
1525 1529 * Try to set the mtime and the atime to the
1526 1530 * server's current time. It is somewhat
1527 1531 * expected that these fields will be used to
1528 1532 * store the exclusive create cookie. If not,
1529 1533 * server implementors will need to know that
1530 1534 * a SETATTR will follow an exclusive create
1531 1535 * and the cookie should be destroyed if
1532 1536 * appropriate.
1533 1537 *
1534 1538 * The AT_GID and AT_SIZE bits are turned off
1535 1539 * so that the SETATTR request will not attempt
1536 1540 * to process these. The gid will be set
1537 1541 * separately if appropriate. The size is turned
1538 1542 * off because it is assumed that a new file will
1539 1543 * be created empty and if the file wasn't empty,
1540 1544 * then the exclusive create will have failed
1541 1545 * because the file must have existed already.
1542 1546 * Therefore, no truncate operation is needed.
1543 1547 */
1544 1548 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1545 1549 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1546 1550
1547 1551 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1548 1552 if (e.error) {
1549 1553 /*
1550 1554 * Couldn't correct the attributes of
1551 1555 * the newly created file and the
1552 1556 * attributes are wrong. Remove the
1553 1557 * file and return an error to the
1554 1558 * application.
1555 1559 */
1556 1560 /* XXX will this take care of client state ? */
1557 1561 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1558 1562 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1559 1563 " remove file", e.error));
1560 1564 VN_RELE(vp);
1561 1565 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1562 1566 /*
1563 1567 * Since we've reled the vnode and removed
1564 1568 * the file we now need to return the error.
1565 1569 * At this point we don't want to update the
1566 1570 * dircaches, call nfs4_waitfor_purge_complete
1567 1571 * or set vpp to vp so we need to skip these
1568 1572 * as well.
1569 1573 */
1570 1574 goto skip_update_dircaches;
1571 1575 }
1572 1576 }
1573 1577
1574 1578 /*
1575 1579 * If we created or found the correct vnode, due to create_flag or
1576 1580 * fh_differs being set, then update directory cache attribute, readdir
1577 1581 * and dnlc caches.
1578 1582 */
1579 1583 if (create_flag || fh_differs) {
1580 1584 dirattr_info_t dinfo, *dinfop;
1581 1585
1582 1586 /*
1583 1587 * Make sure getattr succeeded before using results.
1584 1588 * note: op 7 is getattr(dir) for both flavors of
1585 1589 * open(create).
1586 1590 */
1587 1591 if (create_flag && res.status == NFS4_OK) {
1588 1592 dinfo.di_time_call = t;
1589 1593 dinfo.di_cred = cr;
1590 1594 dinfo.di_garp =
1591 1595 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1592 1596 dinfop = &dinfo;
1593 1597 } else {
1594 1598 dinfop = NULL;
1595 1599 }
1596 1600
1597 1601 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1598 1602 dinfop);
1599 1603 }
1600 1604
1601 1605 /*
1602 1606 * If the page cache for this file was flushed from actions
1603 1607 * above, it was done asynchronously and if that is true,
1604 1608 * there is a need to wait here for it to complete. This must
1605 1609 * be done outside of start_fop/end_fop.
1606 1610 */
1607 1611 (void) nfs4_waitfor_purge_complete(vp);
1608 1612
1609 1613 /*
1610 1614 * It is implicit that we are in the open case (create_flag == 0) since
1611 1615 * fh_differs can only be set to a non-zero value in the open case.
1612 1616 */
1613 1617 if (fh_differs != 0 && vpi != NULL)
1614 1618 VN_RELE(vpi);
1615 1619
1616 1620 /*
1617 1621 * Be sure to set *vpp to the correct value before returning.
1618 1622 */
1619 1623 *vpp = vp;
1620 1624
1621 1625 skip_update_dircaches:
1622 1626
1623 1627 nfs4args_copen_free(open_args);
1624 1628 if (setgid_flag) {
1625 1629 nfs4args_verify_free(&argop[8]);
1626 1630 nfs4args_setattr_free(&argop[9]);
1627 1631 }
1628 1632 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1629 1633
1630 1634 if (ncr)
1631 1635 crfree(ncr);
1632 1636 kmem_free(argop, argoplist_size);
1633 1637 return (e.error);
1634 1638 }
1635 1639
1636 1640 /*
1637 1641 * Reopen an open instance. cf. nfs4open_otw().
1638 1642 *
1639 1643 * Errors are returned by the nfs4_error_t parameter.
1640 1644 * - ep->error contains an errno value or zero.
1641 1645 * - if it is zero, ep->stat is set to an NFS status code, if any.
1642 1646 * If the file could not be reopened, but the caller should continue, the
1643 1647 * file is marked dead and no error values are returned. If the caller
1644 1648 * should stop recovering open files and start over, either the ep->error
1645 1649 * value or ep->stat will indicate an error (either something that requires
1646 1650 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1647 1651 * filehandles) may be handled silently by this routine.
1648 1652 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1649 1653 * will be started, so the caller should not do it.
1650 1654 *
1651 1655 * Gotos:
1652 1656 * - kill_file : reopen failed in such a fashion to constitute marking the
1653 1657 * file dead and setting the open stream's 'os_failed_reopen' as 1. This
1654 1658 * is for cases where recovery is not possible.
1655 1659 * - failed_reopen : same as above, except that the file has already been
1656 1660 * marked dead, so no need to do it again.
1657 1661 * - bailout : reopen failed but we are able to recover and retry the reopen -
1658 1662 * either within this function immediately or via the calling function.
1659 1663 */
1660 1664
1661 1665 void
1662 1666 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1663 1667 open_claim_type4 claim, bool_t frc_use_claim_previous,
1664 1668 bool_t is_recov)
1665 1669 {
1666 1670 COMPOUND4args_clnt args;
1667 1671 COMPOUND4res_clnt res;
1668 1672 nfs_argop4 argop[4];
1669 1673 nfs_resop4 *resop;
1670 1674 OPEN4res *op_res = NULL;
1671 1675 OPEN4cargs *open_args;
1672 1676 GETFH4res *gf_res;
1673 1677 rnode4_t *rp = VTOR4(vp);
1674 1678 int doqueue = 1;
1675 1679 cred_t *cr = NULL, *cred_otw = NULL;
1676 1680 nfs4_open_owner_t *oop = NULL;
1677 1681 seqid4 seqid;
1678 1682 nfs4_ga_res_t *garp;
1679 1683 char fn[MAXNAMELEN];
1680 1684 nfs4_recov_state_t recov = {NULL, 0};
1681 1685 nfs4_lost_rqst_t lost_rqst;
1682 1686 mntinfo4_t *mi = VTOMI4(vp);
1683 1687 bool_t abort;
1684 1688 char *failed_msg = "";
1685 1689 int fh_different;
1686 1690 hrtime_t t;
1687 1691 nfs4_bseqid_entry_t *bsep = NULL;
1688 1692
1689 1693 ASSERT(nfs4_consistent_type(vp));
1690 1694 ASSERT(nfs_zone() == mi->mi_zone);
1691 1695
1692 1696 nfs4_error_zinit(ep);
1693 1697
1694 1698 /* this is the cred used to find the open owner */
1695 1699 cr = state_to_cred(osp);
1696 1700 if (cr == NULL) {
1697 1701 failed_msg = "Couldn't reopen: no cred";
1698 1702 goto kill_file;
1699 1703 }
1700 1704 /* use this cred for OTW operations */
1701 1705 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1702 1706
1703 1707 top:
1704 1708 nfs4_error_zinit(ep);
1705 1709
1706 1710 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1707 1711 /* File system has been unmounted, quit */
1708 1712 ep->error = EIO;
1709 1713 failed_msg = "Couldn't reopen: file system has been unmounted";
1710 1714 goto kill_file;
1711 1715 }
1712 1716
1713 1717 oop = osp->os_open_owner;
1714 1718
1715 1719 ASSERT(oop != NULL);
1716 1720 if (oop == NULL) { /* be defensive in non-DEBUG */
1717 1721 failed_msg = "can't reopen: no open owner";
1718 1722 goto kill_file;
1719 1723 }
1720 1724 open_owner_hold(oop);
1721 1725
1722 1726 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1723 1727 if (ep->error) {
1724 1728 open_owner_rele(oop);
1725 1729 oop = NULL;
1726 1730 goto bailout;
1727 1731 }
1728 1732
1729 1733 /*
1730 1734 * If the rnode has a delegation and the delegation has been
1731 1735 * recovered and the server didn't request a recall and the caller
1732 1736 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1733 1737 * recovery) and the rnode hasn't been marked dead, then install
1734 1738 * the delegation stateid in the open stream. Otherwise, proceed
1735 1739 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1736 1740 */
1737 1741 mutex_enter(&rp->r_statev4_lock);
1738 1742 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1739 1743 !rp->r_deleg_return_pending &&
1740 1744 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1741 1745 !rp->r_deleg_needs_recall &&
1742 1746 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1743 1747 !(rp->r_flags & R4RECOVERR)) {
1744 1748 mutex_enter(&osp->os_sync_lock);
1745 1749 osp->os_delegation = 1;
1746 1750 osp->open_stateid = rp->r_deleg_stateid;
1747 1751 mutex_exit(&osp->os_sync_lock);
1748 1752 mutex_exit(&rp->r_statev4_lock);
1749 1753 goto bailout;
1750 1754 }
1751 1755 mutex_exit(&rp->r_statev4_lock);
1752 1756
1753 1757 /*
1754 1758 * If the file failed recovery, just quit. This failure need not
1755 1759 * affect other reopens, so don't return an error.
1756 1760 */
1757 1761 mutex_enter(&rp->r_statelock);
1758 1762 if (rp->r_flags & R4RECOVERR) {
1759 1763 mutex_exit(&rp->r_statelock);
1760 1764 ep->error = 0;
1761 1765 goto failed_reopen;
1762 1766 }
1763 1767 mutex_exit(&rp->r_statelock);
1764 1768
1765 1769 /*
1766 1770 * argop is empty here
1767 1771 *
1768 1772 * PUTFH, OPEN, GETATTR
1769 1773 */
1770 1774 args.ctag = TAG_REOPEN;
1771 1775 args.array_len = 4;
1772 1776 args.array = argop;
1773 1777
1774 1778 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1775 1779 "nfs4_reopen: file is type %d, id %s",
1776 1780 vp->v_type, rnode4info(VTOR4(vp))));
1777 1781
1778 1782 argop[0].argop = OP_CPUTFH;
1779 1783
1780 1784 if (claim != CLAIM_PREVIOUS) {
1781 1785 /*
1782 1786 * if this is a file mount then
1783 1787 * use the mntinfo parentfh
1784 1788 */
1785 1789 argop[0].nfs_argop4_u.opcputfh.sfh =
1786 1790 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1787 1791 VTOSV(vp)->sv_dfh;
1788 1792 } else {
1789 1793 /* putfh fh to reopen */
1790 1794 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1791 1795 }
1792 1796
1793 1797 argop[1].argop = OP_COPEN;
1794 1798 open_args = &argop[1].nfs_argop4_u.opcopen;
1795 1799 open_args->claim = claim;
1796 1800
1797 1801 if (claim == CLAIM_NULL) {
1798 1802
1799 1803 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1800 1804 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1801 1805 "failed for vp 0x%p for CLAIM_NULL with %m",
1802 1806 (void *)vp);
1803 1807 failed_msg = "Couldn't reopen: vtoname failed for "
1804 1808 "CLAIM_NULL";
1805 1809 /* nothing allocated yet */
1806 1810 goto kill_file;
1807 1811 }
1808 1812
1809 1813 open_args->open_claim4_u.cfile = fn;
1810 1814 } else if (claim == CLAIM_PREVIOUS) {
1811 1815
1812 1816 /*
1813 1817 * We have two cases to deal with here:
1814 1818 * 1) We're being called to reopen files in order to satisfy
1815 1819 * a lock operation request which requires us to explicitly
1816 1820 * reopen files which were opened under a delegation. If
1817 1821 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1818 1822 * that case, frc_use_claim_previous is TRUE and we must
1819 1823 * use the rnode's current delegation type (r_deleg_type).
1820 1824 * 2) We're reopening files during some form of recovery.
1821 1825 * In this case, frc_use_claim_previous is FALSE and we
1822 1826 * use the delegation type appropriate for recovery
1823 1827 * (r_deleg_needs_recovery).
1824 1828 */
1825 1829 mutex_enter(&rp->r_statev4_lock);
1826 1830 open_args->open_claim4_u.delegate_type =
1827 1831 frc_use_claim_previous ?
1828 1832 rp->r_deleg_type :
1829 1833 rp->r_deleg_needs_recovery;
1830 1834 mutex_exit(&rp->r_statev4_lock);
1831 1835
1832 1836 } else if (claim == CLAIM_DELEGATE_CUR) {
1833 1837
1834 1838 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1835 1839 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1836 1840 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1837 1841 "with %m", (void *)vp);
1838 1842 failed_msg = "Couldn't reopen: vtoname failed for "
1839 1843 "CLAIM_DELEGATE_CUR";
1840 1844 /* nothing allocated yet */
1841 1845 goto kill_file;
1842 1846 }
1843 1847
1844 1848 mutex_enter(&rp->r_statev4_lock);
1845 1849 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1846 1850 rp->r_deleg_stateid;
1847 1851 mutex_exit(&rp->r_statev4_lock);
1848 1852
1849 1853 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1850 1854 }
1851 1855 open_args->opentype = OPEN4_NOCREATE;
1852 1856 open_args->owner.clientid = mi2clientid(mi);
1853 1857 open_args->owner.owner_len = sizeof (oop->oo_name);
1854 1858 open_args->owner.owner_val =
1855 1859 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1856 1860 bcopy(&oop->oo_name, open_args->owner.owner_val,
1857 1861 open_args->owner.owner_len);
1858 1862 open_args->share_access = 0;
1859 1863 open_args->share_deny = 0;
1860 1864
1861 1865 mutex_enter(&osp->os_sync_lock);
1862 1866 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1863 1867 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1864 1868 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1865 1869 (void *)osp, (void *)rp, osp->os_share_acc_read,
1866 1870 osp->os_share_acc_write, osp->os_open_ref_count,
1867 1871 osp->os_mmap_read, osp->os_mmap_write, claim));
1868 1872
1869 1873 if (osp->os_share_acc_read || osp->os_mmap_read)
1870 1874 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1871 1875 if (osp->os_share_acc_write || osp->os_mmap_write)
1872 1876 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1873 1877 if (osp->os_share_deny_read)
1874 1878 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1875 1879 if (osp->os_share_deny_write)
1876 1880 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1877 1881 mutex_exit(&osp->os_sync_lock);
1878 1882
1879 1883 seqid = nfs4_get_open_seqid(oop) + 1;
1880 1884 open_args->seqid = seqid;
1881 1885
1882 1886 /* Construct the getfh part of the compound */
1883 1887 argop[2].argop = OP_GETFH;
1884 1888
1885 1889 /* Construct the getattr part of the compound */
1886 1890 argop[3].argop = OP_GETATTR;
1887 1891 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1888 1892 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1889 1893
1890 1894 t = gethrtime();
1891 1895
1892 1896 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1893 1897
1894 1898 if (ep->error) {
1895 1899 if (!is_recov && !frc_use_claim_previous &&
1896 1900 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1897 1901 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1898 1902 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1899 1903 cred_otw, vp, NULL, open_args);
1900 1904 abort = nfs4_start_recovery(ep,
1901 1905 VTOMI4(vp), vp, NULL, NULL,
1902 1906 lost_rqst.lr_op == OP_OPEN ?
1903 1907 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1904 1908 nfs4args_copen_free(open_args);
1905 1909 goto bailout;
1906 1910 }
1907 1911
1908 1912 nfs4args_copen_free(open_args);
1909 1913
1910 1914 if (ep->error == EACCES && cred_otw != cr) {
1911 1915 crfree(cred_otw);
1912 1916 cred_otw = cr;
1913 1917 crhold(cred_otw);
1914 1918 nfs4_end_open_seqid_sync(oop);
1915 1919 open_owner_rele(oop);
1916 1920 oop = NULL;
1917 1921 goto top;
1918 1922 }
1919 1923 if (ep->error == ETIMEDOUT)
1920 1924 goto bailout;
1921 1925 failed_msg = "Couldn't reopen: rpc error";
1922 1926 goto kill_file;
1923 1927 }
1924 1928
1925 1929 if (nfs4_need_to_bump_seqid(&res))
1926 1930 nfs4_set_open_seqid(seqid, oop, args.ctag);
1927 1931
1928 1932 switch (res.status) {
1929 1933 case NFS4_OK:
1930 1934 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1931 1935 mutex_enter(&rp->r_statelock);
1932 1936 rp->r_delay_interval = 0;
1933 1937 mutex_exit(&rp->r_statelock);
1934 1938 }
1935 1939 break;
1936 1940 case NFS4ERR_BAD_SEQID:
1937 1941 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1938 1942 args.ctag, open_args->seqid);
1939 1943
1940 1944 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1941 1945 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1942 1946 NULL, OP_OPEN, bsep, NULL, NULL);
1943 1947
1944 1948 nfs4args_copen_free(open_args);
1945 1949 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1946 1950 nfs4_end_open_seqid_sync(oop);
1947 1951 open_owner_rele(oop);
1948 1952 oop = NULL;
1949 1953 kmem_free(bsep, sizeof (*bsep));
1950 1954
1951 1955 goto kill_file;
1952 1956 case NFS4ERR_NO_GRACE:
1953 1957 nfs4args_copen_free(open_args);
1954 1958 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1955 1959 nfs4_end_open_seqid_sync(oop);
1956 1960 open_owner_rele(oop);
1957 1961 oop = NULL;
1958 1962 if (claim == CLAIM_PREVIOUS) {
1959 1963 /*
1960 1964 * Retry as a plain open. We don't need to worry about
1961 1965 * checking the changeinfo: it is acceptable for a
1962 1966 * client to re-open a file and continue processing
1963 1967 * (in the absence of locks).
1964 1968 */
1965 1969 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1966 1970 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1967 1971 "will retry as CLAIM_NULL"));
1968 1972 claim = CLAIM_NULL;
1969 1973 nfs4_mi_kstat_inc_no_grace(mi);
1970 1974 goto top;
1971 1975 }
1972 1976 failed_msg =
1973 1977 "Couldn't reopen: tried reclaim outside grace period. ";
1974 1978 goto kill_file;
1975 1979 case NFS4ERR_GRACE:
1976 1980 nfs4_set_grace_wait(mi);
1977 1981 nfs4args_copen_free(open_args);
1978 1982 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1979 1983 nfs4_end_open_seqid_sync(oop);
1980 1984 open_owner_rele(oop);
1981 1985 oop = NULL;
1982 1986 ep->error = nfs4_wait_for_grace(mi, &recov);
1983 1987 if (ep->error != 0)
1984 1988 goto bailout;
1985 1989 goto top;
1986 1990 case NFS4ERR_DELAY:
1987 1991 nfs4_set_delay_wait(vp);
1988 1992 nfs4args_copen_free(open_args);
1989 1993 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1990 1994 nfs4_end_open_seqid_sync(oop);
1991 1995 open_owner_rele(oop);
1992 1996 oop = NULL;
1993 1997 ep->error = nfs4_wait_for_delay(vp, &recov);
1994 1998 nfs4_mi_kstat_inc_delay(mi);
1995 1999 if (ep->error != 0)
1996 2000 goto bailout;
1997 2001 goto top;
1998 2002 case NFS4ERR_FHEXPIRED:
1999 2003 /* recover filehandle and retry */
2000 2004 abort = nfs4_start_recovery(ep,
2001 2005 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2002 2006 nfs4args_copen_free(open_args);
2003 2007 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2004 2008 nfs4_end_open_seqid_sync(oop);
2005 2009 open_owner_rele(oop);
2006 2010 oop = NULL;
2007 2011 if (abort == FALSE)
2008 2012 goto top;
2009 2013 failed_msg = "Couldn't reopen: recovery aborted";
2010 2014 goto kill_file;
2011 2015 case NFS4ERR_RESOURCE:
2012 2016 case NFS4ERR_STALE_CLIENTID:
2013 2017 case NFS4ERR_WRONGSEC:
2014 2018 case NFS4ERR_EXPIRED:
2015 2019 /*
2016 2020 * Do not mark the file dead and let the calling
2017 2021 * function initiate recovery.
2018 2022 */
2019 2023 nfs4args_copen_free(open_args);
2020 2024 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2021 2025 nfs4_end_open_seqid_sync(oop);
2022 2026 open_owner_rele(oop);
2023 2027 oop = NULL;
2024 2028 goto bailout;
2025 2029 case NFS4ERR_ACCESS:
2026 2030 if (cred_otw != cr) {
2027 2031 crfree(cred_otw);
2028 2032 cred_otw = cr;
2029 2033 crhold(cred_otw);
2030 2034 nfs4args_copen_free(open_args);
2031 2035 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2032 2036 nfs4_end_open_seqid_sync(oop);
2033 2037 open_owner_rele(oop);
2034 2038 oop = NULL;
2035 2039 goto top;
2036 2040 }
2037 2041 /* fall through */
2038 2042 default:
2039 2043 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2040 2044 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2041 2045 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2042 2046 rnode4info(VTOR4(vp))));
2043 2047 failed_msg = "Couldn't reopen: NFSv4 error";
2044 2048 nfs4args_copen_free(open_args);
2045 2049 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2046 2050 goto kill_file;
2047 2051 }
2048 2052
2049 2053 resop = &res.array[1]; /* open res */
2050 2054 op_res = &resop->nfs_resop4_u.opopen;
2051 2055
2052 2056 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2053 2057
2054 2058 /*
2055 2059 * Check if the path we reopened really is the same
2056 2060 * file. We could end up in a situation where the file
2057 2061 * was removed and a new file created with the same name.
2058 2062 */
2059 2063 resop = &res.array[2];
2060 2064 gf_res = &resop->nfs_resop4_u.opgetfh;
2061 2065 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2062 2066 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2063 2067 if (fh_different) {
2064 2068 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2065 2069 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2066 2070 /* Oops, we don't have the same file */
2067 2071 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2068 2072 failed_msg = "Couldn't reopen: Persistent "
2069 2073 "file handle changed";
2070 2074 else
2071 2075 failed_msg = "Couldn't reopen: Volatile "
2072 2076 "(no expire on open) file handle changed";
2073 2077
2074 2078 nfs4args_copen_free(open_args);
2075 2079 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2076 2080 nfs_rw_exit(&mi->mi_fh_lock);
2077 2081 goto kill_file;
2078 2082
2079 2083 } else {
2080 2084 /*
2081 2085 * We have volatile file handles that don't compare.
2082 2086 * If the fids are the same then we assume that the
2083 2087 * file handle expired but the rnode still refers to
2084 2088 * the same file object.
2085 2089 *
2086 2090 * First check that we have fids or not.
2087 2091 * If we don't we have a dumb server so we will
2088 2092 * just assume every thing is ok for now.
2089 2093 */
2090 2094 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2091 2095 rp->r_attr.va_mask & AT_NODEID &&
2092 2096 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2093 2097 /*
2094 2098 * We have fids, but they don't
2095 2099 * compare. So kill the file.
2096 2100 */
2097 2101 failed_msg =
2098 2102 "Couldn't reopen: file handle changed"
2099 2103 " due to mismatched fids";
2100 2104 nfs4args_copen_free(open_args);
2101 2105 (void) xdr_free(xdr_COMPOUND4res_clnt,
2102 2106 (caddr_t)&res);
2103 2107 nfs_rw_exit(&mi->mi_fh_lock);
2104 2108 goto kill_file;
2105 2109 } else {
2106 2110 /*
2107 2111 * We have volatile file handles that refers
2108 2112 * to the same file (at least they have the
2109 2113 * same fid) or we don't have fids so we
2110 2114 * can't tell. :(. We'll be a kind and accepting
2111 2115 * client so we'll update the rnode's file
2112 2116 * handle with the otw handle.
2113 2117 *
2114 2118 * We need to drop mi->mi_fh_lock since
2115 2119 * sh4_update acquires it. Since there is
2116 2120 * only one recovery thread there is no
2117 2121 * race.
2118 2122 */
2119 2123 nfs_rw_exit(&mi->mi_fh_lock);
2120 2124 sfh4_update(rp->r_fh, &gf_res->object);
2121 2125 }
2122 2126 }
2123 2127 } else {
2124 2128 nfs_rw_exit(&mi->mi_fh_lock);
2125 2129 }
2126 2130
2127 2131 ASSERT(nfs4_consistent_type(vp));
2128 2132
2129 2133 /*
2130 2134 * If the server wanted an OPEN_CONFIRM but that fails, just start
2131 2135 * over. Presumably if there is a persistent error it will show up
2132 2136 * when we resend the OPEN.
2133 2137 */
2134 2138 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2135 2139 bool_t retry_open = FALSE;
2136 2140
2137 2141 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2138 2142 cred_otw, is_recov, &retry_open,
2139 2143 oop, FALSE, ep, NULL);
2140 2144 if (ep->error || ep->stat) {
2141 2145 nfs4args_copen_free(open_args);
2142 2146 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2143 2147 nfs4_end_open_seqid_sync(oop);
2144 2148 open_owner_rele(oop);
2145 2149 oop = NULL;
2146 2150 goto top;
2147 2151 }
2148 2152 }
2149 2153
2150 2154 mutex_enter(&osp->os_sync_lock);
2151 2155 osp->open_stateid = op_res->stateid;
2152 2156 osp->os_delegation = 0;
2153 2157 /*
2154 2158 * Need to reset this bitfield for the possible case where we were
2155 2159 * going to OTW CLOSE the file, got a non-recoverable error, and before
2156 2160 * we could retry the CLOSE, OPENed the file again.
2157 2161 */
2158 2162 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2159 2163 osp->os_final_close = 0;
2160 2164 osp->os_force_close = 0;
2161 2165 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2162 2166 osp->os_dc_openacc = open_args->share_access;
2163 2167 mutex_exit(&osp->os_sync_lock);
2164 2168
2165 2169 nfs4_end_open_seqid_sync(oop);
2166 2170
2167 2171 /* accept delegation, if any */
2168 2172 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2169 2173
2170 2174 nfs4args_copen_free(open_args);
2171 2175
2172 2176 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2173 2177
2174 2178 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2175 2179
2176 2180 ASSERT(nfs4_consistent_type(vp));
2177 2181
2178 2182 open_owner_rele(oop);
2179 2183 crfree(cr);
2180 2184 crfree(cred_otw);
2181 2185 return;
2182 2186
2183 2187 kill_file:
2184 2188 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2185 2189 failed_reopen:
2186 2190 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2187 2191 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2188 2192 (void *)osp, (void *)cr, rnode4info(rp)));
2189 2193 mutex_enter(&osp->os_sync_lock);
2190 2194 osp->os_failed_reopen = 1;
2191 2195 mutex_exit(&osp->os_sync_lock);
2192 2196 bailout:
2193 2197 if (oop != NULL) {
2194 2198 nfs4_end_open_seqid_sync(oop);
2195 2199 open_owner_rele(oop);
2196 2200 }
2197 2201 if (cr != NULL)
2198 2202 crfree(cr);
2199 2203 if (cred_otw != NULL)
2200 2204 crfree(cred_otw);
2201 2205 }
2202 2206
2203 2207 /* for . and .. OPENs */
2204 2208 /* ARGSUSED */
2205 2209 static int
2206 2210 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2207 2211 {
2208 2212 rnode4_t *rp;
2209 2213 nfs4_ga_res_t gar;
2210 2214
2211 2215 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2212 2216
2213 2217 /*
2214 2218 * If close-to-open consistency checking is turned off or
2215 2219 * if there is no cached data, we can avoid
2216 2220 * the over the wire getattr. Otherwise, force a
2217 2221 * call to the server to get fresh attributes and to
2218 2222 * check caches. This is required for close-to-open
2219 2223 * consistency.
2220 2224 */
2221 2225 rp = VTOR4(*vpp);
2222 2226 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2223 2227 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2224 2228 return (0);
2225 2229
2226 2230 gar.n4g_va.va_mask = AT_ALL;
2227 2231 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2228 2232 }
2229 2233
2230 2234 /*
2231 2235 * CLOSE a file
2232 2236 */
2233 2237 /* ARGSUSED */
2234 2238 static int
2235 2239 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2236 2240 caller_context_t *ct)
2237 2241 {
2238 2242 rnode4_t *rp;
2239 2243 int error = 0;
2240 2244 int r_error = 0;
2241 2245 int n4error = 0;
2242 2246 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2243 2247
2244 2248 /*
2245 2249 * Remove client state for this (lockowner, file) pair.
2246 2250 * Issue otw v4 call to have the server do the same.
2247 2251 */
2248 2252
2249 2253 rp = VTOR4(vp);
2250 2254
2251 2255 /*
2252 2256 * zone_enter(2) prevents processes from changing zones with NFS files
2253 2257 * open; if we happen to get here from the wrong zone we can't do
2254 2258 * anything over the wire.
2255 2259 */
2256 2260 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2257 2261 /*
2258 2262 * We could attempt to clean up locks, except we're sure
2259 2263 * that the current process didn't acquire any locks on
2260 2264 * the file: any attempt to lock a file belong to another zone
2261 2265 * will fail, and one can't lock an NFS file and then change
2262 2266 * zones, as that fails too.
2263 2267 *
2264 2268 * Returning an error here is the sane thing to do. A
2265 2269 * subsequent call to VN_RELE() which translates to a
2266 2270 * nfs4_inactive() will clean up state: if the zone of the
2267 2271 * vnode's origin is still alive and kicking, the inactive
2268 2272 * thread will handle the request (from the correct zone), and
2269 2273 * everything (minus the OTW close call) should be OK. If the
2270 2274 * zone is going away nfs4_async_inactive() will throw away
2271 2275 * delegations, open streams and cached pages inline.
2272 2276 */
2273 2277 return (EIO);
2274 2278 }
2275 2279
2276 2280 /*
2277 2281 * If we are using local locking for this filesystem, then
2278 2282 * release all of the SYSV style record locks. Otherwise,
2279 2283 * we are doing network locking and we need to release all
2280 2284 * of the network locks. All of the locks held by this
2281 2285 * process on this file are released no matter what the
2282 2286 * incoming reference count is.
2283 2287 */
2284 2288 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2285 2289 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2286 2290 cleanshares(vp, ttoproc(curthread)->p_pid);
2287 2291 } else
2288 2292 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2289 2293
2290 2294 if (e.error) {
2291 2295 struct lm_sysid *lmsid;
2292 2296 lmsid = nfs4_find_sysid(VTOMI4(vp));
2293 2297 if (lmsid == NULL) {
2294 2298 DTRACE_PROBE2(unknown__sysid, int, e.error,
2295 2299 vnode_t *, vp);
2296 2300 } else {
2297 2301 cleanlocks(vp, ttoproc(curthread)->p_pid,
2298 2302 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2299 2303 }
2300 2304 return (e.error);
2301 2305 }
2302 2306
2303 2307 if (count > 1)
2304 2308 return (0);
2305 2309
2306 2310 /*
2307 2311 * If the file has been `unlinked', then purge the
2308 2312 * DNLC so that this vnode will get reycled quicker
2309 2313 * and the .nfs* file on the server will get removed.
2310 2314 */
2311 2315 if (rp->r_unldvp != NULL)
2312 2316 dnlc_purge_vp(vp);
2313 2317
2314 2318 /*
2315 2319 * If the file was open for write and there are pages,
2316 2320 * do a synchronous flush and commit of all of the
2317 2321 * dirty and uncommitted pages.
2318 2322 */
2319 2323 ASSERT(!e.error);
2320 2324 if ((flag & FWRITE) && nfs4_has_pages(vp))
2321 2325 error = nfs4_putpage_commit(vp, 0, 0, cr);
2322 2326
2323 2327 mutex_enter(&rp->r_statelock);
2324 2328 r_error = rp->r_error;
2325 2329 rp->r_error = 0;
2326 2330 mutex_exit(&rp->r_statelock);
2327 2331
2328 2332 /*
2329 2333 * If this file type is one for which no explicit 'open' was
2330 2334 * done, then bail now (ie. no need for protocol 'close'). If
2331 2335 * there was an error w/the vm subsystem, return _that_ error,
2332 2336 * otherwise, return any errors that may've been reported via
2333 2337 * the rnode.
2334 2338 */
2335 2339 if (vp->v_type != VREG)
2336 2340 return (error ? error : r_error);
2337 2341
2338 2342 /*
2339 2343 * The sync putpage commit may have failed above, but since
2340 2344 * we're working w/a regular file, we need to do the protocol
2341 2345 * 'close' (nfs4close_one will figure out if an otw close is
2342 2346 * needed or not). Report any errors _after_ doing the protocol
2343 2347 * 'close'.
2344 2348 */
2345 2349 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2346 2350 n4error = e.error ? e.error : geterrno4(e.stat);
2347 2351
2348 2352 /*
2349 2353 * Error reporting prio (Hi -> Lo)
2350 2354 *
2351 2355 * i) nfs4_putpage_commit (error)
2352 2356 * ii) rnode's (r_error)
2353 2357 * iii) nfs4close_one (n4error)
2354 2358 */
2355 2359 return (error ? error : (r_error ? r_error : n4error));
2356 2360 }
2357 2361
2358 2362 /*
2359 2363 * Initialize *lost_rqstp.
2360 2364 */
2361 2365
2362 2366 static void
2363 2367 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2364 2368 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2365 2369 vnode_t *vp)
2366 2370 {
2367 2371 if (error != ETIMEDOUT && error != EINTR &&
2368 2372 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2369 2373 lost_rqstp->lr_op = 0;
2370 2374 return;
2371 2375 }
2372 2376
2373 2377 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2374 2378 "nfs4close_save_lost_rqst: error %d", error));
2375 2379
2376 2380 lost_rqstp->lr_op = OP_CLOSE;
2377 2381 /*
2378 2382 * The vp is held and rele'd via the recovery code.
2379 2383 * See nfs4_save_lost_rqst.
2380 2384 */
2381 2385 lost_rqstp->lr_vp = vp;
2382 2386 lost_rqstp->lr_dvp = NULL;
2383 2387 lost_rqstp->lr_oop = oop;
2384 2388 lost_rqstp->lr_osp = osp;
2385 2389 ASSERT(osp != NULL);
2386 2390 ASSERT(mutex_owned(&osp->os_sync_lock));
2387 2391 osp->os_pending_close = 1;
2388 2392 lost_rqstp->lr_lop = NULL;
2389 2393 lost_rqstp->lr_cr = cr;
2390 2394 lost_rqstp->lr_flk = NULL;
2391 2395 lost_rqstp->lr_putfirst = FALSE;
2392 2396 }
2393 2397
2394 2398 /*
2395 2399 * Assumes you already have the open seqid sync grabbed as well as the
2396 2400 * 'os_sync_lock'. Note: this will release the open seqid sync and
2397 2401 * 'os_sync_lock' if client recovery starts. Calling functions have to
2398 2402 * be prepared to handle this.
2399 2403 *
2400 2404 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2401 2405 * was needed and was started, and that the calling function should retry
2402 2406 * this function; otherwise it is returned as 0.
2403 2407 *
2404 2408 * Errors are returned via the nfs4_error_t parameter.
2405 2409 */
2406 2410 static void
2407 2411 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2408 2412 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2409 2413 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2410 2414 {
2411 2415 COMPOUND4args_clnt args;
2412 2416 COMPOUND4res_clnt res;
2413 2417 CLOSE4args *close_args;
2414 2418 nfs_resop4 *resop;
2415 2419 nfs_argop4 argop[3];
2416 2420 int doqueue = 1;
2417 2421 mntinfo4_t *mi;
2418 2422 seqid4 seqid;
2419 2423 vnode_t *vp;
2420 2424 bool_t needrecov = FALSE;
2421 2425 nfs4_lost_rqst_t lost_rqst;
2422 2426 hrtime_t t;
2423 2427
2424 2428 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2425 2429
2426 2430 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2427 2431
2428 2432 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2429 2433
2430 2434 /* Only set this to 1 if recovery is started */
2431 2435 *recov = 0;
2432 2436
2433 2437 /* do the OTW call to close the file */
2434 2438
2435 2439 if (close_type == CLOSE_RESEND)
2436 2440 args.ctag = TAG_CLOSE_LOST;
2437 2441 else if (close_type == CLOSE_AFTER_RESEND)
2438 2442 args.ctag = TAG_CLOSE_UNDO;
2439 2443 else
2440 2444 args.ctag = TAG_CLOSE;
2441 2445
2442 2446 args.array_len = 3;
2443 2447 args.array = argop;
2444 2448
2445 2449 vp = RTOV4(rp);
2446 2450
2447 2451 mi = VTOMI4(vp);
2448 2452
2449 2453 /* putfh target fh */
2450 2454 argop[0].argop = OP_CPUTFH;
2451 2455 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2452 2456
2453 2457 argop[1].argop = OP_GETATTR;
2454 2458 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2455 2459 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2456 2460
2457 2461 argop[2].argop = OP_CLOSE;
2458 2462 close_args = &argop[2].nfs_argop4_u.opclose;
2459 2463
2460 2464 seqid = nfs4_get_open_seqid(oop) + 1;
2461 2465
2462 2466 close_args->seqid = seqid;
2463 2467 close_args->open_stateid = osp->open_stateid;
2464 2468
2465 2469 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2466 2470 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2467 2471 rnode4info(rp)));
2468 2472
2469 2473 t = gethrtime();
2470 2474
2471 2475 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2472 2476
2473 2477 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2474 2478 nfs4_set_open_seqid(seqid, oop, args.ctag);
2475 2479 }
2476 2480
2477 2481 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2478 2482 if (ep->error && !needrecov) {
2479 2483 /*
2480 2484 * if there was an error and no recovery is to be done
2481 2485 * then then set up the file to flush its cache if
2482 2486 * needed for the next caller.
2483 2487 */
2484 2488 mutex_enter(&rp->r_statelock);
2485 2489 PURGE_ATTRCACHE4_LOCKED(rp);
2486 2490 rp->r_flags &= ~R4WRITEMODIFIED;
2487 2491 mutex_exit(&rp->r_statelock);
2488 2492 return;
2489 2493 }
2490 2494
2491 2495 if (needrecov) {
2492 2496 bool_t abort;
2493 2497 nfs4_bseqid_entry_t *bsep = NULL;
2494 2498
2495 2499 if (close_type != CLOSE_RESEND)
2496 2500 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2497 2501 osp, cred_otw, vp);
2498 2502
2499 2503 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2500 2504 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2501 2505 0, args.ctag, close_args->seqid);
2502 2506
2503 2507 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2504 2508 "nfs4close_otw: initiating recovery. error %d "
2505 2509 "res.status %d", ep->error, res.status));
2506 2510
2507 2511 /*
2508 2512 * Drop the 'os_sync_lock' here so we don't hit
2509 2513 * a potential recursive mutex_enter via an
2510 2514 * 'open_stream_hold()'.
2511 2515 */
2512 2516 mutex_exit(&osp->os_sync_lock);
2513 2517 *have_sync_lockp = 0;
2514 2518 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2515 2519 (close_type != CLOSE_RESEND &&
2516 2520 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2517 2521 OP_CLOSE, bsep, NULL, NULL);
2518 2522
2519 2523 /* drop open seq sync, and let the calling function regrab it */
2520 2524 nfs4_end_open_seqid_sync(oop);
2521 2525 *did_start_seqid_syncp = 0;
2522 2526
2523 2527 if (bsep)
2524 2528 kmem_free(bsep, sizeof (*bsep));
2525 2529 /*
2526 2530 * For signals, the caller wants to quit, so don't say to
2527 2531 * retry. For forced unmount, if it's a user thread, it
2528 2532 * wants to quit. If it's a recovery thread, the retry
2529 2533 * will happen higher-up on the call stack. Either way,
2530 2534 * don't say to retry.
2531 2535 */
2532 2536 if (abort == FALSE && ep->error != EINTR &&
2533 2537 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2534 2538 close_type != CLOSE_RESEND &&
2535 2539 close_type != CLOSE_AFTER_RESEND)
2536 2540 *recov = 1;
2537 2541 else
2538 2542 *recov = 0;
2539 2543
2540 2544 if (!ep->error)
2541 2545 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2542 2546 return;
2543 2547 }
2544 2548
2545 2549 if (res.status) {
2546 2550 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2547 2551 return;
2548 2552 }
2549 2553
2550 2554 mutex_enter(&rp->r_statev4_lock);
2551 2555 rp->created_v4 = 0;
2552 2556 mutex_exit(&rp->r_statev4_lock);
2553 2557
2554 2558 resop = &res.array[2];
2555 2559 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2556 2560 osp->os_valid = 0;
2557 2561
2558 2562 /*
2559 2563 * This removes the reference obtained at OPEN; ie, when the
2560 2564 * open stream structure was created.
2561 2565 *
2562 2566 * We don't have to worry about calling 'open_stream_rele'
2563 2567 * since we our currently holding a reference to the open
2564 2568 * stream which means the count cannot go to 0 with this
2565 2569 * decrement.
2566 2570 */
2567 2571 ASSERT(osp->os_ref_count >= 2);
2568 2572 osp->os_ref_count--;
2569 2573
2570 2574 if (!ep->error)
2571 2575 nfs4_attr_cache(vp,
2572 2576 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2573 2577 t, cred_otw, TRUE, NULL);
2574 2578
2575 2579 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2576 2580 " returning %d", ep->error));
2577 2581
2578 2582 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2579 2583 }
2580 2584
/*
 * nfs4_read: VOP_READ entry point for NFSv4 regular files.
 *
 * Transfers data starting at uiop->uio_loffset into the caller's uio.
 * The read is served either directly over the wire (when caching is
 * disabled, or client-side direct I/O is in effect and the file has no
 * mappings and no cached pages) or through the VM page cache, one
 * MAXBSIZE-aligned window at a time, via vpm or segmap.
 *
 * Returns 0 on success, or an errno: EISDIR for non-regular files,
 * EIO for cross-zone access or a pending recovery error, EINVAL for a
 * negative or overflowing offset, EINTR if the wait for a cache purge
 * is interrupted by a signal.  The caller must already hold r_rwlock
 * as reader (asserted below).
 */
2581 2585 /* ARGSUSED */
2582 2586 static int
2583 2587 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2584 2588 caller_context_t *ct)
2585 2589 {
2586 2590 rnode4_t *rp;
2587 2591 u_offset_t off;
2588 2592 offset_t diff;
2589 2593 uint_t on;
2590 2594 uint_t n;
2591 2595 caddr_t base;
2592 2596 uint_t flags;
2593 2597 int error;
2594 2598 mntinfo4_t *mi;
2595 2599
2596 2600 rp = VTOR4(vp);
2597 2601
2598 2602 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2599 2603
		/* Operate on the real vnode if this is a shadow vnode. */
2600 2604 if (IS_SHADOW(vp, rp))
2601 2605 vp = RTOV4(rp);
2602 2606
2603 2607 if (vp->v_type != VREG)
2604 2608 return (EISDIR);
2605 2609
2606 2610 mi = VTOMI4(vp);
2607 2611
2608 2612 if (nfs_zone() != mi->mi_zone)
2609 2613 return (EIO);
2610 2614
2611 2615 if (uiop->uio_resid == 0)
2612 2616 return (0);
2613 2617
2614 2618 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2615 2619 return (EINVAL);
2616 2620
		/*
		 * If recovery marked this rnode, fail early with the saved
		 * error (or EIO if none was recorded).
		 */
2617 2621 mutex_enter(&rp->r_statelock);
2618 2622 if (rp->r_flags & R4RECOVERRP)
2619 2623 error = (rp->r_error ? rp->r_error : EIO);
2620 2624 else
2621 2625 error = 0;
2622 2626 mutex_exit(&rp->r_statelock);
2623 2627 if (error)
2624 2628 return (error);
2625 2629
2626 2630 /*
2627 2631 * Bypass VM if caching has been disabled (e.g., locking) or if
2628 2632 * using client-side direct I/O and the file is not mmap'd and
2629 2633 * there are no cached pages.
2630 2634 */
2631 2635 if ((vp->v_flag & VNOCACHE) ||
2632 2636 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2633 2637 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2634 2638 size_t resid = 0;
2635 2639
2636 2640 return (nfs4read(vp, NULL, uiop->uio_loffset,
2637 2641 uiop->uio_resid, &resid, cr, FALSE, uiop));
2638 2642 }
2639 2643
2640 2644 error = 0;
2641 2645
	/*
	 * Cached path: copy through the page cache one MAXBSIZE-aligned
	 * window per iteration until the uio is satisfied, an error
	 * occurs, or we reach (cached) EOF.
	 */
2642 2646 do {
2643 2647 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2644 2648 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2645 2649 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2646 2650
2647 2651 if (error = nfs4_validate_caches(vp, cr))
2648 2652 break;
2649 2653
		/*
		 * Wait out any in-progress cache purge before trusting
		 * r_size; bail with EINTR if a signal arrives.
		 */
2650 2654 mutex_enter(&rp->r_statelock);
2651 2655 while (rp->r_flags & R4INCACHEPURGE) {
2652 2656 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2653 2657 mutex_exit(&rp->r_statelock);
2654 2658 return (EINTR);
2655 2659 }
2656 2660 }
2657 2661 diff = rp->r_size - uiop->uio_loffset;
2658 2662 mutex_exit(&rp->r_statelock);
		/* diff <= 0: at or past EOF per our cached size; done. */
2659 2663 if (diff <= 0)
2660 2664 break;
2661 2665 if (diff < n)
2662 2666 n = (uint_t)diff;
2663 2667
2664 2668 if (vpm_enable) {
2665 2669 /*
2666 2670 * Copy data.
2667 2671 */
2668 2672 error = vpm_data_copy(vp, off + on, n, uiop,
2669 2673 1, NULL, 0, S_READ);
2670 2674 } else {
2671 2675 base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2672 2676 S_READ);
2673 2677
2674 2678 error = uiomove(base + on, n, UIO_READ, uiop);
2675 2679 }
2676 2680
2677 2681 if (!error) {
2678 2682 /*
2679 2683 * If read a whole block or read to eof,
2680 2684 * won't need this buffer again soon.
2681 2685 */
2682 2686 mutex_enter(&rp->r_statelock);
2683 2687 if (n + on == MAXBSIZE ||
2684 2688 uiop->uio_loffset == rp->r_size)
2685 2689 flags = SM_DONTNEED;
2686 2690 else
2687 2691 flags = 0;
2688 2692 mutex_exit(&rp->r_statelock);
2689 2693 if (vpm_enable) {
2690 2694 error = vpm_sync_pages(vp, off, n, flags);
2691 2695 } else {
2692 2696 error = segmap_release(segkmap, base, flags);
2693 2697 }
2694 2698 } else {
			/* Error: release the mapping, ignore sync status. */
2695 2699 if (vpm_enable) {
2696 2700 (void) vpm_sync_pages(vp, off, n, 0);
2697 2701 } else {
2698 2702 (void) segmap_release(segkmap, base, 0);
2699 2703 }
2700 2704 }
2701 2705 } while (!error && uiop->uio_resid > 0);
2702 2706
2703 2707 return (error);
2704 2708 }
2705 2709
/*
 * nfs4_write: VOP_WRITE entry point for NFSv4 regular files.
 *
 * Handles O_APPEND serialization (upgrading r_rwlock to writer and
 * fetching the server's current size), enforces the process file-size
 * limit (posting the RLIMIT_FSIZE rctl action and returning EFBIG
 * when nothing at all can be written), and then performs the write
 * either directly over the wire (VNOCACHE / direct I/O with no
 * mappings or cached pages) or through the VM page cache via
 * writerp4().  On error, the uio is rewound to the start of the
 * failed chunk; on success, any resid trimmed by the limit check is
 * folded back in.  Returns 0 or an errno.
 */
2706 2710 /* ARGSUSED */
2707 2711 static int
2708 2712 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2709 2713 caller_context_t *ct)
2710 2714 {
2711 2715 rlim64_t limit = uiop->uio_llimit;
2712 2716 rnode4_t *rp;
2713 2717 u_offset_t off;
2714 2718 caddr_t base;
2715 2719 uint_t flags;
2716 2720 int remainder;
2717 2721 size_t n;
2718 2722 int on;
2719 2723 int error;
2720 2724 int resid;
2721 2725 u_offset_t offset;
2722 2726 mntinfo4_t *mi;
2723 2727 uint_t bsize;
2724 2728
2725 2729 rp = VTOR4(vp);
2726 2730
		/* Operate on the real vnode if this is a shadow vnode. */
2727 2731 if (IS_SHADOW(vp, rp))
2728 2732 vp = RTOV4(rp);
2729 2733
2730 2734 if (vp->v_type != VREG)
2731 2735 return (EISDIR);
2732 2736
2733 2737 mi = VTOMI4(vp);
2734 2738
2735 2739 if (nfs_zone() != mi->mi_zone)
2736 2740 return (EIO);
2737 2741
2738 2742 if (uiop->uio_resid == 0)
2739 2743 return (0);
2740 2744
		/* Fail early if recovery has marked this rnode bad. */
2741 2745 mutex_enter(&rp->r_statelock);
2742 2746 if (rp->r_flags & R4RECOVERRP)
2743 2747 error = (rp->r_error ? rp->r_error : EIO);
2744 2748 else
2745 2749 error = 0;
2746 2750 mutex_exit(&rp->r_statelock);
2747 2751 if (error)
2748 2752 return (error);
2749 2753
2750 2754 if (ioflag & FAPPEND) {
2751 2755 struct vattr va;
2752 2756
2753 2757 /*
2754 2758 * Must serialize if appending.
2755 2759 */
2756 2760 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2757 2761 nfs_rw_exit(&rp->r_rwlock);
2758 2762 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2759 2763 INTR4(vp)))
2760 2764 return (EINTR);
2761 2765 }
2762 2766
		/* Append starts at the server's idea of the file size. */
2763 2767 va.va_mask = AT_SIZE;
2764 2768 error = nfs4getattr(vp, &va, cr);
2765 2769 if (error)
2766 2770 return (error);
2767 2771 uiop->uio_loffset = va.va_size;
2768 2772 }
2769 2773
2770 2774 offset = uiop->uio_loffset + uiop->uio_resid;
2771 2775
2772 2776 if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2773 2777 return (EINVAL);
2774 2778
2775 2779 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2776 2780 limit = MAXOFFSET_T;
2777 2781
2778 2782 /*
2779 2783 * Check to make sure that the process will not exceed
2780 2784 * its limit on file size. It is okay to write up to
2781 2785 * the limit, but not beyond. Thus, the write which
2782 2786 * reaches the limit will be short and the next write
2783 2787 * will return an error.
2784 2788 */
2785 2789 remainder = 0;
2786 2790 if (offset > uiop->uio_llimit) {
2787 2791 remainder = offset - uiop->uio_llimit;
2788 2792 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2789 2793 if (uiop->uio_resid <= 0) {
2790 2794 proc_t *p = ttoproc(curthread);
2791 2795
			/* Entirely beyond the limit: deliver SIGXFSZ, EFBIG. */
2792 2796 uiop->uio_resid += remainder;
2793 2797 mutex_enter(&p->p_lock);
2794 2798 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2795 2799 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2796 2800 mutex_exit(&p->p_lock);
2797 2801 return (EFBIG);
2798 2802 }
2799 2803 }
2800 2804
2801 2805 /* update the change attribute, if we have a write delegation */
2802 2806
2803 2807 mutex_enter(&rp->r_statev4_lock);
2804 2808 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2805 2809 rp->r_deleg_change++;
2806 2810
2807 2811 mutex_exit(&rp->r_statev4_lock);
2808 2812
	/* Reader side of lkserlock: excludes lock-manager serialization. */
2809 2813 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2810 2814 return (EINTR);
2811 2815
2812 2816 /*
2813 2817 * Bypass VM if caching has been disabled (e.g., locking) or if
2814 2818 * using client-side direct I/O and the file is not mmap'd and
2815 2819 * there are no cached pages.
2816 2820 */
2817 2821 if ((vp->v_flag & VNOCACHE) ||
2818 2822 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2819 2823 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2820 2824 size_t bufsize;
2821 2825 int count;
2822 2826 u_offset_t org_offset;
2823 2827 stable_how4 stab_comm;
	/*
	 * Direct (uncached) write path.  Also reached via goto from the
	 * cached path below when faulting in a page for a write-only
	 * file fails with EACCES.
	 */
2824 2828 nfs4_fwrite:
2825 2829 if (rp->r_flags & R4STALE) {
2826 2830 resid = uiop->uio_resid;
2827 2831 offset = uiop->uio_loffset;
2828 2832 error = rp->r_error;
2829 2833 /*
2830 2834 * A close may have cleared r_error, if so,
2831 2835 * propagate ESTALE error return properly
2832 2836 */
2833 2837 if (error == 0)
2834 2838 error = ESTALE;
2835 2839 goto bottom;
2836 2840 }
2837 2841
		/* Bounce buffer: uiomove in, then write OTW chunkwise. */
2838 2842 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2839 2843 base = kmem_alloc(bufsize, KM_SLEEP);
2840 2844 do {
2841 2845 if (ioflag & FDSYNC)
2842 2846 stab_comm = DATA_SYNC4;
2843 2847 else
2844 2848 stab_comm = FILE_SYNC4;
2845 2849 resid = uiop->uio_resid;
2846 2850 offset = uiop->uio_loffset;
2847 2851 count = MIN(uiop->uio_resid, bufsize);
2848 2852 org_offset = uiop->uio_loffset;
2849 2853 error = uiomove(base, count, UIO_WRITE, uiop);
2850 2854 if (!error) {
2851 2855 error = nfs4write(vp, base, org_offset,
2852 2856 count, cr, &stab_comm);
2853 2857 if (!error) {
2854 2858 mutex_enter(&rp->r_statelock);
2855 2859 if (rp->r_size < uiop->uio_loffset)
2856 2860 rp->r_size = uiop->uio_loffset;
2857 2861 mutex_exit(&rp->r_statelock);
2858 2862 }
2859 2863 }
2860 2864 } while (!error && uiop->uio_resid > 0);
2861 2865 kmem_free(base, bufsize);
2862 2866 goto bottom;
2863 2867 }
2864 2868
2865 2869 bsize = vp->v_vfsp->vfs_bsize;
2866 2870
	/* Cached path: dirty pages one MAXBSIZE window per iteration. */
2867 2871 do {
2868 2872 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2869 2873 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2870 2874 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2871 2875
2872 2876 resid = uiop->uio_resid;
2873 2877 offset = uiop->uio_loffset;
2874 2878
2875 2879 if (rp->r_flags & R4STALE) {
2876 2880 error = rp->r_error;
2877 2881 /*
2878 2882 * A close may have cleared r_error, if so,
2879 2883 * propagate ESTALE error return properly
2880 2884 */
2881 2885 if (error == 0)
2882 2886 error = ESTALE;
2883 2887 break;
2884 2888 }
2885 2889
2886 2890 /*
2887 2891 * Don't create dirty pages faster than they
2888 2892 * can be cleaned so that the system doesn't
2889 2893 * get imbalanced. If the async queue is
2890 2894 * maxed out, then wait for it to drain before
2891 2895 * creating more dirty pages. Also, wait for
2892 2896 * any threads doing pagewalks in the vop_getattr
2893 2897 * entry points so that they don't block for
2894 2898 * long periods.
2895 2899 */
2896 2900 mutex_enter(&rp->r_statelock);
2897 2901 while ((mi->mi_max_threads != 0 &&
2898 2902 rp->r_awcount > 2 * mi->mi_max_threads) ||
2899 2903 rp->r_gcount > 0) {
2900 2904 if (INTR4(vp)) {
2901 2905 klwp_t *lwp = ttolwp(curthread);
2902 2906
				/* lwp_nostop guards against /proc stops here. */
2903 2907 if (lwp != NULL)
2904 2908 lwp->lwp_nostop++;
2905 2909 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2906 2910 mutex_exit(&rp->r_statelock);
2907 2911 if (lwp != NULL)
2908 2912 lwp->lwp_nostop--;
2909 2913 error = EINTR;
2910 2914 goto bottom;
2911 2915 }
2912 2916 if (lwp != NULL)
2913 2917 lwp->lwp_nostop--;
2914 2918 } else
2915 2919 cv_wait(&rp->r_cv, &rp->r_statelock);
2916 2920 }
2917 2921 mutex_exit(&rp->r_statelock);
2918 2922
2919 2923 /*
2920 2924 * Touch the page and fault it in if it is not in core
2921 2925 * before segmap_getmapflt or vpm_data_copy can lock it.
2922 2926 * This is to avoid the deadlock if the buffer is mapped
2923 2927 * to the same file through mmap which we want to write.
2924 2928 */
2925 2929 uio_prefaultpages((long)n, uiop);
2926 2930
2927 2931 if (vpm_enable) {
2928 2932 /*
2929 2933 * It will use kpm mappings, so no need to
2930 2934 * pass an address.
2931 2935 */
2932 2936 error = writerp4(rp, NULL, n, uiop, 0);
2933 2937 } else {
2934 2938 if (segmap_kpm) {
2935 2939 int pon = uiop->uio_loffset & PAGEOFFSET;
2936 2940 size_t pn = MIN(PAGESIZE - pon,
2937 2941 uiop->uio_resid);
2938 2942 int pagecreate;
2939 2943
				/*
				 * Can create (not fault in) the page when
				 * writing a whole page or extending past EOF.
				 */
2940 2944 mutex_enter(&rp->r_statelock);
2941 2945 pagecreate = (pon == 0) && (pn == PAGESIZE ||
2942 2946 uiop->uio_loffset + pn >= rp->r_size);
2943 2947 mutex_exit(&rp->r_statelock);
2944 2948
2945 2949 base = segmap_getmapflt(segkmap, vp, off + on,
2946 2950 pn, !pagecreate, S_WRITE);
2947 2951
2948 2952 error = writerp4(rp, base + pon, n, uiop,
2949 2953 pagecreate);
2950 2954
2951 2955 } else {
2952 2956 base = segmap_getmapflt(segkmap, vp, off + on,
2953 2957 n, 0, S_READ);
2954 2958 error = writerp4(rp, base + on, n, uiop, 0);
2955 2959 }
2956 2960 }
2957 2961
2958 2962 if (!error) {
2959 2963 if (mi->mi_flags & MI4_NOAC)
2960 2964 flags = SM_WRITE;
2961 2965 else if ((uiop->uio_loffset % bsize) == 0 ||
2962 2966 IS_SWAPVP(vp)) {
2963 2967 /*
2964 2968 * Have written a whole block.
2965 2969 * Start an asynchronous write
2966 2970 * and mark the buffer to
2967 2971 * indicate that it won't be
2968 2972 * needed again soon.
2969 2973 */
2970 2974 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
2971 2975 } else
2972 2976 flags = 0;
			/* Synchronous writes and ENOSPC recovery go out now. */
2973 2977 if ((ioflag & (FSYNC|FDSYNC)) ||
2974 2978 (rp->r_flags & R4OUTOFSPACE)) {
2975 2979 flags &= ~SM_ASYNC;
2976 2980 flags |= SM_WRITE;
2977 2981 }
2978 2982 if (vpm_enable) {
2979 2983 error = vpm_sync_pages(vp, off, n, flags);
2980 2984 } else {
2981 2985 error = segmap_release(segkmap, base, flags);
2982 2986 }
2983 2987 } else {
2984 2988 if (vpm_enable) {
2985 2989 (void) vpm_sync_pages(vp, off, n, 0);
2986 2990 } else {
2987 2991 (void) segmap_release(segkmap, base, 0);
2988 2992 }
2989 2993 /*
2990 2994 * In the event that we got an access error while
2991 2995 * faulting in a page for a write-only file just
2992 2996 * force a write.
2993 2997 */
2994 2998 if (error == EACCES)
2995 2999 goto nfs4_fwrite;
2996 3000 }
2997 3001 } while (!error && uiop->uio_resid > 0);
2998 3002
	/*
	 * On error, rewind the uio to the state captured at the start of
	 * the failed chunk; on success, restore any resid trimmed by the
	 * file-size limit check, and under a write delegation stamp the
	 * cached mtime/ctime locally.
	 */
2999 3003 bottom:
3000 3004 if (error) {
3001 3005 uiop->uio_resid = resid + remainder;
3002 3006 uiop->uio_loffset = offset;
3003 3007 } else {
3004 3008 uiop->uio_resid += remainder;
3005 3009
3006 3010 mutex_enter(&rp->r_statev4_lock);
3007 3011 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3008 3012 gethrestime(&rp->r_attr.va_mtime);
3009 3013 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3010 3014 }
3011 3015 mutex_exit(&rp->r_statev4_lock);
3012 3016 }
3013 3017
3014 3018 nfs_rw_exit(&rp->r_lkserlock);
3015 3019
3016 3020 return (error);
3017 3021 }
3018 3022
3019 3023 /*
3020 3024 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 *
 * Perform paged I/O on the page list 'pp' covering [off, off + len):
 * build a buf with pageio_setup/bp_mapin, issue it via nfs4_bio(),
 * then tag every page's p_fsdata so a later commit pass knows whether
 * the data went out UNSTABLE4 (C_DELAYCOMMIT) or is already stable
 * (C_NOCOMMIT).  Async writes are sent UNSTABLE4 only while free
 * memory is above desfree.  Returns the error from nfs4_bio().
3021 3025 */
3022 3026 static int
3023 3027 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3024 3028 int flags, cred_t *cr)
3025 3029 {
3026 3030 struct buf *bp;
3027 3031 int error;
3028 3032 page_t *savepp;
3029 3033 uchar_t fsdata;
3030 3034 stable_how4 stab_comm;
3031 3035
3032 3036 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3033 3037 bp = pageio_setup(pp, len, vp, flags);
3034 3038 ASSERT(bp != NULL);
3035 3039
3036 3040 /*
3037 3041 * pageio_setup should have set b_addr to 0. This
3038 3042 * is correct since we want to do I/O on a page
3039 3043 * boundary. bp_mapin will use this addr to calculate
3040 3044 * an offset, and then set b_addr to the kernel virtual
3041 3045 * address it allocated for us.
3042 3046 */
3043 3047 ASSERT(bp->b_un.b_addr == 0);
3044 3048
3045 3049 bp->b_edev = 0;
3046 3050 bp->b_dev = 0;
3047 3051 bp->b_lblkno = lbtodb(off);
3048 3052 bp->b_file = vp;
3049 3053 bp->b_offset = (offset_t)off;
3050 3054 bp_mapin(bp);
3051 3055
	/*
	 * Async writes may go out unstable (to be committed later)
	 * unless memory is tight, in which case make them stable so the
	 * pages can be freed immediately.
	 */
3052 3056 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3053 3057 freemem > desfree)
3054 3058 stab_comm = UNSTABLE4;
3055 3059 else
3056 3060 stab_comm = FILE_SYNC4;
3057 3061
3058 3062 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3059 3063
3060 3064 bp_mapout(bp);
3061 3065 pageio_done(bp);
3062 3066
	/* nfs4_bio updates stab_comm with how the server actually wrote. */
3063 3067 if (stab_comm == UNSTABLE4)
3064 3068 fsdata = C_DELAYCOMMIT;
3065 3069 else
3066 3070 fsdata = C_NOCOMMIT;
3067 3071
	/* Tag every page in the (circular) list with the commit state. */
3068 3072 savepp = pp;
3069 3073 do {
3070 3074 pp->p_fsdata = fsdata;
3071 3075 } while ((pp = pp->p_next) != savepp);
3072 3076
3073 3077 return (error);
3074 3078 }
3075 3079
3076 3080 /*
 * nfs4rdwr_check_osid: helper for nfs4read/nfs4write after a request
 * using a delegation stateid failed with NFS4ERR_BAD_STATEID.
 *
 * Looks up the open owner and open stream for this cred/rnode.  If
 * the stream was created purely for a delegation (os_delegation set),
 * there is no server-side open stateid yet, so reopen the file via
 * CLAIM_NULL before the caller retries.  Returns 0 if the retry can
 * proceed, EIO if no usable open state exists or the reopen failed.
3077 3081 */
3078 3082 static int
3079 3083 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3080 3084 {
3081 3085 nfs4_open_owner_t *oop;
3082 3086 nfs4_open_stream_t *osp;
3083 3087 rnode4_t *rp = VTOR4(vp);
3084 3088 mntinfo4_t *mi = VTOMI4(vp);
3085 3089 int reopen_needed;
3086 3090
3087 3091 ASSERT(nfs_zone() == mi->mi_zone);
3088 3092
3089 3093
3090 3094 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3091 3095 if (!oop)
3092 3096 return (EIO);
3093 3097
3094 3098 /* returns with 'os_sync_lock' held */
3095 3099 osp = find_open_stream(oop, rp);
3096 3100 if (!osp) {
3097 3101 open_owner_rele(oop);
3098 3102 return (EIO);
3099 3103 }
3100 3104
	/* A previous reopen attempt already failed: give up. */
3101 3105 if (osp->os_failed_reopen) {
3102 3106 mutex_exit(&osp->os_sync_lock);
3103 3107 open_stream_rele(osp, rp);
3104 3108 open_owner_rele(oop);
3105 3109 return (EIO);
3106 3110 }
3107 3111
3108 3112 /*
3109 3113 * Determine whether a reopen is needed. If this
3110 3114 * is a delegation open stream, then the os_delegation bit
3111 3115 * should be set.
3112 3116 */
3113 3117
3114 3118 reopen_needed = osp->os_delegation;
3115 3119
3116 3120 mutex_exit(&osp->os_sync_lock);
3117 3121 open_owner_rele(oop);
3118 3122
3119 3123 if (reopen_needed) {
3120 3124 nfs4_error_zinit(ep);
3121 3125 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
		/* Re-check under the lock that the reopen actually stuck. */
3122 3126 mutex_enter(&osp->os_sync_lock);
3123 3127 if (ep->error || ep->stat || osp->os_failed_reopen) {
3124 3128 mutex_exit(&osp->os_sync_lock);
3125 3129 open_stream_rele(osp, rp);
3126 3130 return (EIO);
3127 3131 }
3128 3132 mutex_exit(&osp->os_sync_lock);
3129 3133 }
3130 3134 open_stream_rele(osp, rp);
3131 3135
3132 3136 return (0);
3133 3137 }
3134 3138
3135 3139 /*
3136 3140 * Write to file. Writes to remote server in largest size
3137 3141 * chunks that the server can handle. Write is synchronous.
 *
 * On entry *stab_comm holds the requested stability (DATA_SYNC4 /
 * FILE_SYNC4 / UNSTABLE4); on return it is FILE_SYNC4 unless the
 * server replied UNSTABLE4 for some chunk.  OLD_STATEID and
 * delegation BAD_STATEID errors are retried here with progressively
 * different stateids, outside the normal recovery framework; other
 * recoverable errors go through nfs4_start_recovery.  Returns 0 or
 * an errno.
3138 3142 */
3139 3143 static int
3140 3144 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3141 3145 stable_how4 *stab_comm)
3142 3146 {
3143 3147 mntinfo4_t *mi;
3144 3148 COMPOUND4args_clnt args;
3145 3149 COMPOUND4res_clnt res;
3146 3150 WRITE4args *wargs;
3147 3151 WRITE4res *wres;
3148 3152 nfs_argop4 argop[2];
3149 3153 nfs_resop4 *resop;
3150 3154 int tsize;
3151 3155 stable_how4 stable;
3152 3156 rnode4_t *rp;
3153 3157 int doqueue = 1;
3154 3158 bool_t needrecov;
3155 3159 nfs4_recov_state_t recov_state;
3156 3160 nfs4_stateid_types_t sid_types;
3157 3161 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3158 3162 int recov;
3159 3163
3160 3164 rp = VTOR4(vp);
3161 3165 mi = VTOMI4(vp);
3162 3166
3163 3167 ASSERT(nfs_zone() == mi->mi_zone);
3164 3168
3165 3169 stable = *stab_comm;
3166 3170 *stab_comm = FILE_SYNC4;
3167 3171
3168 3172 needrecov = FALSE;
3169 3173 recov_state.rs_flags = 0;
3170 3174 recov_state.rs_num_retry_despite_err = 0;
3171 3175 nfs4_init_stateid_types(&sid_types);
3172 3176
3173 3177 /* Is curthread the recovery thread? */
3174 3178 mutex_enter(&mi->mi_lock);
3175 3179 recov = (mi->mi_recovthread == curthread);
3176 3180 mutex_exit(&mi->mi_lock);
3177 3181
3178 3182 recov_retry:
3179 3183 args.ctag = TAG_WRITE;
3180 3184 args.array_len = 2;
3181 3185 args.array = argop;
3182 3186
	/*
	 * The recovery thread skips nfs4_start_fop/nfs4_end_fop --
	 * presumably because it must not block on the recovery that it
	 * is itself driving (NOTE(review): confirm against the recovery
	 * framework docs).
	 */
3183 3187 if (!recov) {
3184 3188 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3185 3189 &recov_state, NULL);
3186 3190 if (e.error)
3187 3191 return (e.error);
3188 3192 }
3189 3193
3190 3194 /* 0. putfh target fh */
3191 3195 argop[0].argop = OP_CPUTFH;
3192 3196 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3193 3197
3194 3198 /* 1. write */
3195 3199 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3196 3200
	/* Loop until 'count' bytes have been written or an error occurs. */
3197 3201 do {
3198 3202
3199 3203 wargs->offset = (offset4)offset;
3200 3204 wargs->data_val = base;
3201 3205
3202 3206 if (mi->mi_io_kstats) {
3203 3207 mutex_enter(&mi->mi_lock);
3204 3208 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3205 3209 mutex_exit(&mi->mi_lock);
3206 3210 }
3207 3211
		/* Direct I/O uses the server transfer size, not curwrite. */
3208 3212 if ((vp->v_flag & VNOCACHE) ||
3209 3213 (rp->r_flags & R4DIRECTIO) ||
3210 3214 (mi->mi_flags & MI4_DIRECTIO))
3211 3215 tsize = MIN(mi->mi_stsize, count);
3212 3216 else
3213 3217 tsize = MIN(mi->mi_curwrite, count);
3214 3218 wargs->data_len = (uint_t)tsize;
3215 3219 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3216 3220
3217 3221 if (mi->mi_io_kstats) {
3218 3222 mutex_enter(&mi->mi_lock);
3219 3223 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3220 3224 mutex_exit(&mi->mi_lock);
3221 3225 }
3222 3226
3223 3227 if (!recov) {
3224 3228 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3225 3229 if (e.error && !needrecov) {
3226 3230 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3227 3231 &recov_state, needrecov);
3228 3232 return (e.error);
3229 3233 }
3230 3234 } else {
3231 3235 if (e.error)
3232 3236 return (e.error);
3233 3237 }
3234 3238
3235 3239 /*
3236 3240 * Do handling of OLD_STATEID outside
3237 3241 * of the normal recovery framework.
3238 3242 *
3239 3243 * If write receives a BAD stateid error while using a
3240 3244 * delegation stateid, retry using the open stateid (if it
3241 3245 * exists). If it doesn't have an open stateid, reopen the
3242 3246 * file first, then retry.
3243 3247 */
3244 3248 if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3245 3249 sid_types.cur_sid_type != SPEC_SID) {
3246 3250 nfs4_save_stateid(&wargs->stateid, &sid_types);
3247 3251 if (!recov)
3248 3252 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3249 3253 &recov_state, needrecov);
3250 3254 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3251 3255 goto recov_retry;
3252 3256 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3253 3257 sid_types.cur_sid_type == DEL_SID) {
			/*
			 * The delegation stateid went bad: flag the
			 * delegation for return, ensure an open stateid
			 * exists (reopening if needed), then retry.
			 */
3254 3258 nfs4_save_stateid(&wargs->stateid, &sid_types);
3255 3259 mutex_enter(&rp->r_statev4_lock);
3256 3260 rp->r_deleg_return_pending = TRUE;
3257 3261 mutex_exit(&rp->r_statev4_lock);
3258 3262 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3259 3263 if (!recov)
3260 3264 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3261 3265 &recov_state, needrecov);
3262 3266 (void) xdr_free(xdr_COMPOUND4res_clnt,
3263 3267 (caddr_t)&res);
3264 3268 return (EIO);
3265 3269 }
3266 3270 if (!recov)
3267 3271 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3268 3272 &recov_state, needrecov);
3269 3273 /* hold needed for nfs4delegreturn_thread */
3270 3274 VN_HOLD(vp);
3271 3275 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3272 3276 NFS4_DR_DISCARD), FALSE);
3273 3277 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3274 3278 goto recov_retry;
3275 3279 }
3276 3280
3277 3281 if (needrecov) {
3278 3282 bool_t abort;
3279 3283
3280 3284 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3281 3285 "nfs4write: client got error %d, res.status %d"
3282 3286 ", so start recovery", e.error, res.status));
3283 3287
3284 3288 abort = nfs4_start_recovery(&e,
3285 3289 VTOMI4(vp), vp, NULL, &wargs->stateid,
3286 3290 NULL, OP_WRITE, NULL, NULL, NULL);
3287 3291 if (!e.error) {
3288 3292 e.error = geterrno4(res.status);
3289 3293 (void) xdr_free(xdr_COMPOUND4res_clnt,
3290 3294 (caddr_t)&res);
3291 3295 }
3292 3296 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3293 3297 &recov_state, needrecov);
3294 3298 if (abort == FALSE)
3295 3299 goto recov_retry;
3296 3300 return (e.error);
3297 3301 }
3298 3302
3299 3303 if (res.status) {
3300 3304 e.error = geterrno4(res.status);
3301 3305 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3302 3306 if (!recov)
3303 3307 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3304 3308 &recov_state, needrecov);
3305 3309 return (e.error);
3306 3310 }
3307 3311
3308 3312 resop = &res.array[1]; /* write res */
3309 3313 wres = &resop->nfs_resop4_u.opwrite;
3310 3314
		/* A server must never claim to have written more than asked. */
3311 3315 if ((int)wres->count > tsize) {
3312 3316 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3313 3317
3314 3318 zcmn_err(getzoneid(), CE_WARN,
3315 3319 "nfs4write: server wrote %u, requested was %u",
3316 3320 (int)wres->count, tsize);
3317 3321 if (!recov)
3318 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3319 3323 &recov_state, needrecov);
3320 3324 return (EIO);
3321 3325 }
		/*
		 * Report UNSTABLE4 back to the caller; if we explicitly
		 * asked for sync stability, an unstable reply is a
		 * protocol violation and the write fails.
		 */
3322 3326 if (wres->committed == UNSTABLE4) {
3323 3327 *stab_comm = UNSTABLE4;
3324 3328 if (wargs->stable == DATA_SYNC4 ||
3325 3329 wargs->stable == FILE_SYNC4) {
3326 3330 (void) xdr_free(xdr_COMPOUND4res_clnt,
3327 3331 (caddr_t)&res);
3328 3332 zcmn_err(getzoneid(), CE_WARN,
3329 3333 "nfs4write: server %s did not commit "
3330 3334 "to stable storage",
3331 3335 rp->r_server->sv_hostname);
3332 3336 if (!recov)
3333 3337 nfs4_end_fop(VTOMI4(vp), vp, NULL,
3334 3338 OH_WRITE, &recov_state, needrecov);
3335 3339 return (EIO);
3336 3340 }
3337 3341 }
3338 3342
		/* Advance past the bytes the server accepted. */
3339 3343 tsize = (int)wres->count;
3340 3344 count -= tsize;
3341 3345 base += tsize;
3342 3346 offset += tsize;
3343 3347 if (mi->mi_io_kstats) {
3344 3348 mutex_enter(&mi->mi_lock);
3345 3349 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3346 3350 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3347 3351 tsize;
3348 3352 mutex_exit(&mi->mi_lock);
3349 3353 }
3350 3354 lwp_stat_update(LWP_STAT_OUBLK, 1);
		/*
		 * Track the server's write verifier.  If it changed since
		 * we last saw it, previously unstable data may have been
		 * lost; nfs4_set_mod re-marks pages modified so they are
		 * rewritten.
		 */
3351 3355 mutex_enter(&rp->r_statelock);
3352 3356 if (rp->r_flags & R4HAVEVERF) {
3353 3357 if (rp->r_writeverf != wres->writeverf) {
3354 3358 nfs4_set_mod(vp);
3355 3359 rp->r_writeverf = wres->writeverf;
3356 3360 }
3357 3361 } else {
3358 3362 rp->r_writeverf = wres->writeverf;
3359 3363 rp->r_flags |= R4HAVEVERF;
3360 3364 }
3361 3365 PURGE_ATTRCACHE4_LOCKED(rp);
3362 3366 rp->r_flags |= R4WRITEMODIFIED;
3363 3367 gethrestime(&rp->r_attr.va_mtime);
3364 3368 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3365 3369 mutex_exit(&rp->r_statelock);
3366 3370 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3367 3371 } while (count);
3368 3372
3369 3373 if (!recov)
3370 3374 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3371 3375 needrecov);
3372 3376
3373 3377 return (e.error);
3374 3378 }
3375 3379
3376 3380 /*
3377 3381 * Read from a file. Reads data in largest chunks our interface can handle.
 *
 * Reads up to 'count' bytes at 'offset' either into 'base' or, if
 * 'base' is NULL, directly into 'uiop'.  On return *residp holds the
 * bytes NOT read (nonzero on EOF before 'count' was satisfied).
 * 'async' marks readahead requests: they are tagged TAG_READAHEAD and
 * use the looser stateid-error retry policy described in the comment
 * inside the loop.  Returns 0 or an errno.
3378 3382 */
3379 3383 static int
3380 3384 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3381 3385 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3382 3386 {
3383 3387 mntinfo4_t *mi;
3384 3388 COMPOUND4args_clnt args;
3385 3389 COMPOUND4res_clnt res;
3386 3390 READ4args *rargs;
3387 3391 nfs_argop4 argop[2];
3388 3392 int tsize;
3389 3393 int doqueue;
3390 3394 rnode4_t *rp;
3391 3395 int data_len;
3392 3396 bool_t is_eof;
3393 3397 bool_t needrecov = FALSE;
3394 3398 nfs4_recov_state_t recov_state;
3395 3399 nfs4_stateid_types_t sid_types;
3396 3400 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3397 3401
3398 3402 rp = VTOR4(vp);
3399 3403 mi = VTOMI4(vp);
3400 3404 doqueue = 1;
3401 3405
3402 3406 ASSERT(nfs_zone() == mi->mi_zone);
3403 3407
3404 3408 args.ctag = async ? TAG_READAHEAD : TAG_READ;
3405 3409
3406 3410 args.array_len = 2;
3407 3411 args.array = argop;
3408 3412
3409 3413 nfs4_init_stateid_types(&sid_types);
3410 3414
3411 3415 recov_state.rs_flags = 0;
3412 3416 recov_state.rs_num_retry_despite_err = 0;
3413 3417
3414 3418 recov_retry:
3415 3419 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3416 3420 &recov_state, NULL);
3417 3421 if (e.error)
3418 3422 return (e.error);
3419 3423
3420 3424 /* putfh target fh */
3421 3425 argop[0].argop = OP_CPUTFH;
3422 3426 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3423 3427
3424 3428 /* read */
3425 3429 argop[1].argop = OP_READ;
3426 3430 rargs = &argop[1].nfs_argop4_u.opread;
3427 3431 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3428 3432 OP_READ, &sid_types, async);
3429 3433
	/* Loop until 'count' bytes are read, EOF, or an error occurs. */
3430 3434 do {
3431 3435 if (mi->mi_io_kstats) {
3432 3436 mutex_enter(&mi->mi_lock);
3433 3437 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3434 3438 mutex_exit(&mi->mi_lock);
3435 3439 }
3436 3440
3437 3441 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3438 3442 "nfs4read: %s call, rp %s",
3439 3443 needrecov ? "recov" : "first",
3440 3444 rnode4info(rp)));
3441 3445
		/* Direct I/O uses the server transfer size, not curread. */
3442 3446 if ((vp->v_flag & VNOCACHE) ||
3443 3447 (rp->r_flags & R4DIRECTIO) ||
3444 3448 (mi->mi_flags & MI4_DIRECTIO))
3445 3449 tsize = MIN(mi->mi_tsize, count);
3446 3450 else
3447 3451 tsize = MIN(mi->mi_curread, count);
3448 3452
3449 3453 rargs->offset = (offset4)offset;
3450 3454 rargs->count = (count4)tsize;
3451 3455 rargs->res_data_val_alt = NULL;
3452 3456 rargs->res_mblk = NULL;
3453 3457 rargs->res_uiop = NULL;
3454 3458 rargs->res_maxsize = 0;
3455 3459 rargs->wlist = NULL;
3456 3460
		/* Deliver reply data into the uio if given, else into base. */
3457 3461 if (uiop)
3458 3462 rargs->res_uiop = uiop;
3459 3463 else
3460 3464 rargs->res_data_val_alt = base;
3461 3465 rargs->res_maxsize = tsize;
3462 3466
3463 3467 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3464 3468 #ifdef DEBUG
3465 3469 if (nfs4read_error_inject) {
3466 3470 res.status = nfs4read_error_inject;
3467 3471 nfs4read_error_inject = 0;
3468 3472 }
3469 3473 #endif
3470 3474
3471 3475 if (mi->mi_io_kstats) {
3472 3476 mutex_enter(&mi->mi_lock);
3473 3477 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3474 3478 mutex_exit(&mi->mi_lock);
3475 3479 }
3476 3480
3477 3481 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3478 3482 if (e.error != 0 && !needrecov) {
3479 3483 nfs4_end_fop(mi, vp, NULL, OH_READ,
3480 3484 &recov_state, needrecov);
3481 3485 return (e.error);
3482 3486 }
3483 3487
3484 3488 /*
3485 3489 * Do proper retry for OLD and BAD stateid errors outside
3486 3490 * of the normal recovery framework. There are two differences
3487 3491 * between async and sync reads. The first is that we allow
3488 3492 * retry on BAD_STATEID for async reads, but not sync reads.
3489 3493 * The second is that we mark the file dead for a failed
3490 3494 * attempt with a special stateid for sync reads, but just
3491 3495 * return EIO for async reads.
3492 3496 *
3493 3497 * If a sync read receives a BAD stateid error while using a
3494 3498 * delegation stateid, retry using the open stateid (if it
3495 3499 * exists). If it doesn't have an open stateid, reopen the
3496 3500 * file first, then retry.
3497 3501 */
3498 3502 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3499 3503 res.status == NFS4ERR_BAD_STATEID) && async) {
3500 3504 nfs4_end_fop(mi, vp, NULL, OH_READ,
3501 3505 &recov_state, needrecov);
3502 3506 if (sid_types.cur_sid_type == SPEC_SID) {
3503 3507 (void) xdr_free(xdr_COMPOUND4res_clnt,
3504 3508 (caddr_t)&res);
3505 3509 return (EIO);
3506 3510 }
3507 3511 nfs4_save_stateid(&rargs->stateid, &sid_types);
3508 3512 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3509 3513 goto recov_retry;
3510 3514 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3511 3515 !async && sid_types.cur_sid_type != SPEC_SID) {
3512 3516 nfs4_save_stateid(&rargs->stateid, &sid_types);
3513 3517 nfs4_end_fop(mi, vp, NULL, OH_READ,
3514 3518 &recov_state, needrecov);
3515 3519 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3516 3520 goto recov_retry;
3517 3521 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3518 3522 sid_types.cur_sid_type == DEL_SID) {
			/*
			 * Delegation stateid went bad: flag the delegation
			 * for return, ensure an open stateid exists
			 * (reopening if needed), then retry.
			 */
3519 3523 nfs4_save_stateid(&rargs->stateid, &sid_types);
3520 3524 mutex_enter(&rp->r_statev4_lock);
3521 3525 rp->r_deleg_return_pending = TRUE;
3522 3526 mutex_exit(&rp->r_statev4_lock);
3523 3527 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3524 3528 nfs4_end_fop(mi, vp, NULL, OH_READ,
3525 3529 &recov_state, needrecov);
3526 3530 (void) xdr_free(xdr_COMPOUND4res_clnt,
3527 3531 (caddr_t)&res);
3528 3532 return (EIO);
3529 3533 }
3530 3534 nfs4_end_fop(mi, vp, NULL, OH_READ,
3531 3535 &recov_state, needrecov);
3532 3536 /* hold needed for nfs4delegreturn_thread */
3533 3537 VN_HOLD(vp);
3534 3538 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3535 3539 NFS4_DR_DISCARD), FALSE);
3536 3540 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3537 3541 goto recov_retry;
3538 3542 }
3539 3543 if (needrecov) {
3540 3544 bool_t abort;
3541 3545
3542 3546 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3543 3547 "nfs4read: initiating recovery\n"));
3544 3548 abort = nfs4_start_recovery(&e,
3545 3549 mi, vp, NULL, &rargs->stateid,
3546 3550 NULL, OP_READ, NULL, NULL, NULL);
3547 3551 nfs4_end_fop(mi, vp, NULL, OH_READ,
3548 3552 &recov_state, needrecov);
3549 3553 /*
3550 3554 * Do not retry if we got OLD_STATEID using a special
3551 3555 * stateid. This avoids looping with a broken server.
3552 3556 */
3553 3557 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3554 3558 sid_types.cur_sid_type == SPEC_SID)
3555 3559 abort = TRUE;
3556 3560
3557 3561 if (abort == FALSE) {
3558 3562 /*
3559 3563 * Need to retry all possible stateids in
3560 3564 * case the recovery error wasn't stateid
3561 3565 * related or the stateids have become
3562 3566 * stale (server reboot).
3563 3567 */
3564 3568 nfs4_init_stateid_types(&sid_types);
3565 3569 (void) xdr_free(xdr_COMPOUND4res_clnt,
3566 3570 (caddr_t)&res);
3567 3571 goto recov_retry;
3568 3572 }
3569 3573
3570 3574 if (!e.error) {
3571 3575 e.error = geterrno4(res.status);
3572 3576 (void) xdr_free(xdr_COMPOUND4res_clnt,
3573 3577 (caddr_t)&res);
3574 3578 }
3575 3579 return (e.error);
3576 3580 }
3577 3581
3578 3582 if (res.status) {
3579 3583 e.error = geterrno4(res.status);
3580 3584 nfs4_end_fop(mi, vp, NULL, OH_READ,
3581 3585 &recov_state, needrecov);
3582 3586 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3583 3587 return (e.error);
3584 3588 }
3585 3589
		/* Advance past the bytes the server returned. */
3586 3590 data_len = res.array[1].nfs_resop4_u.opread.data_len;
3587 3591 count -= data_len;
3588 3592 if (base)
3589 3593 base += data_len;
3590 3594 offset += data_len;
3591 3595 if (mi->mi_io_kstats) {
3592 3596 mutex_enter(&mi->mi_lock);
3593 3597 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3594 3598 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3595 3599 mutex_exit(&mi->mi_lock);
3596 3600 }
3597 3601 lwp_stat_update(LWP_STAT_INBLK, 1);
3598 3602 is_eof = res.array[1].nfs_resop4_u.opread.eof;
3599 3603 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3600 3604
3601 3605 } while (count && !is_eof);
3602 3606
3603 3607 *residp = count;
3604 3608
3605 3609 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3606 3610
3607 3611 return (e.error);
3608 3612 }
3609 3613
3610 3614 /* ARGSUSED */
3611 3615 static int
3612 3616 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3613 3617 caller_context_t *ct)
3614 3618 {
3615 3619 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3616 3620 return (EIO);
3617 3621 switch (cmd) {
3618 3622 case _FIODIRECTIO:
3619 3623 return (nfs4_directio(vp, (int)arg, cr));
3620 3624 default:
3621 3625 return (ENOTTY);
3622 3626 }
3623 3627 }
3624 3628
3625 3629 /* ARGSUSED */
3626 3630 int
3627 3631 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3628 3632 caller_context_t *ct)
3629 3633 {
3630 3634 int error;
3631 3635 rnode4_t *rp = VTOR4(vp);
3632 3636
3633 3637 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3634 3638 return (EIO);
3635 3639 /*
3636 3640 * If it has been specified that the return value will
3637 3641 * just be used as a hint, and we are only being asked
3638 3642 * for size, fsid or rdevid, then return the client's
3639 3643 * notion of these values without checking to make sure
3640 3644 * that the attribute cache is up to date.
3641 3645 * The whole point is to avoid an over the wire GETATTR
3642 3646 * call.
3643 3647 */
3644 3648 if (flags & ATTR_HINT) {
3645 3649 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3646 3650 mutex_enter(&rp->r_statelock);
3647 3651 if (vap->va_mask & AT_SIZE)
3648 3652 vap->va_size = rp->r_size;
3649 3653 if (vap->va_mask & AT_FSID)
3650 3654 vap->va_fsid = rp->r_attr.va_fsid;
3651 3655 if (vap->va_mask & AT_RDEV)
3652 3656 vap->va_rdev = rp->r_attr.va_rdev;
3653 3657 mutex_exit(&rp->r_statelock);
3654 3658 return (0);
3655 3659 }
3656 3660 }
3657 3661
3658 3662 /*
3659 3663 * Only need to flush pages if asking for the mtime
3660 3664 * and if there any dirty pages or any outstanding
3661 3665 * asynchronous (write) requests for this file.
3662 3666 */
3663 3667 if (vap->va_mask & AT_MTIME) {
3664 3668 rp = VTOR4(vp);
3665 3669 if (nfs4_has_pages(vp)) {
3666 3670 mutex_enter(&rp->r_statev4_lock);
3667 3671 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3668 3672 mutex_exit(&rp->r_statev4_lock);
3669 3673 if (rp->r_flags & R4DIRTY ||
3670 3674 rp->r_awcount > 0) {
3671 3675 mutex_enter(&rp->r_statelock);
3672 3676 rp->r_gcount++;
3673 3677 mutex_exit(&rp->r_statelock);
3674 3678 error =
3675 3679 nfs4_putpage(vp, (u_offset_t)0,
3676 3680 0, 0, cr, NULL);
3677 3681 mutex_enter(&rp->r_statelock);
3678 3682 if (error && (error == ENOSPC ||
3679 3683 error == EDQUOT)) {
3680 3684 if (!rp->r_error)
3681 3685 rp->r_error = error;
3682 3686 }
3683 3687 if (--rp->r_gcount == 0)
3684 3688 cv_broadcast(&rp->r_cv);
3685 3689 mutex_exit(&rp->r_statelock);
3686 3690 }
3687 3691 } else {
3688 3692 mutex_exit(&rp->r_statev4_lock);
3689 3693 }
3690 3694 }
3691 3695 }
3692 3696 return (nfs4getattr(vp, vap, cr));
3693 3697 }
3694 3698
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	/*
	 * Compare the mode reported by the server against the client's
	 * cached mode, disregarding the setuid/setgid bits on the client
	 * side.  Returns 0 (OK) when the only difference is that the
	 * server cleared S_ISUID/S_ISGID, 1 (BAD) otherwise.
	 */
	mode_t masked = on_client & ~(S_ISUID | S_ISGID);

	return ((masked == from_server) ? 0 : 1);
}
3709 3713
3710 3714 /*ARGSUSED4*/
3711 3715 static int
3712 3716 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3713 3717 caller_context_t *ct)
3714 3718 {
3715 3719 if (vap->va_mask & AT_NOSET)
3716 3720 return (EINVAL);
3717 3721
3718 3722 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3719 3723 return (EIO);
3720 3724
3721 3725 /*
3722 3726 * Don't call secpolicy_vnode_setattr, the client cannot
3723 3727 * use its cached attributes to make security decisions
3724 3728 * as the server may be faking mode bits or mapping uid/gid.
3725 3729 * Always just let the server to the checking.
3726 3730 * If we provide the ability to remove basic priviledges
3727 3731 * to setattr (e.g. basic without chmod) then we will
3728 3732 * need to add a check here before calling the server.
3729 3733 */
3730 3734
3731 3735 return (nfs4setattr(vp, vap, flags, cr, NULL));
3732 3736 }
3733 3737
/*
 * To replace the "guarded" version 3 setattr, we use two types of compound
 * setattr requests:
 * 1. The "normal" setattr, used when the size of the file isn't being
 *    changed - { Putfh <fh>; Setattr; Getattr }.
 * 2. If the size is changed, precede Setattr with:  Getattr; Verify
 *    with only ctime as the argument.  If the server ctime differs from
 *    what is cached on the client, the verify will fail, but we would
 *    already have the ctime from the preceding getattr, so just set it
 *    and retry.  Thus the compound here is - { Putfh <fh>; Getattr; Verify;
 *    Setattr; Getattr }.
 *
 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
 * this setattr and NULL if they are not.
 */
static int
nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    vsecattr_t *vsap)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs4_ga_res_t *garp = NULL;
	int numops = 3;			/* { Putfh; Setattr; Getattr } */
	nfs_argop4 argop[5];
	int verify_argop = -1;		/* index of Verify op, -1 if none */
	int setattr_argop = 1;		/* index of Setattr op */
	nfs_resop4 *resop;
	vattr_t va;
	rnode4_t *rp;
	int doqueue = 1;
	uint_t mask = vap->va_mask;
	mode_t omode;			/* mode before the setattr */
	vsecattr_t *vsp;
	timestruc_t ctime;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	stateid4 stateid;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	servinfo4_t *svp;
	bitmap4 supp_attrs;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	rp = VTOR4(vp);
	nfs4_init_stateid_types(&sid_types);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with R4DIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	if (mask & AT_SIZE) {
		/*
		 * Verification setattr compound for non-deleg AT_SIZE:
		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
		 * Set ctime local here (outside the do_again label)
		 * so that subsequent retries (after failed VERIFY)
		 * will use ctime from GETATTR results (from failed
		 * verify compound) as VERIFY arg.
		 * If file has delegation, then VERIFY(time_metadata)
		 * is of little added value, so don't bother.
		 */
		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
		    rp->r_deleg_return_pending) {
			numops = 5;
			ctime = rp->r_attr.va_ctime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	args.ctag = TAG_SETATTR;
do_again:
recov_retry:
	/* Setattr is always the third-from-last op in the compound. */
	setattr_argop = numops - 2;

	args.array = argop;
	args.array_len = numops;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error)
		return (e.error);


	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	if (numops == 5) {
		/*
		 * We only care about the ctime, but need to get mtime
		 * and size for proper cache update.
		 */
		/* getattr */
		argop[1].argop = OP_GETATTR;
		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

		/* verify - set later in loop */
		verify_argop = 2;
	}

	/* setattr */
	svp = rp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
	    supp_attrs, &e.error, &sid_types);
	/* remember the stateid used, in case of OLD_STATEID retry below */
	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
	if (e.error) {
		/* req time field(s) overflow - return immediately */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		return (e.error);
	}
	omode = rp->r_attr.va_mode;

	/* getattr */
	argop[numops-1].argop = OP_GETATTR;
	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	/*
	 * If we are setting the ACL (indicated only by vsap != NULL), request
	 * the ACL in this getattr.  The ACL returned from this getattr will be
	 * used in updating the ACL cache.
	 */
	if (vsap != NULL)
		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
		    FATTR4_ACL_MASK;
	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	/*
	 * setattr iterates if the object size is set and the cached ctime
	 * does not match the file ctime.  In that case, verify the ctime first.
	 */

	do {
		if (verify_argop != -1) {
			/*
			 * Verify that the ctime match before doing setattr.
			 */
			va.va_mask = AT_CTIME;
			va.va_ctime = ctime;
			svp = rp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);
			e.error = nfs4args_verify(&argop[verify_argop], &va,
			    OP_VERIFY, supp_attrs);
			if (e.error) {
				/* req time field(s) overflow - return */
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		doqueue = 1;

		t = gethrtime();

		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

		/*
		 * Purge the access cache and ACL cache if changing either the
		 * owner of the file, the group owner, or the mode.  These may
		 * change the access permissions of the file, so purge old
		 * information and start over again.
		 */
		if (mask & (AT_UID | AT_GID | AT_MODE)) {
			(void) nfs4_access_purge_rp(rp);
			if (rp->r_secattr != NULL) {
				mutex_enter(&rp->r_statelock);
				vsp = rp->r_secattr;
				rp->r_secattr = NULL;
				mutex_exit(&rp->r_statelock);
				if (vsp != NULL)
					nfs4_acl_free_cache(vsp);
			}
		}

		/*
		 * If res.array_len == numops, then everything succeeded,
		 * except for possibly the final getattr.  If only the
		 * last getattr failed, give up, and don't try recovery.
		 */
		if (res.array_len == numops) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			if (! e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL, NULL, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid.  This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock. See RFE
		 * 4777612.  Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which my cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check if verify failed to see if try again
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server. This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
			 */
			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		}
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			nfs4_purge_stale_fh(e.error, vp, cr);
		} else {
			/*
			 * retry with a new verify value
			 */
			ctime = garp->n4g_va.va_ctime;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			resp = NULL;
		}
		if (!e.error) {
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto do_again;
		}
	} while (!e.error);

	if (e.error) {
		/*
		 * If we are here, rfs4call has an irrecoverable error - return
		 */
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		if (verify_argop != -1) {
			nfs4args_verify_free(&argop[verify_argop]);
			verify_argop = -1;
		}
		if (resp)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
		return (e.error);
	}



	/*
	 * If changing the size of the file, invalidate
	 * any local cached data which is no longer part
	 * of the file.  We also possibly invalidate the
	 * last page in the file.  We could use
	 * pvn_vpzero(), but this would mark the page as
	 * modified and require it to be written back to
	 * the server for no particularly good reason.
	 * This way, if we access it, then we bring it
	 * back in.  A read should be cheaper than a
	 * write.
	 */
	if (mask & AT_SIZE) {
		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	garp = NULL;
	if (res.status == NFS4_OK) {
		/*
		 * Last getattr
		 */
		resop = &res.array[numops - 1];
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	/*
	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
	 * rather than filling it.  See the function itself for details.
	 */
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
	if (garp != NULL) {
		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
			vs_ace4_destroy(&garp->n4g_vsa);
		} else {
			if (vsap != NULL) {
				/*
				 * The ACL was supposed to be set and to be
				 * returned in the last getattr of this
				 * compound, but for some reason the getattr
				 * result doesn't contain the ACL.  In this
				 * case, purge the ACL cache.
				 */
				if (rp->r_secattr != NULL) {
					mutex_enter(&rp->r_statelock);
					vsp = rp->r_secattr;
					rp->r_secattr = NULL;
					mutex_exit(&rp->r_statelock);
					if (vsp != NULL)
						nfs4_acl_free_cache(vsp);
				}
			}
		}
	}

	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
		/*
		 * Set the size, rather than relying on getting it updated
		 * via a GETATTR.  With delegations the client tries to
		 * suppress GETATTR calls.
		 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid.  The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked the mode to be changed and what
				 * we just got from the server in getattr is
				 * not what we wanted it to be, so set it now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask the mode to be changed,
				 * Check to see that the server just cleared
				 * I_SUID and I_GUID from it. If not then
				 * set mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}
4242 4246
/*
 * VOP_ACCESS for NFSv4.  Translates the VREAD/VWRITE/VEXEC request into
 * NFSv4 ACCESS bits, consults the per-rnode access cache first, and only
 * goes over the wire (ACCESS, optionally followed by GETATTR when there
 * is no delegation) on a cache miss.  If access is denied and the
 * credential can be "net-adjusted" (setuid-root semantics), the check is
 * retried with the adjusted credential.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/* Map the requested vnode access mode to NFSv4 ACCESS bits. */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	/*
	 * Ask the server about all the bits relevant to this vnode type,
	 * not just the ones requested, so the answer can be cached for
	 * future checks.
	 */
	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take with r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4458 4462
4459 4463 /* ARGSUSED */
4460 4464 static int
4461 4465 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4462 4466 {
4463 4467 COMPOUND4args_clnt args;
4464 4468 COMPOUND4res_clnt res;
4465 4469 int doqueue;
4466 4470 rnode4_t *rp;
4467 4471 nfs_argop4 argop[3];
4468 4472 nfs_resop4 *resop;
4469 4473 READLINK4res *lr_res;
4470 4474 nfs4_ga_res_t *garp;
4471 4475 uint_t len;
4472 4476 char *linkdata;
4473 4477 bool_t needrecov = FALSE;
4474 4478 nfs4_recov_state_t recov_state;
4475 4479 hrtime_t t;
4476 4480 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4477 4481
4478 4482 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4479 4483 return (EIO);
4480 4484 /*
4481 4485 * Can't readlink anything other than a symbolic link.
4482 4486 */
4483 4487 if (vp->v_type != VLNK)
4484 4488 return (EINVAL);
4485 4489
4486 4490 rp = VTOR4(vp);
4487 4491 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4488 4492 e.error = nfs4_validate_caches(vp, cr);
4489 4493 if (e.error)
4490 4494 return (e.error);
4491 4495 mutex_enter(&rp->r_statelock);
4492 4496 if (rp->r_symlink.contents != NULL) {
4493 4497 e.error = uiomove(rp->r_symlink.contents,
4494 4498 rp->r_symlink.len, UIO_READ, uiop);
4495 4499 mutex_exit(&rp->r_statelock);
4496 4500 return (e.error);
4497 4501 }
4498 4502 mutex_exit(&rp->r_statelock);
4499 4503 }
4500 4504 recov_state.rs_flags = 0;
4501 4505 recov_state.rs_num_retry_despite_err = 0;
4502 4506
4503 4507 recov_retry:
4504 4508 args.array_len = 3;
4505 4509 args.array = argop;
4506 4510 args.ctag = TAG_READLINK;
4507 4511
4508 4512 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4509 4513 if (e.error) {
4510 4514 return (e.error);
4511 4515 }
4512 4516
4513 4517 /* 0. putfh symlink fh */
4514 4518 argop[0].argop = OP_CPUTFH;
4515 4519 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4516 4520
4517 4521 /* 1. readlink */
4518 4522 argop[1].argop = OP_READLINK;
4519 4523
4520 4524 /* 2. getattr */
4521 4525 argop[2].argop = OP_GETATTR;
4522 4526 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4523 4527 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4524 4528
4525 4529 doqueue = 1;
4526 4530
4527 4531 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4528 4532 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4529 4533 rnode4info(VTOR4(vp))));
4530 4534
4531 4535 t = gethrtime();
4532 4536
4533 4537 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4534 4538
4535 4539 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4536 4540 if (needrecov) {
4537 4541 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4538 4542 "nfs4_readlink: initiating recovery\n"));
4539 4543
4540 4544 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4541 4545 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4542 4546 if (!e.error)
4543 4547 (void) xdr_free(xdr_COMPOUND4res_clnt,
4544 4548 (caddr_t)&res);
4545 4549
4546 4550 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4547 4551 needrecov);
4548 4552 goto recov_retry;
4549 4553 }
4550 4554 }
4551 4555
4552 4556 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4553 4557
4554 4558 if (e.error)
4555 4559 return (e.error);
4556 4560
4557 4561 /*
4558 4562 * There is an path in the code below which calls
4559 4563 * nfs4_purge_stale_fh(), which may generate otw calls through
4560 4564 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4561 4565 * here to avoid nfs4_start_op() deadlock.
4562 4566 */
4563 4567
4564 4568 if (res.status && (res.array_len < args.array_len)) {
4565 4569 /*
4566 4570 * either Putfh or Link failed
4567 4571 */
4568 4572 e.error = geterrno4(res.status);
4569 4573 nfs4_purge_stale_fh(e.error, vp, cr);
4570 4574 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4571 4575 return (e.error);
4572 4576 }
4573 4577
4574 4578 resop = &res.array[1]; /* readlink res */
4575 4579 lr_res = &resop->nfs_resop4_u.opreadlink;
4576 4580
4577 4581 /*
4578 4582 * treat symlink names as data
4579 4583 */
4580 4584 linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4581 4585 if (linkdata != NULL) {
4582 4586 int uio_len = len - 1;
4583 4587 /* len includes null byte, which we won't uiomove */
4584 4588 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4585 4589 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4586 4590 mutex_enter(&rp->r_statelock);
4587 4591 if (rp->r_symlink.contents == NULL) {
4588 4592 rp->r_symlink.contents = linkdata;
4589 4593 rp->r_symlink.len = uio_len;
4590 4594 rp->r_symlink.size = len;
4591 4595 mutex_exit(&rp->r_statelock);
4592 4596 } else {
4593 4597 mutex_exit(&rp->r_statelock);
4594 4598 kmem_free(linkdata, len);
4595 4599 }
4596 4600 } else {
4597 4601 kmem_free(linkdata, len);
4598 4602 }
4599 4603 }
4600 4604 if (res.status == NFS4_OK) {
4601 4605 resop++; /* getattr res */
4602 4606 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4603 4607 }
4604 4608 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4605 4609
4606 4610 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4607 4611
4608 4612 /*
4609 4613 * The over the wire error for attempting to readlink something
4610 4614 * other than a symbolic link is ENXIO. However, we need to
4611 4615 * return EINVAL instead of ENXIO, so we map it here.
4612 4616 */
4613 4617 return (e.error == ENXIO ? EINVAL : e.error);
4614 4618 }
4615 4619
4616 4620 /*
4617 4621 * Flush local dirty pages to stable storage on the server.
4618 4622 *
4619 4623 * If FNODSYNC is specified, then there is nothing to do because
4620 4624 * metadata changes are not cached on the client before being
4621 4625 * sent to the server.
4622 4626 */
4623 4627 /* ARGSUSED */
4624 4628 static int
4625 4629 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4626 4630 {
4627 4631 int error;
4628 4632
4629 4633 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4630 4634 return (0);
4631 4635 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4632 4636 return (EIO);
4633 4637 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4634 4638 if (!error)
4635 4639 error = VTOR4(vp)->r_error;
4636 4640 return (error);
4637 4641 }
4638 4642
4639 4643 /*
4640 4644 * Weirdness: if the file was removed or the target of a rename
4641 4645 * operation while it was open, it got renamed instead. Here we
4642 4646 * remove the renamed file.
4643 4647 */
4644 4648 /* ARGSUSED */
4645 4649 void
4646 4650 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4647 4651 {
4648 4652 rnode4_t *rp;
4649 4653
4650 4654 ASSERT(vp != DNLC_NO_VNODE);
4651 4655
4652 4656 rp = VTOR4(vp);
4653 4657
4654 4658 if (IS_SHADOW(vp, rp)) {
4655 4659 sv_inactive(vp);
4656 4660 return;
4657 4661 }
4658 4662
4659 4663 /*
4660 4664 * If this is coming from the wrong zone, we let someone in the right
4661 4665 * zone take care of it asynchronously. We can get here due to
4662 4666 * VN_RELE() being called from pageout() or fsflush(). This call may
4663 4667 * potentially turn into an expensive no-op if, for instance, v_count
4664 4668 * gets incremented in the meantime, but it's still correct.
4665 4669 */
4666 4670 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4667 4671 nfs4_async_inactive(vp, cr);
4668 4672 return;
4669 4673 }
4670 4674
4671 4675 /*
4672 4676 * Some of the cleanup steps might require over-the-wire
4673 4677 * operations. Since VOP_INACTIVE can get called as a result of
4674 4678 * other over-the-wire operations (e.g., an attribute cache update
4675 4679 * can lead to a DNLC purge), doing those steps now would lead to a
4676 4680 * nested call to the recovery framework, which can deadlock. So
4677 4681 * do any over-the-wire cleanups asynchronously, in a separate
4678 4682 * thread.
4679 4683 */
4680 4684
4681 4685 mutex_enter(&rp->r_os_lock);
4682 4686 mutex_enter(&rp->r_statelock);
4683 4687 mutex_enter(&rp->r_statev4_lock);
4684 4688
4685 4689 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4686 4690 mutex_exit(&rp->r_statev4_lock);
4687 4691 mutex_exit(&rp->r_statelock);
4688 4692 mutex_exit(&rp->r_os_lock);
4689 4693 nfs4_async_inactive(vp, cr);
4690 4694 return;
4691 4695 }
4692 4696
4693 4697 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4694 4698 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4695 4699 mutex_exit(&rp->r_statev4_lock);
4696 4700 mutex_exit(&rp->r_statelock);
4697 4701 mutex_exit(&rp->r_os_lock);
4698 4702 nfs4_async_inactive(vp, cr);
4699 4703 return;
4700 4704 }
4701 4705
4702 4706 if (rp->r_unldvp != NULL) {
4703 4707 mutex_exit(&rp->r_statev4_lock);
4704 4708 mutex_exit(&rp->r_statelock);
4705 4709 mutex_exit(&rp->r_os_lock);
4706 4710 nfs4_async_inactive(vp, cr);
4707 4711 return;
4708 4712 }
4709 4713 mutex_exit(&rp->r_statev4_lock);
4710 4714 mutex_exit(&rp->r_statelock);
4711 4715 mutex_exit(&rp->r_os_lock);
4712 4716
4713 4717 rp4_addfree(rp, cr);
4714 4718 }
4715 4719
4716 4720 /*
4717 4721 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4718 4722 * various bits of state. The caller must not refer to vp after this call.
4719 4723 */
4720 4724
4721 4725 void
4722 4726 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4723 4727 {
4724 4728 rnode4_t *rp = VTOR4(vp);
4725 4729 nfs4_recov_state_t recov_state;
4726 4730 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4727 4731 vnode_t *unldvp;
4728 4732 char *unlname;
4729 4733 cred_t *unlcred;
4730 4734 COMPOUND4args_clnt args;
4731 4735 COMPOUND4res_clnt res, *resp;
4732 4736 nfs_argop4 argop[2];
4733 4737 int doqueue;
4734 4738 #ifdef DEBUG
4735 4739 char *name;
4736 4740 #endif
4737 4741
4738 4742 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4739 4743 ASSERT(!IS_SHADOW(vp, rp));
4740 4744
4741 4745 #ifdef DEBUG
4742 4746 name = fn_name(VTOSV(vp)->sv_name);
4743 4747 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4744 4748 "release vnode %s", name));
4745 4749 kmem_free(name, MAXNAMELEN);
4746 4750 #endif
4747 4751
4748 4752 if (vp->v_type == VREG) {
4749 4753 bool_t recov_failed = FALSE;
4750 4754
4751 4755 e.error = nfs4close_all(vp, cr);
4752 4756 if (e.error) {
4753 4757 /* Check to see if recovery failed */
4754 4758 mutex_enter(&(VTOMI4(vp)->mi_lock));
4755 4759 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4756 4760 recov_failed = TRUE;
4757 4761 mutex_exit(&(VTOMI4(vp)->mi_lock));
4758 4762 if (!recov_failed) {
4759 4763 mutex_enter(&rp->r_statelock);
4760 4764 if (rp->r_flags & R4RECOVERR)
4761 4765 recov_failed = TRUE;
4762 4766 mutex_exit(&rp->r_statelock);
4763 4767 }
4764 4768 if (recov_failed) {
4765 4769 NFS4_DEBUG(nfs4_client_recov_debug,
4766 4770 (CE_NOTE, "nfs4_inactive_otw: "
4767 4771 "close failed (recovery failure)"));
4768 4772 }
4769 4773 }
4770 4774 }
4771 4775
4772 4776 redo:
4773 4777 if (rp->r_unldvp == NULL) {
4774 4778 rp4_addfree(rp, cr);
4775 4779 return;
4776 4780 }
4777 4781
4778 4782 /*
4779 4783 * Save the vnode pointer for the directory where the
4780 4784 * unlinked-open file got renamed, then set it to NULL
4781 4785 * to prevent another thread from getting here before
4782 4786 * we're done with the remove. While we have the
4783 4787 * statelock, make local copies of the pertinent rnode
4784 4788 * fields. If we weren't to do this in an atomic way, the
4785 4789 * the unl* fields could become inconsistent with respect
4786 4790 * to each other due to a race condition between this
4787 4791 * code and nfs_remove(). See bug report 1034328.
4788 4792 */
4789 4793 mutex_enter(&rp->r_statelock);
4790 4794 if (rp->r_unldvp == NULL) {
4791 4795 mutex_exit(&rp->r_statelock);
4792 4796 rp4_addfree(rp, cr);
4793 4797 return;
4794 4798 }
4795 4799
4796 4800 unldvp = rp->r_unldvp;
4797 4801 rp->r_unldvp = NULL;
4798 4802 unlname = rp->r_unlname;
4799 4803 rp->r_unlname = NULL;
4800 4804 unlcred = rp->r_unlcred;
4801 4805 rp->r_unlcred = NULL;
4802 4806 mutex_exit(&rp->r_statelock);
4803 4807
4804 4808 /*
4805 4809 * If there are any dirty pages left, then flush
4806 4810 * them. This is unfortunate because they just
4807 4811 * may get thrown away during the remove operation,
4808 4812 * but we have to do this for correctness.
4809 4813 */
4810 4814 if (nfs4_has_pages(vp) &&
4811 4815 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4812 4816 ASSERT(vp->v_type != VCHR);
4813 4817 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4814 4818 if (e.error) {
4815 4819 mutex_enter(&rp->r_statelock);
4816 4820 if (!rp->r_error)
4817 4821 rp->r_error = e.error;
4818 4822 mutex_exit(&rp->r_statelock);
4819 4823 }
4820 4824 }
4821 4825
4822 4826 recov_state.rs_flags = 0;
4823 4827 recov_state.rs_num_retry_despite_err = 0;
4824 4828 recov_retry_remove:
4825 4829 /*
4826 4830 * Do the remove operation on the renamed file
4827 4831 */
4828 4832 args.ctag = TAG_INACTIVE;
4829 4833
4830 4834 /*
4831 4835 * Remove ops: putfh dir; remove
4832 4836 */
4833 4837 args.array_len = 2;
4834 4838 args.array = argop;
4835 4839
4836 4840 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4837 4841 if (e.error) {
4838 4842 kmem_free(unlname, MAXNAMELEN);
4839 4843 crfree(unlcred);
4840 4844 VN_RELE(unldvp);
4841 4845 /*
4842 4846 * Try again; this time around r_unldvp will be NULL, so we'll
4843 4847 * just call rp4_addfree() and return.
4844 4848 */
4845 4849 goto redo;
4846 4850 }
4847 4851
4848 4852 /* putfh directory */
4849 4853 argop[0].argop = OP_CPUTFH;
4850 4854 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4851 4855
4852 4856 /* remove */
4853 4857 argop[1].argop = OP_CREMOVE;
4854 4858 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4855 4859
4856 4860 doqueue = 1;
4857 4861 resp = &res;
4858 4862
4859 4863 #if 0 /* notyet */
4860 4864 /*
4861 4865 * Can't do this yet. We may be being called from
4862 4866 * dnlc_purge_XXX while that routine is holding a
4863 4867 * mutex lock to the nc_rele list. The calls to
4864 4868 * nfs3_cache_wcc_data may result in calls to
4865 4869 * dnlc_purge_XXX. This will result in a deadlock.
4866 4870 */
4867 4871 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4868 4872 if (e.error) {
4869 4873 PURGE_ATTRCACHE4(unldvp);
4870 4874 resp = NULL;
4871 4875 } else if (res.status) {
4872 4876 e.error = geterrno4(res.status);
4873 4877 PURGE_ATTRCACHE4(unldvp);
4874 4878 /*
4875 4879 * This code is inactive right now
4876 4880 * but if made active there should
4877 4881 * be a nfs4_end_op() call before
4878 4882 * nfs4_purge_stale_fh to avoid start_op()
4879 4883 * deadlock. See BugId: 4948726
4880 4884 */
4881 4885 nfs4_purge_stale_fh(error, unldvp, cr);
4882 4886 } else {
4883 4887 nfs_resop4 *resop;
4884 4888 REMOVE4res *rm_res;
4885 4889
4886 4890 resop = &res.array[1];
4887 4891 rm_res = &resop->nfs_resop4_u.opremove;
4888 4892 /*
4889 4893 * Update directory cache attribute,
4890 4894 * readdir and dnlc caches.
4891 4895 */
4892 4896 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4893 4897 }
4894 4898 #else
4895 4899 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4896 4900
4897 4901 PURGE_ATTRCACHE4(unldvp);
4898 4902 #endif
4899 4903
4900 4904 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4901 4905 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4902 4906 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4903 4907 if (!e.error)
4904 4908 (void) xdr_free(xdr_COMPOUND4res_clnt,
4905 4909 (caddr_t)&res);
4906 4910 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4907 4911 &recov_state, TRUE);
4908 4912 goto recov_retry_remove;
4909 4913 }
4910 4914 }
4911 4915 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4912 4916
4913 4917 /*
4914 4918 * Release stuff held for the remove
4915 4919 */
4916 4920 VN_RELE(unldvp);
4917 4921 if (!e.error && resp)
4918 4922 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4919 4923
4920 4924 kmem_free(unlname, MAXNAMELEN);
4921 4925 crfree(unlcred);
4922 4926 goto redo;
4923 4927 }
4924 4928
4925 4929 /*
4926 4930 * Remote file system operations having to do with directory manipulation.
4927 4931 */
4928 4932 /* ARGSUSED3 */
4929 4933 int
4930 4934 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4931 4935 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4932 4936 int *direntflags, pathname_t *realpnp)
4933 4937 {
4934 4938 int error;
4935 4939 vnode_t *vp, *avp = NULL;
4936 4940 rnode4_t *drp;
4937 4941
4938 4942 *vpp = NULL;
4939 4943 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4940 4944 return (EPERM);
4941 4945 /*
4942 4946 * if LOOKUP_XATTR, must replace dvp (object) with
4943 4947 * object's attrdir before continuing with lookup
4944 4948 */
4945 4949 if (flags & LOOKUP_XATTR) {
4946 4950 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4947 4951 if (error)
4948 4952 return (error);
4949 4953
4950 4954 dvp = avp;
4951 4955
4952 4956 /*
4953 4957 * If lookup is for "", just return dvp now. The attrdir
4954 4958 * has already been activated (from nfs4lookup_xattr), and
4955 4959 * the caller will RELE the original dvp -- not
4956 4960 * the attrdir. So, set vpp and return.
4957 4961 * Currently, when the LOOKUP_XATTR flag is
4958 4962 * passed to VOP_LOOKUP, the name is always empty, and
4959 4963 * shortcircuiting here avoids 3 unneeded lock/unlock
4960 4964 * pairs.
4961 4965 *
4962 4966 * If a non-empty name was provided, then it is the
4963 4967 * attribute name, and it will be looked up below.
4964 4968 */
4965 4969 if (*nm == '\0') {
4966 4970 *vpp = dvp;
4967 4971 return (0);
4968 4972 }
4969 4973
4970 4974 /*
4971 4975 * The vfs layer never sends a name when asking for the
4972 4976 * attrdir, so we should never get here (unless of course
4973 4977 * name is passed at some time in future -- at which time
4974 4978 * we'll blow up here).
4975 4979 */
4976 4980 ASSERT(0);
4977 4981 }
4978 4982
4979 4983 drp = VTOR4(dvp);
4980 4984 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4981 4985 return (EINTR);
4982 4986
4983 4987 error = nfs4lookup(dvp, nm, vpp, cr, 0);
4984 4988 nfs_rw_exit(&drp->r_rwlock);
4985 4989
4986 4990 /*
4987 4991 * If vnode is a device, create special vnode.
4988 4992 */
4989 4993 if (!error && ISVDEV((*vpp)->v_type)) {
4990 4994 vp = *vpp;
4991 4995 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
4992 4996 VN_RELE(vp);
4993 4997 }
4994 4998
4995 4999 return (error);
4996 5000 }
4997 5001
4998 5002 /* ARGSUSED */
4999 5003 static int
5000 5004 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5001 5005 {
5002 5006 int error;
5003 5007 rnode4_t *drp;
5004 5008 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5005 5009 mntinfo4_t *mi;
5006 5010
5007 5011 mi = VTOMI4(dvp);
5008 5012 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5009 5013 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5010 5014 return (EINVAL);
5011 5015
5012 5016 drp = VTOR4(dvp);
5013 5017 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5014 5018 return (EINTR);
5015 5019
5016 5020 mutex_enter(&drp->r_statelock);
5017 5021 /*
5018 5022 * If the server doesn't support xattrs just return EINVAL
5019 5023 */
5020 5024 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5021 5025 mutex_exit(&drp->r_statelock);
5022 5026 nfs_rw_exit(&drp->r_rwlock);
5023 5027 return (EINVAL);
5024 5028 }
5025 5029
5026 5030 /*
5027 5031 * If there is a cached xattr directory entry,
5028 5032 * use it as long as the attributes are valid. If the
5029 5033 * attributes are not valid, take the simple approach and
5030 5034 * free the cached value and re-fetch a new value.
5031 5035 *
5032 5036 * We don't negative entry cache for now, if we did we
5033 5037 * would need to check if the file has changed on every
5034 5038 * lookup. But xattrs don't exist very often and failing
5035 5039 * an openattr is not much more expensive than and NVERIFY or GETATTR
5036 5040 * so do an openattr over the wire for now.
5037 5041 */
5038 5042 if (drp->r_xattr_dir != NULL) {
5039 5043 if (ATTRCACHE4_VALID(dvp)) {
5040 5044 VN_HOLD(drp->r_xattr_dir);
5041 5045 *vpp = drp->r_xattr_dir;
5042 5046 mutex_exit(&drp->r_statelock);
5043 5047 nfs_rw_exit(&drp->r_rwlock);
5044 5048 return (0);
5045 5049 }
5046 5050 VN_RELE(drp->r_xattr_dir);
5047 5051 drp->r_xattr_dir = NULL;
5048 5052 }
5049 5053 mutex_exit(&drp->r_statelock);
5050 5054
5051 5055 error = nfs4openattr(dvp, vpp, cflag, cr);
5052 5056
5053 5057 nfs_rw_exit(&drp->r_rwlock);
5054 5058
5055 5059 return (error);
5056 5060 }
5057 5061
/*
 * Common lookup worker for nfs4_lookup(): resolve name nm in directory dvp,
 * returning a held vnode in *vpp. Handles the trivial "" and "." cases
 * locally, then consults the DNLC (unless skipdnlc is set) and falls back
 * to an over-the-wire lookup or revalidation as needed.
 *
 * Caller must hold drp->r_rwlock (as reader) and be in the mount's zone.
 * Returns 0 with *vpp held on success, or an errno with *vpp == NULL.
 */
static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp. Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp. Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/* Mark that a lookup has been done in this directory. */
	drp = VTOR4(dvp);
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC. If there is no entry
	 * lookup over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
		 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			/* Wait for any in-progress cache purge to finish. */
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * If after the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				if (*vpp == DNLC_NO_VNODE) {
					/* Valid cached negative entry. */
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * We may have gotten here we have one of the following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 *    need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 *    must validate.
	 *
	 * Go to the server and check if the directory has changed, if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}
5183 5187
5184 5188 /*
5185 5189 * Go to the server and check if the directory has changed, if
5186 5190 * it hasn't we are done and can use the dnlc entry. If it
5187 5191 * has changed we get a new copy of its attributes and check
5188 5192 * the access for VEXEC, then relookup the filename and
5189 5193 * get its filehandle and attributes.
5190 5194 *
5191 5195 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5192 5196 * if the NVERIFY failed we must
5193 5197 * purge the caches
5194 5198 * cache new attributes (will set r_time_attr_inval)
5195 5199 * cache new access
5196 5200 * recheck VEXEC access
5197 5201 * add name to dnlc, possibly negative
5198 5202 * if LOOKUP succeeded
5199 5203 * cache new attributes
5200 5204 * else
5201 5205 * set a new r_time_attr_inval for dvp
5202 5206 * check to make sure we have access
5203 5207 *
5204 5208 * The vpp returned is the vnode passed in if the directory is valid,
5205 5209 * a new vnode if successful lookup, or NULL on error.
5206 5210 */
5207 5211 static int
5208 5212 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5209 5213 {
5210 5214 COMPOUND4args_clnt args;
5211 5215 COMPOUND4res_clnt res;
5212 5216 fattr4 *ver_fattr;
5213 5217 fattr4_change dchange;
5214 5218 int32_t *ptr;
5215 5219 int argoplist_size = 7 * sizeof (nfs_argop4);
5216 5220 nfs_argop4 *argop;
5217 5221 int doqueue;
5218 5222 mntinfo4_t *mi;
5219 5223 nfs4_recov_state_t recov_state;
5220 5224 hrtime_t t;
5221 5225 int isdotdot;
5222 5226 vnode_t *nvp;
5223 5227 nfs_fh4 *fhp;
5224 5228 nfs4_sharedfh_t *sfhp;
5225 5229 nfs4_access_type_t cacc;
5226 5230 rnode4_t *nrp;
5227 5231 rnode4_t *drp = VTOR4(dvp);
5228 5232 nfs4_ga_res_t *garp = NULL;
5229 5233 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5230 5234
5231 5235 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5232 5236 ASSERT(nm != NULL);
5233 5237 ASSERT(nm[0] != '\0');
5234 5238 ASSERT(dvp->v_type == VDIR);
5235 5239 ASSERT(nm[0] != '.' || nm[1] != '\0');
5236 5240 ASSERT(*vpp != NULL);
5237 5241
5238 5242 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5239 5243 isdotdot = 1;
5240 5244 args.ctag = TAG_LOOKUP_VPARENT;
5241 5245 } else {
5242 5246 /*
5243 5247 * If dvp were a stub, it should have triggered and caused
5244 5248 * a mount for us to get this far.
5245 5249 */
5246 5250 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5247 5251
5248 5252 isdotdot = 0;
5249 5253 args.ctag = TAG_LOOKUP_VALID;
5250 5254 }
5251 5255
5252 5256 mi = VTOMI4(dvp);
5253 5257 recov_state.rs_flags = 0;
5254 5258 recov_state.rs_num_retry_despite_err = 0;
5255 5259
5256 5260 nvp = NULL;
5257 5261
5258 5262 /* Save the original mount point security information */
5259 5263 (void) save_mnt_secinfo(mi->mi_curr_serv);
5260 5264
5261 5265 recov_retry:
5262 5266 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5263 5267 &recov_state, NULL);
5264 5268 if (e.error) {
5265 5269 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5266 5270 VN_RELE(*vpp);
5267 5271 *vpp = NULL;
5268 5272 return (e.error);
5269 5273 }
5270 5274
5271 5275 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5272 5276
5273 5277 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5274 5278 args.array_len = 7;
5275 5279 args.array = argop;
5276 5280
5277 5281 /* 0. putfh file */
5278 5282 argop[0].argop = OP_CPUTFH;
5279 5283 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5280 5284
5281 5285 /* 1. nverify the change info */
5282 5286 argop[1].argop = OP_NVERIFY;
5283 5287 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5284 5288 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5285 5289 ver_fattr->attrlist4 = (char *)&dchange;
5286 5290 ptr = (int32_t *)&dchange;
5287 5291 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5288 5292 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5289 5293
5290 5294 /* 2. getattr directory */
5291 5295 argop[2].argop = OP_GETATTR;
5292 5296 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5293 5297 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5294 5298
5295 5299 /* 3. access directory */
5296 5300 argop[3].argop = OP_ACCESS;
5297 5301 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5298 5302 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5299 5303
5300 5304 /* 4. lookup name */
5301 5305 if (isdotdot) {
5302 5306 argop[4].argop = OP_LOOKUPP;
5303 5307 } else {
5304 5308 argop[4].argop = OP_CLOOKUP;
5305 5309 argop[4].nfs_argop4_u.opclookup.cname = nm;
5306 5310 }
5307 5311
5308 5312 /* 5. resulting file handle */
5309 5313 argop[5].argop = OP_GETFH;
5310 5314
5311 5315 /* 6. resulting file attributes */
5312 5316 argop[6].argop = OP_GETATTR;
5313 5317 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5314 5318 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5315 5319
5316 5320 doqueue = 1;
5317 5321 t = gethrtime();
5318 5322
5319 5323 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5320 5324
5321 5325 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5322 5326 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5323 5327 if (e.error != 0 && *vpp != NULL)
5324 5328 VN_RELE(*vpp);
5325 5329 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5326 5330 &recov_state, FALSE);
5327 5331 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5328 5332 kmem_free(argop, argoplist_size);
5329 5333 return (e.error);
5330 5334 }
5331 5335
5332 5336 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5333 5337 /*
5334 5338 * For WRONGSEC of a non-dotdot case, send secinfo directly
5335 5339 * from this thread, do not go thru the recovery thread since
5336 5340 * we need the nm information.
5337 5341 *
5338 5342 * Not doing dotdot case because there is no specification
5339 5343 * for (PUTFH, SECINFO "..") yet.
5340 5344 */
5341 5345 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5342 5346 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5343 5347 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5344 5348 &recov_state, FALSE);
5345 5349 else
5346 5350 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5347 5351 &recov_state, TRUE);
5348 5352 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5349 5353 kmem_free(argop, argoplist_size);
5350 5354 if (!e.error)
5351 5355 goto recov_retry;
5352 5356 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5353 5357 VN_RELE(*vpp);
5354 5358 *vpp = NULL;
5355 5359 return (e.error);
5356 5360 }
5357 5361
5358 5362 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5359 5363 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5360 5364 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5361 5365 &recov_state, TRUE);
5362 5366
5363 5367 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5364 5368 kmem_free(argop, argoplist_size);
5365 5369 goto recov_retry;
5366 5370 }
5367 5371 }
5368 5372
5369 5373 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5370 5374
5371 5375 if (e.error || res.array_len == 0) {
5372 5376 /*
5373 5377 * If e.error isn't set, then reply has no ops (or we couldn't
5374 5378 * be here). The only legal way to reply without an op array
5375 5379 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5376 5380 * be in the reply for all other status values.
5377 5381 *
5378 5382 * For valid replies without an ops array, return ENOTSUP
5379 5383 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5380 5384 * return EIO -- don't trust status.
5381 5385 */
5382 5386 if (e.error == 0)
5383 5387 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5384 5388 ENOTSUP : EIO;
5385 5389 VN_RELE(*vpp);
5386 5390 *vpp = NULL;
5387 5391 kmem_free(argop, argoplist_size);
5388 5392 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5389 5393 return (e.error);
5390 5394 }
5391 5395
5392 5396 if (res.status != NFS4ERR_SAME) {
5393 5397 e.error = geterrno4(res.status);
5394 5398
5395 5399 /*
5396 5400 * The NVERIFY "failed" so the directory has changed
5397 5401 * First make sure PUTFH succeeded and NVERIFY "failed"
5398 5402 * cleanly.
5399 5403 */
5400 5404 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5401 5405 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5402 5406 nfs4_purge_stale_fh(e.error, dvp, cr);
5403 5407 VN_RELE(*vpp);
5404 5408 *vpp = NULL;
5405 5409 goto exit;
5406 5410 }
5407 5411
5408 5412 /*
5409 5413 * We know the NVERIFY "failed" so we must:
5410 5414 * purge the caches (access and indirectly dnlc if needed)
5411 5415 */
5412 5416 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5413 5417
5414 5418 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5415 5419 nfs4_purge_stale_fh(e.error, dvp, cr);
5416 5420 VN_RELE(*vpp);
5417 5421 *vpp = NULL;
5418 5422 goto exit;
5419 5423 }
5420 5424
5421 5425 /*
5422 5426 * Install new cached attributes for the directory
5423 5427 */
5424 5428 nfs4_attr_cache(dvp,
5425 5429 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5426 5430 t, cr, FALSE, NULL);
5427 5431
5428 5432 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5429 5433 nfs4_purge_stale_fh(e.error, dvp, cr);
5430 5434 VN_RELE(*vpp);
5431 5435 *vpp = NULL;
5432 5436 e.error = geterrno4(res.status);
5433 5437 goto exit;
5434 5438 }
5435 5439
5436 5440 /*
5437 5441 * Now we know the directory is valid,
5438 5442 * cache new directory access
5439 5443 */
5440 5444 nfs4_access_cache(drp,
5441 5445 args.array[3].nfs_argop4_u.opaccess.access,
5442 5446 res.array[3].nfs_resop4_u.opaccess.access, cr);
5443 5447
5444 5448 /*
5445 5449 * recheck VEXEC access
5446 5450 */
5447 5451 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5448 5452 if (cacc != NFS4_ACCESS_ALLOWED) {
5449 5453 /*
5450 5454 * Directory permissions might have been revoked
5451 5455 */
5452 5456 if (cacc == NFS4_ACCESS_DENIED) {
5453 5457 e.error = EACCES;
5454 5458 VN_RELE(*vpp);
5455 5459 *vpp = NULL;
5456 5460 goto exit;
5457 5461 }
5458 5462
5459 5463 /*
5460 5464 * Somehow we must not have asked for enough
5461 5465 * so try a singleton ACCESS, should never happen.
5462 5466 */
5463 5467 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5464 5468 if (e.error) {
5465 5469 VN_RELE(*vpp);
5466 5470 *vpp = NULL;
5467 5471 goto exit;
5468 5472 }
5469 5473 }
5470 5474
5471 5475 e.error = geterrno4(res.status);
5472 5476 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5473 5477 /*
5474 5478 * The lookup failed, probably no entry
5475 5479 */
5476 5480 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5477 5481 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5478 5482 } else {
5479 5483 /*
5480 5484 * Might be some other error, so remove
5481 5485 * the dnlc entry to make sure we start all
5482 5486 * over again, next time.
5483 5487 */
5484 5488 dnlc_remove(dvp, nm);
5485 5489 }
5486 5490 VN_RELE(*vpp);
5487 5491 *vpp = NULL;
5488 5492 goto exit;
5489 5493 }
5490 5494
5491 5495 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5492 5496 /*
5493 5497 * The file exists but we can't get its fh for
5494 5498 * some unknown reason. Remove it from the dnlc
5495 5499 * and error out to be safe.
5496 5500 */
5497 5501 dnlc_remove(dvp, nm);
5498 5502 VN_RELE(*vpp);
5499 5503 *vpp = NULL;
5500 5504 goto exit;
5501 5505 }
5502 5506 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5503 5507 if (fhp->nfs_fh4_len == 0) {
5504 5508 /*
5505 5509 * The file exists but a bogus fh
5506 5510 * some unknown reason. Remove it from the dnlc
5507 5511 * and error out to be safe.
5508 5512 */
5509 5513 e.error = ENOENT;
5510 5514 dnlc_remove(dvp, nm);
5511 5515 VN_RELE(*vpp);
5512 5516 *vpp = NULL;
5513 5517 goto exit;
5514 5518 }
5515 5519 sfhp = sfh4_get(fhp, mi);
5516 5520
5517 5521 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5518 5522 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5519 5523
5520 5524 /*
5521 5525 * Make the new rnode
5522 5526 */
5523 5527 if (isdotdot) {
5524 5528 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5525 5529 if (e.error) {
5526 5530 sfh4_rele(&sfhp);
5527 5531 VN_RELE(*vpp);
5528 5532 *vpp = NULL;
5529 5533 goto exit;
5530 5534 }
5531 5535 /*
5532 5536 * XXX if nfs4_make_dotdot uses an existing rnode
5533 5537 * XXX it doesn't update the attributes.
5534 5538 * XXX for now just save them again to save an OTW
5535 5539 */
5536 5540 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5537 5541 } else {
5538 5542 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5539 5543 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5540 5544 /*
5541 5545 * If v_type == VNON, then garp was NULL because
5542 5546 * the last op in the compound failed and makenfs4node
5543 5547 * could not find the vnode for sfhp. It created
5544 5548 * a new vnode, so we have nothing to purge here.
5545 5549 */
5546 5550 if (nvp->v_type == VNON) {
5547 5551 vattr_t vattr;
5548 5552
5549 5553 vattr.va_mask = AT_TYPE;
5550 5554 /*
5551 5555 * N.B. We've already called nfs4_end_fop above.
5552 5556 */
5553 5557 e.error = nfs4getattr(nvp, &vattr, cr);
5554 5558 if (e.error) {
5555 5559 sfh4_rele(&sfhp);
5556 5560 VN_RELE(*vpp);
5557 5561 *vpp = NULL;
5558 5562 VN_RELE(nvp);
5559 5563 goto exit;
5560 5564 }
5561 5565 nvp->v_type = vattr.va_type;
5562 5566 }
5563 5567 }
5564 5568 sfh4_rele(&sfhp);
5565 5569
5566 5570 nrp = VTOR4(nvp);
5567 5571 mutex_enter(&nrp->r_statev4_lock);
5568 5572 if (!nrp->created_v4) {
5569 5573 mutex_exit(&nrp->r_statev4_lock);
5570 5574 dnlc_update(dvp, nm, nvp);
5571 5575 } else
5572 5576 mutex_exit(&nrp->r_statev4_lock);
5573 5577
5574 5578 VN_RELE(*vpp);
5575 5579 *vpp = nvp;
5576 5580 } else {
5577 5581 hrtime_t now;
5578 5582 hrtime_t delta = 0;
5579 5583
5580 5584 e.error = 0;
5581 5585
5582 5586 /*
5583 5587 * Because the NVERIFY "succeeded" we know that the
5584 5588 * directory attributes are still valid
5585 5589 * so update r_time_attr_inval
5586 5590 */
5587 5591 now = gethrtime();
5588 5592 mutex_enter(&drp->r_statelock);
5589 5593 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5590 5594 delta = now - drp->r_time_attr_saved;
5591 5595 if (delta < mi->mi_acdirmin)
5592 5596 delta = mi->mi_acdirmin;
5593 5597 else if (delta > mi->mi_acdirmax)
5594 5598 delta = mi->mi_acdirmax;
5595 5599 }
5596 5600 drp->r_time_attr_inval = now + delta;
5597 5601 mutex_exit(&drp->r_statelock);
5598 5602 dnlc_update(dvp, nm, *vpp);
5599 5603
5600 5604 /*
5601 5605 * Even though we have a valid directory attr cache
5602 5606 * and dnlc entry, we may not have access.
5603 5607 * This should almost always hit the cache.
5604 5608 */
5605 5609 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5606 5610 if (e.error) {
5607 5611 VN_RELE(*vpp);
5608 5612 *vpp = NULL;
5609 5613 }
5610 5614
5611 5615 if (*vpp == DNLC_NO_VNODE) {
5612 5616 VN_RELE(*vpp);
5613 5617 *vpp = NULL;
5614 5618 e.error = ENOENT;
5615 5619 }
5616 5620 }
5617 5621
5618 5622 exit:
5619 5623 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5620 5624 kmem_free(argop, argoplist_size);
5621 5625 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5622 5626 return (e.error);
5623 5627 }
5624 5628
/*
 * We need to go over the wire to lookup the name, but
 * while we are there verify the directory has not
 * changed but if it has, get new attributes and check access
 *
 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
 *	NVERIFY GETATTR ACCESS
 *
 * With the results:
 *	if the NVERIFY failed we must purge the caches, add new attributes,
 *	and cache new access.
 *	set a new r_time_attr_inval
 *	add name to dnlc, possibly negative
 *	if LOOKUP succeeded
 *	cache new attributes
 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size  = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	/* "." lookups are handled by the caller, never sent over the wire */
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/*
	 * 6. nverify the change info: succeeds (NFS4ERR_SAME) only if the
	 * directory's change attribute still matches our cached r_change.
	 */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * A migrated filesystem: chase the referral instead of looking up
	 * locally.  Not done for ".." since a referral cannot be the parent.
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* On secinfo success, retry with the new flavor */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here). The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry.  Cache the
		 * negative result if negative caching is enabled.
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason. Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but the server returned a bogus
		 * (zero-length) fh for some unknown reason.
		 * Error out to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	sfhp = sfh4_get(fhp, mi);

	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		/*
		 * Install the fresh directory attributes returned by op 7.
		 */
		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough,
			 * so try a singleton ACCESS; should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			/*
			 * Scale the attr cache lifetime by the attribute
			 * age, clamped to [acdirmin, acdirmax].
			 */
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * Don't add an entry to the dnlc for a file created over this
	 * mount (created_v4); the OPEN path owns that entry.
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
6029 6033
#ifdef DEBUG
/*
 * Debug aid: log a one-line console note for each op in a lookup
 * compound request, tagged with the caller-supplied "where" string.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	uint_t idx, slen;
	zoneid_t zoneid = getzoneid();
	char *name;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (idx = 0; idx < argcnt; idx++) {
		nfs_argop4 *argp = &argbase[idx];

		switch (argp->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", idx);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", idx);
			break;
		case OP_CLOOKUP:
			/* cname is caller-owned; nothing to free */
			name = argp->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s",
			    idx, name);
			break;
		case OP_LOOKUP:
			/* utf8_to_str allocates; free after logging */
			name = utf8_to_str(&argp->nfs_argop4_u.oplookup.objname,
			    &slen, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s",
			    idx, name);
			kmem_free(name, slen);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", idx);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", idx);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", idx);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", idx);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", idx,
			    argp->argop);
			break;
		}
	}
}
#endif
6079 6083
6080 6084 /*
6081 6085 * nfs4lookup_setup - constructs a multi-lookup compound request.
6082 6086 *
6083 6087 * Given the path "nm1/nm2/.../nmn", the following compound requests
6084 6088 * may be created:
6085 6089 *
 * Note: Getfh is not needed because the filehandle attr is mandatory, but it
 * is faster, for now.
6088 6092 *
6089 6093 * l4_getattrs indicates the type of compound requested.
6090 6094 *
6091 6095 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6092 6096 *
6093 6097 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6094 6098 *
6095 6099 * total number of ops is n + 1.
6096 6100 *
6097 6101 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6098 6102 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6099 6103 * before the last component, and only get attributes
6100 6104 * for the last component. Note that the second-to-last
6101 6105 * pathname component is XATTR_RPATH, which does NOT go
6102 6106 * over-the-wire as a lookup.
6103 6107 *
6104 6108 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6105 6109 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6106 6110 *
6107 6111 * and total number of ops is n + 5.
6108 6112 *
6109 6113 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6110 6114 * attribute directory: create lookups plus an OPENATTR
6111 6115 * replacing the last lookup. Note that the last pathname
6112 6116 * component is XATTR_RPATH, which does NOT go over-the-wire
6113 6117 * as a lookup.
6114 6118 *
6115 6119 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6116 6120 * Openattr; Getfh; Getattr }
6117 6121 *
6118 6122 * and total number of ops is n + 5.
6119 6123 *
6120 6124 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6121 6125 * nodes too.
6122 6126 *
6123 6127 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6124 6128 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6125 6129 *
6126 6130 * and total number of ops is 3*n + 1.
6127 6131 *
6128 6132 * All cases: returns the index in the arg array of the final LOOKUP op, or
6129 6133 * -1 if no LOOKUPs were used.
6130 6134 */
int
nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
{
	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
	nfs_argop4 *argbase, *argop;
	int arglen, argcnt;
	int n = 1;	/* number of components */
	int nga = 1;	/* number of Getattr's in request */
	char c = '\0', *s, *p;
	int lookup_idx = -1;
	int argoplist_size;

	/* set lookuparg response result to 0 */
	lookupargp->resp->status = NFS4_OK;

	/* skip leading "/" or "." e.g. ".//./" if there is */
	for (; ; nm++) {
		if (*nm != '/' && *nm != '.')
			break;

		/* ".." is counted as 1 component */
		if (*nm == '.' && *(nm + 1) != '/')
			break;
	}

	/*
	 * Find n = number of components - nm must be null terminated
	 * Skip "." components.
	 */
	if (*nm != '\0')
		for (n = 1, s = nm; *s != '\0'; s++) {
			if ((*s == '/') && (*(s + 1) != '/') &&
			    (*(s + 1) != '\0') &&
			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
			    *(s + 2) == '\0')))
				n++;
		}
	else
		n = 0;

	/*
	 * nga is number of components that need Getfh+Getattr
	 */
	switch (l4_getattrs) {
	case LKP4_NO_ATTRIBUTES:
		nga = 0;
		break;
	case LKP4_ALL_ATTRIBUTES:
		nga = n;
		/*
		 * Always have at least 1 getfh, getattr pair
		 */
		if (nga == 0)
			nga++;
		break;
	case LKP4_LAST_ATTRDIR:
	case LKP4_LAST_NAMED_ATTR:
		nga = n+1;
		break;
	}

	/*
	 * If change to use the filehandle attr instead of getfh
	 * the following line can be deleted.
	 * (each attributed component costs a Getfh AND a Getattr op)
	 */
	nga *= 2;

	/*
	 * calculate number of ops in request as
	 * header + trailer + lookups + getattrs
	 */
	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;

	argoplist_size = arglen * sizeof (nfs_argop4);
	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
	lookupargp->argsp->array = argop;

	/* leave room for the caller's header ops; argcnt tracks ops built */
	argcnt = lookupargp->header_len;
	argop += argcnt;

	/*
	 * loop and create a lookup op and possibly getattr/getfh for
	 * each component. Skip "." components.
	 */
	for (s = nm; *s != '\0'; s = p) {
		/*
		 * Set up a pathname struct for each component if needed
		 */
		while (*s == '/')
			s++;
		if (*s == '\0')
			break;

		for (p = s; (*p != '/') && (*p != '\0'); p++)
			;
		/*
		 * Temporarily NUL-terminate the component in place; the
		 * saved character c is restored before the next iteration.
		 */
		c = *p;
		*p = '\0';

		if (s[0] == '.' && s[1] == '\0') {
			*p = c;
			continue;
		}
		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;

			/* openattr */
			argop->argop = OP_OPENATTR;
		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* openattr */
			argop->argop = OP_OPENATTR;
			argop++;
			argcnt++;

			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
			*p = c;
			continue;
		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
			/* lookupp */
			argop->argop = OP_LOOKUPP;
		} else {
			/* lookup */
			argop->argop = OP_LOOKUP;
			(void) str_to_utf8(s,
			    &argop->nfs_argop4_u.oplookup.objname);
		}
		/* remember the index of the most recent lookup-class op */
		lookup_idx = argcnt;
		argop++;
		argcnt++;

		*p = c;

		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
		}
	}

	/*
	 * Attributes were requested, but the loop above did not emit the
	 * final getfh/getattr pair (either not LKP4_ALL_ATTRIBUTES, or no
	 * lookup op was generated at all) -- append it here.
	 */
	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
		if (needgetfh) {
			/* stick in a post-lookup getfh */
			argop->argop = OP_GETFH;
			argcnt++;
			argop++;
		}
		/* post-lookup getattr */
		argop->argop = OP_GETATTR;
		argop->nfs_argop4_u.opgetattr.attr_request =
		    lookupargp->ga_bits;
		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
		argcnt++;
	}
	argcnt += lookupargp->trailer_len;	/* actual op count */
	lookupargp->argsp->array_len = argcnt;
	lookupargp->arglen = arglen;

#ifdef DEBUG
	if (nfs4_client_lookup_debug)
		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
#endif

	return (lookup_idx);
}
6331 6335
6332 6336 static int
6333 6337 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6334 6338 {
6335 6339 COMPOUND4args_clnt args;
6336 6340 COMPOUND4res_clnt res;
6337 6341 GETFH4res *gf_res = NULL;
6338 6342 nfs_argop4 argop[4];
6339 6343 nfs_resop4 *resop = NULL;
6340 6344 nfs4_sharedfh_t *sfhp;
6341 6345 hrtime_t t;
6342 6346 nfs4_error_t e;
6343 6347
6344 6348 rnode4_t *drp;
6345 6349 int doqueue = 1;
6346 6350 vnode_t *vp;
6347 6351 int needrecov = 0;
6348 6352 nfs4_recov_state_t recov_state;
6349 6353
6350 6354 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6351 6355
6352 6356 *avp = NULL;
6353 6357 recov_state.rs_flags = 0;
6354 6358 recov_state.rs_num_retry_despite_err = 0;
6355 6359
6356 6360 recov_retry:
6357 6361 /* COMPOUND: putfh, openattr, getfh, getattr */
6358 6362 args.array_len = 4;
6359 6363 args.array = argop;
6360 6364 args.ctag = TAG_OPENATTR;
6361 6365
6362 6366 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6363 6367 if (e.error)
6364 6368 return (e.error);
6365 6369
6366 6370 drp = VTOR4(dvp);
6367 6371
6368 6372 /* putfh */
6369 6373 argop[0].argop = OP_CPUTFH;
6370 6374 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6371 6375
6372 6376 /* openattr */
6373 6377 argop[1].argop = OP_OPENATTR;
6374 6378 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6375 6379
6376 6380 /* getfh */
6377 6381 argop[2].argop = OP_GETFH;
6378 6382
6379 6383 /* getattr */
6380 6384 argop[3].argop = OP_GETATTR;
6381 6385 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6382 6386 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6383 6387
6384 6388 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6385 6389 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6386 6390 rnode4info(drp)));
6387 6391
6388 6392 t = gethrtime();
6389 6393
6390 6394 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6391 6395
6392 6396 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6393 6397 if (needrecov) {
6394 6398 bool_t abort;
6395 6399
6396 6400 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6397 6401 "nfs4openattr: initiating recovery\n"));
6398 6402
6399 6403 abort = nfs4_start_recovery(&e,
6400 6404 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6401 6405 OP_OPENATTR, NULL, NULL, NULL);
6402 6406 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6403 6407 if (!e.error) {
6404 6408 e.error = geterrno4(res.status);
6405 6409 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6406 6410 }
6407 6411 if (abort == FALSE)
6408 6412 goto recov_retry;
6409 6413 return (e.error);
6410 6414 }
6411 6415
6412 6416 if (e.error) {
6413 6417 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6414 6418 return (e.error);
6415 6419 }
6416 6420
6417 6421 if (res.status) {
6418 6422 /*
6419 6423 * If OTW errro is NOTSUPP, then it should be
6420 6424 * translated to EINVAL. All Solaris file system
6421 6425 * implementations return EINVAL to the syscall layer
6422 6426 * when the attrdir cannot be created due to an
6423 6427 * implementation restriction or noxattr mount option.
6424 6428 */
6425 6429 if (res.status == NFS4ERR_NOTSUPP) {
6426 6430 mutex_enter(&drp->r_statelock);
6427 6431 if (drp->r_xattr_dir)
6428 6432 VN_RELE(drp->r_xattr_dir);
6429 6433 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6430 6434 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6431 6435 mutex_exit(&drp->r_statelock);
6432 6436
6433 6437 e.error = EINVAL;
6434 6438 } else {
6435 6439 e.error = geterrno4(res.status);
6436 6440 }
6437 6441
6438 6442 if (e.error) {
6439 6443 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6440 6444 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6441 6445 needrecov);
6442 6446 return (e.error);
6443 6447 }
6444 6448 }
6445 6449
6446 6450 resop = &res.array[0]; /* putfh res */
6447 6451 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6448 6452
6449 6453 resop = &res.array[1]; /* openattr res */
6450 6454 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6451 6455
6452 6456 resop = &res.array[2]; /* getfh res */
6453 6457 gf_res = &resop->nfs_resop4_u.opgetfh;
6454 6458 if (gf_res->object.nfs_fh4_len == 0) {
6455 6459 *avp = NULL;
6456 6460 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6457 6461 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6458 6462 return (ENOENT);
6459 6463 }
6460 6464
6461 6465 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6462 6466 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6463 6467 dvp->v_vfsp, t, cr, dvp,
6464 6468 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6465 6469 sfh4_rele(&sfhp);
6466 6470
6467 6471 if (e.error)
6468 6472 PURGE_ATTRCACHE4(vp);
6469 6473
6470 6474 mutex_enter(&vp->v_lock);
6471 6475 vp->v_flag |= V_XATTRDIR;
6472 6476 mutex_exit(&vp->v_lock);
6473 6477
6474 6478 *avp = vp;
6475 6479
6476 6480 mutex_enter(&drp->r_statelock);
6477 6481 if (drp->r_xattr_dir)
6478 6482 VN_RELE(drp->r_xattr_dir);
6479 6483 VN_HOLD(vp);
6480 6484 drp->r_xattr_dir = vp;
6481 6485
6482 6486 /*
6483 6487 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6484 6488 * NULL. xattrs could be created at any time, and we have no
6485 6489 * way to update pc4_xattr_exists in the base object if/when
6486 6490 * it happens.
6487 6491 */
6488 6492 drp->r_pathconf.pc4_xattr_valid = 0;
6489 6493
6490 6494 mutex_exit(&drp->r_statelock);
6491 6495
6492 6496 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6493 6497
6494 6498 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6495 6499
6496 6500 return (0);
6497 6501 }
6498 6502
6499 6503 /* ARGSUSED */
6500 6504 static int
6501 6505 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6502 6506 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
6503 6507 vsecattr_t *vsecp)
6504 6508 {
6505 6509 int error;
6506 6510 vnode_t *vp = NULL;
6507 6511 rnode4_t *rp;
6508 6512 struct vattr vattr;
6509 6513 rnode4_t *drp;
6510 6514 vnode_t *tempvp;
6511 6515 enum createmode4 createmode;
6512 6516 bool_t must_trunc = FALSE;
6513 6517 int truncating = 0;
6514 6518
6515 6519 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6516 6520 return (EPERM);
6517 6521 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6518 6522 return (EINVAL);
6519 6523 }
6520 6524
6521 6525 /* . and .. have special meaning in the protocol, reject them. */
6522 6526
6523 6527 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6524 6528 return (EISDIR);
6525 6529
6526 6530 drp = VTOR4(dvp);
6527 6531
6528 6532 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6529 6533 return (EINTR);
6530 6534
6531 6535 top:
6532 6536 /*
6533 6537 * We make a copy of the attributes because the caller does not
6534 6538 * expect us to change what va points to.
6535 6539 */
6536 6540 vattr = *va;
6537 6541
6538 6542 /*
6539 6543 * If the pathname is "", then dvp is the root vnode of
6540 6544 * a remote file mounted over a local directory.
6541 6545 * All that needs to be done is access
6542 6546 * checking and truncation. Note that we avoid doing
6543 6547 * open w/ create because the parent directory might
6544 6548 * be in pseudo-fs and the open would fail.
6545 6549 */
6546 6550 if (*nm == '\0') {
6547 6551 error = 0;
6548 6552 VN_HOLD(dvp);
6549 6553 vp = dvp;
6550 6554 must_trunc = TRUE;
6551 6555 } else {
6552 6556 /*
6553 6557 * We need to go over the wire, just to be sure whether the
6554 6558 * file exists or not. Using the DNLC can be dangerous in
6555 6559 * this case when making a decision regarding existence.
6556 6560 */
6557 6561 error = nfs4lookup(dvp, nm, &vp, cr, 1);
6558 6562 }
6559 6563
6560 6564 if (exclusive)
6561 6565 createmode = EXCLUSIVE4;
6562 6566 else
6563 6567 createmode = GUARDED4;
6564 6568
6565 6569 /*
6566 6570 * error would be set if the file does not exist on the
6567 6571 * server, so lets go create it.
6568 6572 */
6569 6573 if (error) {
6570 6574 goto create_otw;
6571 6575 }
6572 6576
6573 6577 /*
6574 6578 * File does exist on the server
6575 6579 */
6576 6580 if (exclusive == EXCL)
6577 6581 error = EEXIST;
6578 6582 else if (vp->v_type == VDIR && (mode & VWRITE))
6579 6583 error = EISDIR;
6580 6584 else {
6581 6585 /*
6582 6586 * If vnode is a device, create special vnode.
6583 6587 */
6584 6588 if (ISVDEV(vp->v_type)) {
6585 6589 tempvp = vp;
6586 6590 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6587 6591 VN_RELE(tempvp);
6588 6592 }
6589 6593 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
6590 6594 if ((vattr.va_mask & AT_SIZE) &&
6591 6595 vp->v_type == VREG) {
6592 6596 rp = VTOR4(vp);
6593 6597 /*
6594 6598 * Check here for large file handled
6595 6599 * by LF-unaware process (as
6596 6600 * ufs_create() does)
6597 6601 */
6598 6602 if (!(flags & FOFFMAX)) {
6599 6603 mutex_enter(&rp->r_statelock);
6600 6604 if (rp->r_size > MAXOFF32_T)
6601 6605 error = EOVERFLOW;
6602 6606 mutex_exit(&rp->r_statelock);
6603 6607 }
6604 6608
6605 6609 /* if error is set then we need to return */
6606 6610 if (error) {
6607 6611 nfs_rw_exit(&drp->r_rwlock);
6608 6612 VN_RELE(vp);
6609 6613 return (error);
6610 6614 }
6611 6615
6612 6616 if (must_trunc) {
6613 6617 vattr.va_mask = AT_SIZE;
6614 6618 error = nfs4setattr(vp, &vattr, 0, cr,
6615 6619 NULL);
6616 6620 } else {
6617 6621 /*
6618 6622 * we know we have a regular file that already
6619 6623 * exists and we may end up truncating the file
6620 6624 * as a result of the open_otw, so flush out
6621 6625 * any dirty pages for this file first.
6622 6626 */
6623 6627 if (nfs4_has_pages(vp) &&
6624 6628 ((rp->r_flags & R4DIRTY) ||
6625 6629 rp->r_count > 0 ||
6626 6630 rp->r_mapcnt > 0)) {
6627 6631 error = nfs4_putpage(vp,
6628 6632 (offset_t)0, 0, 0, cr, ct);
6629 6633 if (error && (error == ENOSPC ||
6630 6634 error == EDQUOT)) {
6631 6635 mutex_enter(
6632 6636 &rp->r_statelock);
6633 6637 if (!rp->r_error)
6634 6638 rp->r_error =
6635 6639 error;
6636 6640 mutex_exit(
6637 6641 &rp->r_statelock);
6638 6642 }
6639 6643 }
6640 6644 vattr.va_mask = (AT_SIZE |
6641 6645 AT_TYPE | AT_MODE);
6642 6646 vattr.va_type = VREG;
6643 6647 createmode = UNCHECKED4;
6644 6648 truncating = 1;
6645 6649 goto create_otw;
↓ open down ↓ |
6602 lines elided |
↑ open up ↑ |
6646 6650 }
6647 6651 }
6648 6652 }
6649 6653 }
6650 6654 nfs_rw_exit(&drp->r_rwlock);
6651 6655 if (error) {
6652 6656 VN_RELE(vp);
6653 6657 } else {
6654 6658 vnode_t *tvp;
6655 6659 rnode4_t *trp;
6656 - /*
6657 - * existing file got truncated, notify.
6658 - */
6659 6660 tvp = vp;
6660 6661 if (vp->v_type == VREG) {
6661 6662 trp = VTOR4(vp);
6662 6663 if (IS_SHADOW(vp, trp))
6663 6664 tvp = RTOV4(trp);
6664 6665 }
6665 - vnevent_create(tvp, ct);
6666 +
6667 + if (must_trunc) {
6668 + /*
6669 + * existing file got truncated, notify.
6670 + */
6671 + vnevent_create(tvp, ct);
6672 + }
6673 +
6666 6674 *vpp = vp;
6667 6675 }
6668 6676 return (error);
6669 6677
6670 6678 create_otw:
6671 6679 dnlc_remove(dvp, nm);
6672 6680
6673 6681 ASSERT(vattr.va_mask & AT_TYPE);
6674 6682
6675 6683 /*
6676 6684 * If not a regular file let nfs4mknod() handle it.
6677 6685 */
6678 6686 if (vattr.va_type != VREG) {
6679 6687 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6680 6688 nfs_rw_exit(&drp->r_rwlock);
6681 6689 return (error);
6682 6690 }
6683 6691
6684 6692 /*
6685 6693 * It _is_ a regular file.
6686 6694 */
6687 6695 ASSERT(vattr.va_mask & AT_MODE);
6688 6696 if (MANDMODE(vattr.va_mode)) {
6689 6697 nfs_rw_exit(&drp->r_rwlock);
6690 6698 return (EACCES);
6691 6699 }
6692 6700
6693 6701 /*
6694 6702 * If this happens to be a mknod of a regular file, then flags will
6695 6703 * have neither FREAD or FWRITE. However, we must set at least one
6696 6704 * for the call to nfs4open_otw. If it's open(O_CREAT) driving
6697 6705 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6698 6706 * set (based on openmode specified by app).
6699 6707 */
6700 6708 if ((flags & (FREAD|FWRITE)) == 0)
6701 6709 flags |= (FREAD|FWRITE);
6702 6710
6703 6711 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6704 6712
6705 6713 if (vp != NULL) {
6706 6714 /* if create was successful, throw away the file's pages */
6707 6715 if (!error && (vattr.va_mask & AT_SIZE))
6708 6716 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6709 6717 cr);
6710 6718 /* release the lookup hold */
6711 6719 VN_RELE(vp);
6712 6720 vp = NULL;
6713 6721 }
6714 6722
6715 6723 /*
6716 6724 * validate that we opened a regular file. This handles a misbehaving
6717 6725 * server that returns an incorrect FH.
6718 6726 */
6719 6727 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6720 6728 error = EISDIR;
6721 6729 VN_RELE(*vpp);
6722 6730 }
6723 6731
6724 6732 /*
6725 6733 * If this is not an exclusive create, then the CREATE
6726 6734 * request will be made with the GUARDED mode set. This
6727 6735 * means that the server will return EEXIST if the file
6728 6736 * exists. The file could exist because of a retransmitted
6729 6737 * request. In this case, we recover by starting over and
6730 6738 * checking to see whether the file exists. This second
6731 6739 * time through it should and a CREATE request will not be
6732 6740 * sent.
6733 6741 *
6734 6742 * This handles the problem of a dangling CREATE request
6735 6743 * which contains attributes which indicate that the file
6736 6744 * should be truncated. This retransmitted request could
6737 6745 * possibly truncate valid data in the file if not caught
6738 6746 * by the duplicate request mechanism on the server or if
6739 6747 * not caught by other means. The scenario is:
6740 6748 *
6741 6749 * Client transmits CREATE request with size = 0
6742 6750 * Client times out, retransmits request.
6743 6751 * Response to the first request arrives from the server
6744 6752 * and the client proceeds on.
6745 6753 * Client writes data to the file.
6746 6754 * The server now processes retransmitted CREATE request
6747 6755 * and truncates file.
6748 6756 *
6749 6757 * The use of the GUARDED CREATE request prevents this from
6750 6758 * happening because the retransmitted CREATE would fail
6751 6759 * with EEXIST and would not truncate the file.
6752 6760 */
6753 6761 if (error == EEXIST && exclusive == NONEXCL) {
6754 6762 #ifdef DEBUG
6755 6763 nfs4_create_misses++;
6756 6764 #endif
6757 6765 goto top;
6758 6766 }
6759 6767 nfs_rw_exit(&drp->r_rwlock);
6760 6768 if (truncating && !error && *vpp) {
6761 6769 vnode_t *tvp;
6762 6770 rnode4_t *trp;
6763 6771 /*
6764 6772 * existing file got truncated, notify.
6765 6773 */
6766 6774 tvp = *vpp;
6767 6775 trp = VTOR4(tvp);
6768 6776 if (IS_SHADOW(tvp, trp))
6769 6777 tvp = RTOV4(trp);
6770 6778 vnevent_create(tvp, ct);
6771 6779 }
6772 6780 return (error);
6773 6781 }
6774 6782
6775 6783 /*
6776 6784  * Create compound (for mkdir, mknod, symlink):
6777 6785  * { Putfh <dfh>; Create; Getfh; Getattr }
6778 6786  * It's okay if setattr failed to set gid - this is not considered
6779 6787  * an error, but purge attrs in that case.
 *
 * dvp: parent directory; nm: new entry name; data: symlink target text
 * (NF4LNK) or specdata4 pointer (NF4BLK/NF4CHR); va: requested
 * attributes; *vpp: on success, the new object's vnode; type: the
 * nfs_ftype4 to create (NF4DIR/NF4LNK/NF4BLK/NF4CHR/NF4SOCK/NF4FIFO).
 * When the parent directory carries the setgid bit (or MI4_GRPID), a
 * longer 10-op compound with NVERIFY/SETATTR is built to propagate the
 * parent's group to the new object.
6780 6788  */
6781 6789 static int
6782 6790 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6783 6791     vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6784 6792 {
6785 6793 	int need_end_op = FALSE;
6786 6794 	COMPOUND4args_clnt args;
6787 6795 	COMPOUND4res_clnt res, *resp = NULL;
6788 6796 	nfs_argop4 *argop;
6789 6797 	nfs_resop4 *resop;
6790 6798 	int doqueue;
6791 6799 	mntinfo4_t *mi;
6792 6800 	rnode4_t *drp = VTOR4(dvp);
6793 6801 	change_info4 *cinfo;
6794 6802 	GETFH4res *gf_res;
6795 6803 	struct vattr vattr;
6796 6804 	vnode_t *vp;
6797 6805 	fattr4 *crattr;
6798 6806 	bool_t needrecov = FALSE;
6799 6807 	nfs4_recov_state_t recov_state;
6800 6808 	nfs4_sharedfh_t *sfhp = NULL;
6801 6809 	hrtime_t t;
6802 6810 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6803 6811 	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6804 6812 	dirattr_info_t dinfo, *dinfop;
6805 6813 	servinfo4_t *svp;
6806 6814 	bitmap4 supp_attrs;
6807 6815
6808 6816 	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6809 6817 	    type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6810 6818
6811 6819 	mi = VTOMI4(dvp);
6812 6820
6813 6821 	/*
6814 6822 	 * Make sure we properly deal with setting the right gid
6815 6823 	 * on a new directory to reflect the parent's setgid bit
6816 6824 	 */
6817 6825 	setgid_flag = 0;
6818 6826 	if (type == NF4DIR) {
6819 6827 		struct vattr dva;
6820 6828
6821 6829 		va->va_mode &= ~VSGID;
6822 6830 		dva.va_mask = AT_MODE | AT_GID;
6823 6831 		if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6824 6832
6825 6833 			/*
6826 6834 			 * If the parent's directory has the setgid bit set
6827 6835 			 * _and_ the client was able to get a valid mapping
6828 6836 			 * for the parent dir's owner_group, we want to
6829 6837 			 * append NVERIFY(owner_group == dva.va_gid) and
6830 6838 			 * SETATTR to the CREATE compound.
6831 6839 			 */
6832 6840 			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6833 6841 				setgid_flag = 1;
6834 6842 				va->va_mode |= VSGID;
6835 6843 				if (dva.va_gid != GID_NOBODY) {
6836 6844 					va->va_mask |= AT_GID;
6837 6845 					va->va_gid = dva.va_gid;
6838 6846 				}
6839 6847 			}
6840 6848 		}
6841 6849 	}
6842 6850
6843 6851 	/*
6844 6852 	 * Create ops:
6845 6853 	 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6846 6854 	 * 5:restorefh(dir) 6:getattr(dir)
6847 6855 	 *
6848 6856 	 * if (setgid)
6849 6857 	 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6850 6858 	 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6851 6859 	 * 8:nverify 9:setattr
6852 6860 	 */
6853 6861 	if (setgid_flag) {
6854 6862 		numops = 10;
6855 6863 		idx_create = 1;
6856 6864 		idx_fattr = 3;
6857 6865 	} else {
6858 6866 		numops = 7;
6859 6867 		idx_create = 2;
6860 6868 		idx_fattr = 4;
6861 6869 	}
6862 6870
6863 6871 	ASSERT(nfs_zone() == mi->mi_zone);
6864 6872 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6865 6873 		return (EINTR);
6866 6874 	}
6867 6875 	recov_state.rs_flags = 0;
6868 6876 	recov_state.rs_num_retry_despite_err = 0;
6869 6877
6870 6878 	argoplist_size = numops * sizeof (nfs_argop4);
6871 6879 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
6872 6880
6873 6881 recov_retry:
6874 6882 	if (type == NF4LNK)
6875 6883 		args.ctag = TAG_SYMLINK;
6876 6884 	else if (type == NF4DIR)
6877 6885 		args.ctag = TAG_MKDIR;
6878 6886 	else
6879 6887 		args.ctag = TAG_MKNOD;
6880 6888
6881 6889 	args.array_len = numops;
6882 6890 	args.array = argop;
6883 6891
6884 6892 	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6885 6893 		nfs_rw_exit(&drp->r_rwlock);
6886 6894 		kmem_free(argop, argoplist_size);
6887 6895 		return (e.error);
6888 6896 	}
6889 6897 	need_end_op = TRUE;
6890 6898
6891 6899
6892 6900 	/* 0: putfh directory */
6893 6901 	argop[0].argop = OP_CPUTFH;
6894 6902 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6895 6903
6896 6904 	/* 1/2: Create object */
6897 6905 	argop[idx_create].argop = OP_CCREATE;
6898 6906 	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6899 6907 	argop[idx_create].nfs_argop4_u.opccreate.type = type;
6900 6908 	if (type == NF4LNK) {
6901 6909 		/*
6902 6910 		 * symlink, treat name as data
6903 6911 		 */
6904 6912 		ASSERT(data != NULL);
6905 6913 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6906 6914 		    (char *)data;
6907 6915 	}
6908 6916 	if (type == NF4BLK || type == NF4CHR) {
6909 6917 		ASSERT(data != NULL);
6910 6918 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6911 6919 		    *((specdata4 *)data);
6912 6920 	}
6913 6921
6914 6922 	crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6915 6923
6916 6924 	svp = drp->r_server;
6917 6925 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6918 6926 	supp_attrs = svp->sv_supp_attrs;
6919 6927 	nfs_rw_exit(&svp->sv_lock);
6920 6928
6921 6929 	if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6922 6930 		nfs_rw_exit(&drp->r_rwlock);
6923 6931 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6924 6932 		e.error = EINVAL;
6925 6933 		kmem_free(argop, argoplist_size);
6926 6934 		return (e.error);
6927 6935 	}
6928 6936
6929 6937 	/* 2/3: getfh fh of created object */
6930 6938 	ASSERT(idx_create + 1 == idx_fattr - 1);
6931 6939 	argop[idx_create + 1].argop = OP_GETFH;
6932 6940
6933 6941 	/* 3/4: getattr of new object */
6934 6942 	argop[idx_fattr].argop = OP_GETATTR;
6935 6943 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6936 6944 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6937 6945
6938 6946 	if (setgid_flag) {
6939 6947 		vattr_t _v;
6940 6948
6941 6949 		argop[4].argop = OP_SAVEFH;
6942 6950
6943 6951 		argop[5].argop = OP_CPUTFH;
6944 6952 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6945 6953
6946 6954 		argop[6].argop = OP_GETATTR;
6947 6955 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6948 6956 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6949 6957
6950 6958 		argop[7].argop = OP_RESTOREFH;
6951 6959
6952 6960 		/*
6953 6961 		 * nverify
6954 6962 		 *
6955 6963 		 * XXX - Revisit the last argument to nfs4_end_op()
6956 6964 		 *	 once 5020486 is fixed.
6957 6965 		 */
6958 6966 		_v.va_mask = AT_GID;
6959 6967 		_v.va_gid = va->va_gid;
6960 6968 		if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
6961 6969 		    supp_attrs)) {
6962 6970 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6963 6971 			nfs_rw_exit(&drp->r_rwlock);
6964 6972 			nfs4_fattr4_free(crattr);
6965 6973 			kmem_free(argop, argoplist_size);
6966 6974 			return (e.error);
6967 6975 		}
6968 6976
6969 6977 		/*
6970 6978 		 * setattr
6971 6979 		 *
6972 6980 		 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
6973 6981 		 * so no need for stateid or flags. Also we specify NULL
6974 6982 		 * rp since we're only interested in setting owner_group
6975 6983 		 * attributes.
6976 6984 		 */
6977 6985 		nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
6978 6986 		    &e.error, 0);
6979 6987
6980 6988 		if (e.error) {
6981 6989 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6982 6990 			nfs_rw_exit(&drp->r_rwlock);
6983 6991 			nfs4_fattr4_free(crattr);
6984 6992 			nfs4args_verify_free(&argop[8]);
6985 6993 			kmem_free(argop, argoplist_size);
6986 6994 			return (e.error);
6987 6995 		}
6988 6996 	} else {
6989 6997 		argop[1].argop = OP_SAVEFH;
6990 6998
6991 6999 		argop[5].argop = OP_RESTOREFH;
6992 7000
6993 7001 		argop[6].argop = OP_GETATTR;
6994 7002 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6995 7003 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6996 7004 	}
6997 7005
6998 7006 	dnlc_remove(dvp, nm);
6999 7007
7000 7008 	doqueue = 1;
7001 7009 	t = gethrtime();
7002 7010 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7003 7011
7004 7012 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7005 7013 	if (e.error) {
7006 7014 		PURGE_ATTRCACHE4(dvp);
7007 7015 		if (!needrecov)
7008 7016 			goto out;
7009 7017 	}
7010 7018
7011 7019 	if (needrecov) {
7012 7020 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7013 7021 		    OP_CREATE, NULL, NULL, NULL) == FALSE) {
7014 7022 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7015 7023 			    needrecov);
7016 7024 			need_end_op = FALSE;
7017 7025 			nfs4_fattr4_free(crattr);
7018 7026 			if (setgid_flag) {
7019 7027 				nfs4args_verify_free(&argop[8]);
7020 7028 				nfs4args_setattr_free(&argop[9]);
7021 7029 			}
7022 7030 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7023 7031 			goto recov_retry;
7024 7032 		}
7025 7033 	}
7026 7034
7027 7035 	resp = &res;
7028 7036
7029 7037 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7030 7038
7031 7039 		if (res.status == NFS4ERR_BADOWNER)
7032 7040 			nfs4_log_badowner(mi, OP_CREATE);
7033 7041
7034 7042 		e.error = geterrno4(res.status);
7035 7043
7036 7044 		/*
7037 7045 		 * This check is left over from when create was implemented
7038 7046 		 * using a setattr op (instead of createattrs).  If the
7039 7047 		 * putfh/create/getfh failed, the error was returned.  If
7040 7048 		 * setattr/getattr failed, we keep going.
7041 7049 		 *
7042 7050 		 * It might be better to get rid of the GETFH also, and just
7043 7051 		 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7044 7052 		 * Then if any of the operations failed, we could return the
7045 7053 		 * error now, and remove much of the error code below.
7046 7054 		 */
7047 7055 		if (res.array_len <= idx_fattr) {
7048 7056 			/*
7049 7057 			 * Either Putfh, Create or Getfh failed.
7050 7058 			 */
7051 7059 			PURGE_ATTRCACHE4(dvp);
7052 7060 			/*
7053 7061 			 * nfs4_purge_stale_fh() may generate otw calls through
7054 7062 			 * nfs4_invalidate_pages. Hence the need to call
7055 7063 			 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7056 7064 			 */
7057 7065 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7058 7066 			    needrecov);
7059 7067 			need_end_op = FALSE;
7060 7068 			nfs4_purge_stale_fh(e.error, dvp, cr);
7061 7069 			goto out;
7062 7070 		}
7063 7071 	}
7064 7072
7065 7073 	resop = &res.array[idx_create];	/* create res */
7066 7074 	cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7067 7075
7068 7076 	resop = &res.array[idx_create + 1]; /* getfh res */
7069 7077 	gf_res = &resop->nfs_resop4_u.opgetfh;
7070 7078
7071 7079 	sfhp = sfh4_get(&gf_res->object, mi);
7072 7080 	if (e.error) {
7073 7081 		*vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7074 7082 		    fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7075 7083 		if (vp->v_type == VNON) {
7076 7084 			vattr.va_mask = AT_TYPE;
7077 7085 			/*
7078 7086 			 * Need to call nfs4_end_op before nfs4getattr to avoid
7079 7087 			 * potential nfs4_start_op deadlock. See RFE 4777612.
7080 7088 			 */
7081 7089 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7082 7090 			    needrecov);
7083 7091 			need_end_op = FALSE;
7084 7092 			e.error = nfs4getattr(vp, &vattr, cr);
7085 7093 			if (e.error) {
7086 7094 				VN_RELE(vp);
7087 7095 				*vpp = NULL;
7088 7096 				goto out;
7089 7097 			}
7090 7098 			vp->v_type = vattr.va_type;
7091 7099 		}
7092 7100 		e.error = 0;
7093 7101 	} else {
7094 7102 		*vpp = vp = makenfs4node(sfhp,
7095 7103 		    &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7096 7104 		    dvp->v_vfsp, t, cr,
7097 7105 		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7098 7106 	}
7099 7107
7100 7108 	/*
7101 7109 	 * If compound succeeded, then update dir attrs
7102 7110 	 */
7103 7111 	if (res.status == NFS4_OK) {
7104 7112 		dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7105 7113 		dinfo.di_cred = cr;
7106 7114 		dinfo.di_time_call = t;
7107 7115 		dinfop = &dinfo;
7108 7116 	} else
7109 7117 		dinfop = NULL;
7110 7118
7111 7119 	/* Update directory cache attribute, readdir and dnlc caches */
7112 7120 	nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7113 7121
7114 7122 out:
7115 7123 	if (sfhp != NULL)
7116 7124 		sfh4_rele(&sfhp);
7117 7125 	nfs_rw_exit(&drp->r_rwlock);
7118 7126 	nfs4_fattr4_free(crattr);
7119 7127 	if (setgid_flag) {
7120 7128 		nfs4args_verify_free(&argop[8]);
7121 7129 		nfs4args_setattr_free(&argop[9]);
7122 7130 	}
7123 7131 	if (resp)
7124 7132 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7125 7133 	if (need_end_op)
7126 7134 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7127 7135
7128 7136 	kmem_free(argop, argoplist_size);
7129 7137 	return (e.error);
7130 7138 }
7131 7139
7132 7140 /* ARGSUSED */
/*
 * Create a non-regular file (device, fifo or socket) named nm in dvp.
 * Translates va->va_type into the matching nfs_ftype4 (and specdata4
 * for VCHR/VBLK) and issues the create via call_nfs4_create_req().
 * Devices are returned to the caller wrapped in a special vnode via
 * specvp().  Returns EINVAL for unsupported vnode types.
 */
7133 7141 static int
7134 7142 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7135 7143     int mode, vnode_t **vpp, cred_t *cr)
7136 7144 {
7137 7145 	int error;
7138 7146 	vnode_t *vp;
7139 7147 	nfs_ftype4 type;
7140 7148 	specdata4 spec, *specp = NULL;
7141 7149
7142 7150 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7143 7151
7144 7152 	switch (va->va_type) {
7145 7153 	case VCHR:
7146 7154 	case VBLK:
7147 7155 		type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7148 7156 		spec.specdata1 = getmajor(va->va_rdev);
7149 7157 		spec.specdata2 = getminor(va->va_rdev);
7150 7158 		specp = &spec;
7151 7159 		break;
7152 7160
7153 7161 	case VFIFO:
7154 7162 		type = NF4FIFO;
7155 7163 		break;
7156 7164 	case VSOCK:
7157 7165 		type = NF4SOCK;
7158 7166 		break;
7159 7167
7160 7168 	default:
7161 7169 		return (EINVAL);
7162 7170 	}
7163 7171
7164 7172 	error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7165 7173 	if (error) {
7166 7174 		return (error);
7167 7175 	}
7168 7176
7169 7177 	/*
7170 7178 	 * This might not be needed any more; special case to deal
7171 7179 	 * with problematic v2/v3 servers.  Since create was unable
7172 7180 	 * to set group correctly, not sure what hope setattr has.
7173 7181 	 */
7174 7182 	if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7175 7183 		va->va_mask = AT_GID;
7176 7184 		(void) nfs4setattr(vp, va, 0, cr, NULL);
7177 7185 	}
7178 7186
7179 7187 	/*
7180 7188 	 * If vnode is a device create special vnode
7181 7189 	 */
7182 7190 	if (ISVDEV(vp->v_type)) {
7183 7191 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7184 7192 		VN_RELE(vp);
7185 7193 	} else {
7186 7194 		*vpp = vp;
7187 7195 	}
7188 7196 	return (error);
7189 7197 }
7190 7198
7191 7199 /*
7192 7200  * Remove requires that the current fh be the target directory.
7193 7201  * After the operation, the current fh is unchanged.
7194 7202  * The compound op structure is:
7195 7203  *      PUTFH(targetdir), REMOVE
 *
 * NOTE(review): the compound actually built below is three ops --
 * PUTFH(dir), REMOVE, GETATTR(dir) -- the trailing GETATTR refreshing
 * the directory's cached attributes.
7196 7204  *
7197 7205  * Weirdness: if the vnode to be removed is open
7198 7206  * we rename it instead of removing it and nfs_inactive
7199 7207  * will remove the new name.
7200 7208  */
7201 7209 /* ARGSUSED */
7202 7210 static int
7203 7211 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7204 7212 {
7205 7213 	COMPOUND4args_clnt args;
7206 7214 	COMPOUND4res_clnt res, *resp = NULL;
7207 7215 	REMOVE4res *rm_res;
7208 7216 	nfs_argop4 argop[3];
7209 7217 	nfs_resop4 *resop;
7210 7218 	vnode_t *vp;
7211 7219 	char *tmpname;
7212 7220 	int doqueue;
7213 7221 	mntinfo4_t *mi;
7214 7222 	rnode4_t *rp;
7215 7223 	rnode4_t *drp;
7216 7224 	int needrecov = 0;
7217 7225 	nfs4_recov_state_t recov_state;
7218 7226 	int isopen;
7219 7227 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7220 7228 	dirattr_info_t dinfo;
7221 7229
7222 7230 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7223 7231 		return (EPERM);
7224 7232 	drp = VTOR4(dvp);
7225 7233 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7226 7234 		return (EINTR);
7227 7235
7228 7236 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7229 7237 	if (e.error) {
7230 7238 		nfs_rw_exit(&drp->r_rwlock);
7231 7239 		return (e.error);
7232 7240 	}
7233 7241
7234 7242 	if (vp->v_type == VDIR) {
7235 7243 		VN_RELE(vp);
7236 7244 		nfs_rw_exit(&drp->r_rwlock);
7237 7245 		return (EISDIR);
7238 7246 	}
7239 7247
7240 7248 	/*
7241 7249 	 * First just remove the entry from the name cache, as it
7242 7250 	 * is most likely the only entry for this vp.
7243 7251 	 */
7244 7252 	dnlc_remove(dvp, nm);
7245 7253
7246 7254 	rp = VTOR4(vp);
7247 7255
7248 7256 	/*
7249 7257 	 * For regular file types, check to see if the file is open by looking
7250 7258 	 * at the open streams.
7251 7259 	 * For all other types, check the reference count on the vnode.  Since
7252 7260 	 * they are not opened OTW they never have an open stream.
7253 7261 	 *
7254 7262 	 * If the file is open, rename it to .nfsXXXX.
7255 7263 	 */
7256 7264 	if (vp->v_type != VREG) {
7257 7265 		/*
7258 7266 		 * If the file has a v_count > 1 then there may be more than one
7259 7267 		 * entry in the name cache due to multiple links or an open file,
7260 7268 		 * but we don't have the real reference count so flush all
7261 7269 		 * possible entries.
7262 7270 		 */
7263 7271 		if (vp->v_count > 1)
7264 7272 			dnlc_purge_vp(vp);
7265 7273
7266 7274 		/*
7267 7275 		 * Now we have the real reference count.
7268 7276 		 */
7269 7277 		isopen = vp->v_count > 1;
7270 7278 	} else {
7271 7279 		mutex_enter(&rp->r_os_lock);
7272 7280 		isopen = list_head(&rp->r_open_streams) != NULL;
7273 7281 		mutex_exit(&rp->r_os_lock);
7274 7282 	}
7275 7283
7276 7284 	mutex_enter(&rp->r_statelock);
7277 7285 	if (isopen &&
7278 7286 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7279 7287 		mutex_exit(&rp->r_statelock);
		/* Silly-rename: the file is open, rename to .nfsXXXX instead. */
7280 7288 		tmpname = newname();
7281 7289 		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7282 7290 		if (e.error)
7283 7291 			kmem_free(tmpname, MAXNAMELEN);
7284 7292 		else {
7285 7293 			mutex_enter(&rp->r_statelock);
7286 7294 			if (rp->r_unldvp == NULL) {
7287 7295 				VN_HOLD(dvp);
7288 7296 				rp->r_unldvp = dvp;
7289 7297 				if (rp->r_unlcred != NULL)
7290 7298 					crfree(rp->r_unlcred);
7291 7299 				crhold(cr);
7292 7300 				rp->r_unlcred = cr;
7293 7301 				rp->r_unlname = tmpname;
7294 7302 			} else {
7295 7303 				kmem_free(rp->r_unlname, MAXNAMELEN);
7296 7304 				rp->r_unlname = tmpname;
7297 7305 			}
7298 7306 			mutex_exit(&rp->r_statelock);
7299 7307 		}
7300 7308 		VN_RELE(vp);
7301 7309 		nfs_rw_exit(&drp->r_rwlock);
7302 7310 		return (e.error);
7303 7311 	}
7304 7312 	/*
7305 7313 	 * Actually remove the file/dir
7306 7314 	 */
7307 7315 	mutex_exit(&rp->r_statelock);
7308 7316
7309 7317 	/*
7310 7318 	 * We need to flush any dirty pages which happen to
7311 7319 	 * be hanging around before removing the file.
7312 7320 	 * This shouldn't happen very often since in NFSv4
7313 7321 	 * we should be close to open consistent.
7314 7322 	 */
7315 7323 	if (nfs4_has_pages(vp) &&
7316 7324 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7317 7325 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7318 7326 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7319 7327 			mutex_enter(&rp->r_statelock);
7320 7328 			if (!rp->r_error)
7321 7329 				rp->r_error = e.error;
7322 7330 			mutex_exit(&rp->r_statelock);
7323 7331 		}
7324 7332 	}
7325 7333
7326 7334 	mi = VTOMI4(dvp);
7327 7335
7328 7336 	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7329 7337 	recov_state.rs_flags = 0;
7330 7338 	recov_state.rs_num_retry_despite_err = 0;
7331 7339
7332 7340 recov_retry:
7333 7341 	/*
7334 7342 	 * Remove ops: putfh dir; remove
7335 7343 	 */
7336 7344 	args.ctag = TAG_REMOVE;
7337 7345 	args.array_len = 3;
7338 7346 	args.array = argop;
7339 7347
7340 7348 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7341 7349 	if (e.error) {
7342 7350 		nfs_rw_exit(&drp->r_rwlock);
7343 7351 		VN_RELE(vp);
7344 7352 		return (e.error);
7345 7353 	}
7346 7354
7347 7355 	/* putfh directory */
7348 7356 	argop[0].argop = OP_CPUTFH;
7349 7357 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7350 7358
7351 7359 	/* remove */
7352 7360 	argop[1].argop = OP_CREMOVE;
7353 7361 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7354 7362
7355 7363 	/* getattr dir */
7356 7364 	argop[2].argop = OP_GETATTR;
7357 7365 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7358 7366 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
7359 7367
7360 7368 	doqueue = 1;
7361 7369 	dinfo.di_time_call = gethrtime();
7362 7370 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7363 7371
7364 7372 	PURGE_ATTRCACHE4(vp);
7365 7373
7366 7374 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7367 7375 	if (e.error)
7368 7376 		PURGE_ATTRCACHE4(dvp);
7369 7377
7370 7378 	if (needrecov) {
7371 7379 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7372 7380 		    NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7373 7381 			if (!e.error)
7374 7382 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7375 7383 				    (caddr_t)&res);
7376 7384 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7377 7385 			    needrecov);
7378 7386 			goto recov_retry;
7379 7387 		}
7380 7388 	}
7381 7389
7382 7390 	/*
7383 7391 	 * Matching nfs4_end_op() for start_op() above.
7384 7392 	 * There is a path in the code below which calls
7385 7393 	 * nfs4_purge_stale_fh(), which may generate otw calls through
7386 7394 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7387 7395 	 * here to avoid nfs4_start_op() deadlock.
7388 7396 	 */
7389 7397 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7390 7398
7391 7399 	if (!e.error) {
7392 7400 		resp = &res;
7393 7401
7394 7402 		if (res.status) {
7395 7403 			e.error = geterrno4(res.status);
7396 7404 			PURGE_ATTRCACHE4(dvp);
7397 7405 			nfs4_purge_stale_fh(e.error, dvp, cr);
7398 7406 		} else {
7399 7407 			resop = &res.array[1];	/* remove res */
7400 7408 			rm_res = &resop->nfs_resop4_u.opremove;
7401 7409
7402 7410 			dinfo.di_garp =
7403 7411 			    &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7404 7412 			dinfo.di_cred = cr;
7405 7413
7406 7414 			/* Update directory attr, readdir and dnlc caches */
7407 7415 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7408 7416 			    &dinfo);
7409 7417 		}
7410 7418 	}
7411 7419 	nfs_rw_exit(&drp->r_rwlock);
7412 7420 	if (resp)
7413 7421 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7414 7422
7415 7423 	if (e.error == 0) {
7416 7424 		vnode_t *tvp;
7417 7425 		rnode4_t *trp;
7418 7426 		trp = VTOR4(vp);
7419 7427 		tvp = vp;
7420 7428 		if (IS_SHADOW(vp, trp))
7421 7429 			tvp = RTOV4(trp);
7422 7430 		vnevent_remove(tvp, dvp, nm, ct);
7423 7431 	}
7424 7432 	VN_RELE(vp);
7425 7433 	return (e.error);
7426 7434 }
7427 7435
7428 7436 /*
7429 7437 * Link requires that the current fh be the target directory and the
7430 7438 * saved fh be the source fh. After the operation, the current fh is unchanged.
7431 7439 * Thus the compound op structure is:
7432 7440 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
7433 7441 * GETATTR(file)
7434 7442 */
/* ARGSUSED */
/*
 * VOP_LINK for NFSv4: create hard link 'tnm' in directory 'tdvp' pointing
 * at source vnode 'svp'.  Builds and issues the 7-op compound described in
 * the comment above, retrying through the recovery framework as needed.
 * Returns 0 on success or an errno.  Serializes against other directory
 * operations by taking the target directory's r_rwlock as writer.
 */
static int
nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	LINK4res *ln_res;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	vnode_t *realvp, *nvp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *tdrp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo;

	ASSERT(*tnm != '\0');
	ASSERT(tdvp->v_type == VDIR);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));

	/* Cross-zone access to an NFS mount is not permitted. */
	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
		return (EPERM);
	/* If svp is a stacked vnode, operate on the underlying vnode. */
	if (VOP_REALVP(svp, &realvp, ct) == 0) {
		svp = realvp;
		ASSERT(nfs4_consistent_type(svp));
	}

	tdrp = VTOR4(tdvp);
	mi = VTOMI4(svp);

	/*
	 * MI4_LINK is cleared below if the server ever answers a LINK
	 * with EOPNOTSUPP, so subsequent attempts fail fast here.
	 */
	if (!(mi->mi_flags & MI4_LINK)) {
		return (EOPNOTSUPP);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
		return (EINTR);

recov_retry:
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_LINK;

	/*
	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
	 * restorefh; getattr(fl)
	 */
	args.array_len = 7;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		nfs_rw_exit(&tdrp->r_rwlock);
		return (e.error);
	}

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;

	/* 1. save current fh to free up the space for the dir */
	argop[1].argop = OP_SAVEFH;

	/* 2. putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;

	/* 3. link: current_fh is targetdir, saved_fh is source */
	argop[3].argop = OP_CLINK;
	argop[3].nfs_argop4_u.opclink.cnewname = tnm;

	/* 4. Get attributes of dir */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	/* 5. If link was successful, restore current vp to file */
	argop[5].argop = OP_RESTOREFH;

	/* 6. Get attributes of linked object */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = mi;

	/* Drop any stale name-cache entry before going over the wire. */
	dnlc_remove(tdvp, tnm);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
	if (e.error != 0 && !needrecov) {
		/* Hard failure: invalidate both caches and bail out. */
		PURGE_ATTRCACHE4(tdvp);
		PURGE_ATTRCACHE4(svp);
		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
		    NULL, NULL, OP_LINK, NULL, NULL, NULL);
		if (abort == FALSE) {
			/* Recovery was started; free and retry the call. */
			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
			    needrecov);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		} else {
			if (e.error != 0) {
				PURGE_ATTRCACHE4(tdvp);
				PURGE_ATTRCACHE4(svp);
				nfs4_end_op(VTOMI4(svp), svp, tdvp,
				    &recov_state, needrecov);
				goto out;
			}
			/* fall through for res.status case */
		}
	}

	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);

	/* From here on 'resp' being set means res must be xdr_free'd. */
	resp = &res;
	if (res.status) {
		/* If link succeeded, then don't return error */
		e.error = geterrno4(res.status);
		if (res.array_len <= 4) {
			/*
			 * Either Putfh, Savefh, Putfh dir, or Link failed
			 */
			PURGE_ATTRCACHE4(svp);
			PURGE_ATTRCACHE4(tdvp);
			if (e.error == EOPNOTSUPP) {
				/* Server has no LINK support; remember that. */
				mutex_enter(&mi->mi_lock);
				mi->mi_flags &= ~MI4_LINK;
				mutex_exit(&mi->mi_lock);
			}
			/* Remap EISDIR to EPERM for non-root user for SVVS */
			/* XXX-LP */
			if (e.error == EISDIR && crgetuid(cr) != 0)
				e.error = EPERM;
			goto out;
		}
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX - if LINK succeeded, but no attrs were returned for link
	 * file, purge its cache.
	 *
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	/*
	 * update cache with link file postattrs.
	 * Note: at this point resop points to link res.
	 */
	resop = &res.array[3];	/* link res */
	ln_res = &resop->nfs_resop4_u.oplink;
	if (res.status == NFS4_OK)
		e.error = nfs4_update_attrcache(res.status,
		    &res.array[6].nfs_resop4_u.opgetattr.ga_res,
		    t, svp, cr);

	/*
	 * Call makenfs4node to create the new shadow vp for tnm.
	 * We pass NULL attrs because we just cached attrs for
	 * the src object.  All we're trying to accomplish is to
	 * create the new shadow vnode.
	 */
	nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
	    tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));

	/* Update target cache attribute, readdir and dnlc caches */
	dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
	dinfo.di_time_call = t;
	dinfo.di_cred = cr;

	nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));
	ASSERT(nfs4_consistent_type(nvp));
	VN_RELE(nvp);

	if (!e.error) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the source file of this link operation.
		 */
		trp = VTOR4(svp);
		tvp = svp;
		if (IS_SHADOW(svp, trp))
			tvp = RTOV4(trp);
		vnevent_link(tvp, ct);
	}
out:
	kmem_free(argop, argoplist_size);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	nfs_rw_exit(&tdrp->r_rwlock);

	return (e.error);
}
7656 7664
7657 7665 /* ARGSUSED */
7658 7666 static int
7659 7667 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7660 7668 caller_context_t *ct, int flags)
7661 7669 {
7662 7670 vnode_t *realvp;
7663 7671
7664 7672 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7665 7673 return (EPERM);
7666 7674 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7667 7675 ndvp = realvp;
7668 7676
7669 7677 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7670 7678 }
7671 7679
7672 7680 /*
7673 7681 * nfs4rename does the real work of renaming in NFS Version 4.
7674 7682 *
7675 7683 * A file handle is considered volatile for renaming purposes if either
7676 7684 * of the volatile bits are turned on. However, the compound may differ
7677 7685 * based on the likelihood of the filehandle to change during rename.
7678 7686 */
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp = NULL;
	vnode_t *ovp = NULL;
	char *tmpname = NULL;
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;
	int do_link = 1;
	nfsstat4 stat = NFS4_OK;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Renaming "." or ".." is never legal. */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Take both directories' r_rwlocks as writer.  Always acquire
	 * them in ascending rnode-address order so that two concurrent
	 * renames between the same pair of directories cannot deadlock.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/* Regular file: active means it has open streams. */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible.  This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename.  Some servers can
			 * not Rename over an Open file, so they return
			 * this error.  The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr,
				    NULL, 0);
			}
			if (error == EOPNOTSUPP || !do_link) {
				/* No LINK support: rename target aside. */
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the unlinked-file bookkeeping so the
			 * temporary name gets removed on last close.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself?  The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave.  A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration.  The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi))
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	else
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);

	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		/*
		 * Server can't rename over an open file; undo the Link
		 * we made and fall back to the two-rename scheme.
		 */
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp.  After the nfs4_link
		 * call we call VN_RELE to match that hold.  We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);

		/* Undo the unlinked file naming stuff we just did */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlname points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		if (nvp) {
			VN_RELE(nvp);
		}
		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Notify the rename vnevents to source vnode, and to the target
	 * vnode if it already existed.
	 */
	if (error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the vnode.  Each link is represented by
		 * a different vnode, in nfsv4.
		 */
		if (nvp) {
			trp = VTOR4(nvp);
			tvp = nvp;
			if (IS_SHADOW(nvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest(tvp, ndvp, nnm, ct);
		}

		/*
		 * if the source and destination directory are not the
		 * same notify the destination directory.
		 */
		if (VTOR4(odvp) != VTOR4(ndvp)) {
			trp = VTOR4(ndvp);
			tvp = ndvp;
			if (IS_SHADOW(ndvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest_dir(tvp, ct);
		}

		trp = VTOR4(ovp);
		tvp = ovp;
		if (IS_SHADOW(ovp, trp))
			tvp = RTOV4(trp);
		vnevent_rename_src(tvp, odvp, onm, ct);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
8091 8099
8092 8100 /*
8093 8101 * When the parent directory has changed, sv_dfh must be updated
8094 8102 */
8095 8103 static void
8096 8104 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8097 8105 {
8098 8106 svnode_t *sv = VTOSV(vp);
8099 8107 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8100 8108 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8101 8109
8102 8110 sfh4_hold(new_dfh);
8103 8111 sv->sv_dfh = new_dfh;
8104 8112 sfh4_rele(&old_dfh);
8105 8113 }
8106 8114
8107 8115 /*
8108 8116 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8109 8117 * when it is known that the filehandle is persistent through rename.
8110 8118 *
8111 8119 * Rename requires that the current fh be the target directory and the
8112 8120 * saved fh be the source directory. After the operation, the current fh
8113 8121 * is unchanged.
8114 8122 * The compound op structure for persistent fh rename is:
 * PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME
8116 8124 * Rather than bother with the directory postop args, we'll simply
8117 8125 * update that a change occurred in the cache, so no post-op getattrs.
8118 8126 */
8119 8127 static int
8120 8128 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8121 8129 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8122 8130 {
8123 8131 COMPOUND4args_clnt args;
8124 8132 COMPOUND4res_clnt res, *resp = NULL;
8125 8133 nfs_argop4 *argop;
8126 8134 nfs_resop4 *resop;
8127 8135 int doqueue, argoplist_size;
8128 8136 mntinfo4_t *mi;
8129 8137 rnode4_t *odrp = VTOR4(odvp);
8130 8138 rnode4_t *ndrp = VTOR4(ndvp);
8131 8139 RENAME4res *rn_res;
8132 8140 bool_t needrecov;
8133 8141 nfs4_recov_state_t recov_state;
8134 8142 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8135 8143 dirattr_info_t dinfo, *dinfop;
8136 8144
8137 8145 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8138 8146
8139 8147 recov_state.rs_flags = 0;
8140 8148 recov_state.rs_num_retry_despite_err = 0;
8141 8149
8142 8150 /*
8143 8151 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8144 8152 *
8145 8153 * If source/target are different dirs, then append putfh(src); getattr
8146 8154 */
8147 8155 args.array_len = (odvp == ndvp) ? 5 : 7;
8148 8156 argoplist_size = args.array_len * sizeof (nfs_argop4);
8149 8157 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8150 8158
8151 8159 recov_retry:
8152 8160 *statp = NFS4_OK;
8153 8161
8154 8162 /* No need to Lookup the file, persistent fh */
8155 8163 args.ctag = TAG_RENAME;
8156 8164
8157 8165 mi = VTOMI4(odvp);
8158 8166 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8159 8167 if (e.error) {
8160 8168 kmem_free(argop, argoplist_size);
8161 8169 return (e.error);
8162 8170 }
8163 8171
8164 8172 /* 0: putfh source directory */
8165 8173 argop[0].argop = OP_CPUTFH;
8166 8174 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8167 8175
8168 8176 /* 1: Save source fh to free up current for target */
8169 8177 argop[1].argop = OP_SAVEFH;
8170 8178
8171 8179 /* 2: putfh targetdir */
8172 8180 argop[2].argop = OP_CPUTFH;
8173 8181 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8174 8182
8175 8183 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8176 8184 argop[3].argop = OP_CRENAME;
8177 8185 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8178 8186 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8179 8187
8180 8188 /* 4: getattr (targetdir) */
8181 8189 argop[4].argop = OP_GETATTR;
8182 8190 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8183 8191 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8184 8192
8185 8193 if (ndvp != odvp) {
8186 8194
8187 8195 /* 5: putfh (sourcedir) */
8188 8196 argop[5].argop = OP_CPUTFH;
8189 8197 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8190 8198
8191 8199 /* 6: getattr (sourcedir) */
8192 8200 argop[6].argop = OP_GETATTR;
8193 8201 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8194 8202 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8195 8203 }
8196 8204
8197 8205 dnlc_remove(odvp, onm);
8198 8206 dnlc_remove(ndvp, nnm);
8199 8207
8200 8208 doqueue = 1;
8201 8209 dinfo.di_time_call = gethrtime();
8202 8210 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8203 8211
8204 8212 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8205 8213 if (e.error) {
8206 8214 PURGE_ATTRCACHE4(odvp);
8207 8215 PURGE_ATTRCACHE4(ndvp);
8208 8216 } else {
8209 8217 *statp = res.status;
8210 8218 }
8211 8219
8212 8220 if (needrecov) {
8213 8221 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8214 8222 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8215 8223 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8216 8224 if (!e.error)
8217 8225 (void) xdr_free(xdr_COMPOUND4res_clnt,
8218 8226 (caddr_t)&res);
8219 8227 goto recov_retry;
8220 8228 }
8221 8229 }
8222 8230
8223 8231 if (!e.error) {
8224 8232 resp = &res;
8225 8233 /*
8226 8234 * as long as OP_RENAME
8227 8235 */
8228 8236 if (res.status != NFS4_OK && res.array_len <= 4) {
8229 8237 e.error = geterrno4(res.status);
8230 8238 PURGE_ATTRCACHE4(odvp);
8231 8239 PURGE_ATTRCACHE4(ndvp);
8232 8240 /*
8233 8241 * System V defines rename to return EEXIST, not
8234 8242 * ENOTEMPTY if the target directory is not empty.
8235 8243 * Over the wire, the error is NFSERR_ENOTEMPTY
8236 8244 * which geterrno4 maps to ENOTEMPTY.
8237 8245 */
8238 8246 if (e.error == ENOTEMPTY)
8239 8247 e.error = EEXIST;
8240 8248 } else {
8241 8249
8242 8250 resop = &res.array[3]; /* rename res */
8243 8251 rn_res = &resop->nfs_resop4_u.oprename;
8244 8252
8245 8253 if (res.status == NFS4_OK) {
8246 8254 /*
8247 8255 * Update target attribute, readdir and dnlc
8248 8256 * caches.
8249 8257 */
8250 8258 dinfo.di_garp =
8251 8259 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8252 8260 dinfo.di_cred = cr;
8253 8261 dinfop = &dinfo;
8254 8262 } else
8255 8263 dinfop = NULL;
8256 8264
8257 8265 nfs4_update_dircaches(&rn_res->target_cinfo,
8258 8266 ndvp, NULL, NULL, dinfop);
8259 8267
8260 8268 /*
8261 8269 * Update source attribute, readdir and dnlc caches
8262 8270 *
8263 8271 */
8264 8272 if (ndvp != odvp) {
8265 8273 update_parentdir_sfh(renvp, ndvp);
8266 8274
8267 8275 if (dinfop)
8268 8276 dinfo.di_garp =
8269 8277 &(res.array[6].nfs_resop4_u.
8270 8278 opgetattr.ga_res);
8271 8279
8272 8280 nfs4_update_dircaches(&rn_res->source_cinfo,
8273 8281 odvp, NULL, NULL, dinfop);
8274 8282 }
8275 8283
8276 8284 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8277 8285 nnm);
8278 8286 }
8279 8287 }
8280 8288
8281 8289 if (resp)
8282 8290 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8283 8291 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8284 8292 kmem_free(argop, argoplist_size);
8285 8293
8286 8294 return (e.error);
8287 8295 }
8288 8296
8289 8297 /*
8290 8298 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8291 8299 * it is possible for the filehandle to change due to the rename.
8292 8300 *
8293 8301 * The compound req in this case includes a post-rename lookup and getattr
8294 8302 * to ensure that we have the correct fh and attributes for the object.
8295 8303 *
8296 8304 * Rename requires that the current fh be the target directory and the
8297 8305 * saved fh be the source directory. After the operation, the current fh
8298 8306 * is unchanged.
8299 8307 *
8300 8308 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8301 8309 * update the filehandle for the renamed object. We also get the old
8302 8310 * filehandle for historical reasons; this should be taken out sometime.
8303 8311 * This results in a rather cumbersome compound...
8304 8312 *
 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
8306 8314 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8307 8315 *
8308 8316 */
8309 8317 static int
8310 8318 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8311 8319 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8312 8320 {
8313 8321 COMPOUND4args_clnt args;
8314 8322 COMPOUND4res_clnt res, *resp = NULL;
8315 8323 int argoplist_size;
8316 8324 nfs_argop4 *argop;
8317 8325 nfs_resop4 *resop;
8318 8326 int doqueue;
8319 8327 mntinfo4_t *mi;
8320 8328 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8321 8329 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8322 8330 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8323 8331 RENAME4res *rn_res;
8324 8332 GETFH4res *ngf_res;
8325 8333 bool_t needrecov;
8326 8334 nfs4_recov_state_t recov_state;
8327 8335 hrtime_t t;
8328 8336 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8329 8337 dirattr_info_t dinfo, *dinfop = &dinfo;
8330 8338
8331 8339 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8332 8340
8333 8341 recov_state.rs_flags = 0;
8334 8342 recov_state.rs_num_retry_despite_err = 0;
8335 8343
8336 8344 recov_retry:
8337 8345 *statp = NFS4_OK;
8338 8346
8339 8347 /*
8340 8348 * There is a window between the RPC and updating the path and
8341 8349 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8342 8350 * code, so that it doesn't try to use the old path during that
8343 8351 * window.
8344 8352 */
8345 8353 mutex_enter(&orp->r_statelock);
8346 8354 while (orp->r_flags & R4RECEXPFH) {
8347 8355 klwp_t *lwp = ttolwp(curthread);
8348 8356
8349 8357 if (lwp != NULL)
8350 8358 lwp->lwp_nostop++;
8351 8359 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8352 8360 mutex_exit(&orp->r_statelock);
8353 8361 if (lwp != NULL)
8354 8362 lwp->lwp_nostop--;
8355 8363 return (EINTR);
8356 8364 }
8357 8365 if (lwp != NULL)
8358 8366 lwp->lwp_nostop--;
8359 8367 }
8360 8368 orp->r_flags |= R4RECEXPFH;
8361 8369 mutex_exit(&orp->r_statelock);
8362 8370
8363 8371 mi = VTOMI4(odvp);
8364 8372
8365 8373 args.ctag = TAG_RENAME_VFH;
8366 8374 args.array_len = (odvp == ndvp) ? 10 : 12;
8367 8375 argoplist_size = args.array_len * sizeof (nfs_argop4);
8368 8376 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8369 8377
8370 8378 /*
8371 8379 * Rename ops:
8372 8380 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8373 8381 * PUTFH(targetdir), RENAME, GETATTR(targetdir)
8374 8382 * LOOKUP(trgt), GETFH(new), GETATTR,
8375 8383 *
8376 8384 * if (odvp != ndvp)
8377 8385 * add putfh(sourcedir), getattr(sourcedir) }
8378 8386 */
8379 8387 args.array = argop;
8380 8388
8381 8389 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8382 8390 &recov_state, NULL);
8383 8391 if (e.error) {
8384 8392 kmem_free(argop, argoplist_size);
8385 8393 mutex_enter(&orp->r_statelock);
8386 8394 orp->r_flags &= ~R4RECEXPFH;
8387 8395 cv_broadcast(&orp->r_cv);
8388 8396 mutex_exit(&orp->r_statelock);
8389 8397 return (e.error);
8390 8398 }
8391 8399
8392 8400 /* 0: putfh source directory */
8393 8401 argop[0].argop = OP_CPUTFH;
8394 8402 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8395 8403
8396 8404 /* 1: Save source fh to free up current for target */
8397 8405 argop[1].argop = OP_SAVEFH;
8398 8406
8399 8407 /* 2: Lookup pre-rename fh of renamed object */
8400 8408 argop[2].argop = OP_CLOOKUP;
8401 8409 argop[2].nfs_argop4_u.opclookup.cname = onm;
8402 8410
8403 8411 /* 3: getfh fh of renamed object (before rename) */
8404 8412 argop[3].argop = OP_GETFH;
8405 8413
8406 8414 /* 4: putfh targetdir */
8407 8415 argop[4].argop = OP_CPUTFH;
8408 8416 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8409 8417
8410 8418 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8411 8419 argop[5].argop = OP_CRENAME;
8412 8420 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8413 8421 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8414 8422
8415 8423 /* 6: getattr of target dir (post op attrs) */
8416 8424 argop[6].argop = OP_GETATTR;
8417 8425 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8418 8426 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8419 8427
8420 8428 /* 7: Lookup post-rename fh of renamed object */
8421 8429 argop[7].argop = OP_CLOOKUP;
8422 8430 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8423 8431
8424 8432 /* 8: getfh fh of renamed object (after rename) */
8425 8433 argop[8].argop = OP_GETFH;
8426 8434
8427 8435 /* 9: getattr of renamed object */
8428 8436 argop[9].argop = OP_GETATTR;
8429 8437 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8430 8438 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8431 8439
8432 8440 /*
8433 8441 * If source/target dirs are different, then get new post-op
8434 8442 * attrs for source dir also.
8435 8443 */
8436 8444 if (ndvp != odvp) {
8437 8445 /* 10: putfh (sourcedir) */
8438 8446 argop[10].argop = OP_CPUTFH;
8439 8447 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8440 8448
8441 8449 /* 11: getattr (sourcedir) */
8442 8450 argop[11].argop = OP_GETATTR;
8443 8451 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8444 8452 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8445 8453 }
8446 8454
8447 8455 dnlc_remove(odvp, onm);
8448 8456 dnlc_remove(ndvp, nnm);
8449 8457
8450 8458 doqueue = 1;
8451 8459 t = gethrtime();
8452 8460 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8453 8461
8454 8462 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8455 8463 if (e.error) {
8456 8464 PURGE_ATTRCACHE4(odvp);
8457 8465 PURGE_ATTRCACHE4(ndvp);
8458 8466 if (!needrecov) {
8459 8467 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8460 8468 &recov_state, needrecov);
8461 8469 goto out;
8462 8470 }
8463 8471 } else {
8464 8472 *statp = res.status;
8465 8473 }
8466 8474
8467 8475 if (needrecov) {
8468 8476 bool_t abort;
8469 8477
8470 8478 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8471 8479 OP_RENAME, NULL, NULL, NULL);
8472 8480 if (abort == FALSE) {
8473 8481 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8474 8482 &recov_state, needrecov);
8475 8483 kmem_free(argop, argoplist_size);
8476 8484 if (!e.error)
8477 8485 (void) xdr_free(xdr_COMPOUND4res_clnt,
8478 8486 (caddr_t)&res);
8479 8487 mutex_enter(&orp->r_statelock);
8480 8488 orp->r_flags &= ~R4RECEXPFH;
8481 8489 cv_broadcast(&orp->r_cv);
8482 8490 mutex_exit(&orp->r_statelock);
8483 8491 goto recov_retry;
8484 8492 } else {
8485 8493 if (e.error != 0) {
8486 8494 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8487 8495 &recov_state, needrecov);
8488 8496 goto out;
8489 8497 }
8490 8498 /* fall through for res.status case */
8491 8499 }
8492 8500 }
8493 8501
8494 8502 resp = &res;
8495 8503 /*
8496 8504 * If OP_RENAME (or any prev op) failed, then return an error.
8497 8505 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8498 8506 */
8499 8507 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8500 8508 /*
8501 8509 * Error in an op other than last Getattr
8502 8510 */
8503 8511 e.error = geterrno4(res.status);
8504 8512 PURGE_ATTRCACHE4(odvp);
8505 8513 PURGE_ATTRCACHE4(ndvp);
8506 8514 /*
8507 8515 * System V defines rename to return EEXIST, not
8508 8516 * ENOTEMPTY if the target directory is not empty.
8509 8517 * Over the wire, the error is NFSERR_ENOTEMPTY
8510 8518 * which geterrno4 maps to ENOTEMPTY.
8511 8519 */
8512 8520 if (e.error == ENOTEMPTY)
8513 8521 e.error = EEXIST;
8514 8522 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8515 8523 needrecov);
8516 8524 goto out;
8517 8525 }
8518 8526
8519 8527 /* rename results */
8520 8528 rn_res = &res.array[5].nfs_resop4_u.oprename;
8521 8529
8522 8530 if (res.status == NFS4_OK) {
8523 8531 /* Update target attribute, readdir and dnlc caches */
8524 8532 dinfo.di_garp =
8525 8533 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8526 8534 dinfo.di_cred = cr;
8527 8535 dinfo.di_time_call = t;
8528 8536 } else
8529 8537 dinfop = NULL;
8530 8538
8531 8539 /* Update source cache attribute, readdir and dnlc caches */
8532 8540 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8533 8541
8534 8542 /* Update source cache attribute, readdir and dnlc caches */
8535 8543 if (ndvp != odvp) {
8536 8544 update_parentdir_sfh(ovp, ndvp);
8537 8545
8538 8546 /*
8539 8547 * If dinfop is non-NULL, then compound succeded, so
8540 8548 * set di_garp to attrs for source dir. dinfop is only
8541 8549 * set to NULL when compound fails.
8542 8550 */
8543 8551 if (dinfop)
8544 8552 dinfo.di_garp =
8545 8553 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8546 8554 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8547 8555 dinfop);
8548 8556 }
8549 8557
8550 8558 /*
8551 8559 * Update the rnode with the new component name and args,
8552 8560 * and if the file handle changed, also update it with the new fh.
8553 8561 * This is only necessary if the target object has an rnode
8554 8562 * entry and there is no need to create one for it.
8555 8563 */
8556 8564 resop = &res.array[8]; /* getfh new res */
8557 8565 ngf_res = &resop->nfs_resop4_u.opgetfh;
8558 8566
8559 8567 /*
8560 8568 * Update the path and filehandle for the renamed object.
8561 8569 */
8562 8570 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8563 8571
8564 8572 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8565 8573
8566 8574 if (res.status == NFS4_OK) {
8567 8575 resop++; /* getattr res */
8568 8576 e.error = nfs4_update_attrcache(res.status,
8569 8577 &resop->nfs_resop4_u.opgetattr.ga_res,
8570 8578 t, ovp, cr);
8571 8579 }
8572 8580
8573 8581 out:
8574 8582 kmem_free(argop, argoplist_size);
8575 8583 if (resp)
8576 8584 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8577 8585 mutex_enter(&orp->r_statelock);
8578 8586 orp->r_flags &= ~R4RECEXPFH;
8579 8587 cv_broadcast(&orp->r_cv);
8580 8588 mutex_exit(&orp->r_statelock);
8581 8589
8582 8590 return (e.error);
8583 8591 }
8584 8592
8585 8593 /* ARGSUSED */
8586 8594 static int
8587 8595 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8588 8596 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8589 8597 {
8590 8598 int error;
8591 8599 vnode_t *vp;
8592 8600
8593 8601 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8594 8602 return (EPERM);
8595 8603 /*
8596 8604 * As ".." has special meaning and rather than send a mkdir
8597 8605 * over the wire to just let the server freak out, we just
8598 8606 * short circuit it here and return EEXIST
8599 8607 */
8600 8608 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8601 8609 return (EEXIST);
8602 8610
8603 8611 /*
8604 8612 * Decision to get the right gid and setgid bit of the
8605 8613 * new directory is now made in call_nfs4_create_req.
8606 8614 */
8607 8615 va->va_mask |= AT_MODE;
8608 8616 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8609 8617 if (error)
8610 8618 return (error);
8611 8619
8612 8620 *vpp = vp;
8613 8621 return (0);
8614 8622 }
8615 8623
8616 8624
8617 8625 /*
8618 8626 * rmdir is using the same remove v4 op as does remove.
8619 8627 * Remove requires that the current fh be the target directory.
8620 8628 * After the operation, the current fh is unchanged.
8621 8629 * The compound op structure is:
8622 8630 * PUTFH(targetdir), REMOVE
8623 8631 */
/*ARGSUSED4*/
static int
nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	/*
	 * need_end_op tracks whether nfs4_start_op() is currently
	 * outstanding and must be balanced by nfs4_end_op() on the
	 * way out.  Several early-exit paths end the op themselves
	 * and clear this flag.
	 */
	int need_end_op = FALSE;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * As ".." has special meaning and rather than send a rmdir
	 * over the wire to just let the server freak out, we just
	 * short circuit it here and return EEXIST
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	/* Serialize with other directory-modifying ops on dvp. */
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * Since nfsv4 remove op works on both files and directories,
	 * check that the removed object is indeed a directory.
	 */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (ENOTDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If there vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, trying removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = TAG_RMDIR;

	/*
	 * Rmdir ops: putfh dir; remove
	 */
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	need_end_op = TRUE;

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr (postop attrs for dir that contained removed dir) */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	/* Timestamp the call so cached attrs can be aged correctly. */
	dinfo.di_time_call = gethrtime();
	doqueue = 1;
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	/* The object's attrs are stale whether or not the remove worked. */
	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(dvp);
	}

	if (needrecov) {
		/* Recovery was started; retry the compound from scratch. */
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
		    NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return error if first 2 ops (OP_REMOVE or earlier)
		 * failed.
		 */
		if (res.status != NFS4_OK && res.array_len <= 2) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
			    &recov_state, needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			/*
			 * System V defines rmdir to return EEXIST, not
			 * ENOTEMPTY if the directory is not empty.  Over
			 * the wire, the error is NFSERR_ENOTEMPTY which
			 * geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			if (res.status == NFS4_OK) {
				resop = &res.array[2];	/* dir attrs */
				dinfo.di_garp =
				    &resop->nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				/* NULL dinfop tells the cache code that
				 * no fresh post-op attrs are available. */
				dinfop = NULL;

			/* Update dir attribute, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    dinfop);

			/* destroy rddir cache for dir that was removed */
			if (VTOR4(vp)->r_dir != NULL)
				nfs4_purge_rddir_cache(vp);
		}
	}

	if (need_end_op)
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	nfs_rw_exit(&drp->r_rwlock);

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Fire the rmdir vnevent on the "real" vnode (not a
		 * shadow), for file event monitors.
		 */
		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_rmdir(tvp, dvp, nm, ct);
	}

	VN_RELE(vp);

	return (e.error);
}
8828 8836
8829 8837 /* ARGSUSED */
8830 8838 static int
8831 8839 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8832 8840 caller_context_t *ct, int flags)
8833 8841 {
8834 8842 int error;
8835 8843 vnode_t *vp;
8836 8844 rnode4_t *rp;
8837 8845 char *contents;
8838 8846 mntinfo4_t *mi = VTOMI4(dvp);
8839 8847
8840 8848 if (nfs_zone() != mi->mi_zone)
8841 8849 return (EPERM);
8842 8850 if (!(mi->mi_flags & MI4_SYMLINK))
8843 8851 return (EOPNOTSUPP);
8844 8852
8845 8853 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8846 8854 if (error)
8847 8855 return (error);
8848 8856
8849 8857 ASSERT(nfs4_consistent_type(vp));
8850 8858 rp = VTOR4(vp);
8851 8859 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8852 8860
8853 8861 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8854 8862
8855 8863 if (contents != NULL) {
8856 8864 mutex_enter(&rp->r_statelock);
8857 8865 if (rp->r_symlink.contents == NULL) {
8858 8866 rp->r_symlink.len = strlen(tnm);
8859 8867 bcopy(tnm, contents, rp->r_symlink.len);
8860 8868 rp->r_symlink.contents = contents;
8861 8869 rp->r_symlink.size = MAXPATHLEN;
8862 8870 mutex_exit(&rp->r_statelock);
8863 8871 } else {
8864 8872 mutex_exit(&rp->r_statelock);
8865 8873 kmem_free((void *)contents, MAXPATHLEN);
8866 8874 }
8867 8875 }
8868 8876 }
8869 8877 VN_RELE(vp);
8870 8878
8871 8879 return (error);
8872 8880 }
8873 8881
8874 8882
8875 8883 /*
8876 8884 * Read directory entries.
8877 8885 * There are some weird things to look out for here. The uio_loffset
8878 8886 * field is either 0 or it is the offset returned from a previous
8879 8887 * readdir. It is an opaque value used by the server to find the
8880 8888 * correct directory block to read. The count field is the number
8881 8889 * of blocks to read on the server. This is advisory only, the server
8882 8890 * may return only one block's worth of entries. Entries may be compressed
8883 8891 * on the server.
8884 8892 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 * Note: r_statelock is held across the lookup and, unless
	 * an OTW readdir is needed, through the rest of the function.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		/* Drop the lock across the OTW readdir. */
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		/* Flag the entry to be refilled on the next request. */
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead.  Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.  Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead.  In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}
9079 9087
9080 9088 static int
9081 9089 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9082 9090 {
9083 9091 int error;
9084 9092 rnode4_t *rp;
9085 9093
9086 9094 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9087 9095
9088 9096 rp = VTOR4(vp);
9089 9097
9090 9098 /*
9091 9099 * Obtain the readdir results for the caller.
9092 9100 */
9093 9101 nfs4readdir(vp, rdc, cr);
9094 9102
9095 9103 mutex_enter(&rp->r_statelock);
9096 9104 /*
9097 9105 * The entry is now complete
9098 9106 */
9099 9107 rdc->flags &= ~RDDIR;
9100 9108
9101 9109 error = rdc->error;
9102 9110 if (error)
9103 9111 rdc->flags |= RDDIRREQ;
9104 9112 rddir4_cache_rele(rp, rdc);
9105 9113 mutex_exit(&rp->r_statelock);
9106 9114
9107 9115 return (error);
9108 9116 }
9109 9117
9110 9118 /*
9111 9119 * Read directory entries.
9112 9120 * There are some weird things to look out for here. The uio_loffset
9113 9121 * field is either 0 or it is the offset returned from a previous
9114 9122 * readdir. It is an opaque value used by the server to find the
9115 9123 * correct directory block to read. The count field is the number
9116 9124 * of blocks to read on the server. This is advisory only, the server
9117 9125 * may return only one block's worth of entries. Entries may be compressed
9118 9126 * on the server.
9119 9127 *
9120 9128 * Generates the following compound request:
9121 9129 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9122 9130 * must include a Lookupp as well. In this case, send:
9123 9131 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9124 9132 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9125 9133 *
9126 9134 * Get complete attributes and filehandles for entries if this is the
9127 9135 * first read of the directory. Otherwise, just get fileid's.
9128 9136 */
9129 9137 static void
9130 9138 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9131 9139 {
9132 9140 COMPOUND4args_clnt args;
9133 9141 COMPOUND4res_clnt res;
9134 9142 READDIR4args *rargs;
9135 9143 READDIR4res_clnt *rd_res;
9136 9144 bitmap4 rd_bitsval;
9137 9145 nfs_argop4 argop[5];
9138 9146 nfs_resop4 *resop;
9139 9147 rnode4_t *rp = VTOR4(vp);
9140 9148 mntinfo4_t *mi = VTOMI4(vp);
9141 9149 int doqueue;
9142 9150 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
9143 9151 vnode_t *dvp;
9144 9152 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9145 9153 int num_ops, res_opcnt;
9146 9154 bool_t needrecov = FALSE;
9147 9155 nfs4_recov_state_t recov_state;
9148 9156 hrtime_t t;
9149 9157 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9150 9158
9151 9159 ASSERT(nfs_zone() == mi->mi_zone);
9152 9160 ASSERT(rdc->flags & RDDIR);
9153 9161 ASSERT(rdc->entries == NULL);
9154 9162
9155 9163 /*
9156 9164 * If rp were a stub, it should have triggered and caused
9157 9165 * a mount for us to get this far.
9158 9166 */
9159 9167 ASSERT(!RP_ISSTUB(rp));
9160 9168
9161 9169 num_ops = 2;
9162 9170 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9163 9171 /*
9164 9172 * Since nfsv4 readdir may not return entries for "." and "..",
9165 9173 * the client must recreate them:
9166 9174 * To find the correct nodeid, do the following:
9167 9175 * For current node, get nodeid from dnlc.
9168 9176 * - if current node is rootvp, set pnodeid to nodeid.
9169 9177 * - else if parent is in the dnlc, get its nodeid from there.
9170 9178 * - else add LOOKUPP+GETATTR to compound.
9171 9179 */
9172 9180 nodeid = rp->r_attr.va_nodeid;
9173 9181 if (vp->v_flag & VROOT) {
9174 9182 pnodeid = nodeid; /* root of mount point */
9175 9183 } else {
9176 9184 dvp = dnlc_lookup(vp, "..");
9177 9185 if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9178 9186 /* parent in dnlc cache - no need for otw */
9179 9187 pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9180 9188 } else {
9181 9189 /*
9182 9190 * parent not in dnlc cache,
9183 9191 * do lookupp to get its id
9184 9192 */
9185 9193 num_ops = 5;
9186 9194 pnodeid = 0; /* set later by getattr parent */
9187 9195 }
9188 9196 if (dvp)
9189 9197 VN_RELE(dvp);
9190 9198 }
9191 9199 }
9192 9200 recov_state.rs_flags = 0;
9193 9201 recov_state.rs_num_retry_despite_err = 0;
9194 9202
9195 9203 /* Save the original mount point security flavor */
9196 9204 (void) save_mnt_secinfo(mi->mi_curr_serv);
9197 9205
9198 9206 recov_retry:
9199 9207 args.ctag = TAG_READDIR;
9200 9208
9201 9209 args.array = argop;
9202 9210 args.array_len = num_ops;
9203 9211
9204 9212 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9205 9213 &recov_state, NULL)) {
9206 9214 /*
9207 9215 * If readdir a node that is a stub for a crossed mount point,
9208 9216 * keep the original secinfo flavor for the current file
9209 9217 * system, not the crossed one.
9210 9218 */
9211 9219 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9212 9220 rdc->error = e.error;
9213 9221 return;
9214 9222 }
9215 9223
9216 9224 /*
9217 9225 * Determine which attrs to request for dirents. This code
9218 9226 * must be protected by nfs4_start/end_fop because of r_server
9219 9227 * (which will change during failover recovery).
9220 9228 *
9221 9229 */
9222 9230 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9223 9231 /*
9224 9232 * Get all vattr attrs plus filehandle and rdattr_error
9225 9233 */
9226 9234 rd_bitsval = NFS4_VATTR_MASK |
9227 9235 FATTR4_RDATTR_ERROR_MASK |
9228 9236 FATTR4_FILEHANDLE_MASK;
9229 9237
9230 9238 if (rp->r_flags & R4READDIRWATTR) {
9231 9239 mutex_enter(&rp->r_statelock);
9232 9240 rp->r_flags &= ~R4READDIRWATTR;
9233 9241 mutex_exit(&rp->r_statelock);
9234 9242 }
9235 9243 } else {
9236 9244 servinfo4_t *svp = rp->r_server;
9237 9245
9238 9246 /*
9239 9247 * Already read directory. Use readdir with
9240 9248 * no attrs (except for mounted_on_fileid) for updates.
9241 9249 */
9242 9250 rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9243 9251
9244 9252 /*
9245 9253 * request mounted on fileid if supported, else request
9246 9254 * fileid. maybe we should verify that fileid is supported
9247 9255 * and request something else if not.
9248 9256 */
9249 9257 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9250 9258 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9251 9259 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9252 9260 nfs_rw_exit(&svp->sv_lock);
9253 9261 }
9254 9262
9255 9263 /* putfh directory fh */
9256 9264 argop[0].argop = OP_CPUTFH;
9257 9265 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9258 9266
9259 9267 argop[1].argop = OP_READDIR;
9260 9268 rargs = &argop[1].nfs_argop4_u.opreaddir;
9261 9269 /*
9262 9270 * 1 and 2 are reserved for client "." and ".." entry offset.
9263 9271 * cookie 0 should be used over-the-wire to start reading at
9264 9272 * the beginning of the directory excluding "." and "..".
9265 9273 */
9266 9274 if (rdc->nfs4_cookie == 0 ||
9267 9275 rdc->nfs4_cookie == 1 ||
9268 9276 rdc->nfs4_cookie == 2) {
9269 9277 rargs->cookie = (nfs_cookie4)0;
9270 9278 rargs->cookieverf = 0;
9271 9279 } else {
9272 9280 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9273 9281 mutex_enter(&rp->r_statelock);
9274 9282 rargs->cookieverf = rp->r_cookieverf4;
9275 9283 mutex_exit(&rp->r_statelock);
9276 9284 }
9277 9285 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9278 9286 rargs->maxcount = mi->mi_tsize;
9279 9287 rargs->attr_request = rd_bitsval;
9280 9288 rargs->rdc = rdc;
9281 9289 rargs->dvp = vp;
9282 9290 rargs->mi = mi;
9283 9291 rargs->cr = cr;
9284 9292
9285 9293
9286 9294 /*
9287 9295 * If count < than the minimum required, we return no entries
9288 9296 * and fail with EINVAL
9289 9297 */
9290 9298 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9291 9299 rdc->error = EINVAL;
9292 9300 goto out;
9293 9301 }
9294 9302
9295 9303 if (args.array_len == 5) {
9296 9304 /*
9297 9305 * Add lookupp and getattr for parent nodeid.
9298 9306 */
9299 9307 argop[2].argop = OP_LOOKUPP;
9300 9308
9301 9309 argop[3].argop = OP_GETFH;
9302 9310
9303 9311 /* getattr parent */
9304 9312 argop[4].argop = OP_GETATTR;
9305 9313 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9306 9314 argop[4].nfs_argop4_u.opgetattr.mi = mi;
9307 9315 }
9308 9316
9309 9317 doqueue = 1;
9310 9318
9311 9319 if (mi->mi_io_kstats) {
9312 9320 mutex_enter(&mi->mi_lock);
9313 9321 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9314 9322 mutex_exit(&mi->mi_lock);
9315 9323 }
9316 9324
9317 9325 /* capture the time of this call */
9318 9326 rargs->t = t = gethrtime();
9319 9327
9320 9328 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9321 9329
9322 9330 if (mi->mi_io_kstats) {
9323 9331 mutex_enter(&mi->mi_lock);
9324 9332 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9325 9333 mutex_exit(&mi->mi_lock);
9326 9334 }
9327 9335
9328 9336 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9329 9337
9330 9338 /*
9331 9339 * If RPC error occurred and it isn't an error that
9332 9340 * triggers recovery, then go ahead and fail now.
9333 9341 */
9334 9342 if (e.error != 0 && !needrecov) {
9335 9343 rdc->error = e.error;
9336 9344 goto out;
9337 9345 }
9338 9346
9339 9347 if (needrecov) {
9340 9348 bool_t abort;
9341 9349
9342 9350 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9343 9351 "nfs4readdir: initiating recovery.\n"));
9344 9352
9345 9353 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9346 9354 NULL, OP_READDIR, NULL, NULL, NULL);
9347 9355 if (abort == FALSE) {
9348 9356 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9349 9357 &recov_state, needrecov);
9350 9358 if (!e.error)
9351 9359 (void) xdr_free(xdr_COMPOUND4res_clnt,
9352 9360 (caddr_t)&res);
9353 9361 if (rdc->entries != NULL) {
9354 9362 kmem_free(rdc->entries, rdc->entlen);
9355 9363 rdc->entries = NULL;
9356 9364 }
9357 9365 goto recov_retry;
9358 9366 }
9359 9367
9360 9368 if (e.error != 0) {
9361 9369 rdc->error = e.error;
9362 9370 goto out;
9363 9371 }
9364 9372
9365 9373 /* fall through for res.status case */
9366 9374 }
9367 9375
9368 9376 res_opcnt = res.array_len;
9369 9377
9370 9378 /*
9371 9379 * If compound failed first 2 ops (PUTFH+READDIR), then return
9372 9380 * failure here. Subsequent ops are for filling out dot-dot
9373 9381 * dirent, and if they fail, we still want to give the caller
9374 9382 * the dirents returned by (the successful) READDIR op, so we need
9375 9383 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9376 9384 *
9377 9385 * One example where PUTFH+READDIR ops would succeed but
9378 9386 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9379 9387 * but lacks x. In this case, a POSIX server's VOP_READDIR
9380 9388 * would succeed; however, VOP_LOOKUP(..) would fail since no
9381 9389 * x perm. We need to come up with a non-vendor-specific way
9382 9390 * for a POSIX server to return d_ino from dotdot's dirent if
9383 9391 * client only requests mounted_on_fileid, and just say the
9384 9392 * LOOKUPP succeeded and fill out the GETATTR. However, if
9385 9393 * client requested any mandatory attrs, server would be required
9386 9394 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9387 9395 * for dotdot.
9388 9396 */
9389 9397
9390 9398 if (res.status) {
9391 9399 if (res_opcnt <= 2) {
9392 9400 e.error = geterrno4(res.status);
9393 9401 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9394 9402 &recov_state, needrecov);
9395 9403 nfs4_purge_stale_fh(e.error, vp, cr);
9396 9404 rdc->error = e.error;
9397 9405 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9398 9406 if (rdc->entries != NULL) {
9399 9407 kmem_free(rdc->entries, rdc->entlen);
9400 9408 rdc->entries = NULL;
9401 9409 }
9402 9410 /*
9403 9411 * If readdir a node that is a stub for a
9404 9412 * crossed mount point, keep the original
9405 9413 * secinfo flavor for the current file system,
9406 9414 * not the crossed one.
9407 9415 */
9408 9416 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9409 9417 return;
9410 9418 }
9411 9419 }
9412 9420
9413 9421 resop = &res.array[1]; /* readdir res */
9414 9422 rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9415 9423
9416 9424 mutex_enter(&rp->r_statelock);
9417 9425 rp->r_cookieverf4 = rd_res->cookieverf;
9418 9426 mutex_exit(&rp->r_statelock);
9419 9427
9420 9428 /*
9421 9429 * For "." and ".." entries
9422 9430 * e.g.
9423 9431 * seek(cookie=0) -> "." entry with d_off = 1
9424 9432 * seek(cookie=1) -> ".." entry with d_off = 2
9425 9433 */
9426 9434 if (cookie == (nfs_cookie4) 0) {
9427 9435 if (rd_res->dotp)
9428 9436 rd_res->dotp->d_ino = nodeid;
9429 9437 if (rd_res->dotdotp)
9430 9438 rd_res->dotdotp->d_ino = pnodeid;
9431 9439 }
9432 9440 if (cookie == (nfs_cookie4) 1) {
9433 9441 if (rd_res->dotdotp)
9434 9442 rd_res->dotdotp->d_ino = pnodeid;
9435 9443 }
9436 9444
9437 9445
9438 9446 	/* LOOKUPP+GETATTR attempted */
9439 9447 if (args.array_len == 5 && rd_res->dotdotp) {
9440 9448 if (res.status == NFS4_OK && res_opcnt == 5) {
9441 9449 nfs_fh4 *fhp;
9442 9450 nfs4_sharedfh_t *sfhp;
9443 9451 vnode_t *pvp;
9444 9452 nfs4_ga_res_t *garp;
9445 9453
9446 9454 resop++; /* lookupp */
9447 9455 resop++; /* getfh */
9448 9456 fhp = &resop->nfs_resop4_u.opgetfh.object;
9449 9457
9450 9458 resop++; /* getattr of parent */
9451 9459
9452 9460 /*
9453 9461 * First, take care of finishing the
9454 9462 * readdir results.
9455 9463 */
9456 9464 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9457 9465 /*
9458 9466 * The d_ino of .. must be the inode number
9459 9467 * of the mounted filesystem.
9460 9468 */
9461 9469 if (garp->n4g_va.va_mask & AT_NODEID)
9462 9470 rd_res->dotdotp->d_ino =
9463 9471 garp->n4g_va.va_nodeid;
9464 9472
9465 9473
9466 9474 /*
9467 9475 * Next, create the ".." dnlc entry
9468 9476 */
9469 9477 sfhp = sfh4_get(fhp, mi);
9470 9478 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9471 9479 dnlc_update(vp, "..", pvp);
9472 9480 VN_RELE(pvp);
9473 9481 }
9474 9482 sfh4_rele(&sfhp);
9475 9483 }
9476 9484 }
9477 9485
9478 9486 if (mi->mi_io_kstats) {
9479 9487 mutex_enter(&mi->mi_lock);
9480 9488 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9481 9489 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9482 9490 mutex_exit(&mi->mi_lock);
9483 9491 }
9484 9492
9485 9493 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9486 9494
9487 9495 out:
9488 9496 /*
9489 9497 * If readdir a node that is a stub for a crossed mount point,
9490 9498 * keep the original secinfo flavor for the current file system,
9491 9499 * not the crossed one.
9492 9500 */
9493 9501 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9494 9502
9495 9503 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9496 9504 }
9497 9505
9498 9506
9499 9507 static int
9500 9508 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
9501 9509 {
9502 9510 rnode4_t *rp = VTOR4(bp->b_vp);
9503 9511 int count;
9504 9512 int error;
9505 9513 cred_t *cred_otw = NULL;
9506 9514 offset_t offset;
9507 9515 nfs4_open_stream_t *osp = NULL;
9508 9516 bool_t first_time = TRUE; /* first time getting otw cred */
9509 9517 bool_t last_time = FALSE; /* last time getting otw cred */
9510 9518
9511 9519 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);
9512 9520
9513 9521 DTRACE_IO1(start, struct buf *, bp);
9514 9522 offset = ldbtob(bp->b_lblkno);
9515 9523
9516 9524 if (bp->b_flags & B_READ) {
9517 9525 read_again:
9518 9526 /*
9519 9527 * Releases the osp, if it is provided.
9520 9528 * Puts a hold on the cred_otw and the new osp (if found).
9521 9529 */
9522 9530 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9523 9531 &first_time, &last_time);
9524 9532 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
9525 9533 offset, bp->b_bcount, &bp->b_resid, cred_otw,
9526 9534 readahead, NULL);
9527 9535 crfree(cred_otw);
9528 9536 if (!error) {
9529 9537 if (bp->b_resid) {
9530 9538 /*
9531 9539 * Didn't get it all because we hit EOF,
9532 9540 * zero all the memory beyond the EOF.
9533 9541 */
9534 9542 /* bzero(rdaddr + */
9535 9543 bzero(bp->b_un.b_addr +
9536 9544 bp->b_bcount - bp->b_resid, bp->b_resid);
9537 9545 }
9538 9546 mutex_enter(&rp->r_statelock);
9539 9547 if (bp->b_resid == bp->b_bcount &&
9540 9548 offset >= rp->r_size) {
9541 9549 /*
9542 9550 * We didn't read anything at all as we are
9543 9551 * past EOF. Return an error indicator back
9544 9552 * but don't destroy the pages (yet).
9545 9553 */
9546 9554 error = NFS_EOF;
9547 9555 }
9548 9556 mutex_exit(&rp->r_statelock);
9549 9557 } else if (error == EACCES && last_time == FALSE) {
9550 9558 goto read_again;
9551 9559 }
9552 9560 } else {
9553 9561 if (!(rp->r_flags & R4STALE)) {
9554 9562 write_again:
9555 9563 /*
9556 9564 * Releases the osp, if it is provided.
9557 9565 * Puts a hold on the cred_otw and the new
9558 9566 * osp (if found).
9559 9567 */
9560 9568 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9561 9569 &first_time, &last_time);
9562 9570 mutex_enter(&rp->r_statelock);
9563 9571 count = MIN(bp->b_bcount, rp->r_size - offset);
9564 9572 mutex_exit(&rp->r_statelock);
9565 9573 if (count < 0)
9566 9574 cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
9567 9575 #ifdef DEBUG
9568 9576 if (count == 0) {
9569 9577 zoneid_t zoneid = getzoneid();
9570 9578
9571 9579 zcmn_err(zoneid, CE_WARN,
9572 9580 "nfs4_bio: zero length write at %lld",
9573 9581 offset);
9574 9582 zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
9575 9583 "b_bcount=%ld, file size=%lld",
9576 9584 rp->r_flags, (long)bp->b_bcount,
9577 9585 rp->r_size);
9578 9586 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
9579 9587 if (nfs4_bio_do_stop)
9580 9588 debug_enter("nfs4_bio");
9581 9589 }
9582 9590 #endif
9583 9591 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
9584 9592 count, cred_otw, stab_comm);
9585 9593 if (error == EACCES && last_time == FALSE) {
9586 9594 crfree(cred_otw);
9587 9595 goto write_again;
9588 9596 }
9589 9597 bp->b_error = error;
9590 9598 if (error && error != EINTR &&
9591 9599 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
9592 9600 /*
9593 9601 * Don't print EDQUOT errors on the console.
9594 9602 * Don't print asynchronous EACCES errors.
9595 9603 * Don't print EFBIG errors.
9596 9604 * Print all other write errors.
9597 9605 */
9598 9606 if (error != EDQUOT && error != EFBIG &&
9599 9607 (error != EACCES ||
9600 9608 !(bp->b_flags & B_ASYNC)))
9601 9609 nfs4_write_error(bp->b_vp,
9602 9610 error, cred_otw);
9603 9611 /*
9604 9612 * Update r_error and r_flags as appropriate.
9605 9613 * If the error was ESTALE, then mark the
9606 9614 * rnode as not being writeable and save
9607 9615 * the error status. Otherwise, save any
9608 9616 * errors which occur from asynchronous
9609 9617 * page invalidations. Any errors occurring
9610 9618 * from other operations should be saved
9611 9619 * by the caller.
9612 9620 */
9613 9621 mutex_enter(&rp->r_statelock);
9614 9622 if (error == ESTALE) {
9615 9623 rp->r_flags |= R4STALE;
9616 9624 if (!rp->r_error)
9617 9625 rp->r_error = error;
9618 9626 } else if (!rp->r_error &&
9619 9627 (bp->b_flags &
9620 9628 (B_INVAL|B_FORCE|B_ASYNC)) ==
9621 9629 (B_INVAL|B_FORCE|B_ASYNC)) {
9622 9630 rp->r_error = error;
9623 9631 }
9624 9632 mutex_exit(&rp->r_statelock);
9625 9633 }
9626 9634 crfree(cred_otw);
9627 9635 } else {
9628 9636 error = rp->r_error;
9629 9637 /*
9630 9638 * A close may have cleared r_error, if so,
9631 9639 * propagate ESTALE error return properly
9632 9640 */
9633 9641 if (error == 0)
9634 9642 error = ESTALE;
9635 9643 }
9636 9644 }
9637 9645
9638 9646 if (error != 0 && error != NFS_EOF)
9639 9647 bp->b_flags |= B_ERROR;
9640 9648
9641 9649 if (osp)
9642 9650 open_stream_rele(osp, rp);
9643 9651
9644 9652 DTRACE_IO1(done, struct buf *, bp);
9645 9653
9646 9654 return (error);
9647 9655 }
9648 9656
/*
 * VOP_FID for NFSv4: file identifiers are managed by the server, so
 * this client cannot produce a local fid -- report EREMOTE.
 */
9649 9657 /* ARGSUSED */
9650 9658 int
9651 9659 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9652 9660 {
9653 9661 	return (EREMOTE);
9654 9662 }
9655 9663
/*
 * VOP_RWLOCK for NFSv4.  Readers take r_rwlock as RW_READER.  Writers
 * normally take it as RW_WRITER, but when direct I/O is in effect
 * (R4DIRECTIO on the rnode or MI4_DIRECTIO on the mount) and the file
 * has no mmap'ed regions and no cached pages, a reader lock suffices
 * and V_WRITELOCK_FALSE is returned, permitting concurrent writers.
 */
9656 9664 /* ARGSUSED2 */
9657 9665 int
9658 9666 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9659 9667 {
9660 9668 	rnode4_t *rp = VTOR4(vp);
9661 9669 
9662 9670 	if (!write_lock) {
9663 9671 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9664 9672 		return (V_WRITELOCK_FALSE);
9665 9673 	}
9666 9674 
9667 9675 	if ((rp->r_flags & R4DIRECTIO) ||
9668 9676 	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9669 9677 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9670 9678 		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9671 9679 			return (V_WRITELOCK_FALSE);
			/* Pages appeared; drop the reader lock and go
			 * get the writer lock instead. */
9672 9680 		nfs_rw_exit(&rp->r_rwlock);
9673 9681 	}
9674 9682 
9675 9683 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9676 9684 	return (V_WRITELOCK_TRUE);
9677 9685 }
9678 9686
/*
 * VOP_RWUNLOCK for NFSv4: release r_rwlock in whichever mode
 * nfs4_rwlock() acquired it.
 */
9679 9687 /* ARGSUSED */
9680 9688 void
9681 9689 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9682 9690 {
9683 9691 	rnode4_t *rp = VTOR4(vp);
9684 9692 
9685 9693 	nfs_rw_exit(&rp->r_rwlock);
9686 9694 }
9687 9695
/*
 * VOP_SEEK for NFSv4: any offset is permitted on a directory (offsets
 * there are readdir cookies), and negative offsets are rejected with
 * EINVAL on everything else.
 */
9688 9696 /* ARGSUSED */
9689 9697 static int
9690 9698 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9691 9699 {
9692 9700 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9693 9701 		return (EIO);
9694 9702 
9695 9703 	/*
9696 9704 	 * Because we stuff the readdir cookie into the offset field
9697 9705 	 * someone may attempt to do an lseek with the cookie which
9698 9706 	 * we want to succeed.
9699 9707 	 */
9700 9708 	if (vp->v_type == VDIR)
9701 9709 		return (0);
9702 9710 	if (*noffp < 0)
9703 9711 		return (EINVAL);
9704 9712 	return (0);
9705 9713 }
9706 9714
9707 9715
9708 9716 /*
9709 9717  * Return all the pages from [off..off+len) in file
9710 9718  */
/*
 * VOP_GETPAGE for NFSv4: validate the caches, throttle dirty-page
 * creation for S_CREATE faults, reject out-of-range offsets (except
 * for segkmap callers, who may legitimately extend the file), then
 * hand off to nfs4_getapage()/pvn_getpages().  NFS_EOF from the lower
 * layer triggers a cache purge and retry; ESTALE purges the stale
 * file handle.
 */
9711 9719 /* ARGSUSED */
9712 9720 static int
9713 9721 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9714 9722     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9715 9723     enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9716 9724 {
9717 9725 	rnode4_t *rp;
9718 9726 	int error;
9719 9727 	mntinfo4_t *mi;
9720 9728 
9721 9729 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9722 9730 		return (EIO);
9723 9731 	rp = VTOR4(vp);
9724 9732 	if (IS_SHADOW(vp, rp))
9725 9733 		vp = RTOV4(rp);
9726 9734 
9727 9735 	if (vp->v_flag & VNOMAP)
9728 9736 		return (ENOSYS);
9729 9737 
9730 9738 	if (protp != NULL)
9731 9739 		*protp = PROT_ALL;
9732 9740 
9733 9741 	/*
9734 9742 	 * Now validate that the caches are up to date.
9735 9743 	 */
9736 9744 	if (error = nfs4_validate_caches(vp, cr))
9737 9745 		return (error);
9738 9746 
9739 9747 	mi = VTOMI4(vp);
9740 9748 retry:
9741 9749 	mutex_enter(&rp->r_statelock);
9742 9750 
9743 9751 	/*
9744 9752 	 * Don't create dirty pages faster than they
9745 9753 	 * can be cleaned so that the system doesn't
9746 9754 	 * get imbalanced. If the async queue is
9747 9755 	 * maxed out, then wait for it to drain before
9748 9756 	 * creating more dirty pages. Also, wait for
9749 9757 	 * any threads doing pagewalks in the vop_getattr
9750 9758 	 * entry points so that they don't block for
9751 9759 	 * long periods.
9752 9760 	 */
9753 9761 	if (rw == S_CREATE) {
9754 9762 		while ((mi->mi_max_threads != 0 &&
9755 9763 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
9756 9764 		    rp->r_gcount > 0)
9757 9765 			cv_wait(&rp->r_cv, &rp->r_statelock);
9758 9766 	}
9759 9767 
9760 9768 	/*
9761 9769 	 * If we are getting called as a side effect of an nfs_write()
9762 9770 	 * operation the local file size might not be extended yet.
9763 9771 	 * In this case we want to be able to return pages of zeroes.
9764 9772 	 */
9765 9773 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9766 9774 		NFS4_DEBUG(nfs4_pageio_debug,
9767 9775 		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
9768 9776 		    "len=%llu, size=%llu, attrsize =%llu", off,
9769 9777 		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9770 9778 		mutex_exit(&rp->r_statelock);
9771 9779 		return (EFAULT);	/* beyond EOF */
9772 9780 	}
9773 9781 
9774 9782 	mutex_exit(&rp->r_statelock);
9775 9783 
9776 9784 	if (len <= PAGESIZE) {
9777 9785 		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
9778 9786 		    seg, addr, rw, cr);
9779 9787 		NFS4_DEBUG(nfs4_pageio_debug && error,
9780 9788 		    (CE_NOTE, "getpage error %d; off=%lld, "
9781 9789 		    "len=%lld", error, off, (u_longlong_t)len));
9782 9790 	} else {
9783 9791 		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9784 9792 		    pl, plsz, seg, addr, rw, cr);
9785 9793 		NFS4_DEBUG(nfs4_pageio_debug && error,
9786 9794 		    (CE_NOTE, "getpages error %d; off=%lld, "
9787 9795 		    "len=%lld", error, off, (u_longlong_t)len));
9788 9796 	}
9789 9797 
9790 9798 	switch (error) {
9791 9799 	case NFS_EOF:
9792 9800 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9793 9801 		goto retry;
9794 9802 	case ESTALE:
9795 9803 		nfs4_purge_stale_fh(error, vp, cr);
9796 9804 	}
9797 9805 
9798 9806 	return (error);
9799 9807 }
9800 9808
9801 9809 /*
9802 9810  * Called from pvn_getpages or nfs4_getpage to get a particular page.
9803 9811  */
/*
 * Obtain one page at `off': schedule sequential readahead first (so
 * async threads run in parallel with the synchronous read), then
 * either find the page in the cache, create a zero page for S_CREATE,
 * or kluster and read a block from the server via nfs4_bio().
 * Returns 0 with pl[] filled in, or an errno; NFS_EOF from nfs4_bio()
 * becomes 0 (zeroed pages) for segkmap callers and EFAULT otherwise.
 */
9804 9812 /* ARGSUSED */
9805 9813 static int
9806 9814 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9807 9815     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9808 9816     enum seg_rw rw, cred_t *cr)
9809 9817 {
9810 9818 	rnode4_t *rp;
9811 9819 	uint_t bsize;
9812 9820 	struct buf *bp;
9813 9821 	page_t *pp;
9814 9822 	u_offset_t lbn;
9815 9823 	u_offset_t io_off;
9816 9824 	u_offset_t blkoff;
9817 9825 	u_offset_t rablkoff;
9818 9826 	size_t io_len;
9819 9827 	uint_t blksize;
9820 9828 	int error;
9821 9829 	int readahead;
9822 9830 	int readahead_issued = 0;
9823 9831 	int ra_window; /* readahead window */
9824 9832 	page_t *pagefound;
9825 9833 	page_t *savepp;
9826 9834 
9827 9835 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9828 9836 		return (EIO);
9829 9837 
9830 9838 	rp = VTOR4(vp);
9831 9839 	ASSERT(!IS_SHADOW(vp, rp));
9832 9840 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9833 9841 
9834 9842 reread:
9835 9843 	bp = NULL;
9836 9844 	pp = NULL;
9837 9845 	pagefound = NULL;
9838 9846 
9839 9847 	if (pl != NULL)
9840 9848 		pl[0] = NULL;
9841 9849 
9842 9850 	error = 0;
9843 9851 	lbn = off / bsize;
9844 9852 	blkoff = lbn * bsize;
9845 9853 
9846 9854 	/*
9847 9855 	 * Queueing up the readahead before doing the synchronous read
9848 9856 	 * results in a significant increase in read throughput because
9849 9857 	 * of the increased parallelism between the async threads and
9850 9858 	 * the process context.
9851 9859 	 */
9852 9860 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9853 9861 	    rw != S_CREATE &&
9854 9862 	    !(vp->v_flag & VNOCACHE)) {
9855 9863 		mutex_enter(&rp->r_statelock);
9856 9864 
9857 9865 		/*
9858 9866 		 * Calculate the number of readaheads to do.
9859 9867 		 * a) No readaheads at offset = 0.
9860 9868 		 * b) Do maximum(nfs4_nra) readaheads when the readahead
9861 9869 		 *    window is closed.
9862 9870 		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9863 9871 		 *    upon how far the readahead window is open or close.
9864 9872 		 * d) No readaheads if rp->r_nextr is not within the scope
9865 9873 		 *    of the readahead window (random i/o).
9866 9874 		 */
9867 9875 
9868 9876 		if (off == 0)
9869 9877 			readahead = 0;
9870 9878 		else if (blkoff == rp->r_nextr)
9871 9879 			readahead = nfs4_nra;
9872 9880 		else if (rp->r_nextr > blkoff &&
9873 9881 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
9874 9882 		    <= (nfs4_nra - 1)))
9875 9883 			readahead = nfs4_nra - ra_window;
9876 9884 		else
9877 9885 			readahead = 0;
9878 9886 
9879 9887 		rablkoff = rp->r_nextr;
9880 9888 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9881 9889 			mutex_exit(&rp->r_statelock);
9882 9890 			if (nfs4_async_readahead(vp, rablkoff + bsize,
9883 9891 			    addr + (rablkoff + bsize - off),
9884 9892 			    seg, cr, nfs4_readahead) < 0) {
9885 9893 				mutex_enter(&rp->r_statelock);
9886 9894 				break;
9887 9895 			}
9888 9896 			readahead--;
9889 9897 			rablkoff += bsize;
9890 9898 			/*
9891 9899 			 * Indicate that we did a readahead so
9892 9900 			 * readahead offset is not updated
9893 9901 			 * by the synchronous read below.
9894 9902 			 */
9895 9903 			readahead_issued = 1;
9896 9904 			mutex_enter(&rp->r_statelock);
9897 9905 			/*
9898 9906 			 * set readahead offset to
9899 9907 			 * offset of last async readahead
9900 9908 			 * request.
9901 9909 			 */
9902 9910 			rp->r_nextr = rablkoff;
9903 9911 		}
9904 9912 		mutex_exit(&rp->r_statelock);
9905 9913 	}
9906 9914 
9907 9915 again:
9908 9916 	if ((pagefound = page_exists(vp, off)) == NULL) {
9909 9917 		if (pl == NULL) {
9910 9918 			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9911 9919 			    nfs4_readahead);
9912 9920 		} else if (rw == S_CREATE) {
9913 9921 			/*
9914 9922 			 * Block for this page is not allocated, or the offset
9915 9923 			 * is beyond the current allocation size, or we're
9916 9924 			 * allocating a swap slot and the page was not found,
9917 9925 			 * so allocate it and return a zero page.
9918 9926 			 */
9919 9927 			if ((pp = page_create_va(vp, off,
9920 9928 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9921 9929 				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9922 9930 			io_len = PAGESIZE;
9923 9931 			mutex_enter(&rp->r_statelock);
9924 9932 			rp->r_nextr = off + PAGESIZE;
9925 9933 			mutex_exit(&rp->r_statelock);
9926 9934 		} else {
9927 9935 			/*
9928 9936 			 * Need to go to server to get a block
9929 9937 			 */
9930 9938 			mutex_enter(&rp->r_statelock);
9931 9939 			if (blkoff < rp->r_size &&
9932 9940 			    blkoff + bsize > rp->r_size) {
9933 9941 				/*
9934 9942 				 * If less than a block left in
9935 9943 				 * file read less than a block.
9936 9944 				 */
9937 9945 				if (rp->r_size <= off) {
9938 9946 					/*
9939 9947 					 * Trying to access beyond EOF,
9940 9948 					 * set up to get at least one page.
9941 9949 					 */
9942 9950 					blksize = off + PAGESIZE - blkoff;
9943 9951 				} else
9944 9952 					blksize = rp->r_size - blkoff;
9945 9953 			} else if ((off == 0) ||
9946 9954 			    (off != rp->r_nextr && !readahead_issued)) {
9947 9955 				blksize = PAGESIZE;
9948 9956 				blkoff = off; /* block = page here */
9949 9957 			} else
9950 9958 				blksize = bsize;
9951 9959 			mutex_exit(&rp->r_statelock);
9952 9960 
9953 9961 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9954 9962 			    &io_len, blkoff, blksize, 0);
9955 9963 
9956 9964 			/*
9957 9965 			 * Some other thread has entered the page,
9958 9966 			 * so just use it.
9959 9967 			 */
9960 9968 			if (pp == NULL)
9961 9969 				goto again;
9962 9970 
9963 9971 			/*
9964 9972 			 * Now round the request size up to page boundaries.
9965 9973 			 * This ensures that the entire page will be
9966 9974 			 * initialized to zeroes if EOF is encountered.
9967 9975 			 */
9968 9976 			io_len = ptob(btopr(io_len));
9969 9977 
9970 9978 			bp = pageio_setup(pp, io_len, vp, B_READ);
9971 9979 			ASSERT(bp != NULL);
9972 9980 
9973 9981 			/*
9974 9982 			 * pageio_setup should have set b_addr to 0. This
9975 9983 			 * is correct since we want to do I/O on a page
9976 9984 			 * boundary. bp_mapin will use this addr to calculate
9977 9985 			 * an offset, and then set b_addr to the kernel virtual
9978 9986 			 * address it allocated for us.
9979 9987 			 */
9980 9988 			ASSERT(bp->b_un.b_addr == 0);
9981 9989 
9982 9990 			bp->b_edev = 0;
9983 9991 			bp->b_dev = 0;
9984 9992 			bp->b_lblkno = lbtodb(io_off);
9985 9993 			bp->b_file = vp;
9986 9994 			bp->b_offset = (offset_t)off;
9987 9995 			bp_mapin(bp);
9988 9996 
9989 9997 			/*
9990 9998 			 * If doing a write beyond what we believe is EOF,
9991 9999 			 * don't bother trying to read the pages from the
9992 10000 			 * server, we'll just zero the pages here. We
9993 10001 			 * don't check that the rw flag is S_WRITE here
9994 10002 			 * because some implementations may attempt a
9995 10003 			 * read access to the buffer before copying data.
9996 10004 			 */
9997 10005 			mutex_enter(&rp->r_statelock);
9998 10006 			if (io_off >= rp->r_size && seg == segkmap) {
9999 10007 				mutex_exit(&rp->r_statelock);
10000 10008 				bzero(bp->b_un.b_addr, io_len);
10001 10009 			} else {
10002 10010 				mutex_exit(&rp->r_statelock);
10003 10011 				error = nfs4_bio(bp, NULL, cr, FALSE);
10004 10012 			}
10005 10013 
10006 10014 			/*
10007 10015 			 * Unmap the buffer before freeing it.
10008 10016 			 */
10009 10017 			bp_mapout(bp);
10010 10018 			pageio_done(bp);
10011 10019 
10012 10020 			savepp = pp;
10013 10021 			do {
10014 10022 				pp->p_fsdata = C_NOCOMMIT;
10015 10023 			} while ((pp = pp->p_next) != savepp);
10016 10024 
10017 10025 			if (error == NFS_EOF) {
10018 10026 				/*
10019 10027 				 * If doing a write system call just return
10020 10028 				 * zeroed pages, else user tried to get pages
10021 10029 				 * beyond EOF, return error. We don't check
10022 10030 				 * that the rw flag is S_WRITE here because
10023 10031 				 * some implementations may attempt a read
10024 10032 				 * access to the buffer before copying data.
10025 10033 				 */
10026 10034 				if (seg == segkmap)
10027 10035 					error = 0;
10028 10036 				else
10029 10037 					error = EFAULT;
10030 10038 			}
10031 10039 
10032 10040 			if (!readahead_issued && !error) {
10033 10041 				mutex_enter(&rp->r_statelock);
10034 10042 				rp->r_nextr = io_off + io_len;
10035 10043 				mutex_exit(&rp->r_statelock);
10036 10044 			}
10037 10045 		}
10038 10046 	}
10039 10047 
10040 10048 out:
10041 10049 	if (pl == NULL)
10042 10050 		return (error);
10043 10051 
10044 10052 	if (error) {
10045 10053 		if (pp != NULL)
10046 10054 			pvn_read_done(pp, B_ERROR);
10047 10055 		return (error);
10048 10056 	}
10049 10057 
10050 10058 	if (pagefound) {
10051 10059 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10052 10060 
10053 10061 		/*
10054 10062 		 * Page exists in the cache, acquire the appropriate lock.
10055 10063 		 * If this fails, start all over again.
10056 10064 		 */
10057 10065 		if ((pp = page_lookup(vp, off, se)) == NULL) {
10058 10066 #ifdef DEBUG
10059 10067 			nfs4_lostpage++;
10060 10068 #endif
10061 10069 			goto reread;
10062 10070 		}
10063 10071 		pl[0] = pp;
10064 10072 		pl[1] = NULL;
10065 10073 		return (0);
10066 10074 	}
10067 10075 
10068 10076 	if (pp != NULL)
10069 10077 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10070 10078 
10071 10079 	return (error);
10072 10080 }
10073 10081
/*
 * Asynchronous readahead callback (handed to nfs4_async_readahead()
 * from nfs4_getapage()): read one block-sized kluster starting at
 * `blkoff' into the page cache via nfs4_bio().  No status is returned;
 * on error the pages are released with B_ERROR (pvn_read_done()
 * destroys them) and r_nextr is pulled back to io_off so the range can
 * be retried synchronously.
 */
10074 10082 static void
10075 10083 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
10076 10084     cred_t *cr)
10077 10085 {
10078 10086 	int error;
10079 10087 	page_t *pp;
10080 10088 	u_offset_t io_off;
10081 10089 	size_t io_len;
10082 10090 	struct buf *bp;
10083 10091 	uint_t bsize, blksize;
10084 10092 	rnode4_t *rp = VTOR4(vp);
10085 10093 	page_t *savepp;
10086 10094 
10087 10095 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10088 10096 
10089 10097 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10090 10098 
10091 10099 	mutex_enter(&rp->r_statelock);
10092 10100 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
10093 10101 		/*
10094 10102 		 * If less than a block left in file read less
10095 10103 		 * than a block.
10096 10104 		 */
10097 10105 		blksize = rp->r_size - blkoff;
10098 10106 	} else
10099 10107 		blksize = bsize;
10100 10108 	mutex_exit(&rp->r_statelock);
10101 10109 
10102 10110 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
10103 10111 	    &io_off, &io_len, blkoff, blksize, 1);
10104 10112 	/*
10105 10113 	 * The isra flag passed to the kluster function is 1, we may have
10106 10114 	 * gotten a return value of NULL for a variety of reasons (# of free
10107 10115 	 * pages < minfree, someone entered the page on the vnode etc). In all
10108 10116 	 * cases, we want to punt on the readahead.
10109 10117 	 */
10110 10118 	if (pp == NULL)
10111 10119 		return;
10112 10120 
10113 10121 	/*
10114 10122 	 * Now round the request size up to page boundaries.
10115 10123 	 * This ensures that the entire page will be
10116 10124 	 * initialized to zeroes if EOF is encountered.
10117 10125 	 */
10118 10126 	io_len = ptob(btopr(io_len));
10119 10127 
10120 10128 	bp = pageio_setup(pp, io_len, vp, B_READ);
10121 10129 	ASSERT(bp != NULL);
10122 10130 
10123 10131 	/*
10124 10132 	 * pageio_setup should have set b_addr to 0. This is correct since
10125 10133 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
10126 10134 	 * to calculate an offset, and then set b_addr to the kernel virtual
10127 10135 	 * address it allocated for us.
10128 10136 	 */
10129 10137 	ASSERT(bp->b_un.b_addr == 0);
10130 10138 
10131 10139 	bp->b_edev = 0;
10132 10140 	bp->b_dev = 0;
10133 10141 	bp->b_lblkno = lbtodb(io_off);
10134 10142 	bp->b_file = vp;
10135 10143 	bp->b_offset = (offset_t)blkoff;
10136 10144 	bp_mapin(bp);
10137 10145 
10138 10146 	/*
10139 10147 	 * If doing a write beyond what we believe is EOF, don't bother trying
10140 10148 	 * to read the pages from the server, we'll just zero the pages here.
10141 10149 	 * We don't check that the rw flag is S_WRITE here because some
10142 10150 	 * implementations may attempt a read access to the buffer before
10143 10151 	 * copying data.
10144 10152 	 */
10145 10153 	mutex_enter(&rp->r_statelock);
10146 10154 	if (io_off >= rp->r_size && seg == segkmap) {
10147 10155 		mutex_exit(&rp->r_statelock);
10148 10156 		bzero(bp->b_un.b_addr, io_len);
10149 10157 		error = 0;
10150 10158 	} else {
10151 10159 		mutex_exit(&rp->r_statelock);
10152 10160 		error = nfs4_bio(bp, NULL, cr, TRUE);
10153 10161 		if (error == NFS_EOF)
10154 10162 			error = 0;
10155 10163 	}
10156 10164 
10157 10165 	/*
10158 10166 	 * Unmap the buffer before freeing it.
10159 10167 	 */
10160 10168 	bp_mapout(bp);
10161 10169 	pageio_done(bp);
10162 10170 
10163 10171 	savepp = pp;
10164 10172 	do {
10165 10173 		pp->p_fsdata = C_NOCOMMIT;
10166 10174 	} while ((pp = pp->p_next) != savepp);
10167 10175 
10168 10176 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
10169 10177 
10170 10178 	/*
10171 10179 	 * In case of error set readahead offset
10172 10180 	 * to the lowest offset.
10173 10181 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
10174 10182 	 */
10175 10183 	if (error && rp->r_nextr > io_off) {
10176 10184 		mutex_enter(&rp->r_statelock);
		/* Re-check under r_statelock: the unlocked test above
		 * was only an optimization. */
10177 10185 		if (rp->r_nextr > io_off)
10178 10186 			rp->r_nextr = io_off;
10179 10187 		mutex_exit(&rp->r_statelock);
10180 10188 	}
10181 10189 }
10182 10190
10183 10191 /*
10184 10192  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10185 10193  * If len == 0, do from off to EOF.
10186 10194  *
10187 10195  * The normal cases should be len == 0 && off == 0 (entire vp list) or
10188 10196  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10189 10197  * (from pageout).
10190 10198  */
/*
 * VOP_PUTPAGE for NFSv4: a thin wrapper around nfs4_putpages() that
 * bumps r_count around the call (and wakes r_cv waiters afterwards) so
 * other threads can tell a flush is in progress.  Whole-file
 * non-invalidating flushes on a read-only vfs are a no-op.
 */
10191 10199 /* ARGSUSED */
10192 10200 static int
10193 10201 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10194 10202     caller_context_t *ct)
10195 10203 {
10196 10204 	int error;
10197 10205 	rnode4_t *rp;
10198 10206 
10199 10207 	ASSERT(cr != NULL);
10200 10208 
10201 10209 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10202 10210 		return (EIO);
10203 10211 
10204 10212 	rp = VTOR4(vp);
10205 10213 	if (IS_SHADOW(vp, rp))
10206 10214 		vp = RTOV4(rp);
10207 10215 
10208 10216 	/*
10209 10217 	 * XXX - Why should this check be made here?
10210 10218 	 */
10211 10219 	if (vp->v_flag & VNOMAP)
10212 10220 		return (ENOSYS);
10213 10221 
10214 10222 	if (len == 0 && !(flags & B_INVAL) &&
10215 10223 	    (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10216 10224 		return (0);
10217 10225 
10218 10226 	mutex_enter(&rp->r_statelock);
10219 10227 	rp->r_count++;
10220 10228 	mutex_exit(&rp->r_statelock);
10221 10229 	error = nfs4_putpages(vp, off, len, flags, cr);
10222 10230 	mutex_enter(&rp->r_statelock);
10223 10231 	rp->r_count--;
10224 10232 	cv_broadcast(&rp->r_cv);
10225 10233 	mutex_exit(&rp->r_statelock);
10226 10234 
10227 10235 	return (error);
10228 10236 }
10229 10237
/*
 * Write out a single page, possibly klustering adjacent dirty pages.
 *
 *	vp	- vnode of the file (must not be a shadow vnode)
 *	pp	- the page to write back
 *	offp	- if non-NULL, set to the offset actually written
 *	lenp	- if non-NULL, set to the length actually written
 *	flags	- B_* flags; B_ASYNC selects the async write path
 *	cr	- credentials for the write
 *
 * Returns 0 or an errno.  Note that 0 is also returned on the
 * R4MODINPROGRESS collision path below, where the pages are re-dirtied
 * instead of written; the write is restarted at some later time.
 */
int
nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	u_offset_t io_off;
	u_offset_t lbn_off;
	u_offset_t lbn;
	size_t io_len;
	uint_t bsize;
	int error;
	rnode4_t *rp;

	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);
	/* Callers must hold r_count (see nfs4_putpage). */
	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);
	ASSERT(!IS_SHADOW(vp, rp));

	/* Logical block (of at least a page) containing this page. */
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks. If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */
	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * pvn_write_kluster shouldn't have returned a page with offset
	 * behind the original page we were given. Verify that.
	 */
	ASSERT((pp->p_offset / bsize) >= lbn);

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back. It will also handle invalidation and freeing
	 * of pages that are not dirty. Check for page length rounding
	 * problems.
	 */
	if (io_off + io_len > lbn_off + bsize) {
		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
		io_len = lbn_off + bsize - io_off;
	}
	/*
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
	 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file. When the uiomove() completes the r_size is
	 * updated and the R4MODINPROGRESS flag is cleared.
	 *
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size. Without this handshaking, it is
	 * possible that nfs4_bio() picks up the old value of r_size
	 * before the uiomove() in writerp4() completes. This will result
	 * in the write through nfs4_bio() being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd). When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
	 * checked. This will still be the old size. Therefore the page will
	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
	 * the page will be found to be clean and the write will be dropped.
	 */
	if (rp->r_flags & R4MODINPROGRESS) {
		/* Re-check under r_statelock to close the race. */
		mutex_enter(&rp->r_statelock);
		if ((rp->r_flags & R4MODINPROGRESS) &&
		    rp->r_modaddr + MAXBSIZE > io_off &&
		    rp->r_modaddr < io_off + io_len) {
			page_t *plist;
			/*
			 * A write is in progress for this region of the file.
			 * If we did not detect R4MODINPROGRESS here then this
			 * path through nfs_putapage() would eventually go to
			 * nfs4_bio() and may not write out all of the data
			 * in the pages. We end up losing data. So we decide
			 * to set the modified bit on each page in the page
			 * list and mark the rnode with R4DIRTY. This write
			 * will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= R4DIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	if (flags & B_ASYNC) {
		error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_putapage);
	} else
		error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}
10354 10362
/*
 * Synchronously write the pages in [io_off, io_off + io_len) to the server.
 * Used both for synchronous putpage requests and as the worker function
 * handed to nfs4_async_putapage() by nfs4_putapage().
 *
 * On "out of space"-style failures (ENOSPC, EDQUOT, EFBIG, EACCES) without
 * B_INVAL|B_FORCE already set, the rnode is flagged R4OUTOFSPACE, the pages
 * are completed in error, and (for non-async callers only) the write is
 * re-driven with B_INVAL | B_FORCE so the pages are destroyed rather than
 * left to fill memory.  Returns 0 or an errno.
 */
static int
nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	flags |= B_WRITE;

	/* Do the actual over-the-wire write of the page list. */
	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR4(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
	    error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		if (!(rp->r_flags & R4OUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful. This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread. It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them. Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs4_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & R4OUTOFSPACE) {
			/* A write succeeded again; clear the sticky flag. */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
		/* Under memory pressure, nudge the server to commit now. */
		if (freemem < desfree)
			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
			    NFS4_WRITE_NOWAIT);
	}

	return (error);
}
10417 10425
#ifdef DEBUG
/*
 * Debug tunable: when non-zero, nfs4_map() fails with EIO if the file
 * has no open stream, instead of implicitly OPENing it (see nfs4_map()).
 */
int nfs4_force_open_before_mmap = 0;
#endif
10421 10429
10422 10430 /* ARGSUSED */
10423 10431 static int
10424 10432 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10425 10433 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10426 10434 caller_context_t *ct)
10427 10435 {
10428 10436 struct segvn_crargs vn_a;
10429 10437 int error = 0;
10430 10438 rnode4_t *rp = VTOR4(vp);
10431 10439 mntinfo4_t *mi = VTOMI4(vp);
10432 10440
10433 10441 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10434 10442 return (EIO);
10435 10443
10436 10444 if (vp->v_flag & VNOMAP)
10437 10445 return (ENOSYS);
10438 10446
10439 10447 if (off < 0 || (off + len) < 0)
10440 10448 return (ENXIO);
10441 10449
10442 10450 if (vp->v_type != VREG)
10443 10451 return (ENODEV);
10444 10452
10445 10453 /*
10446 10454 * If the file is delegated to the client don't do anything.
10447 10455 * If the file is not delegated, then validate the data cache.
10448 10456 */
10449 10457 mutex_enter(&rp->r_statev4_lock);
10450 10458 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10451 10459 mutex_exit(&rp->r_statev4_lock);
10452 10460 error = nfs4_validate_caches(vp, cr);
10453 10461 if (error)
10454 10462 return (error);
10455 10463 } else {
10456 10464 mutex_exit(&rp->r_statev4_lock);
10457 10465 }
10458 10466
10459 10467 /*
10460 10468 * Check to see if the vnode is currently marked as not cachable.
10461 10469 * This means portions of the file are locked (through VOP_FRLOCK).
10462 10470 * In this case the map request must be refused. We use
10463 10471 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10464 10472 *
10465 10473 * Atomically increment r_inmap after acquiring r_rwlock. The
10466 10474 * idea here is to acquire r_rwlock to block read/write and
10467 10475 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10468 10476 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10469 10477 * and we can prevent the deadlock that would have occurred
10470 10478 * when nfs4_addmap() would have acquired it out of order.
10471 10479 *
10472 10480 * Since we are not protecting r_inmap by any lock, we do not
10473 10481 * hold any lock when we decrement it. We atomically decrement
10474 10482 * r_inmap after we release r_lkserlock.
10475 10483 */
10476 10484
10477 10485 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10478 10486 return (EINTR);
10479 10487 atomic_add_int(&rp->r_inmap, 1);
10480 10488 nfs_rw_exit(&rp->r_rwlock);
10481 10489
10482 10490 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10483 10491 atomic_add_int(&rp->r_inmap, -1);
10484 10492 return (EINTR);
10485 10493 }
10486 10494
10487 10495
10488 10496 if (vp->v_flag & VNOCACHE) {
10489 10497 error = EAGAIN;
10490 10498 goto done;
10491 10499 }
10492 10500
10493 10501 /*
10494 10502 * Don't allow concurrent locks and mapping if mandatory locking is
10495 10503 * enabled.
10496 10504 */
10497 10505 if (flk_has_remote_locks(vp)) {
10498 10506 struct vattr va;
10499 10507 va.va_mask = AT_MODE;
10500 10508 error = nfs4getattr(vp, &va, cr);
10501 10509 if (error != 0)
10502 10510 goto done;
10503 10511 if (MANDLOCK(vp, va.va_mode)) {
10504 10512 error = EAGAIN;
10505 10513 goto done;
10506 10514 }
10507 10515 }
10508 10516
10509 10517 /*
10510 10518 * It is possible that the rnode has a lost lock request that we
10511 10519 * are still trying to recover, and that the request conflicts with
10512 10520 * this map request.
10513 10521 *
10514 10522 * An alternative approach would be for nfs4_safemap() to consider
10515 10523 * queued lock requests when deciding whether to set or clear
10516 10524 * VNOCACHE. This would require the frlock code path to call
10517 10525 * nfs4_safemap() after enqueing a lost request.
10518 10526 */
10519 10527 if (nfs4_map_lost_lock_conflict(vp)) {
10520 10528 error = EAGAIN;
10521 10529 goto done;
10522 10530 }
10523 10531
10524 10532 as_rangelock(as);
10525 10533 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10526 10534 if (error != 0) {
10527 10535 as_rangeunlock(as);
10528 10536 goto done;
10529 10537 }
10530 10538
10531 10539 if (vp->v_type == VREG) {
10532 10540 /*
10533 10541 * We need to retrieve the open stream
10534 10542 */
10535 10543 nfs4_open_stream_t *osp = NULL;
10536 10544 nfs4_open_owner_t *oop = NULL;
10537 10545
10538 10546 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10539 10547 if (oop != NULL) {
10540 10548 /* returns with 'os_sync_lock' held */
10541 10549 osp = find_open_stream(oop, rp);
10542 10550 open_owner_rele(oop);
10543 10551 }
10544 10552 if (osp == NULL) {
10545 10553 #ifdef DEBUG
10546 10554 if (nfs4_force_open_before_mmap) {
10547 10555 error = EIO;
10548 10556 goto done;
10549 10557 }
10550 10558 #endif
10551 10559 /* returns with 'os_sync_lock' held */
10552 10560 error = open_and_get_osp(vp, cr, &osp);
10553 10561 if (osp == NULL) {
10554 10562 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10555 10563 "nfs4_map: we tried to OPEN the file "
10556 10564 "but again no osp, so fail with EIO"));
10557 10565 goto done;
10558 10566 }
10559 10567 }
10560 10568
10561 10569 if (osp->os_failed_reopen) {
10562 10570 mutex_exit(&osp->os_sync_lock);
10563 10571 open_stream_rele(osp, rp);
10564 10572 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10565 10573 "nfs4_map: os_failed_reopen set on "
10566 10574 "osp %p, cr %p, rp %s", (void *)osp,
10567 10575 (void *)cr, rnode4info(rp)));
10568 10576 error = EIO;
10569 10577 goto done;
10570 10578 }
10571 10579 mutex_exit(&osp->os_sync_lock);
10572 10580 open_stream_rele(osp, rp);
10573 10581 }
10574 10582
10575 10583 vn_a.vp = vp;
10576 10584 vn_a.offset = off;
10577 10585 vn_a.type = (flags & MAP_TYPE);
10578 10586 vn_a.prot = (uchar_t)prot;
10579 10587 vn_a.maxprot = (uchar_t)maxprot;
10580 10588 vn_a.flags = (flags & ~MAP_TYPE);
10581 10589 vn_a.cred = cr;
10582 10590 vn_a.amp = NULL;
10583 10591 vn_a.szc = 0;
10584 10592 vn_a.lgrp_mem_policy_flags = 0;
10585 10593
10586 10594 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10587 10595 as_rangeunlock(as);
10588 10596
10589 10597 done:
10590 10598 nfs_rw_exit(&rp->r_lkserlock);
10591 10599 atomic_add_int(&rp->r_inmap, -1);
10592 10600 return (error);
10593 10601 }
10594 10602
/*
 * We're most likely dealing with a kernel module that likes to READ
 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
 * officially OPEN the file to create the necessary client state
 * for bookkeeping of os_mmap_read/write counts.
 *
 * Since VOP_MAP only passes in a pointer to the vnode rather than
 * a double pointer, we can't handle the case where nfs4open_otw()
 * returns a different vnode than the one passed into VOP_MAP (since
 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
 * we return NULL and let nfs4_map() fail. Note: the only case where
 * this should happen is if the file got removed and replaced with the
 * same name on the server (in addition to the fact that we're trying
 * to VOP_MAP without VOP_OPENing the file in the first place).
 *
 * On success returns 0 with *ospp set to the open stream, with its
 * os_sync_lock held.  Note that find_open_stream() may come up empty,
 * in which case 0 is returned with *ospp == NULL -- callers must check
 * *ospp, not just the return value.
 */
static int
open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
{
	rnode4_t *rp, *drp;
	vnode_t *dvp, *open_vp;
	char file_name[MAXNAMELEN];
	int just_created;
	nfs4_open_stream_t *osp;
	nfs4_open_owner_t *oop;
	int error;

	*ospp = NULL;
	open_vp = map_vp;

	/* Find the parent directory so we can do the OPEN by name. */
	rp = VTOR4(open_vp);
	if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
		return (error);
	drp = VTOR4(dvp);

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
		VN_RELE(dvp);
		return (EINTR);
	}

	if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		return (error);
	}

	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, file_name, open_vp);
		/* This is needed so we don't bump the open ref count */
		just_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_created = 0;
	}

	/*
	 * Extra hold so map_vp stays valid across nfs4open_otw(), which
	 * may replace open_vp; balanced by the VN_RELEs below.
	 */
	VN_HOLD(map_vp);

	error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
	    just_created);
	if (error) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		VN_RELE(map_vp);
		return (error);
	}

	nfs_rw_exit(&drp->r_rwlock);
	VN_RELE(dvp);

	/*
	 * If nfs4open_otw() returned a different vnode then "undo"
	 * the open and return failure to the caller.
	 */
	if (!VN_CMP(open_vp, map_vp)) {
		nfs4_error_t e;

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
		    "open returned a different vnode"));
		/*
		 * If there's an error, ignore it,
		 * and let VOP_INACTIVE handle it.
		 */
		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
		    CLOSE_NORM, 0, 0, 0);
		VN_RELE(map_vp);
		return (EIO);
	}

	VN_RELE(map_vp);

	oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
	if (!oop) {
		nfs4_error_t e;

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
		    "no open owner"));
		/*
		 * If there's an error, ignore it,
		 * and let VOP_INACTIVE handle it.
		 */
		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
		    CLOSE_NORM, 0, 0, 0);
		return (EIO);
	}
	/* returns with 'os_sync_lock' held; may be NULL (see above) */
	osp = find_open_stream(oop, rp);
	open_owner_rele(oop);
	*ospp = osp;
	return (0);
}
10707 10715
/*
 * VOP_ADDMAP: account for a new mapping of [addr, addr + len) by bumping
 * r_mapcnt and the open stream's os_mmap_read/os_mmap_write counters, so
 * that a reopen after a server reboot requests the correct share access.
 *
 * Please be aware that when this function is called, the address space write
 * a_lock is held. Do not put over the wire calls in this function.
 */
/* ARGSUSED */
static int
nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	int error = 0;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * Don't need to update the open stream first, since this
	 * mmap can't add any additional share access that isn't
	 * already contained in the open stream (for the case where we
	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
	 * take into account os_mmap_read[write] counts).
	 */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));

	if (vp->v_type == VREG) {
		/*
		 * We need to retrieve the open stream and update the counts.
		 * If there is no open stream here, something is wrong.
		 */
		nfs4_open_stream_t *osp = NULL;
		nfs4_open_owner_t *oop = NULL;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (oop != NULL) {
			/* returns with 'os_sync_lock' held */
			osp = find_open_stream(oop, rp);
			open_owner_rele(oop);
		}
		if (osp == NULL) {
			NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
			    "nfs4_addmap: we should have an osp"
			    "but we don't, so fail with EIO"));
			error = EIO;
			goto out;
		}

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
		    " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));

		/*
		 * Update the map count in the open stream.
		 * This is necessary in the case where we
		 * open/mmap/close/, then the server reboots, and we
		 * attempt to reopen. If the mmap doesn't add share
		 * access then we send an invalid reopen with
		 * access = NONE.
		 *
		 * We need to specifically check each PROT_* so a mmap
		 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
		 * read and write access. A simple comparison of prot
		 * to ~PROT_WRITE to determine read access is insufficient
		 * since prot can be |= with PROT_USER, etc.
		 */

		/*
		 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
		 */
		if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write += btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read += btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read += btopr(len);
		/*
		 * Ensure that os_mmap_read gets incremented, even if
		 * maxprot were to look like PROT_NONE.
		 */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read += btopr(len);
		osp->os_mapcnt += btopr(len);
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
	}

out:
	/*
	 * If we got an error, then undo our
	 * incrementing of 'r_mapcnt'.
	 */

	if (error) {
		atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
		ASSERT(rp->r_mapcnt >= 0);
	}
	return (error);
}
10812 10820
10813 10821 /* ARGSUSED */
10814 10822 static int
10815 10823 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10816 10824 {
10817 10825
10818 10826 return (VTOR4(vp1) == VTOR4(vp2));
10819 10827 }
10820 10828
/*
 * VOP_FRLOCK: byte-range locking.  Validates the command, l_type and the
 * lock range, then either hands the request to the local locking code
 * (for MI4_LLOCK mounts and non-regular files) or flushes the page cache
 * and sends the request over the wire via nfs4frlock().
 */
/* ARGSUSED */
static int
nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
    caller_context_t *ct)
{
	int rc;
	u_offset_t start, end;
	rnode4_t *rp;
	int error = 0, intr = INTR4(vp);
	nfs4_error_t e;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		/*
		 * Unlocks are made non-interruptible so we don't bail
		 * out halfway and leave an orphan lock on the server.
		 */
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXEND))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock. However, we can't call
			 * nfs4_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * nfs4_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!nfs4_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR4(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!nfs4_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish. For new
	 * locks, this is so that the process gets the latest bits from the
	 * server. For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked. If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set. But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		/* Wait for in-progress page I/O (r_count) to drain. */
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps us out of /proc stops */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv,
				    &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
	    cr, &e, NULL, NULL);
	rc = e.error;

	if (rc == 0)
		nfs4_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);

	return (rc);
}
10962 10970
10963 10971 /*
10964 10972 * Free storage space associated with the specified vnode. The portion
10965 10973 * to be freed is specified by bfp->l_start and bfp->l_len (already
10966 10974 * normalized to a "whence" of 0).
10967 10975 *
10968 10976 * This is an experimental facility whose continued existence is not
10969 10977 * guaranteed. Currently, we only support the special case
10970 10978 * of l_len == 0, meaning free to end of file.
10971 10979 */
10972 10980 /* ARGSUSED */
10973 10981 static int
10974 10982 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10975 10983 offset_t offset, cred_t *cr, caller_context_t *ct)
10976 10984 {
10977 10985 int error;
10978 10986
10979 10987 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10980 10988 return (EIO);
10981 10989 ASSERT(vp->v_type == VREG);
10982 10990 if (cmd != F_FREESP)
10983 10991 return (EINVAL);
10984 10992
10985 10993 error = convoff(vp, bfp, 0, offset);
10986 10994 if (!error) {
10987 10995 ASSERT(bfp->l_start >= 0);
10988 10996 if (bfp->l_len == 0) {
10989 10997 struct vattr va;
10990 10998
10991 10999 va.va_mask = AT_SIZE;
10992 11000 va.va_size = bfp->l_start;
10993 11001 error = nfs4setattr(vp, &va, 0, cr, NULL);
10994 11002 } else
10995 11003 error = EINVAL;
10996 11004 }
10997 11005
10998 11006 return (error);
10999 11007 }
11000 11008
11001 11009 /* ARGSUSED */
11002 11010 int
11003 11011 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11004 11012 {
11005 11013 rnode4_t *rp;
11006 11014 rp = VTOR4(vp);
11007 11015
11008 11016 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11009 11017 vp = RTOV4(rp);
11010 11018 }
11011 11019 *vpp = vp;
11012 11020 return (0);
11013 11021 }
11014 11022
/*
 * Setup and add an address space callback to do the work of the delmap call.
 * The callback will (and must be) deleted in the actual callback function.
 *
 * This is done in order to take care of the problem that we have with holding
 * the address space's a_lock for a long period of time (e.g. if the NFS server
 * is down). Callbacks will be executed in the address space code while the
 * a_lock is not held. Holding the address space's a_lock causes things such
 * as ps and fork to hang because they are trying to acquire this lock as well.
 */
/* ARGSUSED */
static int
nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int caller_found;
	int error;
	rnode4_t *rp;
	nfs4_delmap_args_t *dmapp;
	nfs4_delmapcall_t *delmap_call;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * A process may not change zones if it has NFS pages mmap'ed
	 * in, so we can't legitimately get here from the wrong zone.
	 */
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * The way that the address space of this process deletes its mapping
	 * of this file is via the following call chains:
	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 *
	 * With the use of address space callbacks we are allowed to drop the
	 * address space lock, a_lock, while executing the NFS operations that
	 * need to go over the wire. Returning EAGAIN to the caller of this
	 * function is what drives the execution of the callback that we add
	 * below. The callback will be executed by the address space code
	 * after dropping the a_lock. When the callback is finished, since
	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
	 * is called again on the same segment to finish the rest of the work
	 * that needs to happen during unmapping.
	 *
	 * This action of calling back into the segment driver causes
	 * nfs4_delmap() to get called again, but since the callback was
	 * already executed at this point, it already did the work and there
	 * is nothing left for us to do.
	 *
	 * To Summarize:
	 * - The first time nfs4_delmap is called by the current thread is when
	 * we add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time in this call chain when nfs4_delmap is called we
	 * will find this caller in the delmap caller list and realize there
	 * is no more work to do thus removing this caller from the list and
	 * returning the error that was set in the callback execution.
	 */
	caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
	if (caller_found) {
		/*
		 * 'error' is from the actual delmap operations. To avoid
		 * hangs, we need to handle the return of EAGAIN differently
		 * since this is what drives the callback execution.
		 * In this case, we don't want to return EAGAIN and do the
		 * callback execution because there are none to execute.
		 */
		if (error == EAGAIN)
			return (0);
		else
			return (error);
	}

	/* current caller was not in the list */
	delmap_call = nfs4_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/*
	 * Package the arguments for the callback.  NOTE(review): dmapp is
	 * not freed here; presumably nfs4_delmap_callback releases it —
	 * confirm in the callback implementation.
	 */
	dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);

	dmapp->vp = vp;
	dmapp->off = off;
	dmapp->addr = addr;
	dmapp->len = len;
	dmapp->prot = prot;
	dmapp->maxprot = maxprot;
	dmapp->flags = flags;
	dmapp->cr = cr;
	dmapp->caller = delmap_call;

	error = as_add_callback(as, nfs4_delmap_callback, dmapp,
	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);

	/* EAGAIN tells the VM layer to run the callback and call us again. */
	return (error ? error : EAGAIN);
}
11117 11125
11118 11126 static nfs4_delmapcall_t *
11119 11127 nfs4_init_delmapcall()
11120 11128 {
11121 11129 nfs4_delmapcall_t *delmap_call;
11122 11130
11123 11131 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11124 11132 delmap_call->call_id = curthread;
11125 11133 delmap_call->error = 0;
11126 11134
11127 11135 return (delmap_call);
11128 11136 }
11129 11137
/*
 * Free a delmap caller record previously allocated by
 * nfs4_init_delmapcall().  The caller must already have removed it from
 * the rnode's r_indelmap list.
 */
static void
nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
{
	kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
}
11135 11143
/*
 * Searches for the current delmap caller (based on curthread) in the list of
 * callers.  If it is found, we remove it and free the delmap caller.
 * Returns:
 *	0 if the caller wasn't found
 *	1 if the caller was found, removed and freed.  *errp will be set
 *	  to what the result of the delmap was.
 *
 * Both the r_indelmap list and the R4DELMAPLIST flag are protected by
 * r_statelock; the lock is dropped before freeing the record so the
 * kmem_free() is not done while holding a mutex.
 */
static int
nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
{
	nfs4_delmapcall_t	*delmap_call;

	/*
	 * If the list doesn't exist yet, we create it and return
	 * that the caller wasn't found.  No list = no callers.
	 */
	mutex_enter(&rp->r_statelock);
	if (!(rp->r_flags & R4DELMAPLIST)) {
		/* The list does not exist */
		list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
		    offsetof(nfs4_delmapcall_t, call_node));
		rp->r_flags |= R4DELMAPLIST;
		mutex_exit(&rp->r_statelock);
		return (0);
	} else {
		/* The list exists so search it */
		for (delmap_call = list_head(&rp->r_indelmap);
		    delmap_call != NULL;
		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
			if (delmap_call->call_id == curthread) {
				/* current caller is in the list */
				*errp = delmap_call->error;
				list_remove(&rp->r_indelmap, delmap_call);
				mutex_exit(&rp->r_statelock);
				nfs4_free_delmapcall(delmap_call);
				return (1);
			}
		}
	}
	mutex_exit(&rp->r_statelock);
	return (0);
}
11179 11187
/*
 * Remove some pages from an mmap'd vnode.  Just update the
 * count of pages.  If doing close-to-open, then flush and
 * commit all of the pages associated with this file.
 * Otherwise, start an asynchronous page flush to write out
 * any dirty pages.  This will also associate a credential
 * with the rnode which can be used to write the pages.
 *
 * This is the address space callback registered by nfs4_delmap().  Any
 * error encountered here is recorded in dmapp->caller->error, where the
 * second invocation of nfs4_delmap() (via
 * nfs4_find_and_delete_delmapcall()) will pick it up.  The callback
 * unregisters itself and frees the argument structure before returning,
 * so dmapp must not be referenced after the as_delete_callback() below.
 */
/* ARGSUSED */
static void
nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
{
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;

	rp = VTOR4(dmapp->vp);
	mi = VTOMI4(dmapp->vp);

	/* Drop the mapped-page count for the region being unmapped. */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush and potential commit if there are
	 * pages, the file system was not mounted readonly, the segment
	 * was mapped shared, and the pages themselves were writeable.
	 */
	if (nfs4_has_pages(dmapp->vp) &&
	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);
		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
		    dmapp->len, dmapp->cr);
		if (!e.error) {
			/*
			 * The flush itself succeeded; report any error
			 * previously latched on the rnode and clear it.
			 */
			mutex_enter(&rp->r_statelock);
			e.error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		e.error = 0;

	/* For directio mounts/files, invalidate the cached pages as well. */
	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr, NULL);

	if (e.error) {
		e.stat = puterrno4(e.error);
		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
		    OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
		/* Latch the error for the second nfs4_delmap() pass. */
		dmapp->caller->error = e.error;
	}

	/* Check to see if we need to close the file */

	if (dmapp->vp->v_type == VREG) {
		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);

		if (e.error != 0 || e.stat != NFS4_OK) {
			/*
			 * Since it is possible that e.error == 0 and
			 * e.stat != NFS4_OK (and vice versa),
			 * we do the proper checking in order to get both
			 * e.error and e.stat reporting the correct info.
			 */
			if (e.stat == NFS4_OK)
				e.stat = puterrno4(e.error);
			if (e.error == 0)
				e.error = geterrno4(e.stat);

			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
			dmapp->caller->error = e.error;
		}
	}

	/* Unregister this callback and release the argument structure. */
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
}
11263 11271
11264 11272
11265 11273 static uint_t
11266 11274 fattr4_maxfilesize_to_bits(uint64_t ll)
11267 11275 {
11268 11276 uint_t l = 1;
11269 11277
11270 11278 if (ll == 0) {
11271 11279 return (0);
11272 11280 }
11273 11281
11274 11282 if (ll & 0xffffffff00000000) {
11275 11283 l += 32; ll >>= 32;
11276 11284 }
11277 11285 if (ll & 0xffff0000) {
11278 11286 l += 16; ll >>= 16;
11279 11287 }
11280 11288 if (ll & 0xff00) {
11281 11289 l += 8; ll >>= 8;
11282 11290 }
11283 11291 if (ll & 0xf0) {
11284 11292 l += 4; ll >>= 4;
11285 11293 }
11286 11294 if (ll & 0xc) {
11287 11295 l += 2; ll >>= 2;
11288 11296 }
11289 11297 if (ll & 0x2) {
11290 11298 l += 1;
11291 11299 }
11292 11300 return (l);
11293 11301 }
11294 11302
/*
 * Determine whether the file has any "real" (generic user) extended
 * attributes, as opposed to only transient Solaris sysattr entries.
 * Looks up the xattr directory for vp and, if found, lets
 * do_xattr_exists_check() inspect its contents; *valp is set by that
 * helper.  Returns 0 on success or an errno value from the lookup/check.
 */
static int
nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	vnode_t *avp = NULL;
	int error;

	if ((error = nfs4lookup_xattr(vp, "", &avp,
	    LOOKUP_XATTR, cr)) == 0)
		error = do_xattr_exists_check(avp, valp, cr);
	/* The lookup may return a vnode even on error paths; release it. */
	if (avp)
		VN_RELE(avp);

	return (error);
}
11309 11317
/*
 * VOP_PATHCONF for NFSv4.
 *
 * _PC_PATH_MAX, _PC_SYMLINK_MAX and _PC_ACL_ENABLED are answered locally.
 * The remaining queries are served from the cached pathconf info in the
 * rnode when the attribute cache is valid; otherwise a PATHCONF
 * attribute request is sent over the wire and the result is cached.
 * _PC_XATTR_EXISTS needs extra care (see comment below) because the mere
 * presence of an xattr directory does not prove real user xattrs exist.
 *
 * Returns 0 with *valp set, or an errno (EIO for a cross-zone call,
 * EINVAL for an unsupported cmd, or an OTW failure).
 */
/* ARGSUSED */
int
nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	hrtime_t t;
	rnode4_t *rp;
	nfs4_ga_res_t gar;
	nfs4_ga_ext_res_t ger;

	gar.n4g_ext_res = &ger;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
		*valp = MAXPATHLEN;
		return (0);
	}
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	rp = VTOR4(vp);
	if (cmd == _PC_XATTR_EXISTS) {
		/*
		 * The existence of the xattr directory is not sufficient
		 * for determining whether generic user attributes exists.
		 * The attribute directory could only be a transient directory
		 * used for Solaris sysattr support.  Do a small readdir
		 * to verify if the only entries are sysattrs or not.
		 *
		 * pc4_xattr_valid can be only be trusted when r_xattr_dir
		 * is NULL.  Once the xadir vp exists, we can create xattrs,
		 * and we don't have any way to update the "base" object's
		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
		 * could help out.
		 */
		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
		    rp->r_xattr_dir == NULL) {
			return (nfs4_have_xattrs(vp, valp, cr));
		}
	} else {  /* OLD CODE */
		/* Serve the query from the cached pathconf info if we can. */
		if (ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_pathconf.pc4_cache_valid) {
				error = 0;
				switch (cmd) {
				case _PC_FILESIZEBITS:
					*valp =
					    rp->r_pathconf.pc4_filesizebits;
					break;
				case _PC_LINK_MAX:
					*valp =
					    rp->r_pathconf.pc4_link_max;
					break;
				case _PC_NAME_MAX:
					*valp =
					    rp->r_pathconf.pc4_name_max;
					break;
				case _PC_CHOWN_RESTRICTED:
					*valp =
					    rp->r_pathconf.pc4_chown_restricted;
					break;
				case _PC_NO_TRUNC:
					*valp =
					    rp->r_pathconf.pc4_no_trunc;
					break;
				default:
					error = EINVAL;
					break;
				}
				mutex_exit(&rp->r_statelock);
#ifdef DEBUG
				nfs4_pathconf_cache_hits++;
#endif
				return (error);
			}
			mutex_exit(&rp->r_statelock);
		}
	}
#ifdef DEBUG
	nfs4_pathconf_cache_misses++;
#endif

	/* Cache miss: fetch the pathconf attributes over the wire. */
	t = gethrtime();

	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);

	if (error) {
		/* Invalidate the cached pathconf state on OTW failure. */
		mutex_enter(&rp->r_statelock);
		rp->r_pathconf.pc4_cache_valid = FALSE;
		rp->r_pathconf.pc4_xattr_valid = FALSE;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* interpret the max filesize */
	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
	    fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);

	/* Store the attributes we just received */
	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);

	switch (cmd) {
	case _PC_FILESIZEBITS:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
		break;
	case _PC_LINK_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
		break;
	case _PC_NAME_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
		break;
	case _PC_CHOWN_RESTRICTED:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
		break;
	case _PC_NO_TRUNC:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
		break;
	case _PC_XATTR_EXISTS:
		if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
			if (error = nfs4_have_xattrs(vp, valp, cr))
				return (error);
		}
		break;
	default:
		return (EINVAL);
	}

	return (0);
}
11443 11451
11444 11452 /*
11445 11453 * Called by async thread to do synchronous pageio. Do the i/o, wait
11446 11454 * for it to complete, and cleanup the page list when done.
11447 11455 */
11448 11456 static int
11449 11457 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11450 11458 int flags, cred_t *cr)
11451 11459 {
11452 11460 int error;
11453 11461
11454 11462 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11455 11463
11456 11464 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11457 11465 if (flags & B_READ)
11458 11466 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11459 11467 else
11460 11468 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11461 11469 return (error);
11462 11470 }
11463 11471
/*
 * VOP_PAGEIO for NFSv4.  Performs i/o on the given page list, either
 * asynchronously (handing off to nfs4_sync_pageio() via the async
 * thread machinery) or synchronously in the calling context.
 * r_count is raised around the operation to record an i/o in progress;
 * waiters on r_cv are woken when it is dropped.
 */
/* ARGSUSED */
static int
nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, caller_context_t *ct)
{
	int error;
	rnode4_t *rp;

	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	if (pp == NULL)
		return (EINVAL);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (flags & B_ASYNC) {
		error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_pageio);
	} else
		error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	return (error);
}
11494 11502
/*
 * VOP_DISPOSE for NFSv4.  Free or destroy (per fl) the page pp.  Pages
 * that still need an NFS COMMIT are first committed to the server,
 * batched with any other committable pages on the vnode to keep the
 * number of COMMIT RPCs down.  The page arrives SE_EXCL locked with its
 * io lock not held, and is consumed (freed/destroyed/unlocked) on every
 * path out of this function.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of commiting
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.  This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	/*
	 * Holding R4COMMIT gives this thread exclusive use of the
	 * r_commit structure without further locking.
	 */
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * pageout/fsflush threads (and wrong-zone callers) must not
	 * block on an OTW COMMIT; hand the work to an async thread.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are nolonger
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, needs to
	 * be rewritten so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
11701 11709
/*
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *      PUTFH(file), COMMIT
 *
 * Sends a COMMIT for [offset, offset + count) over the wire, retrying
 * through the standard nfs4 recovery machinery as needed and retrying
 * once with a different OTW credential on EACCES.  On success the
 * returned write verifier is compared against r_writeverf; a mismatch
 * means the server lost previously written data, so the cached pages
 * are re-dirtied (nfs4_set_mod()) and NFS_VERF_MISMATCH is returned.
 * Returns 0 on success or an errno.
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		/* Retry once with a fresh OTW credential on EACCES. */
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* Verifier matches: the commit is good. */
			mutex_exit(&rp->r_statelock);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * Verifier changed: the server may have lost uncommitted
		 * data, so mark the cached pages modified for rewrite.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
11860 11868
/*
 * Mark all of the vnode's cached pages as modified so they will be
 * rewritten to the server (used after a write-verifier mismatch).
 */
static void
nfs4_set_mod(vnode_t *vp)
{
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */
	pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
}
11869 11877
/*
 * This function is used to gather a page list of the pages which
 * can be committed on the server.
 *
 * The calling thread must have set R4COMMIT.  This bit is used to
 * serialize access to the commit structure in the rnode.  As long
 * as the thread has set R4COMMIT, then it can manipulate the commit
 * structure without requiring any other locks.
 *
 * When this function is called from nfs4_dispose() the page passed
 * into nfs4_dispose() will be SE_EXCL locked, and so this function
 * will skip it.  This is not a problem since we initially add the
 * page to the r_commit page list.
 *
 * Committable pages are accumulated on r_commit.c_pages, and
 * c_commbase/c_commlen are widened to cover the full offset range of
 * the pages gathered so far.
 */
static void
nfs4_get_commit(vnode_t *vp)
{
	rnode4_t *rp;
	page_t *pp;
	kmutex_t *vphm;

	rp = VTOR4(vp);

	ASSERT(rp->r_flags & R4COMMIT);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Step through all of the pages associated with this vnode
	 * looking for pages which need to be committed.
	 */
	do {
		/* Skip marker pages. */
		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
			continue;

		/*
		 * First short-cut everything (without the page_lock)
		 * and see if this page does not need to be committed
		 * or is modified if so then we'll just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
			continue;

		/*
		 * Attempt to lock the page.  If we can't, then
		 * someone else is messing with it or we have been
		 * called from nfs4_dispose and this is the page that
		 * nfs4_dispose was called with.. anyway just skip it.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;

		/*
		 * Lets check again now that we have the page lock.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		/* this had better not be a free page */
		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.
		 */
		if (rp->r_commit.c_pages == NULL) {
			/* First page: the range is exactly this page. */
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else if (pp->p_offset < rp->r_commit.c_commbase) {
			/* Page precedes the range: extend downward. */
			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
		    <= pp->p_offset) {
			/* Page follows the range: extend upward. */
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	} while ((pp = pp->p_vpnext) != vp->v_pages);

	mutex_exit(vphm);
}
11972 11980
11973 11981 /*
11974 11982 * This routine is used to gather together a page list of the pages
11975 11983 * which are to be committed on the server. This routine must not
11976 11984 * be called if the calling thread holds any locked pages.
11977 11985 *
11978 11986 * The calling thread must have set R4COMMIT. This bit is used to
11979 11987 * serialize access to the commit structure in the rnode. As long
11980 11988 * as the thread has set R4COMMIT, then it can manipulate the commit
11981 11989 * structure without requiring any other locks.
11982 11990 */
static void
nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{

	rnode4_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;
	ASSERT(len != 0);
	rp = VTOR4(vp);
	/* Caller must hold the commit serialization bit (see block comment) */
	ASSERT(rp->r_flags & R4COMMIT);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;
	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;
	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.  The nowait lookup
		 * simply skips pages someone else has locked.
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;
		/*
		 * If this page does not need to be committed or is
		 * modified, then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);
		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.  The page stays locked SE_EXCL; it is
		 * unlocked later by the commit path (nfs4_sync_commit).
		 *
		 * Since this loop walks offsets in increasing order,
		 * extending c_commlen up to the current page always
		 * covers every page gathered so far.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}
12043 12051
/*
 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
 * Flushes and commits data to the server.
 */
static int
nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	verifier4 write_verf;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed. This may need to
	 * be done twice if the server has changed state since
	 * data was last written. The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen). This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs4_putpage
	 */

	/* Snapshot the write verifier so we can detect a server restart. */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_writeverf;
	mutex_exit(&rp->r_statelock);

	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
	/*
	 * EAGAIN from the async pass is not fatal here; the synchronous
	 * nfs4_putpage() below covers the same range and waits for it.
	 */
	if (error == EAGAIN)
		error = 0;

	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);

	if (error)
		return (error);

	/*
	 * If the write verifier changed while we were flushing, the
	 * server may have lost the data; start over and rewrite it.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_writeverf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If the error, NFS_VERF_MISMATCH, is returned, then
	 * start over with the flush operation.
	 */
	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
12117 12125
/*
 * nfs4_commit_vp() will wait for other pending commits and will either
 * commit the whole file or a range.  plen dictates whether we commit the
 * whole file; a value of zero indicates the whole file.  Called from
 * nfs4_putpage_commit() or nfs4_sync_putapage().
 */
static int
nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
    cred_t *cr, int wait_on_writes)
{
	rnode4_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * before we gather commitable pages make
	 * sure there are no outstanding async writes
	 */
	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Set the `commit inprogress' state bit. We must
	 * first wait until any current one finishes.  While R4COMMIT is
	 * held, this thread owns rp->r_commit and may use it without
	 * further locking.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather all of the pages which need to be
	 * committed.
	 */
	if (plen == 0)
		nfs4_get_commit(vp);
	else
		nfs4_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered by nfs4_get_commit.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist == NULL)
		return (0);

	/*
	 * No error occurred during the flush portion
	 * of this operation, so now attempt to commit
	 * the data to stable storage on the server.
	 *
	 * This will unlock all of the pages on the list.
	 */
	return (nfs4_sync_commit(vp, plist, offset, len, cr));
}
12201 12209
12202 12210 static int
12203 12211 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12204 12212 cred_t *cr)
12205 12213 {
12206 12214 int error;
12207 12215 page_t *pp;
12208 12216
12209 12217 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12210 12218
12211 12219 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12212 12220
12213 12221 /*
12214 12222 * If we got an error, then just unlock all of the pages
12215 12223 * on the list.
12216 12224 */
12217 12225 if (error) {
12218 12226 while (plist != NULL) {
12219 12227 pp = plist;
12220 12228 page_sub(&plist, pp);
12221 12229 page_unlock(pp);
12222 12230 }
12223 12231 return (error);
12224 12232 }
12225 12233 /*
12226 12234 * We've tried as hard as we can to commit the data to stable
12227 12235 * storage on the server. We just unlock the pages and clear
12228 12236 * the commit required state. They will get freed later.
12229 12237 */
12230 12238 while (plist != NULL) {
12231 12239 pp = plist;
12232 12240 page_sub(&plist, pp);
12233 12241 pp->p_fsdata = C_NOCOMMIT;
12234 12242 page_unlock(pp);
12235 12243 }
12236 12244
12237 12245 return (error);
12238 12246 }
12239 12247
/*
 * Async-commit entry point: simply performs a synchronous commit of the
 * page list.  The return value is deliberately ignored because
 * nfs4_sync_commit() unlocks every page on plist regardless of outcome,
 * and there is no caller to report the error to.
 */
static void
do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr)
{

	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
}
12247 12255
/*
 * VOP_SETSECATTR: set the ACL on a file.  The incoming vsecattr may hold
 * either aclent_t (VSA_ACL/VSA_DFACL) or ace_t (VSA_ACE) style entries;
 * both are translated to nfsace4 form and pushed via nfs4setattr().
 * Returns 0 on success, EIO for a cross-zone call, EINVAL for a bad
 * mask, ENOSYS when the server does not support ACLs (no MI4_ACL), or a
 * translation/setattr error.
 */
/*ARGSUSED*/
static int
nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error = 0;
	mntinfo4_t *mi;
	vattr_t va;
	vsecattr_t nfsace4_vsap;

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI4_ACL) {
		/*
		 * if we have a delegation, return it first so the server
		 * sees a consistent view before the ACL is changed
		 */
		if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
			(void) nfs4delegreturn(VTOR4(vp),
			    NFS4_DR_REOPEN|NFS4_DR_PUSH);

		error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
		    NFS4_ACL_SET);
		if (error) /* EINVAL */
			return (error);

		if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
			/*
			 * These are aclent_t type entries.
			 */
			error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
			    vp->v_type == VDIR, FALSE);
			if (error)
				return (error);
		} else {
			/*
			 * These are ace_t type entries.
			 */
			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
			    FALSE);
			if (error)
				return (error);
		}
		/* only the ACL portion of the setattr is populated */
		bzero(&va, sizeof (va));
		error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
		vs_ace4_destroy(&nfsace4_vsap);
		return (error);
	}
	return (ENOSYS);
}
12296 12304
/*
 * VOP_GETSECATTR: fetch the ACL for a file.  Serves the request from the
 * cached ACL when valid, otherwise goes over the wire with a full
 * getattr (which always returns the acl when the server supports it).
 * Falls back to fabricating an ACL from the mode bits (fs_fab_acl) for
 * referral stubs, non-ACL servers, and certain error cases.
 */
/* ARGSUSED */
int
nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	nfs4_ga_res_t gar;
	rnode4_t *rp = VTOR4(vp);

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	bzero(&gar, sizeof (gar));
	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;

	/*
	 * vsecattr->vsa_mask holds the original acl request mask.
	 * This is needed when determining what to return.
	 * (See: nfs4_create_getsecattr_return())
	 */
	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
	if (error) /* EINVAL */
		return (error);

	/*
	 * If this is a referral stub, don't try to go OTW for an ACL
	 */
	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));

	if (mi->mi_flags & MI4_ACL) {
		/*
		 * Check if the data is cached and the cache is valid.  If it
		 * is we don't go over the wire.  r_secattr is re-checked
		 * under r_statelock since the first test was unlocked.
		 */
		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_secattr != NULL) {
				error = nfs4_create_getsecattr_return(
				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
				    rp->r_attr.va_gid,
				    vp->v_type == VDIR);
				if (!error) { /* error == 0 - Success! */
					mutex_exit(&rp->r_statelock);
					return (error);
				}
			}
			mutex_exit(&rp->r_statelock);
		}

		/*
		 * The getattr otw call will always get both the acl, in
		 * the form of a list of nfsace4's, and the number of acl
		 * entries; independent of the value of gar.n4g_vsa.vsa_mask.
		 */
		gar.n4g_va.va_mask = AT_ALL;
		error = nfs4_getattr_otw(vp, &gar, cr, 1);
		if (error) {
			vs_ace4_destroy(&gar.n4g_vsa);
			if (error == ENOTSUP || error == EOPNOTSUPP)
				error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
			/*
			 * No error was returned, but according to the response
			 * bitmap, neither was an acl.
			 */
			vs_ace4_destroy(&gar.n4g_vsa);
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		/*
		 * Update the cache with the ACL.
		 */
		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);

		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
		    vp->v_type == VDIR);
		vs_ace4_destroy(&gar.n4g_vsa);
		/*
		 * If the ace4-to-aclent translation failed (other than
		 * EACCES) for an aclent-style request, fall back to a
		 * fabricated ACL rather than failing the whole call.
		 */
		if ((error) && (vsecattr->vsa_mask &
		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
		    (error != EACCES)) {
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
		}
		return (error);
	}
	error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
	return (error);
}
12392 12400
12393 12401 /*
12394 12402 * The function returns:
12395 12403 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12396 12404 * - EINVAL if the passed in "acl_mask" is an invalid request.
12397 12405 *
12398 12406 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12399 12407 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12400 12408 *
12401 12409 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12402 12410 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12403 12411 * - We have a count field set without the corresponding acl field set. (e.g. -
12404 12412 * VSA_ACECNT is set, but VSA_ACE is not)
12405 12413 */
12406 12414 static int
12407 12415 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12408 12416 {
12409 12417 /* Shortcut the masks that are always valid. */
12410 12418 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12411 12419 return (0);
12412 12420 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12413 12421 return (0);
12414 12422
12415 12423 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12416 12424 /*
12417 12425 * We can't have any VSA_ACL type stuff in the mask now.
12418 12426 */
12419 12427 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12420 12428 VSA_DFACLCNT))
12421 12429 return (EINVAL);
12422 12430
12423 12431 if (op == NFS4_ACL_SET) {
12424 12432 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12425 12433 return (EINVAL);
12426 12434 }
12427 12435 }
12428 12436
12429 12437 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12430 12438 /*
12431 12439 * We can't have any VSA_ACE type stuff in the mask now.
12432 12440 */
12433 12441 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12434 12442 return (EINVAL);
12435 12443
12436 12444 if (op == NFS4_ACL_SET) {
12437 12445 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12438 12446 return (EINVAL);
12439 12447
12440 12448 if ((acl_mask & VSA_DFACLCNT) &&
12441 12449 !(acl_mask & VSA_DFACL))
12442 12450 return (EINVAL);
12443 12451 }
12444 12452 }
12445 12453 return (0);
12446 12454 }
12447 12455
12448 12456 /*
12449 12457 * The theory behind creating the correct getsecattr return is simply this:
12450 12458 * "Don't return anything that the caller is not expecting to have to free."
12451 12459 */
12452 12460 static int
12453 12461 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12454 12462 uid_t uid, gid_t gid, int isdir)
12455 12463 {
12456 12464 int error = 0;
12457 12465 /* Save the mask since the translators modify it. */
12458 12466 uint_t orig_mask = vsap->vsa_mask;
12459 12467
12460 12468 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12461 12469 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12462 12470
12463 12471 if (error)
12464 12472 return (error);
12465 12473
12466 12474 /*
12467 12475 * If the caller only asked for the ace count (VSA_ACECNT)
12468 12476 * don't give them the full acl (VSA_ACE), free it.
12469 12477 */
12470 12478 if (!orig_mask & VSA_ACE) {
12471 12479 if (vsap->vsa_aclentp != NULL) {
12472 12480 kmem_free(vsap->vsa_aclentp,
12473 12481 vsap->vsa_aclcnt * sizeof (ace_t));
12474 12482 vsap->vsa_aclentp = NULL;
12475 12483 }
12476 12484 }
12477 12485 vsap->vsa_mask = orig_mask;
12478 12486
12479 12487 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12480 12488 VSA_DFACLCNT)) {
12481 12489 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12482 12490 isdir, FALSE);
12483 12491
12484 12492 if (error)
12485 12493 return (error);
12486 12494
12487 12495 /*
12488 12496 * If the caller only asked for the acl count (VSA_ACLCNT)
12489 12497 * and/or the default acl count (VSA_DFACLCNT) don't give them
12490 12498 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12491 12499 */
12492 12500 if (!orig_mask & VSA_ACL) {
12493 12501 if (vsap->vsa_aclentp != NULL) {
12494 12502 kmem_free(vsap->vsa_aclentp,
12495 12503 vsap->vsa_aclcnt * sizeof (aclent_t));
12496 12504 vsap->vsa_aclentp = NULL;
12497 12505 }
12498 12506 }
12499 12507
12500 12508 if (!orig_mask & VSA_DFACL) {
12501 12509 if (vsap->vsa_dfaclentp != NULL) {
12502 12510 kmem_free(vsap->vsa_dfaclentp,
12503 12511 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12504 12512 vsap->vsa_dfaclentp = NULL;
12505 12513 }
12506 12514 }
12507 12515 vsap->vsa_mask = orig_mask;
12508 12516 }
12509 12517 return (0);
12510 12518 }
12511 12519
/*
 * VOP_SHRLOCK: share reservation support.  Remote F_SHARE/F_UNSHARE are
 * not implemented (EAGAIN, see RFE 4823948 below); mounts with local
 * locking (MI4_LLOCK) are handed to the generic fs_shrlock() code.
 */
/* ARGSUSED */
int
nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 *
	 * NOTE(review): `cmd & F_SHARE' is a bitwise test, not an
	 * equality test, so this check can also fire for other cmd
	 * values sharing bits with F_SHARE — confirm this is intended.
	 */
	if ((cmd & F_SHARE) &&
	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		/*
		 * This will be properly implemented later,
		 * see RFE: 4823948 .
		 */
		error = EAGAIN;
		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
12567 12575
12568 12576 /*
12569 12577 * Common code called by directory ops to update the attrcache
12570 12578 */
12571 12579 static int
12572 12580 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12573 12581 hrtime_t t, vnode_t *vp, cred_t *cr)
12574 12582 {
12575 12583 int error = 0;
12576 12584
12577 12585 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12578 12586
12579 12587 if (status != NFS4_OK) {
12580 12588 /* getattr not done or failed */
12581 12589 PURGE_ATTRCACHE4(vp);
12582 12590 return (error);
12583 12591 }
12584 12592
12585 12593 if (garp) {
12586 12594 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12587 12595 } else {
12588 12596 PURGE_ATTRCACHE4(vp);
12589 12597 }
12590 12598 return (error);
12591 12599 }
12592 12600
12593 12601 /*
12594 12602 * Update directory caches for directory modification ops (link, rename, etc.)
12595 12603 * When dinfo is NULL, manage dircaches in the old way.
12596 12604 */
static void
nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
    dirattr_info_t *dinfo)
{
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/* Purge rddir cache for dir since it changed */
	if (drp->r_dir != NULL)
		nfs4_purge_rddir_cache(dvp);

	/*
	 * If caller provided dinfo, then use it to manage dir caches:
	 * update the DNLC for the new entry and refresh the directory's
	 * attribute cache from the post-op attributes.
	 */
	if (dinfo != NULL) {
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag is
				 * set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug,
				    (CE_NOTE, "nfs4_update_dircaches: "
				    "don't update dnlc: created_v4 flag"));
			}
		}

		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
		    dinfo->di_cred, FALSE, cinfo);

		return;
	}

	/*
	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
	 * Since caller modified dir but didn't receive post-dirmod-op dir
	 * attrs, the dir's attrs must be purged.
	 *
	 * XXX this check and dnlc update/purge should really be atomic,
	 * XXX but can't use rnode statelock because it'll deadlock in
	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
	 * XXX does occur.
	 *
	 * XXX We also may want to check that atomic is true in the
	 * XXX change_info struct. If it is not, the change_info may
	 * XXX reflect changes by more than one clients which means that
	 * XXX our cache may not be valid.
	 */
	PURGE_ATTRCACHE4(dvp);
	if (drp->r_change == cinfo->before) {
		/* no changes took place in the directory prior to our link */
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag
				 * is set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
				    "nfs4_update_dircaches: don't"
				    " update dnlc: created_v4 flag"));
			}
		}
	} else {
		/* Another client modified directory - purge its dnlc cache */
		dnlc_purge_vp(dvp);
	}
}
12675 12683
12676 12684 /*
12677 12685 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12678 12686 * file.
12679 12687 *
12680 12688 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12681 12689 * file (ie: client recovery) and otherwise set to FALSE.
12682 12690 *
12683 12691 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12684 12692 * initiated) calling functions.
12685 12693 *
12686 12694 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12687 12695 * of resending a 'lost' open request.
12688 12696 *
12689 12697 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12690 12698 * server that hands out BAD_SEQID on open confirm.
12691 12699 *
12692 12700 * Errors are returned via the nfs4_error_t parameter.
12693 12701 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	/* caller must hold the open-owner's seqid (see oo_seqid_inuse) */
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	/* Compound: PUTFH + OPEN_CONFIRM */
	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* advance the open seqid in the caller's copy as well */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/*
				 * num_bseqid_retryp bounds the number of
				 * BAD_SEQID retries (see header comment).
				 */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		/*
		 * For a timeout or server-resource failure (and no abort
		 * or resend), wait briefly and reissue the confirm.
		 */
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			if (!ep->error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Success: hand the confirmed open stateid back to the caller. */
	resop = &res.array[1];	/* open confirm res */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
12801 12809
12802 12810 /*
12803 12811 * Return the credentials associated with a client state object. The
12804 12812 * caller is responsible for freeing the credentials.
12805 12813 */
12806 12814
static cred_t *
state_to_cred(nfs4_open_stream_t *osp)
{
	cred_t *cr;

	/*
	 * It's ok to not lock the open stream and open owner to get
	 * the oo_cred since this is only written once (upon creation)
	 * and will not change.
	 */
	cr = osp->os_open_owner->oo_cred;
	/* take a hold for the caller, who must release it (see header) */
	crhold(cr);

	return (cr);
}
12822 12830
12823 12831 /*
12824 12832 * nfs4_find_sysid
12825 12833 *
12826 12834 * Find the sysid for the knetconfig associated with the given mi.
12827 12835 */
static struct lm_sysid *
nfs4_find_sysid(mntinfo4_t *mi)
{
	ASSERT(nfs_zone() == mi->mi_zone);

	/*
	 * Switch from RDMA knconf to original mount knconf: the lock
	 * manager identity is keyed on the original transport, the
	 * current server address and its hostname.
	 */
	return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
	    mi->mi_curr_serv->sv_hostname, NULL));
}
12839 12847
#ifdef DEBUG
/*
 * Return a string version of the call type for easy reading.
 */
static char *
nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
{
	char *name;

	switch (ctype) {
	case NFS4_LCK_CTYPE_NORM:
		name = "NORMAL";
		break;
	case NFS4_LCK_CTYPE_RECLAIM:
		name = "RECLAIM";
		break;
	case NFS4_LCK_CTYPE_RESEND:
		name = "RESEND";
		break;
	case NFS4_LCK_CTYPE_REINSTATE:
		name = "REINSTATE";
		break;
	default:
		cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
		    "type %d", ctype);
		name = "";
		break;
	}

	return (name);
}
#endif
12863 12871
12864 12872 /*
12865 12873 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12866 12874 * Unlock requests don't have an over-the-wire locktype, so we just return
12867 12875 * something non-threatening.
12868 12876 */
12869 12877
12870 12878 static nfs_lock_type4
12871 12879 flk_to_locktype(int cmd, int l_type)
12872 12880 {
12873 12881 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12874 12882
12875 12883 switch (l_type) {
12876 12884 case F_UNLCK:
12877 12885 return (READ_LT);
12878 12886 case F_RDLCK:
12879 12887 if (cmd == F_SETLK)
12880 12888 return (READ_LT);
12881 12889 else
12882 12890 return (READW_LT);
12883 12891 case F_WRLCK:
12884 12892 if (cmd == F_SETLK)
12885 12893 return (WRITE_LT);
12886 12894 else
12887 12895 return (WRITEW_LT);
12888 12896 }
12889 12897 panic("flk_to_locktype");
12890 12898 /*NOTREACHED*/
12891 12899 }
12892 12900
12893 12901 /*
12894 12902 * Do some preliminary checks for nfs4frlock.
12895 12903 */
12896 12904 static int
12897 12905 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12898 12906 u_offset_t offset)
12899 12907 {
12900 12908 int error = 0;
12901 12909
12902 12910 /*
12903 12911 * If we are setting a lock, check that the file is opened
12904 12912 * with the correct mode.
12905 12913 */
12906 12914 if (cmd == F_SETLK || cmd == F_SETLKW) {
12907 12915 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12908 12916 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12909 12917 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12910 12918 "nfs4frlock_validate_args: file was opened with "
12911 12919 "incorrect mode"));
12912 12920 return (EBADF);
12913 12921 }
12914 12922 }
12915 12923
12916 12924 /* Convert the offset. It may need to be restored before returning. */
12917 12925 if (error = convoff(vp, flk, 0, offset)) {
12918 12926 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12919 12927 "nfs4frlock_validate_args: convoff => error= %d\n",
12920 12928 error));
12921 12929 return (error);
12922 12930 }
12923 12931
12924 12932 return (error);
12925 12933 }
12926 12934
12927 12935 /*
12928 12936 * Set the flock64's lm_sysid for nfs4frlock.
12929 12937 */
12930 12938 static int
12931 12939 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12932 12940 {
12933 12941 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12934 12942
12935 12943 /* Find the lm_sysid */
12936 12944 *lspp = nfs4_find_sysid(VTOMI4(vp));
12937 12945
12938 12946 if (*lspp == NULL) {
12939 12947 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 12948 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12941 12949 return (ENOLCK);
12942 12950 }
12943 12951
12944 12952 flk->l_sysid = lm_sysidt(*lspp);
12945 12953
12946 12954 return (0);
12947 12955 }
12948 12956
12949 12957 /*
12950 12958 * Do the remaining preliminary setup for nfs4frlock.
12951 12959 */
12952 12960 static void
12953 12961 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12954 12962 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12955 12963 cred_t **cred_otw)
12956 12964 {
12957 12965 /*
12958 12966 * set tick_delay to the base delay time.
12959 12967 * (NFS4_BASE_WAIT_TIME is in secs)
12960 12968 */
12961 12969
12962 12970 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12963 12971
12964 12972 /*
12965 12973 * If lock is relative to EOF, we need the newest length of the
12966 12974 * file. Therefore invalidate the ATTR_CACHE.
12967 12975 */
12968 12976
12969 12977 *whencep = flk->l_whence;
12970 12978
12971 12979 if (*whencep == 2) /* SEEK_END */
12972 12980 PURGE_ATTRCACHE4(vp);
12973 12981
12974 12982 recov_statep->rs_flags = 0;
12975 12983 recov_statep->rs_num_retry_despite_err = 0;
12976 12984 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12977 12985 }
12978 12986
12979 12987 /*
12980 12988 * Initialize and allocate the data structures necessary for
12981 12989 * the nfs4frlock call.
12982 12990 * Allocates argsp's op array, frees up the saved_rqstpp if there is one.
12983 12991 */
12984 12992 static void
12985 12993 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
12986 12994 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
12987 12995 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
12988 12996 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
12989 12997 {
12990 12998 int argoplist_size;
12991 12999 int num_ops = 2;
12992 13000
12993 13001 *retry = FALSE;
12994 13002 *did_start_fop = FALSE;
12995 13003 *skip_get_err = FALSE;
12996 13004 lost_rqstp->lr_op = 0;
12997 13005 argoplist_size = num_ops * sizeof (nfs_argop4);
12998 13006 /* fill array with zero */
12999 13007 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13000 13008
13001 13009 *argspp = argsp;
13002 13010 *respp = NULL;
13003 13011
13004 13012 argsp->array_len = num_ops;
13005 13013 argsp->array = *argopp;
13006 13014
13007 13015 /* initialize in case of error; will get real value down below */
13008 13016 argsp->ctag = TAG_NONE;
13009 13017
13010 13018 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13011 13019 *op_hintp = OH_LOCKU;
13012 13020 else
13013 13021 *op_hintp = OH_OTHER;
13014 13022 }
13015 13023
13016 13024 /*
13017 13025 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign
13018 13026 * the proper nfs4_server_t for this instance of nfs4frlock.
13019 13027 * Returns 0 (success) or an errno value.
13020 13028 */
13021 13029 static int
13022 13030 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13023 13031 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13024 13032 bool_t *did_start_fop, bool_t *startrecovp)
13025 13033 {
13026 13034 int error = 0;
13027 13035 rnode4_t *rp;
13028 13036
13029 13037 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13030 13038
13031 13039 if (ctype == NFS4_LCK_CTYPE_NORM) {
13032 13040 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13033 13041 recov_statep, startrecovp);
13034 13042 if (error)
13035 13043 return (error);
13036 13044 *did_start_fop = TRUE;
13037 13045 } else {
13038 13046 *did_start_fop = FALSE;
13039 13047 *startrecovp = FALSE;
13040 13048 }
13041 13049
13042 13050 if (!error) {
13043 13051 rp = VTOR4(vp);
13044 13052
13045 13053 /* If the file failed recovery, just quit. */
13046 13054 mutex_enter(&rp->r_statelock);
13047 13055 if (rp->r_flags & R4RECOVERR) {
13048 13056 error = EIO;
13049 13057 }
13050 13058 mutex_exit(&rp->r_statelock);
13051 13059 }
13052 13060
13053 13061 return (error);
13054 13062 }
13055 13063
/*
 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
 * resend nfs4frlock call is initiated by the recovery framework.
 * Acquires the lop and oop seqid synchronization (holds are placed on
 * the open owner, lock owner, and open stream; the caller is
 * responsible for releasing them and ending the seqid syncs).
 */
static void
nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
    COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
{
	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
	int error;

	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
	ASSERT(resend_rqstp != NULL);
	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
	    resend_rqstp->lr_op == OP_LOCKU);

	*oopp = resend_rqstp->lr_oop;
	if (resend_rqstp->lr_oop) {
		open_owner_hold(resend_rqstp->lr_oop);
		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
		ASSERT(error == 0);	/* recov thread always succeeds */
	}

	/* Must resend this lost lock/locku request. */
	ASSERT(resend_rqstp->lr_lop != NULL);
	*lopp = resend_rqstp->lr_lop;
	lock_owner_hold(resend_rqstp->lr_lop);
	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
	ASSERT(error == 0);	/* recov thread always succeeds */

	*ospp = resend_rqstp->lr_osp;
	if (*ospp)
		open_stream_hold(resend_rqstp->lr_osp);

	if (resend_rqstp->lr_op == OP_LOCK) {
		LOCK4args *lock_args;

		argop->argop = OP_LOCK;
		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
		lock_args->locktype = resend_rqstp->lr_locktype;
		lock_args->reclaim =
		    (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
		lock_args->offset = resend_rqstp->lr_flk->l_start;
		lock_args->length = resend_rqstp->lr_flk->l_len;
		/*
		 * An l_len of 0 means lock-to-EOF; over the wire that is
		 * expressed as an all-ones (maximum) length.
		 */
		if (lock_args->length == 0)
			lock_args->length = ~lock_args->length;
		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
		    mi2clientid(mi), &lock_args->locker);

		/* Tag the compound by the kind of resend being performed. */
		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCK_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCK_REINSTATE;
			break;
		case NFS4_LCK_CTYPE_RECLAIM:
			argsp->ctag = TAG_LOCK_RECLAIM;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	} else {
		LOCKU4args *locku_args;
		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;

		argop->argop = OP_LOCKU;
		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
		/* LOCKU has no meaningful locktype; READ_LT is a valid one */
		locku_args->locktype = READ_LT;
		locku_args->seqid = lop->lock_seqid + 1;
		/* lock_stateid is protected by lo_lock */
		mutex_enter(&lop->lo_lock);
		locku_args->lock_stateid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		locku_args->offset = resend_rqstp->lr_flk->l_start;
		locku_args->length = resend_rqstp->lr_flk->l_len;
		/* l_len of 0 means to-EOF: all-ones length over the wire */
		if (locku_args->length == 0)
			locku_args->length = ~locku_args->length;

		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCKU_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCKU_REINSTATE;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	}
}
13153 13161
13154 13162 /*
13155 13163 * Setup the LOCKT4 arguments.
13156 13164 */
13157 13165 static void
13158 13166 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13159 13167 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13160 13168 rnode4_t *rp)
13161 13169 {
13162 13170 LOCKT4args *lockt_args;
13163 13171
13164 13172 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13165 13173 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13166 13174 argop->argop = OP_LOCKT;
13167 13175 argsp->ctag = TAG_LOCKT;
13168 13176 lockt_args = &argop->nfs_argop4_u.oplockt;
13169 13177
13170 13178 /*
13171 13179 * The locktype will be READ_LT unless it's
13172 13180 * a write lock. We do this because the Solaris
13173 13181 * system call allows the combination of
13174 13182 * F_UNLCK and F_GETLK* and so in that case the
13175 13183 * unlock is mapped to a read.
13176 13184 */
13177 13185 if (flk->l_type == F_WRLCK)
13178 13186 lockt_args->locktype = WRITE_LT;
13179 13187 else
13180 13188 lockt_args->locktype = READ_LT;
13181 13189
13182 13190 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13183 13191 /* set the lock owner4 args */
13184 13192 nfs4_setlockowner_args(&lockt_args->owner, rp,
13185 13193 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13186 13194 flk->l_pid);
13187 13195 lockt_args->offset = flk->l_start;
13188 13196 lockt_args->length = flk->l_len;
13189 13197 if (flk->l_len == 0)
13190 13198 lockt_args->length = ~lockt_args->length;
13191 13199
13192 13200 *lockt_argsp = lockt_args;
13193 13201 }
13194 13202
/*
 * If the client is holding a delegation, and the open stream to be used
 * with this lock request is a delegation open stream, then re-open the stream.
 * Sets the nfs4_error_t to all zeros unless the open stream has already
 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
 * means the caller should retry (like a recovery retry).
 *
 * 'lt' is the flock lock type (F_RDLCK/F_WRLCK/F_UNLCK) being requested;
 * it determines which share-access mode a prior CLAIM_DELEGATE_CUR open
 * must have granted for no reopen to be needed.
 */
static void
nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
{
	open_delegation_type4 dt;
	bool_t reopen_needed, force;
	nfs4_open_stream_t *osp;
	open_claim_type4 oclaim;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* r_deleg_type is protected by r_statev4_lock */
	mutex_enter(&rp->r_statev4_lock);
	dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	if (dt != OPEN_DELEGATE_NONE) {
		nfs4_open_owner_t *oop;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (!oop) {
			ep->stat = NFS4ERR_IO;
			return;
		}
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/* A previously failed reopen makes this stream unusable. */
		if (osp->os_failed_reopen) {
			NFS4_DEBUG((nfs4_open_stream_debug ||
			    nfs4_client_lock_debug), (CE_NOTE,
			    "nfs4frlock_check_deleg: os_failed_reopen set "
			    "for osp %p, cr %p, rp %s", (void *)osp,
			    (void *)cr, rnode4info(rp)));
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/*
		 * Determine whether a reopen is needed. If this
		 * is a delegation open stream, then send the open
		 * to the server to give visibility to the open owner.
		 * Even if it isn't a delegation open stream, we need
		 * to check if the previous open CLAIM_DELEGATE_CUR
		 * was sufficient.
		 */

		reopen_needed = osp->os_delegation ||
		    ((lt == F_RDLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
		    (lt == F_WRLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));

		mutex_exit(&osp->os_sync_lock);
		open_owner_rele(oop);

		if (reopen_needed) {
			/*
			 * Always use CLAIM_PREVIOUS after server reboot.
			 * The server will reject CLAIM_DELEGATE_CUR if
			 * it is used during the grace period.
			 */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
				oclaim = CLAIM_PREVIOUS;
				force = TRUE;
			} else {
				oclaim = CLAIM_DELEGATE_CUR;
				force = FALSE;
			}
			mutex_exit(&mi->mi_lock);

			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
			/* Map EAGAIN from the reopen to a retryable DELAY. */
			if (ep->error == EAGAIN) {
				nfs4_error_zinit(ep);
				ep->stat = NFS4ERR_DELAY;
			}
		}
		/* drop the hold acquired by find_open_stream() */
		open_stream_rele(osp, rp);
		osp = NULL;
	}
}
13293 13301
/*
 * Setup the LOCKU4 arguments.
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
 *			over-the-wire.  The caller must release the
 *			reference on *lopp.
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error.
 *
 * On the no-lock-owner (or special-stateid) path the unlock is only
 * registered with the local locking code, *skip_get_err is set, and no
 * over-the-wire call is made.  Otherwise the lock seqid sync is started
 * and held across the RPC (released by the caller).
 */
static void
nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKU4args **locku_argsp, flock64_t *flk,
    nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
    vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
    bool_t *skip_get_err, bool_t *go_otwp)
{
	nfs4_lock_owner_t *lop = NULL;
	LOCKU4args *locku_args;
	pid_t pid;
	bool_t is_spec = FALSE;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

	/* May require a reopen of a delegation open stream first. */
	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
	if (ep->error || ep->stat)
		return;

	argop->argop = OP_LOCKU;
	/* NOTE(review): dead branch given the ASSERT above; kept as-is */
	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
		argsp->ctag = TAG_LOCKU_REINSTATE;
	else
		argsp->ctag = TAG_LOCKU;
	locku_args = &argop->nfs_argop4_u.oplocku;
	*locku_argsp = locku_args;

	/*
	 * XXX what should locku_args->locktype be?
	 * setting to ALWAYS be READ_LT so at least
	 * it is a valid locktype.
	 */

	locku_args->locktype = READ_LT;

	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid;

	/*
	 * Get the lock owner stateid.  If no lock owner
	 * exists, return success.
	 */
	lop = find_lock_owner(rp, pid, LOWN_ANY);
	*lopp = lop;
	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
		is_spec = TRUE;
	if (!lop || is_spec) {
		/*
		 * No lock owner so no locks to unlock.
		 * Return success.  If there was a failed
		 * reclaim earlier, the lock might still be
		 * registered with the local locking code,
		 * so notify it of the unlock.
		 *
		 * If the lockowner is using a special stateid,
		 * then the original lock request (that created
		 * this lockowner) was never successful, so we
		 * have no lock to undo OTW.
		 */
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
		    "(%ld) so return success", (long)pid));

		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		/*
		 * Release our hold and NULL out so final_cleanup
		 * doesn't try to end a lock seqid sync we
		 * never started.
		 */
		if (is_spec) {
			lock_owner_rele(lop);
			*lopp = NULL;
		}
		*skip_get_err = TRUE;
		*go_otwp = FALSE;
		return;
	}

	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
	if (ep->error == EAGAIN) {
		lock_owner_rele(lop);
		*lopp = NULL;
		return;
	}

	/* lock_stateid is protected by lo_lock */
	mutex_enter(&lop->lo_lock);
	locku_args->lock_stateid = lop->lock_stateid;
	mutex_exit(&lop->lo_lock);
	locku_args->seqid = lop->lock_seqid + 1;

	/* leave the ref count on lop, rele after RPC call */

	locku_args->offset = flk->l_start;
	locku_args->length = flk->l_len;
	/* l_len of 0 means to-EOF: all-ones length over the wire */
	if (flk->l_len == 0)
		locku_args->length = ~locku_args->length;

	*go_otwp = TRUE;
}
13405 13413
/*
 * Setup the LOCK4 arguments.
 *
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error
 *
 * On success *oopp/*ospp/*lopp are set (with holds and seqid syncs
 * acquired by nfs4_find_or_create_lock_owner); the caller is
 * responsible for releasing them.
 */
static void
nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
    flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
{
	LOCK4args *lock_args;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	pid_t pid;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* May require a reopen of a delegation open stream first. */
	nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
	if (ep->error || ep->stat != NFS4_OK)
		return;

	argop->argop = OP_LOCK;
	/* Tag the compound by call type for observability. */
	if (ctype == NFS4_LCK_CTYPE_NORM)
		argsp->ctag = TAG_LOCK;
	else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
		argsp->ctag = TAG_RELOCK;
	else
		argsp->ctag = TAG_LOCK_REINSTATE;
	lock_args = &argop->nfs_argop4_u.oplock;
	lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
	lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
	/*
	 * Get the lock owner. If no lock owner exists,
	 * create a 'temporary' one and grab the open seqid
	 * synchronization (which puts a hold on the open
	 * owner and open stream).
	 * This also grabs the lock seqid synchronization.
	 */
	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
	ep->stat =
	    nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);

	if (ep->stat != NFS4_OK)
		goto out;

	nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
	    &lock_args->locker);

	lock_args->offset = flk->l_start;
	lock_args->length = flk->l_len;
	/* l_len of 0 means to-EOF: all-ones length over the wire */
	if (flk->l_len == 0)
		lock_args->length = ~lock_args->length;
	*lock_argsp = lock_args;
out:
	/* hand back whatever was obtained, even on error, for cleanup */
	*oopp = oop;
	*ospp = osp;
	*lopp = lop;
}
13470 13478
/*
 * After we get the reply from the server, record the proper information
 * for possible resend lock requests.
 *
 * Allocates memory for the saved_rqstp if we have a lost lock to save.
 *
 * 'error' is the RPC error from the attempted LOCK/LOCKU; only
 * ETIMEDOUT, EINTR, and forced-unmount EIO mark the request as "lost"
 * (and thus eligible for resend by the recovery framework).
 */
static void
nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
    nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
    nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
{
	bool_t unlock = (flk->l_type == F_UNLCK);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE);

	/* A failed lock (not unlock) leaves a pending request on the lop. */
	if (error != 0 && !unlock) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
		    " for lop %p", (void *)lop));
		ASSERT(lop != NULL);
		mutex_enter(&lop->lo_lock);
		lop->lo_pending_rqsts = 1;
		mutex_exit(&lop->lo_lock);
	}

	/* lr_op == 0 means "nothing lost" to the caller/recovery code. */
	lost_rqstp->lr_putfirst = FALSE;
	lost_rqstp->lr_op = 0;

	/*
	 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
	 * recovery purposes so that the lock request that was sent
	 * can be saved and re-issued later. Ditto for EIO from a forced
	 * unmount. This is done to have the client's local locking state
	 * match the v4 server's state; that is, the request was
	 * potentially received and accepted by the server but the client
	 * thinks it was not.
	 */
	if (error == ETIMEDOUT || error == EINTR ||
	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: got a lost %s lock for "
		    "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
		    (void *)lop, (void *)oop, (void *)osp));
		if (unlock)
			lost_rqstp->lr_op = OP_LOCKU;
		else {
			lost_rqstp->lr_op = OP_LOCK;
			lost_rqstp->lr_locktype = locktype;
		}
		/*
		 * Objects are held and rele'd via the recovery code.
		 * See nfs4_save_lost_rqst.
		 */
		lost_rqstp->lr_vp = vp;
		lost_rqstp->lr_dvp = NULL;
		lost_rqstp->lr_oop = oop;
		lost_rqstp->lr_osp = osp;
		lost_rqstp->lr_lop = lop;
		lost_rqstp->lr_cr = cr;
		switch (ctype) {
		case NFS4_LCK_CTYPE_NORM:
			flk->l_pid = ttoproc(curthread)->p_pid;
			lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			/* reinstate requests go to the front of the queue */
			lost_rqstp->lr_putfirst = TRUE;
			lost_rqstp->lr_ctype = ctype;
			break;
		default:
			break;
		}
		lost_rqstp->lr_flk = flk;
	}
}
13550 13558
13551 13559 /*
13552 13560 * Update lop's seqid. Also update the seqid stored in a resend request,
13553 13561 * if any. (Some recovery errors increment the seqid, and we may have to
13554 13562 * send the resend request again.)
13555 13563 */
13556 13564
13557 13565 static void
13558 13566 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13559 13567 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13560 13568 {
13561 13569 if (lock_args) {
13562 13570 if (lock_args->locker.new_lock_owner == TRUE)
13563 13571 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13564 13572 else {
13565 13573 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13566 13574 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13567 13575 }
13568 13576 } else if (locku_args) {
13569 13577 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13570 13578 nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13571 13579 }
13572 13580 }
13573 13581
/*
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 * Switches the *cred_otwp to base_cr (taking a new hold on it).
 *
 * NOTE(review): judging by the name and the credential swap, this
 * appears to be the teardown path used when retrying the lock request
 * with a different credential — confirm against the nfs4frlock caller.
 */
static void
nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
    nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
    nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;
	nfs_argop4 *argop = (*argspp)->array;

	if (*did_start_fop) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}
	/* the compound is always PUTFH + one lock op (see call_init) */
	ASSERT((*argspp)->array_len == 2);
	if (argop[1].argop == OP_LOCK)
		nfs4args_lock_free(&argop[1]);
	else if (argop[1].argop == OP_LOCKT)
		nfs4args_lockt_free(&argop[1]);
	kmem_free(argop, 2 * sizeof (nfs_argop4));
	/* results were only decoded if the RPC itself succeeded */
	if (!error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
	*argspp = NULL;
	*respp = NULL;

	if (lop) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
		*lopp = NULL;
	}

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, VTOR4(vp));
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	/* retry will go over the wire with base_cr */
	crfree(*cred_otwp);
	*cred_otwp = base_cr;
	crhold(*cred_otwp);
}
13630 13638
/*
 * Function to process the client's recovery for nfs4frlock.
 * Returns TRUE if we should retry the lock request; FALSE otherwise.
 *
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 *
 * Note: the rp's r_lkserlock is *not* dropped during this path.
 */
static bool_t
nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
    LOCK4args *lock_args, LOCKU4args *locku_args,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
    nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
    bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;

	bool_t abort, retry;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT((*argspp) != NULL);
	ASSERT((*respp) != NULL);
	if (lock_args || locku_args)
		ASSERT(lop != NULL);

	NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
	    (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));

	retry = TRUE;
	abort = FALSE;
	if (needrecov) {
		nfs4_bseqid_entry_t *bsep = NULL;
		nfs_opnum4 op;

		/* which lock op was being attempted */
		op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;

		/*
		 * For a BAD_SEQID error, hand the recovery framework the
		 * seqid that the server rejected so it can resynchronize.
		 */
		if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
			seqid4 seqid;

			if (lock_args) {
				if (lock_args->locker.new_lock_owner == TRUE)
					seqid = lock_args->locker.locker4_u.
					    open_owner.open_seqid;
				else
					seqid = lock_args->locker.locker4_u.
					    lock_owner.lock_seqid;
			} else if (locku_args) {
				seqid = locku_args->seqid;
			} else {
				/* LOCKT carries no seqid */
				seqid = 0;
			}

			bsep = nfs4_create_bseqid_entry(oop, lop, vp,
			    flk->l_pid, (*argspp)->ctag, seqid);
		}

		/* only pass a lost request if it is a LOCK/LOCKU resend */
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
		    lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
		    NULL, op, bsep, NULL, NULL);

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
	}

	/*
	 * Return that we do not want to retry the request for 3 cases:
	 * 1. If we received EINTR or are bailing out because of a forced
	 *    unmount, we came into this code path just for the sake of
	 *    initiating recovery, we now need to return the error.
	 * 2. If we have aborted recovery.
	 * 3. We received NFS4ERR_BAD_SEQID.
	 */
	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
	    abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
		retry = FALSE;

	if (*did_start_fop == TRUE) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}

	/* only free the args/res if the caller is going to loop and retry */
	if (retry == TRUE) {
		nfs_argop4 *argop;

		argop = (*argspp)->array;
		ASSERT((*argspp)->array_len == 2);

		if (argop[1].argop == OP_LOCK)
			nfs4args_lock_free(&argop[1]);
		else if (argop[1].argop == OP_LOCKT)
			nfs4args_lockt_free(&argop[1]);
		kmem_free(argop, 2 * sizeof (nfs_argop4));
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
		*respp = NULL;
		*argspp = NULL;
	}

	if (lop != NULL) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
	}

	*lopp = NULL;

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, rp);
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	return (retry);
}
13758 13766
13759 13767 /*
13760 13768 * Handles the successful reply from the server for nfs4frlock.
13761 13769 */
13762 13770 static void
13763 13771 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13764 13772 vnode_t *vp, int flag, u_offset_t offset,
13765 13773 nfs4_lost_rqst_t *resend_rqstp)
13766 13774 {
13767 13775 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13768 13776 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13769 13777 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13770 13778 if (ctype == NFS4_LCK_CTYPE_NORM) {
13771 13779 flk->l_pid = ttoproc(curthread)->p_pid;
13772 13780 /*
13773 13781 * We do not register lost locks locally in
13774 13782 * the 'resend' case since the user/application
13775 13783 * doesn't think we have the lock.
13776 13784 */
13777 13785 ASSERT(!resend_rqstp);
13778 13786 nfs4_register_lock_locally(vp, flk, flag, offset);
13779 13787 }
13780 13788 }
13781 13789 }
13782 13790
13783 13791 /*
13784 13792 * Handle the DENIED reply from the server for nfs4frlock.
13785 13793 * Returns TRUE if we should retry the request; FALSE otherwise.
13786 13794 *
13787 13795 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13788 13796 * COMPOUND4 args/res for calls that need to retry. Can also
13789 13797 * drop and regrab the r_lkserlock.
13790 13798 */
13791 13799 static bool_t
13792 13800 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13793 13801 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13794 13802 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13795 13803 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13796 13804 nfs4_recov_state_t *recov_statep, int needrecov,
13797 13805 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13798 13806 clock_t *tick_delayp, short *whencep, int *errorp,
13799 13807 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13800 13808 bool_t *skip_get_err)
13801 13809 {
13802 13810 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13803 13811
13804 13812 if (lock_args) {
13805 13813 nfs4_open_owner_t *oop = *oopp;
13806 13814 nfs4_open_stream_t *osp = *ospp;
13807 13815 nfs4_lock_owner_t *lop = *lopp;
13808 13816 int intr;
13809 13817
13810 13818 /*
13811 13819 * Blocking lock needs to sleep and retry from the request.
13812 13820 *
13813 13821 * Do not block and wait for 'resend' or 'reinstate'
13814 13822 * lock requests, just return the error.
13815 13823 *
13816 13824 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13817 13825 */
13818 13826 if (cmd == F_SETLKW) {
13819 13827 rnode4_t *rp = VTOR4(vp);
13820 13828 nfs_argop4 *argop = (*argspp)->array;
13821 13829
13822 13830 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13823 13831
13824 13832 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13825 13833 recov_statep, needrecov);
13826 13834 *did_start_fop = FALSE;
13827 13835 ASSERT((*argspp)->array_len == 2);
13828 13836 if (argop[1].argop == OP_LOCK)
13829 13837 nfs4args_lock_free(&argop[1]);
13830 13838 else if (argop[1].argop == OP_LOCKT)
13831 13839 nfs4args_lockt_free(&argop[1]);
13832 13840 kmem_free(argop, 2 * sizeof (nfs_argop4));
13833 13841 if (*respp)
13834 13842 (void) xdr_free(xdr_COMPOUND4res_clnt,
13835 13843 (caddr_t)*respp);
13836 13844 *argspp = NULL;
13837 13845 *respp = NULL;
13838 13846 nfs4_end_lock_seqid_sync(lop);
13839 13847 lock_owner_rele(lop);
13840 13848 *lopp = NULL;
13841 13849 if (osp != NULL) {
13842 13850 open_stream_rele(osp, rp);
13843 13851 *ospp = NULL;
13844 13852 }
13845 13853 if (oop != NULL) {
13846 13854 nfs4_end_open_seqid_sync(oop);
13847 13855 open_owner_rele(oop);
13848 13856 *oopp = NULL;
13849 13857 }
13850 13858
13851 13859 nfs_rw_exit(&rp->r_lkserlock);
13852 13860
13853 13861 intr = nfs4_block_and_wait(tick_delayp, rp);
13854 13862
13855 13863 if (intr) {
13856 13864 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13857 13865 RW_WRITER, FALSE);
13858 13866 *errorp = EINTR;
13859 13867 return (FALSE);
13860 13868 }
13861 13869
13862 13870 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13863 13871 RW_WRITER, FALSE);
13864 13872
13865 13873 /*
13866 13874 * Make sure we are still safe to lock with
13867 13875 * regards to mmapping.
13868 13876 */
13869 13877 if (!nfs4_safelock(vp, flk, cr)) {
13870 13878 *errorp = EAGAIN;
13871 13879 return (FALSE);
13872 13880 }
13873 13881
13874 13882 return (TRUE);
13875 13883 }
13876 13884 if (ctype == NFS4_LCK_CTYPE_NORM)
13877 13885 *errorp = EAGAIN;
13878 13886 *skip_get_err = TRUE;
13879 13887 flk->l_whence = 0;
13880 13888 *whencep = 0;
13881 13889 return (FALSE);
13882 13890 } else if (lockt_args) {
13883 13891 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13884 13892 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13885 13893
13886 13894 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13887 13895 flk, lockt_args);
13888 13896
13889 13897 /* according to NLM code */
13890 13898 *errorp = 0;
13891 13899 *whencep = 0;
13892 13900 *skip_get_err = TRUE;
13893 13901 return (FALSE);
13894 13902 }
13895 13903 return (FALSE);
13896 13904 }
13897 13905
13898 13906 /*
13899 13907 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13900 13908 */
13901 13909 static void
13902 13910 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13903 13911 {
13904 13912 switch (resp->status) {
13905 13913 case NFS4ERR_ACCESS:
13906 13914 case NFS4ERR_ADMIN_REVOKED:
13907 13915 case NFS4ERR_BADHANDLE:
13908 13916 case NFS4ERR_BAD_RANGE:
13909 13917 case NFS4ERR_BAD_SEQID:
13910 13918 case NFS4ERR_BAD_STATEID:
13911 13919 case NFS4ERR_BADXDR:
13912 13920 case NFS4ERR_DEADLOCK:
13913 13921 case NFS4ERR_DELAY:
13914 13922 case NFS4ERR_EXPIRED:
13915 13923 case NFS4ERR_FHEXPIRED:
13916 13924 case NFS4ERR_GRACE:
13917 13925 case NFS4ERR_INVAL:
13918 13926 case NFS4ERR_ISDIR:
13919 13927 case NFS4ERR_LEASE_MOVED:
13920 13928 case NFS4ERR_LOCK_NOTSUPP:
13921 13929 case NFS4ERR_LOCK_RANGE:
13922 13930 case NFS4ERR_MOVED:
13923 13931 case NFS4ERR_NOFILEHANDLE:
13924 13932 case NFS4ERR_NO_GRACE:
13925 13933 case NFS4ERR_OLD_STATEID:
13926 13934 case NFS4ERR_OPENMODE:
13927 13935 case NFS4ERR_RECLAIM_BAD:
13928 13936 case NFS4ERR_RECLAIM_CONFLICT:
13929 13937 case NFS4ERR_RESOURCE:
13930 13938 case NFS4ERR_SERVERFAULT:
13931 13939 case NFS4ERR_STALE:
13932 13940 case NFS4ERR_STALE_CLIENTID:
13933 13941 case NFS4ERR_STALE_STATEID:
13934 13942 return;
13935 13943 default:
13936 13944 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13937 13945 "nfs4frlock_results_default: got unrecognizable "
13938 13946 "res.status %d", resp->status));
13939 13947 *errorp = NFS4ERR_INVAL;
13940 13948 }
13941 13949 }
13942 13950
13943 13951 /*
13944 13952 * The lock request was successful, so update the client's state.
13945 13953 */
13946 13954 static void
13947 13955 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13948 13956 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13949 13957 vnode_t *vp, flock64_t *flk, cred_t *cr,
13950 13958 nfs4_lost_rqst_t *resend_rqstp)
13951 13959 {
13952 13960 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13953 13961
13954 13962 if (lock_args) {
13955 13963 LOCK4res *lock_res;
13956 13964
13957 13965 lock_res = &resop->nfs_resop4_u.oplock;
13958 13966 /* update the stateid with server's response */
13959 13967
13960 13968 if (lock_args->locker.new_lock_owner == TRUE) {
13961 13969 mutex_enter(&lop->lo_lock);
13962 13970 lop->lo_just_created = NFS4_PERM_CREATED;
13963 13971 mutex_exit(&lop->lo_lock);
13964 13972 }
13965 13973
13966 13974 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13967 13975
13968 13976 /*
13969 13977 * If the lock was the result of a resending a lost
13970 13978 * request, we've synched up the stateid and seqid
13971 13979 * with the server, but now the server might be out of sync
13972 13980 * with what the application thinks it has for locks.
13973 13981 * Clean that up here. It's unclear whether we should do
13974 13982 * this even if the filesystem has been forcibly unmounted.
13975 13983 * For most servers, it's probably wasted effort, but
13976 13984 * RFC3530 lets servers require that unlocks exactly match
13977 13985 * the locks that are held.
13978 13986 */
13979 13987 if (resend_rqstp != NULL &&
13980 13988 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13981 13989 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13982 13990 } else {
13983 13991 flk->l_whence = 0;
13984 13992 }
13985 13993 } else if (locku_args) {
13986 13994 LOCKU4res *locku_res;
13987 13995
13988 13996 locku_res = &resop->nfs_resop4_u.oplocku;
13989 13997
13990 13998 /* Update the stateid with the server's response */
13991 13999 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
13992 14000 } else if (lockt_args) {
13993 14001 /* Switch the lock type to express success, see fcntl */
13994 14002 flk->l_type = F_UNLCK;
13995 14003 flk->l_whence = 0;
13996 14004 }
13997 14005 }
13998 14006
13999 14007 /*
14000 14008 * Do final cleanup before exiting nfs4frlock.
14001 14009 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14002 14010 * COMPOUND4 args/res for calls that haven't already.
14003 14011 */
14004 14012 static void
14005 14013 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14006 14014 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14007 14015 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14008 14016 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14009 14017 short whence, u_offset_t offset, struct lm_sysid *ls,
14010 14018 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14011 14019 bool_t did_start_fop, bool_t skip_get_err,
14012 14020 cred_t *cred_otw, cred_t *cred)
14013 14021 {
14014 14022 mntinfo4_t *mi = VTOMI4(vp);
14015 14023 rnode4_t *rp = VTOR4(vp);
14016 14024 int error = *errorp;
14017 14025 nfs_argop4 *argop;
14018 14026 int do_flush_pages = 0;
14019 14027
14020 14028 ASSERT(nfs_zone() == mi->mi_zone);
14021 14029 /*
14022 14030 * The client recovery code wants the raw status information,
14023 14031 * so don't map the NFS status code to an errno value for
14024 14032 * non-normal call types.
14025 14033 */
14026 14034 if (ctype == NFS4_LCK_CTYPE_NORM) {
14027 14035 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14028 14036 *errorp = geterrno4(resp->status);
14029 14037 if (did_start_fop == TRUE)
14030 14038 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14031 14039 needrecov);
14032 14040
14033 14041 /*
14034 14042 * We've established a new lock on the server, so invalidate
14035 14043 * the pages associated with the vnode to get the most up to
14036 14044 * date pages from the server after acquiring the lock. We
14037 14045 * want to be sure that the read operation gets the newest data.
14038 14046 * N.B.
14039 14047 * We used to do this in nfs4frlock_results_ok but that doesn't
14040 14048 * work since VOP_PUTPAGE can call nfs4_commit which calls
14041 14049 * nfs4_start_fop. We flush the pages below after calling
14042 14050 * nfs4_end_fop above
14043 14051 * The flush of the page cache must be done after
14044 14052 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14045 14053 */
14046 14054 if (!error && resp && resp->status == NFS4_OK)
14047 14055 do_flush_pages = 1;
14048 14056 }
14049 14057 if (argsp) {
14050 14058 ASSERT(argsp->array_len == 2);
14051 14059 argop = argsp->array;
14052 14060 if (argop[1].argop == OP_LOCK)
14053 14061 nfs4args_lock_free(&argop[1]);
14054 14062 else if (argop[1].argop == OP_LOCKT)
14055 14063 nfs4args_lockt_free(&argop[1]);
14056 14064 kmem_free(argop, 2 * sizeof (nfs_argop4));
14057 14065 if (resp)
14058 14066 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14059 14067 }
14060 14068
14061 14069 /* free the reference on the lock owner */
14062 14070 if (lop != NULL) {
14063 14071 nfs4_end_lock_seqid_sync(lop);
14064 14072 lock_owner_rele(lop);
14065 14073 }
14066 14074
14067 14075 /* need to free up the reference on osp for lock args */
14068 14076 if (osp != NULL)
14069 14077 open_stream_rele(osp, rp);
14070 14078
14071 14079 /* need to free up the reference on oop for lock args */
14072 14080 if (oop != NULL) {
14073 14081 nfs4_end_open_seqid_sync(oop);
14074 14082 open_owner_rele(oop);
14075 14083 }
14076 14084
14077 14085 if (do_flush_pages)
14078 14086 nfs4_flush_pages(vp, cred);
14079 14087
14080 14088 (void) convoff(vp, flk, whence, offset);
14081 14089
14082 14090 lm_rel_sysid(ls);
14083 14091
14084 14092 /*
14085 14093 * Record debug information in the event we get EINVAL.
14086 14094 */
14087 14095 mutex_enter(&mi->mi_lock);
14088 14096 if (*errorp == EINVAL && (lock_args || locku_args) &&
14089 14097 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14090 14098 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14091 14099 zcmn_err(getzoneid(), CE_NOTE,
14092 14100 "%s operation failed with "
14093 14101 "EINVAL probably since the server, %s,"
14094 14102 " doesn't support POSIX style locking",
14095 14103 lock_args ? "LOCK" : "LOCKU",
14096 14104 mi->mi_curr_serv->sv_hostname);
14097 14105 mi->mi_flags |= MI4_LOCK_DEBUG;
14098 14106 }
14099 14107 }
14100 14108 mutex_exit(&mi->mi_lock);
14101 14109
14102 14110 if (cred_otw)
14103 14111 crfree(cred_otw);
14104 14112 }
14105 14113
14106 14114 /*
14107 14115 * This calls the server and the local locking code.
14108 14116 *
 * Client locks are registered locally by ORing the sysid with
14110 14118 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14111 14119 * We need to distinguish between the two to avoid collision in case one
14112 14120 * machine is used as both client and server.
14113 14121 *
14114 14122 * Blocking lock requests will continually retry to acquire the lock
14115 14123 * forever.
14116 14124 *
14117 14125 * The ctype is defined as follows:
14118 14126 * NFS4_LCK_CTYPE_NORM: normal lock request.
14119 14127 *
14120 14128 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14121 14129 * recovery, get the pid from flk instead of curproc, and don't reregister
14122 14130 * the lock locally.
14123 14131 *
14124 14132 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14125 14133 * that we will use the information passed in via resend_rqstp to setup the
14126 14134 * lock/locku request. This resend is the exact same request as the 'lost
14127 14135 * lock', and is initiated by the recovery framework. A successful resend
14128 14136 * request can initiate one or more reinstate requests.
14129 14137 *
14130 14138 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14131 14139 * does not trigger additional reinstate requests. This lock call type is
14132 14140 * set for setting the v4 server's locking state back to match what the
14133 14141 * client's local locking state is in the event of a received 'lost lock'.
14134 14142 *
14135 14143 * Errors are returned via the nfs4_error_t parameter.
14136 14144 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay;	/* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly;	/* just queue request */
	int frc_no_reclaim = 0;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Re-entered after recoverable failures (NFS4ERR_DELAY, access
	 * errors needing a credential swap, blocking-lock retries, and
	 * completed recovery).  Each pass allocates a fresh two-element
	 * compound: argop[0] = PUTFH, argop[1] = LOCK/LOCKU/LOCKT.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		/* record the unlock locally so the app sees it succeed */
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/* tear down this attempt and retry */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A prior NFS4ERR_NO_GRACE reply forced us to retry the reclaim
	 * as an ordinary (non-reclaim) lock request.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		/* on TRUE, this drops all the refs and frees args/res */
		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set. Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14488 14496
14489 14497 /*
14490 14498 * nfs4_safelock:
14491 14499 *
14492 14500 * Return non-zero if the given lock request can be handled without
14493 14501 * violating the constraints on concurrent mapping and locking.
14494 14502 */
14495 14503
14496 14504 static int
14497 14505 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14498 14506 {
14499 14507 rnode4_t *rp = VTOR4(vp);
14500 14508 struct vattr va;
14501 14509 int error;
14502 14510
14503 14511 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14504 14512 ASSERT(rp->r_mapcnt >= 0);
14505 14513 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14506 14514 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14507 14515 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14508 14516 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14509 14517
14510 14518 if (rp->r_mapcnt == 0)
14511 14519 return (1); /* always safe if not mapped */
14512 14520
14513 14521 /*
14514 14522 * If the file is already mapped and there are locks, then they
14515 14523 * should be all safe locks. So adding or removing a lock is safe
14516 14524 * as long as the new request is safe (i.e., whole-file, meaning
14517 14525 * length and starting offset are both zero).
14518 14526 */
14519 14527
14520 14528 if (bfp->l_start != 0 || bfp->l_len != 0) {
14521 14529 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14522 14530 "cannot lock a memory mapped file unless locking the "
14523 14531 "entire file: start %"PRIx64", len %"PRIx64,
14524 14532 bfp->l_start, bfp->l_len));
14525 14533 return (0);
14526 14534 }
14527 14535
14528 14536 /* mandatory locking and mapping don't mix */
14529 14537 va.va_mask = AT_MODE;
14530 14538 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14531 14539 if (error != 0) {
14532 14540 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14533 14541 "getattr error %d", error));
14534 14542 return (0); /* treat errors conservatively */
14535 14543 }
14536 14544 if (MANDLOCK(vp, va.va_mode)) {
14537 14545 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14538 14546 "cannot mandatory lock and mmap a file"));
14539 14547 return (0);
14540 14548 }
14541 14549
14542 14550 return (1);
14543 14551 }
14544 14552
14545 14553
/*
 * Register the lock locally within Solaris.
 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
 * recording locks locally.
 *
 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
 * are registered locally.
 *
 * The caller's flk->l_sysid is saved and restored around the call, so the
 * LM_SYSID_CLIENT tagging is never visible to the caller.
 */
void
nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
    u_offset_t offset)
{
	int oldsysid;
	int error;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	/* fn_name() allocates MAXNAMELEN bytes; freed right after the log. */
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug,
	    (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
	    "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
	    name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
	    flk->l_sysid));
	kmem_free(name, MAXNAMELEN);
#endif

	/* register the lock with local locking */
	oldsysid = flk->l_sysid;
	flk->l_sysid |= LM_SYSID_CLIENT;
	error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
#ifdef DEBUG
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4_register_lock_locally: could not register with"
		    " local locking"));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "error %d, vp 0x%p, pid %d, sysid 0x%x",
		    error, (void *)vp, flk->l_pid, flk->l_sysid));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_type, flk->l_start, flk->l_len));
		/*
		 * Query (cmd == 0) the lock manager so we can report which
		 * lock blocked us; flk is overwritten with the conflicting
		 * lock's description.  Debug-only, so the clobber is fine.
		 */
		(void) reclock(vp, flk, 0, flag, offset, NULL);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "blocked by pid %d sysid 0x%x type %d "
		    "off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
		    flk->l_len));
	}
#endif
	/* restore the caller's sysid */
	flk->l_sysid = oldsysid;
}
14601 14609
/*
 * nfs4_lockrelease:
 *
 * Release any locks on the given vnode that are held by the current
 * process.  Also removes the lock owner (if one exists) from the rnode's
 * list.
 *
 * Returns zero on success, or the error from nfs4_start_fop() if we could
 * not synchronize with recovery (in which case R4LODANGLERS is set so the
 * dangling lock owners can be cleaned up later).
 */
static int
nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
{
	flock64_t ld;
	int ret, error;
	rnode4_t *rp;
	nfs4_lock_owner_t *lop;
	nfs4_recov_state_t recov_state;
	mntinfo4_t *mi;
	bool_t possible_orphan = FALSE;
	bool_t recovonly;

	ASSERT((uintptr_t)vp > KERNELBASE);
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we have not locked anything then we can
	 * just return since we have no work to do.
	 * (An empty r_lo_head list means no lock owners on this rnode.)
	 */
	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
		return (0);
	}

	/*
	 * We need to comprehend that another thread may
	 * kick off recovery and the lock_owner we have stashed
	 * in lop might be invalid so we should NOT cache it
	 * locally!
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		/* flag the rnode so dangling lock owners get cleaned up */
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	/*
	 * Check if the lock owner might have a lock (request was sent but
	 * no response was received).  Also check if there are any remote
	 * locks on the file.  (In theory we shouldn't have to make this
	 * second check if there's no lock owner, but for now we'll be
	 * conservative and do it anyway.)  If either condition is true,
	 * send an unlock for the entire file to the server.
	 *
	 * Note that no explicit synchronization is needed here.  At worst,
	 * flk_has_remote_locks() will return a false positive, in which case
	 * the unlock call wastes time but doesn't harm correctness.
	 */

	if (lop) {
		mutex_enter(&lop->lo_lock);
		possible_orphan = lop->lo_pending_rqsts;
		mutex_exit(&lop->lo_lock);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
	    (void *)lop));

	if (possible_orphan || flk_has_remote_locks(vp)) {
		ld.l_type = F_UNLCK;	/* set to unlock entire file */
		ld.l_whence = 0;	/* unlock from start of file */
		ld.l_start = 0;
		ld.l_len = 0;	/* do entire file */

		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
		    cr, NULL);

		if (ret != 0) {
			/*
			 * If VOP_FRLOCK fails, make sure we unregister
			 * local locks before we continue.
			 */
			ld.l_pid = ttoproc(curthread)->p_pid;
			nfs4_register_lock_locally(vp, &ld, flag, offset);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_lockrelease: lock release error on vp"
			    " %p: error %d.\n", (void *)vp, ret));
		}
	}

	/*
	 * Re-synchronize with recovery before touching the lock-owner
	 * list; the VOP_FRLOCK above dropped our earlier start_fop.
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * So, here we're going to need to retrieve the lock-owner
	 * again (in case recovery has done a switch-a-roo) and
	 * remove it because we can.
	 */
	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	if (lop) {
		nfs4_rnode_remove_lock_owner(rp, lop);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
	return (0);
}
14729 14737
14730 14738 /*
14731 14739 * Wait for 'tick_delay' clock ticks.
14732 14740 * Implement exponential backoff until hit the lease_time of this nfs4_server.
14733 14741 * NOTE: lock_lease_time is in seconds.
14734 14742 *
14735 14743 * XXX For future improvements, should implement a waiting queue scheme.
14736 14744 */
14737 14745 static int
14738 14746 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14739 14747 {
14740 14748 long milliseconds_delay;
14741 14749 time_t lock_lease_time;
14742 14750
14743 14751 /* wait tick_delay clock ticks or siginteruptus */
14744 14752 if (delay_sig(*tick_delay)) {
14745 14753 return (EINTR);
14746 14754 }
14747 14755 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14748 14756 "reissue the lock request: blocked for %ld clock ticks: %ld "
14749 14757 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14750 14758
14751 14759 /* get the lease time */
14752 14760 lock_lease_time = r2lease_time(rp);
14753 14761
14754 14762 /* drv_hztousec converts ticks to microseconds */
14755 14763 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14756 14764 if (milliseconds_delay < lock_lease_time * 1000) {
14757 14765 *tick_delay = 2 * *tick_delay;
14758 14766 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14759 14767 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14760 14768 }
14761 14769 return (0);
14762 14770 }
14763 14771
14764 14772
/*
 * Module initialization hook for the NFSv4 client vnode ops.
 * Currently a no-op; kept as a placeholder for symmetry with
 * nfs4_vnops_fini() and for future setup needs.
 */
void
nfs4_vnops_init(void)
{
}
14769 14777
/*
 * Module teardown hook for the NFSv4 client vnode ops.
 * Currently a no-op; mirrors nfs4_vnops_init().
 */
void
nfs4_vnops_fini(void)
{
}
14774 14782
/*
 * Return a reference to the directory (parent) vnode for a given vnode,
 * using the saved pathname information and the directory file handle. The
 * caller is responsible for disposing of the reference.
 * Returns zero or an errno value.
 *
 * Caller should set need_start_op to FALSE if it is the recovery
 * thread, or if a start_fop has already been done. Otherwise, TRUE.
 */
int
vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
{
	svnode_t *svnp;
	vnode_t *dvp = NULL;
	servinfo4_t *svp;
	nfs4_fname_t *mfname;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (vp->v_flag & VROOT) {
		nfs4_sharedfh_t *sfh;
		nfs_fh4 fh;
		mntinfo4_t *mi;

		/*
		 * VROOT on a regular file means this is a "file" mount;
		 * the parent comes from the server's parent file handle
		 * (sv_pfhandle) rather than from shadow-vnode state.
		 */
		ASSERT(vp->v_type == VREG);

		mi = VTOMI4(vp);
		svp = mi->mi_curr_serv;
		/* sv_lock protects the pfhandle while we copy it */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh = sfh4_get(&fh, VTOMI4(vp));
		nfs_rw_exit(&svp->sv_lock);
		/* hold mi_fname: makenfs4node_by_fh consumes a reference */
		mfname = mi->mi_fname;
		fn_hold(mfname);
		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
		sfh4_rele(&sfh);

		if (dvp->v_type == VNON)
			dvp->v_type = VDIR;
		*dvpp = dvp;
		return (0);
	}

	svnp = VTOSV(vp);

	if (svnp == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node is NULL"));
		return (EINVAL);
	}

	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node name or dfh val == NULL"));
		return (EINVAL);
	}

	/* look the parent up via its saved directory file handle */
	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
	    (int)need_start_op);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned %d", error));
		return (error);
	}
	if (!dvp) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned a NULL dvp"));
		return (EIO);
	}
	if (dvp->v_type == VNON)
		dvp->v_type = VDIR;
	ASSERT(dvp->v_type == VDIR);
	/* parent of an extended attribute file is an xattr directory */
	if (VTOR4(vp)->r_flags & R4ISXATTR) {
		mutex_enter(&dvp->v_lock);
		dvp->v_flag |= V_XATTRDIR;
		mutex_exit(&dvp->v_lock);
	}
	*dvpp = dvp;
	return (0);
}
14857 14865
14858 14866 /*
14859 14867 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14860 14868 * length that fnamep can accept, including the trailing null.
14861 14869 * Returns 0 if okay, returns an errno value if there was a problem.
14862 14870 */
14863 14871
14864 14872 int
14865 14873 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14866 14874 {
14867 14875 char *fn;
14868 14876 int err = 0;
14869 14877 servinfo4_t *svp;
14870 14878 svnode_t *shvp;
14871 14879
14872 14880 /*
14873 14881 * If the file being opened has VROOT set, then this is
14874 14882 * a "file" mount. sv_name will not be interesting, so
14875 14883 * go back to the servinfo4 to get the original mount
14876 14884 * path and strip off all but the final edge. Otherwise
14877 14885 * just return the name from the shadow vnode.
14878 14886 */
14879 14887
14880 14888 if (vp->v_flag & VROOT) {
14881 14889
14882 14890 svp = VTOMI4(vp)->mi_curr_serv;
14883 14891 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14884 14892
14885 14893 fn = strrchr(svp->sv_path, '/');
14886 14894 if (fn == NULL)
14887 14895 err = EINVAL;
14888 14896 else
14889 14897 fn++;
14890 14898 } else {
14891 14899 shvp = VTOSV(vp);
14892 14900 fn = fn_name(shvp->sv_name);
14893 14901 }
14894 14902
14895 14903 if (err == 0)
14896 14904 if (strlen(fn) < maxlen)
14897 14905 (void) strcpy(fnamep, fn);
14898 14906 else
14899 14907 err = ENAMETOOLONG;
14900 14908
14901 14909 if (vp->v_flag & VROOT)
14902 14910 nfs_rw_exit(&svp->sv_lock);
14903 14911 else
14904 14912 kmem_free(fn, MAXNAMELEN);
14905 14913
14906 14914 return (err);
14907 14915 }
14908 14916
/*
 * Bookkeeping for a close that doesn't need to go over the wire.
 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
 * it is left at 1.
 *
 * Caller must hold osp->os_sync_lock (and indicate so via *have_lockp).
 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/* nothing to do if the stream is invalid or still in use */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count can not go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}
14951 14959
/*
 * Close all remaining open streams on the rnode.  These open streams
 * could be here because:
 * - The close attempted at either close or delmap failed
 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
 * - Someone did mknod on a regular file but never opened it
 *
 * Returns 0, or the first error (or mapped NFS4 status) encountered
 * while force-closing the streams.
 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1.  Since then,
	 * the vnode could have been re-activated.  We want to
	 * loop through the open streams and close each one, but
	 * we have to be careful since once we release the rnode
	 * hash bucket lock, someone else is free to come in and
	 * re-activate the rnode and add new open streams.  The
	 * strategy is take the rnode hash bucket lock, verify that
	 * the count is still 1, grab the open stream off the
	 * head of the list and mark it invalid, then release the
	 * rnode hash bucket lock and proceed with that open stream.
	 * This is ok because nfs4close_one() will acquire the proper
	 * open/create to close/destroy synchronization for open
	 * streams, and will ensure that if someone has reopened
	 * the open stream after we've dropped the hash bucket lock
	 * then we'll just simply return without destroying the
	 * open stream.
	 * Repeat until the list is empty.
	 */

	for (;;) {

		/* make sure vnode hasn't been reactivated */
		rw_enter(&rp->r_hashq->r_lock, RW_READER);
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			mutex_exit(&vp->v_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}
		/*
		 * Grabbing r_os_lock before releasing v_lock prevents
		 * a window where the rnode/open stream could get
		 * reactivated (and os_force_close set to 0) before we
		 * had a chance to set os_force_close to 1.
		 */
		mutex_enter(&rp->r_os_lock);
		mutex_exit(&vp->v_lock);

		osp = list_head(&rp->r_open_streams);
		if (!osp) {
			/* nothing left to CLOSE OTW, so return */
			mutex_exit(&rp->r_os_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}

		mutex_enter(&rp->r_statev4_lock);
		/* the file can't still be mem mapped */
		ASSERT(rp->r_mapcnt == 0);
		if (rp->created_v4)
			rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		/*
		 * Grab a ref on this open stream; nfs4close_one
		 * will mark it as invalid
		 */
		mutex_enter(&osp->os_sync_lock);
		osp->os_ref_count++;
		osp->os_force_close = 1;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);

		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);

		/* Update error if it isn't already non-zero */
		if (error == 0) {
			if (e.error)
				error = e.error;
			else if (e.stat)
				error = geterrno4(e.stat);
		}

#ifdef DEBUG
		nfs4close_all_cnt++;
#endif
		/* Release the ref on osp acquired above. */
		open_stream_rele(osp, rp);

		/* Proceed to the next open stream, if any */
	}
	return (error);
}
15058 15066
15059 15067 /*
15060 15068 * nfs4close_one - close one open stream for a file if needed.
15061 15069 *
15062 15070 * "close_type" indicates which close path this is:
15063 15071 * CLOSE_NORM: close initiated via VOP_CLOSE.
15064 15072 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15065 15073 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15066 15074 * the close and release of client state for this open stream
15067 15075 * (unless someone else has the open stream open).
15068 15076 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15069 15077 * (e.g., due to abort because of a signal).
15070 15078 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15071 15079 *
15072 15080 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15073 15081 * recovery. Instead, the caller is expected to deal with retries.
15074 15082 *
15075 15083 * The caller can either pass in the osp ('provided_osp') or not.
15076 15084 *
15077 15085 * 'access_bits' represents the access we are closing/downgrading.
15078 15086 *
15079 15087 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15080 15088 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15081 15089 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15082 15090 *
15083 15091 * Errors are returned via the nfs4_error_t.
15084 15092 */
15085 15093 void
15086 15094 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15087 15095 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15088 15096 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15089 15097 uint_t mmap_flags)
15090 15098 {
15091 15099 nfs4_open_owner_t *oop;
15092 15100 nfs4_open_stream_t *osp = NULL;
15093 15101 int retry = 0;
15094 15102 int num_retries = NFS4_NUM_RECOV_RETRIES;
15095 15103 rnode4_t *rp;
15096 15104 mntinfo4_t *mi;
15097 15105 nfs4_recov_state_t recov_state;
15098 15106 cred_t *cred_otw = NULL;
15099 15107 bool_t recovonly = FALSE;
15100 15108 int isrecov;
15101 15109 int force_close;
15102 15110 int close_failed = 0;
15103 15111 int did_dec_count = 0;
15104 15112 int did_start_op = 0;
15105 15113 int did_force_recovlock = 0;
15106 15114 int did_start_seqid_sync = 0;
15107 15115 int have_sync_lock = 0;
15108 15116
15109 15117 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15110 15118
15111 15119 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15112 15120 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15113 15121 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15114 15122 len, maxprot, mmap_flags, access_bits));
15115 15123
15116 15124 nfs4_error_zinit(ep);
15117 15125 rp = VTOR4(vp);
15118 15126 mi = VTOMI4(vp);
15119 15127 isrecov = (close_type == CLOSE_RESEND ||
15120 15128 close_type == CLOSE_AFTER_RESEND);
15121 15129
15122 15130 /*
15123 15131 * First get the open owner.
15124 15132 */
15125 15133 if (!provided_osp) {
15126 15134 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15127 15135 } else {
15128 15136 oop = provided_osp->os_open_owner;
15129 15137 ASSERT(oop != NULL);
15130 15138 open_owner_hold(oop);
15131 15139 }
15132 15140
15133 15141 if (!oop) {
15134 15142 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15135 15143 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15136 15144 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15137 15145 (void *)provided_osp, close_type));
15138 15146 ep->error = EIO;
15139 15147 goto out;
15140 15148 }
15141 15149
15142 15150 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15143 15151 recov_retry:
15144 15152 osp = NULL;
15145 15153 close_failed = 0;
15146 15154 force_close = (close_type == CLOSE_FORCE);
15147 15155 retry = 0;
15148 15156 did_start_op = 0;
15149 15157 did_force_recovlock = 0;
15150 15158 did_start_seqid_sync = 0;
15151 15159 have_sync_lock = 0;
15152 15160 recovonly = FALSE;
15153 15161 recov_state.rs_flags = 0;
15154 15162 recov_state.rs_num_retry_despite_err = 0;
15155 15163
15156 15164 /*
15157 15165 * Second synchronize with recovery.
15158 15166 */
15159 15167 if (!isrecov) {
15160 15168 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15161 15169 &recov_state, &recovonly);
15162 15170 if (!ep->error) {
15163 15171 did_start_op = 1;
15164 15172 } else {
15165 15173 close_failed = 1;
15166 15174 /*
15167 15175 * If we couldn't get start_fop, but have to
15168 15176 * cleanup state, then at least acquire the
15169 15177 * mi_recovlock so we can synchronize with
15170 15178 * recovery.
15171 15179 */
15172 15180 if (close_type == CLOSE_FORCE) {
15173 15181 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15174 15182 RW_READER, FALSE);
15175 15183 did_force_recovlock = 1;
15176 15184 } else
15177 15185 goto out;
15178 15186 }
15179 15187 }
15180 15188
15181 15189 /*
15182 15190 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15183 15191 * set 'recovonly' to TRUE since most likely this is due to
15184 15192 * recovery being active (MI4_RECOV_ACTIV). If recovery is active,
15185 15193 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15186 15194 * to retry, causing us to loop until recovery finishes. Plus we
15187 15195 * don't need protection over the open seqid since we're not going
15188 15196 * OTW, hence don't need to use the seqid.
15189 15197 */
15190 15198 if (recovonly == FALSE) {
15191 15199 /* need to grab the open owner sync before 'os_sync_lock' */
15192 15200 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15193 15201 if (ep->error == EAGAIN) {
15194 15202 ASSERT(!isrecov);
15195 15203 if (did_start_op)
15196 15204 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15197 15205 &recov_state, TRUE);
15198 15206 if (did_force_recovlock)
15199 15207 nfs_rw_exit(&mi->mi_recovlock);
15200 15208 goto recov_retry;
15201 15209 }
15202 15210 did_start_seqid_sync = 1;
15203 15211 }
15204 15212
15205 15213 /*
15206 15214 * Third get an open stream and acquire 'os_sync_lock' to
15207 15215 * synchronize the opening/creating of an open stream with the
15208 15216 * closing/destroying of an open stream.
15209 15217 */
15210 15218 if (!provided_osp) {
15211 15219 /* returns with 'os_sync_lock' held */
15212 15220 osp = find_open_stream(oop, rp);
15213 15221 if (!osp) {
15214 15222 ep->error = EIO;
15215 15223 goto out;
15216 15224 }
15217 15225 } else {
15218 15226 osp = provided_osp;
15219 15227 open_stream_hold(osp);
15220 15228 mutex_enter(&osp->os_sync_lock);
15221 15229 }
15222 15230 have_sync_lock = 1;
15223 15231
15224 15232 ASSERT(oop == osp->os_open_owner);
15225 15233
15226 15234 /*
15227 15235 * Fourth, do any special pre-OTW CLOSE processing
15228 15236 * based on the specific close type.
15229 15237 */
15230 15238 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15231 15239 !did_dec_count) {
15232 15240 ASSERT(osp->os_open_ref_count > 0);
15233 15241 osp->os_open_ref_count--;
15234 15242 did_dec_count = 1;
15235 15243 if (osp->os_open_ref_count == 0)
15236 15244 osp->os_final_close = 1;
15237 15245 }
15238 15246
15239 15247 if (close_type == CLOSE_FORCE) {
15240 15248 /* see if somebody reopened the open stream. */
15241 15249 if (!osp->os_force_close) {
15242 15250 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15243 15251 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15244 15252 "was reopened, vp %p", (void *)osp, (void *)vp));
15245 15253 ep->error = 0;
15246 15254 ep->stat = NFS4_OK;
15247 15255 goto out;
15248 15256 }
15249 15257
15250 15258 if (!osp->os_final_close && !did_dec_count) {
15251 15259 osp->os_open_ref_count--;
15252 15260 did_dec_count = 1;
15253 15261 }
15254 15262
15255 15263 /*
15256 15264 * We can't depend on os_open_ref_count being 0 due to the
15257 15265 * way executables are opened (VN_RELE to match a VOP_OPEN).
15258 15266 */
15259 15267 #ifdef NOTYET
15260 15268 ASSERT(osp->os_open_ref_count == 0);
15261 15269 #endif
15262 15270 if (osp->os_open_ref_count != 0) {
15263 15271 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15264 15272 "nfs4close_one: should panic here on an "
15265 15273 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15266 15274 "since this is probably the exec problem."));
15267 15275
15268 15276 osp->os_open_ref_count = 0;
15269 15277 }
15270 15278
15271 15279 /*
15272 15280 * There is the possibility that nfs4close_one()
15273 15281 * for close_type == CLOSE_DELMAP couldn't find the
15274 15282 * open stream, thus couldn't decrement its os_mapcnt;
15275 15283 * therefore we can't use this ASSERT yet.
15276 15284 */
15277 15285 #ifdef NOTYET
15278 15286 ASSERT(osp->os_mapcnt == 0);
15279 15287 #endif
15280 15288 osp->os_mapcnt = 0;
15281 15289 }
15282 15290
15283 15291 if (close_type == CLOSE_DELMAP && !did_dec_count) {
15284 15292 ASSERT(osp->os_mapcnt >= btopr(len));
15285 15293
15286 15294 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15287 15295 osp->os_mmap_write -= btopr(len);
15288 15296 if (maxprot & PROT_READ)
15289 15297 osp->os_mmap_read -= btopr(len);
15290 15298 if (maxprot & PROT_EXEC)
15291 15299 osp->os_mmap_read -= btopr(len);
15292 15300 /* mirror the PROT_NONE check in nfs4_addmap() */
15293 15301 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15294 15302 !(maxprot & PROT_EXEC))
15295 15303 osp->os_mmap_read -= btopr(len);
15296 15304 osp->os_mapcnt -= btopr(len);
15297 15305 did_dec_count = 1;
15298 15306 }
15299 15307
15300 15308 if (recovonly) {
15301 15309 nfs4_lost_rqst_t lost_rqst;
15302 15310
15303 15311 /* request should not already be in recovery queue */
15304 15312 ASSERT(lrp == NULL);
15305 15313 nfs4_error_init(ep, EINTR);
15306 15314 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15307 15315 osp, cred_otw, vp);
15308 15316 mutex_exit(&osp->os_sync_lock);
15309 15317 have_sync_lock = 0;
15310 15318 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15311 15319 lost_rqst.lr_op == OP_CLOSE ?
15312 15320 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
15313 15321 close_failed = 1;
15314 15322 force_close = 0;
15315 15323 goto close_cleanup;
15316 15324 }
15317 15325
15318 15326 /*
15319 15327 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15320 15328 * we stopped operating on the open owner's <old oo_name, old seqid>
15321 15329 * space, which means we stopped operating on the open stream
15322 15330 * too. So don't go OTW (as the seqid is likely bad, and the
15323 15331 * stateid could be stale, potentially triggering a false
15324 15332 * setclientid), and just clean up the client's internal state.
15325 15333 */
15326 15334 if (osp->os_orig_oo_name != oop->oo_name) {
15327 15335 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15328 15336 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15329 15337 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15330 15338 "oo_name %" PRIx64")",
15331 15339 (void *)osp, (void *)oop, osp->os_orig_oo_name,
15332 15340 oop->oo_name));
15333 15341 close_failed = 1;
15334 15342 }
15335 15343
15336 15344 /* If the file failed recovery, just quit. */
15337 15345 mutex_enter(&rp->r_statelock);
15338 15346 if (rp->r_flags & R4RECOVERR) {
15339 15347 close_failed = 1;
15340 15348 }
15341 15349 mutex_exit(&rp->r_statelock);
15342 15350
15343 15351 /*
15344 15352 * If the force close path failed to obtain start_fop
15345 15353 * then skip the OTW close and just remove the state.
15346 15354 */
15347 15355 if (close_failed)
15348 15356 goto close_cleanup;
15349 15357
15350 15358 /*
15351 15359 * Fifth, check to see if there are still mapped pages or other
15352 15360 * opens using this open stream. If there are then we can't
15353 15361 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15354 15362 */
15355 15363 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15356 15364 nfs4_lost_rqst_t new_lost_rqst;
15357 15365 bool_t needrecov = FALSE;
15358 15366 cred_t *odg_cred_otw = NULL;
15359 15367 seqid4 open_dg_seqid = 0;
15360 15368
15361 15369 if (osp->os_delegation) {
15362 15370 /*
15363 15371 * If this open stream was never OPENed OTW then we
15364 15372 * surely can't DOWNGRADE it (especially since the
15365 15373 * osp->open_stateid is really a delegation stateid
15366 15374 * when os_delegation is 1).
15367 15375 */
15368 15376 if (access_bits & FREAD)
15369 15377 osp->os_share_acc_read--;
15370 15378 if (access_bits & FWRITE)
15371 15379 osp->os_share_acc_write--;
15372 15380 osp->os_share_deny_none--;
15373 15381 nfs4_error_zinit(ep);
15374 15382 goto out;
15375 15383 }
15376 15384 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15377 15385 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15378 15386 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15379 15387 if (needrecov && !isrecov) {
15380 15388 bool_t abort;
15381 15389 nfs4_bseqid_entry_t *bsep = NULL;
15382 15390
15383 15391 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15384 15392 bsep = nfs4_create_bseqid_entry(oop, NULL,
15385 15393 vp, 0,
15386 15394 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15387 15395 open_dg_seqid);
15388 15396
15389 15397 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15390 15398 oop, osp, odg_cred_otw, vp, access_bits, 0);
15391 15399 mutex_exit(&osp->os_sync_lock);
15392 15400 have_sync_lock = 0;
15393 15401 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15394 15402 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15395 15403 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15396 15404 bsep, NULL, NULL);
15397 15405 if (odg_cred_otw)
15398 15406 crfree(odg_cred_otw);
15399 15407 if (bsep)
15400 15408 kmem_free(bsep, sizeof (*bsep));
15401 15409
15402 15410 if (abort == TRUE)
15403 15411 goto out;
15404 15412
15405 15413 if (did_start_seqid_sync) {
15406 15414 nfs4_end_open_seqid_sync(oop);
15407 15415 did_start_seqid_sync = 0;
15408 15416 }
15409 15417 open_stream_rele(osp, rp);
15410 15418
15411 15419 if (did_start_op)
15412 15420 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15413 15421 &recov_state, FALSE);
15414 15422 if (did_force_recovlock)
15415 15423 nfs_rw_exit(&mi->mi_recovlock);
15416 15424
15417 15425 goto recov_retry;
15418 15426 } else {
15419 15427 if (odg_cred_otw)
15420 15428 crfree(odg_cred_otw);
15421 15429 }
15422 15430 goto out;
15423 15431 }
15424 15432
15425 15433 /*
15426 15434 * If this open stream was created as the results of an open
15427 15435 * while holding a delegation, then just release it; no need
15428 15436 * to do an OTW close. Otherwise do a "normal" OTW close.
15429 15437 */
15430 15438 if (osp->os_delegation) {
15431 15439 nfs4close_notw(vp, osp, &have_sync_lock);
15432 15440 nfs4_error_zinit(ep);
15433 15441 goto out;
15434 15442 }
15435 15443
15436 15444 /*
15437 15445 * If this stream is not valid, we're done.
15438 15446 */
15439 15447 if (!osp->os_valid) {
15440 15448 nfs4_error_zinit(ep);
15441 15449 goto out;
15442 15450 }
15443 15451
15444 15452 /*
15445 15453 * Last open or mmap ref has vanished, need to do an OTW close.
15446 15454 * First check to see if a close is still necessary.
15447 15455 */
15448 15456 if (osp->os_failed_reopen) {
15449 15457 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15450 15458 "don't close OTW osp %p since reopen failed.",
15451 15459 (void *)osp));
15452 15460 /*
15453 15461 * Reopen of the open stream failed, hence the
15454 15462 * stateid of the open stream is invalid/stale, and
15455 15463 * sending this OTW would incorrectly cause another
15456 15464 * round of recovery. In this case, we need to set
15457 15465 * the 'os_valid' bit to 0 so another thread doesn't
15458 15466 * come in and re-open this open stream before
15459 15467 * this "closing" thread cleans up state (decrementing
15460 15468 * the nfs4_server_t's state_ref_count and decrementing
15461 15469 * the os_ref_count).
15462 15470 */
15463 15471 osp->os_valid = 0;
15464 15472 /*
15465 15473 * This removes the reference obtained at OPEN; ie,
15466 15474 * when the open stream structure was created.
15467 15475 *
15468 15476 * We don't have to worry about calling 'open_stream_rele'
15469 15477 * since we our currently holding a reference to this
15470 15478 * open stream which means the count can not go to 0 with
15471 15479 * this decrement.
15472 15480 */
15473 15481 ASSERT(osp->os_ref_count >= 2);
15474 15482 osp->os_ref_count--;
15475 15483 nfs4_error_zinit(ep);
15476 15484 close_failed = 0;
15477 15485 goto close_cleanup;
15478 15486 }
15479 15487
15480 15488 ASSERT(osp->os_ref_count > 1);
15481 15489
15482 15490 /*
15483 15491 * Sixth, try the CLOSE OTW.
15484 15492 */
15485 15493 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15486 15494 close_type, ep, &have_sync_lock);
15487 15495
15488 15496 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15489 15497 /*
15490 15498 * Let the recovery thread be responsible for
15491 15499 * removing the state for CLOSE.
15492 15500 */
15493 15501 close_failed = 1;
15494 15502 force_close = 0;
15495 15503 retry = 0;
15496 15504 }
15497 15505
15498 15506 /* See if we need to retry with a different cred */
15499 15507 if ((ep->error == EACCES ||
15500 15508 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15501 15509 cred_otw != cr) {
15502 15510 crfree(cred_otw);
15503 15511 cred_otw = cr;
15504 15512 crhold(cred_otw);
15505 15513 retry = 1;
15506 15514 }
15507 15515
15508 15516 if (ep->error || ep->stat)
15509 15517 close_failed = 1;
15510 15518
15511 15519 if (retry && !isrecov && num_retries-- > 0) {
15512 15520 if (have_sync_lock) {
15513 15521 mutex_exit(&osp->os_sync_lock);
15514 15522 have_sync_lock = 0;
15515 15523 }
15516 15524 if (did_start_seqid_sync) {
15517 15525 nfs4_end_open_seqid_sync(oop);
15518 15526 did_start_seqid_sync = 0;
15519 15527 }
15520 15528 open_stream_rele(osp, rp);
15521 15529
15522 15530 if (did_start_op)
15523 15531 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15524 15532 &recov_state, FALSE);
15525 15533 if (did_force_recovlock)
15526 15534 nfs_rw_exit(&mi->mi_recovlock);
15527 15535 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15528 15536 "nfs4close_one: need to retry the close "
15529 15537 "operation"));
15530 15538 goto recov_retry;
15531 15539 }
15532 15540 close_cleanup:
15533 15541 /*
15534 15542 * Seventh and lastly, process our results.
15535 15543 */
15536 15544 if (close_failed && force_close) {
15537 15545 /*
15538 15546 * It's ok to drop and regrab the 'os_sync_lock' since
15539 15547 * nfs4close_notw() will recheck to make sure the
15540 15548 * "close"/removal of state should happen.
15541 15549 */
15542 15550 if (!have_sync_lock) {
15543 15551 mutex_enter(&osp->os_sync_lock);
15544 15552 have_sync_lock = 1;
15545 15553 }
15546 15554 /*
15547 15555 * This is last call, remove the ref on the open
15548 15556 * stream created by open and clean everything up.
15549 15557 */
15550 15558 osp->os_pending_close = 0;
15551 15559 nfs4close_notw(vp, osp, &have_sync_lock);
15552 15560 nfs4_error_zinit(ep);
15553 15561 }
15554 15562
15555 15563 if (!close_failed) {
15556 15564 if (have_sync_lock) {
15557 15565 osp->os_pending_close = 0;
15558 15566 mutex_exit(&osp->os_sync_lock);
15559 15567 have_sync_lock = 0;
15560 15568 } else {
15561 15569 mutex_enter(&osp->os_sync_lock);
15562 15570 osp->os_pending_close = 0;
15563 15571 mutex_exit(&osp->os_sync_lock);
15564 15572 }
15565 15573 if (did_start_op && recov_state.rs_sp != NULL) {
15566 15574 mutex_enter(&recov_state.rs_sp->s_lock);
15567 15575 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15568 15576 mutex_exit(&recov_state.rs_sp->s_lock);
15569 15577 } else {
15570 15578 nfs4_dec_state_ref_count(mi);
15571 15579 }
15572 15580 nfs4_error_zinit(ep);
15573 15581 }
15574 15582
15575 15583 out:
15576 15584 if (have_sync_lock)
15577 15585 mutex_exit(&osp->os_sync_lock);
15578 15586 if (did_start_op)
15579 15587 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15580 15588 recovonly ? TRUE : FALSE);
15581 15589 if (did_force_recovlock)
15582 15590 nfs_rw_exit(&mi->mi_recovlock);
15583 15591 if (cred_otw)
15584 15592 crfree(cred_otw);
15585 15593 if (osp)
15586 15594 open_stream_rele(osp, rp);
15587 15595 if (oop) {
15588 15596 if (did_start_seqid_sync)
15589 15597 nfs4_end_open_seqid_sync(oop);
15590 15598 open_owner_rele(oop);
15591 15599 }
15592 15600 }
15593 15601
15594 15602 /*
15595 15603 * Convert information returned by the server in the LOCK4denied
15596 15604 * structure to the form required by fcntl.
15597 15605 */
15598 15606 static void
15599 15607 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15600 15608 {
15601 15609 nfs4_lo_name_t *lo;
15602 15610
15603 15611 #ifdef DEBUG
15604 15612 if (denied_to_flk_debug) {
15605 15613 lockt_denied_debug = lockt_denied;
15606 15614 debug_enter("lockt_denied");
15607 15615 }
15608 15616 #endif
15609 15617
15610 15618 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15611 15619 flk->l_whence = 0; /* aka SEEK_SET */
15612 15620 flk->l_start = lockt_denied->offset;
15613 15621 flk->l_len = lockt_denied->length;
15614 15622
15615 15623 /*
15616 15624 * If the blocking clientid matches our client id, then we can
15617 15625 * interpret the lockowner (since we built it). If not, then
15618 15626 * fabricate a sysid and pid. Note that the l_sysid field
15619 15627 * in *flk already has the local sysid.
15620 15628 */
15621 15629
15622 15630 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15623 15631
15624 15632 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15625 15633 lo = (nfs4_lo_name_t *)
15626 15634 lockt_denied->owner.owner_val;
15627 15635
15628 15636 flk->l_pid = lo->ln_pid;
15629 15637 } else {
15630 15638 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15631 15639 "denied_to_flk: bad lock owner length\n"));
15632 15640
15633 15641 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15634 15642 }
15635 15643 } else {
15636 15644 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15637 15645 "denied_to_flk: foreign clientid\n"));
15638 15646
15639 15647 /*
15640 15648 * Construct a new sysid which should be different from
15641 15649 * sysids of other systems.
15642 15650 */
15643 15651
15644 15652 flk->l_sysid++;
15645 15653 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15646 15654 }
15647 15655 }
15648 15656
15649 15657 static pid_t
15650 15658 lo_to_pid(lock_owner4 *lop)
15651 15659 {
15652 15660 pid_t pid = 0;
15653 15661 uchar_t *cp;
15654 15662 int i;
15655 15663
15656 15664 cp = (uchar_t *)&lop->clientid;
15657 15665
15658 15666 for (i = 0; i < sizeof (lop->clientid); i++)
15659 15667 pid += (pid_t)*cp++;
15660 15668
15661 15669 cp = (uchar_t *)lop->owner_val;
15662 15670
15663 15671 for (i = 0; i < lop->owner_len; i++)
15664 15672 pid += (pid_t)*cp++;
15665 15673
15666 15674 return (pid);
15667 15675 }
15668 15676
15669 15677 /*
15670 15678 * Given a lock pointer, returns the length of that lock.
15671 15679 * "end" is the last locked offset the "l_len" covers from
15672 15680 * the start of the lock.
15673 15681 */
15674 15682 static off64_t
15675 15683 lock_to_end(flock64_t *lock)
15676 15684 {
15677 15685 off64_t lock_end;
15678 15686
15679 15687 if (lock->l_len == 0)
15680 15688 lock_end = (off64_t)MAXEND;
15681 15689 else
15682 15690 lock_end = lock->l_start + lock->l_len - 1;
15683 15691
15684 15692 return (lock_end);
15685 15693 }
15686 15694
15687 15695 /*
15688 15696 * Given the end of a lock, it will return you the length "l_len" for that lock.
15689 15697 */
15690 15698 static off64_t
15691 15699 end_to_len(off64_t start, off64_t end)
15692 15700 {
15693 15701 off64_t lock_len;
15694 15702
15695 15703 ASSERT(end >= start);
15696 15704 if (end == MAXEND)
15697 15705 lock_len = 0;
15698 15706 else
15699 15707 lock_len = end - start + 1;
15700 15708
15701 15709 return (lock_len);
15702 15710 }
15703 15711
15704 15712 /*
15705 15713 * On given end for a lock it determines if it is the last locked offset
15706 15714 * or not, if so keeps it as is, else adds one to return the length for
15707 15715 * valid start.
15708 15716 */
15709 15717 static off64_t
15710 15718 start_check(off64_t x)
15711 15719 {
15712 15720 if (x == MAXEND)
15713 15721 return (x);
15714 15722 else
15715 15723 return (x + 1);
15716 15724 }
15717 15725
15718 15726 /*
15719 15727 * See if these two locks overlap, and if so return 1;
15720 15728 * otherwise, return 0.
15721 15729 */
15722 15730 static int
15723 15731 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15724 15732 {
15725 15733 off64_t llfp_end, curfp_end;
15726 15734
15727 15735 llfp_end = lock_to_end(llfp);
15728 15736 curfp_end = lock_to_end(curfp);
15729 15737
15730 15738 if (((llfp_end >= curfp->l_start) &&
15731 15739 (llfp->l_start <= curfp->l_start)) ||
15732 15740 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15733 15741 return (1);
15734 15742 return (0);
15735 15743 }
15736 15744
/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).  The new entry
 * inherits its lock type/pid/sysid from local_flp.  If the two locks
 * do not intersect, nothing is added.
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and insert
	 * it into the new list in increasing l_start order. This list contains
	 * intersections of locks registered by the client with the local host
	 * and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	/* Walk to the first entry whose l_start is >= the new entry's. */
	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
15809 15817
/*
 * Our local locking current state is potentially different than
 * what the NFSv4 server thinks we have due to a lost lock that was
 * resent and then received. We need to reset our "NFSv4" locking
 * state to match the current local locking state for this pid since
 * that is what the user/application sees as what the world is.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process. This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Find active locks for this vp from the local locking code.
	 * Scan through this list and find out the locks that intersect with
	 * the lost lock. Once we find the lock that intersects, add the
	 * intersection area as a new lock to a new list "ri_llp". The lock
	 * type of the intersection region lock added to ri_llp is the same
	 * as that found in the active lock list, "list". The intersecting
	 * region locks are added to ri_llp in increasing l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Now we have the list of intersections with the lost lock. These are
	 * the locks that were/are active before the server replied to the
	 * last/lost lock. Issue these locks to the server here. Playing these
	 * locks to the server will re-establish our current local locking state
	 * with the v4 server.
	 * If we get an error, send SIGLOST to the application for that lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have: if the server's
		 * idea of the lock type (the lost lock's) already matches
		 * the local lock type for this range, skip it.
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}

	/*
	 * Now keeping the start of the lost lock as our reference parse the
	 * newly created ri_llp locklist to find the ranges that we have locked
	 * with the v4 server but not in the current local locking. We need
	 * to unlock these ranges.
	 * These ranges can also be referred to as those ranges, where the lost
	 * lock does not overlap with the locks in the ri_llp but are locked
	 * since the server replied to the lost lock.
	 */
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	/* Template unlock request; l_start/l_len are filled in per gap. */
	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		/*
		 * No gap before this intersection entry; advance the
		 * cursor past it.  (ri_llp is sorted by l_start, so
		 * gaps are detected purely by comparing l_start with
		 * the running cursor.)
		 */
		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting locks,
	 * unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it a to-EOF lock? if so unlock till the end
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return back to the original calling nfs4frlock()
	 * and let us naturally drop our seqid syncs.
	 */
}
15964 15972
/*
 * Create a lost state record for the given lock reinstantiation request
 * and push it onto the lost state queue (via nfs4_start_recovery, so the
 * recovery thread replays the lock/unlock to the server).
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	nfs4_lost_rqst_t req;
	nfs_lock_type4 locktype;
	/* Seed the error as EINTR so recovery treats this as a lost request */
	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* Map the fcntl cmd/l_type pair onto the NFSv4 lock type. */
	locktype = flk_to_locktype(cmd, flk->l_type);
	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
	    NULL, NULL, lop, flk, &req, cr, vp);
	/*
	 * Only hand the saved request to recovery if save_lost_rqst actually
	 * recorded a LOCK/LOCKU operation; the recovery op itself is chosen
	 * from the flock type (F_UNLCK -> OP_LOCKU, otherwise OP_LOCK).
	 */
	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
	    NULL, NULL, NULL);
}
↓ open down ↓ |
9311 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX