1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
25 * All rights reserved.
26 */
27
28 /*
29 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
30 */
31
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathconf.h>
58 #include <sys/utsname.h>
59 #include <sys/dnlc.h>
60 #include <sys/acl.h>
61 #include <sys/atomic.h>
62 #include <sys/policy.h>
63 #include <sys/sdt.h>
64
65 #include <rpc/types.h>
66 #include <rpc/auth.h>
67 #include <rpc/clnt.h>
68
69 #include <nfs/nfs.h>
70 #include <nfs/nfs_clnt.h>
71 #include <nfs/rnode.h>
72 #include <nfs/nfs_acl.h>
73 #include <nfs/lm.h>
74
75 #include <vm/hat.h>
76 #include <vm/as.h>
77 #include <vm/page.h>
78 #include <vm/pvn.h>
79 #include <vm/seg.h>
80 #include <vm/seg_map.h>
81 #include <vm/seg_kpm.h>
82 #include <vm/seg_vn.h>
83
84 #include <fs/fs_subr.h>
85
86 #include <sys/ddi.h>
87
88 static int nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
89 cred_t *);
90 static int nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
91 static int nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
92 static int nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
93 static int nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
94 static int nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
95 static int nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
96 caller_context_t *);
97 static int nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
98 static int nfs_bio(struct buf *, cred_t *);
99 static int nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
100 page_t *[], size_t, struct seg *, caddr_t,
101 enum seg_rw, cred_t *);
102 static void nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
103 cred_t *);
104 static int nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
105 int, cred_t *);
106 static int nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
107 int, cred_t *);
108 static void nfs_delmap_callback(struct as *, void *, uint_t);
109
/*
 * Private error value (outside the errno range) used to report
 * certain special errors that require special handling.
 */
#define	NFS_EOF			-98
115
116 /*
117 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface
 * structs, and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
123 * we purge the directory cache relative to that vnode. This way, the
124 * user won't get burned by the cache repeatedly. See <nfs/rnode.h> for
125 * more details on rnode locking.
126 */
127
128 static int nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
129 static int nfs_close(vnode_t *, int, int, offset_t, cred_t *,
130 caller_context_t *);
131 static int nfs_read(vnode_t *, struct uio *, int, cred_t *,
132 caller_context_t *);
133 static int nfs_write(vnode_t *, struct uio *, int, cred_t *,
134 caller_context_t *);
135 static int nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
136 caller_context_t *);
137 static int nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
138 caller_context_t *);
139 static int nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
140 caller_context_t *);
141 static int nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
142 static int nfs_accessx(void *, int, cred_t *);
143 static int nfs_readlink(vnode_t *, struct uio *, cred_t *,
144 caller_context_t *);
145 static int nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
146 static void nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
147 static int nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
148 int, vnode_t *, cred_t *, caller_context_t *,
149 int *, pathname_t *);
150 static int nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
151 int, vnode_t **, cred_t *, int, caller_context_t *,
152 vsecattr_t *);
153 static int nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
154 int);
155 static int nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
156 caller_context_t *, int);
157 static int nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
158 caller_context_t *, int);
159 static int nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
160 cred_t *, caller_context_t *, int, vsecattr_t *);
161 static int nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
162 caller_context_t *, int);
163 static int nfs_symlink(vnode_t *, char *, struct vattr *, char *,
164 cred_t *, caller_context_t *, int);
165 static int nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
166 caller_context_t *, int);
167 static int nfs_fid(vnode_t *, fid_t *, caller_context_t *);
168 static int nfs_rwlock(vnode_t *, int, caller_context_t *);
169 static void nfs_rwunlock(vnode_t *, int, caller_context_t *);
170 static int nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
171 static int nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
172 page_t *[], size_t, struct seg *, caddr_t,
173 enum seg_rw, cred_t *, caller_context_t *);
174 static int nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
175 caller_context_t *);
176 static int nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
177 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
178 static int nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
179 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
180 static int nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
181 struct flk_callback *, cred_t *, caller_context_t *);
182 static int nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
183 cred_t *, caller_context_t *);
184 static int nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
185 static int nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
186 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
187 static int nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
188 caller_context_t *);
189 static int nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
190 cred_t *, caller_context_t *);
191 static int nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
192 caller_context_t *);
193 static int nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
194 caller_context_t *);
195 static int nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
196 caller_context_t *);
197
198 struct vnodeops *nfs_vnodeops;
199
200 const fs_operation_def_t nfs_vnodeops_template[] = {
201 VOPNAME_OPEN, { .vop_open = nfs_open },
202 VOPNAME_CLOSE, { .vop_close = nfs_close },
203 VOPNAME_READ, { .vop_read = nfs_read },
204 VOPNAME_WRITE, { .vop_write = nfs_write },
205 VOPNAME_IOCTL, { .vop_ioctl = nfs_ioctl },
206 VOPNAME_GETATTR, { .vop_getattr = nfs_getattr },
207 VOPNAME_SETATTR, { .vop_setattr = nfs_setattr },
208 VOPNAME_ACCESS, { .vop_access = nfs_access },
209 VOPNAME_LOOKUP, { .vop_lookup = nfs_lookup },
210 VOPNAME_CREATE, { .vop_create = nfs_create },
211 VOPNAME_REMOVE, { .vop_remove = nfs_remove },
212 VOPNAME_LINK, { .vop_link = nfs_link },
213 VOPNAME_RENAME, { .vop_rename = nfs_rename },
214 VOPNAME_MKDIR, { .vop_mkdir = nfs_mkdir },
215 VOPNAME_RMDIR, { .vop_rmdir = nfs_rmdir },
216 VOPNAME_READDIR, { .vop_readdir = nfs_readdir },
217 VOPNAME_SYMLINK, { .vop_symlink = nfs_symlink },
218 VOPNAME_READLINK, { .vop_readlink = nfs_readlink },
219 VOPNAME_FSYNC, { .vop_fsync = nfs_fsync },
220 VOPNAME_INACTIVE, { .vop_inactive = nfs_inactive },
221 VOPNAME_FID, { .vop_fid = nfs_fid },
222 VOPNAME_RWLOCK, { .vop_rwlock = nfs_rwlock },
223 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs_rwunlock },
224 VOPNAME_SEEK, { .vop_seek = nfs_seek },
225 VOPNAME_FRLOCK, { .vop_frlock = nfs_frlock },
226 VOPNAME_SPACE, { .vop_space = nfs_space },
227 VOPNAME_REALVP, { .vop_realvp = nfs_realvp },
228 VOPNAME_GETPAGE, { .vop_getpage = nfs_getpage },
229 VOPNAME_PUTPAGE, { .vop_putpage = nfs_putpage },
230 VOPNAME_MAP, { .vop_map = nfs_map },
231 VOPNAME_ADDMAP, { .vop_addmap = nfs_addmap },
232 VOPNAME_DELMAP, { .vop_delmap = nfs_delmap },
233 VOPNAME_DUMP, { .vop_dump = nfs_dump },
234 VOPNAME_PATHCONF, { .vop_pathconf = nfs_pathconf },
235 VOPNAME_PAGEIO, { .vop_pageio = nfs_pageio },
236 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs_setsecattr },
237 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs_getsecattr },
238 VOPNAME_SHRLOCK, { .vop_shrlock = nfs_shrlock },
239 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
240 NULL, NULL
241 };
242
243 /*
244 * XXX: This is referenced in modstubs.s
245 */
246 struct vnodeops *
247 nfs_getvnodeops(void)
248 {
249 return (nfs_vnodeops);
250 }
251
252 /* ARGSUSED */
253 static int
254 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
255 {
256 int error;
257 struct vattr va;
258 rnode_t *rp;
259 vnode_t *vp;
260
261 vp = *vpp;
262 rp = VTOR(vp);
263 if (nfs_zone() != VTOMI(vp)->mi_zone)
264 return (EIO);
265 mutex_enter(&rp->r_statelock);
266 if (rp->r_cred == NULL) {
267 crhold(cr);
268 rp->r_cred = cr;
269 }
270 mutex_exit(&rp->r_statelock);
271
272 /*
273 * If there is no cached data or if close-to-open
274 * consistency checking is turned off, we can avoid
275 * the over the wire getattr. Otherwise, if the
276 * file system is mounted readonly, then just verify
277 * the caches are up to date using the normal mechanism.
278 * Else, if the file is not mmap'd, then just mark
279 * the attributes as timed out. They will be refreshed
280 * and the caches validated prior to being used.
281 * Else, the file system is mounted writeable so
282 * force an over the wire GETATTR in order to ensure
283 * that all cached data is valid.
284 */
285 if (vp->v_count > 1 ||
286 ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
287 !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
288 if (vn_is_readonly(vp))
289 error = nfs_validate_caches(vp, cr);
290 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
291 PURGE_ATTRCACHE(vp);
292 error = 0;
293 } else {
294 va.va_mask = AT_ALL;
295 error = nfs_getattr_otw(vp, &va, cr);
296 }
297 } else
298 error = 0;
299
300 return (error);
301 }
302
303 /* ARGSUSED */
304 static int
305 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
306 caller_context_t *ct)
307 {
308 rnode_t *rp;
309 int error;
310 struct vattr va;
311
312 /*
313 * zone_enter(2) prevents processes from changing zones with NFS files
314 * open; if we happen to get here from the wrong zone we can't do
315 * anything over the wire.
316 */
317 if (VTOMI(vp)->mi_zone != nfs_zone()) {
318 /*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to
		 * another zone will fail, and one can't lock an NFS file
		 * and then change zones, as that fails too.
324 *
325 * Returning an error here is the sane thing to do. A
326 * subsequent call to VN_RELE() which translates to a
327 * nfs_inactive() will clean up state: if the zone of the
328 * vnode's origin is still alive and kicking, an async worker
329 * thread will handle the request (from the correct zone), and
330 * everything (minus the final nfs_getattr_otw() call) should
331 * be OK. If the zone is going away nfs_async_inactive() will
332 * throw away cached pages inline.
333 */
334 return (EIO);
335 }
336
337 /*
338 * If we are using local locking for this filesystem, then
339 * release all of the SYSV style record locks. Otherwise,
340 * we are doing network locking and we need to release all
341 * of the network locks. All of the locks held by this
342 * process on this file are released no matter what the
343 * incoming reference count is.
344 */
345 if (VTOMI(vp)->mi_flags & MI_LLOCK) {
346 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
347 cleanshares(vp, ttoproc(curthread)->p_pid);
348 } else
349 nfs_lockrelease(vp, flag, offset, cr);
350
351 if (count > 1)
352 return (0);
353
	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled more quickly
	 * and the .nfs* file on the server will get removed.
	 */
359 rp = VTOR(vp);
360 if (rp->r_unldvp != NULL)
361 dnlc_purge_vp(vp);
362
363 /*
	 * If the file was open for write and there are pages,
	 * then if the file system was mounted using the "no-close-
	 * to-open" semantics, start an asynchronous flush of all
	 * of the pages in the file.  Otherwise, do a synchronous
	 * flush and commit of all of the dirty and uncommitted
	 * pages.
371 *
372 * The asynchronous flush of the pages in the "nocto" path
373 * mostly just associates a cred pointer with the rnode so
374 * writes which happen later will have a better chance of
375 * working. It also starts the data being written to the
376 * server, but without unnecessarily delaying the application.
377 */
378 if ((flag & FWRITE) && vn_has_cached_data(vp)) {
379 if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
380 error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
381 cr, ct);
382 if (error == EAGAIN)
383 error = 0;
384 } else
385 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
386 if (!error) {
387 mutex_enter(&rp->r_statelock);
388 error = rp->r_error;
389 rp->r_error = 0;
390 mutex_exit(&rp->r_statelock);
391 }
392 } else {
393 mutex_enter(&rp->r_statelock);
394 error = rp->r_error;
395 rp->r_error = 0;
396 mutex_exit(&rp->r_statelock);
397 }
398
399 /*
400 * If RWRITEATTR is set, then issue an over the wire GETATTR to
401 * refresh the attribute cache with a set of attributes which
402 * weren't returned from a WRITE. This will enable the close-
403 * to-open processing to work.
404 */
405 if (rp->r_flags & RWRITEATTR)
406 (void) nfs_getattr_otw(vp, &va, cr);
407
408 return (error);
409 }
410
411 /* ARGSUSED */
412 static int
413 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
414 caller_context_t *ct)
415 {
416 rnode_t *rp;
417 u_offset_t off;
418 offset_t diff;
419 int on;
420 size_t n;
421 caddr_t base;
422 uint_t flags;
423 int error;
424 mntinfo_t *mi;
425
426 rp = VTOR(vp);
427 mi = VTOMI(vp);
428
429 if (nfs_zone() != mi->mi_zone)
430 return (EIO);
431
432 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
433
434 if (vp->v_type != VREG)
435 return (EISDIR);
436
437 if (uiop->uio_resid == 0)
438 return (0);
439
440 if (uiop->uio_loffset > MAXOFF32_T)
441 return (EFBIG);
442
443 if (uiop->uio_loffset < 0 ||
444 uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
445 return (EINVAL);
446
447 /*
448 * Bypass VM if caching has been disabled (e.g., locking) or if
449 * using client-side direct I/O and the file is not mmap'd and
450 * there are no cached pages.
451 */
452 if ((vp->v_flag & VNOCACHE) ||
453 (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
454 rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
455 !vn_has_cached_data(vp))) {
456 size_t bufsize;
457 size_t resid = 0;
458
		/*
		 * Let's try to do the read in as large a chunk as we can
		 * (Filesystem (NFS client) bsize if possible/needed).
		 * For V3, this is 32K and for V2, this is 8K.
		 */
464 bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
465 base = kmem_alloc(bufsize, KM_SLEEP);
466 do {
467 n = MIN(uiop->uio_resid, bufsize);
468 error = nfsread(vp, base, uiop->uio_offset, n,
469 &resid, cr);
470 if (!error) {
471 n -= resid;
472 error = uiomove(base, n, UIO_READ, uiop);
473 }
474 } while (!error && uiop->uio_resid > 0 && n > 0);
475 kmem_free(base, bufsize);
476 return (error);
477 }
478
479 error = 0;
480
481 do {
482 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
483 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
484 n = MIN(MAXBSIZE - on, uiop->uio_resid);
485
486 error = nfs_validate_caches(vp, cr);
487 if (error)
488 break;
489
490 mutex_enter(&rp->r_statelock);
491 while (rp->r_flags & RINCACHEPURGE) {
492 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
493 mutex_exit(&rp->r_statelock);
494 return (EINTR);
495 }
496 }
497 diff = rp->r_size - uiop->uio_loffset;
498 mutex_exit(&rp->r_statelock);
499 if (diff <= 0)
500 break;
501 if (diff < n)
502 n = (size_t)diff;
503
504 if (vpm_enable) {
505 /*
506 * Copy data.
507 */
508 error = vpm_data_copy(vp, off + on, n, uiop,
509 1, NULL, 0, S_READ);
510 } else {
511 base = segmap_getmapflt(segkmap, vp, off + on, n,
512 1, S_READ);
513 error = uiomove(base + on, n, UIO_READ, uiop);
514 }
515
516 if (!error) {
			/*
			 * If we read a whole block or read to eof, we
			 * won't need this buffer again soon.
			 */
521 mutex_enter(&rp->r_statelock);
522 if (n + on == MAXBSIZE ||
523 uiop->uio_loffset == rp->r_size)
524 flags = SM_DONTNEED;
525 else
526 flags = 0;
527 mutex_exit(&rp->r_statelock);
528 if (vpm_enable) {
529 error = vpm_sync_pages(vp, off, n, flags);
530 } else {
531 error = segmap_release(segkmap, base, flags);
532 }
533 } else {
534 if (vpm_enable) {
535 (void) vpm_sync_pages(vp, off, n, 0);
536 } else {
537 (void) segmap_release(segkmap, base, 0);
538 }
539 }
540 } while (!error && uiop->uio_resid > 0);
541
542 return (error);
543 }
544
545 /* ARGSUSED */
546 static int
547 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
548 caller_context_t *ct)
549 {
550 rnode_t *rp;
551 u_offset_t off;
552 caddr_t base;
553 uint_t flags;
554 int remainder;
555 size_t n;
556 int on;
557 int error;
558 int resid;
559 offset_t offset;
560 rlim_t limit;
561 mntinfo_t *mi;
562
563 rp = VTOR(vp);
564
565 mi = VTOMI(vp);
566 if (nfs_zone() != mi->mi_zone)
567 return (EIO);
568 if (vp->v_type != VREG)
569 return (EISDIR);
570
571 if (uiop->uio_resid == 0)
572 return (0);
573
574 if (ioflag & FAPPEND) {
575 struct vattr va;
576
577 /*
578 * Must serialize if appending.
579 */
580 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
581 nfs_rw_exit(&rp->r_rwlock);
582 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
583 INTR(vp)))
584 return (EINTR);
585 }
586
587 va.va_mask = AT_SIZE;
588 error = nfsgetattr(vp, &va, cr);
589 if (error)
590 return (error);
591 uiop->uio_loffset = va.va_size;
592 }
593
594 if (uiop->uio_loffset > MAXOFF32_T)
595 return (EFBIG);
596
597 offset = uiop->uio_loffset + uiop->uio_resid;
598
599 if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
600 return (EINVAL);
601
602 if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
603 limit = MAXOFF32_T;
604 } else {
605 limit = (rlim_t)uiop->uio_llimit;
606 }
607
608 /*
609 * Check to make sure that the process will not exceed
610 * its limit on file size. It is okay to write up to
611 * the limit, but not beyond. Thus, the write which
612 * reaches the limit will be short and the next write
613 * will return an error.
614 */
615 remainder = 0;
616 if (offset > limit) {
617 remainder = offset - limit;
618 uiop->uio_resid = limit - uiop->uio_offset;
619 if (uiop->uio_resid <= 0) {
620 proc_t *p = ttoproc(curthread);
621
622 uiop->uio_resid += remainder;
623 mutex_enter(&p->p_lock);
624 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
625 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
626 mutex_exit(&p->p_lock);
627 return (EFBIG);
628 }
629 }
630
631 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
632 return (EINTR);
633
634 /*
635 * Bypass VM if caching has been disabled (e.g., locking) or if
636 * using client-side direct I/O and the file is not mmap'd and
637 * there are no cached pages.
638 */
639 if ((vp->v_flag & VNOCACHE) ||
640 (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
641 rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
642 !vn_has_cached_data(vp))) {
643 size_t bufsize;
644 int count;
645 uint_t org_offset;
646
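	/*
	 * The nfs_fwrite label is also reached via goto from the VM
	 * path below when faulting in a page for a write-only file
	 * fails with EACCES; in that case the write is forced through
	 * this direct path.
	 */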
647 nfs_fwrite:
648 if (rp->r_flags & RSTALE) {
649 resid = uiop->uio_resid;
650 offset = uiop->uio_loffset;
651 error = rp->r_error;
			/*
			 * A close may have cleared r_error; if so,
			 * propagate the ESTALE error return properly.
			 */
656 if (error == 0)
657 error = ESTALE;
658 goto bottom;
659 }
660 bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
661 base = kmem_alloc(bufsize, KM_SLEEP);
662 do {
663 resid = uiop->uio_resid;
664 offset = uiop->uio_loffset;
665 count = MIN(uiop->uio_resid, bufsize);
666 org_offset = uiop->uio_offset;
667 error = uiomove(base, count, UIO_WRITE, uiop);
668 if (!error) {
669 error = nfswrite(vp, base, org_offset,
670 count, cr);
671 }
672 } while (!error && uiop->uio_resid > 0);
673 kmem_free(base, bufsize);
674 goto bottom;
675 }
676
677 do {
678 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
679 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
680 n = MIN(MAXBSIZE - on, uiop->uio_resid);
681
682 resid = uiop->uio_resid;
683 offset = uiop->uio_loffset;
684
685 if (rp->r_flags & RSTALE) {
686 error = rp->r_error;
			/*
			 * A close may have cleared r_error; if so,
			 * propagate the ESTALE error return properly.
			 */
691 if (error == 0)
692 error = ESTALE;
693 break;
694 }
695
696 /*
697 * Don't create dirty pages faster than they
698 * can be cleaned so that the system doesn't
699 * get imbalanced. If the async queue is
700 * maxed out, then wait for it to drain before
701 * creating more dirty pages. Also, wait for
702 * any threads doing pagewalks in the vop_getattr
703 * entry points so that they don't block for
704 * long periods.
705 */
706 mutex_enter(&rp->r_statelock);
707 while ((mi->mi_max_threads != 0 &&
708 rp->r_awcount > 2 * mi->mi_max_threads) ||
709 rp->r_gcount > 0) {
710 if (INTR(vp)) {
711 klwp_t *lwp = ttolwp(curthread);
712
713 if (lwp != NULL)
714 lwp->lwp_nostop++;
715 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
716 mutex_exit(&rp->r_statelock);
717 if (lwp != NULL)
718 lwp->lwp_nostop--;
719 error = EINTR;
720 goto bottom;
721 }
722 if (lwp != NULL)
723 lwp->lwp_nostop--;
724 } else
725 cv_wait(&rp->r_cv, &rp->r_statelock);
726 }
727 mutex_exit(&rp->r_statelock);
728
		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This avoids a deadlock when the user buffer is an
		 * mmap of the same file that we are writing.
		 */
735 uio_prefaultpages((long)n, uiop);
736
737 if (vpm_enable) {
738 /*
739 * It will use kpm mappings, so no need to
740 * pass an address.
741 */
742 error = writerp(rp, NULL, n, uiop, 0);
743 } else {
744 if (segmap_kpm) {
745 int pon = uiop->uio_loffset & PAGEOFFSET;
746 size_t pn = MIN(PAGESIZE - pon,
747 uiop->uio_resid);
748 int pagecreate;
749
750 mutex_enter(&rp->r_statelock);
751 pagecreate = (pon == 0) && (pn == PAGESIZE ||
752 uiop->uio_loffset + pn >= rp->r_size);
753 mutex_exit(&rp->r_statelock);
754
755 base = segmap_getmapflt(segkmap, vp, off + on,
756 pn, !pagecreate, S_WRITE);
757
758 error = writerp(rp, base + pon, n, uiop,
759 pagecreate);
760
761 } else {
762 base = segmap_getmapflt(segkmap, vp, off + on,
763 n, 0, S_READ);
764 error = writerp(rp, base + on, n, uiop, 0);
765 }
766 }
767
768 if (!error) {
769 if (mi->mi_flags & MI_NOAC)
770 flags = SM_WRITE;
771 else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
772 /*
773 * Have written a whole block.
774 * Start an asynchronous write
775 * and mark the buffer to
776 * indicate that it won't be
777 * needed again soon.
778 */
779 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
780 } else
781 flags = 0;
782 if ((ioflag & (FSYNC|FDSYNC)) ||
783 (rp->r_flags & ROUTOFSPACE)) {
784 flags &= ~SM_ASYNC;
785 flags |= SM_WRITE;
786 }
787 if (vpm_enable) {
788 error = vpm_sync_pages(vp, off, n, flags);
789 } else {
790 error = segmap_release(segkmap, base, flags);
791 }
792 } else {
793 if (vpm_enable) {
794 (void) vpm_sync_pages(vp, off, n, 0);
795 } else {
796 (void) segmap_release(segkmap, base, 0);
797 }
798 /*
799 * In the event that we got an access error while
800 * faulting in a page for a write-only file just
801 * force a write.
802 */
803 if (error == EACCES)
804 goto nfs_fwrite;
805 }
806 } while (!error && uiop->uio_resid > 0);
807
808 bottom:
809 if (error) {
810 uiop->uio_resid = resid + remainder;
811 uiop->uio_loffset = offset;
812 } else
813 uiop->uio_resid += remainder;
814
815 nfs_rw_exit(&rp->r_lkserlock);
816
817 return (error);
818 }
819
820 /*
821 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
822 */
823 static int
824 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
825 int flags, cred_t *cr)
826 {
827 struct buf *bp;
828 int error;
829
830 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
831 bp = pageio_setup(pp, len, vp, flags);
832 ASSERT(bp != NULL);
833
834 /*
835 * pageio_setup should have set b_addr to 0. This
836 * is correct since we want to do I/O on a page
837 * boundary. bp_mapin will use this addr to calculate
838 * an offset, and then set b_addr to the kernel virtual
839 * address it allocated for us.
840 */
841 ASSERT(bp->b_un.b_addr == 0);
842
843 bp->b_edev = 0;
844 bp->b_dev = 0;
845 bp->b_lblkno = lbtodb(off);
846 bp->b_file = vp;
847 bp->b_offset = (offset_t)off;
848 bp_mapin(bp);
849
850 error = nfs_bio(bp, cr);
851
852 bp_mapout(bp);
853 pageio_done(bp);
854
855 return (error);
856 }
857
/*
 * Write to a file.  Writes to the remote server in the largest size
 * chunks that the server can handle.  Write is synchronous.
 */
862 static int
863 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
864 {
865 rnode_t *rp;
866 mntinfo_t *mi;
867 struct nfswriteargs wa;
868 struct nfsattrstat ns;
869 int error;
870 int tsize;
871 int douprintf;
872
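	/*
	 * douprintf is presumably used by rfs2call() to throttle
	 * "NFS server not responding" console messages so that they
	 * are printed only once per call sequence.
	 */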
873 douprintf = 1;
874
875 rp = VTOR(vp);
876 mi = VTOMI(vp);
877
878 ASSERT(nfs_zone() == mi->mi_zone);
879
880 wa.wa_args = &wa.wa_args_buf;
881 wa.wa_fhandle = *VTOFH(vp);
882
883 do {
884 tsize = MIN(mi->mi_curwrite, count);
885 wa.wa_data = base;
886 wa.wa_begoff = offset;
887 wa.wa_totcount = tsize;
888 wa.wa_count = tsize;
889 wa.wa_offset = offset;
890
891 if (mi->mi_io_kstats) {
892 mutex_enter(&mi->mi_lock);
893 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
894 mutex_exit(&mi->mi_lock);
895 }
896 wa.wa_mblk = NULL;
897 do {
898 error = rfs2call(mi, RFS_WRITE,
899 xdr_writeargs, (caddr_t)&wa,
900 xdr_attrstat, (caddr_t)&ns, cr,
901 &douprintf, &ns.ns_status, 0, NULL);
902 } while (error == ENFS_TRYAGAIN);
903 if (mi->mi_io_kstats) {
904 mutex_enter(&mi->mi_lock);
905 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
906 mutex_exit(&mi->mi_lock);
907 }
908
909 if (!error) {
910 error = geterrno(ns.ns_status);
911 /*
912 * Can't check for stale fhandle and purge caches
913 * here because pages are held by nfs_getpage.
914 * Just mark the attribute cache as timed out
915 * and set RWRITEATTR to indicate that the file
916 * was modified with a WRITE operation.
917 */
918 if (!error) {
919 count -= tsize;
920 base += tsize;
921 offset += tsize;
922 if (mi->mi_io_kstats) {
923 mutex_enter(&mi->mi_lock);
924 KSTAT_IO_PTR(mi->mi_io_kstats)->
925 writes++;
926 KSTAT_IO_PTR(mi->mi_io_kstats)->
927 nwritten += tsize;
928 mutex_exit(&mi->mi_lock);
929 }
930 lwp_stat_update(LWP_STAT_OUBLK, 1);
931 mutex_enter(&rp->r_statelock);
932 PURGE_ATTRCACHE_LOCKED(rp);
933 rp->r_flags |= RWRITEATTR;
934 mutex_exit(&rp->r_statelock);
935 }
936 }
937 } while (!error && count);
938
939 return (error);
940 }
941
/*
 * Read from a file.  Reads data in the largest chunks that our
 * interface can handle.
 */
945 static int
946 nfsread(vnode_t *vp, caddr_t base, uint_t offset,
947 int count, size_t *residp, cred_t *cr)
948 {
949 mntinfo_t *mi;
950 struct nfsreadargs ra;
951 struct nfsrdresult rr;
952 int tsize;
953 int error;
954 int douprintf;
955 failinfo_t fi;
956 rnode_t *rp;
957 struct vattr va;
958 hrtime_t t;
959
960 rp = VTOR(vp);
961 mi = VTOMI(vp);
962
963 ASSERT(nfs_zone() == mi->mi_zone);
964
965 douprintf = 1;
966
967 ra.ra_fhandle = *VTOFH(vp);
968
969 fi.vp = vp;
970 fi.fhp = (caddr_t)&ra.ra_fhandle;
971 fi.copyproc = nfscopyfh;
972 fi.lookupproc = nfslookup;
973 fi.xattrdirproc = acl_getxattrdir2;
974
975 do {
976 if (mi->mi_io_kstats) {
977 mutex_enter(&mi->mi_lock);
978 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
979 mutex_exit(&mi->mi_lock);
980 }
981
982 do {
983 tsize = MIN(mi->mi_curread, count);
984 rr.rr_data = base;
985 ra.ra_offset = offset;
986 ra.ra_totcount = tsize;
987 ra.ra_count = tsize;
988 ra.ra_data = base;
989 t = gethrtime();
990 error = rfs2call(mi, RFS_READ,
991 xdr_readargs, (caddr_t)&ra,
992 xdr_rdresult, (caddr_t)&rr, cr,
993 &douprintf, &rr.rr_status, 0, &fi);
994 } while (error == ENFS_TRYAGAIN);
995
996 if (mi->mi_io_kstats) {
997 mutex_enter(&mi->mi_lock);
998 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
999 mutex_exit(&mi->mi_lock);
1000 }
1001
1002 if (!error) {
1003 error = geterrno(rr.rr_status);
1004 if (!error) {
1005 count -= rr.rr_count;
1006 base += rr.rr_count;
1007 offset += rr.rr_count;
1008 if (mi->mi_io_kstats) {
1009 mutex_enter(&mi->mi_lock);
1010 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1011 KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
1012 rr.rr_count;
1013 mutex_exit(&mi->mi_lock);
1014 }
1015 lwp_stat_update(LWP_STAT_INBLK, 1);
1016 }
1017 }
1018 } while (!error && count && rr.rr_count == tsize);
1019
1020 *residp = count;
1021
1022 if (!error) {
1023 /*
1024 * Since no error occurred, we have the current
1025 * attributes and we need to do a cache check and then
1026 * potentially update the cached attributes. We can't
1027 * use the normal attribute check and cache mechanisms
1028 * because they might cause a cache flush which would
		 * deadlock.  Instead, we just check the cache to see
		 * if the attributes have changed.  If they have, then
		 * we just mark the attributes as out of date.  The next
1032 * time that the attributes are checked, they will be
1033 * out of date, new attributes will be fetched, and
1034 * the page cache will be flushed. If the attributes
1035 * weren't changed, then we just update the cached
1036 * attributes with these attributes.
1037 */
		/*
		 * If NFS_ACL is supported on the server, then the
		 * attributes returned by the server may have minimal
		 * permissions, sometimes denying access to users who
		 * actually have proper access.  To get the proper
		 * attributes, mark the attributes as expired so that
		 * they will be fetched again via the NFS_ACL GETATTR2
		 * procedure.
		 */
1046 error = nattr_to_vattr(vp, &rr.rr_attr, &va);
1047 mutex_enter(&rp->r_statelock);
1048 if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
1049 (mi->mi_flags & MI_ACL)) {
1050 mutex_exit(&rp->r_statelock);
1051 PURGE_ATTRCACHE(vp);
1052 } else {
1053 if (rp->r_mtime <= t) {
1054 nfs_attrcache_va(vp, &va);
1055 }
1056 mutex_exit(&rp->r_statelock);
1057 }
1058 }
1059
1060 return (error);
1061 }
1062
1063 /* ARGSUSED */
1064 static int
1065 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1066 caller_context_t *ct)
1067 {
1068
1069 if (nfs_zone() != VTOMI(vp)->mi_zone)
1070 return (EIO);
1071 switch (cmd) {
1072 case _FIODIRECTIO:
1073 return (nfs_directio(vp, (int)arg, cr));
1074 default:
1075 return (ENOTTY);
1076 }
1077 }
1078
1079 /* ARGSUSED */
1080 static int
1081 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1082 caller_context_t *ct)
1083 {
1084 int error;
1085 rnode_t *rp;
1086
1087 if (nfs_zone() != VTOMI(vp)->mi_zone)
1088 return (EIO);
1089 /*
1090 * If it has been specified that the return value will
1091 * just be used as a hint, and we are only being asked
1092 * for size, fsid or rdevid, then return the client's
1093 * notion of these values without checking to make sure
1094 * that the attribute cache is up to date.
1095 * The whole point is to avoid an over the wire GETATTR
1096 * call.
1097 */
1098 rp = VTOR(vp);
1099 if (flags & ATTR_HINT) {
1100 if (vap->va_mask ==
1101 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1102 mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
1108 vap->va_rdev = rp->r_attr.va_rdev;
1109 mutex_exit(&rp->r_statelock);
1110 return (0);
1111 }
1112 }
1113
	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
1119 if (vap->va_mask & AT_MTIME) {
1120 if (vn_has_cached_data(vp) &&
1121 ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1122 mutex_enter(&rp->r_statelock);
1123 rp->r_gcount++;
1124 mutex_exit(&rp->r_statelock);
1125 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1126 mutex_enter(&rp->r_statelock);
1127 if (error && (error == ENOSPC || error == EDQUOT)) {
1128 if (!rp->r_error)
1129 rp->r_error = error;
1130 }
1131 if (--rp->r_gcount == 0)
1132 cv_broadcast(&rp->r_cv);
1133 mutex_exit(&rp->r_statelock);
1134 }
1135 }
1136
1137 return (nfsgetattr(vp, vap, cr));
1138 }
1139
1140 /*ARGSUSED4*/
1141 static int
1142 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1143 caller_context_t *ct)
1144 {
1145 int error;
1146 uint_t mask;
1147 struct vattr va;
1148
1149 mask = vap->va_mask;
1150
1151 if (mask & AT_NOSET)
1152 return (EINVAL);
1153
1154 if ((mask & AT_SIZE) &&
1155 vap->va_type == VREG &&
1156 vap->va_size > MAXOFF32_T)
1157 return (EFBIG);
1158
1159 if (nfs_zone() != VTOMI(vp)->mi_zone)
1160 return (EIO);
1161
1162 va.va_mask = AT_UID | AT_MODE;
1163
1164 error = nfsgetattr(vp, &va, cr);
1165 if (error)
1166 return (error);
1167
1168 error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1169 vp);
1170
1171 if (error)
1172 return (error);
1173
1174 return (nfssetattr(vp, vap, flags, cr));
1175 }
1176
1177 static int
1178 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1179 {
1180 int error;
1181 uint_t mask;
1182 struct nfssaargs args;
1183 struct nfsattrstat ns;
1184 int douprintf;
1185 rnode_t *rp;
1186 struct vattr va;
1187 mode_t omode;
1188 mntinfo_t *mi;
1189 vsecattr_t *vsp;
1190 hrtime_t t;
1191
1192 mask = vap->va_mask;
1193
1194 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1195
1196 rp = VTOR(vp);
1197
1198 /*
1199 * Only need to flush pages if there are any pages and
1200 * if the file is marked as dirty in some fashion. The
1201 * file must be flushed so that we can accurately
1202 * determine the size of the file and the cached data
1203 * after the SETATTR returns. A file is considered to
1204 * be dirty if it is either marked with RDIRTY, has
1205 * outstanding i/o's active, or is mmap'd. In this
1206 * last case, we can't tell whether there are dirty
1207 * pages, so we flush just to be sure.
1208 */
1209 if (vn_has_cached_data(vp) &&
1210 ((rp->r_flags & RDIRTY) ||
1211 rp->r_count > 0 ||
1212 rp->r_mapcnt > 0)) {
1213 ASSERT(vp->v_type != VCHR);
1214 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1215 if (error && (error == ENOSPC || error == EDQUOT)) {
1216 mutex_enter(&rp->r_statelock);
1217 if (!rp->r_error)
1218 rp->r_error = error;
1219 mutex_exit(&rp->r_statelock);
1220 }
1221 }
1222
1223 /*
1224 * If the system call was utime(2) or utimes(2) and the
1225 * application did not specify the times, then set the
1226 * mtime nanosecond field to 1 billion. This will get
1227 * translated from 1 billion nanoseconds to 1 million
1228 * microseconds in the over the wire request. The
1229 * server will use 1 million in the microsecond field
1230 * to tell whether both the mtime and atime should be
1231 * set to the server's current time.
1232 *
1233 * This is an overload of the protocol and should be
1234 * documented in the NFS Version 2 protocol specification.
1235 */
1236 if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1237 vap->va_mtime.tv_nsec = 1000000000;
1238 if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1239 NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1240 error = vattr_to_sattr(vap, &args.saa_sa);
1241 } else {
1242 /*
1243 * Use server times. vap time values will not be used.
1244 * To ensure no time overflow, make sure vap has
1245 * valid values, but retain the original values.
1246 */
1247 timestruc_t mtime = vap->va_mtime;
1248 timestruc_t atime = vap->va_atime;
1249 time_t now;
1250
1251 now = gethrestime_sec();
1252 if (NFS_TIME_T_OK(now)) {
1253 /* Just in case server does not know of this */
1254 vap->va_mtime.tv_sec = now;
1255 vap->va_atime.tv_sec = now;
1256 } else {
1257 vap->va_mtime.tv_sec = 0;
1258 vap->va_atime.tv_sec = 0;
1259 }
1260 error = vattr_to_sattr(vap, &args.saa_sa);
1261 /* set vap times back on */
1262 vap->va_mtime = mtime;
1263 vap->va_atime = atime;
1264 }
1265 } else {
1266 /* Either do not set times or use the client specified times */
1267 error = vattr_to_sattr(vap, &args.saa_sa);
1268 }
1269 if (error) {
1270 /* req time field(s) overflow - return immediately */
1271 return (error);
1272 }
1273 args.saa_fh = *VTOFH(vp);
1274
1275 va.va_mask = AT_MODE;
1276 error = nfsgetattr(vp, &va, cr);
1277 if (error)
1278 return (error);
1279 omode = va.va_mode;
1280
1281 mi = VTOMI(vp);
1282
1283 douprintf = 1;
1284
1285 t = gethrtime();
1286
1287 error = rfs2call(mi, RFS_SETATTR,
1288 xdr_saargs, (caddr_t)&args,
1289 xdr_attrstat, (caddr_t)&ns, cr,
1290 &douprintf, &ns.ns_status, 0, NULL);
1291
1292 /*
1293 * Purge the access cache and ACL cache if changing either the
1294 * owner of the file, the group owner, or the mode. These may
1295 * change the access permissions of the file, so purge old
1296 * information and start over again.
1297 */
1298 if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1299 (void) nfs_access_purge_rp(rp);
1300 if (rp->r_secattr != NULL) {
1301 mutex_enter(&rp->r_statelock);
1302 vsp = rp->r_secattr;
1303 rp->r_secattr = NULL;
1304 mutex_exit(&rp->r_statelock);
1305 if (vsp != NULL)
1306 nfs_acl_free(vsp);
1307 }
1308 }
1309
1310 if (!error) {
1311 error = geterrno(ns.ns_status);
1312 if (!error) {
1313 /*
1314 * If changing the size of the file, invalidate
1315 * any local cached data which is no longer part
1316 * of the file. We also possibly invalidate the
1317 * last page in the file. We could use
1318 * pvn_vpzero(), but this would mark the page as
1319 * modified and require it to be written back to
1320 * the server for no particularly good reason.
1321 * This way, if we access it, then we bring it
1322 * back in. A read should be cheaper than a
1323 * write.
1324 */
1325 if (mask & AT_SIZE) {
1326 nfs_invalidate_pages(vp,
1327 (vap->va_size & PAGEMASK), cr);
1328 }
1329 (void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by the server may have minimal
			 * permissions, sometimes denying access to users who
			 * actually have proper access.  To get the proper
			 * attributes, mark the attributes as expired so that
			 * they will be fetched again via the NFS_ACL GETATTR2
			 * procedure.
			 */
1338 if (mi->mi_flags & MI_ACL) {
1339 PURGE_ATTRCACHE(vp);
1340 }
			/*
			 * This next check attempts to deal with NFS
			 * servers which cannot handle increasing
			 * the size of the file via setattr.  Most
			 * of these servers do not return an error,
			 * but do not change the size of the file.
			 * Hence this check, and the attempt to set
			 * the file size by writing one byte at the
			 * offset of the new end of the file.
			 */
1351 if ((mask & AT_SIZE) &&
1352 ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1353 char zb = '\0';
1354
1355 error = nfswrite(vp, &zb,
1356 vap->va_size - sizeof (zb),
1357 sizeof (zb), cr);
1358 }
1359 /*
1360 * Some servers will change the mode to clear the setuid
1361 * and setgid bits when changing the uid or gid. The
1362 * client needs to compensate appropriately.
1363 */
1364 if (mask & (AT_UID | AT_GID)) {
1365 int terror;
1366
1367 va.va_mask = AT_MODE;
1368 terror = nfsgetattr(vp, &va, cr);
1369 if (!terror &&
1370 (((mask & AT_MODE) &&
1371 va.va_mode != vap->va_mode) ||
1372 (!(mask & AT_MODE) &&
1373 va.va_mode != omode))) {
1374 va.va_mask = AT_MODE;
1375 if (mask & AT_MODE)
1376 va.va_mode = vap->va_mode;
1377 else
1378 va.va_mode = omode;
1379 (void) nfssetattr(vp, &va, 0, cr);
1380 }
1381 }
1382 } else {
1383 PURGE_ATTRCACHE(vp);
1384 PURGE_STALE_FH(error, vp, cr);
1385 }
1386 } else {
1387 PURGE_ATTRCACHE(vp);
1388 }
1389
1390 return (error);
1391 }
1392
1393 static int
1394 nfs_accessx(void *vp, int mode, cred_t *cr)
1395 {
1396 ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1397 return (nfs_access(vp, mode, 0, cr, NULL));
1398 }
1399
1400 /* ARGSUSED */
1401 static int
1402 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1403 {
1404 struct vattr va;
1405 int error;
1406 mntinfo_t *mi;
1407 int shift = 0;
1408
1409 mi = VTOMI(vp);
1410
1411 if (nfs_zone() != mi->mi_zone)
1412 return (EIO);
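	/*
	 * acl_access2() can presumably clear MI_ACL in mi_flags when
	 * the server turns out not to support the NFS_ACL protocol,
	 * which is why the flag is rechecked after the call; if it was
	 * cleared, fall through to the standard mode-bit check below.
	 */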
1413 if (mi->mi_flags & MI_ACL) {
1414 error = acl_access2(vp, mode, flags, cr);
1415 if (mi->mi_flags & MI_ACL)
1416 return (error);
1417 }
1418
1419 va.va_mask = AT_MODE | AT_UID | AT_GID;
1420 error = nfsgetattr(vp, &va, cr);
1421 if (error)
1422 return (error);
1423
1424 /*
1425 * Disallow write attempts on read-only
1426 * file systems, unless the file is a
1427 * device node.
1428 */
1429 if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1430 return (EROFS);
1431
1432 /*
1433 * Disallow attempts to access mandatory lock files.
1434 */
1435 if ((mode & (VWRITE | VREAD | VEXEC)) &&
1436 MANDLOCK(vp, va.va_mode))
1437 return (EACCES);
1438
1439 /*
1440 * Access check is based on only
1441 * one of owner, group, public.
1442 * If not owner, then check group.
1443 * If not a member of the group,
1444 * then check public access.
1445 */
1446 if (crgetuid(cr) != va.va_uid) {
1447 shift += 3;
1448 if (!groupmember(va.va_gid, cr))
1449 shift += 3;
1450 }
1451
1452 return (secpolicy_vnode_access2(cr, vp, va.va_uid,
1453 va.va_mode << shift, mode));
1454 }
1455
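/*
 * Tunable: non-zero enables caching of symlink contents in the rnode.
 */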
1456 static int nfs_do_symlink_cache = 1;
1457
1458 /* ARGSUSED */
1459 static int
1460 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1461 {
1462 int error;
1463 struct nfsrdlnres rl;
1464 rnode_t *rp;
1465 int douprintf;
1466 failinfo_t fi;
1467
	/*
	 * We want to be consistent with UFS semantics so we will return
	 * EINVAL instead of ENXIO.  This violates the XNFS spec and
	 * RFC 1094, which are wrong anyway.  BUGID 1138002.
	 */
1473 if (vp->v_type != VLNK)
1474 return (EINVAL);
1475
1476 if (nfs_zone() != VTOMI(vp)->mi_zone)
1477 return (EIO);
1478
1479 rp = VTOR(vp);
1480 if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1481 error = nfs_validate_caches(vp, cr);
1482 if (error)
1483 return (error);
1484 mutex_enter(&rp->r_statelock);
1485 if (rp->r_symlink.contents != NULL) {
1486 error = uiomove(rp->r_symlink.contents,
1487 rp->r_symlink.len, UIO_READ, uiop);
1488 mutex_exit(&rp->r_statelock);
1489 return (error);
1490 }
1491 mutex_exit(&rp->r_statelock);
	}

	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1496
1497 fi.vp = vp;
1498 fi.fhp = NULL; /* no need to update, filehandle not copied */
1499 fi.copyproc = nfscopyfh;
1500 fi.lookupproc = nfslookup;
1501 fi.xattrdirproc = acl_getxattrdir2;
1502
1503 douprintf = 1;
1504
1505 error = rfs2call(VTOMI(vp), RFS_READLINK,
1506 xdr_readlink, (caddr_t)VTOFH(vp),
1507 xdr_rdlnres, (caddr_t)&rl, cr,
1508 &douprintf, &rl.rl_status, 0, &fi);
1509
	if (error) {
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1513 return (error);
1514 }
1515
1516 error = geterrno(rl.rl_status);
1517 if (!error) {
1518 error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1519 if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1520 mutex_enter(&rp->r_statelock);
1521 if (rp->r_symlink.contents == NULL) {
1522 rp->r_symlink.contents = rl.rl_data;
1523 rp->r_symlink.len = (int)rl.rl_count;
1524 rp->r_symlink.size = NFS_MAXPATHLEN;
1525 mutex_exit(&rp->r_statelock);
1526 } else {
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)rl.rl_data,
				    NFS_MAXPATHLEN);
1531 }
		} else {
			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1535 }
	} else {
		PURGE_STALE_FH(error, vp, cr);
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1540 }
1541
1542 /*
1543 * Conform to UFS semantics (see comment above)
1544 */
1545 return (error == ENXIO ? EINVAL : error);
1546 }
1547
1548 /*
1549 * Flush local dirty pages to stable storage on the server.
1550 *
1551 * If FNODSYNC is specified, then there is nothing to do because
1552 * metadata changes are not cached on the client before being
1553 * sent to the server.
1554 */
1555 /* ARGSUSED */
1556 static int
1557 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1558 {
1559 int error;
1560
1561 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1562 return (0);
1563
1564 if (nfs_zone() != VTOMI(vp)->mi_zone)
1565 return (EIO);
1566
1567 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1568 if (!error)
1569 error = VTOR(vp)->r_error;
1570 return (error);
1571 }
1572
1573
/*
 * Weirdness: if the file was removed or was the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
1579 /* ARGSUSED */
1580 static void
1581 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1582 {
1583 rnode_t *rp;
1584
1585 ASSERT(vp != DNLC_NO_VNODE);
1586
1587 /*
1588 * If this is coming from the wrong zone, we let someone in the right
1589 * zone take care of it asynchronously. We can get here due to
1590 * VN_RELE() being called from pageout() or fsflush(). This call may
1591 * potentially turn into an expensive no-op if, for instance, v_count
1592 * gets incremented in the meantime, but it's still correct.
1593 */
1594 if (nfs_zone() != VTOMI(vp)->mi_zone) {
1595 nfs_async_inactive(vp, cr, nfs_inactive);
1596 return;
1597 }
1598
1599 rp = VTOR(vp);
1600 redo:
1601 if (rp->r_unldvp != NULL) {
1602 /*
1603 * Save the vnode pointer for the directory where the
1604 * unlinked-open file got renamed, then set it to NULL
1605 * to prevent another thread from getting here before
1606 * we're done with the remove. While we have the
1607 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way,
		 * the unl* fields could become inconsistent with respect
1610 * to each other due to a race condition between this
1611 * code and nfs_remove(). See bug report 1034328.
1612 */
1613 mutex_enter(&rp->r_statelock);
1614 if (rp->r_unldvp != NULL) {
1615 vnode_t *unldvp;
1616 char *unlname;
1617 cred_t *unlcred;
1618 struct nfsdiropargs da;
1619 enum nfsstat status;
1620 int douprintf;
1621 int error;
1622
1623 unldvp = rp->r_unldvp;
1624 rp->r_unldvp = NULL;
1625 unlname = rp->r_unlname;
1626 rp->r_unlname = NULL;
1627 unlcred = rp->r_unlcred;
1628 rp->r_unlcred = NULL;
1629 mutex_exit(&rp->r_statelock);
1630
1631 /*
1632 * If there are any dirty pages left, then flush
1633 * them. This is unfortunate because they just
1634 * may get thrown away during the remove operation,
1635 * but we have to do this for correctness.
1636 */
1637 if (vn_has_cached_data(vp) &&
1638 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1639 ASSERT(vp->v_type != VCHR);
1640 error = nfs_putpage(vp, (offset_t)0, 0, 0,
1641 cr, ct);
1642 if (error) {
1643 mutex_enter(&rp->r_statelock);
1644 if (!rp->r_error)
1645 rp->r_error = error;
1646 mutex_exit(&rp->r_statelock);
1647 }
1648 }
1649
1650 /*
1651 * Do the remove operation on the renamed file
1652 */
1653 setdiropargs(&da, unlname, unldvp);
1654
1655 douprintf = 1;
1656
1657 (void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1658 xdr_diropargs, (caddr_t)&da,
1659 xdr_enum, (caddr_t)&status, unlcred,
1660 &douprintf, &status, 0, NULL);
1661
1662 if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1663 nfs_purge_rddir_cache(unldvp);
1664 PURGE_ATTRCACHE(unldvp);
1665
1666 /*
1667 * Release stuff held for the remove
1668 */
1669 VN_RELE(unldvp);
1670 kmem_free(unlname, MAXNAMELEN);
1671 crfree(unlcred);
1672 goto redo;
1673 }
1674 mutex_exit(&rp->r_statelock);
1675 }
1676
1677 rp_addfree(rp, cr);
1678 }
1679
1680 /*
1681 * Remote file system operations having to do with directory manipulation.
1682 */
1683
1684 /* ARGSUSED */
1685 static int
1686 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1687 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1688 int *direntflags, pathname_t *realpnp)
1689 {
1690 int error;
1691 vnode_t *vp;
1692 vnode_t *avp = NULL;
1693 rnode_t *drp;
1694
1695 if (nfs_zone() != VTOMI(dvp)->mi_zone)
1696 return (EPERM);
1697
1698 drp = VTOR(dvp);
1699
1700 /*
1701 * Are we looking up extended attributes? If so, "dvp" is
1702 * the file or directory for which we want attributes, and
1703 * we need a lookup of the hidden attribute directory
1704 * before we lookup the rest of the path.
1705 */
1706 if (flags & LOOKUP_XATTR) {
1707 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1708 mntinfo_t *mi;
1709
1710 mi = VTOMI(dvp);
1711 if (!(mi->mi_flags & MI_EXTATTR))
1712 return (EINVAL);
1713
1714 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1715 return (EINTR);
1716
1717 (void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1718 if (avp == NULL)
1719 error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1720 else
1721 error = 0;
1722
1723 nfs_rw_exit(&drp->r_rwlock);
1724
1725 if (error) {
1726 if (mi->mi_flags & MI_EXTATTR)
1727 return (error);
1728 return (EINVAL);
1729 }
1730 dvp = avp;
1731 drp = VTOR(dvp);
1732 }
1733
1734 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1735 error = EINTR;
1736 goto out;
1737 }
1738
1739 error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1740
1741 nfs_rw_exit(&drp->r_rwlock);
1742
1743 /*
1744 * If vnode is a device, create special vnode.
1745 */
1746 if (!error && IS_DEVVP(*vpp)) {
1747 vp = *vpp;
1748 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1749 VN_RELE(vp);
1750 }
1751
1752 out:
1753 if (avp != NULL)
1754 VN_RELE(avp);
1755
1756 return (error);
1757 }
1758
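/*
 * Tunable: non-zero enables entering negative (ENOENT) lookup results
 * into the DNLC.
 */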
1759 static int nfs_lookup_neg_cache = 1;
1760
1761 #ifdef DEBUG
1762 static int nfs_lookup_dnlc_hits = 0;
1763 static int nfs_lookup_dnlc_misses = 0;
1764 static int nfs_lookup_dnlc_neg_hits = 0;
1765 static int nfs_lookup_dnlc_disappears = 0;
1766 static int nfs_lookup_dnlc_lookups = 0;
1767 #endif
1768
1769 /* ARGSUSED */
1770 int
1771 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1772 int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1773 {
1774 int error;
1775
1776 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1777
1778 /*
1779 * If lookup is for "", just return dvp. Don't need
1780 * to send it over the wire, look it up in the dnlc,
1781 * or perform any access checks.
1782 */
1783 if (*nm == '\0') {
1784 VN_HOLD(dvp);
1785 *vpp = dvp;
1786 return (0);
1787 }
1788
1789 /*
1790 * Can't do lookups in non-directories.
1791 */
1792 if (dvp->v_type != VDIR)
1793 return (ENOTDIR);
1794
1795 /*
1796 * If we're called with RFSCALL_SOFT, it's important that
1797 * the only rfscall is one we make directly; if we permit
1798 * an access call because we're looking up "." or validating
1799 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have RFSCALL_SOFT set.
1801 */
1802 if (rfscall_flags & RFSCALL_SOFT)
1803 goto callit;
1804
1805 /*
1806 * If lookup is for ".", just return dvp. Don't need
1807 * to send it over the wire or look it up in the dnlc,
1808 * just need to check access.
1809 */
1810 if (strcmp(nm, ".") == 0) {
1811 error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1812 if (error)
1813 return (error);
1814 VN_HOLD(dvp);
1815 *vpp = dvp;
1816 return (0);
1817 }
1818
1819 /*
1820 * Lookup this name in the DNLC. If there was a valid entry,
1821 * then return the results of the lookup.
1822 */
1823 error = nfslookup_dnlc(dvp, nm, vpp, cr);
1824 if (error || *vpp != NULL)
1825 return (error);
1826
1827 callit:
1828 error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1829
1830 return (error);
1831 }
1832
1833 static int
1834 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1835 {
1836 int error;
1837 vnode_t *vp;
1838
1839 ASSERT(*nm != '\0');
1840 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1841
1842 /*
1843 * Lookup this name in the DNLC. If successful, then validate
1844 * the caches and then recheck the DNLC. The DNLC is rechecked
1845 * just in case this entry got invalidated during the call
1846 * to nfs_validate_caches.
1847 *
1848 * An assumption is being made that it is safe to say that a
1849 * file exists which may not on the server. Any operations to
1850 * the server will fail with ESTALE.
1851 */
1852 #ifdef DEBUG
1853 nfs_lookup_dnlc_lookups++;
1854 #endif
1855 vp = dnlc_lookup(dvp, nm);
1856 if (vp != NULL) {
1857 VN_RELE(vp);
1858 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1859 PURGE_ATTRCACHE(dvp);
1860 }
1861 error = nfs_validate_caches(dvp, cr);
1862 if (error)
1863 return (error);
1864 vp = dnlc_lookup(dvp, nm);
1865 if (vp != NULL) {
1866 error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1867 if (error) {
1868 VN_RELE(vp);
1869 return (error);
1870 }
1871 if (vp == DNLC_NO_VNODE) {
1872 VN_RELE(vp);
1873 #ifdef DEBUG
1874 nfs_lookup_dnlc_neg_hits++;
1875 #endif
1876 return (ENOENT);
1877 }
1878 *vpp = vp;
1879 #ifdef DEBUG
1880 nfs_lookup_dnlc_hits++;
1881 #endif
1882 return (0);
1883 }
1884 #ifdef DEBUG
1885 nfs_lookup_dnlc_disappears++;
1886 #endif
1887 }
1888 #ifdef DEBUG
1889 else
1890 nfs_lookup_dnlc_misses++;
1891 #endif
1892
1893 *vpp = NULL;
1894
1895 return (0);
1896 }
1897
1898 static int
1899 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1900 int rfscall_flags)
1901 {
1902 int error;
1903 struct nfsdiropargs da;
1904 struct nfsdiropres dr;
1905 int douprintf;
1906 failinfo_t fi;
1907 hrtime_t t;
1908
1909 ASSERT(*nm != '\0');
1910 ASSERT(dvp->v_type == VDIR);
1911 ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1912
1913 setdiropargs(&da, nm, dvp);
1914
1915 fi.vp = dvp;
1916 fi.fhp = NULL; /* no need to update, filehandle not copied */
1917 fi.copyproc = nfscopyfh;
1918 fi.lookupproc = nfslookup;
1919 fi.xattrdirproc = acl_getxattrdir2;
1920
1921 douprintf = 1;
1922
1923 t = gethrtime();
1924
1925 error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1926 xdr_diropargs, (caddr_t)&da,
1927 xdr_diropres, (caddr_t)&dr, cr,
1928 &douprintf, &dr.dr_status, rfscall_flags, &fi);
1929
1930 if (!error) {
1931 error = geterrno(dr.dr_status);
1932 if (!error) {
1933 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1934 dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by the server may have minimal
			 * permissions, sometimes denying access to users who
			 * actually have proper access.  To get the proper
			 * attributes, mark the attributes as expired so that
			 * they will be fetched again via the NFS_ACL GETATTR2
			 * procedure.
			 */
1943 if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1944 PURGE_ATTRCACHE(*vpp);
1945 }
1946 if (!(rfscall_flags & RFSCALL_SOFT))
1947 dnlc_update(dvp, nm, *vpp);
1948 } else {
1949 PURGE_STALE_FH(error, dvp, cr);
1950 if (error == ENOENT && nfs_lookup_neg_cache)
1951 dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1952 }
1953 }
1954
1955 return (error);
1956 }
1957
1958 /* ARGSUSED */
1959 static int
1960 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1961 int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1962 vsecattr_t *vsecp)
1963 {
1964 int error;
1965 struct nfscreatargs args;
1966 struct nfsdiropres dr;
1967 int douprintf;
1968 vnode_t *vp;
1969 rnode_t *rp;
1970 struct vattr vattr;
1971 rnode_t *drp;
1972 vnode_t *tempvp;
1973 hrtime_t t;
1974
1975 drp = VTOR(dvp);
1976
1977 if (nfs_zone() != VTOMI(dvp)->mi_zone)
1978 return (EPERM);
1979 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1980 return (EINTR);
1981
1982 /*
1983 * We make a copy of the attributes because the caller does not
1984 * expect us to change what va points to.
1985 */
1986 vattr = *va;
1987
1988 /*
1989 * If the pathname is "", just use dvp. Don't need
1990 * to send it over the wire, look it up in the dnlc,
1991 * or perform any access checks.
1992 */
1993 if (*nm == '\0') {
1994 error = 0;
1995 VN_HOLD(dvp);
1996 vp = dvp;
1997 /*
1998 * If the pathname is ".", just use dvp. Don't need
1999 * to send it over the wire or look it up in the dnlc,
2000 * just need to check access.
2001 */
2002 } else if (strcmp(nm, ".") == 0) {
2003 error = nfs_access(dvp, VEXEC, 0, cr, ct);
2004 if (error) {
2005 nfs_rw_exit(&drp->r_rwlock);
2006 return (error);
2007 }
2008 VN_HOLD(dvp);
2009 vp = dvp;
2010 /*
2011 * We need to go over the wire, just to be sure whether the
2012 * file exists or not. Using the DNLC can be dangerous in
2013 * this case when making a decision regarding existence.
2014 */
2015 } else {
2016 error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2017 }
2018 if (!error) {
2019 if (exclusive == EXCL)
2020 error = EEXIST;
2021 else if (vp->v_type == VDIR && (mode & VWRITE))
2022 error = EISDIR;
2023 else {
2024 /*
2025 * If vnode is a device, create special vnode.
2026 */
2027 if (IS_DEVVP(vp)) {
2028 tempvp = vp;
2029 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2030 VN_RELE(tempvp);
2031 }
2032 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2033 if ((vattr.va_mask & AT_SIZE) &&
2034 vp->v_type == VREG) {
2035 vattr.va_mask = AT_SIZE;
2036 error = nfssetattr(vp, &vattr, 0, cr);
2037
2038 if (!error) {
2039 /*
2040 * Existing file was truncated;
2041 * emit a create event.
2042 */
2043 vnevent_create(vp, ct);
2044 }
2045 }
2046 }
2047 }
2048 nfs_rw_exit(&drp->r_rwlock);
2049 if (error) {
2050 VN_RELE(vp);
2051 } else {
2052 *vpp = vp;
2053 }
2054 return (error);
2055 }
2056
2057 ASSERT(vattr.va_mask & AT_TYPE);
2058 if (vattr.va_type == VREG) {
2059 ASSERT(vattr.va_mask & AT_MODE);
2060 if (MANDMODE(vattr.va_mode)) {
2061 nfs_rw_exit(&drp->r_rwlock);
2062 return (EACCES);
2063 }
2064 }
2065
2066 dnlc_remove(dvp, nm);
2067
2068 setdiropargs(&args.ca_da, nm, dvp);
2069
2070 /*
2071 * Decide what the group-id of the created file should be.
2072 * Set it in attribute list as advisory...then do a setattr
2073 * if the server didn't get it right the first time.
2074 */
2075 error = setdirgid(dvp, &vattr.va_gid, cr);
2076 if (error) {
2077 nfs_rw_exit(&drp->r_rwlock);
2078 return (error);
2079 }
2080 vattr.va_mask |= AT_GID;
2081
	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can whack the protocol.
	 */
2086 #define IFCHR 0020000 /* character special */
2087 #define IFBLK 0060000 /* block special */
2088 #define IFSOCK 0140000 /* socket */
2089
	/*
	 * dev_t is uint_t in 5.x and short in 4.x.  4.x supports 8-bit
	 * majors; 5.x supports 14-bit majors.  5.x supports 18 bits in
	 * the minor number where 4.x supports 8 bits.  If the 5.x
	 * major and minor numbers are both <= 8 bits long, compress the
	 * device number before sending it.  Otherwise, the 4.x server
	 * will not create the device with the correct device number and
	 * nothing can be done about this.
	 */
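	/*
	 * Worked example (illustrative; the exact field widths are those
	 * of the SO4_* and L_BITSMINOR32 macros): a device with major 13
	 * and minor 2 fits in 8+8 bits, so the mask test below passes and
	 * the compressed encoding, roughly (13 << 8) | 2 == 0x0d02 as
	 * produced by nfsv2_cmpdev(), is sent in va_size.  A device with
	 * minor 300 fails the test and the expanded 32-bit form is sent,
	 * which a 4.x server cannot decode correctly.
	 */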
2099 if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2100 dev_t d = vattr.va_rdev;
2101 dev32_t dev32;
2102
2103 if (vattr.va_type == VCHR)
2104 vattr.va_mode |= IFCHR;
2105 else
2106 vattr.va_mode |= IFBLK;
2107
2108 (void) cmpldev(&dev32, d);
2109 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2110 vattr.va_size = (u_offset_t)dev32;
2111 else
2112 vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2113
2114 vattr.va_mask |= AT_MODE|AT_SIZE;
2115 } else if (vattr.va_type == VFIFO) {
		vattr.va_mode |= IFCHR;	/* extra kludge for named pipes */
2117 vattr.va_size = (u_offset_t)NFS_FIFO_DEV; /* blech */
2118 vattr.va_mask |= AT_MODE|AT_SIZE;
2119 } else if (vattr.va_type == VSOCK) {
2120 vattr.va_mode |= IFSOCK;
2121 /*
		 * To avoid triggering bugs in the servers, set AT_SIZE
2123 * (all other RFS_CREATE calls set this).
2124 */
2125 vattr.va_size = 0;
2126 vattr.va_mask |= AT_MODE|AT_SIZE;
2127 }
2128
2129 args.ca_sa = &args.ca_sa_buf;
2130 error = vattr_to_sattr(&vattr, args.ca_sa);
2131 if (error) {
2132 /* req time field(s) overflow - return immediately */
2133 nfs_rw_exit(&drp->r_rwlock);
2134 return (error);
2135 }
2136
2137 douprintf = 1;
2138
2139 t = gethrtime();
2140
2141 error = rfs2call(VTOMI(dvp), RFS_CREATE,
2142 xdr_creatargs, (caddr_t)&args,
2143 xdr_diropres, (caddr_t)&dr, cr,
2144 &douprintf, &dr.dr_status, 0, NULL);
2145
2146 PURGE_ATTRCACHE(dvp); /* mod time changed */
2147
2148 if (!error) {
2149 error = geterrno(dr.dr_status);
2150 if (!error) {
2151 if (HAVE_RDDIR_CACHE(drp))
2152 nfs_purge_rddir_cache(dvp);
2153 vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2154 dvp->v_vfsp, t, cr, NULL, NULL);
2155 /*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by the server may contain only
			 * minimal permissions, sometimes denying access to
			 * users who have proper access.  To get the proper
			 * attributes, mark them as expired so that they will
			 * be fetched again via the NFS_ACL GETATTR2 procedure.
2162 */
2163 if (VTOMI(vp)->mi_flags & MI_ACL) {
2164 PURGE_ATTRCACHE(vp);
2165 }
2166 dnlc_update(dvp, nm, vp);
2167 rp = VTOR(vp);
2168 if (vattr.va_size == 0) {
2169 mutex_enter(&rp->r_statelock);
2170 rp->r_size = 0;
2171 mutex_exit(&rp->r_statelock);
2172 if (vn_has_cached_data(vp)) {
2173 ASSERT(vp->v_type != VCHR);
2174 nfs_invalidate_pages(vp,
2175 (u_offset_t)0, cr);
2176 }
2177 }
2178
2179 /*
2180 * Make sure the gid was set correctly.
2181 * If not, try to set it (but don't lose
2182 * any sleep over it).
2183 */
2184 if (vattr.va_gid != rp->r_attr.va_gid) {
2185 vattr.va_mask = AT_GID;
2186 (void) nfssetattr(vp, &vattr, 0, cr);
2187 }
2188
2189 /*
			 * If the vnode is a device, create a special vnode.
2191 */
2192 if (IS_DEVVP(vp)) {
2193 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2194 VN_RELE(vp);
2195 } else
2196 *vpp = vp;
2197 } else {
2198 PURGE_STALE_FH(error, dvp, cr);
2199 }
2200 }
2201
2202 nfs_rw_exit(&drp->r_rwlock);
2203
2204 return (error);
2205 }
2206
2207 /*
2208 * Weirdness: if the vnode to be removed is open
2209 * we rename it instead of removing it and nfs_inactive
2210 * will remove the new name.
2211 */
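/*
 * An illustrative sketch of that flow (the exact temporary name is
 * newname()'s business and is not relied upon here):
 *
 *	remove "f" while "f" is still open
 *	    -> nfsrename(dvp, "f", dvp, tmpname, ...)  over the wire
 *	    -> rp->r_unldvp, r_unlname, r_unlcred record the new name
 *	    -> nfs_inactive() removes tmpname on the last VN_RELE()
 */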
2212 /* ARGSUSED */
2213 static int
2214 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2215 {
2216 int error;
2217 struct nfsdiropargs da;
2218 enum nfsstat status;
2219 vnode_t *vp;
2220 char *tmpname;
2221 int douprintf;
2222 rnode_t *rp;
2223 rnode_t *drp;
2224
2225 if (nfs_zone() != VTOMI(dvp)->mi_zone)
2226 return (EPERM);
2227 drp = VTOR(dvp);
2228 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2229 return (EINTR);
2230
2231 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2232 if (error) {
2233 nfs_rw_exit(&drp->r_rwlock);
2234 return (error);
2235 }
2236
2237 if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2238 VN_RELE(vp);
2239 nfs_rw_exit(&drp->r_rwlock);
2240 return (EPERM);
2241 }
2242
2243 /*
2244 * First just remove the entry from the name cache, as it
2245 * is most likely the only entry for this vp.
2246 */
2247 dnlc_remove(dvp, nm);
2248
2249 /*
	 * If the file has a v_count > 1 then there may be more than one
	 * entry in the name cache due to multiple links or an open file,
	 * but we don't have the real reference count, so flush all
	 * possible entries.
2254 */
2255 if (vp->v_count > 1)
2256 dnlc_purge_vp(vp);
2257
2258 /*
	 * Now we have the real reference count on the vnode.
2260 */
2261 rp = VTOR(vp);
2262 mutex_enter(&rp->r_statelock);
2263 if (vp->v_count > 1 &&
2264 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2265 mutex_exit(&rp->r_statelock);
2266 tmpname = newname();
2267 error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2268 if (error)
2269 kmem_free(tmpname, MAXNAMELEN);
2270 else {
2271 mutex_enter(&rp->r_statelock);
2272 if (rp->r_unldvp == NULL) {
2273 VN_HOLD(dvp);
2274 rp->r_unldvp = dvp;
2275 if (rp->r_unlcred != NULL)
2276 crfree(rp->r_unlcred);
2277 crhold(cr);
2278 rp->r_unlcred = cr;
2279 rp->r_unlname = tmpname;
2280 } else {
2281 kmem_free(rp->r_unlname, MAXNAMELEN);
2282 rp->r_unlname = tmpname;
2283 }
2284 mutex_exit(&rp->r_statelock);
2285 }
2286 } else {
2287 mutex_exit(&rp->r_statelock);
2288 /*
2289 * We need to flush any dirty pages which happen to
2290 * be hanging around before removing the file. This
		 * shouldn't happen very often and mostly happens on file
2292 * systems mounted "nocto".
2293 */
2294 if (vn_has_cached_data(vp) &&
2295 ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2296 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2297 if (error && (error == ENOSPC || error == EDQUOT)) {
2298 mutex_enter(&rp->r_statelock);
2299 if (!rp->r_error)
2300 rp->r_error = error;
2301 mutex_exit(&rp->r_statelock);
2302 }
2303 }
2304
2305 setdiropargs(&da, nm, dvp);
2306
2307 douprintf = 1;
2308
2309 error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2310 xdr_diropargs, (caddr_t)&da,
2311 xdr_enum, (caddr_t)&status, cr,
2312 &douprintf, &status, 0, NULL);
2313
2314 /*
		 * The xattr dir may be gone after the last attr is
		 * removed, so flush it from the dnlc.
2317 */
2318 if (dvp->v_flag & V_XATTRDIR)
2319 dnlc_purge_vp(dvp);
2320
2321 PURGE_ATTRCACHE(dvp); /* mod time changed */
2322 PURGE_ATTRCACHE(vp); /* link count changed */
2323
2324 if (!error) {
2325 error = geterrno(status);
2326 if (!error) {
2327 if (HAVE_RDDIR_CACHE(drp))
2328 nfs_purge_rddir_cache(dvp);
2329 } else {
2330 PURGE_STALE_FH(error, dvp, cr);
2331 }
2332 }
2333 }
2334
2335 if (error == 0) {
2336 vnevent_remove(vp, dvp, nm, ct);
2337 }
2338 VN_RELE(vp);
2339
2340 nfs_rw_exit(&drp->r_rwlock);
2341
2342 return (error);
2343 }
2344
2345 /* ARGSUSED */
2346 static int
2347 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2348 caller_context_t *ct, int flags)
2349 {
2350 int error;
2351 struct nfslinkargs args;
2352 enum nfsstat status;
2353 vnode_t *realvp;
2354 int douprintf;
2355 rnode_t *tdrp;
2356
2357 if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2358 return (EPERM);
2359 if (VOP_REALVP(svp, &realvp, ct) == 0)
2360 svp = realvp;
2361
2362 args.la_from = VTOFH(svp);
2363 setdiropargs(&args.la_to, tnm, tdvp);
2364
2365 tdrp = VTOR(tdvp);
2366 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2367 return (EINTR);
2368
2369 dnlc_remove(tdvp, tnm);
2370
2371 douprintf = 1;
2372
2373 error = rfs2call(VTOMI(svp), RFS_LINK,
2374 xdr_linkargs, (caddr_t)&args,
2375 xdr_enum, (caddr_t)&status, cr,
2376 &douprintf, &status, 0, NULL);
2377
2378 PURGE_ATTRCACHE(tdvp); /* mod time changed */
2379 PURGE_ATTRCACHE(svp); /* link count changed */
2380
2381 if (!error) {
2382 error = geterrno(status);
2383 if (!error) {
2384 if (HAVE_RDDIR_CACHE(tdrp))
2385 nfs_purge_rddir_cache(tdvp);
2386 }
2387 }
2388
2389 nfs_rw_exit(&tdrp->r_rwlock);
2390
2391 if (!error) {
2392 /*
2393 * Notify the source file of this link operation.
2394 */
2395 vnevent_link(svp, ct);
2396 }
2397 return (error);
2398 }
2399
2400 /* ARGSUSED */
2401 static int
2402 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2403 caller_context_t *ct, int flags)
2404 {
2405 vnode_t *realvp;
2406
2407 if (nfs_zone() != VTOMI(odvp)->mi_zone)
2408 return (EPERM);
2409 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2410 ndvp = realvp;
2411
2412 return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2413 }
2414
2415 /*
2416 * nfsrename does the real work of renaming in NFS Version 2.
2417 */
2418 static int
2419 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2420 caller_context_t *ct)
2421 {
2422 int error;
2423 enum nfsstat status;
2424 struct nfsrnmargs args;
2425 int douprintf;
2426 vnode_t *nvp = NULL;
2427 vnode_t *ovp = NULL;
2428 char *tmpname;
2429 rnode_t *rp;
2430 rnode_t *odrp;
2431 rnode_t *ndrp;
2432
2433 ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2434 if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2435 strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2436 return (EINVAL);
2437
2438 odrp = VTOR(odvp);
2439 ndrp = VTOR(ndvp);
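	/*
	 * Acquire the two directory rwlocks in a fixed order (by rnode
	 * address) so that concurrent renames between the same pair of
	 * directories cannot deadlock by locking in opposite orders.
	 */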
2440 if ((intptr_t)odrp < (intptr_t)ndrp) {
2441 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2442 return (EINTR);
2443 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2444 nfs_rw_exit(&odrp->r_rwlock);
2445 return (EINTR);
2446 }
2447 } else {
2448 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2449 return (EINTR);
2450 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2451 nfs_rw_exit(&ndrp->r_rwlock);
2452 return (EINTR);
2453 }
2454 }
2455
2456 /*
2457 * Lookup the target file. If it exists, it needs to be
2458 * checked to see whether it is a mount point and whether
2459 * it is active (open).
2460 */
2461 error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2462 if (!error) {
2463 /*
2464 * If this file has been mounted on, then just
2465 * return busy because renaming to it would remove
2466 * the mounted file system from the name space.
2467 */
2468 if (vn_mountedvfs(nvp) != NULL) {
2469 VN_RELE(nvp);
2470 nfs_rw_exit(&odrp->r_rwlock);
2471 nfs_rw_exit(&ndrp->r_rwlock);
2472 return (EBUSY);
2473 }
2474
2475 /*
2476 * Purge the name cache of all references to this vnode
2477 * so that we can check the reference count to infer
2478 * whether it is active or not.
2479 */
2480 /*
2481 * First just remove the entry from the name cache, as it
2482 * is most likely the only entry for this vp.
2483 */
2484 dnlc_remove(ndvp, nnm);
2485 /*
		 * If the file has a v_count > 1 then there may be more
		 * than one entry in the name cache due to multiple links
		 * or an open file, but we don't have the real reference
		 * count, so flush all possible entries.
2490 */
2491 if (nvp->v_count > 1)
2492 dnlc_purge_vp(nvp);
2493
2494 /*
2495 * If the vnode is active and is not a directory,
2496 * arrange to rename it to a
2497 * temporary file so that it will continue to be
2498 * accessible. This implements the "unlink-open-file"
2499 * semantics for the target of a rename operation.
2500 * Before doing this though, make sure that the
2501 * source and target files are not already the same.
2502 */
2503 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2504 /*
2505 * Lookup the source name.
2506 */
2507 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2508 cr, 0);
2509
2510 /*
2511 * The source name *should* already exist.
2512 */
2513 if (error) {
2514 VN_RELE(nvp);
2515 nfs_rw_exit(&odrp->r_rwlock);
2516 nfs_rw_exit(&ndrp->r_rwlock);
2517 return (error);
2518 }
2519
2520 /*
2521 * Compare the two vnodes. If they are the same,
2522 * just release all held vnodes and return success.
2523 */
2524 if (ovp == nvp) {
2525 VN_RELE(ovp);
2526 VN_RELE(nvp);
2527 nfs_rw_exit(&odrp->r_rwlock);
2528 nfs_rw_exit(&ndrp->r_rwlock);
2529 return (0);
2530 }
2531
2532 /*
2533 * Can't mix and match directories and non-
2534 * directories in rename operations. We already
2535 * know that the target is not a directory. If
2536 * the source is a directory, return an error.
2537 */
2538 if (ovp->v_type == VDIR) {
2539 VN_RELE(ovp);
2540 VN_RELE(nvp);
2541 nfs_rw_exit(&odrp->r_rwlock);
2542 nfs_rw_exit(&ndrp->r_rwlock);
2543 return (ENOTDIR);
2544 }
2545
2546 /*
2547 * The target file exists, is not the same as
2548 * the source file, and is active. Link it
			 * to a temporary filename to avoid having
			 * the server remove the file completely.
2551 */
2552 tmpname = newname();
2553 error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2554 if (error == EOPNOTSUPP) {
2555 error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2556 cr, NULL, 0);
2557 }
2558 if (error) {
2559 kmem_free(tmpname, MAXNAMELEN);
2560 VN_RELE(ovp);
2561 VN_RELE(nvp);
2562 nfs_rw_exit(&odrp->r_rwlock);
2563 nfs_rw_exit(&ndrp->r_rwlock);
2564 return (error);
2565 }
2566 rp = VTOR(nvp);
2567 mutex_enter(&rp->r_statelock);
2568 if (rp->r_unldvp == NULL) {
2569 VN_HOLD(ndvp);
2570 rp->r_unldvp = ndvp;
2571 if (rp->r_unlcred != NULL)
2572 crfree(rp->r_unlcred);
2573 crhold(cr);
2574 rp->r_unlcred = cr;
2575 rp->r_unlname = tmpname;
2576 } else {
2577 kmem_free(rp->r_unlname, MAXNAMELEN);
2578 rp->r_unlname = tmpname;
2579 }
2580 mutex_exit(&rp->r_statelock);
2581 }
2582 }
2583
2584 if (ovp == NULL) {
2585 /*
		 * When renaming a directory to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and whether we are renaming an unlinked
		 * file.  This might already have been done in previous
		 * code, so we check ovp == NULL to avoid doing it twice.
2595 */
2596
2597 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2598
2599 /*
2600 * The source name *should* already exist.
2601 */
2602 if (error) {
2603 nfs_rw_exit(&odrp->r_rwlock);
2604 nfs_rw_exit(&ndrp->r_rwlock);
2605 if (nvp) {
2606 VN_RELE(nvp);
2607 }
2608 return (error);
2609 }
2610 ASSERT(ovp != NULL);
2611 }
2612
2613 dnlc_remove(odvp, onm);
2614 dnlc_remove(ndvp, nnm);
2615
2616 setdiropargs(&args.rna_from, onm, odvp);
2617 setdiropargs(&args.rna_to, nnm, ndvp);
2618
2619 douprintf = 1;
2620
2621 error = rfs2call(VTOMI(odvp), RFS_RENAME,
2622 xdr_rnmargs, (caddr_t)&args,
2623 xdr_enum, (caddr_t)&status, cr,
2624 &douprintf, &status, 0, NULL);
2625
2626 PURGE_ATTRCACHE(odvp); /* mod time changed */
2627 PURGE_ATTRCACHE(ndvp); /* mod time changed */
2628
2629 if (!error) {
2630 error = geterrno(status);
2631 if (!error) {
2632 if (HAVE_RDDIR_CACHE(odrp))
2633 nfs_purge_rddir_cache(odvp);
2634 if (HAVE_RDDIR_CACHE(ndrp))
2635 nfs_purge_rddir_cache(ndvp);
2636 /*
			 * When renaming a directory to be a subdirectory
			 * of a different parent, the dnlc entry for ".."
			 * will no longer be valid, so it must be removed.
2640 */
2641 rp = VTOR(ovp);
2642 if (ndvp != odvp) {
2643 if (ovp->v_type == VDIR) {
2644 dnlc_remove(ovp, "..");
2645 if (HAVE_RDDIR_CACHE(rp))
2646 nfs_purge_rddir_cache(ovp);
2647 }
2648 }
2649
2650 /*
2651 * If we are renaming the unlinked file, update the
2652 * r_unldvp and r_unlname as needed.
2653 */
2654 mutex_enter(&rp->r_statelock);
2655 if (rp->r_unldvp != NULL) {
2656 if (strcmp(rp->r_unlname, onm) == 0) {
2657 (void) strncpy(rp->r_unlname,
2658 nnm, MAXNAMELEN);
2659 rp->r_unlname[MAXNAMELEN - 1] = '\0';
2660
2661 if (ndvp != rp->r_unldvp) {
2662 VN_RELE(rp->r_unldvp);
2663 rp->r_unldvp = ndvp;
2664 VN_HOLD(ndvp);
2665 }
2666 }
2667 }
2668 mutex_exit(&rp->r_statelock);
2669 } else {
2670 /*
			 * System V defines rename to return EEXIST, not
			 * ENOTEMPTY, if the target directory is not empty.
2673 * Over the wire, the error is NFSERR_ENOTEMPTY
2674 * which geterrno maps to ENOTEMPTY.
2675 */
2676 if (error == ENOTEMPTY)
2677 error = EEXIST;
2678 }
2679 }
2680
2681 if (error == 0) {
2682 if (nvp)
2683 vnevent_rename_dest(nvp, ndvp, nnm, ct);
2684
2685 if (odvp != ndvp)
2686 vnevent_rename_dest_dir(ndvp, ct);
2687
2688 ASSERT(ovp != NULL);
2689 vnevent_rename_src(ovp, odvp, onm, ct);
2690 }
2691
2692 if (nvp) {
2693 VN_RELE(nvp);
2694 }
2695 VN_RELE(ovp);
2696
2697 nfs_rw_exit(&odrp->r_rwlock);
2698 nfs_rw_exit(&ndrp->r_rwlock);
2699
2700 return (error);
2701 }
2702
2703 /* ARGSUSED */
2704 static int
2705 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2706 caller_context_t *ct, int flags, vsecattr_t *vsecp)
2707 {
2708 int error;
2709 struct nfscreatargs args;
2710 struct nfsdiropres dr;
2711 int douprintf;
2712 rnode_t *drp;
2713 hrtime_t t;
2714
2715 if (nfs_zone() != VTOMI(dvp)->mi_zone)
2716 return (EPERM);
2717
2718 setdiropargs(&args.ca_da, nm, dvp);
2719
2720 /*
2721 * Decide what the group-id and set-gid bit of the created directory
2722 * should be. May have to do a setattr to get the gid right.
2723 */
2724 error = setdirgid(dvp, &va->va_gid, cr);
2725 if (error)
2726 return (error);
2727 error = setdirmode(dvp, &va->va_mode, cr);
2728 if (error)
2729 return (error);
2730 va->va_mask |= AT_MODE|AT_GID;
2731
2732 args.ca_sa = &args.ca_sa_buf;
2733 error = vattr_to_sattr(va, args.ca_sa);
2734 if (error) {
2735 /* req time field(s) overflow - return immediately */
2736 return (error);
2737 }
2738
2739 drp = VTOR(dvp);
2740 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2741 return (EINTR);
2742
2743 dnlc_remove(dvp, nm);
2744
2745 douprintf = 1;
2746
2747 t = gethrtime();
2748
2749 error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2750 xdr_creatargs, (caddr_t)&args,
2751 xdr_diropres, (caddr_t)&dr, cr,
2752 &douprintf, &dr.dr_status, 0, NULL);
2753
2754 PURGE_ATTRCACHE(dvp); /* mod time changed */
2755
2756 if (!error) {
2757 error = geterrno(dr.dr_status);
2758 if (!error) {
2759 if (HAVE_RDDIR_CACHE(drp))
2760 nfs_purge_rddir_cache(dvp);
2761 /*
			 * The attributes returned by RFS_MKDIR cannot
2763 * be depended upon, so mark the attribute cache
2764 * as purged. A subsequent GETATTR will get the
2765 * correct attributes from the server.
2766 */
2767 *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2768 dvp->v_vfsp, t, cr, NULL, NULL);
2769 PURGE_ATTRCACHE(*vpp);
2770 dnlc_update(dvp, nm, *vpp);
2771
2772 /*
2773 * Make sure the gid was set correctly.
2774 * If not, try to set it (but don't lose
2775 * any sleep over it).
2776 */
2777 if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2778 va->va_mask = AT_GID;
2779 (void) nfssetattr(*vpp, va, 0, cr);
2780 }
2781 } else {
2782 PURGE_STALE_FH(error, dvp, cr);
2783 }
2784 }
2785
2786 nfs_rw_exit(&drp->r_rwlock);
2787
2788 return (error);
2789 }
2790
2791 /* ARGSUSED */
2792 static int
2793 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2794 caller_context_t *ct, int flags)
2795 {
2796 int error;
2797 enum nfsstat status;
2798 struct nfsdiropargs da;
2799 vnode_t *vp;
2800 int douprintf;
2801 rnode_t *drp;
2802
2803 if (nfs_zone() != VTOMI(dvp)->mi_zone)
2804 return (EPERM);
2805 drp = VTOR(dvp);
2806 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2807 return (EINTR);
2808
2809 /*
2810 * Attempt to prevent a rmdir(".") from succeeding.
2811 */
2812 error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2813 if (error) {
2814 nfs_rw_exit(&drp->r_rwlock);
2815 return (error);
2816 }
2817
2818 if (vp == cdir) {
2819 VN_RELE(vp);
2820 nfs_rw_exit(&drp->r_rwlock);
2821 return (EINVAL);
2822 }
2823
2824 setdiropargs(&da, nm, dvp);
2825
2826 /*
2827 * First just remove the entry from the name cache, as it
2828 * is most likely an entry for this vp.
2829 */
2830 dnlc_remove(dvp, nm);
2831
2832 /*
	 * If the vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, try removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
2840 */
2841 if (vp->v_count > 1) {
2842 dnlc_remove(vp, "..");
2843 if (vp->v_count > 1)
2844 dnlc_purge_vp(vp);
2845 }
2846
2847 douprintf = 1;
2848
2849 error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2850 xdr_diropargs, (caddr_t)&da,
2851 xdr_enum, (caddr_t)&status, cr,
2852 &douprintf, &status, 0, NULL);
2853
2854 PURGE_ATTRCACHE(dvp); /* mod time changed */
2855
2856 if (error) {
2857 VN_RELE(vp);
2858 nfs_rw_exit(&drp->r_rwlock);
2859 return (error);
2860 }
2861
2862 error = geterrno(status);
2863 if (!error) {
2864 if (HAVE_RDDIR_CACHE(drp))
2865 nfs_purge_rddir_cache(dvp);
2866 if (HAVE_RDDIR_CACHE(VTOR(vp)))
2867 nfs_purge_rddir_cache(vp);
2868 } else {
2869 PURGE_STALE_FH(error, dvp, cr);
2870 /*
		 * System V defines rmdir to return EEXIST, not
		 * ENOTEMPTY, if the directory is not empty.  Over
2873 * the wire, the error is NFSERR_ENOTEMPTY which
2874 * geterrno maps to ENOTEMPTY.
2875 */
2876 if (error == ENOTEMPTY)
2877 error = EEXIST;
2878 }
2879
2880 if (error == 0) {
2881 vnevent_rmdir(vp, dvp, nm, ct);
2882 }
2883 VN_RELE(vp);
2884
2885 nfs_rw_exit(&drp->r_rwlock);
2886
2887 return (error);
2888 }
2889
2890 /* ARGSUSED */
2891 static int
2892 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2893 caller_context_t *ct, int flags)
2894 {
2895 int error;
2896 struct nfsslargs args;
2897 enum nfsstat status;
2898 int douprintf;
2899 rnode_t *drp;
2900
2901 if (nfs_zone() != VTOMI(dvp)->mi_zone)
2902 return (EPERM);
2903 setdiropargs(&args.sla_from, lnm, dvp);
2904 args.sla_sa = &args.sla_sa_buf;
2905 error = vattr_to_sattr(tva, args.sla_sa);
2906 if (error) {
2907 /* req time field(s) overflow - return immediately */
2908 return (error);
2909 }
2910 args.sla_tnm = tnm;
2911
2912 drp = VTOR(dvp);
2913 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2914 return (EINTR);
2915
2916 dnlc_remove(dvp, lnm);
2917
2918 douprintf = 1;
2919
2920 error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2921 xdr_slargs, (caddr_t)&args,
2922 xdr_enum, (caddr_t)&status, cr,
2923 &douprintf, &status, 0, NULL);
2924
2925 PURGE_ATTRCACHE(dvp); /* mod time changed */
2926
2927 if (!error) {
2928 error = geterrno(status);
2929 if (!error) {
2930 if (HAVE_RDDIR_CACHE(drp))
2931 nfs_purge_rddir_cache(dvp);
2932 } else {
2933 PURGE_STALE_FH(error, dvp, cr);
2934 }
2935 }
2936
2937 nfs_rw_exit(&drp->r_rwlock);
2938
2939 return (error);
2940 }
2941
2942 #ifdef DEBUG
2943 static int nfs_readdir_cache_hits = 0;
2944 static int nfs_readdir_cache_shorts = 0;
2945 static int nfs_readdir_cache_waits = 0;
2946 static int nfs_readdir_cache_misses = 0;
2947 static int nfs_readdir_readahead = 0;
2948 #endif
2949
2950 static int nfs_shrinkreaddir = 0;
2951
2952 /*
2953 * Read directory entries.
2954 * There are some weird things to look out for here. The uio_offset
2955 * field is either 0 or it is the offset returned from a previous
2956 * readdir. It is an opaque value used by the server to find the
2957 * correct directory block to read. The count field is the number
 * of blocks to read on the server.  This is advisory only; the server
 * may return only one block's worth of entries.  Entries may be compressed
2960 * on the server.
2961 */
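/*
 * An illustrative round-trip of the opaque cookie (a summary of the
 * code below, not additional behavior): the first call arrives with
 * uio_offset == 0; after a successful copyout of a cache entry the
 * client stores the server-provided next cookie,
 *
 *	uiop->uio_offset = rdc->nfs_ncookie;
 *
 * and the next VOP_READDIR hands that value back in rda_offset,
 * untouched and uninterpreted by the client.
 */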
2962 /* ARGSUSED */
2963 static int
2964 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2965 caller_context_t *ct, int flags)
2966 {
2967 int error;
2968 size_t count;
2969 rnode_t *rp;
2970 rddir_cache *rdc;
2971 rddir_cache *nrdc;
2972 rddir_cache *rrdc;
2973 #ifdef DEBUG
2974 int missed;
2975 #endif
2976 rddir_cache srdc;
2977 avl_index_t where;
2978
2979 rp = VTOR(vp);
2980
2981 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2982 if (nfs_zone() != VTOMI(vp)->mi_zone)
2983 return (EIO);
2984 /*
2985 * Make sure that the directory cache is valid.
2986 */
2987 if (HAVE_RDDIR_CACHE(rp)) {
2988 if (nfs_disable_rddir_cache) {
2989 /*
2990 * Setting nfs_disable_rddir_cache in /etc/system
2991 * allows interoperability with servers that do not
2992 * properly update the attributes of directories.
2993 * Any cached information gets purged before an
2994 * access is made to it.
2995 */
2996 nfs_purge_rddir_cache(vp);
2997 } else {
2998 error = nfs_validate_caches(vp, cr);
2999 if (error)
3000 return (error);
3001 }
3002 }
3003
3004 /*
3005 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
3006 * RFS_READDIR request with rda_count set to more than 0x400. So
3007 * we reduce the request size here purely for compatibility.
3008 *
3009 * In general, this is no longer required. However, if a server
	 * is discovered which cannot handle requests larger than 1024,
3011 * nfs_shrinkreaddir can be set to 1 to enable this backwards
3012 * compatibility.
3013 *
3014 * In any case, the request size is limited to NFS_MAXDATA bytes.
3015 */
3016 count = MIN(uiop->uio_iov->iov_len,
3017 nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
3018
3019 nrdc = NULL;
3020 #ifdef DEBUG
3021 missed = 0;
3022 #endif
3023 top:
3024 /*
	 * Short-circuit the last readdir, which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  Doing so sets r_direof, which
	 * can be used to find the value of the last cookie.
3029 */
3030 mutex_enter(&rp->r_statelock);
3031 if (rp->r_direof != NULL &&
3032 uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3033 mutex_exit(&rp->r_statelock);
3034 #ifdef DEBUG
3035 nfs_readdir_cache_shorts++;
3036 #endif
3037 if (eofp)
3038 *eofp = 1;
3039 if (nrdc != NULL)
3040 rddir_cache_rele(nrdc);
3041 return (0);
3042 }
3043 /*
3044 * Look for a cache entry. Cache entries are identified
3045 * by the NFS cookie value and the byte count requested.
3046 */
3047 srdc.nfs_cookie = uiop->uio_offset;
3048 srdc.buflen = count;
3049 rdc = avl_find(&rp->r_dir, &srdc, &where);
3050 if (rdc != NULL) {
3051 rddir_cache_hold(rdc);
3052 /*
3053 * If the cache entry is in the process of being
3054 * filled in, wait until this completes. The
		 * RDDIRWAIT bit is set to indicate that someone
		 * is waiting, and when the thread currently
		 * filling the entry is done, it should do a
		 * cv_broadcast to wake up all of the threads
		 * waiting for it to finish.
3060 */
3061 if (rdc->flags & RDDIR) {
3062 nfs_rw_exit(&rp->r_rwlock);
3063 rdc->flags |= RDDIRWAIT;
3064 #ifdef DEBUG
3065 nfs_readdir_cache_waits++;
3066 #endif
3067 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3068 /*
3069 * We got interrupted, probably
3070 * the user typed ^C or an alarm
3071 * fired. We free the new entry
3072 * if we allocated one.
3073 */
3074 mutex_exit(&rp->r_statelock);
3075 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3076 RW_READER, FALSE);
3077 rddir_cache_rele(rdc);
3078 if (nrdc != NULL)
3079 rddir_cache_rele(nrdc);
3080 return (EINTR);
3081 }
3082 mutex_exit(&rp->r_statelock);
3083 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3084 RW_READER, FALSE);
3085 rddir_cache_rele(rdc);
3086 goto top;
3087 }
3088 /*
3089 * Check to see if a readdir is required to
3090 * fill the entry. If so, mark this entry
3091 * as being filled, remove our reference,
3092 * and branch to the code to fill the entry.
3093 */
3094 if (rdc->flags & RDDIRREQ) {
3095 rdc->flags &= ~RDDIRREQ;
3096 rdc->flags |= RDDIR;
3097 if (nrdc != NULL)
3098 rddir_cache_rele(nrdc);
3099 nrdc = rdc;
3100 mutex_exit(&rp->r_statelock);
3101 goto bottom;
3102 }
3103 #ifdef DEBUG
3104 if (!missed)
3105 nfs_readdir_cache_hits++;
3106 #endif
3107 /*
3108 * If an error occurred while attempting
3109 * to fill the cache entry, just return it.
3110 */
3111 if (rdc->error) {
3112 error = rdc->error;
3113 mutex_exit(&rp->r_statelock);
3114 rddir_cache_rele(rdc);
3115 if (nrdc != NULL)
3116 rddir_cache_rele(nrdc);
3117 return (error);
3118 }
3119
3120 /*
		 * The cache entry is complete and good; copy out
		 * the dirent structs to the calling thread.
3124 */
3125 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3126
3127 /*
3128 * If no error occurred during the copyout,
3129 * update the offset in the uio struct to
3130 * contain the value of the next cookie
3131 * and set the eof value appropriately.
3132 */
3133 if (!error) {
3134 uiop->uio_offset = rdc->nfs_ncookie;
3135 if (eofp)
3136 *eofp = rdc->eof;
3137 }
3138
3139 /*
		 * Decide whether to do readahead.  Don't if we
		 * have already read to the end of the directory.
3142 */
3143 if (rdc->eof) {
3144 rp->r_direof = rdc;
3145 mutex_exit(&rp->r_statelock);
3146 rddir_cache_rele(rdc);
3147 if (nrdc != NULL)
3148 rddir_cache_rele(nrdc);
3149 return (error);
3150 }
3151
3152 /*
3153 * Check to see whether we found an entry
3154 * for the readahead. If so, we don't need
3155 * to do anything further, so free the new
3156 * entry if one was allocated. Otherwise,
3157 * allocate a new entry, add it to the cache,
3158 * and then initiate an asynchronous readdir
3159 * operation to fill it.
3160 */
3161 srdc.nfs_cookie = rdc->nfs_ncookie;
3162 srdc.buflen = count;
3163 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3164 if (rrdc != NULL) {
3165 if (nrdc != NULL)
3166 rddir_cache_rele(nrdc);
3167 } else {
3168 if (nrdc != NULL)
3169 rrdc = nrdc;
3170 else {
3171 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3172 }
3173 if (rrdc != NULL) {
3174 rrdc->nfs_cookie = rdc->nfs_ncookie;
3175 rrdc->buflen = count;
3176 avl_insert(&rp->r_dir, rrdc, where);
3177 rddir_cache_hold(rrdc);
3178 mutex_exit(&rp->r_statelock);
3179 rddir_cache_rele(rdc);
3180 #ifdef DEBUG
3181 nfs_readdir_readahead++;
3182 #endif
3183 nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3184 return (error);
3185 }
3186 }
3187
3188 mutex_exit(&rp->r_statelock);
3189 rddir_cache_rele(rdc);
3190 return (error);
3191 }
3192
3193 /*
3194 * Didn't find an entry in the cache. Construct a new empty
3195 * entry and link it into the cache. Other processes attempting
3196 * to access this entry will need to wait until it is filled in.
3197 *
3198 * Since kmem_alloc may block, another pass through the cache
3199 * will need to be taken to make sure that another process
3200 * hasn't already added an entry to the cache for this request.
3201 */
3202 if (nrdc == NULL) {
3203 mutex_exit(&rp->r_statelock);
3204 nrdc = rddir_cache_alloc(KM_SLEEP);
3205 nrdc->nfs_cookie = uiop->uio_offset;
3206 nrdc->buflen = count;
3207 goto top;
3208 }
3209
3210 /*
3211 * Add this entry to the cache.
3212 */
3213 avl_insert(&rp->r_dir, nrdc, where);
3214 rddir_cache_hold(nrdc);
3215 mutex_exit(&rp->r_statelock);
3216
3217 bottom:
3218 #ifdef DEBUG
3219 missed = 1;
3220 nfs_readdir_cache_misses++;
3221 #endif
3222 /*
3223 * Do the readdir.
3224 */
3225 error = nfsreaddir(vp, nrdc, cr);
3226
3227 /*
3228 * If this operation failed, just return the error which occurred.
3229 */
3230 if (error != 0)
3231 return (error);
3232
3233 /*
	 * Since the RPC operation will have taken some time and blocked
3235 * this process, another pass through the cache will need to be
3236 * taken to find the correct cache entry. It is possible that
3237 * the correct cache entry will not be there (although one was
3238 * added) because the directory changed during the RPC operation
3239 * and the readdir cache was flushed. In this case, just start
3240 * over. It is hoped that this will not happen too often... :-)
3241 */
3242 nrdc = NULL;
3243 goto top;
3244 /* NOTREACHED */
3245 }
3246
3247 static int
3248 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3249 {
3250 int error;
3251 struct nfsrddirargs rda;
3252 struct nfsrddirres rd;
3253 rnode_t *rp;
3254 mntinfo_t *mi;
3255 uint_t count;
3256 int douprintf;
3257 failinfo_t fi, *fip;
3258
3259 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3260 count = rdc->buflen;
3261
3262 rp = VTOR(vp);
3263 mi = VTOMI(vp);
3264
3265 rda.rda_fh = *VTOFH(vp);
3266 rda.rda_offset = rdc->nfs_cookie;
3267
3268 /*
	 * NFS client failover support:
	 * suppress failover unless we have a zero cookie.
3271 */
3272 if (rdc->nfs_cookie == (off_t)0) {
3273 fi.vp = vp;
3274 fi.fhp = (caddr_t)&rda.rda_fh;
3275 fi.copyproc = nfscopyfh;
3276 fi.lookupproc = nfslookup;
3277 fi.xattrdirproc = acl_getxattrdir2;
3278 fip = &fi;
3279 } else {
3280 fip = NULL;
3281 }
3282
3283 rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3284 rd.rd_size = count;
3285 rd.rd_offset = rda.rda_offset;
3286
3287 douprintf = 1;
3288
3289 if (mi->mi_io_kstats) {
3290 mutex_enter(&mi->mi_lock);
3291 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3292 mutex_exit(&mi->mi_lock);
3293 }
3294
3295 do {
3296 rda.rda_count = MIN(count, mi->mi_curread);
3297 error = rfs2call(mi, RFS_READDIR,
3298 xdr_rddirargs, (caddr_t)&rda,
3299 xdr_getrddirres, (caddr_t)&rd, cr,
3300 &douprintf, &rd.rd_status, 0, fip);
3301 } while (error == ENFS_TRYAGAIN);
3302
3303 if (mi->mi_io_kstats) {
3304 mutex_enter(&mi->mi_lock);
3305 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3306 mutex_exit(&mi->mi_lock);
3307 }
3308
3309 /*
3310 * Since we are actually doing a READDIR RPC, we must have
3311 * exclusive access to the cache entry being filled. Thus,
3312 * it is safe to update all fields except for the flags
3313 * field. The r_statelock in the rnode must be held to
3314 * prevent two different threads from simultaneously
3315 * attempting to update the flags field. This can happen
3316 * if we are turning off RDDIR and the other thread is
3317 * trying to set RDDIRWAIT.
3318 */
3319 ASSERT(rdc->flags & RDDIR);
3320 if (!error) {
3321 error = geterrno(rd.rd_status);
3322 if (!error) {
3323 rdc->nfs_ncookie = rd.rd_offset;
3324 rdc->eof = rd.rd_eof ? 1 : 0;
3325 rdc->entlen = rd.rd_size;
3326 ASSERT(rdc->entlen <= rdc->buflen);
3327 #ifdef DEBUG
3328 rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3329 KM_SLEEP);
3330 #else
3331 rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3332 #endif
3333 bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3334 rdc->error = 0;
3335 if (mi->mi_io_kstats) {
3336 mutex_enter(&mi->mi_lock);
3337 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3338 KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3339 rd.rd_size;
3340 mutex_exit(&mi->mi_lock);
3341 }
3342 } else {
3343 PURGE_STALE_FH(error, vp, cr);
3344 }
3345 }
3346 if (error) {
3347 rdc->entries = NULL;
3348 rdc->error = error;
3349 }
3350 kmem_free(rd.rd_entries, rdc->buflen);
3351
3352 mutex_enter(&rp->r_statelock);
3353 rdc->flags &= ~RDDIR;
3354 if (rdc->flags & RDDIRWAIT) {
3355 rdc->flags &= ~RDDIRWAIT;
3356 cv_broadcast(&rdc->cv);
3357 }
3358 if (error)
3359 rdc->flags |= RDDIRREQ;
3360 mutex_exit(&rp->r_statelock);
3361
3362 rddir_cache_rele(rdc);
3363
3364 return (error);
3365 }
3366
3367 #ifdef DEBUG
3368 static int nfs_bio_do_stop = 0;
3369 #endif
3370
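/*
 * Perform the I/O described by bp.  Both the read and the write path
 * first try the credential cached in r_cred (set by the first user to
 * do I/O on this rnode); on EACCES they retry once with the caller's
 * credential and cache it for subsequent I/O.
 */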
3371 static int
3372 nfs_bio(struct buf *bp, cred_t *cr)
3373 {
3374 rnode_t *rp = VTOR(bp->b_vp);
3375 int count;
3376 int error;
3377 cred_t *cred;
3378 uint_t offset;
3379
3380 DTRACE_IO1(start, struct buf *, bp);
3381
3382 ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3383 offset = dbtob(bp->b_blkno);
3384
3385 if (bp->b_flags & B_READ) {
3386 mutex_enter(&rp->r_statelock);
3387 if (rp->r_cred != NULL) {
3388 cred = rp->r_cred;
3389 crhold(cred);
3390 } else {
3391 rp->r_cred = cr;
3392 crhold(cr);
3393 cred = cr;
3394 crhold(cred);
3395 }
3396 mutex_exit(&rp->r_statelock);
3397 read_again:
3398 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3399 offset, bp->b_bcount, &bp->b_resid, cred);
3400
3401 crfree(cred);
3402 if (!error) {
3403 if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF;
				 * zero all the memory beyond the EOF.
				 */
3409 bzero(bp->b_un.b_addr +
3410 bp->b_bcount - bp->b_resid, bp->b_resid);
3411 }
3412 mutex_enter(&rp->r_statelock);
3413 if (bp->b_resid == bp->b_bcount &&
3414 offset >= rp->r_size) {
3415 /*
3416 * We didn't read anything at all as we are
3417 * past EOF. Return an error indicator back
3418 * but don't destroy the pages (yet).
3419 */
3420 error = NFS_EOF;
3421 }
3422 mutex_exit(&rp->r_statelock);
3423 } else if (error == EACCES) {
3424 mutex_enter(&rp->r_statelock);
3425 if (cred != cr) {
3426 if (rp->r_cred != NULL)
3427 crfree(rp->r_cred);
3428 rp->r_cred = cr;
3429 crhold(cr);
3430 cred = cr;
3431 crhold(cred);
3432 mutex_exit(&rp->r_statelock);
3433 goto read_again;
3434 }
3435 mutex_exit(&rp->r_statelock);
3436 }
3437 } else {
3438 if (!(rp->r_flags & RSTALE)) {
3439 mutex_enter(&rp->r_statelock);
3440 if (rp->r_cred != NULL) {
3441 cred = rp->r_cred;
3442 crhold(cred);
3443 } else {
3444 rp->r_cred = cr;
3445 crhold(cr);
3446 cred = cr;
3447 crhold(cred);
3448 }
3449 mutex_exit(&rp->r_statelock);
3450 write_again:
3451 mutex_enter(&rp->r_statelock);
3452 count = MIN(bp->b_bcount, rp->r_size - offset);
3453 mutex_exit(&rp->r_statelock);
3454 if (count < 0)
3455 cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3456 #ifdef DEBUG
3457 if (count == 0) {
3458 zcmn_err(getzoneid(), CE_WARN,
3459 "nfs_bio: zero length write at %d",
3460 offset);
3461 nfs_printfhandle(&rp->r_fh);
3462 if (nfs_bio_do_stop)
3463 debug_enter("nfs_bio");
3464 }
3465 #endif
3466 error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3467 count, cred);
3468 if (error == EACCES) {
3469 mutex_enter(&rp->r_statelock);
3470 if (cred != cr) {
3471 if (rp->r_cred != NULL)
3472 crfree(rp->r_cred);
3473 rp->r_cred = cr;
3474 crhold(cr);
3475 crfree(cred);
3476 cred = cr;
3477 crhold(cred);
3478 mutex_exit(&rp->r_statelock);
3479 goto write_again;
3480 }
3481 mutex_exit(&rp->r_statelock);
3482 }
3483 bp->b_error = error;
3484 if (error && error != EINTR) {
3485 /*
3486 * Don't print EDQUOT errors on the console.
3487 * Don't print asynchronous EACCES errors.
3488 * Don't print EFBIG errors.
3489 * Print all other write errors.
3490 */
3491 if (error != EDQUOT && error != EFBIG &&
3492 (error != EACCES ||
3493 !(bp->b_flags & B_ASYNC)))
3494 nfs_write_error(bp->b_vp, error, cred);
3495 /*
3496 * Update r_error and r_flags as appropriate.
3497 * If the error was ESTALE, then mark the
3498 * rnode as not being writeable and save
3499 * the error status. Otherwise, save any
3500 * errors which occur from asynchronous
3501 * page invalidations. Any errors occurring
3502 * from other operations should be saved
3503 * by the caller.
3504 */
3505 mutex_enter(&rp->r_statelock);
3506 if (error == ESTALE) {
3507 rp->r_flags |= RSTALE;
3508 if (!rp->r_error)
3509 rp->r_error = error;
3510 } else if (!rp->r_error &&
3511 (bp->b_flags &
3512 (B_INVAL|B_FORCE|B_ASYNC)) ==
3513 (B_INVAL|B_FORCE|B_ASYNC)) {
3514 rp->r_error = error;
3515 }
3516 mutex_exit(&rp->r_statelock);
3517 }
3518 crfree(cred);
3519 } else {
3520 error = rp->r_error;
3521 /*
			 * A close may have cleared r_error; if so,
			 * propagate the ESTALE error return properly.
3524 */
3525 if (error == 0)
3526 error = ESTALE;
3527 }
3528 }
3529
3530 if (error != 0 && error != NFS_EOF)
3531 bp->b_flags |= B_ERROR;
3532
3533 DTRACE_IO1(done, struct buf *, bp);
3534
3535 return (error);
3536 }
3537
3538 /* ARGSUSED */
3539 static int
3540 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3541 {
3542 struct nfs_fid *fp;
3543 rnode_t *rp;
3544
3545 rp = VTOR(vp);
3546
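	/*
	 * Layout note (a sketch of the assumption here): the advertised
	 * fid length excludes the length word itself, hence the
	 * "- sizeof (short)"; nf_pad appears to be alignment padding.
	 */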
3547 if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3548 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3549 return (ENOSPC);
3550 }
3551 fp = (struct nfs_fid *)fidp;
3552 fp->nf_pad = 0;
3553 fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3554 bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3555 return (0);
3556 }
3557
3558 /* ARGSUSED2 */
3559 static int
3560 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3561 {
3562 rnode_t *rp = VTOR(vp);
3563
3564 if (!write_lock) {
3565 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3566 return (V_WRITELOCK_FALSE);
3567 }
3568
3569 if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3570 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3571 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3572 return (V_WRITELOCK_FALSE);
3573 nfs_rw_exit(&rp->r_rwlock);
3574 }
3575
3576 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3577 return (V_WRITELOCK_TRUE);
3578 }
3579
3580 /* ARGSUSED */
3581 static void
3582 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3583 {
3584 rnode_t *rp = VTOR(vp);
3585
3586 nfs_rw_exit(&rp->r_rwlock);
3587 }
3588
3589 /* ARGSUSED */
3590 static int
3591 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3592 {
3593
3594 /*
	 * Because we stuff the readdir cookie into the offset field,
	 * someone may attempt to do an lseek with the cookie, and we
	 * want that to succeed.
3598 */
3599 if (vp->v_type == VDIR)
3600 return (0);
3601 if (*noffp < 0 || *noffp > MAXOFF32_T)
3602 return (EINVAL);
3603 return (0);
3604 }
3605
3606 /*
 * Number of NFS_MAXDATA blocks to read ahead,
 * optimized for 100Base-T.
3609 */
3610 static int nfs_nra = 4;
3611
3612 #ifdef DEBUG
3613 static int nfs_lostpage = 0; /* number of times we lost original page */
3614 #endif
3615
3616 /*
3617 * Return all the pages from [off..off+len) in file
3618 */
3619 /* ARGSUSED */
3620 static int
3621 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3622 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3623 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3624 {
3625 rnode_t *rp;
3626 int error;
3627 mntinfo_t *mi;
3628
3629 if (vp->v_flag & VNOMAP)
3630 return (ENOSYS);
3631
3632 ASSERT(off <= MAXOFF32_T);
3633 if (nfs_zone() != VTOMI(vp)->mi_zone)
3634 return (EIO);
3635 if (protp != NULL)
3636 *protp = PROT_ALL;
3637
3638 /*
	 * Now validate that the caches are up to date.
3640 */
3641 error = nfs_validate_caches(vp, cr);
3642 if (error)
3643 return (error);
3644
3645 rp = VTOR(vp);
3646 mi = VTOMI(vp);
3647 retry:
3648 mutex_enter(&rp->r_statelock);
3649
3650 /*
3651 * Don't create dirty pages faster than they
3652 * can be cleaned so that the system doesn't
3653 * get imbalanced. If the async queue is
3654 * maxed out, then wait for it to drain before
3655 * creating more dirty pages. Also, wait for
3656 * any threads doing pagewalks in the vop_getattr
3657 * entry points so that they don't block for
3658 * long periods.
3659 */
3660 if (rw == S_CREATE) {
3661 while ((mi->mi_max_threads != 0 &&
3662 rp->r_awcount > 2 * mi->mi_max_threads) ||
3663 rp->r_gcount > 0)
3664 cv_wait(&rp->r_cv, &rp->r_statelock);
3665 }
3666
3667 /*
	 * If we are getting called as a side effect of an nfs_write()
	 * operation, the local file size might not be extended yet.
3670 * In this case we want to be able to return pages of zeroes.
3671 */
3672 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3673 mutex_exit(&rp->r_statelock);
3674 return (EFAULT); /* beyond EOF */
3675 }
3676
3677 mutex_exit(&rp->r_statelock);
3678
3679 if (len <= PAGESIZE) {
3680 error = nfs_getapage(vp, off, len, protp, pl, plsz,
3681 seg, addr, rw, cr);
3682 } else {
3683 error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3684 pl, plsz, seg, addr, rw, cr);
3685 }
3686
3687 switch (error) {
3688 case NFS_EOF:
3689 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3690 goto retry;
3691 case ESTALE:
3692 PURGE_STALE_FH(error, vp, cr);
3693 }
3694
3695 return (error);
3696 }
3697
3698 /*
3699 * Called from pvn_getpages or nfs_getpage to get a particular page.
3700 */
3701 /* ARGSUSED */
3702 static int
3703 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3704 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3705 enum seg_rw rw, cred_t *cr)
3706 {
3707 rnode_t *rp;
3708 uint_t bsize;
3709 struct buf *bp;
3710 page_t *pp;
3711 u_offset_t lbn;
3712 u_offset_t io_off;
3713 u_offset_t blkoff;
3714 u_offset_t rablkoff;
3715 size_t io_len;
3716 uint_t blksize;
3717 int error;
3718 int readahead;
3719 int readahead_issued = 0;
3720 int ra_window; /* readahead window */
3721 page_t *pagefound;
3722
3723 if (nfs_zone() != VTOMI(vp)->mi_zone)
3724 return (EIO);
3725 rp = VTOR(vp);
3726 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3727
3728 reread:
3729 bp = NULL;
3730 pp = NULL;
3731 pagefound = NULL;
3732
3733 if (pl != NULL)
3734 pl[0] = NULL;
3735
3736 error = 0;
3737 lbn = off / bsize;
3738 blkoff = lbn * bsize;
3739
3740 /*
3741 * Queueing up the readahead before doing the synchronous read
3742 * results in a significant increase in read throughput because
3743 * of the increased parallelism between the async threads and
3744 * the process context.
3745 */
3746 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3747 rw != S_CREATE &&
3748 !(vp->v_flag & VNOCACHE)) {
3749 mutex_enter(&rp->r_statelock);
3750
3751 /*
3752 * Calculate the number of readaheads to do.
3753 * a) No readaheads at offset = 0.
3754 * b) Do maximum(nfs_nra) readaheads when the readahead
3755 * window is closed.
		 * c) Do between 1 and (nfs_nra - 1) readaheads, depending
		 *    upon how far open or closed the readahead window is.
3758 * d) No readaheads if rp->r_nextr is not within the scope
3759 * of the readahead window (random i/o).
3760 */
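		/*
		 * Worked example (using the default nfs_nra == 4): with
		 * bsize == 32K, r_nextr == 96K and blkoff == 64K, the
		 * window is ra_window == (96K - 64K) / 32K == 1, so
		 * nfs_nra - 1 == 3 readaheads are issued; at
		 * blkoff == r_nextr the window is closed and all four
		 * are issued.
		 */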
3761
3762 if (off == 0)
3763 readahead = 0;
3764 else if (blkoff == rp->r_nextr)
3765 readahead = nfs_nra;
3766 else if (rp->r_nextr > blkoff &&
3767 ((ra_window = (rp->r_nextr - blkoff) / bsize)
3768 <= (nfs_nra - 1)))
3769 readahead = nfs_nra - ra_window;
3770 else
3771 readahead = 0;
3772
3773 rablkoff = rp->r_nextr;
3774 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3775 mutex_exit(&rp->r_statelock);
3776 if (nfs_async_readahead(vp, rablkoff + bsize,
3777 addr + (rablkoff + bsize - off), seg, cr,
3778 nfs_readahead) < 0) {
3779 mutex_enter(&rp->r_statelock);
3780 break;
3781 }
3782 readahead--;
3783 rablkoff += bsize;
3784 /*
			 * Indicate that we did a readahead so that the
			 * readahead offset is not updated by the
			 * synchronous read below.
3788 */
3789 readahead_issued = 1;
3790 mutex_enter(&rp->r_statelock);
3791 /*
			 * Set the readahead offset to the offset
			 * of the last async readahead request.
3795 */
3796 rp->r_nextr = rablkoff;
3797 }
3798 mutex_exit(&rp->r_statelock);
3799 }
3800
3801 again:
3802 if ((pagefound = page_exists(vp, off)) == NULL) {
3803 if (pl == NULL) {
3804 (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3805 nfs_readahead);
3806 } else if (rw == S_CREATE) {
3807 /*
3808 * Block for this page is not allocated, or the offset
3809 * is beyond the current allocation size, or we're
3810 * allocating a swap slot and the page was not found,
3811 * so allocate it and return a zero page.
3812 */
3813 if ((pp = page_create_va(vp, off,
3814 PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3815 cmn_err(CE_PANIC, "nfs_getapage: page_create");
3816 io_len = PAGESIZE;
3817 mutex_enter(&rp->r_statelock);
3818 rp->r_nextr = off + PAGESIZE;
3819 mutex_exit(&rp->r_statelock);
3820 } else {
3821 /*
			 * Need to go to the server to get a BLOCK; the
			 * exceptions are reading at offset zero or doing
			 * random i/o, in which case we read only a PAGE.
3825 */
3826 mutex_enter(&rp->r_statelock);
3827 if (blkoff < rp->r_size &&
3828 blkoff + bsize >= rp->r_size) {
3829 /*
3830 * If only a block or less is left in
3831 * the file, read all that is remaining.
3832 */
3833 if (rp->r_size <= off) {
3834 /*
3835 * Trying to access beyond EOF,
3836 * set up to get at least one page.
3837 */
3838 blksize = off + PAGESIZE - blkoff;
3839 } else
3840 blksize = rp->r_size - blkoff;
3841 } else if ((off == 0) ||
3842 (off != rp->r_nextr && !readahead_issued)) {
3843 blksize = PAGESIZE;
3844 blkoff = off; /* block = page here */
3845 } else
3846 blksize = bsize;
3847 mutex_exit(&rp->r_statelock);
3848
3849 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3850 &io_len, blkoff, blksize, 0);
3851
3852 /*
			 * Some other thread has entered the page, so
			 * restart the loop and pick it up from the cache.
3855 */
3856 if (pp == NULL)
3857 goto again;
3858
3859 /*
3860 * Now round the request size up to page boundaries.
3861 * This ensures that the entire page will be
3862 * initialized to zeroes if EOF is encountered.
3863 */
3864 io_len = ptob(btopr(io_len));
3865
3866 bp = pageio_setup(pp, io_len, vp, B_READ);
3867 ASSERT(bp != NULL);
3868
3869 /*
3870 * pageio_setup should have set b_addr to 0. This
3871 * is correct since we want to do I/O on a page
3872 * boundary. bp_mapin will use this addr to calculate
3873 * an offset, and then set b_addr to the kernel virtual
3874 * address it allocated for us.
3875 */
3876 ASSERT(bp->b_un.b_addr == 0);
3877
3878 bp->b_edev = 0;
3879 bp->b_dev = 0;
3880 bp->b_lblkno = lbtodb(io_off);
3881 bp->b_file = vp;
3882 bp->b_offset = (offset_t)off;
3883 bp_mapin(bp);
3884
3885 /*
3886 * If doing a write beyond what we believe is EOF,
3887 * don't bother trying to read the pages from the
3888 * server, we'll just zero the pages here. We
3889 * don't check that the rw flag is S_WRITE here
3890 * because some implementations may attempt a
3891 * read access to the buffer before copying data.
3892 */
3893 mutex_enter(&rp->r_statelock);
3894 if (io_off >= rp->r_size && seg == segkmap) {
3895 mutex_exit(&rp->r_statelock);
3896 bzero(bp->b_un.b_addr, io_len);
3897 } else {
3898 mutex_exit(&rp->r_statelock);
3899 error = nfs_bio(bp, cr);
3900 }
3901
3902 /*
3903 * Unmap the buffer before freeing it.
3904 */
3905 bp_mapout(bp);
3906 pageio_done(bp);
3907
3908 if (error == NFS_EOF) {
3909 /*
3910 * If doing a write system call just return
3911 * zeroed pages, else user tried to get pages
3912 * beyond EOF, return error. We don't check
3913 * that the rw flag is S_WRITE here because
3914 * some implementations may attempt a read
3915 * access to the buffer before copying data.
3916 */
3917 if (seg == segkmap)
3918 error = 0;
3919 else
3920 error = EFAULT;
3921 }
3922
3923 if (!readahead_issued && !error) {
3924 mutex_enter(&rp->r_statelock);
3925 rp->r_nextr = io_off + io_len;
3926 mutex_exit(&rp->r_statelock);
3927 }
3928 }
3929 }
3930
3931 out:
3932 if (pl == NULL)
3933 return (error);
3934
3935 if (error) {
3936 if (pp != NULL)
3937 pvn_read_done(pp, B_ERROR);
3938 return (error);
3939 }
3940
3941 if (pagefound) {
3942 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3943
3944 /*
		 * Page exists in the cache; acquire the appropriate lock.
3946 * If this fails, start all over again.
3947 */
3948 if ((pp = page_lookup(vp, off, se)) == NULL) {
3949 #ifdef DEBUG
3950 nfs_lostpage++;
3951 #endif
3952 goto reread;
3953 }
3954 pl[0] = pp;
3955 pl[1] = NULL;
3956 return (0);
3957 }
3958
3959 if (pp != NULL)
3960 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3961
3962 return (error);
3963 }
3964
3965 static void
3966 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3967 cred_t *cr)
3968 {
3969 int error;
3970 page_t *pp;
3971 u_offset_t io_off;
3972 size_t io_len;
3973 struct buf *bp;
3974 uint_t bsize, blksize;
3975 rnode_t *rp = VTOR(vp);
3976
3977 ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3978
3979 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3980
3981 mutex_enter(&rp->r_statelock);
3982 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3983 /*
		 * If there is less than a block left in the file,
		 * read less than a block.
3986 */
3987 blksize = rp->r_size - blkoff;
3988 } else
3989 blksize = bsize;
3990 mutex_exit(&rp->r_statelock);
3991
3992 pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3993 &io_off, &io_len, blkoff, blksize, 1);
3994 /*
	 * The isra flag passed to the kluster function is 1; we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode, etc.).  In
	 * all cases, we want to punt on the readahead.
3999 */
4000 if (pp == NULL)
4001 return;
4002
4003 /*
4004 * Now round the request size up to page boundaries.
4005 * This ensures that the entire page will be
4006 * initialized to zeroes if EOF is encountered.
4007 */
4008 io_len = ptob(btopr(io_len));
4009
4010 bp = pageio_setup(pp, io_len, vp, B_READ);
4011 ASSERT(bp != NULL);
4012
4013 /*
4014 * pageio_setup should have set b_addr to 0. This is correct since
4015 * we want to do I/O on a page boundary. bp_mapin() will use this addr
4016 * to calculate an offset, and then set b_addr to the kernel virtual
4017 * address it allocated for us.
4018 */
4019 ASSERT(bp->b_un.b_addr == 0);
4020
4021 bp->b_edev = 0;
4022 bp->b_dev = 0;
4023 bp->b_lblkno = lbtodb(io_off);
4024 bp->b_file = vp;
4025 bp->b_offset = (offset_t)blkoff;
4026 bp_mapin(bp);
4027
4028 /*
4029 * If doing a write beyond what we believe is EOF, don't bother trying
4030 * to read the pages from the server, we'll just zero the pages here.
4031 * We don't check that the rw flag is S_WRITE here because some
4032 * implementations may attempt a read access to the buffer before
4033 * copying data.
4034 */
4035 mutex_enter(&rp->r_statelock);
4036 if (io_off >= rp->r_size && seg == segkmap) {
4037 mutex_exit(&rp->r_statelock);
4038 bzero(bp->b_un.b_addr, io_len);
4039 error = 0;
4040 } else {
4041 mutex_exit(&rp->r_statelock);
4042 error = nfs_bio(bp, cr);
4043 if (error == NFS_EOF)
4044 error = 0;
4045 }
4046
4047 /*
4048 * Unmap the buffer before freeing it.
4049 */
4050 bp_mapout(bp);
4051 pageio_done(bp);
4052
4053 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4054
4055 /*
	 * In case of error, set the readahead offset to the lowest
	 * offset; pvn_read_done() calls VN_DISPOSE to destroy the pages.
4059 */
	if (error && rp->r_nextr > io_off) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 && off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */
/* ARGSUSED */
static int
nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	rnode_t *rp;

	ASSERT(cr != NULL);

	/*
	 * XXX - Why should this check be made here?
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
		return (0);

	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	ASSERT(off <= MAXOFF32_T);

	rp = VTOR(vp);
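	/*
	 * Bump r_count while the flush is in progress; operations such
	 * as nfs_frlock() wait on r_cv for r_count to drain before they
	 * flush or invalidate the cache.
	 */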
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	error = nfs_putpages(vp, off, len, flags, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	return (error);
}

/*
 * Write out a single page, possibly klustering adjacent dirty pages.
 */
int
nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
	int flags, cred_t *cr)
{
	u_offset_t io_off;
	u_offset_t lbn_off;
	u_offset_t lbn;
	size_t io_len;
	uint_t bsize;
	int error;
	rnode_t *rp;

	ASSERT(!vn_is_readonly(vp));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);
	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	ASSERT(pp->p_offset <= MAXOFF32_T);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks. If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */
	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * pvn_write_kluster shouldn't have returned a page with an
	 * offset lower than that of the original page we were given.
	 * Verify that.
	 */
	ASSERT((pp->p_offset / bsize) >= lbn);

	/*
	 * pp now heads the list of dirty pages that were kept and
	 * marked for write back; pvn_write_kluster() has already
	 * handled invalidation and freeing of the pages that were
	 * not dirty. Check for page length rounding problems.
	 */
	if (io_off + io_len > lbn_off + bsize) {
		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
		io_len = lbn_off + bsize - io_off;
	}
	/*
	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file. When the uiomove() completes the r_size is
	 * updated and the RMODINPROGRESS flag is cleared.
	 *
	 * Without this handshaking, it is possible that nfs(3)_bio()
	 * picks up the old value of r_size before the uiomove() in
	 * writerp() completes. This will result in the write through
	 * nfs(3)_bio() being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd). When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
	 * checked. This will still be the old size. Therefore the page will
	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
	 * the page will be found to be clean and the write will be dropped.
	 */
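	/*
	 * For reference, the writer side of this handshake is, in
	 * outline (a sketch based on the description above, not the
	 * literal writerp() code):
	 *
	 *	mutex_enter(&rp->r_statelock);
	 *	rp->r_flags |= RMODINPROGRESS;
	 *	rp->r_modaddr = <base of the MAXBSIZE window being written>;
	 *	mutex_exit(&rp->r_statelock);
	 *
	 *	error = uiomove(...);		copy the user data in
	 *
	 *	mutex_enter(&rp->r_statelock);
	 *	if (<the write extended the file>)
	 *		rp->r_size = <the new size>;
	 *	rp->r_flags &= ~RMODINPROGRESS;
	 *	mutex_exit(&rp->r_statelock);
	 *
	 * The check below therefore only has to worry about the window
	 * between setting RMODINPROGRESS and clearing it.
	 */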
	if (rp->r_flags & RMODINPROGRESS) {
		mutex_enter(&rp->r_statelock);
		if ((rp->r_flags & RMODINPROGRESS) &&
		    rp->r_modaddr + MAXBSIZE > io_off &&
		    rp->r_modaddr < io_off + io_len) {
			page_t *plist;

			/*
			 * A write is in progress for this region of the file.
			 * If we did not detect RMODINPROGRESS here then this
			 * path through nfs_putapage() would eventually go to
			 * nfs(3)_bio() and may not write out all of the data
			 * in the pages. We end up losing data. So we decide
			 * to set the modified bit on each page in the page
			 * list and mark the rnode with RDIRTY. This write
			 * will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= RDIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	if (flags & B_ASYNC) {
		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs_sync_putapage);
	} else
		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}

static int
nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	flags |= B_WRITE;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		if (!(rp->r_flags & ROUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= ROUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful. This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread. It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them. Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & ROUTOFSPACE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~ROUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
	caller_context_t *ct)
{
	struct segvn_crargs vn_a;
	int error;
	rnode_t *rp;
	struct vattr va;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off > MAXOFF32_T)
		return (EFBIG);

	if (off < 0 || off + len < 0)
		return (ENXIO);

	if (vp->v_type != VREG)
		return (ENODEV);

	/*
	 * If there is cached data and if close-to-open consistency
	 * checking is not turned off and if the file system is not
	 * mounted readonly, then force an over the wire getattr.
	 * Otherwise, just invoke nfsgetattr to get a copy of the
	 * attributes. The attribute cache will be used unless it
	 * is timed out and if it is, then an over the wire getattr
	 * will be issued.
	 */
	va.va_mask = AT_ALL;
	if (vn_has_cached_data(vp) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
		error = nfs_getattr_otw(vp, &va, cr);
	else
		error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Check to see if the vnode is currently marked as not cachable.
	 * This means portions of the file are locked (through VOP_FRLOCK).
	 * In this case the map request must be refused. We use
	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
	 */
	rp = VTOR(vp);

	/*
	 * Atomically increment r_inmap after acquiring r_rwlock. The
	 * idea here is to acquire r_rwlock to block read/write and
	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
	 * that we are in nfs_map(). Because r_rwlock is acquired here,
	 * in the correct order, we avoid the deadlock that would occur
	 * if nfs_addmap() acquired it out of order.
	 *
	 * Since we are not protecting r_inmap by any lock, we do not
	 * hold any lock when we decrement it. We atomically decrement
	 * r_inmap after we release r_lkserlock.
	 */

	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
		return (EINTR);
	atomic_add_int(&rp->r_inmap, 1);
	nfs_rw_exit(&rp->r_rwlock);

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
		atomic_add_int(&rp->r_inmap, -1);
		return (EINTR);
	}
	if (vp->v_flag & VNOCACHE) {
		error = EAGAIN;
		goto done;
	}

	/*
	 * Don't allow concurrent locks and mapping if mandatory locking is
	 * enabled.
	 */
	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
	    MANDLOCK(vp, va.va_mode)) {
		error = EAGAIN;
		goto done;
	}

	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		goto done;
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = (flags & MAP_TYPE);
	vn_a.prot = (uchar_t)prot;
	vn_a.maxprot = (uchar_t)maxprot;
	vn_a.flags = (flags & ~MAP_TYPE);
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);

done:
	nfs_rw_exit(&rp->r_lkserlock);
	atomic_add_int(&rp->r_inmap, -1);
	return (error);
}

/* ARGSUSED */
static int
nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

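	/*
	 * Keep a page-granularity count of mappings; the delmap
	 * callback decrements it by the same btopr(len) amount when
	 * the mapping goes away.
	 */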
	rp = VTOR(vp);
	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));

	return (0);
}

/* ARGSUSED */
static int
nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	netobj lm_fh;
	int rc;
	u_offset_t start, end;
	rnode_t *rp;
	int error = 0, intr = INTR(vp);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
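		/*
		 * Never allow an unlock request to be interrupted;
		 * giving up part way through would leave an orphaned
		 * lock on the server (the cache-flush logic below
		 * makes the same trade-off).
		 */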
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		if (offset > MAXOFF32_T)
			return (EFBIG);
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock. However, we can't call
			 * lm_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * lm_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!lm_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!lm_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish. For new
	 * locks, this is so that the process gets the latest bits from the
	 * server. For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked. If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set. But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
				    == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	lm_fh.n_len = sizeof (fhandle_t);
	lm_fh.n_bytes = (char *)VTOFH(vp);

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);

	if (rc == 0)
		nfs_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);
	return (rc);
}

/*
 * Free storage space associated with the specified vnode. The portion
 * to be freed is specified by bfp->l_start and bfp->l_len (already
 * normalized to a "whence" of 0).
 *
 * This is an experimental facility whose continued existence is not
 * guaranteed. Currently, we only support the special case
 * of l_len == 0, meaning free to end of file.
 */
/* ARGSUSED */
static int
nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
	offset_t offset, cred_t *cr, caller_context_t *ct)
{
	int error;

	ASSERT(vp->v_type == VREG);
	if (cmd != F_FREESP)
		return (EINVAL);

	if (offset > MAXOFF32_T)
		return (EFBIG);

	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
	    (bfp->l_len > MAXOFF32_T))
		return (EFBIG);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

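	/*
	 * convoff() normalizes bfp to a "whence" of 0, so that l_start
	 * becomes an absolute file offset.
	 */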
	error = convoff(vp, bfp, 0, offset);
	if (!error) {
		ASSERT(bfp->l_start >= 0);
		if (bfp->l_len == 0) {
			struct vattr va;

			/*
			 * ftruncate should not change the ctime and
			 * mtime if we truncate the file to its
			 * previous size.
			 */
			va.va_mask = AT_SIZE;
			error = nfsgetattr(vp, &va, cr);
			if (error || va.va_size == bfp->l_start)
				return (error);
			va.va_mask = AT_SIZE;
			va.va_size = bfp->l_start;
			error = nfssetattr(vp, &va, 0, cr);
		} else
			error = EINVAL;
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{

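	/*
	 * An NFS vnode does not wrap another vnode, so there is no
	 * "real" vnode to return.
	 */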
	return (EINVAL);
}

/*
 * Setup and add an address space callback to do the work of the delmap call.
 * The callback will (and must be) deleted in the actual callback function.
 *
 * This is done in order to take care of the problem that we have with holding
 * the address space's a_lock for a long period of time (e.g. if the NFS server
 * is down). Callbacks will be executed in the address space code while the
 * a_lock is not held. Holding the address space's a_lock causes things such
 * as ps and fork to hang because they are trying to acquire this lock as well.
 */
/* ARGSUSED */
static int
nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
	caller_context_t *ct)
{
	int caller_found;
	int error;
	rnode_t *rp;
	nfs_delmap_args_t *dmapp;
	nfs_delmapcall_t *delmap_call;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);
	/*
	 * A process may not change zones if it has NFS pages mmap'ed
	 * in, so we can't legitimately get here from the wrong zone.
	 */
	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * The way that the address space of this process deletes its mapping
	 * of this file is via the following call chains:
	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
	 *
	 * With the use of address space callbacks we are allowed to drop the
	 * address space lock, a_lock, while executing the NFS operations that
	 * need to go over the wire. Returning EAGAIN to the caller of this
	 * function is what drives the execution of the callback that we add
	 * below. The callback will be executed by the address space code
	 * after dropping the a_lock. When the callback is finished, since
	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
	 * is called again on the same segment to finish the rest of the work
	 * that needs to happen during unmapping.
	 *
	 * This action of calling back into the segment driver causes
	 * nfs_delmap() to get called again, but since the callback was
	 * already executed at this point, it already did the work and there
	 * is nothing left for us to do.
	 *
	 * To Summarize:
	 * - The first time nfs_delmap is called by the current thread is when
	 * we add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time in this call chain when nfs_delmap is called we
	 * will find this caller in the delmap caller list and realize there
	 * is no more work to do thus removing this caller from the list and
	 * returning the error that was set in the callback execution.
	 */
	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
	if (caller_found) {
		/*
		 * 'error' is from the actual delmap operations. To avoid
		 * hangs, we need to handle the return of EAGAIN differently
		 * since this is what drives the callback execution.
		 * In this case, we don't want to return EAGAIN and do the
		 * callback execution because there are none to execute.
		 */
		if (error == EAGAIN)
			return (0);
		else
			return (error);
	}

	/* current caller was not in the list */
	delmap_call = nfs_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);

	dmapp->vp = vp;
	dmapp->off = off;
	dmapp->addr = addr;
	dmapp->len = len;
	dmapp->prot = prot;
	dmapp->maxprot = maxprot;
	dmapp->flags = flags;
	dmapp->cr = cr;
	dmapp->caller = delmap_call;

	error = as_add_callback(as, nfs_delmap_callback, dmapp,
	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);

	return (error ? error : EAGAIN);
}

/*
 * Remove some pages from an mmap'd vnode. Just update the
 * count of pages. If doing close-to-open, then flush all
 * of the pages associated with this file. Otherwise, start
 * an asynchronous page flush to write out any dirty pages.
 * This will also associate a credential with the rnode which
 * can be used to write the pages.
 */
/* ARGSUSED */
static void
nfs_delmap_callback(struct as *as, void *arg, uint_t event)
{
	int error;
	rnode_t *rp;
	mntinfo_t *mi;
	nfs_delmap_args_t *dmapp = (nfs_delmap_args_t *)arg;

	rp = VTOR(dmapp->vp);
	mi = VTOMI(dmapp->vp);

	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush if there are pages, the file system
	 * was not mounted readonly, the segment was mapped shared, and
	 * the pages themselves were writeable.
	 */
	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= RDIRTY;
		mutex_exit(&rp->r_statelock);
		/*
		 * If close-to-open consistency is disabled, or if this
		 * is a cross-zone access (where a sync putpage won't
		 * work), the best we can do is try an async putpage.
		 * That seems better than something more draconian such
		 * as discarding the dirty pages.
		 */
		if ((mi->mi_flags & MI_NOCTO) ||
		    nfs_zone() != mi->mi_zone)
			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
			    B_ASYNC, dmapp->cr, NULL);
		else
			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
			    0, dmapp->cr, NULL);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		error = 0;

	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr, NULL);

	dmapp->caller->error = error;
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
}

/* ARGSUSED */
static int
nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
	caller_context_t *ct)
{
	int error = 0;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * This looks a little weird because it is written in a general
	 * manner, yet only a few of the cases are actually used. If
	 * cntl() ever gets widely used, the outer switch will make
	 * more sense.
	 */

	switch (cmd) {

	/*
	 * Large file spec: answer this (newer) query with a hardcoded
	 * constant based on the protocol; NFS Version 2 file offsets
	 * are 32 bits wide.
	 */
	case _PC_FILESIZEBITS:
		*valp = 32;
		return (0);

	case _PC_LINK_MAX:
	case _PC_NAME_MAX:
	case _PC_PATH_MAX:
	case _PC_SYMLINK_MAX:
	case _PC_CHOWN_RESTRICTED:
	case _PC_NO_TRUNC: {
		mntinfo_t *mi;
		struct pathcnf *pc;

		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
			return (EINVAL);
		error = _PC_ISSET(cmd, pc->pc_mask); /* error or bool */
		switch (cmd) {
		case _PC_LINK_MAX:
			*valp = pc->pc_link_max;
			break;
		case _PC_NAME_MAX:
			*valp = pc->pc_name_max;
			break;
		case _PC_PATH_MAX:
		case _PC_SYMLINK_MAX:
			*valp = pc->pc_path_max;
			break;
		case _PC_CHOWN_RESTRICTED:
			/*
			 * if we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
		case _PC_NO_TRUNC:
			/*
			 * if we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
		}
		return (error ? EINVAL : 0);
	}

	case _PC_XATTR_EXISTS:
		*valp = 0;
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			vnode_t *avp;
			rnode_t *rp;
			mntinfo_t *mi = VTOMI(vp);

			if (!(mi->mi_flags & MI_EXTATTR))
				return (0);

			rp = VTOR(vp);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
			    INTR(vp)))
				return (EINTR);

			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
			if (error || avp == NULL)
				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);

			nfs_rw_exit(&rp->r_rwlock);

			if (error == 0 && avp != NULL) {
				error = do_xattr_exists_check(avp, valp, cr);
				VN_RELE(avp);
			}
		}
		return (error ? EINVAL : 0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACLENT_ENABLED;
		return (0);

	default:
		return (EINVAL);
	}
}

/*
 * Called by async thread to do synchronous pageio. Do the i/o, wait
 * for it to complete, and cleanup the page list when done.
 */
static int
nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
	if (flags & B_READ)
		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
	else
		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
	return (error);
}

/* ARGSUSED */
static int
nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr, caller_context_t *ct)
{
	int error;
	rnode_t *rp;

	if (pp == NULL)
		return (EINVAL);

	if (io_off > MAXOFF32_T)
		return (EFBIG);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	rp = VTOR(vp);
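	/*
	 * As in nfs_putpage(), hold r_count across the i/o so that
	 * threads waiting on r_cv (e.g. nfs_frlock()) can tell that
	 * paging i/o is in progress.
	 */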
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (flags & B_ASYNC) {
		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
		    nfs_sync_pageio);
	} else
		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	return (error);
}

/* ARGSUSED */
static int
nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
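	/*
	 * Note that acl_setacl2() may clear MI_ACL if it discovers that
	 * the server does not actually support the NFS_ACL protocol;
	 * re-checking the flag afterwards lets us fall through to
	 * ENOSYS in that case.
	 */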
	if (mi->mi_flags & MI_ACL) {
		error = acl_setacl2(vp, vsecattr, flag, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (ENOSYS);
}

/* ARGSUSED */
static int
nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
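	/*
	 * As in nfs_setsecattr(), acl_getacl2() may clear MI_ACL if the
	 * server lacks NFS_ACL support; in that case fall through to
	 * fs_fab_acl(), which fabricates an ACL from the file's mode
	 * bits.
	 */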
	if (mi->mi_flags & MI_ACL) {
		error = acl_getacl2(vp, vsecattr, flag, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
}

/* ARGSUSED */
static int
nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	struct shrlock nshr;
	struct nfs_owner nfs_owner;
	netobj lm_fh;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 */
	if (cmd == F_SHARE &&
	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		lm_fh.n_len = sizeof (fhandle_t);
		lm_fh.n_bytes = (char *)VTOFH(vp);

		/*
		 * If we are passed an owner that is too large to fit in
		 * an nfs_owner, it is likely a recursive call from the
		 * lock manager client, so pass it straight through. If
		 * it is not an nfs_owner, simply return an error.
		 */
		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
			if (((struct nfs_owner *)shr->s_owner)->magic !=
			    NFS_OWNER_MAGIC)
				return (EINVAL);

			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
				error = set_errno(error);
			}
			return (error);
		}
		/*
		 * A remote share reservation's owner is a combination of
		 * a magic number, the hostname, and the local owner.
		 */
		bzero(&nfs_owner, sizeof (nfs_owner));
		nfs_owner.magic = NFS_OWNER_MAGIC;
		(void) strncpy(nfs_owner.hname, uts_nodename(),
		    sizeof (nfs_owner.hname));
		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
		nshr.s_access = shr->s_access;
		nshr.s_deny = shr->s_deny;
		nshr.s_sysid = 0;
		nshr.s_pid = ttoproc(curthread)->p_pid;
		nshr.s_own_len = sizeof (nfs_owner);
		nshr.s_owner = (caddr_t)&nfs_owner;

		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
			error = set_errno(error);
		}

		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}