Print this page
7378 exported_lock held during nfs4 compound processing
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 28 * All rights reserved.
29 29 */
30 30
31 31 #include <sys/param.h>
32 32 #include <sys/types.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/cred.h>
35 35 #include <sys/buf.h>
36 36 #include <sys/vfs.h>
37 37 #include <sys/vnode.h>
38 38 #include <sys/uio.h>
39 39 #include <sys/stat.h>
40 40 #include <sys/errno.h>
41 41 #include <sys/sysmacros.h>
42 42 #include <sys/statvfs.h>
43 43 #include <sys/kmem.h>
44 44 #include <sys/kstat.h>
45 45 #include <sys/dirent.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/vtrace.h>
49 49 #include <sys/mode.h>
50 50 #include <sys/acl.h>
51 51 #include <sys/nbmlock.h>
52 52 #include <sys/policy.h>
53 53 #include <sys/sdt.h>
54 54
55 55 #include <rpc/types.h>
56 56 #include <rpc/auth.h>
57 57 #include <rpc/svc.h>
58 58
59 59 #include <nfs/nfs.h>
60 60 #include <nfs/export.h>
61 61 #include <nfs/nfs_cmd.h>
62 62
63 63 #include <vm/hat.h>
64 64 #include <vm/as.h>
65 65 #include <vm/seg.h>
66 66 #include <vm/seg_map.h>
67 67 #include <vm/seg_kmem.h>
68 68
69 69 #include <sys/strsubr.h>
70 70
71 71 /*
72 72 * These are the interface routines for the server side of the
73 73 * Network File System. See the NFS version 2 protocol specification
74 74 * for a description of this interface.
75 75 */
76 76
/*
 * Forward declarations of file-local helpers used by the routines below.
 * sattr_to_vattr() fills a struct vattr from the over-the-wire nfssattr
 * and returns an error value that callers pass to puterrno(); acl_perm()
 * is called on freshly fetched attributes before they are converted with
 * vattr_to_nattr().  Their definitions are not in this part of the file.
 */
77 77 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
78 78 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 79 cred_t *);
80 80
/*
 * IFMT is the mask covering the file-type bits; IFCHR, IFBLK and IFSOCK
 * are individual type values that fall within that mask.
 * NOTE(review): the users of these macros are not in this part of the
 * file -- presumably the attribute conversion helpers; confirm.
 */
81 81 /*
82 82 * Some "over the wire" UNIX file types. These are encoded
83 83 * into the mode. This needs to be fixed in the next rev.
84 84 */
85 85 #define IFMT 0170000 /* type of file */
86 86 #define IFCHR 0020000 /* character special */
87 87 #define IFBLK 0060000 /* block special */
88 88 #define IFSOCK 0140000 /* socket */
89 89
/*
 * Caller id stored in the cc_caller_id field of every caller_context_t
 * that the NFSv2 routines in this file hand to VOP calls.  It is not
 * assigned in the code visible here.
 */
90 90 u_longlong_t nfs2_srv_caller_id;
91 91
92 92 /*
93 93 * Get file attributes.
94 94 * Returns the current attributes of the file with the given fhandle.
95 95 */
96 96 /* ARGSUSED */
97 97 void
98 98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 99 struct svc_req *req, cred_t *cr, bool_t ro)
100 100 {
101 101 int error;
102 102 vnode_t *vp;
103 103 struct vattr va;
104 104
105 105 vp = nfs_fhtovp(fhp, exi);
106 106 if (vp == NULL) {
107 107 ns->ns_status = NFSERR_STALE;
108 108 return;
109 109 }
110 110
111 111 /*
112 112 * Do the getattr.
113 113 */
114 114 va.va_mask = AT_ALL; /* we want all the attributes */
115 115
116 116 error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 117
118 118 /* check for overflows */
119 119 if (!error) {
120 120 /* Lie about the object type for a referral */
121 121 if (vn_is_nfs_reparse(vp, cr))
122 122 va.va_type = VLNK;
123 123
124 124 acl_perm(vp, exi, &va, cr);
125 125 error = vattr_to_nattr(&va, &ns->ns_attr);
126 126 }
127 127
128 128 VN_RELE(vp);
129 129
130 130 ns->ns_status = puterrno(error);
131 131 }
132 132 void *
133 133 rfs_getattr_getfh(fhandle_t *fhp)
134 134 {
135 135 return (fhp);
136 136 }
137 137
138 138 /*
139 139 * Set file attributes.
140 140 * Sets the attributes of the file with the given fhandle. Returns
141 141 * the new attributes.
142 142 */
143 143 /* ARGSUSED */
144 144 void
145 145 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
146 146 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
147 147 {
148 148 int error;
149 149 int flag;
150 150 int in_crit = 0;
151 151 vnode_t *vp;
152 152 struct vattr va;
153 153 struct vattr bva;
154 154 struct flock64 bf;
155 155 caller_context_t ct;
156 156
157 157
158 158 vp = nfs_fhtovp(&args->saa_fh, exi);
159 159 if (vp == NULL) {
160 160 ns->ns_status = NFSERR_STALE;
161 161 return;
162 162 }
163 163
164 164 if (rdonly(ro, vp)) {
165 165 VN_RELE(vp);
166 166 ns->ns_status = NFSERR_ROFS;
167 167 return;
168 168 }
169 169
170 170 error = sattr_to_vattr(&args->saa_sa, &va);
171 171 if (error) {
172 172 VN_RELE(vp);
173 173 ns->ns_status = puterrno(error);
174 174 return;
175 175 }
176 176
177 177 /*
178 178 * If the client is requesting a change to the mtime,
179 179 * but the nanosecond field is set to 1 billion, then
180 180 * this is a flag to the server that it should set the
181 181 * atime and mtime fields to the server's current time.
182 182 * The 1 billion number actually came from the client
183 183 * as 1 million, but the units in the over the wire
184 184 * request are microseconds instead of nanoseconds.
185 185 *
186 186 * This is an overload of the protocol and should be
187 187 * documented in the NFS Version 2 protocol specification.
188 188 */
189 189 if (va.va_mask & AT_MTIME) {
190 190 if (va.va_mtime.tv_nsec == 1000000000) {
191 191 gethrestime(&va.va_mtime);
192 192 va.va_atime = va.va_mtime;
193 193 va.va_mask |= AT_ATIME;
194 194 flag = 0;
195 195 } else
196 196 flag = ATTR_UTIME;
197 197 } else
198 198 flag = 0;
199 199
200 200 /*
201 201 * If the filesystem is exported with nosuid, then mask off
202 202 * the setuid and setgid bits.
203 203 */
204 204 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
205 205 (exi->exi_export.ex_flags & EX_NOSUID))
206 206 va.va_mode &= ~(VSUID | VSGID);
207 207
208 208 ct.cc_sysid = 0;
209 209 ct.cc_pid = 0;
210 210 ct.cc_caller_id = nfs2_srv_caller_id;
211 211 ct.cc_flags = CC_DONTBLOCK;
212 212
213 213 /*
214 214 * We need to specially handle size changes because it is
215 215 * possible for the client to create a file with modes
216 216 * which indicate read-only, but with the file opened for
217 217 * writing. If the client then tries to set the size of
218 218 * the file, then the normal access checking done in
219 219 * VOP_SETATTR would prevent the client from doing so,
220 220 * although it should be legal for it to do so. To get
221 221 * around this, we do the access checking for ourselves
222 222 * and then use VOP_SPACE which doesn't do the access
223 223 * checking which VOP_SETATTR does. VOP_SPACE can only
224 224 * operate on VREG files, let VOP_SETATTR handle the other
225 225 * extremely rare cases.
226 226 * Also the client should not be allowed to change the
227 227 * size of the file if there is a conflicting non-blocking
228 228 * mandatory lock in the region of change.
229 229 */
230 230 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
231 231 if (nbl_need_check(vp)) {
232 232 nbl_start_crit(vp, RW_READER);
233 233 in_crit = 1;
234 234 }
235 235
236 236 bva.va_mask = AT_UID | AT_SIZE;
237 237
238 238 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
239 239
240 240 if (error) {
241 241 if (in_crit)
242 242 nbl_end_crit(vp);
243 243 VN_RELE(vp);
244 244 ns->ns_status = puterrno(error);
245 245 return;
246 246 }
247 247
248 248 if (in_crit) {
249 249 u_offset_t offset;
250 250 ssize_t length;
251 251
252 252 if (va.va_size < bva.va_size) {
253 253 offset = va.va_size;
254 254 length = bva.va_size - va.va_size;
255 255 } else {
256 256 offset = bva.va_size;
257 257 length = va.va_size - bva.va_size;
258 258 }
259 259 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
260 260 NULL)) {
261 261 error = EACCES;
262 262 }
263 263 }
264 264
265 265 if (crgetuid(cr) == bva.va_uid && !error &&
266 266 va.va_size != bva.va_size) {
267 267 va.va_mask &= ~AT_SIZE;
268 268 bf.l_type = F_WRLCK;
269 269 bf.l_whence = 0;
270 270 bf.l_start = (off64_t)va.va_size;
271 271 bf.l_len = 0;
272 272 bf.l_sysid = 0;
273 273 bf.l_pid = 0;
274 274
275 275 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
276 276 (offset_t)va.va_size, cr, &ct);
277 277 }
278 278 if (in_crit)
279 279 nbl_end_crit(vp);
280 280 } else
281 281 error = 0;
282 282
283 283 /*
284 284 * Do the setattr.
285 285 */
286 286 if (!error && va.va_mask) {
287 287 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
288 288 }
289 289
290 290 /*
291 291 * check if the monitor on either vop_space or vop_setattr detected
292 292 * a delegation conflict and if so, mark the thread flag as
293 293 * wouldblock so that the response is dropped and the client will
294 294 * try again.
295 295 */
296 296 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
297 297 VN_RELE(vp);
298 298 curthread->t_flag |= T_WOULDBLOCK;
299 299 return;
300 300 }
301 301
302 302 if (!error) {
303 303 va.va_mask = AT_ALL; /* get everything */
304 304
305 305 error = rfs4_delegated_getattr(vp, &va, 0, cr);
306 306
307 307 /* check for overflows */
308 308 if (!error) {
309 309 acl_perm(vp, exi, &va, cr);
310 310 error = vattr_to_nattr(&va, &ns->ns_attr);
311 311 }
312 312 }
313 313
314 314 ct.cc_flags = 0;
315 315
316 316 /*
317 317 * Force modified metadata out to stable storage.
318 318 */
319 319 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
320 320
321 321 VN_RELE(vp);
322 322
323 323 ns->ns_status = puterrno(error);
324 324 }
325 325 void *
326 326 rfs_setattr_getfh(struct nfssaargs *args)
327 327 {
328 328 return (&args->saa_fh);
329 329 }
330 330
331 331 /*
332 332 * Directory lookup.
333 333 * Returns an fhandle and file attributes for file name in a directory.
334 334 */
335 335 /* ARGSUSED */
336 336 void
337 337 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
338 338 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
339 339 {
340 340 int error;
341 341 vnode_t *dvp;
342 342 vnode_t *vp;
343 343 struct vattr va;
344 344 fhandle_t *fhp = da->da_fhandle;
345 345 struct sec_ol sec = {0, 0};
346 346 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
347 347 char *name;
348 348 struct sockaddr *ca;
349 349
350 350 /*
351 351 * Trusted Extension doesn't support NFSv2. MOUNT
352 352 * will reject v2 clients. Need to prevent v2 client
353 353 * access via WebNFS here.
354 354 */
355 355 if (is_system_labeled() && req->rq_vers == 2) {
356 356 dr->dr_status = NFSERR_ACCES;
357 357 return;
358 358 }
359 359
360 360 /*
361 361 * Disallow NULL paths
362 362 */
363 363 if (da->da_name == NULL || *da->da_name == '\0') {
364 364 dr->dr_status = NFSERR_ACCES;
365 365 return;
366 366 }
367 367
368 368 /*
369 369 * Allow lookups from the root - the default
370 370 * location of the public filehandle.
371 371 */
372 372 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
373 373 dvp = rootdir;
374 374 VN_HOLD(dvp);
375 375 } else {
376 376 dvp = nfs_fhtovp(fhp, exi);
377 377 if (dvp == NULL) {
378 378 dr->dr_status = NFSERR_STALE;
379 379 return;
380 380 }
381 381 }
382 382
383 383 /*
384 384 * Not allow lookup beyond root.
385 385 * If the filehandle matches a filehandle of the exi,
386 386 * then the ".." refers beyond the root of an exported filesystem.
387 387 */
388 388 if (strcmp(da->da_name, "..") == 0 &&
389 389 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
390 390 VN_RELE(dvp);
391 391 dr->dr_status = NFSERR_NOENT;
392 392 return;
393 393 }
394 394
395 395 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
396 396 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
397 397 MAXPATHLEN);
398 398
399 399 if (name == NULL) {
400 400 dr->dr_status = NFSERR_ACCES;
401 401 return;
402 402 }
403 403
404 404 /*
405 405 * If the public filehandle is used then allow
406 406 * a multi-component lookup, i.e. evaluate
407 407 * a pathname and follow symbolic links if
408 408 * necessary.
409 409 *
410 410 * This may result in a vnode in another filesystem
411 411 * which is OK as long as the filesystem is exported.
412 412 */
413 413 if (PUBLIC_FH2(fhp)) {
414 414 publicfh_flag = TRUE;
415 415 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
416 416 &sec);
417 417 } else {
418 418 /*
419 419 * Do a normal single component lookup.
420 420 */
421 421 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
422 422 NULL, NULL, NULL);
423 423 }
424 424
425 425 if (name != da->da_name)
426 426 kmem_free(name, MAXPATHLEN);
427 427
428 428
429 429 if (!error) {
430 430 va.va_mask = AT_ALL; /* we want everything */
431 431
432 432 error = rfs4_delegated_getattr(vp, &va, 0, cr);
433 433
434 434 /* check for overflows */
435 435 if (!error) {
436 436 acl_perm(vp, exi, &va, cr);
437 437 error = vattr_to_nattr(&va, &dr->dr_attr);
438 438 if (!error) {
439 439 if (sec.sec_flags & SEC_QUERY)
440 440 error = makefh_ol(&dr->dr_fhandle, exi,
441 441 sec.sec_index);
442 442 else {
443 443 error = makefh(&dr->dr_fhandle, vp,
444 444 exi);
445 445 if (!error && publicfh_flag &&
446 446 !chk_clnt_sec(exi, req))
447 447 auth_weak = TRUE;
448 448 }
449 449 }
450 450 }
451 451 VN_RELE(vp);
452 452 }
453 453
454 454 VN_RELE(dvp);
455 455
456 456 /*
457 457 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
458 458 * and have obtained a new exportinfo in exi which needs to be
459 459 * released. Note the the original exportinfo pointed to by exi
460 460 * will be released by the caller, comon_dispatch.
461 461 */
462 462 if (publicfh_flag && exi != NULL)
463 463 exi_rele(exi);
464 464
465 465 /*
466 466 * If it's public fh, no 0x81, and client's flavor is
467 467 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
468 468 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
469 469 */
470 470 if (auth_weak)
471 471 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
472 472 else
473 473 dr->dr_status = puterrno(error);
474 474 }
475 475 void *
476 476 rfs_lookup_getfh(struct nfsdiropargs *da)
477 477 {
478 478 return (da->da_fhandle);
479 479 }
480 480
481 481 /*
482 482 * Read symbolic link.
483 483 * Returns the string in the symbolic link at the given fhandle.
484 484 */
485 485 /* ARGSUSED */
486 486 void
487 487 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
488 488 struct svc_req *req, cred_t *cr, bool_t ro)
489 489 {
490 490 int error;
491 491 struct iovec iov;
492 492 struct uio uio;
493 493 vnode_t *vp;
494 494 struct vattr va;
495 495 struct sockaddr *ca;
496 496 char *name = NULL;
497 497 int is_referral = 0;
498 498
499 499 vp = nfs_fhtovp(fhp, exi);
500 500 if (vp == NULL) {
501 501 rl->rl_data = NULL;
502 502 rl->rl_status = NFSERR_STALE;
503 503 return;
504 504 }
505 505
506 506 va.va_mask = AT_MODE;
507 507
508 508 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
509 509
510 510 if (error) {
511 511 VN_RELE(vp);
512 512 rl->rl_data = NULL;
513 513 rl->rl_status = puterrno(error);
514 514 return;
515 515 }
516 516
517 517 if (MANDLOCK(vp, va.va_mode)) {
518 518 VN_RELE(vp);
519 519 rl->rl_data = NULL;
520 520 rl->rl_status = NFSERR_ACCES;
521 521 return;
522 522 }
523 523
524 524 /* We lied about the object type for a referral */
525 525 if (vn_is_nfs_reparse(vp, cr))
526 526 is_referral = 1;
527 527
528 528 /*
529 529 * XNFS and RFC1094 require us to return ENXIO if argument
530 530 * is not a link. BUGID 1138002.
531 531 */
532 532 if (vp->v_type != VLNK && !is_referral) {
533 533 VN_RELE(vp);
534 534 rl->rl_data = NULL;
535 535 rl->rl_status = NFSERR_NXIO;
536 536 return;
537 537 }
538 538
539 539 /*
540 540 * Allocate data for pathname. This will be freed by rfs_rlfree.
541 541 */
542 542 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
543 543
544 544 if (is_referral) {
545 545 char *s;
546 546 size_t strsz;
547 547
548 548 /* Get an artificial symlink based on a referral */
549 549 s = build_symlink(vp, cr, &strsz);
550 550 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
551 551 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
552 552 vnode_t *, vp, char *, s);
553 553 if (s == NULL)
554 554 error = EINVAL;
555 555 else {
556 556 error = 0;
557 557 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
558 558 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
559 559 kmem_free(s, strsz);
560 560 }
561 561
562 562 } else {
563 563
564 564 /*
565 565 * Set up io vector to read sym link data
566 566 */
567 567 iov.iov_base = rl->rl_data;
568 568 iov.iov_len = NFS_MAXPATHLEN;
569 569 uio.uio_iov = &iov;
570 570 uio.uio_iovcnt = 1;
571 571 uio.uio_segflg = UIO_SYSSPACE;
572 572 uio.uio_extflg = UIO_COPY_CACHED;
573 573 uio.uio_loffset = (offset_t)0;
574 574 uio.uio_resid = NFS_MAXPATHLEN;
575 575
576 576 /*
577 577 * Do the readlink.
578 578 */
579 579 error = VOP_READLINK(vp, &uio, cr, NULL);
580 580
581 581 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
582 582
583 583 if (!error)
584 584 rl->rl_data[rl->rl_count] = '\0';
585 585
586 586 }
587 587
588 588
589 589 VN_RELE(vp);
590 590
591 591 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
592 592 name = nfscmd_convname(ca, exi, rl->rl_data,
593 593 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
594 594
595 595 if (name != NULL && name != rl->rl_data) {
596 596 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
597 597 rl->rl_data = name;
598 598 }
599 599
600 600 /*
601 601 * XNFS and RFC1094 require us to return ENXIO if argument
602 602 * is not a link. UFS returns EINVAL if this is the case,
603 603 * so we do the mapping here. BUGID 1138002.
604 604 */
605 605 if (error == EINVAL)
606 606 rl->rl_status = NFSERR_NXIO;
607 607 else
608 608 rl->rl_status = puterrno(error);
609 609
610 610 }
611 611 void *
612 612 rfs_readlink_getfh(fhandle_t *fhp)
613 613 {
614 614 return (fhp);
615 615 }
616 616 /*
617 617 * Free data allocated by rfs_readlink
618 618 */
619 619 void
620 620 rfs_rlfree(struct nfsrdlnres *rl)
621 621 {
622 622 if (rl->rl_data != NULL)
623 623 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
624 624 }
625 625
/*
 * Forward declaration; rfs_read calls this on the RDMA write-list path
 * and treats a zero return as failure.  Defined outside this part of
 * the file.
 */
626 626 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
627 627
628 628 /*
629 629 * Read data.
630 630 * Returns some data read from the file at the given fhandle.
631 631 */
632 632 /* ARGSUSED */
633 633 void
634 634 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
635 635 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
636 636 {
637 637 vnode_t *vp;
638 638 int error;
639 639 struct vattr va;
640 640 struct iovec iov;
641 641 struct uio uio;
642 642 mblk_t *mp;
643 643 int alloc_err = 0;
644 644 int in_crit = 0;
645 645 caller_context_t ct;
646 646
647 647 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
648 648 if (vp == NULL) {
649 649 rr->rr_data = NULL;
650 650 rr->rr_status = NFSERR_STALE;
651 651 return;
652 652 }
653 653
654 654 if (vp->v_type != VREG) {
655 655 VN_RELE(vp);
656 656 rr->rr_data = NULL;
657 657 rr->rr_status = NFSERR_ISDIR;
658 658 return;
659 659 }
660 660
661 661 ct.cc_sysid = 0;
662 662 ct.cc_pid = 0;
663 663 ct.cc_caller_id = nfs2_srv_caller_id;
664 664 ct.cc_flags = CC_DONTBLOCK;
665 665
666 666 /*
667 667 * Enter the critical region before calling VOP_RWLOCK
668 668 * to avoid a deadlock with write requests.
669 669 */
670 670 if (nbl_need_check(vp)) {
671 671 nbl_start_crit(vp, RW_READER);
672 672 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
673 673 0, NULL)) {
674 674 nbl_end_crit(vp);
675 675 VN_RELE(vp);
676 676 rr->rr_data = NULL;
677 677 rr->rr_status = NFSERR_ACCES;
678 678 return;
679 679 }
680 680 in_crit = 1;
681 681 }
682 682
683 683 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
684 684
685 685 /* check if a monitor detected a delegation conflict */
686 686 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
687 687 VN_RELE(vp);
688 688 /* mark as wouldblock so response is dropped */
689 689 curthread->t_flag |= T_WOULDBLOCK;
690 690
691 691 rr->rr_data = NULL;
692 692 return;
693 693 }
694 694
695 695 va.va_mask = AT_ALL;
696 696
697 697 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
698 698
699 699 if (error) {
700 700 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
701 701 if (in_crit)
702 702 nbl_end_crit(vp);
703 703
704 704 VN_RELE(vp);
705 705 rr->rr_data = NULL;
706 706 rr->rr_status = puterrno(error);
707 707
708 708 return;
709 709 }
710 710
711 711 /*
712 712 * This is a kludge to allow reading of files created
713 713 * with no read permission. The owner of the file
714 714 * is always allowed to read it.
715 715 */
716 716 if (crgetuid(cr) != va.va_uid) {
717 717 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
718 718
719 719 if (error) {
720 720 /*
721 721 * Exec is the same as read over the net because
722 722 * of demand loading.
723 723 */
724 724 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
725 725 }
726 726 if (error) {
727 727 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
728 728 if (in_crit)
729 729 nbl_end_crit(vp);
730 730 VN_RELE(vp);
731 731 rr->rr_data = NULL;
732 732 rr->rr_status = puterrno(error);
733 733
734 734 return;
735 735 }
736 736 }
737 737
738 738 if (MANDLOCK(vp, va.va_mode)) {
739 739 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
740 740 if (in_crit)
741 741 nbl_end_crit(vp);
742 742
743 743 VN_RELE(vp);
744 744 rr->rr_data = NULL;
745 745 rr->rr_status = NFSERR_ACCES;
746 746
747 747 return;
748 748 }
749 749
750 750 rr->rr_ok.rrok_wlist_len = 0;
751 751 rr->rr_ok.rrok_wlist = NULL;
752 752
753 753 if ((u_offset_t)ra->ra_offset >= va.va_size) {
754 754 rr->rr_count = 0;
755 755 rr->rr_data = NULL;
756 756 /*
757 757 * In this case, status is NFS_OK, but there is no data
758 758 * to encode. So set rr_mp to NULL.
759 759 */
760 760 rr->rr_mp = NULL;
761 761 rr->rr_ok.rrok_wlist = ra->ra_wlist;
762 762 if (rr->rr_ok.rrok_wlist)
763 763 clist_zero_len(rr->rr_ok.rrok_wlist);
764 764 goto done;
765 765 }
766 766
767 767 if (ra->ra_wlist) {
768 768 mp = NULL;
769 769 rr->rr_mp = NULL;
770 770 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
771 771 if (ra->ra_count > iov.iov_len) {
772 772 rr->rr_data = NULL;
773 773 rr->rr_status = NFSERR_INVAL;
774 774 goto done;
775 775 }
776 776 } else {
777 777 /*
778 778 * mp will contain the data to be sent out in the read reply.
779 779 * This will be freed after the reply has been sent out (by the
780 780 * driver).
781 781 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
782 782 * that the call to xdrmblk_putmblk() never fails.
783 783 */
784 784 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
785 785 &alloc_err);
786 786 ASSERT(mp != NULL);
787 787 ASSERT(alloc_err == 0);
788 788
789 789 rr->rr_mp = mp;
790 790
791 791 /*
792 792 * Set up io vector
793 793 */
794 794 iov.iov_base = (caddr_t)mp->b_datap->db_base;
795 795 iov.iov_len = ra->ra_count;
796 796 }
797 797
798 798 uio.uio_iov = &iov;
799 799 uio.uio_iovcnt = 1;
800 800 uio.uio_segflg = UIO_SYSSPACE;
801 801 uio.uio_extflg = UIO_COPY_CACHED;
802 802 uio.uio_loffset = (offset_t)ra->ra_offset;
803 803 uio.uio_resid = ra->ra_count;
804 804
805 805 error = VOP_READ(vp, &uio, 0, cr, &ct);
806 806
807 807 if (error) {
808 808 if (mp)
809 809 freeb(mp);
810 810
811 811 /*
812 812 * check if a monitor detected a delegation conflict and
813 813 * mark as wouldblock so response is dropped
814 814 */
815 815 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
816 816 curthread->t_flag |= T_WOULDBLOCK;
817 817 else
818 818 rr->rr_status = puterrno(error);
819 819
820 820 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
821 821 if (in_crit)
822 822 nbl_end_crit(vp);
823 823
824 824 VN_RELE(vp);
825 825 rr->rr_data = NULL;
826 826
827 827 return;
828 828 }
829 829
830 830 /*
831 831 * Get attributes again so we can send the latest access
832 832 * time to the client side for his cache.
833 833 */
834 834 va.va_mask = AT_ALL;
835 835
836 836 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
837 837
838 838 if (error) {
839 839 if (mp)
840 840 freeb(mp);
841 841
842 842 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
843 843 if (in_crit)
844 844 nbl_end_crit(vp);
845 845
846 846 VN_RELE(vp);
847 847 rr->rr_data = NULL;
848 848 rr->rr_status = puterrno(error);
849 849
850 850 return;
851 851 }
852 852
853 853 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
854 854
855 855 if (mp) {
856 856 rr->rr_data = (char *)mp->b_datap->db_base;
857 857 } else {
858 858 if (ra->ra_wlist) {
859 859 rr->rr_data = (caddr_t)iov.iov_base;
860 860 if (!rdma_setup_read_data2(ra, rr)) {
861 861 rr->rr_data = NULL;
862 862 rr->rr_status = puterrno(NFSERR_INVAL);
863 863 }
864 864 }
865 865 }
866 866 done:
867 867 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
868 868 if (in_crit)
869 869 nbl_end_crit(vp);
870 870
871 871 acl_perm(vp, exi, &va, cr);
872 872
873 873 /* check for overflows */
874 874 error = vattr_to_nattr(&va, &rr->rr_attr);
875 875
876 876 VN_RELE(vp);
877 877
878 878 rr->rr_status = puterrno(error);
879 879 }
880 880
881 881 /*
882 882 * Free data allocated by rfs_read
883 883 */
884 884 void
885 885 rfs_rdfree(struct nfsrdresult *rr)
886 886 {
887 887 mblk_t *mp;
888 888
889 889 if (rr->rr_status == NFS_OK) {
890 890 mp = rr->rr_mp;
891 891 if (mp != NULL)
892 892 freeb(mp);
893 893 }
894 894 }
895 895
896 896 void *
897 897 rfs_read_getfh(struct nfsreadargs *ra)
898 898 {
899 899 return (&ra->ra_fhandle);
900 900 }
901 901
/*
 * MAX_IOVECS is the size of the on-stack iovec array in rfs_write_sync.
 * A request whose mblk chain has more links than this gets a temporary
 * kmem_alloc'd array instead.  On DEBUG kernels the two counters record
 * how often the on-stack array was sufficient (hits) or not (misses).
 */
902 902 #define MAX_IOVECS 12
903 903
904 904 #ifdef DEBUG
905 905 static int rfs_write_sync_hits = 0;
906 906 static int rfs_write_sync_misses = 0;
907 907 #endif
908 908
909 909 /*
910 910 * Write data to file.
911 911 * Returns attributes of a file after writing some data to it.
912 912 *
913 913 * Any changes made here, especially in error handling might have
914 914 * to also be done in rfs_write (which clusters write requests).
915 915 */
916 916 /* ARGSUSED */
917 917 void
918 918 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
919 919 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
920 920 {
921 921 int error;
922 922 vnode_t *vp;
923 923 rlim64_t rlimit;
924 924 struct vattr va;
925 925 struct uio uio;
926 926 struct iovec iov[MAX_IOVECS];
927 927 mblk_t *m;
928 928 struct iovec *iovp;
929 929 int iovcnt;
930 930 cred_t *savecred;
931 931 int in_crit = 0;
932 932 caller_context_t ct;
933 933
934 934 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
935 935 if (vp == NULL) {
936 936 ns->ns_status = NFSERR_STALE;
937 937 return;
938 938 }
939 939
940 940 if (rdonly(ro, vp)) {
941 941 VN_RELE(vp);
942 942 ns->ns_status = NFSERR_ROFS;
943 943 return;
944 944 }
945 945
946 946 if (vp->v_type != VREG) {
947 947 VN_RELE(vp);
948 948 ns->ns_status = NFSERR_ISDIR;
949 949 return;
950 950 }
951 951
952 952 ct.cc_sysid = 0;
953 953 ct.cc_pid = 0;
954 954 ct.cc_caller_id = nfs2_srv_caller_id;
955 955 ct.cc_flags = CC_DONTBLOCK;
956 956
957 957 va.va_mask = AT_UID|AT_MODE;
958 958
959 959 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
960 960
961 961 if (error) {
962 962 VN_RELE(vp);
963 963 ns->ns_status = puterrno(error);
964 964
965 965 return;
966 966 }
967 967
968 968 if (crgetuid(cr) != va.va_uid) {
969 969 /*
970 970 * This is a kludge to allow writes of files created
971 971 * with read only permission. The owner of the file
972 972 * is always allowed to write it.
973 973 */
974 974 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
975 975
976 976 if (error) {
977 977 VN_RELE(vp);
978 978 ns->ns_status = puterrno(error);
979 979 return;
980 980 }
981 981 }
982 982
983 983 /*
984 984 * Can't access a mandatory lock file. This might cause
985 985 * the NFS service thread to block forever waiting for a
986 986 * lock to be released that will never be released.
987 987 */
988 988 if (MANDLOCK(vp, va.va_mode)) {
989 989 VN_RELE(vp);
990 990 ns->ns_status = NFSERR_ACCES;
991 991 return;
992 992 }
993 993
994 994 /*
995 995 * We have to enter the critical region before calling VOP_RWLOCK
996 996 * to avoid a deadlock with ufs.
997 997 */
998 998 if (nbl_need_check(vp)) {
999 999 nbl_start_crit(vp, RW_READER);
1000 1000 in_crit = 1;
1001 1001 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1002 1002 wa->wa_count, 0, NULL)) {
1003 1003 error = EACCES;
1004 1004 goto out;
1005 1005 }
1006 1006 }
1007 1007
1008 1008 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1009 1009
1010 1010 /* check if a monitor detected a delegation conflict */
1011 1011 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1012 1012 VN_RELE(vp);
1013 1013 /* mark as wouldblock so response is dropped */
1014 1014 curthread->t_flag |= T_WOULDBLOCK;
1015 1015 return;
1016 1016 }
1017 1017
1018 1018 if (wa->wa_data || wa->wa_rlist) {
1019 1019 /* Do the RDMA thing if necessary */
1020 1020 if (wa->wa_rlist) {
1021 1021 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1022 1022 iov[0].iov_len = wa->wa_count;
1023 1023 } else {
1024 1024 iov[0].iov_base = wa->wa_data;
1025 1025 iov[0].iov_len = wa->wa_count;
1026 1026 }
1027 1027 uio.uio_iov = iov;
1028 1028 uio.uio_iovcnt = 1;
1029 1029 uio.uio_segflg = UIO_SYSSPACE;
1030 1030 uio.uio_extflg = UIO_COPY_DEFAULT;
1031 1031 uio.uio_loffset = (offset_t)wa->wa_offset;
1032 1032 uio.uio_resid = wa->wa_count;
1033 1033 /*
1034 1034 * The limit is checked on the client. We
1035 1035 * should allow any size writes here.
1036 1036 */
1037 1037 uio.uio_llimit = curproc->p_fsz_ctl;
1038 1038 rlimit = uio.uio_llimit - wa->wa_offset;
1039 1039 if (rlimit < (rlim64_t)uio.uio_resid)
1040 1040 uio.uio_resid = (uint_t)rlimit;
1041 1041
1042 1042 /*
1043 1043 * for now we assume no append mode
1044 1044 */
1045 1045 /*
1046 1046 * We're changing creds because VM may fault and we need
1047 1047 * the cred of the current thread to be used if quota
1048 1048 * checking is enabled.
1049 1049 */
1050 1050 savecred = curthread->t_cred;
1051 1051 curthread->t_cred = cr;
1052 1052 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1053 1053 curthread->t_cred = savecred;
1054 1054 } else {
1055 1055 iovcnt = 0;
1056 1056 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1057 1057 iovcnt++;
1058 1058 if (iovcnt <= MAX_IOVECS) {
1059 1059 #ifdef DEBUG
1060 1060 rfs_write_sync_hits++;
1061 1061 #endif
1062 1062 iovp = iov;
1063 1063 } else {
1064 1064 #ifdef DEBUG
1065 1065 rfs_write_sync_misses++;
1066 1066 #endif
1067 1067 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1068 1068 }
1069 1069 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1070 1070 uio.uio_iov = iovp;
1071 1071 uio.uio_iovcnt = iovcnt;
1072 1072 uio.uio_segflg = UIO_SYSSPACE;
1073 1073 uio.uio_extflg = UIO_COPY_DEFAULT;
1074 1074 uio.uio_loffset = (offset_t)wa->wa_offset;
1075 1075 uio.uio_resid = wa->wa_count;
1076 1076 /*
1077 1077 * The limit is checked on the client. We
1078 1078 * should allow any size writes here.
1079 1079 */
1080 1080 uio.uio_llimit = curproc->p_fsz_ctl;
1081 1081 rlimit = uio.uio_llimit - wa->wa_offset;
1082 1082 if (rlimit < (rlim64_t)uio.uio_resid)
1083 1083 uio.uio_resid = (uint_t)rlimit;
1084 1084
1085 1085 /*
1086 1086 * For now we assume no append mode.
1087 1087 */
1088 1088 /*
1089 1089 * We're changing creds because VM may fault and we need
1090 1090 * the cred of the current thread to be used if quota
1091 1091 * checking is enabled.
1092 1092 */
1093 1093 savecred = curthread->t_cred;
1094 1094 curthread->t_cred = cr;
1095 1095 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1096 1096 curthread->t_cred = savecred;
1097 1097
1098 1098 if (iovp != iov)
1099 1099 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1100 1100 }
1101 1101
1102 1102 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1103 1103
1104 1104 if (!error) {
1105 1105 /*
1106 1106 * Get attributes again so we send the latest mod
1107 1107 * time to the client side for his cache.
1108 1108 */
1109 1109 va.va_mask = AT_ALL; /* now we want everything */
1110 1110
1111 1111 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1112 1112
1113 1113 /* check for overflows */
1114 1114 if (!error) {
1115 1115 acl_perm(vp, exi, &va, cr);
1116 1116 error = vattr_to_nattr(&va, &ns->ns_attr);
1117 1117 }
1118 1118 }
1119 1119
1120 1120 out:
1121 1121 if (in_crit)
1122 1122 nbl_end_crit(vp);
1123 1123 VN_RELE(vp);
1124 1124
1125 1125 /* check if a monitor detected a delegation conflict */
1126 1126 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1127 1127 /* mark as wouldblock so response is dropped */
1128 1128 curthread->t_flag |= T_WOULDBLOCK;
1129 1129 else
1130 1130 ns->ns_status = puterrno(error);
1131 1131
1132 1132 }
1133 1133
1134 1134 struct rfs_async_write {
1135 1135 struct nfswriteargs *wa;
1136 1136 struct nfsattrstat *ns;
1137 1137 struct svc_req *req;
1138 1138 cred_t *cr;
1139 1139 bool_t ro;
1140 1140 kthread_t *thread;
1141 1141 struct rfs_async_write *list;
1142 1142 };
1143 1143
1144 1144 struct rfs_async_write_list {
1145 1145 fhandle_t *fhp;
1146 1146 kcondvar_t cv;
1147 1147 struct rfs_async_write *list;
1148 1148 struct rfs_async_write_list *next;
1149 1149 };
1150 1150
1151 1151 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1152 1152 static kmutex_t rfs_async_write_lock;
1153 1153 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1154 1154
1155 1155 #define MAXCLIOVECS 42
1156 1156 #define RFSWRITE_INITVAL (enum nfsstat) -1
1157 1157
1158 1158 #ifdef DEBUG
1159 1159 static int rfs_write_hits = 0;
1160 1160 static int rfs_write_misses = 0;
1161 1161 #endif
1162 1162
1163 1163 /*
1164 1164 * Write data to file.
1165 1165 * Returns attributes of a file after writing some data to it.
1166 1166 */
1167 1167 void
1168 1168 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1169 1169 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1170 1170 {
1171 1171 int error;
1172 1172 vnode_t *vp;
1173 1173 rlim64_t rlimit;
1174 1174 struct vattr va;
1175 1175 struct uio uio;
1176 1176 struct rfs_async_write_list *lp;
1177 1177 struct rfs_async_write_list *nlp;
1178 1178 struct rfs_async_write *rp;
1179 1179 struct rfs_async_write *nrp;
1180 1180 struct rfs_async_write *trp;
1181 1181 struct rfs_async_write *lrp;
1182 1182 int data_written;
1183 1183 int iovcnt;
1184 1184 mblk_t *m;
1185 1185 struct iovec *iovp;
1186 1186 struct iovec *niovp;
1187 1187 struct iovec iov[MAXCLIOVECS];
1188 1188 int count;
1189 1189 int rcount;
1190 1190 uint_t off;
1191 1191 uint_t len;
1192 1192 struct rfs_async_write nrpsp;
1193 1193 struct rfs_async_write_list nlpsp;
1194 1194 ushort_t t_flag;
1195 1195 cred_t *savecred;
1196 1196 int in_crit = 0;
1197 1197 caller_context_t ct;
1198 1198
1199 1199 if (!rfs_write_async) {
1200 1200 rfs_write_sync(wa, ns, exi, req, cr, ro);
1201 1201 return;
1202 1202 }
1203 1203
1204 1204 /*
1205 1205 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1206 1206 * is considered an OK.
1207 1207 */
1208 1208 ns->ns_status = RFSWRITE_INITVAL;
1209 1209
1210 1210 nrp = &nrpsp;
1211 1211 nrp->wa = wa;
1212 1212 nrp->ns = ns;
1213 1213 nrp->req = req;
1214 1214 nrp->cr = cr;
1215 1215 nrp->ro = ro;
1216 1216 nrp->thread = curthread;
1217 1217
1218 1218 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1219 1219
1220 1220 /*
1221 1221 * Look to see if there is already a cluster started
1222 1222 * for this file.
1223 1223 */
1224 1224 mutex_enter(&rfs_async_write_lock);
1225 1225 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1226 1226 if (bcmp(&wa->wa_fhandle, lp->fhp,
1227 1227 sizeof (fhandle_t)) == 0)
1228 1228 break;
1229 1229 }
1230 1230
1231 1231 /*
1232 1232 * If lp is non-NULL, then there is already a cluster
1233 1233 * started. We need to place ourselves in the cluster
1234 1234 * list in the right place as determined by starting
1235 1235 * offset. Conflicts with non-blocking mandatory locked
1236 1236 * regions will be checked when the cluster is processed.
1237 1237 */
1238 1238 if (lp != NULL) {
1239 1239 rp = lp->list;
1240 1240 trp = NULL;
1241 1241 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1242 1242 trp = rp;
1243 1243 rp = rp->list;
1244 1244 }
1245 1245 nrp->list = rp;
1246 1246 if (trp == NULL)
1247 1247 lp->list = nrp;
1248 1248 else
1249 1249 trp->list = nrp;
1250 1250 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1251 1251 cv_wait(&lp->cv, &rfs_async_write_lock);
1252 1252 mutex_exit(&rfs_async_write_lock);
1253 1253
1254 1254 return;
1255 1255 }
1256 1256
1257 1257 /*
1258 1258 * No cluster started yet, start one and add ourselves
1259 1259 * to the list of clusters.
1260 1260 */
1261 1261 nrp->list = NULL;
1262 1262
1263 1263 nlp = &nlpsp;
1264 1264 nlp->fhp = &wa->wa_fhandle;
1265 1265 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1266 1266 nlp->list = nrp;
1267 1267 nlp->next = NULL;
1268 1268
1269 1269 if (rfs_async_write_head == NULL) {
1270 1270 rfs_async_write_head = nlp;
1271 1271 } else {
1272 1272 lp = rfs_async_write_head;
1273 1273 while (lp->next != NULL)
1274 1274 lp = lp->next;
1275 1275 lp->next = nlp;
1276 1276 }
1277 1277 mutex_exit(&rfs_async_write_lock);
1278 1278
1279 1279 /*
1280 1280 * Convert the file handle common to all of the requests
1281 1281 * in this cluster to a vnode.
1282 1282 */
1283 1283 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1284 1284 if (vp == NULL) {
1285 1285 mutex_enter(&rfs_async_write_lock);
1286 1286 if (rfs_async_write_head == nlp)
1287 1287 rfs_async_write_head = nlp->next;
1288 1288 else {
1289 1289 lp = rfs_async_write_head;
1290 1290 while (lp->next != nlp)
1291 1291 lp = lp->next;
1292 1292 lp->next = nlp->next;
1293 1293 }
1294 1294 t_flag = curthread->t_flag & T_WOULDBLOCK;
1295 1295 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1296 1296 rp->ns->ns_status = NFSERR_STALE;
1297 1297 rp->thread->t_flag |= t_flag;
1298 1298 }
1299 1299 cv_broadcast(&nlp->cv);
1300 1300 mutex_exit(&rfs_async_write_lock);
1301 1301
1302 1302 return;
1303 1303 }
1304 1304
1305 1305 /*
1306 1306 * Can only write regular files. Attempts to write any
1307 1307 * other file types fail with EISDIR.
1308 1308 */
1309 1309 if (vp->v_type != VREG) {
1310 1310 VN_RELE(vp);
1311 1311 mutex_enter(&rfs_async_write_lock);
1312 1312 if (rfs_async_write_head == nlp)
1313 1313 rfs_async_write_head = nlp->next;
1314 1314 else {
1315 1315 lp = rfs_async_write_head;
1316 1316 while (lp->next != nlp)
1317 1317 lp = lp->next;
1318 1318 lp->next = nlp->next;
1319 1319 }
1320 1320 t_flag = curthread->t_flag & T_WOULDBLOCK;
1321 1321 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1322 1322 rp->ns->ns_status = NFSERR_ISDIR;
1323 1323 rp->thread->t_flag |= t_flag;
1324 1324 }
1325 1325 cv_broadcast(&nlp->cv);
1326 1326 mutex_exit(&rfs_async_write_lock);
1327 1327
1328 1328 return;
1329 1329 }
1330 1330
1331 1331 /*
1332 1332 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1333 1333 * deadlock with ufs.
1334 1334 */
1335 1335 if (nbl_need_check(vp)) {
1336 1336 nbl_start_crit(vp, RW_READER);
1337 1337 in_crit = 1;
1338 1338 }
1339 1339
1340 1340 ct.cc_sysid = 0;
1341 1341 ct.cc_pid = 0;
1342 1342 ct.cc_caller_id = nfs2_srv_caller_id;
1343 1343 ct.cc_flags = CC_DONTBLOCK;
1344 1344
1345 1345 /*
1346 1346 * Lock the file for writing. This operation provides
1347 1347 * the delay which allows clusters to grow.
1348 1348 */
1349 1349 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1350 1350
1351 1351 /* check if a monitor detected a delegation conflict */
1352 1352 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1353 1353 if (in_crit)
1354 1354 nbl_end_crit(vp);
1355 1355 VN_RELE(vp);
1356 1356 /* mark as wouldblock so response is dropped */
1357 1357 curthread->t_flag |= T_WOULDBLOCK;
1358 1358 mutex_enter(&rfs_async_write_lock);
1359 1359 if (rfs_async_write_head == nlp)
1360 1360 rfs_async_write_head = nlp->next;
1361 1361 else {
1362 1362 lp = rfs_async_write_head;
1363 1363 while (lp->next != nlp)
1364 1364 lp = lp->next;
1365 1365 lp->next = nlp->next;
1366 1366 }
1367 1367 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1368 1368 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1369 1369 rp->ns->ns_status = puterrno(error);
1370 1370 rp->thread->t_flag |= T_WOULDBLOCK;
1371 1371 }
1372 1372 }
1373 1373 cv_broadcast(&nlp->cv);
1374 1374 mutex_exit(&rfs_async_write_lock);
1375 1375
1376 1376 return;
1377 1377 }
1378 1378
1379 1379 /*
1380 1380 * Disconnect this cluster from the list of clusters.
1381 1381 * The cluster that is being dealt with must be fixed
1382 1382 * in size after this point, so there is no reason
1383 1383 * to leave it on the list so that new requests can
1384 1384 * find it.
1385 1385 *
1386 1386 * The algorithm is that the first write request will
1387 1387 * create a cluster, convert the file handle to a
1388 1388 * vnode pointer, and then lock the file for writing.
1389 1389 * This request is not likely to be clustered with
1390 1390 * any others. However, the next request will create
1391 1391 * a new cluster and be blocked in VOP_RWLOCK while
1392 1392 * the first request is being processed. This delay
1393 1393 * will allow more requests to be clustered in this
1394 1394 * second cluster.
1395 1395 */
1396 1396 mutex_enter(&rfs_async_write_lock);
1397 1397 if (rfs_async_write_head == nlp)
1398 1398 rfs_async_write_head = nlp->next;
1399 1399 else {
1400 1400 lp = rfs_async_write_head;
1401 1401 while (lp->next != nlp)
1402 1402 lp = lp->next;
1403 1403 lp->next = nlp->next;
1404 1404 }
1405 1405 mutex_exit(&rfs_async_write_lock);
1406 1406
1407 1407 /*
1408 1408 * Step through the list of requests in this cluster.
1409 1409 * We need to check permissions to make sure that all
1410 1410 * of the requests have sufficient permission to write
1411 1411 * the file. A cluster can be composed of requests
1412 1412 * from different clients and different users on each
1413 1413 * client.
1414 1414 *
1415 1415 * As a side effect, we also calculate the size of the
1416 1416 * byte range that this cluster encompasses.
1417 1417 */
1418 1418 rp = nlp->list;
1419 1419 off = rp->wa->wa_offset;
1420 1420 len = (uint_t)0;
1421 1421 do {
1422 1422 if (rdonly(rp->ro, vp)) {
1423 1423 rp->ns->ns_status = NFSERR_ROFS;
1424 1424 t_flag = curthread->t_flag & T_WOULDBLOCK;
1425 1425 rp->thread->t_flag |= t_flag;
1426 1426 continue;
1427 1427 }
1428 1428
1429 1429 va.va_mask = AT_UID|AT_MODE;
1430 1430
1431 1431 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1432 1432
1433 1433 if (!error) {
1434 1434 if (crgetuid(rp->cr) != va.va_uid) {
1435 1435 /*
1436 1436 * This is a kludge to allow writes of files
1437 1437 * created with read only permission. The
1438 1438 * owner of the file is always allowed to
1439 1439 * write it.
1440 1440 */
1441 1441 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1442 1442 }
1443 1443 if (!error && MANDLOCK(vp, va.va_mode))
1444 1444 error = EACCES;
1445 1445 }
1446 1446
1447 1447 /*
1448 1448 * Check for a conflict with a nbmand-locked region.
1449 1449 */
1450 1450 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1451 1451 rp->wa->wa_count, 0, NULL)) {
1452 1452 error = EACCES;
1453 1453 }
1454 1454
1455 1455 if (error) {
1456 1456 rp->ns->ns_status = puterrno(error);
1457 1457 t_flag = curthread->t_flag & T_WOULDBLOCK;
1458 1458 rp->thread->t_flag |= t_flag;
1459 1459 continue;
1460 1460 }
1461 1461 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1462 1462 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1463 1463 } while ((rp = rp->list) != NULL);
1464 1464
1465 1465 /*
1466 1466 * Step through the cluster attempting to gather as many
1467 1467 * requests which are contiguous as possible. These
1468 1468 * contiguous requests are handled via one call to VOP_WRITE
1469 1469 * instead of different calls to VOP_WRITE. We also keep
1470 1470 * track of the fact that any data was written.
1471 1471 */
1472 1472 rp = nlp->list;
1473 1473 data_written = 0;
1474 1474 do {
1475 1475 /*
1476 1476 * Skip any requests which are already marked as having an
1477 1477 * error.
1478 1478 */
1479 1479 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1480 1480 rp = rp->list;
1481 1481 continue;
1482 1482 }
1483 1483
1484 1484 /*
1485 1485 * Count the number of iovec's which are required
1486 1486 * to handle this set of requests. One iovec is
1487 1487 * needed for each data buffer, whether addressed
1488 1488 * by wa_data or by the b_rptr pointers in the
1489 1489 * mblk chains.
1490 1490 */
1491 1491 iovcnt = 0;
1492 1492 lrp = rp;
1493 1493 for (;;) {
1494 1494 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1495 1495 iovcnt++;
1496 1496 else {
1497 1497 m = lrp->wa->wa_mblk;
1498 1498 while (m != NULL) {
1499 1499 iovcnt++;
1500 1500 m = m->b_cont;
1501 1501 }
1502 1502 }
1503 1503 if (lrp->list == NULL ||
1504 1504 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1505 1505 lrp->wa->wa_offset + lrp->wa->wa_count !=
1506 1506 lrp->list->wa->wa_offset) {
1507 1507 lrp = lrp->list;
1508 1508 break;
1509 1509 }
1510 1510 lrp = lrp->list;
1511 1511 }
1512 1512
1513 1513 if (iovcnt <= MAXCLIOVECS) {
1514 1514 #ifdef DEBUG
1515 1515 rfs_write_hits++;
1516 1516 #endif
1517 1517 niovp = iov;
1518 1518 } else {
1519 1519 #ifdef DEBUG
1520 1520 rfs_write_misses++;
1521 1521 #endif
1522 1522 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1523 1523 }
1524 1524 /*
1525 1525 * Put together the scatter/gather iovecs.
1526 1526 */
1527 1527 iovp = niovp;
1528 1528 trp = rp;
1529 1529 count = 0;
1530 1530 do {
1531 1531 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1532 1532 if (trp->wa->wa_rlist) {
1533 1533 iovp->iov_base =
1534 1534 (char *)((trp->wa->wa_rlist)->
1535 1535 u.c_daddr3);
1536 1536 iovp->iov_len = trp->wa->wa_count;
1537 1537 } else {
1538 1538 iovp->iov_base = trp->wa->wa_data;
1539 1539 iovp->iov_len = trp->wa->wa_count;
1540 1540 }
1541 1541 iovp++;
1542 1542 } else {
1543 1543 m = trp->wa->wa_mblk;
1544 1544 rcount = trp->wa->wa_count;
1545 1545 while (m != NULL) {
1546 1546 iovp->iov_base = (caddr_t)m->b_rptr;
1547 1547 iovp->iov_len = (m->b_wptr - m->b_rptr);
1548 1548 rcount -= iovp->iov_len;
1549 1549 if (rcount < 0)
1550 1550 iovp->iov_len += rcount;
1551 1551 iovp++;
1552 1552 if (rcount <= 0)
1553 1553 break;
1554 1554 m = m->b_cont;
1555 1555 }
1556 1556 }
1557 1557 count += trp->wa->wa_count;
1558 1558 trp = trp->list;
1559 1559 } while (trp != lrp);
1560 1560
1561 1561 uio.uio_iov = niovp;
1562 1562 uio.uio_iovcnt = iovcnt;
1563 1563 uio.uio_segflg = UIO_SYSSPACE;
1564 1564 uio.uio_extflg = UIO_COPY_DEFAULT;
1565 1565 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1566 1566 uio.uio_resid = count;
1567 1567 /*
1568 1568 * The limit is checked on the client. We
1569 1569 * should allow any size writes here.
1570 1570 */
1571 1571 uio.uio_llimit = curproc->p_fsz_ctl;
1572 1572 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1573 1573 if (rlimit < (rlim64_t)uio.uio_resid)
1574 1574 uio.uio_resid = (uint_t)rlimit;
1575 1575
1576 1576 /*
1577 1577 * For now we assume no append mode.
1578 1578 */
1579 1579
1580 1580 /*
1581 1581 * We're changing creds because VM may fault
1582 1582 * and we need the cred of the current
1583 1583 * thread to be used if quota * checking is
1584 1584 * enabled.
1585 1585 */
1586 1586 savecred = curthread->t_cred;
1587 1587 curthread->t_cred = cr;
1588 1588 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1589 1589 curthread->t_cred = savecred;
1590 1590
1591 1591 /* check if a monitor detected a delegation conflict */
1592 1592 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1593 1593 /* mark as wouldblock so response is dropped */
1594 1594 curthread->t_flag |= T_WOULDBLOCK;
1595 1595
1596 1596 if (niovp != iov)
1597 1597 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1598 1598
1599 1599 if (!error) {
1600 1600 data_written = 1;
1601 1601 /*
1602 1602 * Get attributes again so we send the latest mod
1603 1603 * time to the client side for his cache.
1604 1604 */
1605 1605 va.va_mask = AT_ALL; /* now we want everything */
1606 1606
1607 1607 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1608 1608
1609 1609 if (!error)
1610 1610 acl_perm(vp, exi, &va, rp->cr);
1611 1611 }
1612 1612
1613 1613 /*
1614 1614 * Fill in the status responses for each request
1615 1615 * which was just handled. Also, copy the latest
1616 1616 * attributes in to the attribute responses if
1617 1617 * appropriate.
1618 1618 */
1619 1619 t_flag = curthread->t_flag & T_WOULDBLOCK;
1620 1620 do {
1621 1621 rp->thread->t_flag |= t_flag;
1622 1622 /* check for overflows */
1623 1623 if (!error) {
1624 1624 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1625 1625 }
1626 1626 rp->ns->ns_status = puterrno(error);
1627 1627 rp = rp->list;
1628 1628 } while (rp != lrp);
1629 1629 } while (rp != NULL);
1630 1630
1631 1631 /*
1632 1632 * If any data was written at all, then we need to flush
1633 1633 * the data and metadata to stable storage.
1634 1634 */
1635 1635 if (data_written) {
1636 1636 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1637 1637
1638 1638 if (!error) {
1639 1639 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1640 1640 }
1641 1641 }
1642 1642
1643 1643 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1644 1644
1645 1645 if (in_crit)
1646 1646 nbl_end_crit(vp);
1647 1647 VN_RELE(vp);
1648 1648
1649 1649 t_flag = curthread->t_flag & T_WOULDBLOCK;
1650 1650 mutex_enter(&rfs_async_write_lock);
1651 1651 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1652 1652 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1653 1653 rp->ns->ns_status = puterrno(error);
1654 1654 rp->thread->t_flag |= t_flag;
1655 1655 }
1656 1656 }
1657 1657 cv_broadcast(&nlp->cv);
1658 1658 mutex_exit(&rfs_async_write_lock);
1659 1659
1660 1660 }
1661 1661
1662 1662 void *
1663 1663 rfs_write_getfh(struct nfswriteargs *wa)
1664 1664 {
1665 1665 return (&wa->wa_fhandle);
1666 1666 }
1667 1667
1668 1668 /*
1669 1669 * Create a file.
1670 1670 * Creates a file with given attributes and returns those attributes
1671 1671 * and an fhandle for the new file.
1672 1672 */
1673 1673 void
1674 1674 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1675 1675 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1676 1676 {
1677 1677 int error;
1678 1678 int lookuperr;
1679 1679 int in_crit = 0;
1680 1680 struct vattr va;
1681 1681 vnode_t *vp;
1682 1682 vnode_t *realvp;
1683 1683 vnode_t *dvp;
1684 1684 char *name = args->ca_da.da_name;
1685 1685 vnode_t *tvp = NULL;
1686 1686 int mode;
1687 1687 int lookup_ok;
1688 1688 bool_t trunc;
1689 1689 struct sockaddr *ca;
1690 1690
1691 1691 /*
1692 1692 * Disallow NULL paths
1693 1693 */
1694 1694 if (name == NULL || *name == '\0') {
1695 1695 dr->dr_status = NFSERR_ACCES;
1696 1696 return;
1697 1697 }
1698 1698
1699 1699 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1700 1700 if (dvp == NULL) {
1701 1701 dr->dr_status = NFSERR_STALE;
1702 1702 return;
1703 1703 }
1704 1704
1705 1705 error = sattr_to_vattr(args->ca_sa, &va);
1706 1706 if (error) {
1707 1707 dr->dr_status = puterrno(error);
1708 1708 return;
1709 1709 }
1710 1710
1711 1711 /*
1712 1712 * Must specify the mode.
1713 1713 */
1714 1714 if (!(va.va_mask & AT_MODE)) {
1715 1715 VN_RELE(dvp);
1716 1716 dr->dr_status = NFSERR_INVAL;
1717 1717 return;
1718 1718 }
1719 1719
1720 1720 /*
1721 1721 * This is a completely gross hack to make mknod
1722 1722 * work over the wire until we can wack the protocol
1723 1723 */
1724 1724 if ((va.va_mode & IFMT) == IFCHR) {
1725 1725 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1726 1726 va.va_type = VFIFO; /* xtra kludge for named pipe */
1727 1727 else {
1728 1728 va.va_type = VCHR;
1729 1729 /*
1730 1730 * uncompress the received dev_t
1731 1731 * if the top half is zero indicating a request
1732 1732 * from an `older style' OS.
1733 1733 */
1734 1734 if ((va.va_size & 0xffff0000) == 0)
1735 1735 va.va_rdev = nfsv2_expdev(va.va_size);
1736 1736 else
1737 1737 va.va_rdev = (dev_t)va.va_size;
1738 1738 }
1739 1739 va.va_mask &= ~AT_SIZE;
1740 1740 } else if ((va.va_mode & IFMT) == IFBLK) {
1741 1741 va.va_type = VBLK;
1742 1742 /*
1743 1743 * uncompress the received dev_t
1744 1744 * if the top half is zero indicating a request
1745 1745 * from an `older style' OS.
1746 1746 */
1747 1747 if ((va.va_size & 0xffff0000) == 0)
1748 1748 va.va_rdev = nfsv2_expdev(va.va_size);
1749 1749 else
1750 1750 va.va_rdev = (dev_t)va.va_size;
1751 1751 va.va_mask &= ~AT_SIZE;
1752 1752 } else if ((va.va_mode & IFMT) == IFSOCK) {
1753 1753 va.va_type = VSOCK;
1754 1754 } else {
1755 1755 va.va_type = VREG;
1756 1756 }
1757 1757 va.va_mode &= ~IFMT;
1758 1758 va.va_mask |= AT_TYPE;
1759 1759
1760 1760 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1761 1761 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1762 1762 MAXPATHLEN);
1763 1763 if (name == NULL) {
1764 1764 dr->dr_status = puterrno(EINVAL);
1765 1765 return;
1766 1766 }
1767 1767
1768 1768 /*
1769 1769 * Why was the choice made to use VWRITE as the mode to the
1770 1770 * call to VOP_CREATE ? This results in a bug. When a client
1771 1771 * opens a file that already exists and is RDONLY, the second
1772 1772 * open fails with an EACESS because of the mode.
1773 1773 * bug ID 1054648.
1774 1774 */
1775 1775 lookup_ok = 0;
1776 1776 mode = VWRITE;
1777 1777 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1778 1778 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1779 1779 NULL, NULL, NULL);
1780 1780 if (!error) {
1781 1781 struct vattr at;
1782 1782
1783 1783 lookup_ok = 1;
1784 1784 at.va_mask = AT_MODE;
1785 1785 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1786 1786 if (!error)
1787 1787 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1788 1788 VN_RELE(tvp);
1789 1789 tvp = NULL;
1790 1790 }
1791 1791 }
1792 1792
1793 1793 if (!lookup_ok) {
1794 1794 if (rdonly(ro, dvp)) {
1795 1795 error = EROFS;
1796 1796 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1797 1797 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1798 1798 error = EPERM;
1799 1799 } else {
1800 1800 error = 0;
1801 1801 }
1802 1802 }
1803 1803
1804 1804 /*
1805 1805 * If file size is being modified on an already existing file
1806 1806 * make sure that there are no conflicting non-blocking mandatory
1807 1807 * locks in the region being manipulated. Return EACCES if there
1808 1808 * are conflicting locks.
1809 1809 */
1810 1810 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1811 1811 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1812 1812 NULL, NULL, NULL);
1813 1813
1814 1814 if (!lookuperr &&
1815 1815 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1816 1816 VN_RELE(tvp);
1817 1817 curthread->t_flag |= T_WOULDBLOCK;
1818 1818 goto out;
1819 1819 }
1820 1820
1821 1821 if (!lookuperr && nbl_need_check(tvp)) {
1822 1822 /*
1823 1823 * The file exists. Now check if it has any
1824 1824 * conflicting non-blocking mandatory locks
1825 1825 * in the region being changed.
1826 1826 */
1827 1827 struct vattr bva;
1828 1828 u_offset_t offset;
1829 1829 ssize_t length;
1830 1830
1831 1831 nbl_start_crit(tvp, RW_READER);
1832 1832 in_crit = 1;
1833 1833
1834 1834 bva.va_mask = AT_SIZE;
1835 1835 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1836 1836 if (!error) {
1837 1837 if (va.va_size < bva.va_size) {
1838 1838 offset = va.va_size;
1839 1839 length = bva.va_size - va.va_size;
1840 1840 } else {
1841 1841 offset = bva.va_size;
1842 1842 length = va.va_size - bva.va_size;
1843 1843 }
1844 1844 if (length) {
1845 1845 if (nbl_conflict(tvp, NBL_WRITE,
1846 1846 offset, length, 0, NULL)) {
1847 1847 error = EACCES;
1848 1848 }
1849 1849 }
1850 1850 }
1851 1851 if (error) {
1852 1852 nbl_end_crit(tvp);
1853 1853 VN_RELE(tvp);
1854 1854 in_crit = 0;
1855 1855 }
1856 1856 } else if (tvp != NULL) {
1857 1857 VN_RELE(tvp);
1858 1858 }
1859 1859 }
1860 1860
1861 1861 if (!error) {
1862 1862 /*
1863 1863 * If filesystem is shared with nosuid the remove any
1864 1864 * setuid/setgid bits on create.
1865 1865 */
1866 1866 if (va.va_type == VREG &&
1867 1867 exi->exi_export.ex_flags & EX_NOSUID)
1868 1868 va.va_mode &= ~(VSUID | VSGID);
1869 1869
1870 1870 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1871 1871 NULL, NULL);
1872 1872
1873 1873 if (!error) {
1874 1874
1875 1875 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1876 1876 trunc = TRUE;
1877 1877 else
1878 1878 trunc = FALSE;
1879 1879
1880 1880 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1881 1881 VN_RELE(vp);
1882 1882 curthread->t_flag |= T_WOULDBLOCK;
1883 1883 goto out;
1884 1884 }
1885 1885 va.va_mask = AT_ALL;
1886 1886
1887 1887 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1888 1888
1889 1889 /* check for overflows */
1890 1890 if (!error) {
1891 1891 acl_perm(vp, exi, &va, cr);
1892 1892 error = vattr_to_nattr(&va, &dr->dr_attr);
1893 1893 if (!error) {
1894 1894 error = makefh(&dr->dr_fhandle, vp,
1895 1895 exi);
1896 1896 }
1897 1897 }
1898 1898 /*
1899 1899 * Force modified metadata out to stable storage.
1900 1900 *
1901 1901 * if a underlying vp exists, pass it to VOP_FSYNC
1902 1902 */
1903 1903 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1904 1904 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1905 1905 else
1906 1906 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1907 1907 VN_RELE(vp);
1908 1908 }
1909 1909
1910 1910 if (in_crit) {
1911 1911 nbl_end_crit(tvp);
1912 1912 VN_RELE(tvp);
1913 1913 }
1914 1914 }
1915 1915
1916 1916 /*
1917 1917 * Force modified data and metadata out to stable storage.
1918 1918 */
1919 1919 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1920 1920
1921 1921 out:
1922 1922
1923 1923 VN_RELE(dvp);
1924 1924
1925 1925 dr->dr_status = puterrno(error);
1926 1926
1927 1927 if (name != args->ca_da.da_name)
1928 1928 kmem_free(name, MAXPATHLEN);
1929 1929 }
1930 1930 void *
1931 1931 rfs_create_getfh(struct nfscreatargs *args)
1932 1932 {
1933 1933 return (args->ca_da.da_fhandle);
1934 1934 }
1935 1935
1936 1936 /*
1937 1937 * Remove a file.
1938 1938 * Remove named file from parent directory.
1939 1939 */
1940 1940 /* ARGSUSED */
1941 1941 void
1942 1942 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1943 1943 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1944 1944 {
1945 1945 int error = 0;
1946 1946 vnode_t *vp;
1947 1947 vnode_t *targvp;
1948 1948 int in_crit = 0;
1949 1949
1950 1950 /*
1951 1951 * Disallow NULL paths
1952 1952 */
1953 1953 if (da->da_name == NULL || *da->da_name == '\0') {
1954 1954 *status = NFSERR_ACCES;
1955 1955 return;
1956 1956 }
1957 1957
1958 1958 vp = nfs_fhtovp(da->da_fhandle, exi);
1959 1959 if (vp == NULL) {
1960 1960 *status = NFSERR_STALE;
1961 1961 return;
1962 1962 }
1963 1963
1964 1964 if (rdonly(ro, vp)) {
1965 1965 VN_RELE(vp);
1966 1966 *status = NFSERR_ROFS;
1967 1967 return;
1968 1968 }
1969 1969
1970 1970 /*
1971 1971 * Check for a conflict with a non-blocking mandatory share reservation.
1972 1972 */
1973 1973 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1974 1974 NULL, cr, NULL, NULL, NULL);
1975 1975 if (error != 0) {
1976 1976 VN_RELE(vp);
1977 1977 *status = puterrno(error);
1978 1978 return;
1979 1979 }
1980 1980
1981 1981 /*
1982 1982 * If the file is delegated to an v4 client, then initiate
1983 1983 * recall and drop this request (by setting T_WOULDBLOCK).
1984 1984 * The client will eventually re-transmit the request and
1985 1985 * (hopefully), by then, the v4 client will have returned
1986 1986 * the delegation.
1987 1987 */
1988 1988
1989 1989 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1990 1990 VN_RELE(vp);
1991 1991 VN_RELE(targvp);
1992 1992 curthread->t_flag |= T_WOULDBLOCK;
1993 1993 return;
1994 1994 }
1995 1995
1996 1996 if (nbl_need_check(targvp)) {
1997 1997 nbl_start_crit(targvp, RW_READER);
1998 1998 in_crit = 1;
1999 1999 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2000 2000 error = EACCES;
2001 2001 goto out;
2002 2002 }
2003 2003 }
2004 2004
2005 2005 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2006 2006
2007 2007 /*
2008 2008 * Force modified data and metadata out to stable storage.
2009 2009 */
2010 2010 (void) VOP_FSYNC(vp, 0, cr, NULL);
2011 2011
2012 2012 out:
2013 2013 if (in_crit)
2014 2014 nbl_end_crit(targvp);
2015 2015 VN_RELE(targvp);
2016 2016 VN_RELE(vp);
2017 2017
2018 2018 *status = puterrno(error);
2019 2019
2020 2020 }
2021 2021
2022 2022 void *
2023 2023 rfs_remove_getfh(struct nfsdiropargs *da)
2024 2024 {
2025 2025 return (da->da_fhandle);
2026 2026 }
2027 2027
2028 2028 /*
2029 2029 * rename a file
2030 2030 * Give a file (from) a new name (to).
2031 2031 */
2032 2032 /* ARGSUSED */
2033 2033 void
2034 2034 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2035 2035 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2036 2036 {
2037 2037 int error = 0;
2038 2038 vnode_t *fromvp;
2039 2039 vnode_t *tovp;
2040 2040 struct exportinfo *to_exi;
2041 2041 fhandle_t *fh;
2042 2042 vnode_t *srcvp;
↓ open down ↓ |
2042 lines elided |
↑ open up ↑ |
2043 2043 vnode_t *targvp;
2044 2044 int in_crit = 0;
2045 2045
2046 2046 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2047 2047 if (fromvp == NULL) {
2048 2048 *status = NFSERR_STALE;
2049 2049 return;
2050 2050 }
2051 2051
2052 2052 fh = args->rna_to.da_fhandle;
2053 - to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2053 + to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen, NULL);
2054 2054 if (to_exi == NULL) {
2055 2055 VN_RELE(fromvp);
2056 2056 *status = NFSERR_ACCES;
2057 2057 return;
2058 2058 }
2059 2059 exi_rele(to_exi);
2060 2060
2061 2061 if (to_exi != exi) {
2062 2062 VN_RELE(fromvp);
2063 2063 *status = NFSERR_XDEV;
2064 2064 return;
2065 2065 }
2066 2066
2067 2067 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2068 2068 if (tovp == NULL) {
2069 2069 VN_RELE(fromvp);
2070 2070 *status = NFSERR_STALE;
2071 2071 return;
2072 2072 }
2073 2073
2074 2074 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2075 2075 VN_RELE(tovp);
2076 2076 VN_RELE(fromvp);
2077 2077 *status = NFSERR_NOTDIR;
2078 2078 return;
2079 2079 }
2080 2080
2081 2081 /*
2082 2082 * Disallow NULL paths
2083 2083 */
2084 2084 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2085 2085 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2086 2086 VN_RELE(tovp);
2087 2087 VN_RELE(fromvp);
2088 2088 *status = NFSERR_ACCES;
2089 2089 return;
2090 2090 }
2091 2091
2092 2092 if (rdonly(ro, tovp)) {
2093 2093 VN_RELE(tovp);
2094 2094 VN_RELE(fromvp);
2095 2095 *status = NFSERR_ROFS;
2096 2096 return;
2097 2097 }
2098 2098
2099 2099 /*
2100 2100 * Check for a conflict with a non-blocking mandatory share reservation.
2101 2101 */
2102 2102 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2103 2103 NULL, cr, NULL, NULL, NULL);
2104 2104 if (error != 0) {
2105 2105 VN_RELE(tovp);
2106 2106 VN_RELE(fromvp);
2107 2107 *status = puterrno(error);
2108 2108 return;
2109 2109 }
2110 2110
2111 2111 /* Check for delegations on the source file */
2112 2112
2113 2113 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2114 2114 VN_RELE(tovp);
2115 2115 VN_RELE(fromvp);
2116 2116 VN_RELE(srcvp);
2117 2117 curthread->t_flag |= T_WOULDBLOCK;
2118 2118 return;
2119 2119 }
2120 2120
2121 2121 /* Check for delegation on the file being renamed over, if it exists */
2122 2122
2123 2123 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2124 2124 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2125 2125 NULL, NULL, NULL) == 0) {
2126 2126
2127 2127 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2128 2128 VN_RELE(tovp);
2129 2129 VN_RELE(fromvp);
2130 2130 VN_RELE(srcvp);
2131 2131 VN_RELE(targvp);
2132 2132 curthread->t_flag |= T_WOULDBLOCK;
2133 2133 return;
2134 2134 }
2135 2135 VN_RELE(targvp);
2136 2136 }
2137 2137
2138 2138
2139 2139 if (nbl_need_check(srcvp)) {
2140 2140 nbl_start_crit(srcvp, RW_READER);
2141 2141 in_crit = 1;
2142 2142 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2143 2143 error = EACCES;
2144 2144 goto out;
2145 2145 }
2146 2146 }
2147 2147
2148 2148 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2149 2149 tovp, args->rna_to.da_name, cr, NULL, 0);
2150 2150
2151 2151 if (error == 0)
2152 2152 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2153 2153 strlen(args->rna_to.da_name));
2154 2154
2155 2155 /*
2156 2156 * Force modified data and metadata out to stable storage.
2157 2157 */
2158 2158 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2159 2159 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2160 2160
2161 2161 out:
2162 2162 if (in_crit)
2163 2163 nbl_end_crit(srcvp);
2164 2164 VN_RELE(srcvp);
2165 2165 VN_RELE(tovp);
2166 2166 VN_RELE(fromvp);
2167 2167
2168 2168 *status = puterrno(error);
2169 2169
2170 2170 }
2171 2171 void *
2172 2172 rfs_rename_getfh(struct nfsrnmargs *args)
2173 2173 {
2174 2174 return (args->rna_from.da_fhandle);
2175 2175 }
2176 2176
2177 2177 /*
2178 2178 * Link to a file.
2179 2179 * Create a file (to) which is a hard link to the given file (from).
2180 2180 */
2181 2181 /* ARGSUSED */
2182 2182 void
2183 2183 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2184 2184 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2185 2185 {
2186 2186 int error;
2187 2187 vnode_t *fromvp;
2188 2188 vnode_t *tovp;
↓ open down ↓ |
125 lines elided |
↑ open up ↑ |
2189 2189 struct exportinfo *to_exi;
2190 2190 fhandle_t *fh;
2191 2191
2192 2192 fromvp = nfs_fhtovp(args->la_from, exi);
2193 2193 if (fromvp == NULL) {
2194 2194 *status = NFSERR_STALE;
2195 2195 return;
2196 2196 }
2197 2197
2198 2198 fh = args->la_to.da_fhandle;
2199 - to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2199 + to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen, NULL);
2200 2200 if (to_exi == NULL) {
2201 2201 VN_RELE(fromvp);
2202 2202 *status = NFSERR_ACCES;
2203 2203 return;
2204 2204 }
2205 2205 exi_rele(to_exi);
2206 2206
2207 2207 if (to_exi != exi) {
2208 2208 VN_RELE(fromvp);
2209 2209 *status = NFSERR_XDEV;
2210 2210 return;
2211 2211 }
2212 2212
2213 2213 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2214 2214 if (tovp == NULL) {
2215 2215 VN_RELE(fromvp);
2216 2216 *status = NFSERR_STALE;
2217 2217 return;
2218 2218 }
2219 2219
2220 2220 if (tovp->v_type != VDIR) {
2221 2221 VN_RELE(tovp);
2222 2222 VN_RELE(fromvp);
2223 2223 *status = NFSERR_NOTDIR;
2224 2224 return;
2225 2225 }
2226 2226 /*
2227 2227 * Disallow NULL paths
2228 2228 */
2229 2229 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2230 2230 VN_RELE(tovp);
2231 2231 VN_RELE(fromvp);
2232 2232 *status = NFSERR_ACCES;
2233 2233 return;
2234 2234 }
2235 2235
2236 2236 if (rdonly(ro, tovp)) {
2237 2237 VN_RELE(tovp);
2238 2238 VN_RELE(fromvp);
2239 2239 *status = NFSERR_ROFS;
2240 2240 return;
2241 2241 }
2242 2242
2243 2243 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2244 2244
2245 2245 /*
2246 2246 * Force modified data and metadata out to stable storage.
2247 2247 */
2248 2248 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2249 2249 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2250 2250
2251 2251 VN_RELE(tovp);
2252 2252 VN_RELE(fromvp);
2253 2253
2254 2254 *status = puterrno(error);
2255 2255
2256 2256 }
2257 2257 void *
2258 2258 rfs_link_getfh(struct nfslinkargs *args)
2259 2259 {
2260 2260 return (args->la_from);
2261 2261 }
2262 2262
2263 2263 /*
2264 2264 * Symbolicly link to a file.
2265 2265 * Create a file (to) with the given attributes which is a symbolic link
2266 2266 * to the given path name (to).
2267 2267 */
2268 2268 void
2269 2269 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2270 2270 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2271 2271 {
2272 2272 int error;
2273 2273 struct vattr va;
2274 2274 vnode_t *vp;
2275 2275 vnode_t *svp;
2276 2276 int lerror;
2277 2277 struct sockaddr *ca;
2278 2278 char *name = NULL;
2279 2279
2280 2280 /*
2281 2281 * Disallow NULL paths
2282 2282 */
2283 2283 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2284 2284 *status = NFSERR_ACCES;
2285 2285 return;
2286 2286 }
2287 2287
2288 2288 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2289 2289 if (vp == NULL) {
2290 2290 *status = NFSERR_STALE;
2291 2291 return;
2292 2292 }
2293 2293
2294 2294 if (rdonly(ro, vp)) {
2295 2295 VN_RELE(vp);
2296 2296 *status = NFSERR_ROFS;
2297 2297 return;
2298 2298 }
2299 2299
2300 2300 error = sattr_to_vattr(args->sla_sa, &va);
2301 2301 if (error) {
2302 2302 VN_RELE(vp);
2303 2303 *status = puterrno(error);
2304 2304 return;
2305 2305 }
2306 2306
2307 2307 if (!(va.va_mask & AT_MODE)) {
2308 2308 VN_RELE(vp);
2309 2309 *status = NFSERR_INVAL;
2310 2310 return;
2311 2311 }
2312 2312
2313 2313 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2314 2314 name = nfscmd_convname(ca, exi, args->sla_tnm,
2315 2315 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2316 2316
2317 2317 if (name == NULL) {
2318 2318 *status = NFSERR_ACCES;
2319 2319 return;
2320 2320 }
2321 2321
2322 2322 va.va_type = VLNK;
2323 2323 va.va_mask |= AT_TYPE;
2324 2324
2325 2325 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2326 2326
2327 2327 /*
2328 2328 * Force new data and metadata out to stable storage.
2329 2329 */
2330 2330 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2331 2331 NULL, cr, NULL, NULL, NULL);
2332 2332
2333 2333 if (!lerror) {
2334 2334 (void) VOP_FSYNC(svp, 0, cr, NULL);
2335 2335 VN_RELE(svp);
2336 2336 }
2337 2337
2338 2338 /*
2339 2339 * Force modified data and metadata out to stable storage.
2340 2340 */
2341 2341 (void) VOP_FSYNC(vp, 0, cr, NULL);
2342 2342
2343 2343 VN_RELE(vp);
2344 2344
2345 2345 *status = puterrno(error);
2346 2346 if (name != args->sla_tnm)
2347 2347 kmem_free(name, MAXPATHLEN);
2348 2348
2349 2349 }
2350 2350 void *
2351 2351 rfs_symlink_getfh(struct nfsslargs *args)
2352 2352 {
2353 2353 return (args->sla_from.da_fhandle);
2354 2354 }
2355 2355
2356 2356 /*
2357 2357 * Make a directory.
2358 2358 * Create a directory with the given name, parent directory, and attributes.
2359 2359 * Returns a file handle and attributes for the new directory.
2360 2360 */
2361 2361 /* ARGSUSED */
2362 2362 void
2363 2363 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2364 2364 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2365 2365 {
2366 2366 int error;
2367 2367 struct vattr va;
2368 2368 vnode_t *dvp = NULL;
2369 2369 vnode_t *vp;
2370 2370 char *name = args->ca_da.da_name;
2371 2371
2372 2372 /*
2373 2373 * Disallow NULL paths
2374 2374 */
2375 2375 if (name == NULL || *name == '\0') {
2376 2376 dr->dr_status = NFSERR_ACCES;
2377 2377 return;
2378 2378 }
2379 2379
2380 2380 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2381 2381 if (vp == NULL) {
2382 2382 dr->dr_status = NFSERR_STALE;
2383 2383 return;
2384 2384 }
2385 2385
2386 2386 if (rdonly(ro, vp)) {
2387 2387 VN_RELE(vp);
2388 2388 dr->dr_status = NFSERR_ROFS;
2389 2389 return;
2390 2390 }
2391 2391
2392 2392 error = sattr_to_vattr(args->ca_sa, &va);
2393 2393 if (error) {
2394 2394 VN_RELE(vp);
2395 2395 dr->dr_status = puterrno(error);
2396 2396 return;
2397 2397 }
2398 2398
2399 2399 if (!(va.va_mask & AT_MODE)) {
2400 2400 VN_RELE(vp);
2401 2401 dr->dr_status = NFSERR_INVAL;
2402 2402 return;
2403 2403 }
2404 2404
2405 2405 va.va_type = VDIR;
2406 2406 va.va_mask |= AT_TYPE;
2407 2407
2408 2408 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2409 2409
2410 2410 if (!error) {
2411 2411 /*
2412 2412 * Attribtutes of the newly created directory should
2413 2413 * be returned to the client.
2414 2414 */
2415 2415 va.va_mask = AT_ALL; /* We want everything */
2416 2416 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2417 2417
2418 2418 /* check for overflows */
2419 2419 if (!error) {
2420 2420 acl_perm(vp, exi, &va, cr);
2421 2421 error = vattr_to_nattr(&va, &dr->dr_attr);
2422 2422 if (!error) {
2423 2423 error = makefh(&dr->dr_fhandle, dvp, exi);
2424 2424 }
2425 2425 }
2426 2426 /*
2427 2427 * Force new data and metadata out to stable storage.
2428 2428 */
2429 2429 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2430 2430 VN_RELE(dvp);
2431 2431 }
2432 2432
2433 2433 /*
2434 2434 * Force modified data and metadata out to stable storage.
2435 2435 */
2436 2436 (void) VOP_FSYNC(vp, 0, cr, NULL);
2437 2437
2438 2438 VN_RELE(vp);
2439 2439
2440 2440 dr->dr_status = puterrno(error);
2441 2441
2442 2442 }
2443 2443 void *
2444 2444 rfs_mkdir_getfh(struct nfscreatargs *args)
2445 2445 {
2446 2446 return (args->ca_da.da_fhandle);
2447 2447 }
2448 2448
2449 2449 /*
2450 2450 * Remove a directory.
2451 2451 * Remove the given directory name from the given parent directory.
2452 2452 */
2453 2453 /* ARGSUSED */
2454 2454 void
2455 2455 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2456 2456 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2457 2457 {
2458 2458 int error;
2459 2459 vnode_t *vp;
2460 2460
2461 2461 /*
2462 2462 * Disallow NULL paths
2463 2463 */
2464 2464 if (da->da_name == NULL || *da->da_name == '\0') {
2465 2465 *status = NFSERR_ACCES;
2466 2466 return;
2467 2467 }
2468 2468
2469 2469 vp = nfs_fhtovp(da->da_fhandle, exi);
2470 2470 if (vp == NULL) {
2471 2471 *status = NFSERR_STALE;
2472 2472 return;
2473 2473 }
2474 2474
2475 2475 if (rdonly(ro, vp)) {
2476 2476 VN_RELE(vp);
2477 2477 *status = NFSERR_ROFS;
2478 2478 return;
2479 2479 }
2480 2480
2481 2481 /*
2482 2482 * VOP_RMDIR takes a third argument (the current
2483 2483 * directory of the process). That's because someone
2484 2484 * wants to return EINVAL if one tries to remove ".".
2485 2485 * Of course, NFS servers have no idea what their
2486 2486 * clients' current directories are. We fake it by
2487 2487 * supplying a vnode known to exist and illegal to
2488 2488 * remove.
2489 2489 */
2490 2490 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2491 2491
2492 2492 /*
2493 2493 * Force modified data and metadata out to stable storage.
2494 2494 */
2495 2495 (void) VOP_FSYNC(vp, 0, cr, NULL);
2496 2496
2497 2497 VN_RELE(vp);
2498 2498
2499 2499 /*
2500 2500 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2501 2501 * if the directory is not empty. A System V NFS server
2502 2502 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2503 2503 * over the wire.
2504 2504 */
2505 2505 if (error == EEXIST)
2506 2506 *status = NFSERR_NOTEMPTY;
2507 2507 else
2508 2508 *status = puterrno(error);
2509 2509
2510 2510 }
2511 2511 void *
2512 2512 rfs_rmdir_getfh(struct nfsdiropargs *da)
2513 2513 {
2514 2514 return (da->da_fhandle);
2515 2515 }
2516 2516
2517 2517 /* ARGSUSED */
2518 2518 void
2519 2519 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2520 2520 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2521 2521 {
2522 2522 int error;
2523 2523 int iseof;
2524 2524 struct iovec iov;
2525 2525 struct uio uio;
2526 2526 vnode_t *vp;
2527 2527 char *ndata = NULL;
2528 2528 struct sockaddr *ca;
2529 2529 size_t nents;
2530 2530 int ret;
2531 2531
2532 2532 vp = nfs_fhtovp(&rda->rda_fh, exi);
2533 2533 if (vp == NULL) {
2534 2534 rd->rd_entries = NULL;
2535 2535 rd->rd_status = NFSERR_STALE;
2536 2536 return;
2537 2537 }
2538 2538
2539 2539 if (vp->v_type != VDIR) {
2540 2540 VN_RELE(vp);
2541 2541 rd->rd_entries = NULL;
2542 2542 rd->rd_status = NFSERR_NOTDIR;
2543 2543 return;
2544 2544 }
2545 2545
2546 2546 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2547 2547
2548 2548 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2549 2549
2550 2550 if (error) {
2551 2551 rd->rd_entries = NULL;
2552 2552 goto bad;
2553 2553 }
2554 2554
2555 2555 if (rda->rda_count == 0) {
2556 2556 rd->rd_entries = NULL;
2557 2557 rd->rd_size = 0;
2558 2558 rd->rd_eof = FALSE;
2559 2559 goto bad;
2560 2560 }
2561 2561
2562 2562 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2563 2563
2564 2564 /*
2565 2565 * Allocate data for entries. This will be freed by rfs_rddirfree.
2566 2566 */
2567 2567 rd->rd_bufsize = (uint_t)rda->rda_count;
2568 2568 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2569 2569
2570 2570 /*
2571 2571 * Set up io vector to read directory data
2572 2572 */
2573 2573 iov.iov_base = (caddr_t)rd->rd_entries;
2574 2574 iov.iov_len = rda->rda_count;
2575 2575 uio.uio_iov = &iov;
2576 2576 uio.uio_iovcnt = 1;
2577 2577 uio.uio_segflg = UIO_SYSSPACE;
2578 2578 uio.uio_extflg = UIO_COPY_CACHED;
2579 2579 uio.uio_loffset = (offset_t)rda->rda_offset;
2580 2580 uio.uio_resid = rda->rda_count;
2581 2581
2582 2582 /*
2583 2583 * read directory
2584 2584 */
2585 2585 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2586 2586
2587 2587 /*
2588 2588 * Clean up
2589 2589 */
2590 2590 if (!error) {
2591 2591 /*
2592 2592 * set size and eof
2593 2593 */
2594 2594 if (uio.uio_resid == rda->rda_count) {
2595 2595 rd->rd_size = 0;
2596 2596 rd->rd_eof = TRUE;
2597 2597 } else {
2598 2598 rd->rd_size = (uint32_t)(rda->rda_count -
2599 2599 uio.uio_resid);
2600 2600 rd->rd_eof = iseof ? TRUE : FALSE;
2601 2601 }
2602 2602 }
2603 2603
2604 2604 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2605 2605 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2606 2606 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2607 2607 rda->rda_count, &ndata);
2608 2608
2609 2609 if (ret != 0) {
2610 2610 size_t dropbytes;
2611 2611 /*
2612 2612 * We had to drop one or more entries in order to fit
2613 2613 * during the character conversion. We need to patch
2614 2614 * up the size and eof info.
2615 2615 */
2616 2616 if (rd->rd_eof)
2617 2617 rd->rd_eof = FALSE;
2618 2618 dropbytes = nfscmd_dropped_entrysize(
2619 2619 (struct dirent64 *)rd->rd_entries, nents, ret);
2620 2620 rd->rd_size -= dropbytes;
2621 2621 }
2622 2622 if (ndata == NULL) {
2623 2623 ndata = (char *)rd->rd_entries;
2624 2624 } else if (ndata != (char *)rd->rd_entries) {
2625 2625 kmem_free(rd->rd_entries, rd->rd_bufsize);
2626 2626 rd->rd_entries = (void *)ndata;
2627 2627 rd->rd_bufsize = rda->rda_count;
2628 2628 }
2629 2629
2630 2630 bad:
2631 2631 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2632 2632
2633 2633 #if 0 /* notyet */
2634 2634 /*
2635 2635 * Don't do this. It causes local disk writes when just
2636 2636 * reading the file and the overhead is deemed larger
2637 2637 * than the benefit.
2638 2638 */
2639 2639 /*
2640 2640 * Force modified metadata out to stable storage.
2641 2641 */
2642 2642 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2643 2643 #endif
2644 2644
2645 2645 VN_RELE(vp);
2646 2646
2647 2647 rd->rd_status = puterrno(error);
2648 2648
2649 2649 }
2650 2650 void *
2651 2651 rfs_readdir_getfh(struct nfsrddirargs *rda)
2652 2652 {
2653 2653 return (&rda->rda_fh);
2654 2654 }
2655 2655 void
2656 2656 rfs_rddirfree(struct nfsrddirres *rd)
2657 2657 {
2658 2658 if (rd->rd_entries != NULL)
2659 2659 kmem_free(rd->rd_entries, rd->rd_bufsize);
2660 2660 }
2661 2661
2662 2662 /* ARGSUSED */
2663 2663 void
2664 2664 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2665 2665 struct svc_req *req, cred_t *cr, bool_t ro)
2666 2666 {
2667 2667 int error;
2668 2668 struct statvfs64 sb;
2669 2669 vnode_t *vp;
2670 2670
2671 2671 vp = nfs_fhtovp(fh, exi);
2672 2672 if (vp == NULL) {
2673 2673 fs->fs_status = NFSERR_STALE;
2674 2674 return;
2675 2675 }
2676 2676
2677 2677 error = VFS_STATVFS(vp->v_vfsp, &sb);
2678 2678
2679 2679 if (!error) {
2680 2680 fs->fs_tsize = nfstsize();
2681 2681 fs->fs_bsize = sb.f_frsize;
2682 2682 fs->fs_blocks = sb.f_blocks;
2683 2683 fs->fs_bfree = sb.f_bfree;
2684 2684 fs->fs_bavail = sb.f_bavail;
2685 2685 }
2686 2686
2687 2687 VN_RELE(vp);
2688 2688
2689 2689 fs->fs_status = puterrno(error);
2690 2690
2691 2691 }
2692 2692 void *
2693 2693 rfs_statfs_getfh(fhandle_t *fh)
2694 2694 {
2695 2695 return (fh);
2696 2696 }
2697 2697
2698 2698 static int
2699 2699 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2700 2700 {
2701 2701 vap->va_mask = 0;
2702 2702
2703 2703 /*
2704 2704 * There was a sign extension bug in some VFS based systems
2705 2705 * which stored the mode as a short. When it would get
2706 2706 * assigned to a u_long, no sign extension would occur.
2707 2707 * It needed to, but this wasn't noticed because sa_mode
2708 2708 * would then get assigned back to the short, thus ignoring
2709 2709 * the upper 16 bits of sa_mode.
2710 2710 *
2711 2711 * To make this implementation work for both broken
2712 2712 * clients and good clients, we check for both versions
2713 2713 * of the mode.
2714 2714 */
2715 2715 if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2716 2716 sa->sa_mode != (uint32_t)-1) {
2717 2717 vap->va_mask |= AT_MODE;
2718 2718 vap->va_mode = sa->sa_mode;
2719 2719 }
2720 2720 if (sa->sa_uid != (uint32_t)-1) {
2721 2721 vap->va_mask |= AT_UID;
2722 2722 vap->va_uid = sa->sa_uid;
2723 2723 }
2724 2724 if (sa->sa_gid != (uint32_t)-1) {
2725 2725 vap->va_mask |= AT_GID;
2726 2726 vap->va_gid = sa->sa_gid;
2727 2727 }
2728 2728 if (sa->sa_size != (uint32_t)-1) {
2729 2729 vap->va_mask |= AT_SIZE;
2730 2730 vap->va_size = sa->sa_size;
2731 2731 }
2732 2732 if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2733 2733 sa->sa_atime.tv_usec != (int32_t)-1) {
2734 2734 #ifndef _LP64
2735 2735 /* return error if time overflow */
2736 2736 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2737 2737 return (EOVERFLOW);
2738 2738 #endif
2739 2739 vap->va_mask |= AT_ATIME;
2740 2740 /*
2741 2741 * nfs protocol defines times as unsigned so don't extend sign,
2742 2742 * unless sysadmin set nfs_allow_preepoch_time.
2743 2743 */
2744 2744 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2745 2745 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2746 2746 }
2747 2747 if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2748 2748 sa->sa_mtime.tv_usec != (int32_t)-1) {
2749 2749 #ifndef _LP64
2750 2750 /* return error if time overflow */
2751 2751 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2752 2752 return (EOVERFLOW);
2753 2753 #endif
2754 2754 vap->va_mask |= AT_MTIME;
2755 2755 /*
2756 2756 * nfs protocol defines times as unsigned so don't extend sign,
2757 2757 * unless sysadmin set nfs_allow_preepoch_time.
2758 2758 */
2759 2759 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2760 2760 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2761 2761 }
2762 2762 return (0);
2763 2763 }
2764 2764
2765 2765 static enum nfsftype vt_to_nf[] = {
2766 2766 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2767 2767 };
2768 2768
2769 2769 /*
2770 2770 * check the following fields for overflow: nodeid, size, and time.
2771 2771 * There could be a problem when converting 64-bit LP64 fields
2772 2772 * into 32-bit ones. Return an error if there is an overflow.
2773 2773 */
2774 2774 int
2775 2775 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2776 2776 {
2777 2777 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2778 2778 na->na_type = vt_to_nf[vap->va_type];
2779 2779
2780 2780 if (vap->va_mode == (unsigned short) -1)
2781 2781 na->na_mode = (uint32_t)-1;
2782 2782 else
2783 2783 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2784 2784
2785 2785 if (vap->va_uid == (unsigned short)(-1))
2786 2786 na->na_uid = (uint32_t)(-1);
2787 2787 else if (vap->va_uid == UID_NOBODY)
2788 2788 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2789 2789 else
2790 2790 na->na_uid = vap->va_uid;
2791 2791
2792 2792 if (vap->va_gid == (unsigned short)(-1))
2793 2793 na->na_gid = (uint32_t)-1;
2794 2794 else if (vap->va_gid == GID_NOBODY)
2795 2795 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2796 2796 else
2797 2797 na->na_gid = vap->va_gid;
2798 2798
2799 2799 /*
2800 2800 * Do we need to check fsid for overflow? It is 64-bit in the
2801 2801 * vattr, but are bigger than 32 bit values supported?
2802 2802 */
2803 2803 na->na_fsid = vap->va_fsid;
2804 2804
2805 2805 na->na_nodeid = vap->va_nodeid;
2806 2806
2807 2807 /*
2808 2808 * Check to make sure that the nodeid is representable over the
2809 2809 * wire without losing bits.
2810 2810 */
2811 2811 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2812 2812 return (EFBIG);
2813 2813 na->na_nlink = vap->va_nlink;
2814 2814
2815 2815 /*
2816 2816 * Check for big files here, instead of at the caller. See
2817 2817 * comments in cstat for large special file explanation.
2818 2818 */
2819 2819 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2820 2820 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2821 2821 return (EFBIG);
2822 2822 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2823 2823 /* UNKNOWN_SIZE | OVERFLOW */
2824 2824 na->na_size = MAXOFF32_T;
2825 2825 } else
2826 2826 na->na_size = vap->va_size;
2827 2827 } else
2828 2828 na->na_size = vap->va_size;
2829 2829
2830 2830 /*
2831 2831 * If the vnode times overflow the 32-bit times that NFS2
2832 2832 * uses on the wire then return an error.
2833 2833 */
2834 2834 if (!NFS_VAP_TIME_OK(vap)) {
2835 2835 return (EOVERFLOW);
2836 2836 }
2837 2837 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2838 2838 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2839 2839
2840 2840 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2841 2841 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2842 2842
2843 2843 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2844 2844 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2845 2845
2846 2846 /*
2847 2847 * If the dev_t will fit into 16 bits then compress
2848 2848 * it, otherwise leave it alone. See comments in
2849 2849 * nfs_client.c.
2850 2850 */
2851 2851 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2852 2852 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2853 2853 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2854 2854 else
2855 2855 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2856 2856
2857 2857 na->na_blocks = vap->va_nblocks;
2858 2858 na->na_blocksize = vap->va_blksize;
2859 2859
2860 2860 /*
2861 2861 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2862 2862 * over-the-wire protocols for named-pipe vnodes. It remaps the
2863 2863 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2864 2864 *
2865 2865 * BUYER BEWARE:
2866 2866 * If you are porting the NFS to a non-Sun server, you probably
2867 2867 * don't want to include the following block of code. The
2868 2868 * over-the-wire special file types will be changing with the
2869 2869 * NFS Protocol Revision.
2870 2870 */
2871 2871 if (vap->va_type == VFIFO)
2872 2872 NA_SETFIFO(na);
2873 2873 return (0);
2874 2874 }
2875 2875
2876 2876 /*
2877 2877 * acl v2 support: returns approximate permission.
2878 2878 * default: returns minimal permission (more restrictive)
2879 2879 * aclok: returns maximal permission (less restrictive)
2880 2880 * This routine changes the permissions that are alaredy in *va.
2881 2881 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2882 2882 * CLASS_OBJ is always the same as GROUP_OBJ entry.
2883 2883 */
2884 2884 static void
2885 2885 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2886 2886 {
2887 2887 vsecattr_t vsa;
2888 2888 int aclcnt;
2889 2889 aclent_t *aclentp;
2890 2890 mode_t mask_perm;
2891 2891 mode_t grp_perm;
2892 2892 mode_t other_perm;
2893 2893 mode_t other_orig;
2894 2894 int error;
2895 2895
2896 2896 /* dont care default acl */
2897 2897 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2898 2898 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2899 2899
2900 2900 if (!error) {
2901 2901 aclcnt = vsa.vsa_aclcnt;
2902 2902 if (aclcnt > MIN_ACL_ENTRIES) {
2903 2903 /* non-trivial ACL */
2904 2904 aclentp = vsa.vsa_aclentp;
2905 2905 if (exi->exi_export.ex_flags & EX_ACLOK) {
2906 2906 /* maximal permissions */
2907 2907 grp_perm = 0;
2908 2908 other_perm = 0;
2909 2909 for (; aclcnt > 0; aclcnt--, aclentp++) {
2910 2910 switch (aclentp->a_type) {
2911 2911 case USER_OBJ:
2912 2912 break;
2913 2913 case USER:
2914 2914 grp_perm |=
2915 2915 aclentp->a_perm << 3;
2916 2916 other_perm |= aclentp->a_perm;
2917 2917 break;
2918 2918 case GROUP_OBJ:
2919 2919 grp_perm |=
2920 2920 aclentp->a_perm << 3;
2921 2921 break;
2922 2922 case GROUP:
2923 2923 other_perm |= aclentp->a_perm;
2924 2924 break;
2925 2925 case OTHER_OBJ:
2926 2926 other_orig = aclentp->a_perm;
2927 2927 break;
2928 2928 case CLASS_OBJ:
2929 2929 mask_perm = aclentp->a_perm;
2930 2930 break;
2931 2931 default:
2932 2932 break;
2933 2933 }
2934 2934 }
2935 2935 grp_perm &= mask_perm << 3;
2936 2936 other_perm &= mask_perm;
2937 2937 other_perm |= other_orig;
2938 2938
2939 2939 } else {
2940 2940 /* minimal permissions */
2941 2941 grp_perm = 070;
2942 2942 other_perm = 07;
2943 2943 for (; aclcnt > 0; aclcnt--, aclentp++) {
2944 2944 switch (aclentp->a_type) {
2945 2945 case USER_OBJ:
2946 2946 break;
2947 2947 case USER:
2948 2948 case CLASS_OBJ:
2949 2949 grp_perm &=
2950 2950 aclentp->a_perm << 3;
2951 2951 other_perm &=
2952 2952 aclentp->a_perm;
2953 2953 break;
2954 2954 case GROUP_OBJ:
2955 2955 grp_perm &=
2956 2956 aclentp->a_perm << 3;
2957 2957 break;
2958 2958 case GROUP:
2959 2959 other_perm &=
2960 2960 aclentp->a_perm;
2961 2961 break;
2962 2962 case OTHER_OBJ:
2963 2963 other_perm &=
2964 2964 aclentp->a_perm;
2965 2965 break;
2966 2966 default:
2967 2967 break;
2968 2968 }
2969 2969 }
2970 2970 }
2971 2971 /* copy to va */
2972 2972 va->va_mode &= ~077;
2973 2973 va->va_mode |= grp_perm | other_perm;
2974 2974 }
2975 2975 if (vsa.vsa_aclcnt)
2976 2976 kmem_free(vsa.vsa_aclentp,
2977 2977 vsa.vsa_aclcnt * sizeof (aclent_t));
2978 2978 }
2979 2979 }
2980 2980
2981 2981 void
2982 2982 rfs_srvrinit(void)
2983 2983 {
2984 2984 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2985 2985 nfs2_srv_caller_id = fs_new_caller_id();
2986 2986 }
2987 2987
2988 2988 void
2989 2989 rfs_srvrfini(void)
2990 2990 {
2991 2991 mutex_destroy(&rfs_async_write_lock);
2992 2992 }
2993 2993
2994 2994 static int
2995 2995 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2996 2996 {
2997 2997 struct clist *wcl;
2998 2998 int wlist_len;
2999 2999 uint32_t count = rr->rr_count;
3000 3000
3001 3001 wcl = ra->ra_wlist;
3002 3002
3003 3003 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3004 3004 return (FALSE);
3005 3005 }
3006 3006
3007 3007 wcl = ra->ra_wlist;
3008 3008 rr->rr_ok.rrok_wlist_len = wlist_len;
3009 3009 rr->rr_ok.rrok_wlist = wcl;
3010 3010
3011 3011 return (TRUE);
3012 3012 }
↓ open down ↓ |
803 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX