1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 * All rights reserved.
28 */
29
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/cred.h>
34 #include <sys/buf.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/uio.h>
38 #include <sys/stat.h>
39 #include <sys/errno.h>
40 #include <sys/sysmacros.h>
41 #include <sys/statvfs.h>
42 #include <sys/kmem.h>
43 #include <sys/kstat.h>
44 #include <sys/dirent.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <sys/vtrace.h>
48 #include <sys/mode.h>
49 #include <sys/acl.h>
50 #include <sys/nbmlock.h>
51 #include <sys/policy.h>
52 #include <sys/sdt.h>
53
54 #include <rpc/types.h>
55 #include <rpc/auth.h>
56 #include <rpc/svc.h>
57
58 #include <nfs/nfs.h>
59 #include <nfs/export.h>
60 #include <nfs/nfs_cmd.h>
61
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_map.h>
66 #include <vm/seg_kmem.h>
67
68 #include <sys/strsubr.h>
69
70 /*
71 * These are the interface routines for the server side of the
72 * Network File System. See the NFS version 2 protocol specification
73 * for a description of this interface.
74 */
75
76 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
77 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
78 cred_t *);
79
/*
 * Some "over the wire" UNIX file types. These are encoded
 * into the mode. This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

/* Caller id stamped into caller_context_t (cc_caller_id) by the v2 server ops. */
u_longlong_t nfs2_srv_caller_id;
90
91 /*
92 * Get file attributes.
93 * Returns the current attributes of the file with the given fhandle.
94 */
95 /* ARGSUSED */
96 void
97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
98 struct svc_req *req, cred_t *cr)
99 {
100 int error;
101 vnode_t *vp;
102 struct vattr va;
103
104 vp = nfs_fhtovp(fhp, exi);
105 if (vp == NULL) {
106 ns->ns_status = NFSERR_STALE;
107 return;
108 }
109
110 /*
111 * Do the getattr.
112 */
113 va.va_mask = AT_ALL; /* we want all the attributes */
114
115 error = rfs4_delegated_getattr(vp, &va, 0, cr);
116
117 /* check for overflows */
118 if (!error) {
119 /* Lie about the object type for a referral */
120 if (vn_is_nfs_reparse(vp, cr))
121 va.va_type = VLNK;
122
123 acl_perm(vp, exi, &va, cr);
124 error = vattr_to_nattr(&va, &ns->ns_attr);
125 }
126
127 VN_RELE(vp);
128
129 ns->ns_status = puterrno(error);
130 }
131 void *
132 rfs_getattr_getfh(fhandle_t *fhp)
133 {
134 return (fhp);
135 }
136
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;		/* flags for VOP_SETATTR (0 or ATTR_UTIME) */
	int in_crit = 0;	/* 1 while inside the nbl critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* current ("before") attributes of the file */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on read-only exports or filesystems. */
	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Decode the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so. To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The affected region is the span between the old
			 * and new sizes, whichever direction it grows.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE path (the kludge
		 * described above); everyone else falls through to the
		 * normal VOP_SETATTR with AT_SIZE still set.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
323 void *
324 rfs_setattr_getfh(struct nfssaargs *args)
325 {
326 return (&args->saa_fh);
327 }
328
/* Change and release @exip and @vpp only in success */
/*
 * Cross a mount point: if *vpp is mounted over, walk to the root of the
 * mounted filesystem and, when that filesystem is exported with EX_NOHIDE,
 * replace the caller's *vpp/*exip (releasing the old ones) with the
 * submount's vnode and exportinfo.  Not crossing (subdir not exported, or
 * "nohide" absent) is not an error: the caller's pointers are untouched
 * and 0 is returned.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp;
	fid_t fid;
	int error;

	vp = *vpp;

	/* traverse() releases argument in success */
	VN_HOLD(*vpp);

	if ((error = traverse(&vp)) != 0) {
		/*
		 * Drop the extra hold taken above; the caller's original
		 * reference on *vpp remains intact.
		 * NOTE(review): assumes traverse() does not consume its
		 * argument's hold on failure — confirm against traverse().
		 */
		VN_RELE(*vpp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/* It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}
	return (0);
}
373
374 /*
375 * Directory lookup.
376 * Returns an fhandle and file attributes for file name in a directory.
377 */
378 /* ARGSUSED */
379 void
380 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
381 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
382 {
383 int error;
384 vnode_t *dvp;
385 vnode_t *vp;
386 struct vattr va;
387 fhandle_t *fhp = da->da_fhandle;
388 struct sec_ol sec = {0, 0};
389 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
390 char *name;
391 struct sockaddr *ca;
392
393 /*
394 * Trusted Extension doesn't support NFSv2. MOUNT
395 * will reject v2 clients. Need to prevent v2 client
396 * access via WebNFS here.
397 */
398 if (is_system_labeled() && req->rq_vers == 2) {
399 dr->dr_status = NFSERR_ACCES;
400 return;
401 }
402
403 /*
404 * Disallow NULL paths
405 */
406 if (da->da_name == NULL || *da->da_name == '\0') {
407 dr->dr_status = NFSERR_ACCES;
408 return;
409 }
410
411 /*
412 * Allow lookups from the root - the default
413 * location of the public filehandle.
414 */
415 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
416 dvp = rootdir;
417 VN_HOLD(dvp);
418 } else {
419 dvp = nfs_fhtovp(fhp, exi);
420 if (dvp == NULL) {
421 dr->dr_status = NFSERR_STALE;
422 return;
423 }
424 }
425
426 /*
427 * Not allow lookup beyond root.
428 * If the filehandle matches a filehandle of the exi,
429 * then the ".." refers beyond the root of an exported filesystem.
430 */
431 if (strcmp(da->da_name, "..") == 0 &&
432 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
433 VN_RELE(dvp);
434 dr->dr_status = NFSERR_NOENT;
435 return;
436 }
437
438 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
439 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
440 MAXPATHLEN);
441
442 if (name == NULL) {
443 dr->dr_status = NFSERR_ACCES;
444 return;
445 }
446
447 exi_hold(exi);
448
449 /*
450 * If the public filehandle is used then allow
451 * a multi-component lookup, i.e. evaluate
452 * a pathname and follow symbolic links if
453 * necessary.
454 *
455 * This may result in a vnode in another filesystem
456 * which is OK as long as the filesystem is exported.
457 */
458 if (PUBLIC_FH2(fhp)) {
459 struct exportinfo *new;
460
461 publicfh_flag = TRUE;
462 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &new,
463 &sec);
464
465 if (error == 0) {
466 exi_rele(exi);
467 exi = new;
468 }
469 } else {
470 /*
471 * Do a normal single component lookup.
472 */
473 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
474 NULL, NULL, NULL);
475 }
476
477 if (name != da->da_name)
478 kmem_free(name, MAXPATHLEN);
479
480 if (error == 0 && vn_ismntpt(vp)) {
481 error = rfs_cross_mnt(&vp, &exi);
482 if (error)
483 VN_RELE(vp);
484 }
485
486 if (!error) {
487 va.va_mask = AT_ALL; /* we want everything */
488
489 error = rfs4_delegated_getattr(vp, &va, 0, cr);
490
491 /* check for overflows */
492 if (!error) {
493 acl_perm(vp, exi, &va, cr);
494 error = vattr_to_nattr(&va, &dr->dr_attr);
495 if (!error) {
496 if (sec.sec_flags & SEC_QUERY)
497 error = makefh_ol(&dr->dr_fhandle, exi,
498 sec.sec_index);
499 else {
500 error = makefh(&dr->dr_fhandle, vp,
501 exi);
502 if (!error && publicfh_flag &&
503 !chk_clnt_sec(exi, req))
504 auth_weak = TRUE;
505 }
506 }
507 }
508 VN_RELE(vp);
509 }
510
511 VN_RELE(dvp);
512
513 /* The passed argument exportinfo is released by the
514 * caller, comon_dispatch
515 */
516 exi_rele(exi);
517
518 /*
519 * If it's public fh, no 0x81, and client's flavor is
520 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
521 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
522 */
523 if (auth_weak)
524 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
525 else
526 dr->dr_status = puterrno(error);
527 }
528 void *
529 rfs_lookup_getfh(struct nfsdiropargs *da)
530 {
531 return (da->da_fhandle);
532 }
533
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* 1 if vp is a reparse point (referral) */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* Only the mode is needed, for the MANDLOCK check below. */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Convert the link text for the client's character set; if the
	 * conversion allocated a new buffer, swap it in for rl_data.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
664 void *
665 rfs_readlink_getfh(fhandle_t *fhp)
666 {
667 return (fhp);
668 }
669 /*
670 * Free data allocated by rfs_readlink
671 */
672 void
673 rfs_rlfree(struct nfsrdlnres *rl)
674 {
675 if (rl->rl_data != NULL)
676 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
677 }
678
679 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
680
681 /*
682 * Read data.
683 * Returns some data read from the file at the given fhandle.
684 */
685 /* ARGSUSED */
686 void
687 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
688 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
689 {
690 vnode_t *vp;
691 int error;
692 struct vattr va;
693 struct iovec iov;
694 struct uio uio;
695 mblk_t *mp;
696 int alloc_err = 0;
697 int in_crit = 0;
698 caller_context_t ct;
699
700 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
701 if (vp == NULL) {
702 rr->rr_data = NULL;
703 rr->rr_status = NFSERR_STALE;
704 return;
705 }
706
707 if (vp->v_type != VREG) {
708 VN_RELE(vp);
709 rr->rr_data = NULL;
710 rr->rr_status = NFSERR_ISDIR;
711 return;
712 }
713
714 ct.cc_sysid = 0;
715 ct.cc_pid = 0;
716 ct.cc_caller_id = nfs2_srv_caller_id;
717 ct.cc_flags = CC_DONTBLOCK;
718
719 /*
720 * Enter the critical region before calling VOP_RWLOCK
721 * to avoid a deadlock with write requests.
722 */
723 if (nbl_need_check(vp)) {
724 nbl_start_crit(vp, RW_READER);
725 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
726 0, NULL)) {
727 nbl_end_crit(vp);
728 VN_RELE(vp);
729 rr->rr_data = NULL;
730 rr->rr_status = NFSERR_ACCES;
731 return;
732 }
733 in_crit = 1;
734 }
735
736 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
737
738 /* check if a monitor detected a delegation conflict */
739 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
740 VN_RELE(vp);
741 /* mark as wouldblock so response is dropped */
742 curthread->t_flag |= T_WOULDBLOCK;
743
744 rr->rr_data = NULL;
745 return;
746 }
747
748 va.va_mask = AT_ALL;
749
750 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
751
752 if (error) {
753 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
754 if (in_crit)
755 nbl_end_crit(vp);
756
757 VN_RELE(vp);
758 rr->rr_data = NULL;
759 rr->rr_status = puterrno(error);
760
761 return;
762 }
763
764 /*
765 * This is a kludge to allow reading of files created
766 * with no read permission. The owner of the file
767 * is always allowed to read it.
768 */
769 if (crgetuid(cr) != va.va_uid) {
770 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
771
772 if (error) {
773 /*
774 * Exec is the same as read over the net because
775 * of demand loading.
776 */
777 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
778 }
779 if (error) {
780 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
781 if (in_crit)
782 nbl_end_crit(vp);
783 VN_RELE(vp);
784 rr->rr_data = NULL;
785 rr->rr_status = puterrno(error);
786
787 return;
788 }
789 }
790
791 if (MANDLOCK(vp, va.va_mode)) {
792 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
793 if (in_crit)
794 nbl_end_crit(vp);
795
796 VN_RELE(vp);
797 rr->rr_data = NULL;
798 rr->rr_status = NFSERR_ACCES;
799
800 return;
801 }
802
803 rr->rr_ok.rrok_wlist_len = 0;
804 rr->rr_ok.rrok_wlist = NULL;
805
806 if ((u_offset_t)ra->ra_offset >= va.va_size) {
807 rr->rr_count = 0;
808 rr->rr_data = NULL;
809 /*
810 * In this case, status is NFS_OK, but there is no data
811 * to encode. So set rr_mp to NULL.
812 */
813 rr->rr_mp = NULL;
814 rr->rr_ok.rrok_wlist = ra->ra_wlist;
815 if (rr->rr_ok.rrok_wlist)
816 clist_zero_len(rr->rr_ok.rrok_wlist);
817 goto done;
818 }
819
820 if (ra->ra_wlist) {
821 mp = NULL;
822 rr->rr_mp = NULL;
823 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
824 if (ra->ra_count > iov.iov_len) {
825 rr->rr_data = NULL;
826 rr->rr_status = NFSERR_INVAL;
827 goto done;
828 }
829 } else {
830 /*
831 * mp will contain the data to be sent out in the read reply.
832 * This will be freed after the reply has been sent out (by the
833 * driver).
834 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
835 * that the call to xdrmblk_putmblk() never fails.
836 */
837 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
838 &alloc_err);
839 ASSERT(mp != NULL);
840 ASSERT(alloc_err == 0);
841
842 rr->rr_mp = mp;
843
844 /*
845 * Set up io vector
846 */
847 iov.iov_base = (caddr_t)mp->b_datap->db_base;
848 iov.iov_len = ra->ra_count;
849 }
850
851 uio.uio_iov = &iov;
852 uio.uio_iovcnt = 1;
853 uio.uio_segflg = UIO_SYSSPACE;
854 uio.uio_extflg = UIO_COPY_CACHED;
855 uio.uio_loffset = (offset_t)ra->ra_offset;
856 uio.uio_resid = ra->ra_count;
857
858 error = VOP_READ(vp, &uio, 0, cr, &ct);
859
860 if (error) {
861 if (mp)
862 freeb(mp);
863
864 /*
865 * check if a monitor detected a delegation conflict and
866 * mark as wouldblock so response is dropped
867 */
868 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
869 curthread->t_flag |= T_WOULDBLOCK;
870 else
871 rr->rr_status = puterrno(error);
872
873 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
874 if (in_crit)
875 nbl_end_crit(vp);
876
877 VN_RELE(vp);
878 rr->rr_data = NULL;
879
880 return;
881 }
882
883 /*
884 * Get attributes again so we can send the latest access
885 * time to the client side for his cache.
886 */
887 va.va_mask = AT_ALL;
888
889 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
890
891 if (error) {
892 if (mp)
893 freeb(mp);
894
895 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
896 if (in_crit)
897 nbl_end_crit(vp);
898
899 VN_RELE(vp);
900 rr->rr_data = NULL;
901 rr->rr_status = puterrno(error);
902
903 return;
904 }
905
906 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
907
908 if (mp) {
909 rr->rr_data = (char *)mp->b_datap->db_base;
910 } else {
911 if (ra->ra_wlist) {
912 rr->rr_data = (caddr_t)iov.iov_base;
913 if (!rdma_setup_read_data2(ra, rr)) {
914 rr->rr_data = NULL;
915 rr->rr_status = puterrno(NFSERR_INVAL);
916 }
917 }
918 }
919 done:
920 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
921 if (in_crit)
922 nbl_end_crit(vp);
923
924 acl_perm(vp, exi, &va, cr);
925
926 /* check for overflows */
927 error = vattr_to_nattr(&va, &rr->rr_attr);
928
929 VN_RELE(vp);
930
931 rr->rr_status = puterrno(error);
932 }
933
934 /*
935 * Free data allocated by rfs_read
936 */
937 void
938 rfs_rdfree(struct nfsrdresult *rr)
939 {
940 mblk_t *mp;
941
942 if (rr->rr_status == NFS_OK) {
943 mp = rr->rr_mp;
944 if (mp != NULL)
945 freeb(mp);
946 }
947 }
948
949 void *
950 rfs_read_getfh(struct nfsreadargs *ra)
951 {
952 return (&ra->ra_fhandle);
953 }
954
955 #define MAX_IOVECS 12
956
957 #ifdef DEBUG
958 static int rfs_write_sync_hits = 0;
959 static int rfs_write_sync_misses = 0;
960 #endif
961
962 /*
963 * Write data to file.
964 * Returns attributes of a file after writing some data to it.
965 *
966 * Any changes made here, especially in error handling might have
967 * to also be done in rfs_write (which clusters write requests).
968 */
969 void
970 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
971 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
972 {
973 int error;
974 vnode_t *vp;
975 rlim64_t rlimit;
976 struct vattr va;
977 struct uio uio;
978 struct iovec iov[MAX_IOVECS];
979 mblk_t *m;
980 struct iovec *iovp;
981 int iovcnt;
982 cred_t *savecred;
983 int in_crit = 0;
984 caller_context_t ct;
985
986 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
987 if (vp == NULL) {
988 ns->ns_status = NFSERR_STALE;
989 return;
990 }
991
992 if (rdonly(exi, req)) {
993 VN_RELE(vp);
994 ns->ns_status = NFSERR_ROFS;
995 return;
996 }
997
998 if (vp->v_type != VREG) {
999 VN_RELE(vp);
1000 ns->ns_status = NFSERR_ISDIR;
1001 return;
1002 }
1003
1004 ct.cc_sysid = 0;
1005 ct.cc_pid = 0;
1006 ct.cc_caller_id = nfs2_srv_caller_id;
1007 ct.cc_flags = CC_DONTBLOCK;
1008
1009 va.va_mask = AT_UID|AT_MODE;
1010
1011 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1012
1013 if (error) {
1014 VN_RELE(vp);
1015 ns->ns_status = puterrno(error);
1016
1017 return;
1018 }
1019
1020 if (crgetuid(cr) != va.va_uid) {
1021 /*
1022 * This is a kludge to allow writes of files created
1023 * with read only permission. The owner of the file
1024 * is always allowed to write it.
1025 */
1026 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1027
1028 if (error) {
1029 VN_RELE(vp);
1030 ns->ns_status = puterrno(error);
1031 return;
1032 }
1033 }
1034
1035 /*
1036 * Can't access a mandatory lock file. This might cause
1037 * the NFS service thread to block forever waiting for a
1038 * lock to be released that will never be released.
1039 */
1040 if (MANDLOCK(vp, va.va_mode)) {
1041 VN_RELE(vp);
1042 ns->ns_status = NFSERR_ACCES;
1043 return;
1044 }
1045
1046 /*
1047 * We have to enter the critical region before calling VOP_RWLOCK
1048 * to avoid a deadlock with ufs.
1049 */
1050 if (nbl_need_check(vp)) {
1051 nbl_start_crit(vp, RW_READER);
1052 in_crit = 1;
1053 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1054 wa->wa_count, 0, NULL)) {
1055 error = EACCES;
1056 goto out;
1057 }
1058 }
1059
1060 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1061
1062 /* check if a monitor detected a delegation conflict */
1063 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1064 VN_RELE(vp);
1065 /* mark as wouldblock so response is dropped */
1066 curthread->t_flag |= T_WOULDBLOCK;
1067 return;
1068 }
1069
1070 if (wa->wa_data || wa->wa_rlist) {
1071 /* Do the RDMA thing if necessary */
1072 if (wa->wa_rlist) {
1073 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1074 iov[0].iov_len = wa->wa_count;
1075 } else {
1076 iov[0].iov_base = wa->wa_data;
1077 iov[0].iov_len = wa->wa_count;
1078 }
1079 uio.uio_iov = iov;
1080 uio.uio_iovcnt = 1;
1081 uio.uio_segflg = UIO_SYSSPACE;
1082 uio.uio_extflg = UIO_COPY_DEFAULT;
1083 uio.uio_loffset = (offset_t)wa->wa_offset;
1084 uio.uio_resid = wa->wa_count;
1085 /*
1086 * The limit is checked on the client. We
1087 * should allow any size writes here.
1088 */
1089 uio.uio_llimit = curproc->p_fsz_ctl;
1090 rlimit = uio.uio_llimit - wa->wa_offset;
1091 if (rlimit < (rlim64_t)uio.uio_resid)
1092 uio.uio_resid = (uint_t)rlimit;
1093
1094 /*
1095 * for now we assume no append mode
1096 */
1097 /*
1098 * We're changing creds because VM may fault and we need
1099 * the cred of the current thread to be used if quota
1100 * checking is enabled.
1101 */
1102 savecred = curthread->t_cred;
1103 curthread->t_cred = cr;
1104 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1105 curthread->t_cred = savecred;
1106 } else {
1107 iovcnt = 0;
1108 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1109 iovcnt++;
1110 if (iovcnt <= MAX_IOVECS) {
1111 #ifdef DEBUG
1112 rfs_write_sync_hits++;
1113 #endif
1114 iovp = iov;
1115 } else {
1116 #ifdef DEBUG
1117 rfs_write_sync_misses++;
1118 #endif
1119 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1120 }
1121 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1122 uio.uio_iov = iovp;
1123 uio.uio_iovcnt = iovcnt;
1124 uio.uio_segflg = UIO_SYSSPACE;
1125 uio.uio_extflg = UIO_COPY_DEFAULT;
1126 uio.uio_loffset = (offset_t)wa->wa_offset;
1127 uio.uio_resid = wa->wa_count;
1128 /*
1129 * The limit is checked on the client. We
1130 * should allow any size writes here.
1131 */
1132 uio.uio_llimit = curproc->p_fsz_ctl;
1133 rlimit = uio.uio_llimit - wa->wa_offset;
1134 if (rlimit < (rlim64_t)uio.uio_resid)
1135 uio.uio_resid = (uint_t)rlimit;
1136
1137 /*
1138 * For now we assume no append mode.
1139 */
1140 /*
1141 * We're changing creds because VM may fault and we need
1142 * the cred of the current thread to be used if quota
1143 * checking is enabled.
1144 */
1145 savecred = curthread->t_cred;
1146 curthread->t_cred = cr;
1147 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1148 curthread->t_cred = savecred;
1149
1150 if (iovp != iov)
1151 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1152 }
1153
1154 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1155
1156 if (!error) {
1157 /*
1158 * Get attributes again so we send the latest mod
1159 * time to the client side for his cache.
1160 */
1161 va.va_mask = AT_ALL; /* now we want everything */
1162
1163 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1164
1165 /* check for overflows */
1166 if (!error) {
1167 acl_perm(vp, exi, &va, cr);
1168 error = vattr_to_nattr(&va, &ns->ns_attr);
1169 }
1170 }
1171
1172 out:
1173 if (in_crit)
1174 nbl_end_crit(vp);
1175 VN_RELE(vp);
1176
1177 /* check if a monitor detected a delegation conflict */
1178 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1179 /* mark as wouldblock so response is dropped */
1180 curthread->t_flag |= T_WOULDBLOCK;
1181 else
1182 ns->ns_status = puterrno(error);
1183
1184 }
1185
/*
 * One pending v2 WRITE request waiting to be clustered with other
 * writes to the same file.  The issuing service thread sleeps until
 * ns->ns_status is changed from RFSWRITE_INITVAL by the thread that
 * processes the cluster (see rfs_write).
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* request arguments */
	struct nfsattrstat *ns;		/* reply; ns_status signals completion */
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;		/* service thread waiting on this entry */
	struct rfs_async_write *list;	/* next request, kept sorted by offset */
};

/*
 * A cluster of write requests to a single file, identified by fhp.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* broadcast when the cluster completes */
	struct rfs_async_write *list;	/* requests in this cluster */
	struct rfs_async_write_list *next; /* cluster for a different file */
};

/* List of active write clusters; protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
/*
 * Sentinel "not yet processed" status; 0 cannot be used because it
 * would read as NFS_OK.
 */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
/* Debug counters for the write clustering path. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1213
1214 /*
1215 * Write data to file.
1216 * Returns attributes of a file after writing some data to it.
1217 */
1218 void
1219 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1220 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1221 {
1222 int error;
1223 vnode_t *vp;
1224 rlim64_t rlimit;
1225 struct vattr va;
1226 struct uio uio;
1227 struct rfs_async_write_list *lp;
1228 struct rfs_async_write_list *nlp;
1229 struct rfs_async_write *rp;
1230 struct rfs_async_write *nrp;
1231 struct rfs_async_write *trp;
1232 struct rfs_async_write *lrp;
1233 int data_written;
1234 int iovcnt;
1235 mblk_t *m;
1236 struct iovec *iovp;
1237 struct iovec *niovp;
1238 struct iovec iov[MAXCLIOVECS];
1239 int count;
1240 int rcount;
1241 uint_t off;
1242 uint_t len;
1243 struct rfs_async_write nrpsp;
1244 struct rfs_async_write_list nlpsp;
1245 ushort_t t_flag;
1246 cred_t *savecred;
1247 int in_crit = 0;
1248 caller_context_t ct;
1249
1250 if (!rfs_write_async) {
1251 rfs_write_sync(wa, ns, exi, req, cr);
1252 return;
1253 }
1254
1255 /*
1256 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1257 * is considered an OK.
1258 */
1259 ns->ns_status = RFSWRITE_INITVAL;
1260
1261 nrp = &nrpsp;
1262 nrp->wa = wa;
1263 nrp->ns = ns;
1264 nrp->req = req;
1265 nrp->cr = cr;
1266 nrp->thread = curthread;
1267
1268 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1269
1270 /*
1271 * Look to see if there is already a cluster started
1272 * for this file.
1273 */
1274 mutex_enter(&rfs_async_write_lock);
1275 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1276 if (bcmp(&wa->wa_fhandle, lp->fhp,
1277 sizeof (fhandle_t)) == 0)
1278 break;
1279 }
1280
1281 /*
1282 * If lp is non-NULL, then there is already a cluster
1283 * started. We need to place ourselves in the cluster
1284 * list in the right place as determined by starting
1285 * offset. Conflicts with non-blocking mandatory locked
1286 * regions will be checked when the cluster is processed.
1287 */
1288 if (lp != NULL) {
1289 rp = lp->list;
1290 trp = NULL;
1291 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1292 trp = rp;
1293 rp = rp->list;
1294 }
1295 nrp->list = rp;
1296 if (trp == NULL)
1297 lp->list = nrp;
1298 else
1299 trp->list = nrp;
1300 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1301 cv_wait(&lp->cv, &rfs_async_write_lock);
1302 mutex_exit(&rfs_async_write_lock);
1303
1304 return;
1305 }
1306
1307 /*
1308 * No cluster started yet, start one and add ourselves
1309 * to the list of clusters.
1310 */
1311 nrp->list = NULL;
1312
1313 nlp = &nlpsp;
1314 nlp->fhp = &wa->wa_fhandle;
1315 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1316 nlp->list = nrp;
1317 nlp->next = NULL;
1318
1319 if (rfs_async_write_head == NULL) {
1320 rfs_async_write_head = nlp;
1321 } else {
1322 lp = rfs_async_write_head;
1323 while (lp->next != NULL)
1324 lp = lp->next;
1325 lp->next = nlp;
1326 }
1327 mutex_exit(&rfs_async_write_lock);
1328
1329 /*
1330 * Convert the file handle common to all of the requests
1331 * in this cluster to a vnode.
1332 */
1333 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1334 if (vp == NULL) {
1335 mutex_enter(&rfs_async_write_lock);
1336 if (rfs_async_write_head == nlp)
1337 rfs_async_write_head = nlp->next;
1338 else {
1339 lp = rfs_async_write_head;
1340 while (lp->next != nlp)
1341 lp = lp->next;
1342 lp->next = nlp->next;
1343 }
1344 t_flag = curthread->t_flag & T_WOULDBLOCK;
1345 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1346 rp->ns->ns_status = NFSERR_STALE;
1347 rp->thread->t_flag |= t_flag;
1348 }
1349 cv_broadcast(&nlp->cv);
1350 mutex_exit(&rfs_async_write_lock);
1351
1352 return;
1353 }
1354
1355 /*
1356 * Can only write regular files. Attempts to write any
1357 * other file types fail with EISDIR.
1358 */
1359 if (vp->v_type != VREG) {
1360 VN_RELE(vp);
1361 mutex_enter(&rfs_async_write_lock);
1362 if (rfs_async_write_head == nlp)
1363 rfs_async_write_head = nlp->next;
1364 else {
1365 lp = rfs_async_write_head;
1366 while (lp->next != nlp)
1367 lp = lp->next;
1368 lp->next = nlp->next;
1369 }
1370 t_flag = curthread->t_flag & T_WOULDBLOCK;
1371 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1372 rp->ns->ns_status = NFSERR_ISDIR;
1373 rp->thread->t_flag |= t_flag;
1374 }
1375 cv_broadcast(&nlp->cv);
1376 mutex_exit(&rfs_async_write_lock);
1377
1378 return;
1379 }
1380
1381 /*
1382 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1383 * deadlock with ufs.
1384 */
1385 if (nbl_need_check(vp)) {
1386 nbl_start_crit(vp, RW_READER);
1387 in_crit = 1;
1388 }
1389
1390 ct.cc_sysid = 0;
1391 ct.cc_pid = 0;
1392 ct.cc_caller_id = nfs2_srv_caller_id;
1393 ct.cc_flags = CC_DONTBLOCK;
1394
1395 /*
1396 * Lock the file for writing. This operation provides
1397 * the delay which allows clusters to grow.
1398 */
1399 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1400
1401 /* check if a monitor detected a delegation conflict */
1402 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1403 if (in_crit)
1404 nbl_end_crit(vp);
1405 VN_RELE(vp);
1406 /* mark as wouldblock so response is dropped */
1407 curthread->t_flag |= T_WOULDBLOCK;
1408 mutex_enter(&rfs_async_write_lock);
1409 if (rfs_async_write_head == nlp)
1410 rfs_async_write_head = nlp->next;
1411 else {
1412 lp = rfs_async_write_head;
1413 while (lp->next != nlp)
1414 lp = lp->next;
1415 lp->next = nlp->next;
1416 }
1417 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1418 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1419 rp->ns->ns_status = puterrno(error);
1420 rp->thread->t_flag |= T_WOULDBLOCK;
1421 }
1422 }
1423 cv_broadcast(&nlp->cv);
1424 mutex_exit(&rfs_async_write_lock);
1425
1426 return;
1427 }
1428
1429 /*
1430 * Disconnect this cluster from the list of clusters.
1431 * The cluster that is being dealt with must be fixed
1432 * in size after this point, so there is no reason
1433 * to leave it on the list so that new requests can
1434 * find it.
1435 *
1436 * The algorithm is that the first write request will
1437 * create a cluster, convert the file handle to a
1438 * vnode pointer, and then lock the file for writing.
1439 * This request is not likely to be clustered with
1440 * any others. However, the next request will create
1441 * a new cluster and be blocked in VOP_RWLOCK while
1442 * the first request is being processed. This delay
1443 * will allow more requests to be clustered in this
1444 * second cluster.
1445 */
1446 mutex_enter(&rfs_async_write_lock);
1447 if (rfs_async_write_head == nlp)
1448 rfs_async_write_head = nlp->next;
1449 else {
1450 lp = rfs_async_write_head;
1451 while (lp->next != nlp)
1452 lp = lp->next;
1453 lp->next = nlp->next;
1454 }
1455 mutex_exit(&rfs_async_write_lock);
1456
1457 /*
1458 * Step through the list of requests in this cluster.
1459 * We need to check permissions to make sure that all
1460 * of the requests have sufficient permission to write
1461 * the file. A cluster can be composed of requests
1462 * from different clients and different users on each
1463 * client.
1464 *
1465 * As a side effect, we also calculate the size of the
1466 * byte range that this cluster encompasses.
1467 */
1468 rp = nlp->list;
1469 off = rp->wa->wa_offset;
1470 len = (uint_t)0;
1471 do {
1472 if (rdonly(exi, rp->req)) {
1473 rp->ns->ns_status = NFSERR_ROFS;
1474 t_flag = curthread->t_flag & T_WOULDBLOCK;
1475 rp->thread->t_flag |= t_flag;
1476 continue;
1477 }
1478
1479 va.va_mask = AT_UID|AT_MODE;
1480
1481 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1482
1483 if (!error) {
1484 if (crgetuid(rp->cr) != va.va_uid) {
1485 /*
1486 * This is a kludge to allow writes of files
1487 * created with read only permission. The
1488 * owner of the file is always allowed to
1489 * write it.
1490 */
1491 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1492 }
1493 if (!error && MANDLOCK(vp, va.va_mode))
1494 error = EACCES;
1495 }
1496
1497 /*
1498 * Check for a conflict with a nbmand-locked region.
1499 */
1500 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1501 rp->wa->wa_count, 0, NULL)) {
1502 error = EACCES;
1503 }
1504
1505 if (error) {
1506 rp->ns->ns_status = puterrno(error);
1507 t_flag = curthread->t_flag & T_WOULDBLOCK;
1508 rp->thread->t_flag |= t_flag;
1509 continue;
1510 }
1511 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1512 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1513 } while ((rp = rp->list) != NULL);
1514
1515 /*
1516 * Step through the cluster attempting to gather as many
1517 * requests which are contiguous as possible. These
1518 * contiguous requests are handled via one call to VOP_WRITE
1519 * instead of different calls to VOP_WRITE. We also keep
1520 * track of the fact that any data was written.
1521 */
1522 rp = nlp->list;
1523 data_written = 0;
1524 do {
1525 /*
1526 * Skip any requests which are already marked as having an
1527 * error.
1528 */
1529 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1530 rp = rp->list;
1531 continue;
1532 }
1533
1534 /*
1535 * Count the number of iovec's which are required
1536 * to handle this set of requests. One iovec is
1537 * needed for each data buffer, whether addressed
1538 * by wa_data or by the b_rptr pointers in the
1539 * mblk chains.
1540 */
1541 iovcnt = 0;
1542 lrp = rp;
1543 for (;;) {
1544 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1545 iovcnt++;
1546 else {
1547 m = lrp->wa->wa_mblk;
1548 while (m != NULL) {
1549 iovcnt++;
1550 m = m->b_cont;
1551 }
1552 }
1553 if (lrp->list == NULL ||
1554 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1555 lrp->wa->wa_offset + lrp->wa->wa_count !=
1556 lrp->list->wa->wa_offset) {
1557 lrp = lrp->list;
1558 break;
1559 }
1560 lrp = lrp->list;
1561 }
1562
1563 if (iovcnt <= MAXCLIOVECS) {
1564 #ifdef DEBUG
1565 rfs_write_hits++;
1566 #endif
1567 niovp = iov;
1568 } else {
1569 #ifdef DEBUG
1570 rfs_write_misses++;
1571 #endif
1572 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1573 }
1574 /*
1575 * Put together the scatter/gather iovecs.
1576 */
1577 iovp = niovp;
1578 trp = rp;
1579 count = 0;
1580 do {
1581 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1582 if (trp->wa->wa_rlist) {
1583 iovp->iov_base =
1584 (char *)((trp->wa->wa_rlist)->
1585 u.c_daddr3);
1586 iovp->iov_len = trp->wa->wa_count;
1587 } else {
1588 iovp->iov_base = trp->wa->wa_data;
1589 iovp->iov_len = trp->wa->wa_count;
1590 }
1591 iovp++;
1592 } else {
1593 m = trp->wa->wa_mblk;
1594 rcount = trp->wa->wa_count;
1595 while (m != NULL) {
1596 iovp->iov_base = (caddr_t)m->b_rptr;
1597 iovp->iov_len = (m->b_wptr - m->b_rptr);
1598 rcount -= iovp->iov_len;
1599 if (rcount < 0)
1600 iovp->iov_len += rcount;
1601 iovp++;
1602 if (rcount <= 0)
1603 break;
1604 m = m->b_cont;
1605 }
1606 }
1607 count += trp->wa->wa_count;
1608 trp = trp->list;
1609 } while (trp != lrp);
1610
1611 uio.uio_iov = niovp;
1612 uio.uio_iovcnt = iovcnt;
1613 uio.uio_segflg = UIO_SYSSPACE;
1614 uio.uio_extflg = UIO_COPY_DEFAULT;
1615 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1616 uio.uio_resid = count;
1617 /*
1618 * The limit is checked on the client. We
1619 * should allow any size writes here.
1620 */
1621 uio.uio_llimit = curproc->p_fsz_ctl;
1622 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1623 if (rlimit < (rlim64_t)uio.uio_resid)
1624 uio.uio_resid = (uint_t)rlimit;
1625
1626 /*
1627 * For now we assume no append mode.
1628 */
1629
1630 /*
1631 * We're changing creds because VM may fault
1632 * and we need the cred of the current
1633 * thread to be used if quota * checking is
1634 * enabled.
1635 */
1636 savecred = curthread->t_cred;
1637 curthread->t_cred = cr;
1638 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1639 curthread->t_cred = savecred;
1640
1641 /* check if a monitor detected a delegation conflict */
1642 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1643 /* mark as wouldblock so response is dropped */
1644 curthread->t_flag |= T_WOULDBLOCK;
1645
1646 if (niovp != iov)
1647 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1648
1649 if (!error) {
1650 data_written = 1;
1651 /*
1652 * Get attributes again so we send the latest mod
1653 * time to the client side for his cache.
1654 */
1655 va.va_mask = AT_ALL; /* now we want everything */
1656
1657 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1658
1659 if (!error)
1660 acl_perm(vp, exi, &va, rp->cr);
1661 }
1662
1663 /*
1664 * Fill in the status responses for each request
1665 * which was just handled. Also, copy the latest
1666 * attributes in to the attribute responses if
1667 * appropriate.
1668 */
1669 t_flag = curthread->t_flag & T_WOULDBLOCK;
1670 do {
1671 rp->thread->t_flag |= t_flag;
1672 /* check for overflows */
1673 if (!error) {
1674 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1675 }
1676 rp->ns->ns_status = puterrno(error);
1677 rp = rp->list;
1678 } while (rp != lrp);
1679 } while (rp != NULL);
1680
1681 /*
1682 * If any data was written at all, then we need to flush
1683 * the data and metadata to stable storage.
1684 */
1685 if (data_written) {
1686 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1687
1688 if (!error) {
1689 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1690 }
1691 }
1692
1693 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1694
1695 if (in_crit)
1696 nbl_end_crit(vp);
1697 VN_RELE(vp);
1698
1699 t_flag = curthread->t_flag & T_WOULDBLOCK;
1700 mutex_enter(&rfs_async_write_lock);
1701 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1702 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1703 rp->ns->ns_status = puterrno(error);
1704 rp->thread->t_flag |= t_flag;
1705 }
1706 }
1707 cv_broadcast(&nlp->cv);
1708 mutex_exit(&rfs_async_write_lock);
1709
1710 }
1711
1712 void *
1713 rfs_write_getfh(struct nfswriteargs *wa)
1714 {
1715 return (&wa->wa_fhandle);
1716 }
1717
1718 /*
1719 * Create a file.
1720 * Creates a file with given attributes and returns those attributes
1721 * and an fhandle for the new file.
1722 */
1723 void
1724 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1725 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1726 {
1727 int error;
1728 int lookuperr;
1729 int in_crit = 0;
1730 struct vattr va;
1731 vnode_t *vp;
1732 vnode_t *realvp;
1733 vnode_t *dvp;
1734 char *name = args->ca_da.da_name;
1735 vnode_t *tvp = NULL;
1736 int mode;
1737 int lookup_ok;
1738 bool_t trunc;
1739 struct sockaddr *ca;
1740
1741 /*
1742 * Disallow NULL paths
1743 */
1744 if (name == NULL || *name == '\0') {
1745 dr->dr_status = NFSERR_ACCES;
1746 return;
1747 }
1748
1749 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1750 if (dvp == NULL) {
1751 dr->dr_status = NFSERR_STALE;
1752 return;
1753 }
1754
1755 error = sattr_to_vattr(args->ca_sa, &va);
1756 if (error) {
1757 dr->dr_status = puterrno(error);
1758 return;
1759 }
1760
1761 /*
1762 * Must specify the mode.
1763 */
1764 if (!(va.va_mask & AT_MODE)) {
1765 VN_RELE(dvp);
1766 dr->dr_status = NFSERR_INVAL;
1767 return;
1768 }
1769
1770 /*
1771 * This is a completely gross hack to make mknod
1772 * work over the wire until we can wack the protocol
1773 */
1774 if ((va.va_mode & IFMT) == IFCHR) {
1775 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1776 va.va_type = VFIFO; /* xtra kludge for named pipe */
1777 else {
1778 va.va_type = VCHR;
1779 /*
1780 * uncompress the received dev_t
1781 * if the top half is zero indicating a request
1782 * from an `older style' OS.
1783 */
1784 if ((va.va_size & 0xffff0000) == 0)
1785 va.va_rdev = nfsv2_expdev(va.va_size);
1786 else
1787 va.va_rdev = (dev_t)va.va_size;
1788 }
1789 va.va_mask &= ~AT_SIZE;
1790 } else if ((va.va_mode & IFMT) == IFBLK) {
1791 va.va_type = VBLK;
1792 /*
1793 * uncompress the received dev_t
1794 * if the top half is zero indicating a request
1795 * from an `older style' OS.
1796 */
1797 if ((va.va_size & 0xffff0000) == 0)
1798 va.va_rdev = nfsv2_expdev(va.va_size);
1799 else
1800 va.va_rdev = (dev_t)va.va_size;
1801 va.va_mask &= ~AT_SIZE;
1802 } else if ((va.va_mode & IFMT) == IFSOCK) {
1803 va.va_type = VSOCK;
1804 } else {
1805 va.va_type = VREG;
1806 }
1807 va.va_mode &= ~IFMT;
1808 va.va_mask |= AT_TYPE;
1809
1810 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1811 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1812 MAXPATHLEN);
1813 if (name == NULL) {
1814 dr->dr_status = puterrno(EINVAL);
1815 return;
1816 }
1817
1818 /*
1819 * Why was the choice made to use VWRITE as the mode to the
1820 * call to VOP_CREATE ? This results in a bug. When a client
1821 * opens a file that already exists and is RDONLY, the second
1822 * open fails with an EACESS because of the mode.
1823 * bug ID 1054648.
1824 */
1825 lookup_ok = 0;
1826 mode = VWRITE;
1827 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1828 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1829 NULL, NULL, NULL);
1830 if (!error) {
1831 struct vattr at;
1832
1833 lookup_ok = 1;
1834 at.va_mask = AT_MODE;
1835 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1836 if (!error)
1837 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1838 VN_RELE(tvp);
1839 tvp = NULL;
1840 }
1841 }
1842
1843 if (!lookup_ok) {
1844 if (rdonly(exi, req)) {
1845 error = EROFS;
1846 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1847 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1848 error = EPERM;
1849 } else {
1850 error = 0;
1851 }
1852 }
1853
1854 /*
1855 * If file size is being modified on an already existing file
1856 * make sure that there are no conflicting non-blocking mandatory
1857 * locks in the region being manipulated. Return EACCES if there
1858 * are conflicting locks.
1859 */
1860 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1861 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1862 NULL, NULL, NULL);
1863
1864 if (!lookuperr &&
1865 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1866 VN_RELE(tvp);
1867 curthread->t_flag |= T_WOULDBLOCK;
1868 goto out;
1869 }
1870
1871 if (!lookuperr && nbl_need_check(tvp)) {
1872 /*
1873 * The file exists. Now check if it has any
1874 * conflicting non-blocking mandatory locks
1875 * in the region being changed.
1876 */
1877 struct vattr bva;
1878 u_offset_t offset;
1879 ssize_t length;
1880
1881 nbl_start_crit(tvp, RW_READER);
1882 in_crit = 1;
1883
1884 bva.va_mask = AT_SIZE;
1885 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1886 if (!error) {
1887 if (va.va_size < bva.va_size) {
1888 offset = va.va_size;
1889 length = bva.va_size - va.va_size;
1890 } else {
1891 offset = bva.va_size;
1892 length = va.va_size - bva.va_size;
1893 }
1894 if (length) {
1895 if (nbl_conflict(tvp, NBL_WRITE,
1896 offset, length, 0, NULL)) {
1897 error = EACCES;
1898 }
1899 }
1900 }
1901 if (error) {
1902 nbl_end_crit(tvp);
1903 VN_RELE(tvp);
1904 in_crit = 0;
1905 }
1906 } else if (tvp != NULL) {
1907 VN_RELE(tvp);
1908 }
1909 }
1910
1911 if (!error) {
1912 /*
1913 * If filesystem is shared with nosuid the remove any
1914 * setuid/setgid bits on create.
1915 */
1916 if (va.va_type == VREG &&
1917 exi->exi_export.ex_flags & EX_NOSUID)
1918 va.va_mode &= ~(VSUID | VSGID);
1919
1920 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1921 NULL, NULL);
1922
1923 if (!error) {
1924
1925 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1926 trunc = TRUE;
1927 else
1928 trunc = FALSE;
1929
1930 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1931 VN_RELE(vp);
1932 curthread->t_flag |= T_WOULDBLOCK;
1933 goto out;
1934 }
1935 va.va_mask = AT_ALL;
1936
1937 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1938
1939 /* check for overflows */
1940 if (!error) {
1941 acl_perm(vp, exi, &va, cr);
1942 error = vattr_to_nattr(&va, &dr->dr_attr);
1943 if (!error) {
1944 error = makefh(&dr->dr_fhandle, vp,
1945 exi);
1946 }
1947 }
1948 /*
1949 * Force modified metadata out to stable storage.
1950 *
1951 * if a underlying vp exists, pass it to VOP_FSYNC
1952 */
1953 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1954 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1955 else
1956 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1957 VN_RELE(vp);
1958 }
1959
1960 if (in_crit) {
1961 nbl_end_crit(tvp);
1962 VN_RELE(tvp);
1963 }
1964 }
1965
1966 /*
1967 * Force modified data and metadata out to stable storage.
1968 */
1969 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1970
1971 out:
1972
1973 VN_RELE(dvp);
1974
1975 dr->dr_status = puterrno(error);
1976
1977 if (name != args->ca_da.da_name)
1978 kmem_free(name, MAXPATHLEN);
1979 }
1980 void *
1981 rfs_create_getfh(struct nfscreatargs *args)
1982 {
1983 return (args->ca_da.da_fhandle);
1984 }
1985
1986 /*
1987 * Remove a file.
1988 * Remove named file from parent directory.
1989 */
1990 void
1991 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1992 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1993 {
1994 int error = 0;
1995 vnode_t *vp;
1996 vnode_t *targvp;
1997 int in_crit = 0;
1998
1999 /*
2000 * Disallow NULL paths
2001 */
2002 if (da->da_name == NULL || *da->da_name == '\0') {
2003 *status = NFSERR_ACCES;
2004 return;
2005 }
2006
2007 vp = nfs_fhtovp(da->da_fhandle, exi);
2008 if (vp == NULL) {
2009 *status = NFSERR_STALE;
2010 return;
2011 }
2012
2013 if (rdonly(exi, req)) {
2014 VN_RELE(vp);
2015 *status = NFSERR_ROFS;
2016 return;
2017 }
2018
2019 /*
2020 * Check for a conflict with a non-blocking mandatory share reservation.
2021 */
2022 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2023 NULL, cr, NULL, NULL, NULL);
2024 if (error != 0) {
2025 VN_RELE(vp);
2026 *status = puterrno(error);
2027 return;
2028 }
2029
2030 /*
2031 * If the file is delegated to an v4 client, then initiate
2032 * recall and drop this request (by setting T_WOULDBLOCK).
2033 * The client will eventually re-transmit the request and
2034 * (hopefully), by then, the v4 client will have returned
2035 * the delegation.
2036 */
2037
2038 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2039 VN_RELE(vp);
2040 VN_RELE(targvp);
2041 curthread->t_flag |= T_WOULDBLOCK;
2042 return;
2043 }
2044
2045 if (nbl_need_check(targvp)) {
2046 nbl_start_crit(targvp, RW_READER);
2047 in_crit = 1;
2048 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2049 error = EACCES;
2050 goto out;
2051 }
2052 }
2053
2054 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2055
2056 /*
2057 * Force modified data and metadata out to stable storage.
2058 */
2059 (void) VOP_FSYNC(vp, 0, cr, NULL);
2060
2061 out:
2062 if (in_crit)
2063 nbl_end_crit(targvp);
2064 VN_RELE(targvp);
2065 VN_RELE(vp);
2066
2067 *status = puterrno(error);
2068
2069 }
2070
2071 void *
2072 rfs_remove_getfh(struct nfsdiropargs *da)
2073 {
2074 return (da->da_fhandle);
2075 }
2076
2077 /*
2078 * rename a file
2079 * Give a file (from) a new name (to).
2080 */
2081 void
2082 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2083 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2084 {
2085 int error = 0;
2086 vnode_t *fromvp;
2087 vnode_t *tovp;
2088 struct exportinfo *to_exi;
2089 fhandle_t *fh;
2090 vnode_t *srcvp;
2091 vnode_t *targvp;
2092 int in_crit = 0;
2093
2094 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2095 if (fromvp == NULL) {
2096 *status = NFSERR_STALE;
2097 return;
2098 }
2099
2100 fh = args->rna_to.da_fhandle;
2101 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2102 if (to_exi == NULL) {
2103 VN_RELE(fromvp);
2104 *status = NFSERR_ACCES;
2105 return;
2106 }
2107 exi_rele(to_exi);
2108
2109 if (to_exi != exi) {
2110 VN_RELE(fromvp);
2111 *status = NFSERR_XDEV;
2112 return;
2113 }
2114
2115 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2116 if (tovp == NULL) {
2117 VN_RELE(fromvp);
2118 *status = NFSERR_STALE;
2119 return;
2120 }
2121
2122 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2123 VN_RELE(tovp);
2124 VN_RELE(fromvp);
2125 *status = NFSERR_NOTDIR;
2126 return;
2127 }
2128
2129 /*
2130 * Disallow NULL paths
2131 */
2132 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2133 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2134 VN_RELE(tovp);
2135 VN_RELE(fromvp);
2136 *status = NFSERR_ACCES;
2137 return;
2138 }
2139
2140 if (rdonly(exi, req)) {
2141 VN_RELE(tovp);
2142 VN_RELE(fromvp);
2143 *status = NFSERR_ROFS;
2144 return;
2145 }
2146
2147 /*
2148 * Check for a conflict with a non-blocking mandatory share reservation.
2149 */
2150 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2151 NULL, cr, NULL, NULL, NULL);
2152 if (error != 0) {
2153 VN_RELE(tovp);
2154 VN_RELE(fromvp);
2155 *status = puterrno(error);
2156 return;
2157 }
2158
2159 /* Check for delegations on the source file */
2160
2161 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2162 VN_RELE(tovp);
2163 VN_RELE(fromvp);
2164 VN_RELE(srcvp);
2165 curthread->t_flag |= T_WOULDBLOCK;
2166 return;
2167 }
2168
2169 /* Check for delegation on the file being renamed over, if it exists */
2170
2171 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2172 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2173 NULL, NULL, NULL) == 0) {
2174
2175 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2176 VN_RELE(tovp);
2177 VN_RELE(fromvp);
2178 VN_RELE(srcvp);
2179 VN_RELE(targvp);
2180 curthread->t_flag |= T_WOULDBLOCK;
2181 return;
2182 }
2183 VN_RELE(targvp);
2184 }
2185
2186
2187 if (nbl_need_check(srcvp)) {
2188 nbl_start_crit(srcvp, RW_READER);
2189 in_crit = 1;
2190 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2191 error = EACCES;
2192 goto out;
2193 }
2194 }
2195
2196 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2197 tovp, args->rna_to.da_name, cr, NULL, 0);
2198
2199 if (error == 0)
2200 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2201 strlen(args->rna_to.da_name));
2202
2203 /*
2204 * Force modified data and metadata out to stable storage.
2205 */
2206 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2207 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2208
2209 out:
2210 if (in_crit)
2211 nbl_end_crit(srcvp);
2212 VN_RELE(srcvp);
2213 VN_RELE(tovp);
2214 VN_RELE(fromvp);
2215
2216 *status = puterrno(error);
2217
2218 }
2219 void *
2220 rfs_rename_getfh(struct nfsrnmargs *args)
2221 {
2222 return (args->rna_from.da_fhandle);
2223 }
2224
2225 /*
2226 * Link to a file.
2227 * Create a file (to) which is a hard link to the given file (from).
2228 */
2229 void
2230 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2231 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2232 {
2233 int error;
2234 vnode_t *fromvp;
2235 vnode_t *tovp;
2236 struct exportinfo *to_exi;
2237 fhandle_t *fh;
2238
2239 fromvp = nfs_fhtovp(args->la_from, exi);
2240 if (fromvp == NULL) {
2241 *status = NFSERR_STALE;
2242 return;
2243 }
2244
2245 fh = args->la_to.da_fhandle;
2246 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2247 if (to_exi == NULL) {
2248 VN_RELE(fromvp);
2249 *status = NFSERR_ACCES;
2250 return;
2251 }
2252 exi_rele(to_exi);
2253
2254 if (to_exi != exi) {
2255 VN_RELE(fromvp);
2256 *status = NFSERR_XDEV;
2257 return;
2258 }
2259
2260 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2261 if (tovp == NULL) {
2262 VN_RELE(fromvp);
2263 *status = NFSERR_STALE;
2264 return;
2265 }
2266
2267 if (tovp->v_type != VDIR) {
2268 VN_RELE(tovp);
2269 VN_RELE(fromvp);
2270 *status = NFSERR_NOTDIR;
2271 return;
2272 }
2273 /*
2274 * Disallow NULL paths
2275 */
2276 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2277 VN_RELE(tovp);
2278 VN_RELE(fromvp);
2279 *status = NFSERR_ACCES;
2280 return;
2281 }
2282
2283 if (rdonly(exi, req)) {
2284 VN_RELE(tovp);
2285 VN_RELE(fromvp);
2286 *status = NFSERR_ROFS;
2287 return;
2288 }
2289
2290 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2291
2292 /*
2293 * Force modified data and metadata out to stable storage.
2294 */
2295 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2296 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2297
2298 VN_RELE(tovp);
2299 VN_RELE(fromvp);
2300
2301 *status = puterrno(error);
2302
2303 }
2304 void *
2305 rfs_link_getfh(struct nfslinkargs *args)
2306 {
2307 return (args->la_from);
2308 }
2309
2310 /*
2311 * Symbolicly link to a file.
2312 * Create a file (to) with the given attributes which is a symbolic link
2313 * to the given path name (to).
2314 */
2315 void
2316 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2317 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2318 {
2319 int error;
2320 struct vattr va;
2321 vnode_t *vp;
2322 vnode_t *svp;
2323 int lerror;
2324 struct sockaddr *ca;
2325 char *name = NULL;
2326
2327 /*
2328 * Disallow NULL paths
2329 */
2330 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2331 *status = NFSERR_ACCES;
2332 return;
2333 }
2334
2335 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2336 if (vp == NULL) {
2337 *status = NFSERR_STALE;
2338 return;
2339 }
2340
2341 if (rdonly(exi, req)) {
2342 VN_RELE(vp);
2343 *status = NFSERR_ROFS;
2344 return;
2345 }
2346
2347 error = sattr_to_vattr(args->sla_sa, &va);
2348 if (error) {
2349 VN_RELE(vp);
2350 *status = puterrno(error);
2351 return;
2352 }
2353
2354 if (!(va.va_mask & AT_MODE)) {
2355 VN_RELE(vp);
2356 *status = NFSERR_INVAL;
2357 return;
2358 }
2359
2360 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2361 name = nfscmd_convname(ca, exi, args->sla_tnm,
2362 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2363
2364 if (name == NULL) {
2365 *status = NFSERR_ACCES;
2366 return;
2367 }
2368
2369 va.va_type = VLNK;
2370 va.va_mask |= AT_TYPE;
2371
2372 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2373
2374 /*
2375 * Force new data and metadata out to stable storage.
2376 */
2377 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2378 NULL, cr, NULL, NULL, NULL);
2379
2380 if (!lerror) {
2381 (void) VOP_FSYNC(svp, 0, cr, NULL);
2382 VN_RELE(svp);
2383 }
2384
2385 /*
2386 * Force modified data and metadata out to stable storage.
2387 */
2388 (void) VOP_FSYNC(vp, 0, cr, NULL);
2389
2390 VN_RELE(vp);
2391
2392 *status = puterrno(error);
2393 if (name != args->sla_tnm)
2394 kmem_free(name, MAXPATHLEN);
2395
2396 }
2397 void *
2398 rfs_symlink_getfh(struct nfsslargs *args)
2399 {
2400 return (args->sla_from.da_fhandle);
2401 }
2402
2403 /*
2404 * Make a directory.
2405 * Create a directory with the given name, parent directory, and attributes.
2406 * Returns a file handle and attributes for the new directory.
2407 */
2408 void
2409 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2410 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2411 {
2412 int error;
2413 struct vattr va;
2414 vnode_t *dvp = NULL;
2415 vnode_t *vp;
2416 char *name = args->ca_da.da_name;
2417
2418 /*
2419 * Disallow NULL paths
2420 */
2421 if (name == NULL || *name == '\0') {
2422 dr->dr_status = NFSERR_ACCES;
2423 return;
2424 }
2425
2426 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2427 if (vp == NULL) {
2428 dr->dr_status = NFSERR_STALE;
2429 return;
2430 }
2431
2432 if (rdonly(exi, req)) {
2433 VN_RELE(vp);
2434 dr->dr_status = NFSERR_ROFS;
2435 return;
2436 }
2437
2438 error = sattr_to_vattr(args->ca_sa, &va);
2439 if (error) {
2440 VN_RELE(vp);
2441 dr->dr_status = puterrno(error);
2442 return;
2443 }
2444
2445 if (!(va.va_mask & AT_MODE)) {
2446 VN_RELE(vp);
2447 dr->dr_status = NFSERR_INVAL;
2448 return;
2449 }
2450
2451 va.va_type = VDIR;
2452 va.va_mask |= AT_TYPE;
2453
2454 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2455
2456 if (!error) {
2457 /*
2458 * Attribtutes of the newly created directory should
2459 * be returned to the client.
2460 */
2461 va.va_mask = AT_ALL; /* We want everything */
2462 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2463
2464 /* check for overflows */
2465 if (!error) {
2466 acl_perm(vp, exi, &va, cr);
2467 error = vattr_to_nattr(&va, &dr->dr_attr);
2468 if (!error) {
2469 error = makefh(&dr->dr_fhandle, dvp, exi);
2470 }
2471 }
2472 /*
2473 * Force new data and metadata out to stable storage.
2474 */
2475 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2476 VN_RELE(dvp);
2477 }
2478
2479 /*
2480 * Force modified data and metadata out to stable storage.
2481 */
2482 (void) VOP_FSYNC(vp, 0, cr, NULL);
2483
2484 VN_RELE(vp);
2485
2486 dr->dr_status = puterrno(error);
2487
2488 }
2489 void *
2490 rfs_mkdir_getfh(struct nfscreatargs *args)
2491 {
2492 return (args->ca_da.da_fhandle);
2493 }
2494
2495 /*
2496 * Remove a directory.
2497 * Remove the given directory name from the given parent directory.
2498 */
2499 void
2500 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2501 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2502 {
2503 int error;
2504 vnode_t *vp;
2505
2506
2507 /*
2508 * Disallow NULL paths
2509 */
2510 if (da->da_name == NULL || *da->da_name == '\0') {
2511 *status = NFSERR_ACCES;
2512 return;
2513 }
2514
2515 vp = nfs_fhtovp(da->da_fhandle, exi);
2516 if (vp == NULL) {
2517 *status = NFSERR_STALE;
2518 return;
2519 }
2520
2521 if (rdonly(exi, req)) {
2522 VN_RELE(vp);
2523 *status = NFSERR_ROFS;
2524 return;
2525 }
2526
2527 /*
2528 * VOP_RMDIR now takes a new third argument (the current
2529 * directory of the process). That's because someone
2530 * wants to return EINVAL if one tries to remove ".".
2531 * Of course, NFS servers have no idea what their
2532 * clients' current directories are. We fake it by
2533 * supplying a vnode known to exist and illegal to
2534 * remove.
2535 */
2536 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2537
2538 /*
2539 * Force modified data and metadata out to stable storage.
2540 */
2541 (void) VOP_FSYNC(vp, 0, cr, NULL);
2542
2543 VN_RELE(vp);
2544
2545 /*
2546 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2547 * if the directory is not empty. A System V NFS server
2548 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2549 * over the wire.
2550 */
2551 if (error == EEXIST)
2552 *status = NFSERR_NOTEMPTY;
2553 else
2554 *status = puterrno(error);
2555
2556 }
2557 void *
2558 rfs_rmdir_getfh(struct nfsdiropargs *da)
2559 {
2560 return (da->da_fhandle);
2561 }
2562
2563 /* ARGSUSED */
2564 void
2565 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2566 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2567 {
2568 int error;
2569 int iseof;
2570 struct iovec iov;
2571 struct uio uio;
2572 vnode_t *vp;
2573 char *ndata = NULL;
2574 struct sockaddr *ca;
2575 size_t nents;
2576 int ret;
2577
2578 vp = nfs_fhtovp(&rda->rda_fh, exi);
2579 if (vp == NULL) {
2580 rd->rd_entries = NULL;
2581 rd->rd_status = NFSERR_STALE;
2582 return;
2583 }
2584
2585 if (vp->v_type != VDIR) {
2586 VN_RELE(vp);
2587 rd->rd_entries = NULL;
2588 rd->rd_status = NFSERR_NOTDIR;
2589 return;
2590 }
2591
2592 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2593
2594 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2595
2596 if (error) {
2597 rd->rd_entries = NULL;
2598 goto bad;
2599 }
2600
2601 if (rda->rda_count == 0) {
2602 rd->rd_entries = NULL;
2603 rd->rd_size = 0;
2604 rd->rd_eof = FALSE;
2605 goto bad;
2606 }
2607
2608 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2609
2610 /*
2611 * Allocate data for entries. This will be freed by rfs_rddirfree.
2612 */
2613 rd->rd_bufsize = (uint_t)rda->rda_count;
2614 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2615
2616 /*
2617 * Set up io vector to read directory data
2618 */
2619 iov.iov_base = (caddr_t)rd->rd_entries;
2620 iov.iov_len = rda->rda_count;
2621 uio.uio_iov = &iov;
2622 uio.uio_iovcnt = 1;
2623 uio.uio_segflg = UIO_SYSSPACE;
2624 uio.uio_extflg = UIO_COPY_CACHED;
2625 uio.uio_loffset = (offset_t)rda->rda_offset;
2626 uio.uio_resid = rda->rda_count;
2627
2628 /*
2629 * read directory
2630 */
2631 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2632
2633 /*
2634 * Clean up
2635 */
2636 if (!error) {
2637 /*
2638 * set size and eof
2639 */
2640 if (uio.uio_resid == rda->rda_count) {
2641 rd->rd_size = 0;
2642 rd->rd_eof = TRUE;
2643 } else {
2644 rd->rd_size = (uint32_t)(rda->rda_count -
2645 uio.uio_resid);
2646 rd->rd_eof = iseof ? TRUE : FALSE;
2647 }
2648 }
2649
2650 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2651 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2652 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2653 rda->rda_count, &ndata);
2654
2655 if (ret != 0) {
2656 size_t dropbytes;
2657 /*
2658 * We had to drop one or more entries in order to fit
2659 * during the character conversion. We need to patch
2660 * up the size and eof info.
2661 */
2662 if (rd->rd_eof)
2663 rd->rd_eof = FALSE;
2664 dropbytes = nfscmd_dropped_entrysize(
2665 (struct dirent64 *)rd->rd_entries, nents, ret);
2666 rd->rd_size -= dropbytes;
2667 }
2668 if (ndata == NULL) {
2669 ndata = (char *)rd->rd_entries;
2670 } else if (ndata != (char *)rd->rd_entries) {
2671 kmem_free(rd->rd_entries, rd->rd_bufsize);
2672 rd->rd_entries = (void *)ndata;
2673 rd->rd_bufsize = rda->rda_count;
2674 }
2675
2676 bad:
2677 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2678
2679 #if 0 /* notyet */
2680 /*
2681 * Don't do this. It causes local disk writes when just
2682 * reading the file and the overhead is deemed larger
2683 * than the benefit.
2684 */
2685 /*
2686 * Force modified metadata out to stable storage.
2687 */
2688 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2689 #endif
2690
2691 VN_RELE(vp);
2692
2693 rd->rd_status = puterrno(error);
2694
2695 }
/*
 * Return a pointer to the file handle within the READDIR arguments.
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
/*
 * Free the directory-entry buffer allocated for a READDIR reply by
 * rfs_readdir().
 */
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	/* rd_entries is NULL when the request failed before allocation. */
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}
2707
2708 /* ARGSUSED */
2709 void
2710 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2711 struct svc_req *req, cred_t *cr)
2712 {
2713 int error;
2714 struct statvfs64 sb;
2715 vnode_t *vp;
2716
2717 vp = nfs_fhtovp(fh, exi);
2718 if (vp == NULL) {
2719 fs->fs_status = NFSERR_STALE;
2720 return;
2721 }
2722
2723 error = VFS_STATVFS(vp->v_vfsp, &sb);
2724
2725 if (!error) {
2726 fs->fs_tsize = nfstsize();
2727 fs->fs_bsize = sb.f_frsize;
2728 fs->fs_blocks = sb.f_blocks;
2729 fs->fs_bfree = sb.f_bfree;
2730 fs->fs_bavail = sb.f_bavail;
2731 }
2732
2733 VN_RELE(vp);
2734
2735 fs->fs_status = puterrno(error);
2736
2737 }
/*
 * Return the file handle passed to the STATFS procedure.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2743
/*
 * Convert the settable attributes from an NFSv2 sattr into a vattr,
 * setting a va_mask bit only for each field the client actually
 * supplied; the protocol marks "don't change" fields with all-ones.
 * Returns 0, or EOVERFLOW when a 32-bit wire time cannot be
 * represented in time_t on this platform.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2810
/*
 * Map vnode types (indexed by vtype_t) to NFSv2 wire file types.
 * Entries of 0 are vnode types with no v2 representation.  The NFSOC
 * entry presumably corresponds to VSOCK — confirm against vtype_t
 * ordering in sys/vnode.h.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2814
2815 /*
2816 * check the following fields for overflow: nodeid, size, and time.
2817 * There could be a problem when converting 64-bit LP64 fields
2818 * into 32-bit ones. Return an error if there is an overflow.
2819 */
2820 int
2821 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2822 {
2823 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2824 na->na_type = vt_to_nf[vap->va_type];
2825
2826 if (vap->va_mode == (unsigned short) -1)
2827 na->na_mode = (uint32_t)-1;
2828 else
2829 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2830
2831 if (vap->va_uid == (unsigned short)(-1))
2832 na->na_uid = (uint32_t)(-1);
2833 else if (vap->va_uid == UID_NOBODY)
2834 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2835 else
2836 na->na_uid = vap->va_uid;
2837
2838 if (vap->va_gid == (unsigned short)(-1))
2839 na->na_gid = (uint32_t)-1;
2840 else if (vap->va_gid == GID_NOBODY)
2841 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2842 else
2843 na->na_gid = vap->va_gid;
2844
2845 /*
2846 * Do we need to check fsid for overflow? It is 64-bit in the
2847 * vattr, but are bigger than 32 bit values supported?
2848 */
2849 na->na_fsid = vap->va_fsid;
2850
2851 na->na_nodeid = vap->va_nodeid;
2852
2853 /*
2854 * Check to make sure that the nodeid is representable over the
2855 * wire without losing bits.
2856 */
2857 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2858 return (EFBIG);
2859 na->na_nlink = vap->va_nlink;
2860
2861 /*
2862 * Check for big files here, instead of at the caller. See
2863 * comments in cstat for large special file explanation.
2864 */
2865 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2866 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2867 return (EFBIG);
2868 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2869 /* UNKNOWN_SIZE | OVERFLOW */
2870 na->na_size = MAXOFF32_T;
2871 } else
2872 na->na_size = vap->va_size;
2873 } else
2874 na->na_size = vap->va_size;
2875
2876 /*
2877 * If the vnode times overflow the 32-bit times that NFS2
2878 * uses on the wire then return an error.
2879 */
2880 if (!NFS_VAP_TIME_OK(vap)) {
2881 return (EOVERFLOW);
2882 }
2883 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2884 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2885
2886 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2887 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2888
2889 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2890 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2891
2892 /*
2893 * If the dev_t will fit into 16 bits then compress
2894 * it, otherwise leave it alone. See comments in
2895 * nfs_client.c.
2896 */
2897 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2898 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2899 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2900 else
2901 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2902
2903 na->na_blocks = vap->va_nblocks;
2904 na->na_blocksize = vap->va_blksize;
2905
2906 /*
2907 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2908 * over-the-wire protocols for named-pipe vnodes. It remaps the
2909 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2910 *
2911 * BUYER BEWARE:
2912 * If you are porting the NFS to a non-Sun server, you probably
2913 * don't want to include the following block of code. The
2914 * over-the-wire special file types will be changing with the
2915 * NFS Protocol Revision.
2916 */
2917 if (vap->va_type == VFIFO)
2918 NA_SETFIFO(na);
2919 return (0);
2920 }
2921
2922 /*
2923 * acl v2 support: returns approximate permission.
2924 * default: returns minimal permission (more restrictive)
2925 * aclok: returns maximal permission (less restrictive)
2926 * This routine changes the permissions that are alaredy in *va.
2927 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2928 * CLASS_OBJ is always the same as GROUP_OBJ entry.
2929 */
2930 static void
2931 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2932 {
2933 vsecattr_t vsa;
2934 int aclcnt;
2935 aclent_t *aclentp;
2936 mode_t mask_perm;
2937 mode_t grp_perm;
2938 mode_t other_perm;
2939 mode_t other_orig;
2940 int error;
2941
2942 /* dont care default acl */
2943 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2944 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2945
2946 if (!error) {
2947 aclcnt = vsa.vsa_aclcnt;
2948 if (aclcnt > MIN_ACL_ENTRIES) {
2949 /* non-trivial ACL */
2950 aclentp = vsa.vsa_aclentp;
2951 if (exi->exi_export.ex_flags & EX_ACLOK) {
2952 /* maximal permissions */
2953 grp_perm = 0;
2954 other_perm = 0;
2955 for (; aclcnt > 0; aclcnt--, aclentp++) {
2956 switch (aclentp->a_type) {
2957 case USER_OBJ:
2958 break;
2959 case USER:
2960 grp_perm |=
2961 aclentp->a_perm << 3;
2962 other_perm |= aclentp->a_perm;
2963 break;
2964 case GROUP_OBJ:
2965 grp_perm |=
2966 aclentp->a_perm << 3;
2967 break;
2968 case GROUP:
2969 other_perm |= aclentp->a_perm;
2970 break;
2971 case OTHER_OBJ:
2972 other_orig = aclentp->a_perm;
2973 break;
2974 case CLASS_OBJ:
2975 mask_perm = aclentp->a_perm;
2976 break;
2977 default:
2978 break;
2979 }
2980 }
2981 grp_perm &= mask_perm << 3;
2982 other_perm &= mask_perm;
2983 other_perm |= other_orig;
2984
2985 } else {
2986 /* minimal permissions */
2987 grp_perm = 070;
2988 other_perm = 07;
2989 for (; aclcnt > 0; aclcnt--, aclentp++) {
2990 switch (aclentp->a_type) {
2991 case USER_OBJ:
2992 break;
2993 case USER:
2994 case CLASS_OBJ:
2995 grp_perm &=
2996 aclentp->a_perm << 3;
2997 other_perm &=
2998 aclentp->a_perm;
2999 break;
3000 case GROUP_OBJ:
3001 grp_perm &=
3002 aclentp->a_perm << 3;
3003 break;
3004 case GROUP:
3005 other_perm &=
3006 aclentp->a_perm;
3007 break;
3008 case OTHER_OBJ:
3009 other_perm &=
3010 aclentp->a_perm;
3011 break;
3012 default:
3013 break;
3014 }
3015 }
3016 }
3017 /* copy to va */
3018 va->va_mode &= ~077;
3019 va->va_mode |= grp_perm | other_perm;
3020 }
3021 if (vsa.vsa_aclcnt)
3022 kmem_free(vsa.vsa_aclentp,
3023 vsa.vsa_aclcnt * sizeof (aclent_t));
3024 }
3025 }
3026
/*
 * One-time NFSv2 server initialization: set up the async-write lock
 * and obtain a filesystem caller id for this server's VOP calls.
 */
void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	nfs2_srv_caller_id = fs_new_caller_id();
}
3033
/*
 * NFSv2 server teardown: destroy the lock created in rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
3039
3040 static int
3041 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3042 {
3043 struct clist *wcl;
3044 int wlist_len;
3045 uint32_t count = rr->rr_count;
3046
3047 wcl = ra->ra_wlist;
3048
3049 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3050 return (FALSE);
3051 }
3052
3053 wcl = ra->ra_wlist;
3054 rr->rr_ok.rrok_wlist_len = wlist_len;
3055 rr->rr_ok.rrok_wlist = wcl;
3056
3057 return (TRUE);
3058 }