1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  27  *      All rights reserved.
  28  */
  29 
  30 #include <sys/param.h>
  31 #include <sys/types.h>
  32 #include <sys/systm.h>
  33 #include <sys/cred.h>
  34 #include <sys/buf.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vnode.h>
  37 #include <sys/uio.h>
  38 #include <sys/stat.h>
  39 #include <sys/errno.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/statvfs.h>
  42 #include <sys/kmem.h>
  43 #include <sys/kstat.h>
  44 #include <sys/dirent.h>
  45 #include <sys/cmn_err.h>
  46 #include <sys/debug.h>
  47 #include <sys/vtrace.h>
  48 #include <sys/mode.h>
  49 #include <sys/acl.h>
  50 #include <sys/nbmlock.h>
  51 #include <sys/policy.h>
  52 #include <sys/sdt.h>
  53 
  54 #include <rpc/types.h>
  55 #include <rpc/auth.h>
  56 #include <rpc/svc.h>
  57 
  58 #include <nfs/nfs.h>
  59 #include <nfs/export.h>
  60 #include <nfs/nfs_cmd.h>
  61 
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_map.h>
  66 #include <vm/seg_kmem.h>
  67 
  68 #include <sys/strsubr.h>
  69 
  70 /*
  71  * These are the interface routines for the server side of the
  72  * Network File System.  See the NFS version 2 protocol specification
  73  * for a description of this interface.
  74  */
  75 
  76 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  77 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  78                         cred_t *);
  79 
  80 /*
  81  * Some "over the wire" UNIX file types.  These are encoded
  82  * into the mode.  This needs to be fixed in the next rev.
  83  */
  84 #define IFMT            0170000         /* type of file */
  85 #define IFCHR           0020000         /* character special */
  86 #define IFBLK           0060000         /* block special */
  87 #define IFSOCK          0140000         /* socket */
  88 
  89 u_longlong_t nfs2_srv_caller_id;
  90 
  91 /*
  92  * Get file attributes.
  93  * Returns the current attributes of the file with the given fhandle.
  94  */
  95 /* ARGSUSED */
  96 void
  97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
  98         struct svc_req *req, cred_t *cr)
  99 {
 100         int error;
 101         vnode_t *vp;
 102         struct vattr va;
 103 
 104         vp = nfs_fhtovp(fhp, exi);
 105         if (vp == NULL) {
 106                 ns->ns_status = NFSERR_STALE;
 107                 return;
 108         }
 109 
 110         /*
 111          * Do the getattr.
 112          */
 113         va.va_mask = AT_ALL;    /* we want all the attributes */
 114 
 115         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 116 
 117         /* check for overflows */
 118         if (!error) {
 119                 /* Lie about the object type for a referral */
 120                 if (vn_is_nfs_reparse(vp, cr))
 121                         va.va_type = VLNK;
 122 
 123                 acl_perm(vp, exi, &va, cr);
 124                 error = vattr_to_nattr(&va, &ns->ns_attr);
 125         }
 126 
 127         VN_RELE(vp);
 128 
 129         ns->ns_status = puterrno(error);
 130 }
 131 void *
 132 rfs_getattr_getfh(fhandle_t *fhp)
 133 {
 134         return (fhp);
 135 }
 136 
 137 /*
 138  * Set file attributes.
 139  * Sets the attributes of the file with the given fhandle.  Returns
 140  * the new attributes.
 141  */
 142 void
 143 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 144         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 145 {
 146         int error;
 147         int flag;
 148         int in_crit = 0;
 149         vnode_t *vp;
 150         struct vattr va;
 151         struct vattr bva;
 152         struct flock64 bf;
 153         caller_context_t ct;
 154 
 155 
 156         vp = nfs_fhtovp(&args->saa_fh, exi);
 157         if (vp == NULL) {
 158                 ns->ns_status = NFSERR_STALE;
 159                 return;
 160         }
 161 
 162         if (rdonly(exi, req) || vn_is_readonly(vp)) {
 163                 VN_RELE(vp);
 164                 ns->ns_status = NFSERR_ROFS;
 165                 return;
 166         }
 167 
 168         error = sattr_to_vattr(&args->saa_sa, &va);
 169         if (error) {
 170                 VN_RELE(vp);
 171                 ns->ns_status = puterrno(error);
 172                 return;
 173         }
 174 
 175         /*
 176          * If the client is requesting a change to the mtime,
 177          * but the nanosecond field is set to 1 billion, then
 178          * this is a flag to the server that it should set the
 179          * atime and mtime fields to the server's current time.
 180          * The 1 billion number actually came from the client
 181          * as 1 million, but the units in the over the wire
 182          * request are microseconds instead of nanoseconds.
 183          *
 184          * This is an overload of the protocol and should be
 185          * documented in the NFS Version 2 protocol specification.
 186          */
 187         if (va.va_mask & AT_MTIME) {
 188                 if (va.va_mtime.tv_nsec == 1000000000) {
 189                         gethrestime(&va.va_mtime);
 190                         va.va_atime = va.va_mtime;
 191                         va.va_mask |= AT_ATIME;
 192                         flag = 0;
 193                 } else
 194                         flag = ATTR_UTIME;
 195         } else
 196                 flag = 0;
 197 
 198         /*
 199          * If the filesystem is exported with nosuid, then mask off
 200          * the setuid and setgid bits.
 201          */
 202         if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 203             (exi->exi_export.ex_flags & EX_NOSUID))
 204                 va.va_mode &= ~(VSUID | VSGID);
 205 
 206         ct.cc_sysid = 0;
 207         ct.cc_pid = 0;
 208         ct.cc_caller_id = nfs2_srv_caller_id;
 209         ct.cc_flags = CC_DONTBLOCK;
 210 
 211         /*
 212          * We need to specially handle size changes because it is
 213          * possible for the client to create a file with modes
 214          * which indicate read-only, but with the file opened for
 215          * writing.  If the client then tries to set the size of
 216          * the file, then the normal access checking done in
 217          * VOP_SETATTR would prevent the client from doing so,
 218          * although it should be legal for it to do so.  To get
 219          * around this, we do the access checking for ourselves
 220          * and then use VOP_SPACE which doesn't do the access
 221          * checking which VOP_SETATTR does. VOP_SPACE can only
 222          * operate on VREG files, let VOP_SETATTR handle the other
 223          * extremely rare cases.
 224          * Also the client should not be allowed to change the
 225          * size of the file if there is a conflicting non-blocking
 226          * mandatory lock in the region of change.
 227          */
 228         if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 229                 if (nbl_need_check(vp)) {
 230                         nbl_start_crit(vp, RW_READER);
 231                         in_crit = 1;
 232                 }
 233 
 234                 bva.va_mask = AT_UID | AT_SIZE;
 235 
 236                 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 237 
 238                 if (error) {
 239                         if (in_crit)
 240                                 nbl_end_crit(vp);
 241                         VN_RELE(vp);
 242                         ns->ns_status = puterrno(error);
 243                         return;
 244                 }
 245 
 246                 if (in_crit) {
 247                         u_offset_t offset;
 248                         ssize_t length;
 249 
 250                         if (va.va_size < bva.va_size) {
 251                                 offset = va.va_size;
 252                                 length = bva.va_size - va.va_size;
 253                         } else {
 254                                 offset = bva.va_size;
 255                                 length = va.va_size - bva.va_size;
 256                         }
 257                         if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 258                             NULL)) {
 259                                 error = EACCES;
 260                         }
 261                 }
 262 
 263                 if (crgetuid(cr) == bva.va_uid && !error &&
 264                     va.va_size != bva.va_size) {
 265                         va.va_mask &= ~AT_SIZE;
 266                         bf.l_type = F_WRLCK;
 267                         bf.l_whence = 0;
 268                         bf.l_start = (off64_t)va.va_size;
 269                         bf.l_len = 0;
 270                         bf.l_sysid = 0;
 271                         bf.l_pid = 0;
 272 
 273                         error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 274                             (offset_t)va.va_size, cr, &ct);
 275                 }
 276                 if (in_crit)
 277                         nbl_end_crit(vp);
 278         } else
 279                 error = 0;
 280 
 281         /*
 282          * Do the setattr.
 283          */
 284         if (!error && va.va_mask) {
 285                 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 286         }
 287 
 288         /*
 289          * check if the monitor on either vop_space or vop_setattr detected
 290          * a delegation conflict and if so, mark the thread flag as
 291          * wouldblock so that the response is dropped and the client will
 292          * try again.
 293          */
 294         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 295                 VN_RELE(vp);
 296                 curthread->t_flag |= T_WOULDBLOCK;
 297                 return;
 298         }
 299 
 300         if (!error) {
 301                 va.va_mask = AT_ALL;    /* get everything */
 302 
 303                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 304 
 305                 /* check for overflows */
 306                 if (!error) {
 307                         acl_perm(vp, exi, &va, cr);
 308                         error = vattr_to_nattr(&va, &ns->ns_attr);
 309                 }
 310         }
 311 
 312         ct.cc_flags = 0;
 313 
 314         /*
 315          * Force modified metadata out to stable storage.
 316          */
 317         (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 318 
 319         VN_RELE(vp);
 320 
 321         ns->ns_status = puterrno(error);
 322 }
 323 void *
 324 rfs_setattr_getfh(struct nfssaargs *args)
 325 {
 326         return (&args->saa_fh);
 327 }
 328 
 329 /*
 330  * Directory lookup.
 331  * Returns an fhandle and file attributes for file name in a directory.
 332  */
 333 /* ARGSUSED */
 334 void
 335 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 336         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 337 {
 338         int error;
 339         vnode_t *dvp;
 340         vnode_t *vp;
 341         struct vattr va;
 342         fhandle_t *fhp = da->da_fhandle;
 343         struct sec_ol sec = {0, 0};
 344         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 345         char *name;
 346         struct sockaddr *ca;
 347 
 348         /*
 349          * Trusted Extension doesn't support NFSv2. MOUNT
 350          * will reject v2 clients. Need to prevent v2 client
 351          * access via WebNFS here.
 352          */
 353         if (is_system_labeled() && req->rq_vers == 2) {
 354                 dr->dr_status = NFSERR_ACCES;
 355                 return;
 356         }
 357 
 358         /*
 359          * Disallow NULL paths
 360          */
 361         if (da->da_name == NULL || *da->da_name == '\0') {
 362                 dr->dr_status = NFSERR_ACCES;
 363                 return;
 364         }
 365 
 366         /*
 367          * Allow lookups from the root - the default
 368          * location of the public filehandle.
 369          */
 370         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 371                 dvp = rootdir;
 372                 VN_HOLD(dvp);
 373         } else {
 374                 dvp = nfs_fhtovp(fhp, exi);
 375                 if (dvp == NULL) {
 376                         dr->dr_status = NFSERR_STALE;
 377                         return;
 378                 }
 379         }
 380 
 381         /*
 382          * Not allow lookup beyond root.
 383          * If the filehandle matches a filehandle of the exi,
 384          * then the ".." refers beyond the root of an exported filesystem.
 385          */
 386         if (strcmp(da->da_name, "..") == 0 &&
 387             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 388                 VN_RELE(dvp);
 389                 dr->dr_status = NFSERR_NOENT;
 390                 return;
 391         }
 392 
 393         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 394         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 395             MAXPATHLEN);
 396 
 397         if (name == NULL) {
 398                 dr->dr_status = NFSERR_ACCES;
 399                 return;
 400         }
 401 
 402         /*
 403          * If the public filehandle is used then allow
 404          * a multi-component lookup, i.e. evaluate
 405          * a pathname and follow symbolic links if
 406          * necessary.
 407          *
 408          * This may result in a vnode in another filesystem
 409          * which is OK as long as the filesystem is exported.
 410          */
 411         if (PUBLIC_FH2(fhp)) {
 412                 publicfh_flag = TRUE;
 413                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 414                     &sec);
 415         } else {
 416                 /*
 417                  * Do a normal single component lookup.
 418                  */
 419                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 420                     NULL, NULL, NULL);
 421         }
 422 
 423         if (name != da->da_name)
 424                 kmem_free(name, MAXPATHLEN);
 425 
 426 
 427         if (!error) {
 428                 va.va_mask = AT_ALL;    /* we want everything */
 429 
 430                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 431 
 432                 /* check for overflows */
 433                 if (!error) {
 434                         acl_perm(vp, exi, &va, cr);
 435                         error = vattr_to_nattr(&va, &dr->dr_attr);
 436                         if (!error) {
 437                                 if (sec.sec_flags & SEC_QUERY)
 438                                         error = makefh_ol(&dr->dr_fhandle, exi,
 439                                             sec.sec_index);
 440                                 else {
 441                                         error = makefh(&dr->dr_fhandle, vp,
 442                                             exi);
 443                                         if (!error && publicfh_flag &&
 444                                             !chk_clnt_sec(exi, req))
 445                                                 auth_weak = TRUE;
 446                                 }
 447                         }
 448                 }
 449                 VN_RELE(vp);
 450         }
 451 
 452         VN_RELE(dvp);
 453 
 454         /*
 455          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 456          * and have obtained a new exportinfo in exi which needs to be
 457          * released. Note the the original exportinfo pointed to by exi
 458          * will be released by the caller, comon_dispatch.
 459          */
 460         if (publicfh_flag && exi != NULL)
 461                 exi_rele(exi);
 462 
 463         /*
 464          * If it's public fh, no 0x81, and client's flavor is
 465          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 466          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 467          */
 468         if (auth_weak)
 469                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 470         else
 471                 dr->dr_status = puterrno(error);
 472 }
 473 void *
 474 rfs_lookup_getfh(struct nfsdiropargs *da)
 475 {
 476         return (da->da_fhandle);
 477 }
 478 
 479 /*
 480  * Read symbolic link.
 481  * Returns the string in the symbolic link at the given fhandle.
 482  */
 483 /* ARGSUSED */
 484 void
 485 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 486         struct svc_req *req, cred_t *cr)
 487 {
 488         int error;
 489         struct iovec iov;
 490         struct uio uio;
 491         vnode_t *vp;
 492         struct vattr va;
 493         struct sockaddr *ca;
 494         char *name = NULL;
 495         int is_referral = 0;
 496 
 497         vp = nfs_fhtovp(fhp, exi);
 498         if (vp == NULL) {
 499                 rl->rl_data = NULL;
 500                 rl->rl_status = NFSERR_STALE;
 501                 return;
 502         }
 503 
 504         va.va_mask = AT_MODE;
 505 
 506         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 507 
 508         if (error) {
 509                 VN_RELE(vp);
 510                 rl->rl_data = NULL;
 511                 rl->rl_status = puterrno(error);
 512                 return;
 513         }
 514 
 515         if (MANDLOCK(vp, va.va_mode)) {
 516                 VN_RELE(vp);
 517                 rl->rl_data = NULL;
 518                 rl->rl_status = NFSERR_ACCES;
 519                 return;
 520         }
 521 
 522         /* We lied about the object type for a referral */
 523         if (vn_is_nfs_reparse(vp, cr))
 524                 is_referral = 1;
 525 
 526         /*
 527          * XNFS and RFC1094 require us to return ENXIO if argument
 528          * is not a link. BUGID 1138002.
 529          */
 530         if (vp->v_type != VLNK && !is_referral) {
 531                 VN_RELE(vp);
 532                 rl->rl_data = NULL;
 533                 rl->rl_status = NFSERR_NXIO;
 534                 return;
 535         }
 536 
 537         /*
 538          * Allocate data for pathname.  This will be freed by rfs_rlfree.
 539          */
 540         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 541 
 542         if (is_referral) {
 543                 char *s;
 544                 size_t strsz;
 545 
 546                 /* Get an artificial symlink based on a referral */
 547                 s = build_symlink(vp, cr, &strsz);
 548                 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
 549                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 550                     vnode_t *, vp, char *, s);
 551                 if (s == NULL)
 552                         error = EINVAL;
 553                 else {
 554                         error = 0;
 555                         (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 556                         rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 557                         kmem_free(s, strsz);
 558                 }
 559 
 560         } else {
 561 
 562                 /*
 563                  * Set up io vector to read sym link data
 564                  */
 565                 iov.iov_base = rl->rl_data;
 566                 iov.iov_len = NFS_MAXPATHLEN;
 567                 uio.uio_iov = &iov;
 568                 uio.uio_iovcnt = 1;
 569                 uio.uio_segflg = UIO_SYSSPACE;
 570                 uio.uio_extflg = UIO_COPY_CACHED;
 571                 uio.uio_loffset = (offset_t)0;
 572                 uio.uio_resid = NFS_MAXPATHLEN;
 573 
 574                 /*
 575                  * Do the readlink.
 576                  */
 577                 error = VOP_READLINK(vp, &uio, cr, NULL);
 578 
 579                 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 580 
 581                 if (!error)
 582                         rl->rl_data[rl->rl_count] = '\0';
 583 
 584         }
 585 
 586 
 587         VN_RELE(vp);
 588 
 589         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 590         name = nfscmd_convname(ca, exi, rl->rl_data,
 591             NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 592 
 593         if (name != NULL && name != rl->rl_data) {
 594                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 595                 rl->rl_data = name;
 596         }
 597 
 598         /*
 599          * XNFS and RFC1094 require us to return ENXIO if argument
 600          * is not a link. UFS returns EINVAL if this is the case,
 601          * so we do the mapping here. BUGID 1138002.
 602          */
 603         if (error == EINVAL)
 604                 rl->rl_status = NFSERR_NXIO;
 605         else
 606                 rl->rl_status = puterrno(error);
 607 
 608 }
 609 void *
 610 rfs_readlink_getfh(fhandle_t *fhp)
 611 {
 612         return (fhp);
 613 }
 614 /*
 615  * Free data allocated by rfs_readlink
 616  */
 617 void
 618 rfs_rlfree(struct nfsrdlnres *rl)
 619 {
 620         if (rl->rl_data != NULL)
 621                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 622 }
 623 
 624 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 625 
 626 /*
 627  * Read data.
 628  * Returns some data read from the file at the given fhandle.
 629  */
 630 /* ARGSUSED */
 631 void
 632 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 633         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 634 {
 635         vnode_t *vp;
 636         int error;
 637         struct vattr va;
 638         struct iovec iov;
 639         struct uio uio;
 640         mblk_t *mp;
 641         int alloc_err = 0;
 642         int in_crit = 0;
 643         caller_context_t ct;
 644 
 645         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 646         if (vp == NULL) {
 647                 rr->rr_data = NULL;
 648                 rr->rr_status = NFSERR_STALE;
 649                 return;
 650         }
 651 
 652         if (vp->v_type != VREG) {
 653                 VN_RELE(vp);
 654                 rr->rr_data = NULL;
 655                 rr->rr_status = NFSERR_ISDIR;
 656                 return;
 657         }
 658 
 659         ct.cc_sysid = 0;
 660         ct.cc_pid = 0;
 661         ct.cc_caller_id = nfs2_srv_caller_id;
 662         ct.cc_flags = CC_DONTBLOCK;
 663 
 664         /*
 665          * Enter the critical region before calling VOP_RWLOCK
 666          * to avoid a deadlock with write requests.
 667          */
 668         if (nbl_need_check(vp)) {
 669                 nbl_start_crit(vp, RW_READER);
 670                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 671                     0, NULL)) {
 672                         nbl_end_crit(vp);
 673                         VN_RELE(vp);
 674                         rr->rr_data = NULL;
 675                         rr->rr_status = NFSERR_ACCES;
 676                         return;
 677                 }
 678                 in_crit = 1;
 679         }
 680 
 681         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 682 
 683         /* check if a monitor detected a delegation conflict */
 684         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 685                 VN_RELE(vp);
 686                 /* mark as wouldblock so response is dropped */
 687                 curthread->t_flag |= T_WOULDBLOCK;
 688 
 689                 rr->rr_data = NULL;
 690                 return;
 691         }
 692 
 693         va.va_mask = AT_ALL;
 694 
 695         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 696 
 697         if (error) {
 698                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 699                 if (in_crit)
 700                         nbl_end_crit(vp);
 701 
 702                 VN_RELE(vp);
 703                 rr->rr_data = NULL;
 704                 rr->rr_status = puterrno(error);
 705 
 706                 return;
 707         }
 708 
 709         /*
 710          * This is a kludge to allow reading of files created
 711          * with no read permission.  The owner of the file
 712          * is always allowed to read it.
 713          */
 714         if (crgetuid(cr) != va.va_uid) {
 715                 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 716 
 717                 if (error) {
 718                         /*
 719                          * Exec is the same as read over the net because
 720                          * of demand loading.
 721                          */
 722                         error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 723                 }
 724                 if (error) {
 725                         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 726                         if (in_crit)
 727                                 nbl_end_crit(vp);
 728                         VN_RELE(vp);
 729                         rr->rr_data = NULL;
 730                         rr->rr_status = puterrno(error);
 731 
 732                         return;
 733                 }
 734         }
 735 
 736         if (MANDLOCK(vp, va.va_mode)) {
 737                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 738                 if (in_crit)
 739                         nbl_end_crit(vp);
 740 
 741                 VN_RELE(vp);
 742                 rr->rr_data = NULL;
 743                 rr->rr_status = NFSERR_ACCES;
 744 
 745                 return;
 746         }
 747 
 748         rr->rr_ok.rrok_wlist_len = 0;
 749         rr->rr_ok.rrok_wlist = NULL;
 750 
 751         if ((u_offset_t)ra->ra_offset >= va.va_size) {
 752                 rr->rr_count = 0;
 753                 rr->rr_data = NULL;
 754                 /*
 755                  * In this case, status is NFS_OK, but there is no data
 756                  * to encode. So set rr_mp to NULL.
 757                  */
 758                 rr->rr_mp = NULL;
 759                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 760                 if (rr->rr_ok.rrok_wlist)
 761                         clist_zero_len(rr->rr_ok.rrok_wlist);
 762                 goto done;
 763         }
 764 
 765         if (ra->ra_wlist) {
 766                 mp = NULL;
 767                 rr->rr_mp = NULL;
 768                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 769                 if (ra->ra_count > iov.iov_len) {
 770                         rr->rr_data = NULL;
 771                         rr->rr_status = NFSERR_INVAL;
 772                         goto done;
 773                 }
 774         } else {
 775                 /*
 776                  * mp will contain the data to be sent out in the read reply.
 777                  * This will be freed after the reply has been sent out (by the
 778                  * driver).
 779                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 780                  * that the call to xdrmblk_putmblk() never fails.
 781                  */
 782                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 783                     &alloc_err);
 784                 ASSERT(mp != NULL);
 785                 ASSERT(alloc_err == 0);
 786 
 787                 rr->rr_mp = mp;
 788 
 789                 /*
 790                  * Set up io vector
 791                  */
 792                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 793                 iov.iov_len = ra->ra_count;
 794         }
 795 
 796         uio.uio_iov = &iov;
 797         uio.uio_iovcnt = 1;
 798         uio.uio_segflg = UIO_SYSSPACE;
 799         uio.uio_extflg = UIO_COPY_CACHED;
 800         uio.uio_loffset = (offset_t)ra->ra_offset;
 801         uio.uio_resid = ra->ra_count;
 802 
 803         error = VOP_READ(vp, &uio, 0, cr, &ct);
 804 
 805         if (error) {
 806                 if (mp)
 807                         freeb(mp);
 808 
 809                 /*
 810                  * check if a monitor detected a delegation conflict and
 811                  * mark as wouldblock so response is dropped
 812                  */
 813                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 814                         curthread->t_flag |= T_WOULDBLOCK;
 815                 else
 816                         rr->rr_status = puterrno(error);
 817 
 818                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 819                 if (in_crit)
 820                         nbl_end_crit(vp);
 821 
 822                 VN_RELE(vp);
 823                 rr->rr_data = NULL;
 824 
 825                 return;
 826         }
 827 
 828         /*
 829          * Get attributes again so we can send the latest access
 830          * time to the client side for his cache.
 831          */
 832         va.va_mask = AT_ALL;
 833 
 834         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 835 
 836         if (error) {
 837                 if (mp)
 838                         freeb(mp);
 839 
 840                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 841                 if (in_crit)
 842                         nbl_end_crit(vp);
 843 
 844                 VN_RELE(vp);
 845                 rr->rr_data = NULL;
 846                 rr->rr_status = puterrno(error);
 847 
 848                 return;
 849         }
 850 
 851         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 852 
 853         if (mp) {
 854                 rr->rr_data = (char *)mp->b_datap->db_base;
 855         } else {
 856                 if (ra->ra_wlist) {
 857                         rr->rr_data = (caddr_t)iov.iov_base;
 858                         if (!rdma_setup_read_data2(ra, rr)) {
 859                                 rr->rr_data = NULL;
 860                                 rr->rr_status = puterrno(NFSERR_INVAL);
 861                         }
 862                 }
 863         }
 864 done:
 865         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 866         if (in_crit)
 867                 nbl_end_crit(vp);
 868 
 869         acl_perm(vp, exi, &va, cr);
 870 
 871         /* check for overflows */
 872         error = vattr_to_nattr(&va, &rr->rr_attr);
 873 
 874         VN_RELE(vp);
 875 
 876         rr->rr_status = puterrno(error);
 877 }
 878 
 879 /*
 880  * Free data allocated by rfs_read
 881  */
 882 void
 883 rfs_rdfree(struct nfsrdresult *rr)
 884 {
 885         mblk_t *mp;
 886 
 887         if (rr->rr_status == NFS_OK) {
 888                 mp = rr->rr_mp;
 889                 if (mp != NULL)
 890                         freeb(mp);
 891         }
 892 }
 893 
 894 void *
 895 rfs_read_getfh(struct nfsreadargs *ra)
 896 {
 897         return (&ra->ra_fhandle);
 898 }
 899 
 900 #define MAX_IOVECS      12
 901 
 902 #ifdef DEBUG
 903 static int rfs_write_sync_hits = 0;
 904 static int rfs_write_sync_misses = 0;
 905 #endif
 906 
 907 /*
 908  * Write data to file.
 909  * Returns attributes of a file after writing some data to it.
 910  *
 911  * Any changes made here, especially in error handling might have
 912  * to also be done in rfs_write (which clusters write requests).
 913  */
 914 void
 915 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
 916         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
 917 {
 918         int error;
 919         vnode_t *vp;
 920         rlim64_t rlimit;
 921         struct vattr va;
 922         struct uio uio;
 923         struct iovec iov[MAX_IOVECS];
 924         mblk_t *m;
 925         struct iovec *iovp;
 926         int iovcnt;
 927         cred_t *savecred;
 928         int in_crit = 0;
 929         caller_context_t ct;
 930 
 931         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 932         if (vp == NULL) {
 933                 ns->ns_status = NFSERR_STALE;
 934                 return;
 935         }
 936 
 937         if (rdonly(exi, req)) {
 938                 VN_RELE(vp);
 939                 ns->ns_status = NFSERR_ROFS;
 940                 return;
 941         }
 942 
 943         if (vp->v_type != VREG) {
 944                 VN_RELE(vp);
 945                 ns->ns_status = NFSERR_ISDIR;
 946                 return;
 947         }
 948 
 949         ct.cc_sysid = 0;
 950         ct.cc_pid = 0;
 951         ct.cc_caller_id = nfs2_srv_caller_id;
 952         ct.cc_flags = CC_DONTBLOCK;
 953 
 954         va.va_mask = AT_UID|AT_MODE;
 955 
 956         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 957 
 958         if (error) {
 959                 VN_RELE(vp);
 960                 ns->ns_status = puterrno(error);
 961 
 962                 return;
 963         }
 964 
 965         if (crgetuid(cr) != va.va_uid) {
 966                 /*
 967                  * This is a kludge to allow writes of files created
 968                  * with read only permission.  The owner of the file
 969                  * is always allowed to write it.
 970                  */
 971                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
 972 
 973                 if (error) {
 974                         VN_RELE(vp);
 975                         ns->ns_status = puterrno(error);
 976                         return;
 977                 }
 978         }
 979 
 980         /*
 981          * Can't access a mandatory lock file.  This might cause
 982          * the NFS service thread to block forever waiting for a
 983          * lock to be released that will never be released.
 984          */
 985         if (MANDLOCK(vp, va.va_mode)) {
 986                 VN_RELE(vp);
 987                 ns->ns_status = NFSERR_ACCES;
 988                 return;
 989         }
 990 
 991         /*
 992          * We have to enter the critical region before calling VOP_RWLOCK
 993          * to avoid a deadlock with ufs.
 994          */
 995         if (nbl_need_check(vp)) {
 996                 nbl_start_crit(vp, RW_READER);
 997                 in_crit = 1;
 998                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
 999                     wa->wa_count, 0, NULL)) {
1000                         error = EACCES;
1001                         goto out;
1002                 }
1003         }
1004 
1005         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1006 
1007         /* check if a monitor detected a delegation conflict */
1008         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1009                 VN_RELE(vp);
1010                 /* mark as wouldblock so response is dropped */
1011                 curthread->t_flag |= T_WOULDBLOCK;
1012                 return;
1013         }
1014 
1015         if (wa->wa_data || wa->wa_rlist) {
1016                 /* Do the RDMA thing if necessary */
1017                 if (wa->wa_rlist) {
1018                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1019                         iov[0].iov_len = wa->wa_count;
1020                 } else  {
1021                         iov[0].iov_base = wa->wa_data;
1022                         iov[0].iov_len = wa->wa_count;
1023                 }
1024                 uio.uio_iov = iov;
1025                 uio.uio_iovcnt = 1;
1026                 uio.uio_segflg = UIO_SYSSPACE;
1027                 uio.uio_extflg = UIO_COPY_DEFAULT;
1028                 uio.uio_loffset = (offset_t)wa->wa_offset;
1029                 uio.uio_resid = wa->wa_count;
1030                 /*
1031                  * The limit is checked on the client. We
1032                  * should allow any size writes here.
1033                  */
1034                 uio.uio_llimit = curproc->p_fsz_ctl;
1035                 rlimit = uio.uio_llimit - wa->wa_offset;
1036                 if (rlimit < (rlim64_t)uio.uio_resid)
1037                         uio.uio_resid = (uint_t)rlimit;
1038 
1039                 /*
1040                  * for now we assume no append mode
1041                  */
1042                 /*
1043                  * We're changing creds because VM may fault and we need
1044                  * the cred of the current thread to be used if quota
1045                  * checking is enabled.
1046                  */
1047                 savecred = curthread->t_cred;
1048                 curthread->t_cred = cr;
1049                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1050                 curthread->t_cred = savecred;
1051         } else {
1052                 iovcnt = 0;
1053                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1054                         iovcnt++;
1055                 if (iovcnt <= MAX_IOVECS) {
1056 #ifdef DEBUG
1057                         rfs_write_sync_hits++;
1058 #endif
1059                         iovp = iov;
1060                 } else {
1061 #ifdef DEBUG
1062                         rfs_write_sync_misses++;
1063 #endif
1064                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1065                 }
1066                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1067                 uio.uio_iov = iovp;
1068                 uio.uio_iovcnt = iovcnt;
1069                 uio.uio_segflg = UIO_SYSSPACE;
1070                 uio.uio_extflg = UIO_COPY_DEFAULT;
1071                 uio.uio_loffset = (offset_t)wa->wa_offset;
1072                 uio.uio_resid = wa->wa_count;
1073                 /*
1074                  * The limit is checked on the client. We
1075                  * should allow any size writes here.
1076                  */
1077                 uio.uio_llimit = curproc->p_fsz_ctl;
1078                 rlimit = uio.uio_llimit - wa->wa_offset;
1079                 if (rlimit < (rlim64_t)uio.uio_resid)
1080                         uio.uio_resid = (uint_t)rlimit;
1081 
1082                 /*
1083                  * For now we assume no append mode.
1084                  */
1085                 /*
1086                  * We're changing creds because VM may fault and we need
1087                  * the cred of the current thread to be used if quota
1088                  * checking is enabled.
1089                  */
1090                 savecred = curthread->t_cred;
1091                 curthread->t_cred = cr;
1092                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1093                 curthread->t_cred = savecred;
1094 
1095                 if (iovp != iov)
1096                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1097         }
1098 
1099         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1100 
1101         if (!error) {
1102                 /*
1103                  * Get attributes again so we send the latest mod
1104                  * time to the client side for his cache.
1105                  */
1106                 va.va_mask = AT_ALL;    /* now we want everything */
1107 
1108                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1109 
1110                 /* check for overflows */
1111                 if (!error) {
1112                         acl_perm(vp, exi, &va, cr);
1113                         error = vattr_to_nattr(&va, &ns->ns_attr);
1114                 }
1115         }
1116 
1117 out:
1118         if (in_crit)
1119                 nbl_end_crit(vp);
1120         VN_RELE(vp);
1121 
1122         /* check if a monitor detected a delegation conflict */
1123         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1124                 /* mark as wouldblock so response is dropped */
1125                 curthread->t_flag |= T_WOULDBLOCK;
1126         else
1127                 ns->ns_status = puterrno(error);
1128 
1129 }
1130 
1131 struct rfs_async_write {
1132         struct nfswriteargs *wa;
1133         struct nfsattrstat *ns;
1134         struct svc_req *req;
1135         cred_t *cr;
1136         kthread_t *thread;
1137         struct rfs_async_write *list;
1138 };
1139 
1140 struct rfs_async_write_list {
1141         fhandle_t *fhp;
1142         kcondvar_t cv;
1143         struct rfs_async_write *list;
1144         struct rfs_async_write_list *next;
1145 };
1146 
1147 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1148 static kmutex_t rfs_async_write_lock;
1149 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1150 
1151 #define MAXCLIOVECS     42
1152 #define RFSWRITE_INITVAL (enum nfsstat) -1
1153 
1154 #ifdef DEBUG
1155 static int rfs_write_hits = 0;
1156 static int rfs_write_misses = 0;
1157 #endif
1158 
1159 /*
1160  * Write data to file.
1161  * Returns attributes of a file after writing some data to it.
1162  */
1163 void
1164 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1165         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1166 {
1167         int error;
1168         vnode_t *vp;
1169         rlim64_t rlimit;
1170         struct vattr va;
1171         struct uio uio;
1172         struct rfs_async_write_list *lp;
1173         struct rfs_async_write_list *nlp;
1174         struct rfs_async_write *rp;
1175         struct rfs_async_write *nrp;
1176         struct rfs_async_write *trp;
1177         struct rfs_async_write *lrp;
1178         int data_written;
1179         int iovcnt;
1180         mblk_t *m;
1181         struct iovec *iovp;
1182         struct iovec *niovp;
1183         struct iovec iov[MAXCLIOVECS];
1184         int count;
1185         int rcount;
1186         uint_t off;
1187         uint_t len;
1188         struct rfs_async_write nrpsp;
1189         struct rfs_async_write_list nlpsp;
1190         ushort_t t_flag;
1191         cred_t *savecred;
1192         int in_crit = 0;
1193         caller_context_t ct;
1194 
1195         if (!rfs_write_async) {
1196                 rfs_write_sync(wa, ns, exi, req, cr);
1197                 return;
1198         }
1199 
1200         /*
1201          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1202          * is considered an OK.
1203          */
1204         ns->ns_status = RFSWRITE_INITVAL;
1205 
1206         nrp = &nrpsp;
1207         nrp->wa = wa;
1208         nrp->ns = ns;
1209         nrp->req = req;
1210         nrp->cr = cr;
1211         nrp->thread = curthread;
1212 
1213         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1214 
1215         /*
1216          * Look to see if there is already a cluster started
1217          * for this file.
1218          */
1219         mutex_enter(&rfs_async_write_lock);
1220         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1221                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1222                     sizeof (fhandle_t)) == 0)
1223                         break;
1224         }
1225 
1226         /*
1227          * If lp is non-NULL, then there is already a cluster
1228          * started.  We need to place ourselves in the cluster
1229          * list in the right place as determined by starting
1230          * offset.  Conflicts with non-blocking mandatory locked
1231          * regions will be checked when the cluster is processed.
1232          */
1233         if (lp != NULL) {
1234                 rp = lp->list;
1235                 trp = NULL;
1236                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1237                         trp = rp;
1238                         rp = rp->list;
1239                 }
1240                 nrp->list = rp;
1241                 if (trp == NULL)
1242                         lp->list = nrp;
1243                 else
1244                         trp->list = nrp;
1245                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1246                         cv_wait(&lp->cv, &rfs_async_write_lock);
1247                 mutex_exit(&rfs_async_write_lock);
1248 
1249                 return;
1250         }
1251 
1252         /*
1253          * No cluster started yet, start one and add ourselves
1254          * to the list of clusters.
1255          */
1256         nrp->list = NULL;
1257 
1258         nlp = &nlpsp;
1259         nlp->fhp = &wa->wa_fhandle;
1260         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1261         nlp->list = nrp;
1262         nlp->next = NULL;
1263 
1264         if (rfs_async_write_head == NULL) {
1265                 rfs_async_write_head = nlp;
1266         } else {
1267                 lp = rfs_async_write_head;
1268                 while (lp->next != NULL)
1269                         lp = lp->next;
1270                 lp->next = nlp;
1271         }
1272         mutex_exit(&rfs_async_write_lock);
1273 
1274         /*
1275          * Convert the file handle common to all of the requests
1276          * in this cluster to a vnode.
1277          */
1278         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1279         if (vp == NULL) {
1280                 mutex_enter(&rfs_async_write_lock);
1281                 if (rfs_async_write_head == nlp)
1282                         rfs_async_write_head = nlp->next;
1283                 else {
1284                         lp = rfs_async_write_head;
1285                         while (lp->next != nlp)
1286                                 lp = lp->next;
1287                         lp->next = nlp->next;
1288                 }
1289                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1290                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1291                         rp->ns->ns_status = NFSERR_STALE;
1292                         rp->thread->t_flag |= t_flag;
1293                 }
1294                 cv_broadcast(&nlp->cv);
1295                 mutex_exit(&rfs_async_write_lock);
1296 
1297                 return;
1298         }
1299 
1300         /*
1301          * Can only write regular files.  Attempts to write any
1302          * other file types fail with EISDIR.
1303          */
1304         if (vp->v_type != VREG) {
1305                 VN_RELE(vp);
1306                 mutex_enter(&rfs_async_write_lock);
1307                 if (rfs_async_write_head == nlp)
1308                         rfs_async_write_head = nlp->next;
1309                 else {
1310                         lp = rfs_async_write_head;
1311                         while (lp->next != nlp)
1312                                 lp = lp->next;
1313                         lp->next = nlp->next;
1314                 }
1315                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1316                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1317                         rp->ns->ns_status = NFSERR_ISDIR;
1318                         rp->thread->t_flag |= t_flag;
1319                 }
1320                 cv_broadcast(&nlp->cv);
1321                 mutex_exit(&rfs_async_write_lock);
1322 
1323                 return;
1324         }
1325 
1326         /*
1327          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1328          * deadlock with ufs.
1329          */
1330         if (nbl_need_check(vp)) {
1331                 nbl_start_crit(vp, RW_READER);
1332                 in_crit = 1;
1333         }
1334 
1335         ct.cc_sysid = 0;
1336         ct.cc_pid = 0;
1337         ct.cc_caller_id = nfs2_srv_caller_id;
1338         ct.cc_flags = CC_DONTBLOCK;
1339 
1340         /*
1341          * Lock the file for writing.  This operation provides
1342          * the delay which allows clusters to grow.
1343          */
1344         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1345 
1346         /* check if a monitor detected a delegation conflict */
1347         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1348                 if (in_crit)
1349                         nbl_end_crit(vp);
1350                 VN_RELE(vp);
1351                 /* mark as wouldblock so response is dropped */
1352                 curthread->t_flag |= T_WOULDBLOCK;
1353                 mutex_enter(&rfs_async_write_lock);
1354                 if (rfs_async_write_head == nlp)
1355                         rfs_async_write_head = nlp->next;
1356                 else {
1357                         lp = rfs_async_write_head;
1358                         while (lp->next != nlp)
1359                                 lp = lp->next;
1360                         lp->next = nlp->next;
1361                 }
1362                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1363                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1364                                 rp->ns->ns_status = puterrno(error);
1365                                 rp->thread->t_flag |= T_WOULDBLOCK;
1366                         }
1367                 }
1368                 cv_broadcast(&nlp->cv);
1369                 mutex_exit(&rfs_async_write_lock);
1370 
1371                 return;
1372         }
1373 
1374         /*
1375          * Disconnect this cluster from the list of clusters.
1376          * The cluster that is being dealt with must be fixed
1377          * in size after this point, so there is no reason
1378          * to leave it on the list so that new requests can
1379          * find it.
1380          *
1381          * The algorithm is that the first write request will
1382          * create a cluster, convert the file handle to a
1383          * vnode pointer, and then lock the file for writing.
1384          * This request is not likely to be clustered with
1385          * any others.  However, the next request will create
1386          * a new cluster and be blocked in VOP_RWLOCK while
1387          * the first request is being processed.  This delay
1388          * will allow more requests to be clustered in this
1389          * second cluster.
1390          */
1391         mutex_enter(&rfs_async_write_lock);
1392         if (rfs_async_write_head == nlp)
1393                 rfs_async_write_head = nlp->next;
1394         else {
1395                 lp = rfs_async_write_head;
1396                 while (lp->next != nlp)
1397                         lp = lp->next;
1398                 lp->next = nlp->next;
1399         }
1400         mutex_exit(&rfs_async_write_lock);
1401 
1402         /*
1403          * Step through the list of requests in this cluster.
1404          * We need to check permissions to make sure that all
1405          * of the requests have sufficient permission to write
1406          * the file.  A cluster can be composed of requests
1407          * from different clients and different users on each
1408          * client.
1409          *
1410          * As a side effect, we also calculate the size of the
1411          * byte range that this cluster encompasses.
1412          */
1413         rp = nlp->list;
1414         off = rp->wa->wa_offset;
1415         len = (uint_t)0;
1416         do {
1417                 if (rdonly(exi, rp->req)) {
1418                         rp->ns->ns_status = NFSERR_ROFS;
1419                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1420                         rp->thread->t_flag |= t_flag;
1421                         continue;
1422                 }
1423 
1424                 va.va_mask = AT_UID|AT_MODE;
1425 
1426                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1427 
1428                 if (!error) {
1429                         if (crgetuid(rp->cr) != va.va_uid) {
1430                                 /*
1431                                  * This is a kludge to allow writes of files
1432                                  * created with read only permission.  The
1433                                  * owner of the file is always allowed to
1434                                  * write it.
1435                                  */
1436                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1437                         }
1438                         if (!error && MANDLOCK(vp, va.va_mode))
1439                                 error = EACCES;
1440                 }
1441 
1442                 /*
1443                  * Check for a conflict with a nbmand-locked region.
1444                  */
1445                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1446                     rp->wa->wa_count, 0, NULL)) {
1447                         error = EACCES;
1448                 }
1449 
1450                 if (error) {
1451                         rp->ns->ns_status = puterrno(error);
1452                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1453                         rp->thread->t_flag |= t_flag;
1454                         continue;
1455                 }
1456                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1457                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1458         } while ((rp = rp->list) != NULL);
1459 
1460         /*
1461          * Step through the cluster attempting to gather as many
1462          * requests which are contiguous as possible.  These
1463          * contiguous requests are handled via one call to VOP_WRITE
1464          * instead of different calls to VOP_WRITE.  We also keep
1465          * track of the fact that any data was written.
1466          */
1467         rp = nlp->list;
1468         data_written = 0;
1469         do {
1470                 /*
1471                  * Skip any requests which are already marked as having an
1472                  * error.
1473                  */
1474                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1475                         rp = rp->list;
1476                         continue;
1477                 }
1478 
1479                 /*
1480                  * Count the number of iovec's which are required
1481                  * to handle this set of requests.  One iovec is
1482                  * needed for each data buffer, whether addressed
1483                  * by wa_data or by the b_rptr pointers in the
1484                  * mblk chains.
1485                  */
1486                 iovcnt = 0;
1487                 lrp = rp;
1488                 for (;;) {
1489                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1490                                 iovcnt++;
1491                         else {
1492                                 m = lrp->wa->wa_mblk;
1493                                 while (m != NULL) {
1494                                         iovcnt++;
1495                                         m = m->b_cont;
1496                                 }
1497                         }
1498                         if (lrp->list == NULL ||
1499                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1500                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1501                             lrp->list->wa->wa_offset) {
1502                                 lrp = lrp->list;
1503                                 break;
1504                         }
1505                         lrp = lrp->list;
1506                 }
1507 
1508                 if (iovcnt <= MAXCLIOVECS) {
1509 #ifdef DEBUG
1510                         rfs_write_hits++;
1511 #endif
1512                         niovp = iov;
1513                 } else {
1514 #ifdef DEBUG
1515                         rfs_write_misses++;
1516 #endif
1517                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1518                 }
1519                 /*
1520                  * Put together the scatter/gather iovecs.
1521                  */
1522                 iovp = niovp;
1523                 trp = rp;
1524                 count = 0;
1525                 do {
1526                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1527                                 if (trp->wa->wa_rlist) {
1528                                         iovp->iov_base =
1529                                             (char *)((trp->wa->wa_rlist)->
1530                                             u.c_daddr3);
1531                                         iovp->iov_len = trp->wa->wa_count;
1532                                 } else  {
1533                                         iovp->iov_base = trp->wa->wa_data;
1534                                         iovp->iov_len = trp->wa->wa_count;
1535                                 }
1536                                 iovp++;
1537                         } else {
1538                                 m = trp->wa->wa_mblk;
1539                                 rcount = trp->wa->wa_count;
1540                                 while (m != NULL) {
1541                                         iovp->iov_base = (caddr_t)m->b_rptr;
1542                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1543                                         rcount -= iovp->iov_len;
1544                                         if (rcount < 0)
1545                                                 iovp->iov_len += rcount;
1546                                         iovp++;
1547                                         if (rcount <= 0)
1548                                                 break;
1549                                         m = m->b_cont;
1550                                 }
1551                         }
1552                         count += trp->wa->wa_count;
1553                         trp = trp->list;
1554                 } while (trp != lrp);
1555 
1556                 uio.uio_iov = niovp;
1557                 uio.uio_iovcnt = iovcnt;
1558                 uio.uio_segflg = UIO_SYSSPACE;
1559                 uio.uio_extflg = UIO_COPY_DEFAULT;
1560                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1561                 uio.uio_resid = count;
1562                 /*
1563                  * The limit is checked on the client. We
1564                  * should allow any size writes here.
1565                  */
1566                 uio.uio_llimit = curproc->p_fsz_ctl;
1567                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1568                 if (rlimit < (rlim64_t)uio.uio_resid)
1569                         uio.uio_resid = (uint_t)rlimit;
1570 
1571                 /*
1572                  * For now we assume no append mode.
1573                  */
1574 
1575                 /*
1576                  * We're changing creds because VM may fault
1577                  * and we need the cred of the current
1578                  * thread to be used if quota * checking is
1579                  * enabled.
1580                  */
1581                 savecred = curthread->t_cred;
1582                 curthread->t_cred = cr;
1583                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1584                 curthread->t_cred = savecred;
1585 
1586                 /* check if a monitor detected a delegation conflict */
1587                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1588                         /* mark as wouldblock so response is dropped */
1589                         curthread->t_flag |= T_WOULDBLOCK;
1590 
1591                 if (niovp != iov)
1592                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1593 
1594                 if (!error) {
1595                         data_written = 1;
1596                         /*
1597                          * Get attributes again so we send the latest mod
1598                          * time to the client side for his cache.
1599                          */
1600                         va.va_mask = AT_ALL;    /* now we want everything */
1601 
1602                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1603 
1604                         if (!error)
1605                                 acl_perm(vp, exi, &va, rp->cr);
1606                 }
1607 
1608                 /*
1609                  * Fill in the status responses for each request
1610                  * which was just handled.  Also, copy the latest
1611                  * attributes in to the attribute responses if
1612                  * appropriate.
1613                  */
1614                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1615                 do {
1616                         rp->thread->t_flag |= t_flag;
1617                         /* check for overflows */
1618                         if (!error) {
1619                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1620                         }
1621                         rp->ns->ns_status = puterrno(error);
1622                         rp = rp->list;
1623                 } while (rp != lrp);
1624         } while (rp != NULL);
1625 
1626         /*
1627          * If any data was written at all, then we need to flush
1628          * the data and metadata to stable storage.
1629          */
1630         if (data_written) {
1631                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1632 
1633                 if (!error) {
1634                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1635                 }
1636         }
1637 
1638         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1639 
1640         if (in_crit)
1641                 nbl_end_crit(vp);
1642         VN_RELE(vp);
1643 
1644         t_flag = curthread->t_flag & T_WOULDBLOCK;
1645         mutex_enter(&rfs_async_write_lock);
1646         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1647                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1648                         rp->ns->ns_status = puterrno(error);
1649                         rp->thread->t_flag |= t_flag;
1650                 }
1651         }
1652         cv_broadcast(&nlp->cv);
1653         mutex_exit(&rfs_async_write_lock);
1654 
1655 }
1656 
1657 void *
1658 rfs_write_getfh(struct nfswriteargs *wa)
1659 {
1660         return (&wa->wa_fhandle);
1661 }
1662 
1663 /*
1664  * Create a file.
1665  * Creates a file with given attributes and returns those attributes
1666  * and an fhandle for the new file.
1667  */
1668 void
1669 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1670         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1671 {
1672         int error;
1673         int lookuperr;
1674         int in_crit = 0;
1675         struct vattr va;
1676         vnode_t *vp;
1677         vnode_t *realvp;
1678         vnode_t *dvp;
1679         char *name = args->ca_da.da_name;
1680         vnode_t *tvp = NULL;
1681         int mode;
1682         int lookup_ok;
1683         bool_t trunc;
1684         struct sockaddr *ca;
1685 
1686         /*
1687          * Disallow NULL paths
1688          */
1689         if (name == NULL || *name == '\0') {
1690                 dr->dr_status = NFSERR_ACCES;
1691                 return;
1692         }
1693 
1694         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1695         if (dvp == NULL) {
1696                 dr->dr_status = NFSERR_STALE;
1697                 return;
1698         }
1699 
1700         error = sattr_to_vattr(args->ca_sa, &va);
1701         if (error) {
1702                 dr->dr_status = puterrno(error);
1703                 return;
1704         }
1705 
1706         /*
1707          * Must specify the mode.
1708          */
1709         if (!(va.va_mask & AT_MODE)) {
1710                 VN_RELE(dvp);
1711                 dr->dr_status = NFSERR_INVAL;
1712                 return;
1713         }
1714 
1715         /*
1716          * This is a completely gross hack to make mknod
1717          * work over the wire until we can wack the protocol
1718          */
1719         if ((va.va_mode & IFMT) == IFCHR) {
1720                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1721                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1722                 else {
1723                         va.va_type = VCHR;
1724                         /*
1725                          * uncompress the received dev_t
1726                          * if the top half is zero indicating a request
1727                          * from an `older style' OS.
1728                          */
1729                         if ((va.va_size & 0xffff0000) == 0)
1730                                 va.va_rdev = nfsv2_expdev(va.va_size);
1731                         else
1732                                 va.va_rdev = (dev_t)va.va_size;
1733                 }
1734                 va.va_mask &= ~AT_SIZE;
1735         } else if ((va.va_mode & IFMT) == IFBLK) {
1736                 va.va_type = VBLK;
1737                 /*
1738                  * uncompress the received dev_t
1739                  * if the top half is zero indicating a request
1740                  * from an `older style' OS.
1741                  */
1742                 if ((va.va_size & 0xffff0000) == 0)
1743                         va.va_rdev = nfsv2_expdev(va.va_size);
1744                 else
1745                         va.va_rdev = (dev_t)va.va_size;
1746                 va.va_mask &= ~AT_SIZE;
1747         } else if ((va.va_mode & IFMT) == IFSOCK) {
1748                 va.va_type = VSOCK;
1749         } else {
1750                 va.va_type = VREG;
1751         }
1752         va.va_mode &= ~IFMT;
1753         va.va_mask |= AT_TYPE;
1754 
1755         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1756         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1757             MAXPATHLEN);
1758         if (name == NULL) {
1759                 dr->dr_status = puterrno(EINVAL);
1760                 return;
1761         }
1762 
1763         /*
1764          * Why was the choice made to use VWRITE as the mode to the
1765          * call to VOP_CREATE ? This results in a bug.  When a client
1766          * opens a file that already exists and is RDONLY, the second
1767          * open fails with an EACESS because of the mode.
1768          * bug ID 1054648.
1769          */
1770         lookup_ok = 0;
1771         mode = VWRITE;
1772         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1773                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1774                     NULL, NULL, NULL);
1775                 if (!error) {
1776                         struct vattr at;
1777 
1778                         lookup_ok = 1;
1779                         at.va_mask = AT_MODE;
1780                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1781                         if (!error)
1782                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1783                         VN_RELE(tvp);
1784                         tvp = NULL;
1785                 }
1786         }
1787 
1788         if (!lookup_ok) {
1789                 if (rdonly(exi, req)) {
1790                         error = EROFS;
1791                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1792                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1793                         error = EPERM;
1794                 } else {
1795                         error = 0;
1796                 }
1797         }
1798 
1799         /*
1800          * If file size is being modified on an already existing file
1801          * make sure that there are no conflicting non-blocking mandatory
1802          * locks in the region being manipulated. Return EACCES if there
1803          * are conflicting locks.
1804          */
1805         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1806                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1807                     NULL, NULL, NULL);
1808 
1809                 if (!lookuperr &&
1810                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1811                         VN_RELE(tvp);
1812                         curthread->t_flag |= T_WOULDBLOCK;
1813                         goto out;
1814                 }
1815 
1816                 if (!lookuperr && nbl_need_check(tvp)) {
1817                         /*
1818                          * The file exists. Now check if it has any
1819                          * conflicting non-blocking mandatory locks
1820                          * in the region being changed.
1821                          */
1822                         struct vattr bva;
1823                         u_offset_t offset;
1824                         ssize_t length;
1825 
1826                         nbl_start_crit(tvp, RW_READER);
1827                         in_crit = 1;
1828 
1829                         bva.va_mask = AT_SIZE;
1830                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1831                         if (!error) {
1832                                 if (va.va_size < bva.va_size) {
1833                                         offset = va.va_size;
1834                                         length = bva.va_size - va.va_size;
1835                                 } else {
1836                                         offset = bva.va_size;
1837                                         length = va.va_size - bva.va_size;
1838                                 }
1839                                 if (length) {
1840                                         if (nbl_conflict(tvp, NBL_WRITE,
1841                                             offset, length, 0, NULL)) {
1842                                                 error = EACCES;
1843                                         }
1844                                 }
1845                         }
1846                         if (error) {
1847                                 nbl_end_crit(tvp);
1848                                 VN_RELE(tvp);
1849                                 in_crit = 0;
1850                         }
1851                 } else if (tvp != NULL) {
1852                         VN_RELE(tvp);
1853                 }
1854         }
1855 
1856         if (!error) {
1857                 /*
1858                  * If filesystem is shared with nosuid the remove any
1859                  * setuid/setgid bits on create.
1860                  */
1861                 if (va.va_type == VREG &&
1862                     exi->exi_export.ex_flags & EX_NOSUID)
1863                         va.va_mode &= ~(VSUID | VSGID);
1864 
1865                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1866                     NULL, NULL);
1867 
1868                 if (!error) {
1869 
1870                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1871                                 trunc = TRUE;
1872                         else
1873                                 trunc = FALSE;
1874 
1875                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1876                                 VN_RELE(vp);
1877                                 curthread->t_flag |= T_WOULDBLOCK;
1878                                 goto out;
1879                         }
1880                         va.va_mask = AT_ALL;
1881 
1882                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1883 
1884                         /* check for overflows */
1885                         if (!error) {
1886                                 acl_perm(vp, exi, &va, cr);
1887                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1888                                 if (!error) {
1889                                         error = makefh(&dr->dr_fhandle, vp,
1890                                             exi);
1891                                 }
1892                         }
1893                         /*
1894                          * Force modified metadata out to stable storage.
1895                          *
1896                          * if a underlying vp exists, pass it to VOP_FSYNC
1897                          */
1898                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
1899                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1900                         else
1901                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1902                         VN_RELE(vp);
1903                 }
1904 
1905                 if (in_crit) {
1906                         nbl_end_crit(tvp);
1907                         VN_RELE(tvp);
1908                 }
1909         }
1910 
1911         /*
1912          * Force modified data and metadata out to stable storage.
1913          */
1914         (void) VOP_FSYNC(dvp, 0, cr, NULL);
1915 
1916 out:
1917 
1918         VN_RELE(dvp);
1919 
1920         dr->dr_status = puterrno(error);
1921 
1922         if (name != args->ca_da.da_name)
1923                 kmem_free(name, MAXPATHLEN);
1924 }
1925 void *
1926 rfs_create_getfh(struct nfscreatargs *args)
1927 {
1928         return (args->ca_da.da_fhandle);
1929 }
1930 
1931 /*
1932  * Remove a file.
1933  * Remove named file from parent directory.
1934  */
1935 void
1936 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1937         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1938 {
1939         int error = 0;
1940         vnode_t *vp;
1941         vnode_t *targvp;
1942         int in_crit = 0;
1943 
1944         /*
1945          * Disallow NULL paths
1946          */
1947         if (da->da_name == NULL || *da->da_name == '\0') {
1948                 *status = NFSERR_ACCES;
1949                 return;
1950         }
1951 
1952         vp = nfs_fhtovp(da->da_fhandle, exi);
1953         if (vp == NULL) {
1954                 *status = NFSERR_STALE;
1955                 return;
1956         }
1957 
1958         if (rdonly(exi, req)) {
1959                 VN_RELE(vp);
1960                 *status = NFSERR_ROFS;
1961                 return;
1962         }
1963 
1964         /*
1965          * Check for a conflict with a non-blocking mandatory share reservation.
1966          */
1967         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1968             NULL, cr, NULL, NULL, NULL);
1969         if (error != 0) {
1970                 VN_RELE(vp);
1971                 *status = puterrno(error);
1972                 return;
1973         }
1974 
1975         /*
1976          * If the file is delegated to an v4 client, then initiate
1977          * recall and drop this request (by setting T_WOULDBLOCK).
1978          * The client will eventually re-transmit the request and
1979          * (hopefully), by then, the v4 client will have returned
1980          * the delegation.
1981          */
1982 
1983         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1984                 VN_RELE(vp);
1985                 VN_RELE(targvp);
1986                 curthread->t_flag |= T_WOULDBLOCK;
1987                 return;
1988         }
1989 
1990         if (nbl_need_check(targvp)) {
1991                 nbl_start_crit(targvp, RW_READER);
1992                 in_crit = 1;
1993                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1994                         error = EACCES;
1995                         goto out;
1996                 }
1997         }
1998 
1999         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2000 
2001         /*
2002          * Force modified data and metadata out to stable storage.
2003          */
2004         (void) VOP_FSYNC(vp, 0, cr, NULL);
2005 
2006 out:
2007         if (in_crit)
2008                 nbl_end_crit(targvp);
2009         VN_RELE(targvp);
2010         VN_RELE(vp);
2011 
2012         *status = puterrno(error);
2013 
2014 }
2015 
2016 void *
2017 rfs_remove_getfh(struct nfsdiropargs *da)
2018 {
2019         return (da->da_fhandle);
2020 }
2021 
2022 /*
2023  * rename a file
2024  * Give a file (from) a new name (to).
2025  */
2026 void
2027 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2028         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2029 {
2030         int error = 0;
2031         vnode_t *fromvp;
2032         vnode_t *tovp;
2033         struct exportinfo *to_exi;
2034         fhandle_t *fh;
2035         vnode_t *srcvp;
2036         vnode_t *targvp;
2037         int in_crit = 0;
2038 
2039         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2040         if (fromvp == NULL) {
2041                 *status = NFSERR_STALE;
2042                 return;
2043         }
2044 
2045         fh = args->rna_to.da_fhandle;
2046         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2047         if (to_exi == NULL) {
2048                 VN_RELE(fromvp);
2049                 *status = NFSERR_ACCES;
2050                 return;
2051         }
2052         exi_rele(to_exi);
2053 
2054         if (to_exi != exi) {
2055                 VN_RELE(fromvp);
2056                 *status = NFSERR_XDEV;
2057                 return;
2058         }
2059 
2060         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2061         if (tovp == NULL) {
2062                 VN_RELE(fromvp);
2063                 *status = NFSERR_STALE;
2064                 return;
2065         }
2066 
2067         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2068                 VN_RELE(tovp);
2069                 VN_RELE(fromvp);
2070                 *status = NFSERR_NOTDIR;
2071                 return;
2072         }
2073 
2074         /*
2075          * Disallow NULL paths
2076          */
2077         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2078             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2079                 VN_RELE(tovp);
2080                 VN_RELE(fromvp);
2081                 *status = NFSERR_ACCES;
2082                 return;
2083         }
2084 
2085         if (rdonly(exi, req)) {
2086                 VN_RELE(tovp);
2087                 VN_RELE(fromvp);
2088                 *status = NFSERR_ROFS;
2089                 return;
2090         }
2091 
2092         /*
2093          * Check for a conflict with a non-blocking mandatory share reservation.
2094          */
2095         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2096             NULL, cr, NULL, NULL, NULL);
2097         if (error != 0) {
2098                 VN_RELE(tovp);
2099                 VN_RELE(fromvp);
2100                 *status = puterrno(error);
2101                 return;
2102         }
2103 
2104         /* Check for delegations on the source file */
2105 
2106         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2107                 VN_RELE(tovp);
2108                 VN_RELE(fromvp);
2109                 VN_RELE(srcvp);
2110                 curthread->t_flag |= T_WOULDBLOCK;
2111                 return;
2112         }
2113 
2114         /* Check for delegation on the file being renamed over, if it exists */
2115 
2116         if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2117             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2118             NULL, NULL, NULL) == 0) {
2119 
2120                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2121                         VN_RELE(tovp);
2122                         VN_RELE(fromvp);
2123                         VN_RELE(srcvp);
2124                         VN_RELE(targvp);
2125                         curthread->t_flag |= T_WOULDBLOCK;
2126                         return;
2127                 }
2128                 VN_RELE(targvp);
2129         }
2130 
2131 
2132         if (nbl_need_check(srcvp)) {
2133                 nbl_start_crit(srcvp, RW_READER);
2134                 in_crit = 1;
2135                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2136                         error = EACCES;
2137                         goto out;
2138                 }
2139         }
2140 
2141         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2142             tovp, args->rna_to.da_name, cr, NULL, 0);
2143 
2144         if (error == 0)
2145                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2146                     strlen(args->rna_to.da_name));
2147 
2148         /*
2149          * Force modified data and metadata out to stable storage.
2150          */
2151         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2152         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2153 
2154 out:
2155         if (in_crit)
2156                 nbl_end_crit(srcvp);
2157         VN_RELE(srcvp);
2158         VN_RELE(tovp);
2159         VN_RELE(fromvp);
2160 
2161         *status = puterrno(error);
2162 
2163 }
2164 void *
2165 rfs_rename_getfh(struct nfsrnmargs *args)
2166 {
2167         return (args->rna_from.da_fhandle);
2168 }
2169 
2170 /*
2171  * Link to a file.
2172  * Create a file (to) which is a hard link to the given file (from).
2173  */
2174 void
2175 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2176         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2177 {
2178         int error;
2179         vnode_t *fromvp;
2180         vnode_t *tovp;
2181         struct exportinfo *to_exi;
2182         fhandle_t *fh;
2183 
2184         fromvp = nfs_fhtovp(args->la_from, exi);
2185         if (fromvp == NULL) {
2186                 *status = NFSERR_STALE;
2187                 return;
2188         }
2189 
2190         fh = args->la_to.da_fhandle;
2191         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2192         if (to_exi == NULL) {
2193                 VN_RELE(fromvp);
2194                 *status = NFSERR_ACCES;
2195                 return;
2196         }
2197         exi_rele(to_exi);
2198 
2199         if (to_exi != exi) {
2200                 VN_RELE(fromvp);
2201                 *status = NFSERR_XDEV;
2202                 return;
2203         }
2204 
2205         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2206         if (tovp == NULL) {
2207                 VN_RELE(fromvp);
2208                 *status = NFSERR_STALE;
2209                 return;
2210         }
2211 
2212         if (tovp->v_type != VDIR) {
2213                 VN_RELE(tovp);
2214                 VN_RELE(fromvp);
2215                 *status = NFSERR_NOTDIR;
2216                 return;
2217         }
2218         /*
2219          * Disallow NULL paths
2220          */
2221         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2222                 VN_RELE(tovp);
2223                 VN_RELE(fromvp);
2224                 *status = NFSERR_ACCES;
2225                 return;
2226         }
2227 
2228         if (rdonly(exi, req)) {
2229                 VN_RELE(tovp);
2230                 VN_RELE(fromvp);
2231                 *status = NFSERR_ROFS;
2232                 return;
2233         }
2234 
2235         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2236 
2237         /*
2238          * Force modified data and metadata out to stable storage.
2239          */
2240         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2241         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2242 
2243         VN_RELE(tovp);
2244         VN_RELE(fromvp);
2245 
2246         *status = puterrno(error);
2247 
2248 }
2249 void *
2250 rfs_link_getfh(struct nfslinkargs *args)
2251 {
2252         return (args->la_from);
2253 }
2254 
2255 /*
2256  * Symbolicly link to a file.
2257  * Create a file (to) with the given attributes which is a symbolic link
2258  * to the given path name (to).
2259  */
2260 void
2261 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2262         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2263 {
2264         int error;
2265         struct vattr va;
2266         vnode_t *vp;
2267         vnode_t *svp;
2268         int lerror;
2269         struct sockaddr *ca;
2270         char *name = NULL;
2271 
2272         /*
2273          * Disallow NULL paths
2274          */
2275         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2276                 *status = NFSERR_ACCES;
2277                 return;
2278         }
2279 
2280         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2281         if (vp == NULL) {
2282                 *status = NFSERR_STALE;
2283                 return;
2284         }
2285 
2286         if (rdonly(exi, req)) {
2287                 VN_RELE(vp);
2288                 *status = NFSERR_ROFS;
2289                 return;
2290         }
2291 
2292         error = sattr_to_vattr(args->sla_sa, &va);
2293         if (error) {
2294                 VN_RELE(vp);
2295                 *status = puterrno(error);
2296                 return;
2297         }
2298 
2299         if (!(va.va_mask & AT_MODE)) {
2300                 VN_RELE(vp);
2301                 *status = NFSERR_INVAL;
2302                 return;
2303         }
2304 
2305         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2306         name = nfscmd_convname(ca, exi, args->sla_tnm,
2307             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2308 
2309         if (name == NULL) {
2310                 *status = NFSERR_ACCES;
2311                 return;
2312         }
2313 
2314         va.va_type = VLNK;
2315         va.va_mask |= AT_TYPE;
2316 
2317         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2318 
2319         /*
2320          * Force new data and metadata out to stable storage.
2321          */
2322         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2323             NULL, cr, NULL, NULL, NULL);
2324 
2325         if (!lerror) {
2326                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2327                 VN_RELE(svp);
2328         }
2329 
2330         /*
2331          * Force modified data and metadata out to stable storage.
2332          */
2333         (void) VOP_FSYNC(vp, 0, cr, NULL);
2334 
2335         VN_RELE(vp);
2336 
2337         *status = puterrno(error);
2338         if (name != args->sla_tnm)
2339                 kmem_free(name, MAXPATHLEN);
2340 
2341 }
2342 void *
2343 rfs_symlink_getfh(struct nfsslargs *args)
2344 {
2345         return (args->sla_from.da_fhandle);
2346 }
2347 
2348 /*
2349  * Make a directory.
2350  * Create a directory with the given name, parent directory, and attributes.
2351  * Returns a file handle and attributes for the new directory.
2352  */
2353 void
2354 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2355         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2356 {
2357         int error;
2358         struct vattr va;
2359         vnode_t *dvp = NULL;
2360         vnode_t *vp;
2361         char *name = args->ca_da.da_name;
2362 
2363         /*
2364          * Disallow NULL paths
2365          */
2366         if (name == NULL || *name == '\0') {
2367                 dr->dr_status = NFSERR_ACCES;
2368                 return;
2369         }
2370 
2371         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2372         if (vp == NULL) {
2373                 dr->dr_status = NFSERR_STALE;
2374                 return;
2375         }
2376 
2377         if (rdonly(exi, req)) {
2378                 VN_RELE(vp);
2379                 dr->dr_status = NFSERR_ROFS;
2380                 return;
2381         }
2382 
2383         error = sattr_to_vattr(args->ca_sa, &va);
2384         if (error) {
2385                 VN_RELE(vp);
2386                 dr->dr_status = puterrno(error);
2387                 return;
2388         }
2389 
2390         if (!(va.va_mask & AT_MODE)) {
2391                 VN_RELE(vp);
2392                 dr->dr_status = NFSERR_INVAL;
2393                 return;
2394         }
2395 
2396         va.va_type = VDIR;
2397         va.va_mask |= AT_TYPE;
2398 
2399         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2400 
2401         if (!error) {
2402                 /*
2403                  * Attribtutes of the newly created directory should
2404                  * be returned to the client.
2405                  */
2406                 va.va_mask = AT_ALL; /* We want everything */
2407                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2408 
2409                 /* check for overflows */
2410                 if (!error) {
2411                         acl_perm(vp, exi, &va, cr);
2412                         error = vattr_to_nattr(&va, &dr->dr_attr);
2413                         if (!error) {
2414                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2415                         }
2416                 }
2417                 /*
2418                  * Force new data and metadata out to stable storage.
2419                  */
2420                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2421                 VN_RELE(dvp);
2422         }
2423 
2424         /*
2425          * Force modified data and metadata out to stable storage.
2426          */
2427         (void) VOP_FSYNC(vp, 0, cr, NULL);
2428 
2429         VN_RELE(vp);
2430 
2431         dr->dr_status = puterrno(error);
2432 
2433 }
2434 void *
2435 rfs_mkdir_getfh(struct nfscreatargs *args)
2436 {
2437         return (args->ca_da.da_fhandle);
2438 }
2439 
2440 /*
2441  * Remove a directory.
2442  * Remove the given directory name from the given parent directory.
2443  */
2444 void
2445 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2446         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2447 {
2448         int error;
2449         vnode_t *vp;
2450 
2451 
2452         /*
2453          * Disallow NULL paths
2454          */
2455         if (da->da_name == NULL || *da->da_name == '\0') {
2456                 *status = NFSERR_ACCES;
2457                 return;
2458         }
2459 
2460         vp = nfs_fhtovp(da->da_fhandle, exi);
2461         if (vp == NULL) {
2462                 *status = NFSERR_STALE;
2463                 return;
2464         }
2465 
2466         if (rdonly(exi, req)) {
2467                 VN_RELE(vp);
2468                 *status = NFSERR_ROFS;
2469                 return;
2470         }
2471 
2472         /*
2473          * VOP_RMDIR now takes a new third argument (the current
2474          * directory of the process).  That's because someone
2475          * wants to return EINVAL if one tries to remove ".".
2476          * Of course, NFS servers have no idea what their
2477          * clients' current directories are.  We fake it by
2478          * supplying a vnode known to exist and illegal to
2479          * remove.
2480          */
2481         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2482 
2483         /*
2484          * Force modified data and metadata out to stable storage.
2485          */
2486         (void) VOP_FSYNC(vp, 0, cr, NULL);
2487 
2488         VN_RELE(vp);
2489 
2490         /*
2491          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2492          * if the directory is not empty.  A System V NFS server
2493          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2494          * over the wire.
2495          */
2496         if (error == EEXIST)
2497                 *status = NFSERR_NOTEMPTY;
2498         else
2499                 *status = puterrno(error);
2500 
2501 }
2502 void *
2503 rfs_rmdir_getfh(struct nfsdiropargs *da)
2504 {
2505         return (da->da_fhandle);
2506 }
2507 
2508 /* ARGSUSED */
2509 void
2510 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2511         struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2512 {
2513         int error;
2514         int iseof;
2515         struct iovec iov;
2516         struct uio uio;
2517         vnode_t *vp;
2518         char *ndata = NULL;
2519         struct sockaddr *ca;
2520         size_t nents;
2521         int ret;
2522 
2523         vp = nfs_fhtovp(&rda->rda_fh, exi);
2524         if (vp == NULL) {
2525                 rd->rd_entries = NULL;
2526                 rd->rd_status = NFSERR_STALE;
2527                 return;
2528         }
2529 
2530         if (vp->v_type != VDIR) {
2531                 VN_RELE(vp);
2532                 rd->rd_entries = NULL;
2533                 rd->rd_status = NFSERR_NOTDIR;
2534                 return;
2535         }
2536 
2537         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2538 
2539         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2540 
2541         if (error) {
2542                 rd->rd_entries = NULL;
2543                 goto bad;
2544         }
2545 
2546         if (rda->rda_count == 0) {
2547                 rd->rd_entries = NULL;
2548                 rd->rd_size = 0;
2549                 rd->rd_eof = FALSE;
2550                 goto bad;
2551         }
2552 
2553         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2554 
2555         /*
2556          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2557          */
2558         rd->rd_bufsize = (uint_t)rda->rda_count;
2559         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2560 
2561         /*
2562          * Set up io vector to read directory data
2563          */
2564         iov.iov_base = (caddr_t)rd->rd_entries;
2565         iov.iov_len = rda->rda_count;
2566         uio.uio_iov = &iov;
2567         uio.uio_iovcnt = 1;
2568         uio.uio_segflg = UIO_SYSSPACE;
2569         uio.uio_extflg = UIO_COPY_CACHED;
2570         uio.uio_loffset = (offset_t)rda->rda_offset;
2571         uio.uio_resid = rda->rda_count;
2572 
2573         /*
2574          * read directory
2575          */
2576         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2577 
2578         /*
2579          * Clean up
2580          */
2581         if (!error) {
2582                 /*
2583                  * set size and eof
2584                  */
2585                 if (uio.uio_resid == rda->rda_count) {
2586                         rd->rd_size = 0;
2587                         rd->rd_eof = TRUE;
2588                 } else {
2589                         rd->rd_size = (uint32_t)(rda->rda_count -
2590                             uio.uio_resid);
2591                         rd->rd_eof = iseof ? TRUE : FALSE;
2592                 }
2593         }
2594 
2595         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2596         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2597         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2598             rda->rda_count, &ndata);
2599 
2600         if (ret != 0) {
2601                 size_t dropbytes;
2602                 /*
2603                  * We had to drop one or more entries in order to fit
2604                  * during the character conversion.  We need to patch
2605                  * up the size and eof info.
2606                  */
2607                 if (rd->rd_eof)
2608                         rd->rd_eof = FALSE;
2609                 dropbytes = nfscmd_dropped_entrysize(
2610                     (struct dirent64 *)rd->rd_entries, nents, ret);
2611                 rd->rd_size -= dropbytes;
2612         }
2613         if (ndata == NULL) {
2614                 ndata = (char *)rd->rd_entries;
2615         } else if (ndata != (char *)rd->rd_entries) {
2616                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2617                 rd->rd_entries = (void *)ndata;
2618                 rd->rd_bufsize = rda->rda_count;
2619         }
2620 
2621 bad:
2622         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2623 
2624 #if 0 /* notyet */
2625         /*
2626          * Don't do this.  It causes local disk writes when just
2627          * reading the file and the overhead is deemed larger
2628          * than the benefit.
2629          */
2630         /*
2631          * Force modified metadata out to stable storage.
2632          */
2633         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2634 #endif
2635 
2636         VN_RELE(vp);
2637 
2638         rd->rd_status = puterrno(error);
2639 
2640 }
2641 void *
2642 rfs_readdir_getfh(struct nfsrddirargs *rda)
2643 {
2644         return (&rda->rda_fh);
2645 }
2646 void
2647 rfs_rddirfree(struct nfsrddirres *rd)
2648 {
2649         if (rd->rd_entries != NULL)
2650                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2651 }
2652 
2653 /* ARGSUSED */
2654 void
2655 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2656         struct svc_req *req, cred_t *cr)
2657 {
2658         int error;
2659         struct statvfs64 sb;
2660         vnode_t *vp;
2661 
2662         vp = nfs_fhtovp(fh, exi);
2663         if (vp == NULL) {
2664                 fs->fs_status = NFSERR_STALE;
2665                 return;
2666         }
2667 
2668         error = VFS_STATVFS(vp->v_vfsp, &sb);
2669 
2670         if (!error) {
2671                 fs->fs_tsize = nfstsize();
2672                 fs->fs_bsize = sb.f_frsize;
2673                 fs->fs_blocks = sb.f_blocks;
2674                 fs->fs_bfree = sb.f_bfree;
2675                 fs->fs_bavail = sb.f_bavail;
2676         }
2677 
2678         VN_RELE(vp);
2679 
2680         fs->fs_status = puterrno(error);
2681 
2682 }
2683 void *
2684 rfs_statfs_getfh(fhandle_t *fh)
2685 {
2686         return (fh);
2687 }
2688 
2689 static int
2690 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2691 {
2692         vap->va_mask = 0;
2693 
2694         /*
2695          * There was a sign extension bug in some VFS based systems
2696          * which stored the mode as a short.  When it would get
2697          * assigned to a u_long, no sign extension would occur.
2698          * It needed to, but this wasn't noticed because sa_mode
2699          * would then get assigned back to the short, thus ignoring
2700          * the upper 16 bits of sa_mode.
2701          *
2702          * To make this implementation work for both broken
2703          * clients and good clients, we check for both versions
2704          * of the mode.
2705          */
2706         if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2707             sa->sa_mode != (uint32_t)-1) {
2708                 vap->va_mask |= AT_MODE;
2709                 vap->va_mode = sa->sa_mode;
2710         }
2711         if (sa->sa_uid != (uint32_t)-1) {
2712                 vap->va_mask |= AT_UID;
2713                 vap->va_uid = sa->sa_uid;
2714         }
2715         if (sa->sa_gid != (uint32_t)-1) {
2716                 vap->va_mask |= AT_GID;
2717                 vap->va_gid = sa->sa_gid;
2718         }
2719         if (sa->sa_size != (uint32_t)-1) {
2720                 vap->va_mask |= AT_SIZE;
2721                 vap->va_size = sa->sa_size;
2722         }
2723         if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2724             sa->sa_atime.tv_usec != (int32_t)-1) {
2725 #ifndef _LP64
2726                 /* return error if time overflow */
2727                 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2728                         return (EOVERFLOW);
2729 #endif
2730                 vap->va_mask |= AT_ATIME;
2731                 /*
2732                  * nfs protocol defines times as unsigned so don't extend sign,
2733                  * unless sysadmin set nfs_allow_preepoch_time.
2734                  */
2735                 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2736                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2737         }
2738         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2739             sa->sa_mtime.tv_usec != (int32_t)-1) {
2740 #ifndef _LP64
2741                 /* return error if time overflow */
2742                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2743                         return (EOVERFLOW);
2744 #endif
2745                 vap->va_mask |= AT_MTIME;
2746                 /*
2747                  * nfs protocol defines times as unsigned so don't extend sign,
2748                  * unless sysadmin set nfs_allow_preepoch_time.
2749                  */
2750                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2751                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2752         }
2753         return (0);
2754 }
2755 
2756 static enum nfsftype vt_to_nf[] = {
2757         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2758 };
2759 
2760 /*
2761  * check the following fields for overflow: nodeid, size, and time.
2762  * There could be a problem when converting 64-bit LP64 fields
2763  * into 32-bit ones.  Return an error if there is an overflow.
2764  */
2765 int
2766 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2767 {
2768         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2769         na->na_type = vt_to_nf[vap->va_type];
2770 
2771         if (vap->va_mode == (unsigned short) -1)
2772                 na->na_mode = (uint32_t)-1;
2773         else
2774                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2775 
2776         if (vap->va_uid == (unsigned short)(-1))
2777                 na->na_uid = (uint32_t)(-1);
2778         else if (vap->va_uid == UID_NOBODY)
2779                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2780         else
2781                 na->na_uid = vap->va_uid;
2782 
2783         if (vap->va_gid == (unsigned short)(-1))
2784                 na->na_gid = (uint32_t)-1;
2785         else if (vap->va_gid == GID_NOBODY)
2786                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2787         else
2788                 na->na_gid = vap->va_gid;
2789 
2790         /*
2791          * Do we need to check fsid for overflow?  It is 64-bit in the
2792          * vattr, but are bigger than 32 bit values supported?
2793          */
2794         na->na_fsid = vap->va_fsid;
2795 
2796         na->na_nodeid = vap->va_nodeid;
2797 
2798         /*
2799          * Check to make sure that the nodeid is representable over the
2800          * wire without losing bits.
2801          */
2802         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2803                 return (EFBIG);
2804         na->na_nlink = vap->va_nlink;
2805 
2806         /*
2807          * Check for big files here, instead of at the caller.  See
2808          * comments in cstat for large special file explanation.
2809          */
2810         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2811                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2812                         return (EFBIG);
2813                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2814                         /* UNKNOWN_SIZE | OVERFLOW */
2815                         na->na_size = MAXOFF32_T;
2816                 } else
2817                         na->na_size = vap->va_size;
2818         } else
2819                 na->na_size = vap->va_size;
2820 
2821         /*
2822          * If the vnode times overflow the 32-bit times that NFS2
2823          * uses on the wire then return an error.
2824          */
2825         if (!NFS_VAP_TIME_OK(vap)) {
2826                 return (EOVERFLOW);
2827         }
2828         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2829         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2830 
2831         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2832         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2833 
2834         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2835         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2836 
2837         /*
2838          * If the dev_t will fit into 16 bits then compress
2839          * it, otherwise leave it alone. See comments in
2840          * nfs_client.c.
2841          */
2842         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2843             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2844                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2845         else
2846                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2847 
2848         na->na_blocks = vap->va_nblocks;
2849         na->na_blocksize = vap->va_blksize;
2850 
2851         /*
2852          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2853          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2854          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2855          *
2856          * BUYER BEWARE:
2857          *  If you are porting the NFS to a non-Sun server, you probably
2858          *  don't want to include the following block of code.  The
2859          *  over-the-wire special file types will be changing with the
2860          *  NFS Protocol Revision.
2861          */
2862         if (vap->va_type == VFIFO)
2863                 NA_SETFIFO(na);
2864         return (0);
2865 }
2866 
2867 /*
2868  * acl v2 support: returns approximate permission.
2869  *      default: returns minimal permission (more restrictive)
2870  *      aclok: returns maximal permission (less restrictive)
2871  *      This routine changes the permissions that are alaredy in *va.
2872  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2873  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2874  */
2875 static void
2876 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2877 {
2878         vsecattr_t      vsa;
2879         int             aclcnt;
2880         aclent_t        *aclentp;
2881         mode_t          mask_perm;
2882         mode_t          grp_perm;
2883         mode_t          other_perm;
2884         mode_t          other_orig;
2885         int             error;
2886 
2887         /* dont care default acl */
2888         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2889         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2890 
2891         if (!error) {
2892                 aclcnt = vsa.vsa_aclcnt;
2893                 if (aclcnt > MIN_ACL_ENTRIES) {
2894                         /* non-trivial ACL */
2895                         aclentp = vsa.vsa_aclentp;
2896                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2897                                 /* maximal permissions */
2898                                 grp_perm = 0;
2899                                 other_perm = 0;
2900                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2901                                         switch (aclentp->a_type) {
2902                                         case USER_OBJ:
2903                                                 break;
2904                                         case USER:
2905                                                 grp_perm |=
2906                                                     aclentp->a_perm << 3;
2907                                                 other_perm |= aclentp->a_perm;
2908                                                 break;
2909                                         case GROUP_OBJ:
2910                                                 grp_perm |=
2911                                                     aclentp->a_perm << 3;
2912                                                 break;
2913                                         case GROUP:
2914                                                 other_perm |= aclentp->a_perm;
2915                                                 break;
2916                                         case OTHER_OBJ:
2917                                                 other_orig = aclentp->a_perm;
2918                                                 break;
2919                                         case CLASS_OBJ:
2920                                                 mask_perm = aclentp->a_perm;
2921                                                 break;
2922                                         default:
2923                                                 break;
2924                                         }
2925                                 }
2926                                 grp_perm &= mask_perm << 3;
2927                                 other_perm &= mask_perm;
2928                                 other_perm |= other_orig;
2929 
2930                         } else {
2931                                 /* minimal permissions */
2932                                 grp_perm = 070;
2933                                 other_perm = 07;
2934                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2935                                         switch (aclentp->a_type) {
2936                                         case USER_OBJ:
2937                                                 break;
2938                                         case USER:
2939                                         case CLASS_OBJ:
2940                                                 grp_perm &=
2941                                                     aclentp->a_perm << 3;
2942                                                 other_perm &=
2943                                                     aclentp->a_perm;
2944                                                 break;
2945                                         case GROUP_OBJ:
2946                                                 grp_perm &=
2947                                                     aclentp->a_perm << 3;
2948                                                 break;
2949                                         case GROUP:
2950                                                 other_perm &=
2951                                                     aclentp->a_perm;
2952                                                 break;
2953                                         case OTHER_OBJ:
2954                                                 other_perm &=
2955                                                     aclentp->a_perm;
2956                                                 break;
2957                                         default:
2958                                                 break;
2959                                         }
2960                                 }
2961                         }
2962                         /* copy to va */
2963                         va->va_mode &= ~077;
2964                         va->va_mode |= grp_perm | other_perm;
2965                 }
2966                 if (vsa.vsa_aclcnt)
2967                         kmem_free(vsa.vsa_aclentp,
2968                             vsa.vsa_aclcnt * sizeof (aclent_t));
2969         }
2970 }
2971 
/*
 * Set up the global state used by the server routines in this file:
 * initialize rfs_async_write_lock and store the caller id returned by
 * fs_new_caller_id() in nfs2_srv_caller_id.
 */
void
rfs_srvrinit(void)
{
        mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
        nfs2_srv_caller_id = fs_new_caller_id();
}
2978 
/*
 * Counterpart of rfs_srvrinit(): destroy rfs_async_write_lock.
 */
void
rfs_srvrfini(void)
{
        mutex_destroy(&rfs_async_write_lock);
}
2984 
2985 static int
2986 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2987 {
2988         struct clist    *wcl;
2989         int             wlist_len;
2990         uint32_t        count = rr->rr_count;
2991 
2992         wcl = ra->ra_wlist;
2993 
2994         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2995                 return (FALSE);
2996         }
2997 
2998         wcl = ra->ra_wlist;
2999         rr->rr_ok.rrok_wlist_len = wlist_len;
3000         rr->rr_ok.rrok_wlist = wcl;
3001 
3002         return (TRUE);
3003 }