1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2017, Joyent, Inc.
  26  */
  27 
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35 
  36 #include <sys/param.h>
  37 #include <sys/isa_defs.h>
  38 #include <sys/types.h>
  39 #include <sys/inttypes.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/cred.h>
  42 #include <sys/user.h>
  43 #include <sys/systm.h>
  44 #include <sys/errno.h>
  45 #include <sys/vnode.h>
  46 #include <sys/file.h>
  47 #include <sys/proc.h>
  48 #include <sys/cpuvar.h>
  49 #include <sys/uio.h>
  50 #include <sys/debug.h>
  51 #include <sys/rctl.h>
  52 #include <sys/nbmlock.h>
  53 #include <sys/limits.h>
  54 
  55 #define COPYOUT_MAX_CACHE       (1<<17)           /* 128K */
  56 
  57 size_t copyout_max_cached = COPYOUT_MAX_CACHE;  /* global so it's patchable */
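
/*
 * Transfers of at most copyout_max_cached bytes are copied out with
 * UIO_COPY_CACHED; larger transfers use UIO_COPY_DEFAULT (see the
 * uio_extflg selection in read(), readv() and preadv() below).
 */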
  58 
  59 /*
  60  * read, write, pread, pwrite, readv, and writev syscalls.
  61  *
  62  * 64-bit open: all opens are large-file opens.
  63  * Large Files: the behaviour of read depends on whether the fd
  64  *              corresponds to a large-file open or not.
  65  * 32-bit open: FOFFMAX flag not set.
  66  *              Reads may proceed up to MAXOFF32_T - 1; a read at
  67  *              MAXOFF32_T returns EOVERFLOW if count is non-zero and
  68  *              the file is larger than MAXOFF32_T. If the file size
  69  *              is <= MAXOFF32_T, a read at >= MAXOFF32_T returns EOF.
  70  */
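
/*
 * For example, a read() on a non-FOFFMAX descriptor positioned at
 * MAXOFF32_T fails with EOVERFLOW when the file is larger than
 * MAXOFF32_T, but returns 0 (EOF) when the entire file fits below
 * that limit.
 */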
  71 
  72 /*
  73  * Native system call
  74  */
  75 ssize_t
  76 read(int fdes, void *cbuf, size_t count)
  77 {
  78         struct uio auio;
  79         struct iovec aiov;
  80         file_t *fp;
  81         register vnode_t *vp;
  82         struct cpu *cp;
  83         int fflag, ioflag, rwflag;
  84         ssize_t cnt, bcount;
  85         int error = 0;
  86         u_offset_t fileoff;
  87         int in_crit = 0;
  88 
  89         if ((cnt = (ssize_t)count) < 0)
  90                 return (set_errno(EINVAL));
  91         if ((fp = getf(fdes)) == NULL)
  92                 return (set_errno(EBADF));
  93         if (((fflag = fp->f_flag) & FREAD) == 0) {
  94                 error = EBADF;
  95                 goto out;
  96         }
  97         vp = fp->f_vnode;
  98 
  99         if (vp->v_type == VREG && cnt == 0) {
 100                 goto out;
 101         }
 102 
 103         rwflag = 0;
 104         aiov.iov_base = cbuf;
 105         aiov.iov_len = cnt;
 106 
 107         /*
 108          * We have to enter the critical region before calling VOP_RWLOCK
 109          * to avoid a deadlock with ufs.
 110          */
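        /*
         * nbl_need_check() is true when mandatory (NBMAND) locks or
         * share reservations may apply to this vnode; nbl_conflict()
         * below then fails the read with EACCES if it would collide
         * with one of them.
         */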
 111         if (nbl_need_check(vp)) {
 112                 int svmand;
 113 
 114                 nbl_start_crit(vp, RW_READER);
 115                 in_crit = 1;
 116                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 117                 if (error != 0)
 118                         goto out;
 119                 if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
 120                     NULL)) {
 121                         error = EACCES;
 122                         goto out;
 123                 }
 124         }
 125 
 126         (void) VOP_RWLOCK(vp, rwflag, NULL);
 127 
 128         /*
 129          * We do the following checks inside VOP_RWLOCK so that the
 130          * file size cannot change while they are being made. Also,
 131          * we load fp's offset into the local variable fileoff
 132          * because a parallel lseek may be in progress (f_offset is
 133          * not protected by any lock) and could change f_offset. We
 134          * must read the value exactly once here and base the
 135          * decision on it; reading it more than once could yield
 136          * inconsistent results.
 137          */
 138 
 139         fileoff = (u_offset_t)fp->f_offset;
 140         if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
 141                 struct vattr va;
 142                 va.va_mask = AT_SIZE;
 143                 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
 144                         VOP_RWUNLOCK(vp, rwflag, NULL);
 145                         goto out;
 146                 }
 147                 if (fileoff >= va.va_size) {
 148                         cnt = 0;
 149                         VOP_RWUNLOCK(vp, rwflag, NULL);
 150                         goto out;
 151                 } else {
 152                         error = EOVERFLOW;
 153                         VOP_RWUNLOCK(vp, rwflag, NULL);
 154                         goto out;
 155                 }
 156         }
 157         if ((vp->v_type == VREG) &&
 158             (fileoff + cnt > OFFSET_MAX(fp))) {
 159                 cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 160         }
 161         auio.uio_loffset = fileoff;
 162         auio.uio_iov = &aiov;
 163         auio.uio_iovcnt = 1;
 164         auio.uio_resid = bcount = cnt;
 165         auio.uio_segflg = UIO_USERSPACE;
 166         auio.uio_llimit = MAXOFFSET_T;
 167         auio.uio_fmode = fflag;
 168         /*
 169          * Only use cache-bypassing copies when the count is large enough.
 170          */
 171         if (bcount <= copyout_max_cached)
 172                 auio.uio_extflg = UIO_COPY_CACHED;
 173         else
 174                 auio.uio_extflg = UIO_COPY_DEFAULT;
 175 
 176         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 177 
 178         /* If read sync is not asked for, filter sync flags */
 179         if ((ioflag & FRSYNC) == 0)
 180                 ioflag &= ~(FSYNC|FDSYNC);
 181         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
 182         cnt -= auio.uio_resid;
 183         CPU_STATS_ENTER_K();
 184         cp = CPU;
 185         CPU_STATS_ADDQ(cp, sys, sysread, 1);
 186         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
 187         CPU_STATS_EXIT_K();
 188         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
 189 
 190         if (vp->v_type == VFIFO)     /* Backward compatibility */
 191                 fp->f_offset = cnt;
 192         else if (((fp->f_flag & FAPPEND) == 0) ||
 193             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 194                 fp->f_offset = auio.uio_loffset;
 195         VOP_RWUNLOCK(vp, rwflag, NULL);
 196 
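        /*
         * If the transfer was interrupted after some data had already
         * been moved, report the partial count as success rather than
         * failing with EINTR.
         */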
 197         if (error == EINTR && cnt != 0)
 198                 error = 0;
 199 out:
 200         if (in_crit)
 201                 nbl_end_crit(vp);
 202         releasef(fdes);
 203         if (error)
 204                 return (set_errno(error));
 205         return (cnt);
 206 }
 207 
 208 /*
 209  * Native system call
 210  */
 211 ssize_t
 212 write(int fdes, void *cbuf, size_t count)
 213 {
 214         struct uio auio;
 215         struct iovec aiov;
 216         file_t *fp;
 217         register vnode_t *vp;
 218         struct cpu *cp;
 219         int fflag, ioflag, rwflag;
 220         ssize_t cnt, bcount;
 221         int error = 0;
 222         u_offset_t fileoff;
 223         int in_crit = 0;
 224 
 225         if ((cnt = (ssize_t)count) < 0)
 226                 return (set_errno(EINVAL));
 227         if ((fp = getf(fdes)) == NULL)
 228                 return (set_errno(EBADF));
 229         if (((fflag = fp->f_flag) & FWRITE) == 0) {
 230                 error = EBADF;
 231                 goto out;
 232         }
 233         vp = fp->f_vnode;
 234 
 235         if (vp->v_type == VREG && cnt == 0) {
 236                 goto out;
 237         }
 238 
 239         rwflag = 1;
 240         aiov.iov_base = cbuf;
 241         aiov.iov_len = cnt;
 242 
 243         /*
 244          * We have to enter the critical region before calling VOP_RWLOCK
 245          * to avoid a deadlock with ufs.
 246          */
 247         if (nbl_need_check(vp)) {
 248                 int svmand;
 249 
 250                 nbl_start_crit(vp, RW_READER);
 251                 in_crit = 1;
 252                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 253                 if (error != 0)
 254                         goto out;
 255                 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
 256                     NULL)) {
 257                         error = EACCES;
 258                         goto out;
 259                 }
 260         }
 261 
 262         (void) VOP_RWLOCK(vp, rwflag, NULL);
 263 
 264         fileoff = fp->f_offset;
 265         if (vp->v_type == VREG) {
 266 
 267                 /*
 268                  * Signal the process (via rctl_action) if a write of more
 269                  * than zero bytes starts at or beyond its file-size limit.
 270                  */
 271                 if (fileoff >= curproc->p_fsz_ctl) {
 272                         VOP_RWUNLOCK(vp, rwflag, NULL);
 273 
 274                         mutex_enter(&curproc->p_lock);
 275                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 276                             curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
 277                         mutex_exit(&curproc->p_lock);
 278 
 279                         error = EFBIG;
 280                         goto out;
 281                 }
 282                 /*
 283                  * Return EFBIG if the write would start at or beyond the
 284                  * offset maximum for this open file structure.
 285                  */
 286 
 287                 if (fileoff >= OFFSET_MAX(fp)) {
 288                         VOP_RWUNLOCK(vp, rwflag, NULL);
 289                         error = EFBIG;
 290                         goto out;
 291                 }
 292                 /*
 293                  * Limit the bytes to be written so that we do not write
 294                  * past the offset maximum for this open file structure.
 295                  */
 296                 if (fileoff + cnt > OFFSET_MAX(fp))
 297                         cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 298         }
 299         auio.uio_loffset = fileoff;
 300         auio.uio_iov = &aiov;
 301         auio.uio_iovcnt = 1;
 302         auio.uio_resid = bcount = cnt;
 303         auio.uio_segflg = UIO_USERSPACE;
 304         auio.uio_llimit = curproc->p_fsz_ctl;
 305         auio.uio_fmode = fflag;
 306         auio.uio_extflg = UIO_COPY_DEFAULT;
 307 
 308         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 309 
 310         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
 311         cnt -= auio.uio_resid;
 312         CPU_STATS_ENTER_K();
 313         cp = CPU;
 314         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
 315         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
 316         CPU_STATS_EXIT_K();
 317         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
 318 
 319         if (vp->v_type == VFIFO)     /* Backward compatibility */
 320                 fp->f_offset = cnt;
 321         else if (((fp->f_flag & FAPPEND) == 0) ||
 322             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 323                 fp->f_offset = auio.uio_loffset;
 324         VOP_RWUNLOCK(vp, rwflag, NULL);
 325 
 326         if (error == EINTR && cnt != 0)
 327                 error = 0;
 328 out:
 329         if (in_crit)
 330                 nbl_end_crit(vp);
 331         releasef(fdes);
 332         if (error)
 333                 return (set_errno(error));
 334         return (cnt);
 335 }
 336 
 337 ssize_t
 338 pread(int fdes, void *cbuf, size_t count, off_t offset)
 339 {
 340         struct uio auio;
 341         struct iovec aiov;
 342         file_t *fp;
 343         register vnode_t *vp;
 344         struct cpu *cp;
 345         int fflag, ioflag, rwflag;
 346         ssize_t bcount;
 347         int error = 0;
 348         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
 349 #ifdef _SYSCALL32_IMPL
 350         u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
 351             MAXOFF32_T : MAXOFFSET_T;
 352 #else
 353         const u_offset_t maxoff = MAXOFF32_T;
 354 #endif
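        /*
         * The (ulong_t) cast in the fileoff initializer above keeps a
         * 32-bit caller's off_t from being sign-extended when it is
         * widened to a u_offset_t.
         */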
 355         int in_crit = 0;
 356 
 357         if ((bcount = (ssize_t)count) < 0)
 358                 return (set_errno(EINVAL));
 359 
 360         if ((fp = getf(fdes)) == NULL)
 361                 return (set_errno(EBADF));
 362         if (((fflag = fp->f_flag) & (FREAD)) == 0) {
 363                 error = EBADF;
 364                 goto out;
 365         }
 366 
 367         rwflag = 0;
 368         vp = fp->f_vnode;
 369 
 370         if (vp->v_type == VREG) {
 371 
 372                 if (bcount == 0)
 373                         goto out;
 374 
 375                 /*
 376                  * Return EINVAL for an invalid offset; a negative user
 377                  * offset shows up here as a value larger than maxoff.
 378                  */
 379 
 380                 if (fileoff > maxoff) {
 381                         error = EINVAL;
 382                         goto out;
 383                 }
 384                 /*
 385                  * Limit the transfer so that we do not read beyond the
 386                  * maximum offset representable in an off_t for this
 387                  * caller.
 388                  */
 389                 if (fileoff + bcount > maxoff)
 390                         bcount = (ssize_t)((offset_t)maxoff - fileoff);
 391         } else if (vp->v_type == VFIFO) {
 392                 error = ESPIPE;
 393                 goto out;
 394         }
 395 
 396         /*
 397          * We have to enter the critical region before calling VOP_RWLOCK
 398          * to avoid a deadlock with ufs.
 399          */
 400         if (nbl_need_check(vp)) {
 401                 int svmand;
 402 
 403                 nbl_start_crit(vp, RW_READER);
 404                 in_crit = 1;
 405                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 406                 if (error != 0)
 407                         goto out;
 408                 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
 409                     NULL)) {
 410                         error = EACCES;
 411                         goto out;
 412                 }
 413         }
 414 
 415         aiov.iov_base = cbuf;
 416         aiov.iov_len = bcount;
 417         (void) VOP_RWLOCK(vp, rwflag, NULL);
 418         if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
 419                 struct vattr va;
 420                 va.va_mask = AT_SIZE;
 421                 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
 422                         VOP_RWUNLOCK(vp, rwflag, NULL);
 423                         goto out;
 424                 }
 425                 VOP_RWUNLOCK(vp, rwflag, NULL);
 426 
 427                 /*
 428                  * We have to return EOF if fileoff is >= file size.
 429                  */
 430                 if (fileoff >= va.va_size) {
 431                         bcount = 0;
 432                         goto out;
 433                 }
 434 
 435                 /*
 436                  * The file is larger than maxoff, so the caller cannot
 437                  * address this region; return EOVERFLOW.
 438                  */
 439                 error = EOVERFLOW;
 440                 goto out;
 441         }
 442         auio.uio_loffset = fileoff;
 443         auio.uio_iov = &aiov;
 444         auio.uio_iovcnt = 1;
 445         auio.uio_resid = bcount;
 446         auio.uio_segflg = UIO_USERSPACE;
 447         auio.uio_llimit = MAXOFFSET_T;
 448         auio.uio_fmode = fflag;
 449         auio.uio_extflg = UIO_COPY_CACHED;
 450 
 451         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 452 
 453         /* If read sync is not asked for, filter sync flags */
 454         if ((ioflag & FRSYNC) == 0)
 455                 ioflag &= ~(FSYNC|FDSYNC);
 456         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
 457         bcount -= auio.uio_resid;
 458         CPU_STATS_ENTER_K();
 459         cp = CPU;
 460         CPU_STATS_ADDQ(cp, sys, sysread, 1);
 461         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
 462         CPU_STATS_EXIT_K();
 463         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
 464         VOP_RWUNLOCK(vp, rwflag, NULL);
 465 
 466         if (error == EINTR && bcount != 0)
 467                 error = 0;
 468 out:
 469         if (in_crit)
 470                 nbl_end_crit(vp);
 471         releasef(fdes);
 472         if (error)
 473                 return (set_errno(error));
 474         return (bcount);
 475 }
 476 
 477 ssize_t
 478 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
 479 {
 480         struct uio auio;
 481         struct iovec aiov;
 482         file_t *fp;
 483         register vnode_t *vp;
 484         struct cpu *cp;
 485         int fflag, ioflag, rwflag;
 486         ssize_t bcount;
 487         int error = 0;
 488         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
 489 #ifdef _SYSCALL32_IMPL
 490         u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
 491             MAXOFF32_T : MAXOFFSET_T;
 492 #else
 493         const u_offset_t maxoff = MAXOFF32_T;
 494 #endif
 495         int in_crit = 0;
 496 
 497         if ((bcount = (ssize_t)count) < 0)
 498                 return (set_errno(EINVAL));
 499         if ((fp = getf(fdes)) == NULL)
 500                 return (set_errno(EBADF));
 501         if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
 502                 error = EBADF;
 503                 goto out;
 504         }
 505 
 506         rwflag = 1;
 507         vp = fp->f_vnode;
 508 
 509         if (vp->v_type == VREG) {
 510 
 511                 if (bcount == 0)
 512                         goto out;
 513 
 514                 /*
 515                  * Return EINVAL for offsets that cannot be
 516                  * represented in the caller's off_t.
 517                  */
 518                 if (fileoff > maxoff) {
 519                         error = EINVAL;
 520                         goto out;
 521                 }
 522                 /*
 523                  * Take appropriate action if we are trying to write above the
 524                  * resource limit.
 525                  */
 526                 if (fileoff >= curproc->p_fsz_ctl) {
 527                         mutex_enter(&curproc->p_lock);
 528                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 529                             curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
 530                         mutex_exit(&curproc->p_lock);
 531 
 532                         error = EFBIG;
 533                         goto out;
 534                 }
 535                 /*
 536                  * Don't allow pwrite to cause file sizes to exceed
 537                  * maxoff.
 538                  */
 539                 if (fileoff == maxoff) {
 540                         error = EFBIG;
 541                         goto out;
 542                 }
 543                 if (fileoff + count > maxoff)
 544                         bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
 545         } else if (vp->v_type == VFIFO) {
 546                 error = ESPIPE;
 547                 goto out;
 548         }
 549 
 550         /*
 551          * We have to enter the critical region before calling VOP_RWLOCK
 552          * to avoid a deadlock with ufs.
 553          */
 554         if (nbl_need_check(vp)) {
 555                 int svmand;
 556 
 557                 nbl_start_crit(vp, RW_READER);
 558                 in_crit = 1;
 559                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 560                 if (error != 0)
 561                         goto out;
 562                 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
 563                     NULL)) {
 564                         error = EACCES;
 565                         goto out;
 566                 }
 567         }
 568 
 569         aiov.iov_base = cbuf;
 570         aiov.iov_len = bcount;
 571         (void) VOP_RWLOCK(vp, rwflag, NULL);
 572         auio.uio_loffset = fileoff;
 573         auio.uio_iov = &aiov;
 574         auio.uio_iovcnt = 1;
 575         auio.uio_resid = bcount;
 576         auio.uio_segflg = UIO_USERSPACE;
 577         auio.uio_llimit = curproc->p_fsz_ctl;
 578         auio.uio_fmode = fflag;
 579         auio.uio_extflg = UIO_COPY_CACHED;
 580 
 581         /*
 582          * The SUSv4 POSIX specification states:
 583          *      The pwrite() function shall be equivalent to write(), except
 584          *      that it writes into a given position and does not change
 585          *      the file offset (regardless of whether O_APPEND is set).
 586          * To make this be true, we omit the FAPPEND flag from ioflag.
 587          */
 588         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
 589 
 590         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
 591         bcount -= auio.uio_resid;
 592         CPU_STATS_ENTER_K();
 593         cp = CPU;
 594         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
 595         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
 596         CPU_STATS_EXIT_K();
 597         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
 598         VOP_RWUNLOCK(vp, rwflag, NULL);
 599 
 600         if (error == EINTR && bcount != 0)
 601                 error = 0;
 602 out:
 603         if (in_crit)
 604                 nbl_end_crit(vp);
 605         releasef(fdes);
 606         if (error)
 607                 return (set_errno(error));
 608         return (bcount);
 609 }
 610 
 611 ssize_t
 612 readv(int fdes, struct iovec *iovp, int iovcnt)
 613 {
 614         struct uio auio;
 615         struct iovec buf[IOV_MAX_STACK], *aiov = buf;
 616         int aiovlen = 0;
 617         file_t *fp;
 618         register vnode_t *vp;
 619         struct cpu *cp;
 620         int fflag, ioflag, rwflag;
 621         ssize_t count, bcount;
 622         int error = 0;
 623         int i;
 624         u_offset_t fileoff;
 625         int in_crit = 0;
 626 
 627         if (iovcnt <= 0 || iovcnt > IOV_MAX)
 628                 return (set_errno(EINVAL));
 629 
 630         if (iovcnt > IOV_MAX_STACK) {
 631                 aiovlen = iovcnt * sizeof (iovec_t);
 632                 aiov = kmem_alloc(aiovlen, KM_SLEEP);
 633         }
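        /*
         * When iovcnt exceeds IOV_MAX_STACK the iovec array is allocated
         * from the kernel heap (aiovlen != 0) and must be freed on every
         * return path below.
         */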
 634 
 635 #ifdef _SYSCALL32_IMPL
 636         /*
 637          * 32-bit callers need to have their iovec expanded,
 638          * while ensuring that they can't move more than 2Gbytes
 639          * of data in a single call.
 640          */
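        /*
         * Accumulating the lengths in a signed 32-bit counter (count32)
         * makes any total that reaches 2Gbytes go negative and fail the
         * validation loop below.
         */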
 641         if (get_udatamodel() == DATAMODEL_ILP32) {
 642                 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
 643                 int aiov32len;
 644                 ssize32_t count32;
 645 
 646                 aiov32len = iovcnt * sizeof (iovec32_t);
 647                 if (aiovlen != 0)
 648                         aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
 649 
 650                 if (copyin(iovp, aiov32, aiov32len)) {
 651                         if (aiovlen != 0) {
 652                                 kmem_free(aiov32, aiov32len);
 653                                 kmem_free(aiov, aiovlen);
 654                         }
 655                         return (set_errno(EFAULT));
 656                 }
 657 
 658                 count32 = 0;
 659                 for (i = 0; i < iovcnt; i++) {
 660                         ssize32_t iovlen32 = aiov32[i].iov_len;
 661                         count32 += iovlen32;
 662                         if (iovlen32 < 0 || count32 < 0) {
 663                                 if (aiovlen != 0) {
 664                                         kmem_free(aiov32, aiov32len);
 665                                         kmem_free(aiov, aiovlen);
 666                                 }
 667                                 return (set_errno(EINVAL));
 668                         }
 669                         aiov[i].iov_len = iovlen32;
 670                         aiov[i].iov_base =
 671                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
 672                 }
 673 
 674                 if (aiovlen != 0)
 675                         kmem_free(aiov32, aiov32len);
 676         } else
 677 #endif
 678         if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
 679                 if (aiovlen != 0)
 680                         kmem_free(aiov, aiovlen);
 681                 return (set_errno(EFAULT));
 682         }
 683 
 684         count = 0;
 685         for (i = 0; i < iovcnt; i++) {
 686                 ssize_t iovlen = aiov[i].iov_len;
 687                 count += iovlen;
 688                 if (iovlen < 0 || count < 0) {
 689                         if (aiovlen != 0)
 690                                 kmem_free(aiov, aiovlen);
 691                         return (set_errno(EINVAL));
 692                 }
 693         }
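        /*
         * count now holds the total number of bytes requested; the loop
         * above rejected any negative iov_len and any total that
         * overflowed into a negative value.
         */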
 694         if ((fp = getf(fdes)) == NULL) {
 695                 if (aiovlen != 0)
 696                         kmem_free(aiov, aiovlen);
 697                 return (set_errno(EBADF));
 698         }
 699         if (((fflag = fp->f_flag) & FREAD) == 0) {
 700                 error = EBADF;
 701                 goto out;
 702         }
 703         vp = fp->f_vnode;
 704         if (vp->v_type == VREG && count == 0) {
 705                 goto out;
 706         }
 707 
 708         rwflag = 0;
 709 
 710         /*
 711          * We have to enter the critical region before calling VOP_RWLOCK
 712          * to avoid a deadlock with ufs.
 713          */
 714         if (nbl_need_check(vp)) {
 715                 int svmand;
 716 
 717                 nbl_start_crit(vp, RW_READER);
 718                 in_crit = 1;
 719                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 720                 if (error != 0)
 721                         goto out;
 722                 if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
 723                     NULL)) {
 724                         error = EACCES;
 725                         goto out;
 726                 }
 727         }
 728 
 729         (void) VOP_RWLOCK(vp, rwflag, NULL);
 730         fileoff = fp->f_offset;
 731 
 732         /*
 733          * Behaviour is the same as read(); see the comments there.
 734          */
 735 
 736         if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
 737                 struct vattr va;
 738                 va.va_mask = AT_SIZE;
 739                 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
 740                         VOP_RWUNLOCK(vp, rwflag, NULL);
 741                         goto out;
 742                 }
 743                 if (fileoff >= va.va_size) {
 744                         VOP_RWUNLOCK(vp, rwflag, NULL);
 745                         count = 0;
 746                         goto out;
 747                 } else {
 748                         VOP_RWUNLOCK(vp, rwflag, NULL);
 749                         error = EOVERFLOW;
 750                         goto out;
 751                 }
 752         }
 753         if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
 754                 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 755         }
 756         auio.uio_loffset = fileoff;
 757         auio.uio_iov = aiov;
 758         auio.uio_iovcnt = iovcnt;
 759         auio.uio_resid = bcount = count;
 760         auio.uio_segflg = UIO_USERSPACE;
 761         auio.uio_llimit = MAXOFFSET_T;
 762         auio.uio_fmode = fflag;
 763         if (bcount <= copyout_max_cached)
 764                 auio.uio_extflg = UIO_COPY_CACHED;
 765         else
 766                 auio.uio_extflg = UIO_COPY_DEFAULT;
 767 
 768 
 769         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 770 
 771         /* If read sync is not asked for, filter sync flags */
 772         if ((ioflag & FRSYNC) == 0)
 773                 ioflag &= ~(FSYNC|FDSYNC);
 774         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
 775         count -= auio.uio_resid;
 776         CPU_STATS_ENTER_K();
 777         cp = CPU;
 778         CPU_STATS_ADDQ(cp, sys, sysread, 1);
 779         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
 780         CPU_STATS_EXIT_K();
 781         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
 782 
 783         if (vp->v_type == VFIFO)     /* Backward compatibility */
 784                 fp->f_offset = count;
 785         else if (((fp->f_flag & FAPPEND) == 0) ||
 786             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 787                 fp->f_offset = auio.uio_loffset;
 788 
 789         VOP_RWUNLOCK(vp, rwflag, NULL);
 790 
 791         if (error == EINTR && count != 0)
 792                 error = 0;
 793 out:
 794         if (in_crit)
 795                 nbl_end_crit(vp);
 796         releasef(fdes);
 797         if (aiovlen != 0)
 798                 kmem_free(aiov, aiovlen);
 799         if (error)
 800                 return (set_errno(error));
 801         return (count);
 802 }
 803 
 804 ssize_t
 805 writev(int fdes, struct iovec *iovp, int iovcnt)
 806 {
 807         struct uio auio;
 808         struct iovec buf[IOV_MAX_STACK], *aiov = buf;
 809         int aiovlen = 0;
 810         file_t *fp;
 811         register vnode_t *vp;
 812         struct cpu *cp;
 813         int fflag, ioflag, rwflag;
 814         ssize_t count, bcount;
 815         int error = 0;
 816         int i;
 817         u_offset_t fileoff;
 818         int in_crit = 0;
 819 
 820         if (iovcnt <= 0 || iovcnt > IOV_MAX)
 821                 return (set_errno(EINVAL));
 822 
 823         if (iovcnt > IOV_MAX_STACK) {
 824                 aiovlen = iovcnt * sizeof (iovec_t);
 825                 aiov = kmem_alloc(aiovlen, KM_SLEEP);
 826         }
 827 
 828 #ifdef _SYSCALL32_IMPL
 829         /*
 830          * 32-bit callers need to have their iovec expanded,
 831          * while ensuring that they can't move more than 2Gbytes
 832          * of data in a single call.
 833          */
 834         if (get_udatamodel() == DATAMODEL_ILP32) {
 835                 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
 836                 int aiov32len;
 837                 ssize32_t count32;
 838 
 839                 aiov32len = iovcnt * sizeof (iovec32_t);
 840                 if (aiovlen != 0)
 841                         aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
 842 
 843                 if (copyin(iovp, aiov32, aiov32len)) {
 844                         if (aiovlen != 0) {
 845                                 kmem_free(aiov32, aiov32len);
 846                                 kmem_free(aiov, aiovlen);
 847                         }
 848                         return (set_errno(EFAULT));
 849                 }
 850 
 851                 count32 = 0;
 852                 for (i = 0; i < iovcnt; i++) {
 853                         ssize32_t iovlen = aiov32[i].iov_len;
 854                         count32 += iovlen;
 855                         if (iovlen < 0 || count32 < 0) {
 856                                 if (aiovlen != 0) {
 857                                         kmem_free(aiov32, aiov32len);
 858                                         kmem_free(aiov, aiovlen);
 859                                 }
 860                                 return (set_errno(EINVAL));
 861                         }
 862                         aiov[i].iov_len = iovlen;
 863                         aiov[i].iov_base =
 864                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
 865                 }
 866                 if (aiovlen != 0)
 867                         kmem_free(aiov32, aiov32len);
 868         } else
 869 #endif
 870         if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
 871                 if (aiovlen != 0)
 872                         kmem_free(aiov, aiovlen);
 873                 return (set_errno(EFAULT));
 874         }
 875 
 876         count = 0;
 877         for (i = 0; i < iovcnt; i++) {
 878                 ssize_t iovlen = aiov[i].iov_len;
 879                 count += iovlen;
 880                 if (iovlen < 0 || count < 0) {
 881                         if (aiovlen != 0)
 882                                 kmem_free(aiov, aiovlen);
 883                         return (set_errno(EINVAL));
 884                 }
 885         }
 886         if ((fp = getf(fdes)) == NULL) {
 887                 if (aiovlen != 0)
 888                         kmem_free(aiov, aiovlen);
 889                 return (set_errno(EBADF));
 890         }
 891         if (((fflag = fp->f_flag) & FWRITE) == 0) {
 892                 error = EBADF;
 893                 goto out;
 894         }
 895         vp = fp->f_vnode;
 896         if (vp->v_type == VREG && count == 0) {
 897                 goto out;
 898         }
 899 
 900         rwflag = 1;
 901 
 902         /*
 903          * We have to enter the critical region before calling VOP_RWLOCK
 904          * to avoid a deadlock with ufs.
 905          */
 906         if (nbl_need_check(vp)) {
 907                 int svmand;
 908 
 909                 nbl_start_crit(vp, RW_READER);
 910                 in_crit = 1;
 911                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 912                 if (error != 0)
 913                         goto out;
 914                 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
 915                     NULL)) {
 916                         error = EACCES;
 917                         goto out;
 918                 }
 919         }
 920 
 921         (void) VOP_RWLOCK(vp, rwflag, NULL);
 922 
 923         fileoff = fp->f_offset;
 924 
 925         /*
 926          * Behaviour is the same as write(); see the comments there.
 927          */
 928 
 929         if (vp->v_type == VREG) {
 930                 if (fileoff >= curproc->p_fsz_ctl) {
 931                         VOP_RWUNLOCK(vp, rwflag, NULL);
 932                         mutex_enter(&curproc->p_lock);
 933                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 934                             curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
 935                         mutex_exit(&curproc->p_lock);
 936                         error = EFBIG;
 937                         goto out;
 938                 }
 939                 if (fileoff >= OFFSET_MAX(fp)) {
 940                         VOP_RWUNLOCK(vp, rwflag, NULL);
 941                         error = EFBIG;
 942                         goto out;
 943                 }
 944                 if (fileoff + count > OFFSET_MAX(fp))
 945                         count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 946         }
 947         auio.uio_loffset = fileoff;
 948         auio.uio_iov = aiov;
 949         auio.uio_iovcnt = iovcnt;
 950         auio.uio_resid = bcount = count;
 951         auio.uio_segflg = UIO_USERSPACE;
 952         auio.uio_llimit = curproc->p_fsz_ctl;
 953         auio.uio_fmode = fflag;
 954         auio.uio_extflg = UIO_COPY_DEFAULT;
 955 
 956         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 957 
 958         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
 959         count -= auio.uio_resid;
 960         CPU_STATS_ENTER_K();
 961         cp = CPU;
 962         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
 963         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
 964         CPU_STATS_EXIT_K();
 965         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
 966 
 967         if (vp->v_type == VFIFO)     /* Backward compatibility */
 968                 fp->f_offset = count;
 969         else if (((fp->f_flag & FAPPEND) == 0) ||
 970             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 971                 fp->f_offset = auio.uio_loffset;
 972         VOP_RWUNLOCK(vp, rwflag, NULL);
 973 
 974         if (error == EINTR && count != 0)
 975                 error = 0;
 976 out:
 977         if (in_crit)
 978                 nbl_end_crit(vp);
 979         releasef(fdes);
 980         if (aiovlen != 0)
 981                 kmem_free(aiov, aiovlen);
 982         if (error)
 983                 return (set_errno(error));
 984         return (count);
 985 }
 986 
 987 ssize_t
 988 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 989     off_t extended_offset)
 990 {
 991         struct uio auio;
 992         struct iovec buf[IOV_MAX_STACK], *aiov = buf;
 993         int aiovlen = 0;
 994         file_t *fp;
 995         register vnode_t *vp;
 996         struct cpu *cp;
 997         int fflag, ioflag, rwflag;
 998         ssize_t count, bcount;
 999         int error = 0;
1000         int i;
1001 
1002         /*
1003          * In a 64-bit kernel, this interface supports native 64-bit
1004          * applications as well as 32-bit applications using both standard and
1005          * large-file access. For 32-bit large-file aware applications, the
1006          * offset is passed as two parameters which are joined into the actual
1007          * offset used. The 64-bit libc always passes 0 for the extended_offset.
1008          * Note that off_t is a signed value, but the preadv/pwritev API treats
1009          * the offset as a position in the file for the operation, so passing
1010          * a negative value will likely fail the maximum offset checks below
1011          * because we convert it to an unsigned value which will be larger than
1012          * the maximum valid offset.
1013          */
1014 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1015         u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1016             (u_offset_t)offset;
1017 #else /* _SYSCALL32_IMPL || _ILP32 */
1018         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1019 #endif /* _SYSCALL32_IMPL || _ILP32 */
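        /*
         * fileoff now holds the caller's file position: 'offset' supplies
         * the low 32 bits and 'extended_offset' the high 32 bits when a
         * 32-bit large-file caller splits the position across the two
         * arguments.
         */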
1020 
1021         int in_crit = 0;
1022 
1023         if (iovcnt <= 0 || iovcnt > IOV_MAX)
1024                 return (set_errno(EINVAL));
1025 
1026         if (iovcnt > IOV_MAX_STACK) {
1027                 aiovlen = iovcnt * sizeof (iovec_t);
1028                 aiov = kmem_alloc(aiovlen, KM_SLEEP);
1029         }
1030 
1031 #ifdef _SYSCALL32_IMPL
1032         /*
1033          * 32-bit callers need to have their iovec expanded,
1034          * while ensuring that they can't move more than 2Gbytes
1035          * of data in a single call.
1036          */
1037         if (get_udatamodel() == DATAMODEL_ILP32) {
1038                 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1039                 int aiov32len;
1040                 ssize32_t count32;
1041 
1042                 aiov32len = iovcnt * sizeof (iovec32_t);
1043                 if (aiovlen != 0)
1044                         aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1045 
1046                 if (copyin(iovp, aiov32, aiov32len)) {
1047                         if (aiovlen != 0) {
1048                                 kmem_free(aiov32, aiov32len);
1049                                 kmem_free(aiov, aiovlen);
1050                         }
1051                         return (set_errno(EFAULT));
1052                 }
1053 
1054                 count32 = 0;
1055                 for (i = 0; i < iovcnt; i++) {
1056                         ssize32_t iovlen32 = aiov32[i].iov_len;
1057                         count32 += iovlen32;
1058                         if (iovlen32 < 0 || count32 < 0) {
1059                                 if (aiovlen != 0) {
1060                                         kmem_free(aiov32, aiov32len);
1061                                         kmem_free(aiov, aiovlen);
1062                                 }
1063                                 return (set_errno(EINVAL));
1064                         }
1065                         aiov[i].iov_len = iovlen32;
1066                         aiov[i].iov_base =
1067                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1068                 }
1069                 if (aiovlen != 0)
1070                         kmem_free(aiov32, aiov32len);
1071         } else
1072 #endif /* _SYSCALL32_IMPL */
1073                 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1074                         if (aiovlen != 0)
1075                                 kmem_free(aiov, aiovlen);
1076                         return (set_errno(EFAULT));
1077                 }
1078 
1079         count = 0;
1080         for (i = 0; i < iovcnt; i++) {
1081                 ssize_t iovlen = aiov[i].iov_len;
1082                 count += iovlen;
1083                 if (iovlen < 0 || count < 0) {
1084                         if (aiovlen != 0)
1085                                 kmem_free(aiov, aiovlen);
1086                         return (set_errno(EINVAL));
1087                 }
1088         }
1089 
1090         if ((bcount = count) < 0) {
1091                 if (aiovlen != 0)
1092                         kmem_free(aiov, aiovlen);
1093                 return (set_errno(EINVAL));
1094         }
1095         if ((fp = getf(fdes)) == NULL) {
1096                 if (aiovlen != 0)
1097                         kmem_free(aiov, aiovlen);
1098                 return (set_errno(EBADF));
1099         }
1100         if (((fflag = fp->f_flag) & FREAD) == 0) {
1101                 error = EBADF;
1102                 goto out;
1103         }
1104         vp = fp->f_vnode;
1105         rwflag = 0;
1106 
1107         /*
1108          * Behaviour is the same as read(2); see the comments in read() above.
1109          */
1110         if (vp->v_type == VREG) {
1111                 if (bcount == 0)
1112                         goto out;
1113 
1114                 /* Handle offset past maximum offset allowed for file. */
1115                 if (fileoff >= OFFSET_MAX(fp)) {
1116                         struct vattr va;
1117                         va.va_mask = AT_SIZE;
1118 
1119                         error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
1120                         if (error == 0)  {
1121                                 if (fileoff >= va.va_size) {
1122                                         count = 0;
1123                                 } else {
1124                                         error = EOVERFLOW;
1125                                 }
1126                         }
1127                         goto out;
1128                 }
1129 
1130                 ASSERT(bcount == count);
1131 
1132                 /* Note: modified count used in nbl_conflict() call below. */
1133                 if ((fileoff + count) > OFFSET_MAX(fp))
1134                         count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1135 
1136         } else if (vp->v_type == VFIFO) {
1137                 error = ESPIPE;
1138                 goto out;
1139         }
1140         /*
1141          * We have to enter the critical region before calling VOP_RWLOCK
1142          * to avoid a deadlock with ufs.
1143          */
1144         if (nbl_need_check(vp)) {
1145                 int svmand;
1146 
1147                 nbl_start_crit(vp, RW_READER);
1148                 in_crit = 1;
1149                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1150                 if (error != 0)
1151                         goto out;
1152                 if (nbl_conflict(vp, NBL_READ, fileoff, count, svmand, NULL)) {
1153                         error = EACCES;
1154                         goto out;
1155                 }
1156         }
1157 
1158         (void) VOP_RWLOCK(vp, rwflag, NULL);
1159 
1160         auio.uio_loffset = fileoff;
1161         auio.uio_iov = aiov;
1162         auio.uio_iovcnt = iovcnt;
1163         auio.uio_resid = bcount = count;
1164         auio.uio_segflg = UIO_USERSPACE;
1165         auio.uio_llimit = MAXOFFSET_T;
1166         auio.uio_fmode = fflag;
1167         if (bcount <= copyout_max_cached)
1168                 auio.uio_extflg = UIO_COPY_CACHED;
1169         else
1170                 auio.uio_extflg = UIO_COPY_DEFAULT;
1171 
1172         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1173         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1174         count -= auio.uio_resid;
1175         CPU_STATS_ENTER_K();
1176         cp = CPU;
1177         CPU_STATS_ADDQ(cp, sys, sysread, 1);
1178         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1179         CPU_STATS_EXIT_K();
1180         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1181 
1182         VOP_RWUNLOCK(vp, rwflag, NULL);
1183 
1184         if (error == EINTR && count != 0)
1185                 error = 0;
1186 out:
1187         if (in_crit)
1188                 nbl_end_crit(vp);
1189         releasef(fdes);
1190         if (aiovlen != 0)
1191                 kmem_free(aiov, aiovlen);
1192         if (error)
1193                 return (set_errno(error));
1194         return (count);
1195 }
1196 
1197 ssize_t
1198 pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
1199     off_t extended_offset)
1200 {
1201         struct uio auio;
1202         struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1203         int aiovlen = 0;
1204         file_t *fp;
1205         register vnode_t *vp;
1206         struct cpu *cp;
1207         int fflag, ioflag, rwflag;
1208         ssize_t count, bcount;
1209         int error = 0;
1210         int i;
1211 
1212         /*
1213          * See the comment in preadv for how the offset is handled.
1214          */
1215 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1216         u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1217             (u_offset_t)offset;
1218 #else /* _SYSCALL32_IMPL || _ILP32 */
1219         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1220 #endif /* _SYSCALL32_IMPL || _ILP32 */
1221 
1222         int in_crit = 0;
1223 
1224         if (iovcnt <= 0 || iovcnt > IOV_MAX)
1225                 return (set_errno(EINVAL));
1226 
1227         if (iovcnt > IOV_MAX_STACK) {
1228                 aiovlen = iovcnt * sizeof (iovec_t);
1229                 aiov = kmem_alloc(aiovlen, KM_SLEEP);
1230         }
1231 
1232 #ifdef _SYSCALL32_IMPL
1233         /*
1234          * 32-bit callers need to have their iovec expanded,
1235          * while ensuring that they can't move more than 2Gbytes
1236          * of data in a single call.
1237          */
1238         if (get_udatamodel() == DATAMODEL_ILP32) {
1239                 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1240                 int aiov32len;
1241                 ssize32_t count32;
1242 
1243                 aiov32len = iovcnt * sizeof (iovec32_t);
1244                 if (aiovlen != 0)
1245                         aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1246 
1247                 if (copyin(iovp, aiov32, aiov32len)) {
1248                         if (aiovlen != 0) {
1249                                 kmem_free(aiov32, aiov32len);
1250                                 kmem_free(aiov, aiovlen);
1251                         }
1252                         return (set_errno(EFAULT));
1253                 }
1254 
1255                 count32 = 0;
1256                 for (i = 0; i < iovcnt; i++) {
1257                         ssize32_t iovlen32 = aiov32[i].iov_len;
1258                         count32 += iovlen32;
1259                         if (iovlen32 < 0 || count32 < 0) {
1260                                 if (aiovlen != 0) {
1261                                         kmem_free(aiov32, aiov32len);
1262                                         kmem_free(aiov, aiovlen);
1263                                 }
1264                                 return (set_errno(EINVAL));
1265                         }
1266                         aiov[i].iov_len = iovlen32;
1267                         aiov[i].iov_base =
1268                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1269                 }
1270                 if (aiovlen != 0)
1271                         kmem_free(aiov32, aiov32len);
1272         } else
1273 #endif /* _SYSCALL32_IMPL */
1274                 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1275                         if (aiovlen != 0)
1276                                 kmem_free(aiov, aiovlen);
1277                         return (set_errno(EFAULT));
1278                 }
1279 
1280         count = 0;
1281         for (i = 0; i < iovcnt; i++) {
1282                 ssize_t iovlen = aiov[i].iov_len;
1283                 count += iovlen;
1284                 if (iovlen < 0 || count < 0) {
1285                         if (aiovlen != 0)
1286                                 kmem_free(aiov, aiovlen);
1287                         return (set_errno(EINVAL));
1288                 }
1289         }
1290 
1291         if ((bcount = count) < 0) {
1292                 if (aiovlen != 0)
1293                         kmem_free(aiov, aiovlen);
1294                 return (set_errno(EINVAL));
1295         }
1296         if ((fp = getf(fdes)) == NULL) {
1297                 if (aiovlen != 0)
1298                         kmem_free(aiov, aiovlen);
1299                 return (set_errno(EBADF));
1300         }
1301         if (((fflag = fp->f_flag) & FWRITE) == 0) {
1302                 error = EBADF;
1303                 goto out;
1304         }
1305         vp = fp->f_vnode;
1306         rwflag = 1;
1307 
1308         /*
1309          * The kernel's write(2) code checks OFFSET_MAX and the rctl, and
1310          * returns EFBIG when fileoff exceeds either limit. We do the same.
1311          */
1312         if (vp->v_type == VREG) {
1313                 if (bcount == 0)
1314                         goto out;
1315 
1316                 /*
1317                  * Don't allow pwritev to cause file size to exceed the proper
1318                  * offset limit.
1319                  */
1320                 if (fileoff >= OFFSET_MAX(fp)) {
1321                         error = EFBIG;
1322                         goto out;
1323                 }
1324 
1325                 /*
1326                  * Take appropriate action if we are trying
1327                  * to write above the resource limit.
1328                  */
1329                 if (fileoff >= curproc->p_fsz_ctl) {
1330                         mutex_enter(&curproc->p_lock);
1331                         /*
1332                          * The return value is ignored deliberately: it
1333                          * only reports which rctl actions were taken,
1334                          * and since we are already returning an error
1335                          * nothing in this path depends on that
1336                          * information.
1337                          */
1338                         (void) rctl_action(
1339                             rctlproc_legacy[RLIMIT_FSIZE],
1340                             curproc->p_rctls, curproc,
1341                             RCA_UNSAFE_SIGINFO);
1342                         mutex_exit(&curproc->p_lock);
1343 
1344                         error = EFBIG;
1345                         goto out;
1346                 }
1347 
1348                 ASSERT(bcount == count);
1349 
1350                 /* Note: modified count used in nbl_conflict() call below. */
1351                 if ((fileoff + count) > OFFSET_MAX(fp))
1352                         count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1353 
1354         } else if (vp->v_type == VFIFO) {
1355                 error = ESPIPE;
1356                 goto out;
1357         }
1358         /*
1359          * We have to enter the critical region before calling VOP_RWLOCK
1360          * to avoid a deadlock with ufs.
1361          */
1362         if (nbl_need_check(vp)) {
1363                 int svmand;
1364 
1365                 nbl_start_crit(vp, RW_READER);
1366                 in_crit = 1;
1367                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1368                 if (error != 0)
1369                         goto out;
1370                 if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
1371                         error = EACCES;
1372                         goto out;
1373                 }
1374         }
1375 
1376         (void) VOP_RWLOCK(vp, rwflag, NULL);
1377 
1378         auio.uio_loffset = fileoff;
1379         auio.uio_iov = aiov;
1380         auio.uio_iovcnt = iovcnt;
1381         auio.uio_resid = bcount = count;
1382         auio.uio_segflg = UIO_USERSPACE;
1383         auio.uio_llimit = curproc->p_fsz_ctl;
1384         auio.uio_fmode = fflag;
1385         auio.uio_extflg = UIO_COPY_CACHED;
1386         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1387         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1388         count -= auio.uio_resid;
1389         CPU_STATS_ENTER_K();
1390         cp = CPU;
1391         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1392         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
1393         CPU_STATS_EXIT_K();
1394         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1395 
1396         VOP_RWUNLOCK(vp, rwflag, NULL);
1397 
1398         if (error == EINTR && count != 0)
1399                 error = 0;
1400 out:
1401         if (in_crit)
1402                 nbl_end_crit(vp);
1403         releasef(fdes);
1404         if (aiovlen != 0)
1405                 kmem_free(aiov, aiovlen);
1406         if (error)
1407                 return (set_errno(error));
1408         return (count);
1409 }
1410 
1411 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1412 
1413 /*
1414  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1415  */
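/*
 * These are the pread64() and pwrite64() large-file transitional
 * interfaces, which let 32-bit processes reach offsets beyond MAXOFF32_T.
 */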
1416 ssize32_t
1417 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1418     uint32_t offset_2)
1419 {
1420         struct uio auio;
1421         struct iovec aiov;
1422         file_t *fp;
1423         register vnode_t *vp;
1424         struct cpu *cp;
1425         int fflag, ioflag, rwflag;
1426         ssize_t bcount;
1427         int error = 0;
1428         u_offset_t fileoff;
1429         int in_crit = 0;
1430 
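        /*
         * Reassemble the 64-bit offset from its two 32-bit halves; which
         * argument carries the low half depends on endianness, matching
         * how the 32-bit ABI splits a 64-bit value across two argument
         * slots.
         */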
1431 #if defined(_LITTLE_ENDIAN)
1432         fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1433 #else
1434         fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1435 #endif
1436 
1437         if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1438                 return (set_errno(EINVAL));
1439 
1440         if ((fp = getf(fdes)) == NULL)
1441                 return (set_errno(EBADF));
1442         if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1443                 error = EBADF;
1444                 goto out;
1445         }
1446 
1447         rwflag = 0;
1448         vp = fp->f_vnode;
1449 
1450         if (vp->v_type == VREG) {
1451 
1452                 if (bcount == 0)
1453                         goto out;
1454 
1455                 /*
1456                  * Same as pread. See comments in pread.
1457                  */
1458 
1459                 if (fileoff > MAXOFFSET_T) {
1460                         error = EINVAL;
1461                         goto out;
1462                 }
1463                 if (fileoff + bcount > MAXOFFSET_T)
1464                         bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1465         } else if (vp->v_type == VFIFO) {
1466                 error = ESPIPE;
1467                 goto out;
1468         }
1469 
1470         /*
1471          * We have to enter the critical region before calling VOP_RWLOCK
1472          * to avoid a deadlock with ufs.
1473          */
1474         if (nbl_need_check(vp)) {
1475                 int svmand;
1476 
1477                 nbl_start_crit(vp, RW_READER);
1478                 in_crit = 1;
1479                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1480                 if (error != 0)
1481                         goto out;
1482                 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1483                     NULL)) {
1484                         error = EACCES;
1485                         goto out;
1486                 }
1487         }
1488 
1489         aiov.iov_base = cbuf;
1490         aiov.iov_len = bcount;
1491         (void) VOP_RWLOCK(vp, rwflag, NULL);
1492         auio.uio_loffset = fileoff;
1493 
1494         /*
1495          * Note: file size can never be greater than MAXOFFSET_T.
1496          * If we ever start supporting 128-bit files, code similar
1497          * to that in pread belongs here.  For now we can skip the
1498          * VOP_GETATTR(), because fileoff == MAXOFFSET_T already
1499          * implies that the offset is greater than or equal to the
1500          * file size.
1501          */
1502         auio.uio_iov = &aiov;
1503         auio.uio_iovcnt = 1;
1504         auio.uio_resid = bcount;
1505         auio.uio_segflg = UIO_USERSPACE;
1506         auio.uio_llimit = MAXOFFSET_T;
1507         auio.uio_fmode = fflag;
1508         auio.uio_extflg = UIO_COPY_CACHED;
1509 
1510         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1511 
1512         /* If read sync is not asked for, filter sync flags */
1513         if ((ioflag & FRSYNC) == 0)
1514                 ioflag &= ~(FSYNC|FDSYNC);
1515         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1516         bcount -= auio.uio_resid;
1517         CPU_STATS_ENTER_K();
1518         cp = CPU;
1519         CPU_STATS_ADDQ(cp, sys, sysread, 1);
1520         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1521         CPU_STATS_EXIT_K();
1522         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1523         VOP_RWUNLOCK(vp, rwflag, NULL);
1524 
1525         if (error == EINTR && bcount != 0)
1526                 error = 0;
1527 out:
1528         if (in_crit)
1529                 nbl_end_crit(vp);
1530         releasef(fdes);
1531         if (error)
1532                 return (set_errno(error));
1533         return (bcount);
1534 }
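
/*
 * Illustrative note (not part of the original file): 32-bit applications
 * normally reach pread64() and pwrite64() through the transitional
 * large-file interfaces, e.g. when compiled with _LARGEFILE64_SOURCE:
 *
 *	#include <unistd.h>
 *
 *	char buf[512];
 *	ssize_t n = pread64(fd, buf, sizeof (buf), (off64_t)5 << 32);
 *
 * The libc stubs split the off64_t into the two 32-bit words consumed
 * above.
 */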
1535 
1536 /*
1537  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1538  */
1539 ssize32_t
1540 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1541     uint32_t offset_2)
1542 {
1543         struct uio auio;
1544         struct iovec aiov;
1545         file_t *fp;
1546         register vnode_t *vp;
1547         struct cpu *cp;
1548         int fflag, ioflag, rwflag;
1549         ssize_t bcount;
1550         int error = 0;
1551         u_offset_t fileoff;
1552         int in_crit = 0;
1553 
1554 #if defined(_LITTLE_ENDIAN)
1555         fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1556 #else
1557         fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1558 #endif
1559 
1560         if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1561                 return (set_errno(EINVAL));
1562         if ((fp = getf(fdes)) == NULL)
1563                 return (set_errno(EBADF));
1564         if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1565                 error = EBADF;
1566                 goto out;
1567         }
1568 
1569         rwflag = 1;
1570         vp = fp->f_vnode;
1571 
1572         if (vp->v_type == VREG) {
1573 
1574                 if (bcount == 0)
1575                         goto out;
1576 
1577                 /*
1578                  * See comments in pwrite.
1579                  */
1580                 if (fileoff > MAXOFFSET_T) {
1581                         error = EINVAL;
1582                         goto out;
1583                 }
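                /*
                 * A write that would begin at or beyond the process
                 * file-size limit (the cached process.max-file-size
                 * resource control) triggers the control's configured
                 * action -- SIGXFSZ for the legacy RLIMIT_FSIZE -- and
                 * then fails with EFBIG.
                 */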
1584                 if (fileoff >= curproc->p_fsz_ctl) {
1585                         mutex_enter(&curproc->p_lock);
1586                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1587                             curproc->p_rctls, curproc, RCA_SAFE);
1588                         mutex_exit(&curproc->p_lock);
1589                         error = EFBIG;
1590                         goto out;
1591                 }
1592                 if (fileoff == MAXOFFSET_T) {
1593                         error = EFBIG;
1594                         goto out;
1595                 }
1596                 if (fileoff + bcount > MAXOFFSET_T)
1597                         bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1598         } else if (vp->v_type == VFIFO) {
1599                 error = ESPIPE;
1600                 goto out;
1601         }
1602 
1603         /*
1604          * We have to enter the critical region before calling VOP_RWLOCK
1605          * to avoid a deadlock with ufs.
1606          */
1607         if (nbl_need_check(vp)) {
1608                 int svmand;
1609 
1610                 nbl_start_crit(vp, RW_READER);
1611                 in_crit = 1;
1612                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1613                 if (error != 0)
1614                         goto out;
1615                 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1616                     NULL)) {
1617                         error = EACCES;
1618                         goto out;
1619                 }
1620         }
1621 
1622         aiov.iov_base = cbuf;
1623         aiov.iov_len = bcount;
1624         (void) VOP_RWLOCK(vp, rwflag, NULL);
1625         auio.uio_loffset = fileoff;
1626         auio.uio_iov = &aiov;
1627         auio.uio_iovcnt = 1;
1628         auio.uio_resid = bcount;
1629         auio.uio_segflg = UIO_USERSPACE;
1630         auio.uio_llimit = curproc->p_fsz_ctl;
1631         auio.uio_fmode = fflag;
1632         auio.uio_extflg = UIO_COPY_CACHED;
1633 
1634         /*
1635          * The SUSv4 POSIX specification states:
1636          *      The pwrite() function shall be equivalent to write(), except
1637          *      that it writes into a given position and does not change
1638          *      the file offset (regardless of whether O_APPEND is set).
1639          * To make this true, we omit the FAPPEND flag from ioflag.
1640          */
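        /*
         * Illustrative note (not part of the original file): at user level
         * the distinction looks like the sketch below; even with O_APPEND
         * set, pwrite() lands at the requested offset and leaves the file
         * offset untouched, while write() appends.  (Hypothetical example,
         * error handling omitted.)
         *
         *	#include <fcntl.h>
         *	#include <unistd.h>
         *
         *	int fd = open("f", O_WRONLY | O_APPEND);
         *	(void) pwrite(fd, "X", 1, 0);	// writes at offset 0, not EOF
         *	(void) write(fd, "Y", 1);	// appends at end of file
         */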
1641         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1642 
1643         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1644         bcount -= auio.uio_resid;
1645         CPU_STATS_ENTER_K();
1646         cp = CPU;
1647         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1648         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1649         CPU_STATS_EXIT_K();
1650         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1651         VOP_RWUNLOCK(vp, rwflag, NULL);
1652 
1653         if (error == EINTR && bcount != 0)
1654                 error = 0;
1655 out:
1656         if (in_crit)
1657                 nbl_end_crit(vp);
1658         releasef(fdes);
1659         if (error)
1660                 return (set_errno(error));
1661         return (bcount);
1662 }
1663 
1664 #endif  /* _SYSCALL32_IMPL || _ILP32 */
1665 
1666 #ifdef _SYSCALL32_IMPL
1667 /*
1668  * Tail-call elimination of xxx32() down to xxx()
1669  *
1670  * A number of xxx32 system calls take a len (or count) argument and
1671  * return a number in the range [0,len] or -1 on error.
1672  * Given an ssize32_t input len, the downcall xxx() will return
1673  * a 64-bit value that is -1 or in the range [0,len] which actually
1674  * is a proper return value for the xxx32 call.  So even though the
1675  * xxx32 calls can be thought of as returning an ssize32_t, they are
1676  * declared as returning a ssize_t, which enables tail-call elimination.
1677  *
1678  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1679  * down negative input values as such and let the downcall handle error
1680  * reporting.  Functions covered by this comment are:
1681  *
1682  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1683  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1684  * readlink.c:     readlink32.
1685  */
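
/*
 * Illustrative note (not part of the original file): the (ssize32_t) cast
 * in the wrappers below is what preserves "negative" counts across the
 * widening.  A minimal sketch of the difference, assuming a 32-bit
 * size32_t/ssize32_t and a 64-bit size_t:
 *
 *	size32_t count = 0xffffffffU;	// caller passed (size_t)-1
 *
 *	(size_t)count;			// 0x00000000ffffffff: looks like a
 *					// huge but positive byte count
 *	(size_t)(ssize32_t)count;	// 0xffffffffffffffff: read()/write()
 *					// see a negative ssize_t and return
 *					// EINVAL
 */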
1686 
1687 ssize_t
1688 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1689 {
1690         return (read(fdes,
1691             (void *)(uintptr_t)cbuf, (ssize32_t)count));
1692 }
1693 
1694 ssize_t
1695 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1696 {
1697         return (write(fdes,
1698             (void *)(uintptr_t)cbuf, (ssize32_t)count));
1699 }
1700 
1701 ssize_t
1702 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1703 {
1704         return (pread(fdes,
1705             (void *)(uintptr_t)cbuf, (ssize32_t)count,
1706             (off_t)(uint32_t)offset));
1707 }
1708 
1709 ssize_t
1710 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1711 {
1712         return (pwrite(fdes,
1713             (void *)(uintptr_t)cbuf, (ssize32_t)count,
1714             (off_t)(uint32_t)offset));
1715 }
1716 
1717 ssize_t
1718 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1719 {
1720         return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1721 }
1722 
1723 ssize_t
1724 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1725 {
1726         return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1727 }
1728 #endif  /* _SYSCALL32_IMPL */