1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015, Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35 
  36 #include <sys/param.h>
  37 #include <sys/isa_defs.h>
  38 #include <sys/types.h>
  39 #include <sys/inttypes.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/cred.h>
  42 #include <sys/user.h>
  43 #include <sys/systm.h>
  44 #include <sys/errno.h>
  45 #include <sys/vnode.h>
  46 #include <sys/file.h>
  47 #include <sys/proc.h>
  48 #include <sys/cpuvar.h>
  49 #include <sys/uio.h>
  50 #include <sys/debug.h>
  51 #include <sys/rctl.h>
  52 #include <sys/nbmlock.h>
  53 #include <sys/limits.h>
  54 
/*
 * Threshold (in bytes) below which read(2)/readv(2) copy data out through
 * the CPU caches (UIO_COPY_CACHED).  Larger transfers use UIO_COPY_DEFAULT,
 * presumably to avoid polluting the caches with data that will not be
 * re-read soon.
 */
#define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */

size_t copyout_max_cached = COPYOUT_MAX_CACHE;	/* global so it's patchable */
  58 
/*
 * read, write, pread, pwrite, readv, and writev syscalls.
 *
 * 64-bit open: all opens are large-file opens.
 * Large Files: the behaviour of read depends on whether the fd
 *              corresponds to a large-file open or not.
 * 32-bit open: FOFFMAX flag not set.
 *              Reads succeed up to offset MAXOFF32_T - 1; a read at
 *              MAXOFF32_T returns EOVERFLOW if count is non-zero and
 *              the file size is > MAXOFF32_T.  If the file size is
 *              <= MAXOFF32_T, a read at >= MAXOFF32_T returns EOF.
 */
  71 
/*
 * Native system call: read(2).
 *
 * Read up to `count` bytes from the object underlying file descriptor
 * `fdes` into the user buffer `cbuf`, starting at the file's current
 * offset.  Returns the number of bytes transferred, or -1 with errno
 * set on failure.
 */
ssize_t
read(int fdes, void *cbuf, size_t count)
{
        struct uio auio;
        struct iovec aiov;
        file_t *fp;
        register vnode_t *vp;
        struct cpu *cp;
        int fflag, ioflag, rwflag;
        ssize_t cnt, bcount;
        int error = 0;
        u_offset_t fileoff;
        int in_crit = 0;	/* nonzero iff we hold an nbmand crit region */

        /* A count that does not fit in a ssize_t is invalid. */
        if ((cnt = (ssize_t)count) < 0)
                return (set_errno(EINVAL));
        if ((fp = getf(fdes)) == NULL)
                return (set_errno(EBADF));
        if (((fflag = fp->f_flag) & FREAD) == 0) {
                error = EBADF;
                goto out;
        }
        vp = fp->f_vnode;

        /* A zero-length read of a regular file succeeds trivially. */
        if (vp->v_type == VREG && cnt == 0) {
                goto out;
        }

        rwflag = 0;	/* 0 => acquire VOP_RWLOCK as a reader */
        aiov.iov_base = cbuf;
        aiov.iov_len = cnt;

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with write() calls.
         */
        if (nbl_need_check(vp)) {
                int svmand;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto out;
                /* A conflicting mandatory lock denies the read. */
                if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
                    NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        (void) VOP_RWLOCK(vp, rwflag, NULL);

        /*
         * We do the following checks inside VOP_RWLOCK so as to
         * prevent file size from changing while these checks are
         * being done. Also, we load fp's offset to the local
         * variable fileoff because we can have a parallel lseek
         * going on (f_offset is not protected by any lock) which
         * could change f_offset. We need to see the value only
         * once here and take a decision. Seeing it more than once
         * can lead to incorrect functionality.
         */

        fileoff = (u_offset_t)fp->f_offset;
        if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
                struct vattr va;
                va.va_mask = AT_SIZE;
                if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        goto out;
                }
                if (fileoff >= va.va_size) {
                        /* Offset at or past EOF: plain end-of-file. */
                        cnt = 0;
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        goto out;
                } else {
                        /*
                         * Data exists beyond this fd's representable
                         * offset maximum.
                         */
                        error = EOVERFLOW;
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        goto out;
                }
        }
        /* Trim the request so it never crosses the fd's offset maximum. */
        if ((vp->v_type == VREG) &&
            (fileoff + cnt > OFFSET_MAX(fp))) {
                cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
        }
        auio.uio_loffset = fileoff;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = bcount = cnt;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_llimit = MAXOFFSET_T;
        auio.uio_fmode = fflag;
        /*
         * Only copy out through the CPU caches when the count is small
         * enough; larger transfers use the default (bypass) copy path.
         */
        if (bcount <= copyout_max_cached)
                auio.uio_extflg = UIO_COPY_CACHED;
        else
                auio.uio_extflg = UIO_COPY_DEFAULT;

        ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

        /* If read sync is not asked for, filter sync flags */
        if ((ioflag & FRSYNC) == 0)
                ioflag &= ~(FSYNC|FDSYNC);
        error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
        cnt -= auio.uio_resid;	/* bytes actually transferred */

        /* Account the read in CPU and lwp resource-usage statistics. */
        CPU_STATS_ENTER_K();
        cp = CPU;
        CPU_STATS_ADDQ(cp, sys, sysread, 1);
        CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
        CPU_STATS_EXIT_K();
        ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;

        if (vp->v_type == VFIFO)	/* Backward compatibility */
                fp->f_offset = cnt;
        else if (((fp->f_flag & FAPPEND) == 0) ||
            (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
                fp->f_offset = auio.uio_loffset;
        VOP_RWUNLOCK(vp, rwflag, NULL);

        /* A read interrupted after partial transfer still succeeds. */
        if (error == EINTR && cnt != 0)
                error = 0;
out:
        if (in_crit)
                nbl_end_crit(vp);
        releasef(fdes);
        if (error)
                return (set_errno(error));
        return (cnt);
}
 207 
/*
 * Native system call: write(2).
 *
 * Write up to `count` bytes from the user buffer `cbuf` to the object
 * underlying file descriptor `fdes`, starting at the file's current
 * offset.  Returns the number of bytes transferred, or -1 with errno
 * set on failure.
 */
ssize_t
write(int fdes, void *cbuf, size_t count)
{
        struct uio auio;
        struct iovec aiov;
        file_t *fp;
        register vnode_t *vp;
        struct cpu *cp;
        int fflag, ioflag, rwflag;
        ssize_t cnt, bcount;
        int error = 0;
        u_offset_t fileoff;
        int in_crit = 0;	/* nonzero iff we hold an nbmand crit region */

        /* A count that does not fit in a ssize_t is invalid. */
        if ((cnt = (ssize_t)count) < 0)
                return (set_errno(EINVAL));
        if ((fp = getf(fdes)) == NULL)
                return (set_errno(EBADF));
        if (((fflag = fp->f_flag) & FWRITE) == 0) {
                error = EBADF;
                goto out;
        }
        vp = fp->f_vnode;

        /* A zero-length write to a regular file succeeds trivially. */
        if (vp->v_type == VREG && cnt == 0) {
                goto out;
        }

        rwflag = 1;	/* 1 => acquire VOP_RWLOCK as a writer */
        aiov.iov_base = cbuf;
        aiov.iov_len = cnt;

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                int svmand;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto out;
                /* A conflicting mandatory lock denies the write. */
                if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
                    NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        (void) VOP_RWLOCK(vp, rwflag, NULL);

        fileoff = fp->f_offset;
        if (vp->v_type == VREG) {

                /*
                 * We raise psignal if write for >0 bytes causes
                 * it to exceed the ulimit.
                 */
                if (fileoff >= curproc->p_fsz_ctl) {
                        VOP_RWUNLOCK(vp, rwflag, NULL);

                        /* Deliver the RLIMIT_FSIZE rctl action (SIGXFSZ). */
                        mutex_enter(&curproc->p_lock);
                        (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
                            curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
                        mutex_exit(&curproc->p_lock);

                        error = EFBIG;
                        goto out;
                }
                /*
                 * We return EFBIG if write is done at an offset
                 * greater than the offset maximum for this file structure.
                 */

                if (fileoff >= OFFSET_MAX(fp)) {
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        error = EFBIG;
                        goto out;
                }
                /*
                 * Limit the bytes to be written  upto offset maximum for
                 * this open file structure.
                 */
                if (fileoff + cnt > OFFSET_MAX(fp))
                        cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
        }
        auio.uio_loffset = fileoff;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = bcount = cnt;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_llimit = curproc->p_fsz_ctl;	/* enforce file-size rctl */
        auio.uio_fmode = fflag;
        auio.uio_extflg = UIO_COPY_DEFAULT;

        ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

        error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
        cnt -= auio.uio_resid;	/* bytes actually transferred */

        /* Account the write in CPU and lwp resource-usage statistics. */
        CPU_STATS_ENTER_K();
        cp = CPU;
        CPU_STATS_ADDQ(cp, sys, syswrite, 1);
        CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
        CPU_STATS_EXIT_K();
        ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;

        if (vp->v_type == VFIFO)	/* Backward compatibility */
                fp->f_offset = cnt;
        else if (((fp->f_flag & FAPPEND) == 0) ||
            (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
                fp->f_offset = auio.uio_loffset;
        VOP_RWUNLOCK(vp, rwflag, NULL);

        /* A write interrupted after partial transfer still succeeds. */
        if (error == EINTR && cnt != 0)
                error = 0;
out:
        if (in_crit)
                nbl_end_crit(vp);
        releasef(fdes);
        if (error)
                return (set_errno(error));
        return (cnt);
}
 336 
/*
 * pread(2): read from a given file offset without changing the file's
 * current offset.  The permissible offset range depends on the caller's
 * data model: ILP32 callers are limited to MAXOFF32_T, LP64 callers to
 * MAXOFFSET_T.  Returns bytes transferred, or -1 with errno set.
 */
ssize_t
pread(int fdes, void *cbuf, size_t count, off_t offset)
{
        struct uio auio;
        struct iovec aiov;
        file_t *fp;
        register vnode_t *vp;
        struct cpu *cp;
        int fflag, ioflag, rwflag;
        ssize_t bcount;
        int error = 0;
        /* Double cast keeps a negative 32-bit offset from sign-extending. */
        u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#ifdef _SYSCALL32_IMPL
        /* Offset limit depends on the caller's data model. */
        u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
            MAXOFF32_T : MAXOFFSET_T;
#else
        const u_offset_t maxoff = MAXOFF32_T;
#endif
        int in_crit = 0;	/* nonzero iff we hold an nbmand crit region */

        if ((bcount = (ssize_t)count) < 0)
                return (set_errno(EINVAL));

        if ((fp = getf(fdes)) == NULL)
                return (set_errno(EBADF));
        if (((fflag = fp->f_flag) & (FREAD)) == 0) {
                error = EBADF;
                goto out;
        }

        rwflag = 0;	/* 0 => acquire VOP_RWLOCK as a reader */
        vp = fp->f_vnode;

        if (vp->v_type == VREG) {

                if (bcount == 0)
                        goto out;

                /*
                 * Return EINVAL if an invalid offset comes to pread.
                 * Negative offset from user will cause this error.
                 */

                if (fileoff > maxoff) {
                        error = EINVAL;
                        goto out;
                }
                /*
                 * Limit offset such that we don't read or write
                 * a file beyond the maximum offset representable in
                 * an off_t structure.
                 */
                if (fileoff + bcount > maxoff)
                        bcount = (ssize_t)((offset_t)maxoff - fileoff);
        } else if (vp->v_type == VFIFO) {
                /* Pipes and FIFOs are not seekable. */
                error = ESPIPE;
                goto out;
        }

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                int svmand;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto out;
                /* A conflicting mandatory lock denies the read. */
                if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
                    NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        aiov.iov_base = cbuf;
        aiov.iov_len = bcount;
        (void) VOP_RWLOCK(vp, rwflag, NULL);
        if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
                struct vattr va;
                va.va_mask = AT_SIZE;
                if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        goto out;
                }
                VOP_RWUNLOCK(vp, rwflag, NULL);

                /*
                 * We have to return EOF if fileoff is >= file size.
                 */
                if (fileoff >= va.va_size) {
                        bcount = 0;
                        goto out;
                }

                /*
                 * File is greater than or equal to maxoff and therefore
                 * we return EOVERFLOW.
                 */
                error = EOVERFLOW;
                goto out;
        }
        auio.uio_loffset = fileoff;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = bcount;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_llimit = MAXOFFSET_T;
        auio.uio_fmode = fflag;
        auio.uio_extflg = UIO_COPY_CACHED;

        ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

        /* If read sync is not asked for, filter sync flags */
        if ((ioflag & FRSYNC) == 0)
                ioflag &= ~(FSYNC|FDSYNC);
        error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
        bcount -= auio.uio_resid;	/* bytes actually transferred */

        /* Account the read in CPU and lwp resource-usage statistics. */
        CPU_STATS_ENTER_K();
        cp = CPU;
        CPU_STATS_ADDQ(cp, sys, sysread, 1);
        CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
        CPU_STATS_EXIT_K();
        ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
        /* Note: fp->f_offset is deliberately NOT updated (pread semantics). */
        VOP_RWUNLOCK(vp, rwflag, NULL);

        /* A read interrupted after partial transfer still succeeds. */
        if (error == EINTR && bcount != 0)
                error = 0;
out:
        if (in_crit)
                nbl_end_crit(vp);
        releasef(fdes);
        if (error)
                return (set_errno(error));
        return (bcount);
}
 476 
/*
 * pwrite(2): write at a given file offset without changing the file's
 * current offset, regardless of O_APPEND (SUSv4).  The permissible
 * offset range depends on the caller's data model.  Returns bytes
 * transferred, or -1 with errno set.
 */
ssize_t
pwrite(int fdes, void *cbuf, size_t count, off_t offset)
{
        struct uio auio;
        struct iovec aiov;
        file_t *fp;
        register vnode_t *vp;
        struct cpu *cp;
        int fflag, ioflag, rwflag;
        ssize_t bcount;
        int error = 0;
        /* Double cast keeps a negative 32-bit offset from sign-extending. */
        u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#ifdef _SYSCALL32_IMPL
        /* Offset limit depends on the caller's data model. */
        u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
            MAXOFF32_T : MAXOFFSET_T;
#else
        const u_offset_t maxoff = MAXOFF32_T;
#endif
        int in_crit = 0;	/* nonzero iff we hold an nbmand crit region */

        if ((bcount = (ssize_t)count) < 0)
                return (set_errno(EINVAL));
        if ((fp = getf(fdes)) == NULL)
                return (set_errno(EBADF));
        if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
                error = EBADF;
                goto out;
        }

        rwflag = 1;	/* 1 => acquire VOP_RWLOCK as a writer */
        vp = fp->f_vnode;

        if (vp->v_type == VREG) {

                if (bcount == 0)
                        goto out;

                /*
                 * return EINVAL for offsets that cannot be
                 * represented in an off_t.
                 */
                if (fileoff > maxoff) {
                        error = EINVAL;
                        goto out;
                }
                /*
                 * Take appropriate action if we are trying to write above the
                 * resource limit.
                 */
                if (fileoff >= curproc->p_fsz_ctl) {
                        /* Deliver the RLIMIT_FSIZE rctl action (SIGXFSZ). */
                        mutex_enter(&curproc->p_lock);
                        (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
                            curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
                        mutex_exit(&curproc->p_lock);

                        error = EFBIG;
                        goto out;
                }
                /*
                 * Don't allow pwrite to cause file sizes to exceed
                 * maxoff.
                 */
                if (fileoff == maxoff) {
                        error = EFBIG;
                        goto out;
                }
                /* Trim the request so it never crosses maxoff. */
                if (fileoff + count > maxoff)
                        bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
        } else if (vp->v_type == VFIFO) {
                /* Pipes and FIFOs are not seekable. */
                error = ESPIPE;
                goto out;
        }

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                int svmand;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto out;
                /* A conflicting mandatory lock denies the write. */
                if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
                    NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        aiov.iov_base = cbuf;
        aiov.iov_len = bcount;
        (void) VOP_RWLOCK(vp, rwflag, NULL);
        auio.uio_loffset = fileoff;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = bcount;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_llimit = curproc->p_fsz_ctl;	/* enforce file-size rctl */
        auio.uio_fmode = fflag;
        auio.uio_extflg = UIO_COPY_CACHED;

        /*
         * The SUSv4 POSIX specification states:
         *      The pwrite() function shall be equivalent to write(), except
         *      that it writes into a given position and does not change
         *      the file offset (regardless of whether O_APPEND is set).
         * To make this be true, we omit the FAPPEND flag from ioflag.
         */
        ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);

        error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
        bcount -= auio.uio_resid;	/* bytes actually transferred */

        /* Account the write in CPU and lwp resource-usage statistics. */
        CPU_STATS_ENTER_K();
        cp = CPU;
        CPU_STATS_ADDQ(cp, sys, syswrite, 1);
        CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
        CPU_STATS_EXIT_K();
        ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
        /* Note: fp->f_offset is deliberately NOT updated (pwrite semantics). */
        VOP_RWUNLOCK(vp, rwflag, NULL);

        /* A write interrupted after partial transfer still succeeds. */
        if (error == EINTR && bcount != 0)
                error = 0;
out:
        if (in_crit)
                nbl_end_crit(vp);
        releasef(fdes);
        if (error)
                return (set_errno(error));
        return (bcount);
}
 610 
/*
 * readv(2): scatter read into `iovcnt` user buffers described by `iovp`,
 * starting at the file's current offset.  Small iovec arrays live on the
 * stack; larger ones (up to IOV_MAX) are kmem-allocated.  ILP32 callers'
 * iovecs are expanded to the native format, and the total transfer is
 * capped so a 32-bit caller cannot move more than 2GB in one call.
 * Returns bytes transferred, or -1 with errno set.
 */
ssize_t
readv(int fdes, struct iovec *iovp, int iovcnt)
{
        struct uio auio;
        struct iovec buf[IOV_MAX_STACK], *aiov = buf;
        int aiovlen = 0;	/* nonzero iff aiov was kmem-allocated */
        file_t *fp;
        register vnode_t *vp;
        struct cpu *cp;
        int fflag, ioflag, rwflag;
        ssize_t count, bcount;
        int error = 0;
        int i;
        u_offset_t fileoff;
        int in_crit = 0;	/* nonzero iff we hold an nbmand crit region */

        if (iovcnt <= 0 || iovcnt > IOV_MAX)
                return (set_errno(EINVAL));

        /* Fall back to a heap iovec array when the stack one is too small. */
        if (iovcnt > IOV_MAX_STACK) {
                aiovlen = iovcnt * sizeof (iovec_t);
                aiov = kmem_alloc(aiovlen, KM_SLEEP);
        }

#ifdef _SYSCALL32_IMPL
        /*
         * 32-bit callers need to have their iovec expanded,
         * while ensuring that they can't move more than 2Gbytes
         * of data in a single call.
         */
        if (get_udatamodel() == DATAMODEL_ILP32) {
                struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
                int aiov32len;
                ssize32_t count32;

                aiov32len = iovcnt * sizeof (iovec32_t);
                if (aiovlen != 0)
                        aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

                if (copyin(iovp, aiov32, aiov32len)) {
                        if (aiovlen != 0) {
                                kmem_free(aiov32, aiov32len);
                                kmem_free(aiov, aiovlen);
                        }
                        return (set_errno(EFAULT));
                }

                /*
                 * Validate each length and the running total; a negative
                 * count32 means the 2GB aggregate limit was exceeded.
                 */
                count32 = 0;
                for (i = 0; i < iovcnt; i++) {
                        ssize32_t iovlen32 = aiov32[i].iov_len;
                        count32 += iovlen32;
                        if (iovlen32 < 0 || count32 < 0) {
                                if (aiovlen != 0) {
                                        kmem_free(aiov32, aiov32len);
                                        kmem_free(aiov, aiovlen);
                                }
                                return (set_errno(EINVAL));
                        }
                        aiov[i].iov_len = iovlen32;
                        aiov[i].iov_base =
                            (caddr_t)(uintptr_t)aiov32[i].iov_base;
                }

                if (aiovlen != 0)
                        kmem_free(aiov32, aiov32len);
        } else
#endif
        if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
                if (aiovlen != 0)
                        kmem_free(aiov, aiovlen);
                return (set_errno(EFAULT));
        }

        /* Validate each length and guard the total against overflow. */
        count = 0;
        for (i = 0; i < iovcnt; i++) {
                ssize_t iovlen = aiov[i].iov_len;
                count += iovlen;
                if (iovlen < 0 || count < 0) {
                        if (aiovlen != 0)
                                kmem_free(aiov, aiovlen);
                        return (set_errno(EINVAL));
                }
        }
        if ((fp = getf(fdes)) == NULL) {
                if (aiovlen != 0)
                        kmem_free(aiov, aiovlen);
                return (set_errno(EBADF));
        }
        if (((fflag = fp->f_flag) & FREAD) == 0) {
                error = EBADF;
                goto out;
        }
        vp = fp->f_vnode;
        /* A zero-length read of a regular file succeeds trivially. */
        if (vp->v_type == VREG && count == 0) {
                goto out;
        }

        rwflag = 0;	/* 0 => acquire VOP_RWLOCK as a reader */

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                int svmand;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto out;
                /* A conflicting mandatory lock denies the read. */
                if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
                    NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        (void) VOP_RWLOCK(vp, rwflag, NULL);
        fileoff = fp->f_offset;

        /*
         * Behaviour is same as read. Please see comments in read.
         */

        if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
                struct vattr va;
                va.va_mask = AT_SIZE;
                if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        goto out;
                }
                if (fileoff >= va.va_size) {
                        /* Offset at or past EOF: plain end-of-file. */
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        count = 0;
                        goto out;
                } else {
                        VOP_RWUNLOCK(vp, rwflag, NULL);
                        error = EOVERFLOW;
                        goto out;
                }
        }
        /* Trim the request so it never crosses the fd's offset maximum. */
        if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
                count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
        }
        auio.uio_loffset = fileoff;
        auio.uio_iov = aiov;
        auio.uio_iovcnt = iovcnt;
        auio.uio_resid = bcount = count;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_llimit = MAXOFFSET_T;
        auio.uio_fmode = fflag;
        /*
         * Only copy out through the CPU caches when the count is small
         * enough; larger transfers use the default (bypass) copy path.
         */
        if (bcount <= copyout_max_cached)
                auio.uio_extflg = UIO_COPY_CACHED;
        else
                auio.uio_extflg = UIO_COPY_DEFAULT;


        ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

        /* If read sync is not asked for, filter sync flags */
        if ((ioflag & FRSYNC) == 0)
                ioflag &= ~(FSYNC|FDSYNC);
        error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
        count -= auio.uio_resid;	/* bytes actually transferred */

        /* Account the read in CPU and lwp resource-usage statistics. */
        CPU_STATS_ENTER_K();
        cp = CPU;
        CPU_STATS_ADDQ(cp, sys, sysread, 1);
        CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
        CPU_STATS_EXIT_K();
        ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

        if (vp->v_type == VFIFO)	/* Backward compatibility */
                fp->f_offset = count;
        else if (((fp->f_flag & FAPPEND) == 0) ||
            (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
                fp->f_offset = auio.uio_loffset;

        VOP_RWUNLOCK(vp, rwflag, NULL);

        /* A read interrupted after partial transfer still succeeds. */
        if (error == EINTR && count != 0)
                error = 0;
out:
        if (in_crit)
                nbl_end_crit(vp);
        releasef(fdes);
        if (aiovlen != 0)
                kmem_free(aiov, aiovlen);
        if (error)
                return (set_errno(error));
        return (count);
}
 803 
/*
 * writev(2): gather-write the iovcnt user buffers described by iovp to
 * the file referenced by descriptor fdes, starting at the file's current
 * offset.  Returns the number of bytes written, or -1 with errno set.
 */
ssize_t
writev(int fdes, struct iovec *iovp, int iovcnt)
{
	struct uio auio;
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;	/* nonzero iff aiov was heap-allocated */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero while in the nbmand critical region */

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EINVAL));

	/* Too many iovecs for the on-stack array; fall back to the heap. */
	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		int aiov32len;
		ssize32_t count32;

		aiov32len = iovcnt * sizeof (iovec32_t);
		if (aiovlen != 0)
			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

		if (copyin(iovp, aiov32, aiov32len)) {
			if (aiovlen != 0) {
				kmem_free(aiov32, aiov32len);
				kmem_free(aiov, aiovlen);
			}
			return (set_errno(EFAULT));
		}

		/*
		 * Sum the 32-bit lengths; a negative element or a negative
		 * running total (signed overflow past 2GB) is EINVAL.
		 */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen = aiov32[i].iov_len;
			count32 += iovlen;
			if (iovlen < 0 || count32 < 0) {
				if (aiovlen != 0) {
					kmem_free(aiov32, aiov32len);
					kmem_free(aiov, aiovlen);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
		if (aiovlen != 0)
			kmem_free(aiov32, aiov32len);
	} else
#endif
	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EFAULT));
	}

	/* Total the transfer; reject negative lengths and ssize_t overflow. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EINVAL));
		}
	}
	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	/* Zero-length write to a regular file: success, no VOP_WRITE call. */
	if (vp->v_type == VREG && count == 0) {
		goto out;
	}

	rwflag = 1;	/* take the vnode rwlock as a writer */

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	fileoff = fp->f_offset;

	/*
	 * Behaviour is same as write. Please see comments for write.
	 */

	if (vp->v_type == VREG) {
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);
			error = EFBIG;
			goto out;
		}
		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		/* Clamp the transfer so it cannot extend past OFFSET_MAX. */
		if (fileoff + count > OFFSET_MAX(fp))
			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}
	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_DEFAULT;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	if (vp->v_type == VFIFO)	/* Backward compatibility */
		fp->f_offset = count;
	else if (((fp->f_flag & FAPPEND) == 0) ||
	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
		fp->f_offset = auio.uio_loffset;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A signal that interrupted a partial write is not an error. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error)
		return (set_errno(error));
	return (count);
}
 986 
 987 ssize_t
 988 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 989     off_t extended_offset)
 990 {
 991         struct uio auio;
 992         struct iovec buf[IOV_MAX_STACK], *aiov = buf;
 993         int aiovlen = 0;
 994         file_t *fp;
 995         register vnode_t *vp;
 996         struct cpu *cp;
 997         int fflag, ioflag, rwflag;
 998         ssize_t count, bcount;
 999         int error = 0;
1000         int i;
1001 
1002 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1003         u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1004             (u_offset_t)offset;
1005 #else /* _SYSCALL32_IMPL || _ILP32 */
1006         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1007 #endif /* _SYSCALL32_IMPR || _ILP32 */
1008 #ifdef _SYSCALL32_IMPL
1009         const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
1010             extended_offset == 0?
1011             MAXOFF32_T : MAXOFFSET_T;
1012 #else /* _SYSCALL32_IMPL */
1013         const u_offset_t maxoff = MAXOFF32_T;
1014 #endif /* _SYSCALL32_IMPL */
1015 
1016         int in_crit = 0;
1017 
1018         if (iovcnt <= 0 || iovcnt > IOV_MAX)
1019                 return (set_errno(EINVAL));
1020 
1021         if (iovcnt > IOV_MAX_STACK) {
1022                 aiovlen = iovcnt * sizeof (iovec_t);
1023                 aiov = kmem_alloc(aiovlen, KM_SLEEP);
1024         }
1025 
1026 #ifdef _SYSCALL32_IMPL
1027         /*
1028          * 32-bit callers need to have their iovec expanded,
1029          * while ensuring that they can't move more than 2Gbytes
1030          * of data in a single call.
1031          */
1032         if (get_udatamodel() == DATAMODEL_ILP32) {
1033                 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1034                 int aiov32len;
1035                 ssize32_t count32;
1036 
1037                 aiov32len = iovcnt * sizeof (iovec32_t);
1038                 if (aiovlen != 0)
1039                         aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1040 
1041                 if (copyin(iovp, aiov32, aiov32len)) {
1042                         if (aiovlen != 0) {
1043                                 kmem_free(aiov32, aiov32len);
1044                                 kmem_free(aiov, aiovlen);
1045                         }
1046                         return (set_errno(EFAULT));
1047                 }
1048 
1049                 count32 = 0;
1050                 for (i = 0; i < iovcnt; i++) {
1051                         ssize32_t iovlen32 = aiov32[i].iov_len;
1052                         count32 += iovlen32;
1053                         if (iovlen32 < 0 || count32 < 0) {
1054                                 if (aiovlen != 0) {
1055                                         kmem_free(aiov32, aiov32len);
1056                                         kmem_free(aiov, aiovlen);
1057                                 }
1058                                 return (set_errno(EINVAL));
1059                         }
1060                         aiov[i].iov_len = iovlen32;
1061                         aiov[i].iov_base =
1062                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1063                 }
1064                 if (aiovlen != 0)
1065                         kmem_free(aiov32, aiov32len);
1066         } else
1067 #endif /* _SYSCALL32_IMPL */
1068                 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1069                         if (aiovlen != 0)
1070                                 kmem_free(aiov, aiovlen);
1071                         return (set_errno(EFAULT));
1072                 }
1073 
1074         count = 0;
1075         for (i = 0; i < iovcnt; i++) {
1076                 ssize_t iovlen = aiov[i].iov_len;
1077                 count += iovlen;
1078                 if (iovlen < 0 || count < 0) {
1079                         if (aiovlen != 0)
1080                                 kmem_free(aiov, aiovlen);
1081                         return (set_errno(EINVAL));
1082                 }
1083         }
1084 
1085         if ((bcount = (ssize_t)count) < 0) {
1086                 if (aiovlen != 0)
1087                         kmem_free(aiov, aiovlen);
1088                 return (set_errno(EINVAL));
1089         }
1090         if ((fp = getf(fdes)) == NULL) {
1091                 if (aiovlen != 0)
1092                         kmem_free(aiov, aiovlen);
1093                 return (set_errno(EBADF));
1094         }
1095         if (((fflag = fp->f_flag) & FREAD) == 0) {
1096                 error = EBADF;
1097                 goto out;
1098         }
1099         vp = fp->f_vnode;
1100         rwflag = 0;
1101         if (vp->v_type == VREG) {
1102 
1103                 if (bcount == 0)
1104                         goto out;
1105 
1106                 /*
1107                  * return EINVAL for offsets that cannot be
1108                  * represented in an off_t.
1109                  */
1110                 if (fileoff > maxoff) {
1111                         error = EINVAL;
1112                         goto out;
1113                 }
1114 
1115                 if (fileoff + bcount > maxoff)
1116                         bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1117         } else if (vp->v_type == VFIFO) {
1118                 error = ESPIPE;
1119                 goto out;
1120         }
1121         /*
1122          * We have to enter the critical region before calling VOP_RWLOCK
1123          * to avoid a deadlock with ufs.
1124          */
1125         if (nbl_need_check(vp)) {
1126                 int svmand;
1127 
1128                 nbl_start_crit(vp, RW_READER);
1129                 in_crit = 1;
1130                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1131                 if (error != 0)
1132                         goto out;
1133                 if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1134                     NULL)) {
1135                         error = EACCES;
1136                         goto out;
1137                 }
1138         }
1139 
1140         (void) VOP_RWLOCK(vp, rwflag, NULL);
1141 
1142         /*
1143          * Behaviour is same as read(2). Please see comments in
1144          * read(2).
1145          */
1146 
1147         if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
1148                 struct vattr va;
1149                 va.va_mask = AT_SIZE;
1150                 if ((error =
1151                     VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
1152                         VOP_RWUNLOCK(vp, rwflag, NULL);
1153                         goto out;
1154                 }
1155                 if (fileoff >= va.va_size) {
1156                         VOP_RWUNLOCK(vp, rwflag, NULL);
1157                         count = 0;
1158                         goto out;
1159                 } else {
1160                         VOP_RWUNLOCK(vp, rwflag, NULL);
1161                         error = EOVERFLOW;
1162                         goto out;
1163                 }
1164         }
1165         if ((vp->v_type == VREG) &&
1166             (fileoff + count > OFFSET_MAX(fp))) {
1167                 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1168         }
1169         auio.uio_loffset = fileoff;
1170         auio.uio_iov = aiov;
1171         auio.uio_iovcnt = iovcnt;
1172         auio.uio_resid = bcount = count;
1173         auio.uio_segflg = UIO_USERSPACE;
1174         auio.uio_llimit = MAXOFFSET_T;
1175         auio.uio_fmode = fflag;
1176         if (bcount <= copyout_max_cached)
1177                 auio.uio_extflg = UIO_COPY_CACHED;
1178         else
1179                 auio.uio_extflg = UIO_COPY_DEFAULT;
1180 
1181         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1182         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1183         count -= auio.uio_resid;
1184         CPU_STATS_ENTER_K();
1185         cp = CPU;
1186         CPU_STATS_ADDQ(cp, sys, sysread, 1);
1187         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1188         CPU_STATS_EXIT_K();
1189         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1190 
1191         VOP_RWUNLOCK(vp, rwflag, NULL);
1192 
1193         if (error == EINTR && count != 0)
1194                 error = 0;
1195 out:
1196         if (in_crit)
1197                 nbl_end_crit(vp);
1198         releasef(fdes);
1199         if (aiovlen != 0)
1200                 kmem_free(aiov, aiovlen);
1201         if (error)
1202                 return (set_errno(error));
1203         return (count);
1204 }
1205 
/*
 * pwritev(2): gather-write the iovcnt buffers described by iovp to the
 * file referenced by fdes, starting at the supplied 64-bit offset.  The
 * descriptor's file offset is not modified, and per POSIX the O_APPEND
 * flag is ignored.  Returns the number of bytes written, or -1 with
 * errno set.
 */
ssize_t
pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
    off_t extended_offset)
{
	struct uio auio;
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;	/* nonzero iff aiov was heap-allocated */
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t count, bcount;
	int error = 0;
	int i;

	/*
	 * ILP32 callers pass the 64-bit offset split across two arguments;
	 * reassemble it.  LP64 callers supply it directly.
	 */
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
	    (u_offset_t)offset;
#else /* _SYSCALL32_IMPL || _ILP32 */
	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
#endif /* _SYSCALL32_IMPL || _ILP32 */
	/*
	 * A 32-bit caller that did not supply an extended offset is limited
	 * to offsets representable in 32 bits.
	 */
#ifdef _SYSCALL32_IMPL
	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
	    extended_offset == 0?
	    MAXOFF32_T : MAXOFFSET_T;
#else /* _SYSCALL32_IMPL */
	const u_offset_t maxoff = MAXOFF32_T;
#endif /* _SYSCALL32_IMPL */

	int in_crit = 0;	/* nonzero while in the nbmand critical region */

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EINVAL));

	/* Too many iovecs for the on-stack array; fall back to the heap. */
	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded,
	 * while ensuring that they can't move more than 2Gbytes
	 * of data in a single call.
	 */
	if (get_udatamodel() == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		int aiov32len;
		ssize32_t count32;

		aiov32len = iovcnt * sizeof (iovec32_t);
		if (aiovlen != 0)
			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);

		if (copyin(iovp, aiov32, aiov32len)) {
			if (aiovlen != 0) {
				kmem_free(aiov32, aiov32len);
				kmem_free(aiov, aiovlen);
			}
			return (set_errno(EFAULT));
		}

		/*
		 * Sum the 32-bit lengths; a negative element or a negative
		 * running total (signed overflow past 2GB) is EINVAL.
		 */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (aiovlen != 0) {
					kmem_free(aiov32, aiov32len);
					kmem_free(aiov, aiovlen);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
		if (aiovlen != 0)
			kmem_free(aiov32, aiov32len);
	} else
#endif /* _SYSCALL32_IMPL */
		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EFAULT));
		}

	/* Total the transfer; reject negative lengths and ssize_t overflow. */
	count = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		count += iovlen;
		if (iovlen < 0 || count < 0) {
			if (aiovlen != 0)
				kmem_free(aiov, aiovlen);
			return (set_errno(EINVAL));
		}
	}

	if ((bcount = (ssize_t)count) < 0) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EINVAL));
	}
	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	vp = fp->f_vnode;
	rwflag = 1;	/* take the vnode rwlock as a writer */
	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * return EINVAL for offsets that cannot be
		 * represented in an off_t.
		 */
		if (fileoff > maxoff) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Take appropriate action if we are trying
		 * to write above the resource limit.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			mutex_enter(&curproc->p_lock);
			/*
			 * Return value ignored because it lists
			 * actions taken, but we are in an error case.
			 * We don't have any actions that depend on
			 * what could happen in this call, so we ignore
			 * the return value.
			 */
			(void) rctl_action(
			    rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc,
			    RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * Don't allow pwritev to cause file sizes to exceed
		 * maxoff.
		 */
		if (fileoff == maxoff) {
			error = EFBIG;
			goto out;
		}

		/* Clamp the transfer so it cannot extend past maxoff. */
		if (fileoff + bcount > maxoff)
			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
	} else if (vp->v_type == VFIFO) {
		/* Positional I/O is meaningless on a pipe. */
		error = ESPIPE;
		goto out;
	}
	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	(void) VOP_RWLOCK(vp, rwflag, NULL);


	/*
	 * Behaviour is same as write(2). Please see comments for
	 * write(2).
	 */

	if (vp->v_type == VREG) {
		/* Recheck the limit now that we hold the vnode rwlock. */
		if (fileoff >= curproc->p_fsz_ctl) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			mutex_enter(&curproc->p_lock);
			/* see above rctl_action comment */
			(void) rctl_action(
			    rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls,
			    curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);
			error = EFBIG;
			goto out;
		}
		if (fileoff >= OFFSET_MAX(fp)) {
			VOP_RWUNLOCK(vp, rwflag, NULL);
			error = EFBIG;
			goto out;
		}
		if (fileoff + count > OFFSET_MAX(fp))
			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
	}

	auio.uio_loffset = fileoff;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = bcount = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	/* FAPPEND deliberately omitted: pwritev ignores O_APPEND (POSIX). */
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
	count -= auio.uio_resid;	/* bytes actually transferred */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;

	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A signal that interrupted a partial write is not an error. */
	if (error == EINTR && count != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error)
		return (set_errno(error));
	return (count);
}
1451 
1452 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1453 
1454 /*
1455  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1456  */
/*
 * pread64(2): read up to count bytes into cbuf from the file referenced
 * by fdes, starting at the 64-bit offset assembled from offset_1 and
 * offset_2.  The descriptor's file offset is not modified.  Returns the
 * number of bytes read, or -1 with errno set.
 */
ssize32_t
pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
    uint32_t offset_2)
{
	struct uio auio;
	struct iovec aiov;
	file_t *fp;
	register vnode_t *vp;
	struct cpu *cp;
	int fflag, ioflag, rwflag;
	ssize_t bcount;
	int error = 0;
	u_offset_t fileoff;
	int in_crit = 0;	/* nonzero while in the nbmand critical region */

	/* Reassemble the 64-bit offset from the two 32-bit halves. */
#if defined(_LITTLE_ENDIAN)
	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
#else
	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
#endif

	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
		return (set_errno(EINVAL));

	if ((fp = getf(fdes)) == NULL)
		return (set_errno(EBADF));
	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
		error = EBADF;
		goto out;
	}

	rwflag = 0;	/* take the vnode rwlock as a reader */
	vp = fp->f_vnode;

	if (vp->v_type == VREG) {

		if (bcount == 0)
			goto out;

		/*
		 * Same as pread. See comments in pread.
		 */

		if (fileoff > MAXOFFSET_T) {
			error = EINVAL;
			goto out;
		}
		/* Clamp the transfer so it cannot extend past MAXOFFSET_T. */
		if (fileoff + bcount > MAXOFFSET_T)
			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
	} else if (vp->v_type == VFIFO) {
		/* Positional I/O is meaningless on a pipe. */
		error = ESPIPE;
		goto out;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto out;
		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
		    NULL)) {
			error = EACCES;
			goto out;
		}
	}

	aiov.iov_base = cbuf;
	aiov.iov_len = bcount;
	(void) VOP_RWLOCK(vp, rwflag, NULL);
	auio.uio_loffset = fileoff;

	/*
	 * Note: File size can never be greater than MAXOFFSET_T.
	 * If ever we start supporting 128 bit files the code
	 * similar to the one in pread at this place should be here.
	 * Here we avoid the unnecessary VOP_GETATTR() when we
	 * know that fileoff == MAXOFFSET_T implies that it is always
	 * greater than or equal to file size.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = bcount;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;

	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
	bcount -= auio.uio_resid;	/* bytes actually transferred */
	CPU_STATS_ENTER_K();
	cp = CPU;
	CPU_STATS_ADDQ(cp, sys, sysread, 1);
	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
	CPU_STATS_EXIT_K();
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
	VOP_RWUNLOCK(vp, rwflag, NULL);

	/* A signal that interrupted a partial read is not an error. */
	if (error == EINTR && bcount != 0)
		error = 0;
out:
	if (in_crit)
		nbl_end_crit(vp);
	releasef(fdes);
	if (error)
		return (set_errno(error));
	return (bcount);
}
1576 
1577 /*
1578  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1579  */
1580 ssize32_t
1581 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1582     uint32_t offset_2)
1583 {
1584         struct uio auio;
1585         struct iovec aiov;
1586         file_t *fp;
1587         register vnode_t *vp;
1588         struct cpu *cp;
1589         int fflag, ioflag, rwflag;
1590         ssize_t bcount;
1591         int error = 0;
1592         u_offset_t fileoff;
1593         int in_crit = 0;
1594 
1595 #if defined(_LITTLE_ENDIAN)
1596         fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1597 #else
1598         fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1599 #endif
1600 
1601         if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1602                 return (set_errno(EINVAL));
1603         if ((fp = getf(fdes)) == NULL)
1604                 return (set_errno(EBADF));
1605         if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1606                 error = EBADF;
1607                 goto out;
1608         }
1609 
1610         rwflag = 1;
1611         vp = fp->f_vnode;
1612 
1613         if (vp->v_type == VREG) {
1614 
1615                 if (bcount == 0)
1616                         goto out;
1617 
1618                 /*
1619                  * See comments in pwrite.
1620                  */
1621                 if (fileoff > MAXOFFSET_T) {
1622                         error = EINVAL;
1623                         goto out;
1624                 }
1625                 if (fileoff >= curproc->p_fsz_ctl) {
1626                         mutex_enter(&curproc->p_lock);
1627                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1628                             curproc->p_rctls, curproc, RCA_SAFE);
1629                         mutex_exit(&curproc->p_lock);
1630                         error = EFBIG;
1631                         goto out;
1632                 }
1633                 if (fileoff == MAXOFFSET_T) {
1634                         error = EFBIG;
1635                         goto out;
1636                 }
1637                 if (fileoff + bcount > MAXOFFSET_T)
1638                         bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1639         } else if (vp->v_type == VFIFO) {
1640                 error = ESPIPE;
1641                 goto out;
1642         }
1643 
1644         /*
1645          * We have to enter the critical region before calling VOP_RWLOCK
1646          * to avoid a deadlock with ufs.
1647          */
1648         if (nbl_need_check(vp)) {
1649                 int svmand;
1650 
1651                 nbl_start_crit(vp, RW_READER);
1652                 in_crit = 1;
1653                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1654                 if (error != 0)
1655                         goto out;
1656                 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1657                     NULL)) {
1658                         error = EACCES;
1659                         goto out;
1660                 }
1661         }
1662 
1663         aiov.iov_base = cbuf;
1664         aiov.iov_len = bcount;
1665         (void) VOP_RWLOCK(vp, rwflag, NULL);
1666         auio.uio_loffset = fileoff;
1667         auio.uio_iov = &aiov;
1668         auio.uio_iovcnt = 1;
1669         auio.uio_resid = bcount;
1670         auio.uio_segflg = UIO_USERSPACE;
1671         auio.uio_llimit = curproc->p_fsz_ctl;
1672         auio.uio_fmode = fflag;
1673         auio.uio_extflg = UIO_COPY_CACHED;
1674 
1675         /*
1676          * The SUSv4 POSIX specification states:
1677          *      The pwrite() function shall be equivalent to write(), except
1678          *      that it writes into a given position and does not change
1679          *      the file offset (regardless of whether O_APPEND is set).
1680          * To make this be true, we omit the FAPPEND flag from ioflag.
1681          */
1682         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1683 
1684         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1685         bcount -= auio.uio_resid;
1686         CPU_STATS_ENTER_K();
1687         cp = CPU;
1688         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1689         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1690         CPU_STATS_EXIT_K();
1691         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1692         VOP_RWUNLOCK(vp, rwflag, NULL);
1693 
1694         if (error == EINTR && bcount != 0)
1695                 error = 0;
1696 out:
1697         if (in_crit)
1698                 nbl_end_crit(vp);
1699         releasef(fdes);
1700         if (error)
1701                 return (set_errno(error));
1702         return (bcount);
1703 }
1704 
1705 #endif  /* _SYSCALL32_IMPL || _ILP32 */
1706 
1707 #ifdef _SYSCALL32_IMPL
1708 /*
1709  * Tail-call elimination of xxx32() down to xxx()
1710  *
1711  * A number of xxx32 system calls take a len (or count) argument and
1712  * return a number in the range [0,len] or -1 on error.
1713  * Given an ssize32_t input len, the downcall xxx() will return
1714  * a 64-bit value that is -1 or in the range [0,len] which actually
1715  * is a proper return value for the xxx32 call. So even if the xxx32
1716  * calls can be considered as returning a ssize32_t, they are currently
1717  * declared as returning a ssize_t as this enables tail-call elimination.
1718  *
1719  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1720  * down negative input values as such and let the downcall handle error
1721  * reporting. Functions covered by this comments are:
1722  *
1723  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1724  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1725  * readlink.c:     readlink32.
1726  */
1727 
1728 ssize_t
1729 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1730 {
1731         return (read(fdes,
1732             (void *)(uintptr_t)cbuf, (ssize32_t)count));
1733 }
1734 
1735 ssize_t
1736 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1737 {
1738         return (write(fdes,
1739             (void *)(uintptr_t)cbuf, (ssize32_t)count));
1740 }
1741 
1742 ssize_t
1743 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1744 {
1745         return (pread(fdes,
1746             (void *)(uintptr_t)cbuf, (ssize32_t)count,
1747             (off_t)(uint32_t)offset));
1748 }
1749 
1750 ssize_t
1751 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1752 {
1753         return (pwrite(fdes,
1754             (void *)(uintptr_t)cbuf, (ssize32_t)count,
1755             (off_t)(uint32_t)offset));
1756 }
1757 
1758 ssize_t
1759 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1760 {
1761         return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1762 }
1763 
1764 ssize_t
1765 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1766 {
1767         return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1768 }
1769 #endif  /* _SYSCALL32_IMPL */