1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35 
  36 #include <sys/param.h>
  37 #include <sys/isa_defs.h>
  38 #include <sys/types.h>
  39 #include <sys/inttypes.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/cred.h>
  42 #include <sys/user.h>
  43 #include <sys/systm.h>
  44 #include <sys/errno.h>
  45 #include <sys/vnode.h>
  46 #include <sys/file.h>
  47 #include <sys/proc.h>
  48 #include <sys/cpuvar.h>
  49 #include <sys/uio.h>
  50 #include <sys/debug.h>
  51 #include <sys/rctl.h>
  52 #include <sys/nbmlock.h>
  53 
  54 #define COPYOUT_MAX_CACHE       (1<<17)           /* 128K */
  55 
  56 size_t copyout_max_cached = COPYOUT_MAX_CACHE;  /* global so it's patchable */
  57 
/*
 * read, write, pread, pwrite, readv, and writev syscalls.
 *
 * 64-bit open: all opens are large file opens.
 * Large Files: the behaviour of read depends on whether the fd
 *              corresponds to a large file open or not.
 * 32-bit open: FOFFMAX flag not set.
 *              Reads succeed up to offset MAXOFF32_T - 1.  A read at
 *              MAXOFF32_T returns EOVERFLOW if count is non-zero and
 *              the size of the file is > MAXOFF32_T.  If the size of
 *              the file is <= MAXOFF32_T, a read at >= MAXOFF32_T
 *              returns EOF.
 */
  70 
  71 /*
  72  * Native system call
  73  */
  74 ssize_t
  75 read(int fdes, void *cbuf, size_t count)
  76 {
  77         struct uio auio;
  78         struct iovec aiov;
  79         file_t *fp;
  80         register vnode_t *vp;
  81         struct cpu *cp;
  82         int fflag, ioflag, rwflag;
  83         ssize_t cnt, bcount;
  84         int error = 0;
  85         u_offset_t fileoff;
  86         int in_crit = 0;
  87 
  88         if ((cnt = (ssize_t)count) < 0)
  89                 return (set_errno(EINVAL));
  90         if ((fp = getf(fdes)) == NULL)
  91                 return (set_errno(EBADF));
  92         if (((fflag = fp->f_flag) & FREAD) == 0) {
  93                 error = EBADF;
  94                 goto out;
  95         }
  96         vp = fp->f_vnode;
  97 
  98         if (vp->v_type == VREG && cnt == 0) {
  99                 goto out;
 100         }
 101 
 102         rwflag = 0;
 103         aiov.iov_base = cbuf;
 104         aiov.iov_len = cnt;
 105 
 106         /*
 107          * We have to enter the critical region before calling VOP_RWLOCK
 108          * to avoid a deadlock with write() calls.
 109          */
 110         if (nbl_need_check(vp)) {
 111                 int svmand;
 112 
 113                 nbl_start_crit(vp, RW_READER);
 114                 in_crit = 1;
 115                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 116                 if (error != 0)
 117                         goto out;
 118                 if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
 119                     NULL)) {
 120                         error = EACCES;
 121                         goto out;
 122                 }
 123         }
 124 
 125         (void) VOP_RWLOCK(vp, rwflag, NULL);
 126 
 127         /*
 128          * We do the following checks inside VOP_RWLOCK so as to
 129          * prevent file size from changing while these checks are
 130          * being done. Also, we load fp's offset to the local
 131          * variable fileoff because we can have a parallel lseek
 132          * going on (f_offset is not protected by any lock) which
 133          * could change f_offset. We need to see the value only
 134          * once here and take a decision. Seeing it more than once
 135          * can lead to incorrect functionality.
 136          */
 137 
 138         fileoff = (u_offset_t)fp->f_offset;
 139         if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
 140                 struct vattr va;
 141                 va.va_mask = AT_SIZE;
 142                 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
 143                         VOP_RWUNLOCK(vp, rwflag, NULL);
 144                         goto out;
 145                 }
 146                 if (fileoff >= va.va_size) {
 147                         cnt = 0;
 148                         VOP_RWUNLOCK(vp, rwflag, NULL);
 149                         goto out;
 150                 } else {
 151                         error = EOVERFLOW;
 152                         VOP_RWUNLOCK(vp, rwflag, NULL);
 153                         goto out;
 154                 }
 155         }
 156         if ((vp->v_type == VREG) &&
 157             (fileoff + cnt > OFFSET_MAX(fp))) {
 158                 cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 159         }
 160         auio.uio_loffset = fileoff;
 161         auio.uio_iov = &aiov;
 162         auio.uio_iovcnt = 1;
 163         auio.uio_resid = bcount = cnt;
 164         auio.uio_segflg = UIO_USERSPACE;
 165         auio.uio_llimit = MAXOFFSET_T;
 166         auio.uio_fmode = fflag;
 167         /*
 168          * Only use bypass caches when the count is large enough
 169          */
 170         if (bcount <= copyout_max_cached)
 171                 auio.uio_extflg = UIO_COPY_CACHED;
 172         else
 173                 auio.uio_extflg = UIO_COPY_DEFAULT;
 174 
 175         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 176 
 177         /* If read sync is not asked for, filter sync flags */
 178         if ((ioflag & FRSYNC) == 0)
 179                 ioflag &= ~(FSYNC|FDSYNC);
 180         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
 181         cnt -= auio.uio_resid;
 182         CPU_STATS_ENTER_K();
 183         cp = CPU;
 184         CPU_STATS_ADDQ(cp, sys, sysread, 1);
 185         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
 186         CPU_STATS_EXIT_K();
 187         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
 188 
 189         if (vp->v_type == VFIFO)     /* Backward compatibility */
 190                 fp->f_offset = cnt;
 191         else if (((fp->f_flag & FAPPEND) == 0) ||
 192             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 193                 fp->f_offset = auio.uio_loffset;
 194         VOP_RWUNLOCK(vp, rwflag, NULL);
 195 
 196         if (error == EINTR && cnt != 0)
 197                 error = 0;
 198 out:
 199         if (in_crit)
 200                 nbl_end_crit(vp);
 201         releasef(fdes);
 202         if (error)
 203                 return (set_errno(error));
 204         return (cnt);
 205 }
 206 
 207 /*
 208  * Native system call
 209  */
 210 ssize_t
 211 write(int fdes, void *cbuf, size_t count)
 212 {
 213         struct uio auio;
 214         struct iovec aiov;
 215         file_t *fp;
 216         register vnode_t *vp;
 217         struct cpu *cp;
 218         int fflag, ioflag, rwflag;
 219         ssize_t cnt, bcount;
 220         int error = 0;
 221         u_offset_t fileoff;
 222         int in_crit = 0;
 223 
 224         if ((cnt = (ssize_t)count) < 0)
 225                 return (set_errno(EINVAL));
 226         if ((fp = getf(fdes)) == NULL)
 227                 return (set_errno(EBADF));
 228         if (((fflag = fp->f_flag) & FWRITE) == 0) {
 229                 error = EBADF;
 230                 goto out;
 231         }
 232         vp = fp->f_vnode;
 233 
 234         if (vp->v_type == VREG && cnt == 0) {
 235                 goto out;
 236         }
 237 
 238         rwflag = 1;
 239         aiov.iov_base = cbuf;
 240         aiov.iov_len = cnt;
 241 
 242         /*
 243          * We have to enter the critical region before calling VOP_RWLOCK
 244          * to avoid a deadlock with ufs.
 245          */
 246         if (nbl_need_check(vp)) {
 247                 int svmand;
 248 
 249                 nbl_start_crit(vp, RW_READER);
 250                 in_crit = 1;
 251                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 252                 if (error != 0)
 253                         goto out;
 254                 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
 255                     NULL)) {
 256                         error = EACCES;
 257                         goto out;
 258                 }
 259         }
 260 
 261         (void) VOP_RWLOCK(vp, rwflag, NULL);
 262 
 263         fileoff = fp->f_offset;
 264         if (vp->v_type == VREG) {
 265 
 266                 /*
 267                  * We raise psignal if write for >0 bytes causes
 268                  * it to exceed the ulimit.
 269                  */
 270                 if (fileoff >= curproc->p_fsz_ctl) {
 271                         VOP_RWUNLOCK(vp, rwflag, NULL);
 272 
 273                         mutex_enter(&curproc->p_lock);
 274                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 275                             curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
 276                         mutex_exit(&curproc->p_lock);
 277 
 278                         error = EFBIG;
 279                         goto out;
 280                 }
 281                 /*
 282                  * We return EFBIG if write is done at an offset
 283                  * greater than the offset maximum for this file structure.
 284                  */
 285 
 286                 if (fileoff >= OFFSET_MAX(fp)) {
 287                         VOP_RWUNLOCK(vp, rwflag, NULL);
 288                         error = EFBIG;
 289                         goto out;
 290                 }
 291                 /*
 292                  * Limit the bytes to be written  upto offset maximum for
 293                  * this open file structure.
 294                  */
 295                 if (fileoff + cnt > OFFSET_MAX(fp))
 296                         cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 297         }
 298         auio.uio_loffset = fileoff;
 299         auio.uio_iov = &aiov;
 300         auio.uio_iovcnt = 1;
 301         auio.uio_resid = bcount = cnt;
 302         auio.uio_segflg = UIO_USERSPACE;
 303         auio.uio_llimit = curproc->p_fsz_ctl;
 304         auio.uio_fmode = fflag;
 305         auio.uio_extflg = UIO_COPY_DEFAULT;
 306 
 307         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 308 
 309         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
 310         cnt -= auio.uio_resid;
 311         CPU_STATS_ENTER_K();
 312         cp = CPU;
 313         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
 314         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
 315         CPU_STATS_EXIT_K();
 316         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
 317 
 318         if (vp->v_type == VFIFO)     /* Backward compatibility */
 319                 fp->f_offset = cnt;
 320         else if (((fp->f_flag & FAPPEND) == 0) ||
 321             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 322                 fp->f_offset = auio.uio_loffset;
 323         VOP_RWUNLOCK(vp, rwflag, NULL);
 324 
 325         if (error == EINTR && cnt != 0)
 326                 error = 0;
 327 out:
 328         if (in_crit)
 329                 nbl_end_crit(vp);
 330         releasef(fdes);
 331         if (error)
 332                 return (set_errno(error));
 333         return (cnt);
 334 }
 335 
 336 ssize_t
 337 pread(int fdes, void *cbuf, size_t count, off_t offset)
 338 {
 339         struct uio auio;
 340         struct iovec aiov;
 341         file_t *fp;
 342         register vnode_t *vp;
 343         struct cpu *cp;
 344         int fflag, ioflag, rwflag;
 345         ssize_t bcount;
 346         int error = 0;
 347         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
 348 #ifdef _SYSCALL32_IMPL
 349         u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
 350             MAXOFF32_T : MAXOFFSET_T;
 351 #else
 352         const u_offset_t maxoff = MAXOFF32_T;
 353 #endif
 354         int in_crit = 0;
 355 
 356         if ((bcount = (ssize_t)count) < 0)
 357                 return (set_errno(EINVAL));
 358 
 359         if ((fp = getf(fdes)) == NULL)
 360                 return (set_errno(EBADF));
 361         if (((fflag = fp->f_flag) & (FREAD)) == 0) {
 362                 error = EBADF;
 363                 goto out;
 364         }
 365 
 366         rwflag = 0;
 367         vp = fp->f_vnode;
 368 
 369         if (vp->v_type == VREG) {
 370 
 371                 if (bcount == 0)
 372                         goto out;
 373 
 374                 /*
 375                  * Return EINVAL if an invalid offset comes to pread.
 376                  * Negative offset from user will cause this error.
 377                  */
 378 
 379                 if (fileoff > maxoff) {
 380                         error = EINVAL;
 381                         goto out;
 382                 }
 383                 /*
 384                  * Limit offset such that we don't read or write
 385                  * a file beyond the maximum offset representable in
 386                  * an off_t structure.
 387                  */
 388                 if (fileoff + bcount > maxoff)
 389                         bcount = (ssize_t)((offset_t)maxoff - fileoff);
 390         } else if (vp->v_type == VFIFO) {
 391                 error = ESPIPE;
 392                 goto out;
 393         }
 394 
 395         /*
 396          * We have to enter the critical region before calling VOP_RWLOCK
 397          * to avoid a deadlock with ufs.
 398          */
 399         if (nbl_need_check(vp)) {
 400                 int svmand;
 401 
 402                 nbl_start_crit(vp, RW_READER);
 403                 in_crit = 1;
 404                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 405                 if (error != 0)
 406                         goto out;
 407                 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
 408                     NULL)) {
 409                         error = EACCES;
 410                         goto out;
 411                 }
 412         }
 413 
 414         aiov.iov_base = cbuf;
 415         aiov.iov_len = bcount;
 416         (void) VOP_RWLOCK(vp, rwflag, NULL);
 417         if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
 418                 struct vattr va;
 419                 va.va_mask = AT_SIZE;
 420                 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
 421                         VOP_RWUNLOCK(vp, rwflag, NULL);
 422                         goto out;
 423                 }
 424                 VOP_RWUNLOCK(vp, rwflag, NULL);
 425 
 426                 /*
 427                  * We have to return EOF if fileoff is >= file size.
 428                  */
 429                 if (fileoff >= va.va_size) {
 430                         bcount = 0;
 431                         goto out;
 432                 }
 433 
 434                 /*
 435                  * File is greater than or equal to maxoff and therefore
 436                  * we return EOVERFLOW.
 437                  */
 438                 error = EOVERFLOW;
 439                 goto out;
 440         }
 441         auio.uio_loffset = fileoff;
 442         auio.uio_iov = &aiov;
 443         auio.uio_iovcnt = 1;
 444         auio.uio_resid = bcount;
 445         auio.uio_segflg = UIO_USERSPACE;
 446         auio.uio_llimit = MAXOFFSET_T;
 447         auio.uio_fmode = fflag;
 448         auio.uio_extflg = UIO_COPY_CACHED;
 449 
 450         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 451 
 452         /* If read sync is not asked for, filter sync flags */
 453         if ((ioflag & FRSYNC) == 0)
 454                 ioflag &= ~(FSYNC|FDSYNC);
 455         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
 456         bcount -= auio.uio_resid;
 457         CPU_STATS_ENTER_K();
 458         cp = CPU;
 459         CPU_STATS_ADDQ(cp, sys, sysread, 1);
 460         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
 461         CPU_STATS_EXIT_K();
 462         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
 463         VOP_RWUNLOCK(vp, rwflag, NULL);
 464 
 465         if (error == EINTR && bcount != 0)
 466                 error = 0;
 467 out:
 468         if (in_crit)
 469                 nbl_end_crit(vp);
 470         releasef(fdes);
 471         if (error)
 472                 return (set_errno(error));
 473         return (bcount);
 474 }
 475 
 476 ssize_t
 477 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
 478 {
 479         struct uio auio;
 480         struct iovec aiov;
 481         file_t *fp;
 482         register vnode_t *vp;
 483         struct cpu *cp;
 484         int fflag, ioflag, rwflag;
 485         ssize_t bcount;
 486         int error = 0;
 487         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
 488 #ifdef _SYSCALL32_IMPL
 489         u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
 490             MAXOFF32_T : MAXOFFSET_T;
 491 #else
 492         const u_offset_t maxoff = MAXOFF32_T;
 493 #endif
 494         int in_crit = 0;
 495 
 496         if ((bcount = (ssize_t)count) < 0)
 497                 return (set_errno(EINVAL));
 498         if ((fp = getf(fdes)) == NULL)
 499                 return (set_errno(EBADF));
 500         if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
 501                 error = EBADF;
 502                 goto out;
 503         }
 504 
 505         rwflag = 1;
 506         vp = fp->f_vnode;
 507 
 508         if (vp->v_type == VREG) {
 509 
 510                 if (bcount == 0)
 511                         goto out;
 512 
 513                 /*
 514                  * return EINVAL for offsets that cannot be
 515                  * represented in an off_t.
 516                  */
 517                 if (fileoff > maxoff) {
 518                         error = EINVAL;
 519                         goto out;
 520                 }
 521                 /*
 522                  * Take appropriate action if we are trying to write above the
 523                  * resource limit.
 524                  */
 525                 if (fileoff >= curproc->p_fsz_ctl) {
 526                         mutex_enter(&curproc->p_lock);
 527                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 528                             curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
 529                         mutex_exit(&curproc->p_lock);
 530 
 531                         error = EFBIG;
 532                         goto out;
 533                 }
 534                 /*
 535                  * Don't allow pwrite to cause file sizes to exceed
 536                  * maxoff.
 537                  */
 538                 if (fileoff == maxoff) {
 539                         error = EFBIG;
 540                         goto out;
 541                 }
 542                 if (fileoff + count > maxoff)
 543                         bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
 544         } else if (vp->v_type == VFIFO) {
 545                 error = ESPIPE;
 546                 goto out;
 547         }
 548 
 549         /*
 550          * We have to enter the critical region before calling VOP_RWLOCK
 551          * to avoid a deadlock with ufs.
 552          */
 553         if (nbl_need_check(vp)) {
 554                 int svmand;
 555 
 556                 nbl_start_crit(vp, RW_READER);
 557                 in_crit = 1;
 558                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 559                 if (error != 0)
 560                         goto out;
 561                 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
 562                     NULL)) {
 563                         error = EACCES;
 564                         goto out;
 565                 }
 566         }
 567 
 568         aiov.iov_base = cbuf;
 569         aiov.iov_len = bcount;
 570         (void) VOP_RWLOCK(vp, rwflag, NULL);
 571         auio.uio_loffset = fileoff;
 572         auio.uio_iov = &aiov;
 573         auio.uio_iovcnt = 1;
 574         auio.uio_resid = bcount;
 575         auio.uio_segflg = UIO_USERSPACE;
 576         auio.uio_llimit = curproc->p_fsz_ctl;
 577         auio.uio_fmode = fflag;
 578         auio.uio_extflg = UIO_COPY_CACHED;
 579 
 580         /*
 581          * The SUSv4 POSIX specification states:
 582          *      The pwrite() function shall be equivalent to write(), except
 583          *      that it writes into a given position and does not change
 584          *      the file offset (regardless of whether O_APPEND is set).
 585          * To make this be true, we omit the FAPPEND flag from ioflag.
 586          */
 587         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
 588 
 589         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
 590         bcount -= auio.uio_resid;
 591         CPU_STATS_ENTER_K();
 592         cp = CPU;
 593         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
 594         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
 595         CPU_STATS_EXIT_K();
 596         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
 597         VOP_RWUNLOCK(vp, rwflag, NULL);
 598 
 599         if (error == EINTR && bcount != 0)
 600                 error = 0;
 601 out:
 602         if (in_crit)
 603                 nbl_end_crit(vp);
 604         releasef(fdes);
 605         if (error)
 606                 return (set_errno(error));
 607         return (bcount);
 608 }
 609 
 610 /*
 611  * XXX -- The SVID refers to IOV_MAX, but doesn't define it.  Grrrr....
 612  * XXX -- However, SVVS expects readv() and writev() to fail if
 613  * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
 614  * XXX -- so I guess that's the "interface".
 615  */
 616 #define DEF_IOV_MAX     16
 617 
 618 ssize_t
 619 readv(int fdes, struct iovec *iovp, int iovcnt)
 620 {
 621         struct uio auio;
 622         struct iovec aiov[DEF_IOV_MAX];
 623         file_t *fp;
 624         register vnode_t *vp;
 625         struct cpu *cp;
 626         int fflag, ioflag, rwflag;
 627         ssize_t count, bcount;
 628         int error = 0;
 629         int i;
 630         u_offset_t fileoff;
 631         int in_crit = 0;
 632 
 633         if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
 634                 return (set_errno(EINVAL));
 635 
 636 #ifdef _SYSCALL32_IMPL
 637         /*
 638          * 32-bit callers need to have their iovec expanded,
 639          * while ensuring that they can't move more than 2Gbytes
 640          * of data in a single call.
 641          */
 642         if (get_udatamodel() == DATAMODEL_ILP32) {
 643                 struct iovec32 aiov32[DEF_IOV_MAX];
 644                 ssize32_t count32;
 645 
 646                 if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
 647                         return (set_errno(EFAULT));
 648 
 649                 count32 = 0;
 650                 for (i = 0; i < iovcnt; i++) {
 651                         ssize32_t iovlen32 = aiov32[i].iov_len;
 652                         count32 += iovlen32;
 653                         if (iovlen32 < 0 || count32 < 0)
 654                                 return (set_errno(EINVAL));
 655                         aiov[i].iov_len = iovlen32;
 656                         aiov[i].iov_base =
 657                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
 658                 }
 659         } else
 660 #endif
 661         if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
 662                 return (set_errno(EFAULT));
 663 
 664         count = 0;
 665         for (i = 0; i < iovcnt; i++) {
 666                 ssize_t iovlen = aiov[i].iov_len;
 667                 count += iovlen;
 668                 if (iovlen < 0 || count < 0)
 669                         return (set_errno(EINVAL));
 670         }
 671         if ((fp = getf(fdes)) == NULL)
 672                 return (set_errno(EBADF));
 673         if (((fflag = fp->f_flag) & FREAD) == 0) {
 674                 error = EBADF;
 675                 goto out;
 676         }
 677         vp = fp->f_vnode;
 678         if (vp->v_type == VREG && count == 0) {
 679                 goto out;
 680         }
 681 
 682         rwflag = 0;
 683 
 684         /*
 685          * We have to enter the critical region before calling VOP_RWLOCK
 686          * to avoid a deadlock with ufs.
 687          */
 688         if (nbl_need_check(vp)) {
 689                 int svmand;
 690 
 691                 nbl_start_crit(vp, RW_READER);
 692                 in_crit = 1;
 693                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 694                 if (error != 0)
 695                         goto out;
 696                 if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
 697                     NULL)) {
 698                         error = EACCES;
 699                         goto out;
 700                 }
 701         }
 702 
 703         (void) VOP_RWLOCK(vp, rwflag, NULL);
 704         fileoff = fp->f_offset;
 705 
 706         /*
 707          * Behaviour is same as read. Please see comments in read.
 708          */
 709 
 710         if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
 711                 struct vattr va;
 712                 va.va_mask = AT_SIZE;
 713                 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
 714                         VOP_RWUNLOCK(vp, rwflag, NULL);
 715                         goto out;
 716                 }
 717                 if (fileoff >= va.va_size) {
 718                         VOP_RWUNLOCK(vp, rwflag, NULL);
 719                         count = 0;
 720                         goto out;
 721                 } else {
 722                         VOP_RWUNLOCK(vp, rwflag, NULL);
 723                         error = EOVERFLOW;
 724                         goto out;
 725                 }
 726         }
 727         if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
 728                 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 729         }
 730         auio.uio_loffset = fileoff;
 731         auio.uio_iov = aiov;
 732         auio.uio_iovcnt = iovcnt;
 733         auio.uio_resid = bcount = count;
 734         auio.uio_segflg = UIO_USERSPACE;
 735         auio.uio_llimit = MAXOFFSET_T;
 736         auio.uio_fmode = fflag;
 737         if (bcount <= copyout_max_cached)
 738                 auio.uio_extflg = UIO_COPY_CACHED;
 739         else
 740                 auio.uio_extflg = UIO_COPY_DEFAULT;
 741 
 742 
 743         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 744 
 745         /* If read sync is not asked for, filter sync flags */
 746         if ((ioflag & FRSYNC) == 0)
 747                 ioflag &= ~(FSYNC|FDSYNC);
 748         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
 749         count -= auio.uio_resid;
 750         CPU_STATS_ENTER_K();
 751         cp = CPU;
 752         CPU_STATS_ADDQ(cp, sys, sysread, 1);
 753         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
 754         CPU_STATS_EXIT_K();
 755         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
 756 
 757         if (vp->v_type == VFIFO)     /* Backward compatibility */
 758                 fp->f_offset = count;
 759         else if (((fp->f_flag & FAPPEND) == 0) ||
 760             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 761                 fp->f_offset = auio.uio_loffset;
 762 
 763         VOP_RWUNLOCK(vp, rwflag, NULL);
 764 
 765         if (error == EINTR && count != 0)
 766                 error = 0;
 767 out:
 768         if (in_crit)
 769                 nbl_end_crit(vp);
 770         releasef(fdes);
 771         if (error)
 772                 return (set_errno(error));
 773         return (count);
 774 }
 775 
 776 ssize_t
 777 writev(int fdes, struct iovec *iovp, int iovcnt)
 778 {
 779         struct uio auio;
 780         struct iovec aiov[DEF_IOV_MAX];
 781         file_t *fp;
 782         register vnode_t *vp;
 783         struct cpu *cp;
 784         int fflag, ioflag, rwflag;
 785         ssize_t count, bcount;
 786         int error = 0;
 787         int i;
 788         u_offset_t fileoff;
 789         int in_crit = 0;
 790 
 791         if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
 792                 return (set_errno(EINVAL));
 793 
 794 #ifdef _SYSCALL32_IMPL
 795         /*
 796          * 32-bit callers need to have their iovec expanded,
 797          * while ensuring that they can't move more than 2Gbytes
 798          * of data in a single call.
 799          */
 800         if (get_udatamodel() == DATAMODEL_ILP32) {
 801                 struct iovec32 aiov32[DEF_IOV_MAX];
 802                 ssize32_t count32;
 803 
 804                 if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
 805                         return (set_errno(EFAULT));
 806 
 807                 count32 = 0;
 808                 for (i = 0; i < iovcnt; i++) {
 809                         ssize32_t iovlen = aiov32[i].iov_len;
 810                         count32 += iovlen;
 811                         if (iovlen < 0 || count32 < 0)
 812                                 return (set_errno(EINVAL));
 813                         aiov[i].iov_len = iovlen;
 814                         aiov[i].iov_base =
 815                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
 816                 }
 817         } else
 818 #endif
 819         if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
 820                 return (set_errno(EFAULT));
 821 
 822         count = 0;
 823         for (i = 0; i < iovcnt; i++) {
 824                 ssize_t iovlen = aiov[i].iov_len;
 825                 count += iovlen;
 826                 if (iovlen < 0 || count < 0)
 827                         return (set_errno(EINVAL));
 828         }
 829         if ((fp = getf(fdes)) == NULL)
 830                 return (set_errno(EBADF));
 831         if (((fflag = fp->f_flag) & FWRITE) == 0) {
 832                 error = EBADF;
 833                 goto out;
 834         }
 835         vp = fp->f_vnode;
 836         if (vp->v_type == VREG && count == 0) {
 837                 goto out;
 838         }
 839 
 840         rwflag = 1;
 841 
 842         /*
 843          * We have to enter the critical region before calling VOP_RWLOCK
 844          * to avoid a deadlock with ufs.
 845          */
 846         if (nbl_need_check(vp)) {
 847                 int svmand;
 848 
 849                 nbl_start_crit(vp, RW_READER);
 850                 in_crit = 1;
 851                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 852                 if (error != 0)
 853                         goto out;
 854                 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
 855                     NULL)) {
 856                         error = EACCES;
 857                         goto out;
 858                 }
 859         }
 860 
 861         (void) VOP_RWLOCK(vp, rwflag, NULL);
 862 
 863         fileoff = fp->f_offset;
 864 
 865         /*
 866          * Behaviour is same as write. Please see comments for write.
 867          */
 868 
 869         if (vp->v_type == VREG) {
 870                 if (fileoff >= curproc->p_fsz_ctl) {
 871                         VOP_RWUNLOCK(vp, rwflag, NULL);
 872                         mutex_enter(&curproc->p_lock);
 873                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 874                             curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
 875                         mutex_exit(&curproc->p_lock);
 876                         error = EFBIG;
 877                         goto out;
 878                 }
 879                 if (fileoff >= OFFSET_MAX(fp)) {
 880                         VOP_RWUNLOCK(vp, rwflag, NULL);
 881                         error = EFBIG;
 882                         goto out;
 883                 }
 884                 if (fileoff + count > OFFSET_MAX(fp))
 885                         count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
 886         }
 887         auio.uio_loffset = fileoff;
 888         auio.uio_iov = aiov;
 889         auio.uio_iovcnt = iovcnt;
 890         auio.uio_resid = bcount = count;
 891         auio.uio_segflg = UIO_USERSPACE;
 892         auio.uio_llimit = curproc->p_fsz_ctl;
 893         auio.uio_fmode = fflag;
 894         auio.uio_extflg = UIO_COPY_DEFAULT;
 895 
 896         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
 897 
 898         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
 899         count -= auio.uio_resid;
 900         CPU_STATS_ENTER_K();
 901         cp = CPU;
 902         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
 903         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
 904         CPU_STATS_EXIT_K();
 905         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
 906 
 907         if (vp->v_type == VFIFO)     /* Backward compatibility */
 908                 fp->f_offset = count;
 909         else if (((fp->f_flag & FAPPEND) == 0) ||
 910             (vp->v_type != VREG) || (bcount != 0))   /* POSIX */
 911                 fp->f_offset = auio.uio_loffset;
 912         VOP_RWUNLOCK(vp, rwflag, NULL);
 913 
 914         if (error == EINTR && count != 0)
 915                 error = 0;
 916 out:
 917         if (in_crit)
 918                 nbl_end_crit(vp);
 919         releasef(fdes);
 920         if (error)
 921                 return (set_errno(error));
 922         return (count);
 923 }
 924 
 925 ssize_t
 926 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 927     off_t extended_offset)
 928 {
 929         struct uio auio;
 930         struct iovec aiov[DEF_IOV_MAX];
 931         file_t *fp;
 932         register vnode_t *vp;
 933         struct cpu *cp;
 934         int fflag, ioflag, rwflag;
 935         ssize_t count, bcount;
 936         int error = 0;
 937         int i;
 938 
 939 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
 940         u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
 941             (u_offset_t)offset;
 942 #else /* _SYSCALL32_IMPL || _ILP32 */
 943         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
 944 #endif /* _SYSCALL32_IMPR || _ILP32 */
 945 #ifdef _SYSCALL32_IMPL
 946         const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
 947             extended_offset == 0?
 948             MAXOFF32_T : MAXOFFSET_T;
 949 #else /* _SYSCALL32_IMPL */
 950         const u_offset_t maxoff = MAXOFF32_T;
 951 #endif /* _SYSCALL32_IMPL */
 952 
 953         int in_crit = 0;
 954 
 955         if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
 956                 return (set_errno(EINVAL));
 957 
 958 #ifdef _SYSCALL32_IMPL
 959         /*
 960          * 32-bit callers need to have their iovec expanded,
 961          * while ensuring that they can't move more than 2Gbytes
 962          * of data in a single call.
 963          */
 964         if (get_udatamodel() == DATAMODEL_ILP32) {
 965                 struct iovec32 aiov32[DEF_IOV_MAX];
 966                 ssize32_t count32;
 967 
 968                 if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
 969                         return (set_errno(EFAULT));
 970 
 971                 count32 = 0;
 972                 for (i = 0; i < iovcnt; i++) {
 973                         ssize32_t iovlen32 = aiov32[i].iov_len;
 974                         count32 += iovlen32;
 975                         if (iovlen32 < 0 || count32 < 0)
 976                                 return (set_errno(EINVAL));
 977                         aiov[i].iov_len = iovlen32;
 978                         aiov[i].iov_base =
 979                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
 980                 }
 981         } else
 982 #endif /* _SYSCALL32_IMPL */
 983                 if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
 984                         return (set_errno(EFAULT));
 985 
 986         count = 0;
 987         for (i = 0; i < iovcnt; i++) {
 988                 ssize_t iovlen = aiov[i].iov_len;
 989                 count += iovlen;
 990                 if (iovlen < 0 || count < 0)
 991                         return (set_errno(EINVAL));
 992         }
 993 
 994         if ((bcount = (ssize_t)count) < 0)
 995                 return (set_errno(EINVAL));
 996         if ((fp = getf(fdes)) == NULL)
 997                 return (set_errno(EBADF));
 998         if (((fflag = fp->f_flag) & FREAD) == 0) {
 999                 error = EBADF;
1000                 goto out;
1001         }
1002         vp = fp->f_vnode;
1003         rwflag = 0;
1004         if (vp->v_type == VREG) {
1005 
1006                 if (bcount == 0)
1007                         goto out;
1008 
1009                 /*
1010                  * return EINVAL for offsets that cannot be
1011                  * represented in an off_t.
1012                  */
1013                 if (fileoff > maxoff) {
1014                         error = EINVAL;
1015                         goto out;
1016                 }
1017 
1018                 if (fileoff + bcount > maxoff)
1019                         bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1020         } else if (vp->v_type == VFIFO) {
1021                 error = ESPIPE;
1022                 goto out;
1023         }
1024         /*
1025          * We have to enter the critical region before calling VOP_RWLOCK
1026          * to avoid a deadlock with ufs.
1027          */
1028         if (nbl_need_check(vp)) {
1029                 int svmand;
1030 
1031                 nbl_start_crit(vp, RW_READER);
1032                 in_crit = 1;
1033                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1034                 if (error != 0)
1035                         goto out;
1036                 if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1037                     NULL)) {
1038                         error = EACCES;
1039                         goto out;
1040                 }
1041         }
1042 
1043         (void) VOP_RWLOCK(vp, rwflag, NULL);
1044 
1045         /*
1046          * Behaviour is same as read(2). Please see comments in
1047          * read(2).
1048          */
1049 
1050         if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
1051                 struct vattr va;
1052                 va.va_mask = AT_SIZE;
1053                 if ((error =
1054                     VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
1055                         VOP_RWUNLOCK(vp, rwflag, NULL);
1056                         goto out;
1057                 }
1058                 if (fileoff >= va.va_size) {
1059                         VOP_RWUNLOCK(vp, rwflag, NULL);
1060                         count = 0;
1061                         goto out;
1062                 } else {
1063                         VOP_RWUNLOCK(vp, rwflag, NULL);
1064                         error = EOVERFLOW;
1065                         goto out;
1066                 }
1067         }
1068         if ((vp->v_type == VREG) &&
1069             (fileoff + count > OFFSET_MAX(fp))) {
1070                 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1071         }
1072         auio.uio_loffset = fileoff;
1073         auio.uio_iov = aiov;
1074         auio.uio_iovcnt = iovcnt;
1075         auio.uio_resid = bcount = count;
1076         auio.uio_segflg = UIO_USERSPACE;
1077         auio.uio_llimit = MAXOFFSET_T;
1078         auio.uio_fmode = fflag;
1079         if (bcount <= copyout_max_cached)
1080                 auio.uio_extflg = UIO_COPY_CACHED;
1081         else
1082                 auio.uio_extflg = UIO_COPY_DEFAULT;
1083 
1084         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1085         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1086         count -= auio.uio_resid;
1087         CPU_STATS_ENTER_K();
1088         cp = CPU;
1089         CPU_STATS_ADDQ(cp, sys, sysread, 1);
1090         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1091         CPU_STATS_EXIT_K();
1092         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1093 
1094         VOP_RWUNLOCK(vp, rwflag, NULL);
1095 
1096         if (error == EINTR && count != 0)
1097                 error = 0;
1098 out:
1099         if (in_crit)
1100                 nbl_end_crit(vp);
1101         releasef(fdes);
1102         if (error)
1103                 return (set_errno(error));
1104         return (count);
1105 }
1106 
1107 ssize_t
1108 pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
1109     off_t extended_offset)
1110 {
1111         struct uio auio;
1112         struct iovec aiov[DEF_IOV_MAX];
1113         file_t *fp;
1114         register vnode_t *vp;
1115         struct cpu *cp;
1116         int fflag, ioflag, rwflag;
1117         ssize_t count, bcount;
1118         int error = 0;
1119         int i;
1120 
1121 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1122         u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1123             (u_offset_t)offset;
1124 #else /* _SYSCALL32_IMPL || _ILP32 */
1125         u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1126 #endif /* _SYSCALL32_IMPR || _ILP32 */
1127 #ifdef _SYSCALL32_IMPL
1128         const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
1129             extended_offset == 0?
1130             MAXOFF32_T : MAXOFFSET_T;
1131 #else /* _SYSCALL32_IMPL */
1132         const u_offset_t maxoff = MAXOFF32_T;
1133 #endif /* _SYSCALL32_IMPL */
1134 
1135         int in_crit = 0;
1136 
1137         if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
1138                 return (set_errno(EINVAL));
1139 
1140 #ifdef _SYSCALL32_IMPL
1141         /*
1142          * 32-bit callers need to have their iovec expanded,
1143          * while ensuring that they can't move more than 2Gbytes
1144          * of data in a single call.
1145          */
1146         if (get_udatamodel() == DATAMODEL_ILP32) {
1147                 struct iovec32 aiov32[DEF_IOV_MAX];
1148                 ssize32_t count32;
1149 
1150                 if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
1151                         return (set_errno(EFAULT));
1152 
1153                 count32 = 0;
1154                 for (i = 0; i < iovcnt; i++) {
1155                         ssize32_t iovlen32 = aiov32[i].iov_len;
1156                         count32 += iovlen32;
1157                         if (iovlen32 < 0 || count32 < 0)
1158                                 return (set_errno(EINVAL));
1159                         aiov[i].iov_len = iovlen32;
1160                         aiov[i].iov_base =
1161                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1162                 }
1163         } else
1164 #endif /* _SYSCALL32_IMPL */
1165                 if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
1166                         return (set_errno(EFAULT));
1167 
1168         count = 0;
1169         for (i = 0; i < iovcnt; i++) {
1170                 ssize_t iovlen = aiov[i].iov_len;
1171                 count += iovlen;
1172                 if (iovlen < 0 || count < 0)
1173                         return (set_errno(EINVAL));
1174         }
1175 
1176         if ((bcount = (ssize_t)count) < 0)
1177                 return (set_errno(EINVAL));
1178         if ((fp = getf(fdes)) == NULL)
1179                 return (set_errno(EBADF));
1180         if (((fflag = fp->f_flag) & FWRITE) == 0) {
1181                 error = EBADF;
1182                 goto out;
1183         }
1184         vp = fp->f_vnode;
1185         rwflag = 1;
1186         if (vp->v_type == VREG) {
1187 
1188                 if (bcount == 0)
1189                         goto out;
1190 
1191                 /*
1192                  * return EINVAL for offsets that cannot be
1193                  * represented in an off_t.
1194                  */
1195                 if (fileoff > maxoff) {
1196                         error = EINVAL;
1197                         goto out;
1198                 }
1199                 /*
1200                  * Take appropriate action if we are trying
1201                  * to write above the resource limit.
1202                  */
1203                 if (fileoff >= curproc->p_fsz_ctl) {
1204                         mutex_enter(&curproc->p_lock);
1205                         /*
1206                          * Return value ignored because it lists
1207                          * actions taken, but we are in an error case.
1208                          * We don't have any actions that depend on
1209                          * what could happen in this call, so we ignore
1210                          * the return value.
1211                          */
1212                         (void) rctl_action(
1213                             rctlproc_legacy[RLIMIT_FSIZE],
1214                             curproc->p_rctls, curproc,
1215                             RCA_UNSAFE_SIGINFO);
1216                         mutex_exit(&curproc->p_lock);
1217 
1218                         error = EFBIG;
1219                         goto out;
1220                 }
1221                 /*
1222                  * Don't allow pwritev to cause file sizes to exceed
1223                  * maxoff.
1224                  */
1225                 if (fileoff == maxoff) {
1226                         error = EFBIG;
1227                         goto out;
1228                 }
1229 
1230                 if (fileoff + bcount > maxoff)
1231                         bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1232         } else if (vp->v_type == VFIFO) {
1233                 error = ESPIPE;
1234                 goto out;
1235         }
1236         /*
1237          * We have to enter the critical region before calling VOP_RWLOCK
1238          * to avoid a deadlock with ufs.
1239          */
1240         if (nbl_need_check(vp)) {
1241                 int svmand;
1242 
1243                 nbl_start_crit(vp, RW_READER);
1244                 in_crit = 1;
1245                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1246                 if (error != 0)
1247                         goto out;
1248                 if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1249                     NULL)) {
1250                         error = EACCES;
1251                         goto out;
1252                 }
1253         }
1254 
1255         (void) VOP_RWLOCK(vp, rwflag, NULL);
1256 
1257 
1258         /*
1259          * Behaviour is same as write(2). Please see comments for
1260          * write(2).
1261          */
1262 
1263         if (vp->v_type == VREG) {
1264                 if (fileoff >= curproc->p_fsz_ctl) {
1265                         VOP_RWUNLOCK(vp, rwflag, NULL);
1266                         mutex_enter(&curproc->p_lock);
1267                         /* see above rctl_action comment */
1268                         (void) rctl_action(
1269                             rctlproc_legacy[RLIMIT_FSIZE],
1270                             curproc->p_rctls,
1271                             curproc, RCA_UNSAFE_SIGINFO);
1272                         mutex_exit(&curproc->p_lock);
1273                         error = EFBIG;
1274                         goto out;
1275                 }
1276                 if (fileoff >= OFFSET_MAX(fp)) {
1277                         VOP_RWUNLOCK(vp, rwflag, NULL);
1278                         error = EFBIG;
1279                         goto out;
1280                 }
1281                 if (fileoff + count > OFFSET_MAX(fp))
1282                         count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1283         }
1284 
1285         auio.uio_loffset = fileoff;
1286         auio.uio_iov = aiov;
1287         auio.uio_iovcnt = iovcnt;
1288         auio.uio_resid = bcount = count;
1289         auio.uio_segflg = UIO_USERSPACE;
1290         auio.uio_llimit = curproc->p_fsz_ctl;
1291         auio.uio_fmode = fflag;
1292         auio.uio_extflg = UIO_COPY_CACHED;
1293         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1294         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1295         count -= auio.uio_resid;
1296         CPU_STATS_ENTER_K();
1297         cp = CPU;
1298         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1299         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
1300         CPU_STATS_EXIT_K();
1301         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1302 
1303         VOP_RWUNLOCK(vp, rwflag, NULL);
1304 
1305         if (error == EINTR && count != 0)
1306                 error = 0;
1307 out:
1308         if (in_crit)
1309                 nbl_end_crit(vp);
1310         releasef(fdes);
1311         if (error)
1312                 return (set_errno(error));
1313         return (count);
1314 }
1315 
1316 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1317 
1318 /*
1319  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1320  */
1321 ssize32_t
1322 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1323     uint32_t offset_2)
1324 {
1325         struct uio auio;
1326         struct iovec aiov;
1327         file_t *fp;
1328         register vnode_t *vp;
1329         struct cpu *cp;
1330         int fflag, ioflag, rwflag;
1331         ssize_t bcount;
1332         int error = 0;
1333         u_offset_t fileoff;
1334         int in_crit = 0;
1335 
1336 #if defined(_LITTLE_ENDIAN)
1337         fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1338 #else
1339         fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1340 #endif
1341 
1342         if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1343                 return (set_errno(EINVAL));
1344 
1345         if ((fp = getf(fdes)) == NULL)
1346                 return (set_errno(EBADF));
1347         if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1348                 error = EBADF;
1349                 goto out;
1350         }
1351 
1352         rwflag = 0;
1353         vp = fp->f_vnode;
1354 
1355         if (vp->v_type == VREG) {
1356 
1357                 if (bcount == 0)
1358                         goto out;
1359 
1360                 /*
1361                  * Same as pread. See comments in pread.
1362                  */
1363 
1364                 if (fileoff > MAXOFFSET_T) {
1365                         error = EINVAL;
1366                         goto out;
1367                 }
1368                 if (fileoff + bcount > MAXOFFSET_T)
1369                         bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1370         } else if (vp->v_type == VFIFO) {
1371                 error = ESPIPE;
1372                 goto out;
1373         }
1374 
1375         /*
1376          * We have to enter the critical region before calling VOP_RWLOCK
1377          * to avoid a deadlock with ufs.
1378          */
1379         if (nbl_need_check(vp)) {
1380                 int svmand;
1381 
1382                 nbl_start_crit(vp, RW_READER);
1383                 in_crit = 1;
1384                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1385                 if (error != 0)
1386                         goto out;
1387                 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1388                     NULL)) {
1389                         error = EACCES;
1390                         goto out;
1391                 }
1392         }
1393 
1394         aiov.iov_base = cbuf;
1395         aiov.iov_len = bcount;
1396         (void) VOP_RWLOCK(vp, rwflag, NULL);
1397         auio.uio_loffset = fileoff;
1398 
1399         /*
1400          * Note: File size can never be greater than MAXOFFSET_T.
1401          * If ever we start supporting 128 bit files the code
1402          * similar to the one in pread at this place should be here.
1403          * Here we avoid the unnecessary VOP_GETATTR() when we
1404          * know that fileoff == MAXOFFSET_T implies that it is always
1405          * greater than or equal to file size.
1406          */
1407         auio.uio_iov = &aiov;
1408         auio.uio_iovcnt = 1;
1409         auio.uio_resid = bcount;
1410         auio.uio_segflg = UIO_USERSPACE;
1411         auio.uio_llimit = MAXOFFSET_T;
1412         auio.uio_fmode = fflag;
1413         auio.uio_extflg = UIO_COPY_CACHED;
1414 
1415         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1416 
1417         /* If read sync is not asked for, filter sync flags */
1418         if ((ioflag & FRSYNC) == 0)
1419                 ioflag &= ~(FSYNC|FDSYNC);
1420         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1421         bcount -= auio.uio_resid;
1422         CPU_STATS_ENTER_K();
1423         cp = CPU;
1424         CPU_STATS_ADDQ(cp, sys, sysread, 1);
1425         CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1426         CPU_STATS_EXIT_K();
1427         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1428         VOP_RWUNLOCK(vp, rwflag, NULL);
1429 
1430         if (error == EINTR && bcount != 0)
1431                 error = 0;
1432 out:
1433         if (in_crit)
1434                 nbl_end_crit(vp);
1435         releasef(fdes);
1436         if (error)
1437                 return (set_errno(error));
1438         return (bcount);
1439 }
1440 
1441 /*
1442  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1443  */
1444 ssize32_t
1445 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1446     uint32_t offset_2)
1447 {
1448         struct uio auio;
1449         struct iovec aiov;
1450         file_t *fp;
1451         register vnode_t *vp;
1452         struct cpu *cp;
1453         int fflag, ioflag, rwflag;
1454         ssize_t bcount;
1455         int error = 0;
1456         u_offset_t fileoff;
1457         int in_crit = 0;
1458 
1459 #if defined(_LITTLE_ENDIAN)
1460         fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1461 #else
1462         fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1463 #endif
1464 
1465         if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1466                 return (set_errno(EINVAL));
1467         if ((fp = getf(fdes)) == NULL)
1468                 return (set_errno(EBADF));
1469         if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1470                 error = EBADF;
1471                 goto out;
1472         }
1473 
1474         rwflag = 1;
1475         vp = fp->f_vnode;
1476 
1477         if (vp->v_type == VREG) {
1478 
1479                 if (bcount == 0)
1480                         goto out;
1481 
1482                 /*
1483                  * See comments in pwrite.
1484                  */
1485                 if (fileoff > MAXOFFSET_T) {
1486                         error = EINVAL;
1487                         goto out;
1488                 }
1489                 if (fileoff >= curproc->p_fsz_ctl) {
1490                         mutex_enter(&curproc->p_lock);
1491                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1492                             curproc->p_rctls, curproc, RCA_SAFE);
1493                         mutex_exit(&curproc->p_lock);
1494                         error = EFBIG;
1495                         goto out;
1496                 }
1497                 if (fileoff == MAXOFFSET_T) {
1498                         error = EFBIG;
1499                         goto out;
1500                 }
1501                 if (fileoff + bcount > MAXOFFSET_T)
1502                         bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1503         } else if (vp->v_type == VFIFO) {
1504                 error = ESPIPE;
1505                 goto out;
1506         }
1507 
1508         /*
1509          * We have to enter the critical region before calling VOP_RWLOCK
1510          * to avoid a deadlock with ufs.
1511          */
1512         if (nbl_need_check(vp)) {
1513                 int svmand;
1514 
1515                 nbl_start_crit(vp, RW_READER);
1516                 in_crit = 1;
1517                 error = nbl_svmand(vp, fp->f_cred, &svmand);
1518                 if (error != 0)
1519                         goto out;
1520                 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1521                     NULL)) {
1522                         error = EACCES;
1523                         goto out;
1524                 }
1525         }
1526 
1527         aiov.iov_base = cbuf;
1528         aiov.iov_len = bcount;
1529         (void) VOP_RWLOCK(vp, rwflag, NULL);
1530         auio.uio_loffset = fileoff;
1531         auio.uio_iov = &aiov;
1532         auio.uio_iovcnt = 1;
1533         auio.uio_resid = bcount;
1534         auio.uio_segflg = UIO_USERSPACE;
1535         auio.uio_llimit = curproc->p_fsz_ctl;
1536         auio.uio_fmode = fflag;
1537         auio.uio_extflg = UIO_COPY_CACHED;
1538 
1539         /*
1540          * The SUSv4 POSIX specification states:
1541          *      The pwrite() function shall be equivalent to write(), except
1542          *      that it writes into a given position and does not change
1543          *      the file offset (regardless of whether O_APPEND is set).
1544          * To make this be true, we omit the FAPPEND flag from ioflag.
1545          */
1546         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1547 
1548         error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1549         bcount -= auio.uio_resid;
1550         CPU_STATS_ENTER_K();
1551         cp = CPU;
1552         CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1553         CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1554         CPU_STATS_EXIT_K();
1555         ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1556         VOP_RWUNLOCK(vp, rwflag, NULL);
1557 
1558         if (error == EINTR && bcount != 0)
1559                 error = 0;
1560 out:
1561         if (in_crit)
1562                 nbl_end_crit(vp);
1563         releasef(fdes);
1564         if (error)
1565                 return (set_errno(error));
1566         return (bcount);
1567 }
1568 
1569 #endif  /* _SYSCALL32_IMPL || _ILP32 */
1570 
1571 #ifdef _SYSCALL32_IMPL
1572 /*
1573  * Tail-call elimination of xxx32() down to xxx()
1574  *
1575  * A number of xxx32 system calls take a len (or count) argument and
1576  * return a number in the range [0,len] or -1 on error.
1577  * Given an ssize32_t input len, the downcall xxx() will return
1578  * a 64-bit value that is -1 or in the range [0,len] which actually
1579  * is a proper return value for the xxx32 call. So even if the xxx32
1580  * calls can be considered as returning a ssize32_t, they are currently
1581  * declared as returning a ssize_t as this enables tail-call elimination.
1582  *
1583  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1584  * down negative input values as such and let the downcall handle error
1585  * reporting. Functions covered by this comment are:
1586  *
1587  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1588  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1589  * readlink.c:     readlink32.
1590  */
1591 
1592 ssize_t
1593 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1594 {
1595         return (read(fdes,
1596             (void *)(uintptr_t)cbuf, (ssize32_t)count));
1597 }
1598 
1599 ssize_t
1600 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1601 {
1602         return (write(fdes,
1603             (void *)(uintptr_t)cbuf, (ssize32_t)count));
1604 }
1605 
1606 ssize_t
1607 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1608 {
1609         return (pread(fdes,
1610             (void *)(uintptr_t)cbuf, (ssize32_t)count,
1611             (off_t)(uint32_t)offset));
1612 }
1613 
1614 ssize_t
1615 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1616 {
1617         return (pwrite(fdes,
1618             (void *)(uintptr_t)cbuf, (ssize32_t)count,
1619             (off_t)(uint32_t)offset));
1620 }
1621 
1622 ssize_t
1623 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1624 {
1625         return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1626 }
1627 
1628 ssize_t
1629 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1630 {
1631         return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1632 }
1633 #endif  /* _SYSCALL32_IMPL */