/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */
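
/*
 * UFS direct I/O: move data straight between the backing device and
 * the caller's buffer, bypassing the page cache.  Applications enable
 * it per-file with directio(3C) or per-filesystem with the
 * "forcedirectio" mount option.  A sketch of typical use (path and
 * size are illustrative; buffers and offsets must be sector aligned):
 *
 *	int fd = open("/ufs/file", O_RDWR);
 *	(void) directio(fd, DIRECTIO_ON);
 *	(void) pread(fd, buf, 8192, 0);
 */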

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>           /* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>            /* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

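/*
 * ufs_directio_zero_buf is a buffer of zeroes used by directio_hole()
 * to satisfy direct reads of holes; it is allocated once in
 * ufs_directio_init().
 */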
static void     *ufs_directio_zero_buf;
static int      ufs_directio_zero_len   = 8192;

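/*
 * Direct I/O can be disabled system-wide, e.g. via /etc/system:
 *	set ufs:ufs_directio_enabled = 0
 */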
int     ufs_directio_enabled = 1;       /* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
        kstat_named_t   logical_reads;
        kstat_named_t   phys_reads;
        kstat_named_t   hole_reads;
        kstat_named_t   nread;
        kstat_named_t   logical_writes;
        kstat_named_t   phys_writes;
        kstat_named_t   nwritten;
        kstat_named_t   nflushes;
} ufs_directio_kstats = {
        { "logical_reads",      KSTAT_DATA_UINT64 },
        { "phys_reads",         KSTAT_DATA_UINT64 },
        { "hole_reads",         KSTAT_DATA_UINT64 },
        { "nread",              KSTAT_DATA_UINT64 },
        { "logical_writes",     KSTAT_DATA_UINT64 },
        { "phys_writes",        KSTAT_DATA_UINT64 },
        { "nwritten",           KSTAT_DATA_UINT64 },
        { "nflushes",           KSTAT_DATA_UINT64 },
};

kstat_t *ufs_directio_kstatsp;
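
/*
 * These counters can be observed from userland with kstat(1M), e.g.:
 *	kstat -m ufs -n directio
 */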

/*
 * Use kmem_cache_create for direct-physio buffers.  This has shown
 * a better cache distribution than buffers on the stack.  It also
 * avoids semaphore construction/destruction per request.
 */
struct directio_buf {
        struct directio_buf     *next;
        char            *addr;
        size_t          nbytes;
        struct buf      buf;
};
static struct kmem_cache *directio_buf_cache;

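/*
 * Cache constructor/destructor: set up and tear down the buf(9S)
 * embedded in each directio_buf once per cache object instead of on
 * every request.
 */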
/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
        bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
        return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
        biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
        directio_buf_cache = kmem_cache_create("directio_buf_cache",
            sizeof (struct directio_buf), 0,
            directio_buf_constructor, directio_buf_destructor,
            NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
        /*
         * kstats
         */
        ufs_directio_kstatsp = kstat_create("ufs", 0,
            "directio", "ufs", KSTAT_TYPE_NAMED,
            sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
        if (ufs_directio_kstatsp) {
                ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
                kstat_install(ufs_directio_kstatsp);
        }
        /*
         * kzero is broken so we have to use a private buf of zeroes
         */
        ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
        directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
        buf_t   *bp;
        int     error;

        /*
         * Wait for IO to finish
         */
        bp = &dbp->buf;
        error = biowait(bp);

        /*
         * *bytes_iop will be used to figure out a resid
         * for the caller.  The resid is approximated by reporting
         * the bytes following the first failed IO as the residual.
         *
         * I am cautious about using b_resid because I
         * am not sure how well the disk drivers maintain it.
         */
        if (error) {
                if (bp->b_resid)
                        *bytes_iop = bp->b_bcount - bp->b_resid;
                else
                        *bytes_iop = 0;
        } else {
                *bytes_iop += bp->b_bcount;
        }
        /*
         * Release direct IO resources
         */
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
        kmem_cache_free(directio_buf_cache, dbp);
        return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */
static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
        int     error = 0, newerror;
        struct directio_buf     *dbp;

        /*
         * The linked list of directio buf structures is maintained
         * in reverse order (tail->last request->penultimate request->...)
         */
        while ((dbp = tail) != NULL) {
                tail = dbp->next;
                newerror = directio_wait_one(dbp, bytes_iop);
                if (error == 0)
                        error = newerror;
        }
        return (error);
}

/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
    offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
    struct directio_buf **tailp, page_t **pplist)
{
        buf_t *bp;
        struct directio_buf *dbp;

        /*
         * Allocate a directio buf header
         *   Note - list is maintained in reverse order.
         *   directio_wait_one() depends on this fact when
         *   adjusting the ``bytes_iop'' param.  bytes_iop
         *   is used to compute a residual in the case of error.
         */
        dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
        dbp->next = *tailp;
        *tailp = dbp;

        /*
         * Initialize buf header
         */
        dbp->addr = addr;
        dbp->nbytes = nbytes;
        bp = &dbp->buf;
        bp->b_edev = ip->i_dev;
        bp->b_lblkno = btodt(offset);
        bp->b_bcount = nbytes;
        bp->b_un.b_addr = addr;
        bp->b_proc = procp;
        bp->b_file = ip->i_vnode;

        /*
         * Note that S_WRITE implies B_READ and vice versa: a read(2)
         * will B_READ data from the filesystem and S_WRITE it into
         * the user's buffer; a write(2) will S_READ data from the
         * user's buffer and B_WRITE it to the filesystem.
         */
        if (rw == S_WRITE) {
                bp->b_flags = B_BUSY | B_PHYS | B_READ;
                ufs_directio_kstats.phys_reads.value.ui64++;
                ufs_directio_kstats.nread.value.ui64 += nbytes;
        } else {
                bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
                ufs_directio_kstats.phys_writes.value.ui64++;
                ufs_directio_kstats.nwritten.value.ui64 += nbytes;
        }
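        /*
         * Attach the shadow page list obtained from as_pagelock(),
         * if the caller supplied one.
         */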
        bp->b_shadow = pplist;
        if (pplist != NULL)
                bp->b_flags |= B_SHADOW;

        /*
         * Issue I/O request.
         */
        ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
        if (ufsvfsp->vfs_snapshot)
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        else
                (void) bdev_strategy(bp);

        if (rw == S_WRITE)
                lwp_stat_update(LWP_STAT_OUBLK, 1);
        else
                lwp_stat_update(LWP_STAT_INBLK, 1);
}

uint32_t        ufs_shared_writes;      /* writes done w/ lock shared */
uint32_t        ufs_cur_writes;         /* # concurrent writes */
uint32_t        ufs_maxcur_writes;      /* high water concurrent writes */
uint32_t        ufs_posix_hits;         /* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t        ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */
int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
    cred_t *cr, int *statusp)
{
        long            resid, bytes_written;
        u_offset_t      size, uoff;
        uio_t           *uio = arg_uio;
        rlim64_t        limit = uio->uio_llimit;
        int             on, n, error, newerror, len, has_holes;
        daddr_t         bn;
        size_t          nbytes;
        struct fs       *fs;
        vnode_t         *vp;
        iovec_t         *iov;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct proc     *procp;
        struct as       *as;
        struct directio_buf     *tail;
        int             exclusive, ncur, bmap_peek;
        uio_t           copy_uio;
        iovec_t         copy_iov;
        char            *copy_base;
        long            copy_resid;

        /*
         * assume that directio isn't possible (normal case)
         */
        *statusp = DIRECTIO_FAILURE;

        /*
         * Don't go direct
         */
        if (ufs_directio_enabled == 0)
                return (0);

        /*
         * mapped file; nevermind
         */
        if (ip->i_mapcnt)
                return (0);

        /*
         * CAN WE DO DIRECT IO?
         */
        uoff = uio->uio_loffset;
        resid = uio->uio_resid;

        /*
         * beyond limit
         */
        if (uoff + resid > limit)
                return (0);

        /*
         * must be sector aligned
         */
        if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
                return (0);

        /*
         * SHOULD WE DO DIRECT IO?
         */
        size = ip->i_size;
        has_holes = -1;

        /*
         * only on regular files; no metadata
         */
        if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
                return (0);

        /*
         * Synchronous, allocating writes run very slow in Direct-Mode
         *      XXX - can be fixed with bmap_write changes for large writes!!!
         *      XXX - can be fixed for updates to "almost-full" files
         *      XXX - WARNING - system hangs if bmap_write() has to
         *                      allocate lots of pages since pageout
         *                      suspends on locked inode
         */
        if (!rewrite && (ip->i_flag & ISYNC)) {
                if ((uoff + resid) > size)
                        return (0);
                has_holes = bmap_has_holes(ip);
                if (has_holes)
                        return (0);
        }

        /*
         * Each iovec must be short aligned and sector aligned.  If
         * one is not, then kmem_alloc a new buffer and copy all of
         * the smaller buffers into the new buffer.  This new
         * buffer will be short aligned and sector aligned.
         */
        iov = uio->uio_iov;
        nbytes = uio->uio_iovcnt;
        while (nbytes--) {
                if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
                    (intptr_t)(iov->iov_base) & 1) {
                        copy_resid = uio->uio_resid;
                        copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
                        if (copy_base == NULL)
                                return (0);
                        copy_iov.iov_base = copy_base;
                        copy_iov.iov_len = copy_resid;
                        copy_uio.uio_iov = &copy_iov;
                        copy_uio.uio_iovcnt = 1;
                        copy_uio.uio_segflg = UIO_SYSSPACE;
                        copy_uio.uio_extflg = UIO_COPY_DEFAULT;
                        copy_uio.uio_loffset = uio->uio_loffset;
                        copy_uio.uio_resid = uio->uio_resid;
                        copy_uio.uio_llimit = uio->uio_llimit;
                        error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
                        if (error) {
                                kmem_free(copy_base, copy_resid);
                                return (0);
                        }
                        uio = &copy_uio;
                        break;
                }
                iov++;
        }

        /*
         * From here on down, all error exits must go to errout and
         * not simply return a 0.
         */

        /*
         * DIRECTIO
         */

        fs = ip->i_fs;

        /*
         * POSIX check. If attempting a concurrent re-write, make sure
         * that this will be a single request to the driver to meet
         * POSIX synchronous data integrity requirements.
         */
        bmap_peek = 0;
        if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
                int upgrade = 0;

                /* check easy conditions first */
                if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
                        upgrade = 1;
                } else {
                        /* now look for contiguous allocation */
                        len = (ssize_t)blkroundup(fs, resid);
                        error = bmap_read(ip, uoff, &bn, &len);
                        if (error || bn == UFS_HOLE || len == 0)
                                goto errout;
                        /* save a call to bmap_read later */
                        bmap_peek = 1;
                        if (len < resid)
                                upgrade = 1;
                }
                if (upgrade) {
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                        ufs_posix_hits++;
                }
        }

        /*
         * allocate space
         */

        /*
         * If attempting a re-write, there is no allocation to do.
         * bmap_write would trip an ASSERT if i_contents is held shared.
         */
        if (rewrite)
                goto skip_alloc;

        do {
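                /*
                 * on is the offset of uoff within its filesystem
                 * block; n is the number of bytes of this request
                 * that fall in that block.
                 */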
                on = (int)blkoff(fs, uoff);
                n = (int)MIN(fs->fs_bsize - on, resid);
                if ((uoff + n) > ip->i_size) {
                        error = bmap_write(ip, uoff, (int)(on + n),
                            (int)(uoff & (offset_t)MAXBOFFSET) == 0,
                            NULL, cr);
                        /* Caller is responsible for updating i_seq if needed */
                        if (error)
                                break;
                        ip->i_size = uoff + n;
                        ip->i_flag |= IATTCHG;
                } else if (n == MAXBSIZE) {
                        error = bmap_write(ip, uoff, (int)(on + n),
                            BI_ALLOC_ONLY, NULL, cr);
                        /* Caller is responsible for updating i_seq if needed */
                } else {
                        if (has_holes < 0)
                                has_holes = bmap_has_holes(ip);
                        if (has_holes) {
                                uint_t  blk_size;
                                u_offset_t offset;

                                offset = uoff & (offset_t)fs->fs_bmask;
                                blk_size = (int)blksize(fs, ip,
                                    (daddr_t)lblkno(fs, offset));
                                error = bmap_write(ip, uoff, blk_size,
                                    BI_NORMAL, NULL, cr);
                                /*
                                 * Caller is responsible for updating
                                 * i_seq if needed
                                 */
                        } else
                                error = 0;
                }
                if (error)
                        break;
                uoff += n;
                resid -= n;
                /*
                 * if file has grown larger than 2GB, set flag
                 * in superblock if not already set
                 */
                if ((ip->i_size > MAXOFF32_T) &&
                    !(fs->fs_flags & FSLARGEFILES)) {
                        ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
                        mutex_enter(&ufsvfsp->vfs_lock);
                        fs->fs_flags |= FSLARGEFILES;
                        ufs_sbwrite(ufsvfsp);
                        mutex_exit(&ufsvfsp->vfs_lock);
                }
        } while (resid);

        if (error) {
                /*
                 * restore original state
                 */
                if (resid) {
                        if (size == ip->i_size)
                                goto errout;
                        (void) ufs_itrunc(ip, size, 0, cr);
                }
                /*
                 * try non-directio path
                 */
                goto errout;
        }
skip_alloc:

        /*
         * get rid of cached pages
         */
        vp = ITOV(ip);
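        /* note whether i_contents is already held as writer */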
        exclusive = rw_write_held(&ip->i_contents);
        if (vn_has_cached_data(vp)) {
                if (!exclusive) {
                        /*
                         * Still holding i_rwlock, so no allocations
                         * can happen after dropping contents.
                         */
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                }
                (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    B_INVAL, cr, NULL);
                if (vn_has_cached_data(vp))
                        goto errout;
                if (!exclusive)
                        rw_downgrade(&ip->i_contents);
                ufs_directio_kstats.nflushes.value.ui64++;
        }

        /*
         * Direct Writes
         */
        if (!exclusive) {
                ufs_shared_writes++;
                ncur = atomic_inc_32_nv(&ufs_cur_writes);
                if (ncur > ufs_maxcur_writes)
                        ufs_maxcur_writes = ncur;
        }

        /*
         * proc and as are for VM operations in directio_start()
         */
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                as = procp->p_as;
        } else {
                procp = NULL;
                as = &kas;
        }
        *statusp = DIRECTIO_SUCCESS;
        error = 0;
        newerror = 0;
        resid = uio->uio_resid;
        bytes_written = 0;
        ufs_directio_kstats.logical_writes.value.ui64++;
        while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
                size_t pglck_len, pglck_size;
                caddr_t pglck_base;
                page_t **pplist, **spplist;

                tail = NULL;

                /*
                 * Adjust number of bytes
                 */
                iov = uio->uio_iov;
                pglck_len = (size_t)MIN(iov->iov_len, resid);
                pglck_base = iov->iov_base;
                if (pglck_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }

                /*
                 * Try to lock down the largest chunk of pages possible.
                 */
                pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
                error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
                if (error)
                        break;

                pglck_size = pglck_len;
                while (pglck_len) {

                        nbytes = pglck_len;
                        uoff = uio->uio_loffset;

                        if (!bmap_peek) {
                                /*
                                 * Re-adjust number of bytes to contiguous
                                 * range. May have already called bmap_read
                                 * in the case of a concurrent rewrite.
                                 */
                                len = (ssize_t)blkroundup(fs, nbytes);
                                error = bmap_read(ip, uoff, &bn, &len);
                                if (error)
                                        break;
                                if (bn == UFS_HOLE || len == 0)
                                        break;
                        }
                        nbytes = (size_t)MIN(nbytes, len);
                        bmap_peek = 0;

                        /*
                         * Get the pagelist pointer for this offset to be
                         * passed to directio_start.
                         */
                        if (pplist != NULL)
                                spplist = pplist +
                                    btop((uintptr_t)iov->iov_base -
                                    ((uintptr_t)pglck_base & PAGEMASK));
                        else
                                spplist = NULL;

                        /*
                         * Kick off the direct write requests
                         */
                        directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
                            iov->iov_base, S_READ, procp, &tail, spplist);

                        /*
                         * Adjust pointers and counters
                         */
                        iov->iov_len -= nbytes;
                        iov->iov_base += nbytes;
                        uio->uio_loffset += nbytes;
                        resid -= nbytes;
                        pglck_len -= nbytes;
                }

                /*
                 * Wait for outstanding requests
                 */
                newerror = directio_wait(tail, &bytes_written);

                /*
                 * Release VM resources
                 */
                as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
        }

        if (!exclusive) {
                atomic_dec_32(&ufs_cur_writes);
                /*
                 * If this write was done shared, readers may
                 * have pulled in unmodified pages. Get rid of
                 * these potentially stale pages.
                 */
                if (vn_has_cached_data(vp)) {
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                        (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                            B_INVAL, cr, NULL);
                        ufs_directio_kstats.nflushes.value.ui64++;
                        rw_downgrade(&ip->i_contents);
                }
        }

        /*
         * If error, adjust resid to begin at the first
         * un-written byte.
         */
        if (error == 0)
                error = newerror;
        if (error)
                resid = uio->uio_resid - bytes_written;
        arg_uio->uio_resid = resid;

        if (!rewrite) {
                ip->i_flag |= IUPD | ICHG;
                /* Caller will update i_seq */
                TRANS_INODE(ip->i_ufsvfs, ip);
        }
        /*
         * If there is a residual, adjust the EOF if necessary
         */
        if (resid) {
                if (size != ip->i_size) {
                        if (uio->uio_loffset > size)
                                size = uio->uio_loffset;
                        (void) ufs_itrunc(ip, size, 0, cr);
                }
        }

        if (uio == &copy_uio)
                kmem_free(copy_base, copy_resid);

        return (error);

errout:
        if (uio == &copy_uio)
                kmem_free(copy_base, copy_resid);

        return (0);
}

/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
        int             error = 0, nzero;
        uio_t           phys_uio;
        iovec_t         phys_iov;

        ufs_directio_kstats.hole_reads.value.ui64++;
        ufs_directio_kstats.nread.value.ui64 += nbytes;

        phys_iov.iov_base = uio->uio_iov->iov_base;
        phys_iov.iov_len = nbytes;

        phys_uio.uio_iov = &phys_iov;
        phys_uio.uio_iovcnt = 1;
        phys_uio.uio_resid = phys_iov.iov_len;
        phys_uio.uio_segflg = uio->uio_segflg;
        phys_uio.uio_extflg = uio->uio_extflg;
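        /*
         * uiomove() advances phys_iov, so each pass copies out the
         * next chunk of zeroes, at most ufs_directio_zero_len bytes
         * at a time.
         */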
        while (error == 0 && phys_uio.uio_resid) {
                nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
                error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
                    &phys_uio);
        }
        return (error);
}

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
        ssize_t         resid, bytes_read;
        u_offset_t      size, uoff;
        int             error, newerror, len;
        size_t          nbytes;
        struct fs       *fs;
        vnode_t         *vp;
        daddr_t         bn;
        iovec_t         *iov;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct proc     *procp;
        struct as       *as;
        struct directio_buf     *tail;

        /*
         * assume that directio isn't possible (normal case)
         */
        *statusp = DIRECTIO_FAILURE;

        /*
         * Don't go direct
         */
        if (ufs_directio_enabled == 0)
                return (0);

        /*
         * mapped file; nevermind
         */
        if (ip->i_mapcnt)
                return (0);

        /*
         * CAN WE DO DIRECT IO?
         */
        /*
         * must be sector aligned
         */
        uoff = uio->uio_loffset;
        resid = uio->uio_resid;
        if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
                return (0);
        /*
         * must be short aligned and sector aligned
         */
        iov = uio->uio_iov;
        nbytes = uio->uio_iovcnt;
        while (nbytes--) {
                if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
                        return (0);
                if ((intptr_t)(iov++->iov_base) & 1)
                        return (0);
        }

        /*
         * DIRECTIO
         */
        fs = ip->i_fs;

        /*
         * don't read past EOF
         */
        size = ip->i_size;

        /*
         * The file offset is past EOF so bail out here; we don't want
         * to update uio_resid and make it look like we read something.
         * We say that direct I/O was a success to avoid having rdip()
         * go through the same "read past EOF" logic.
         */
        if (uoff >= size) {
                *statusp = DIRECTIO_SUCCESS;
                return (0);
        }

        /*
         * The read would extend past EOF so make it smaller.
         */
        if ((uoff + resid) > size) {
                resid = size - uoff;
                /*
                 * recheck sector alignment
                 */
                if (resid & (DEV_BSIZE - 1))
                        return (0);
        }

        /*
         * At this point, we know there is some real work to do.
         */
        ASSERT(resid);

        /*
         * get rid of cached pages
         */
        vp = ITOV(ip);
        if (vn_has_cached_data(vp)) {
                rw_exit(&ip->i_contents);
                rw_enter(&ip->i_contents, RW_WRITER);
                (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    B_INVAL, cr, NULL);
                if (vn_has_cached_data(vp))
                        return (0);
                rw_downgrade(&ip->i_contents);
                ufs_directio_kstats.nflushes.value.ui64++;
        }

        /*
         * Direct Reads
         */

        /*
         * proc and as are for VM operations in directio_start()
         */
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                as = procp->p_as;
        } else {
                procp = NULL;
                as = &kas;
        }

        *statusp = DIRECTIO_SUCCESS;
        error = 0;
        newerror = 0;
        bytes_read = 0;
        ufs_directio_kstats.logical_reads.value.ui64++;
        while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
                size_t pglck_len, pglck_size;
                caddr_t pglck_base;
                page_t **pplist, **spplist;

                tail = NULL;

                /*
                 * Adjust number of bytes
                 */
                iov = uio->uio_iov;
                pglck_len = (size_t)MIN(iov->iov_len, resid);
                pglck_base = iov->iov_base;
                if (pglck_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }

                /*
                 * Try to lock down the largest chunk of pages possible.
                 */
                pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
                error = as_pagelock(as, &pplist, pglck_base,
                    pglck_len, S_WRITE);
                if (error)
                        break;

                pglck_size = pglck_len;
                while (pglck_len) {

                        nbytes = pglck_len;
                        uoff = uio->uio_loffset;

                        /*
                         * Re-adjust number of bytes to contiguous range
                         */
                        len = (ssize_t)blkroundup(fs, nbytes);
                        error = bmap_read(ip, uoff, &bn, &len);
                        if (error)
                                break;

                        if (bn == UFS_HOLE) {
                                nbytes = (size_t)MIN(fs->fs_bsize -
                                    (long)blkoff(fs, uoff), nbytes);
                                error = directio_hole(uio, nbytes);
                                /*
                                 * Hole reads are not added to the list
                                 * processed by directio_wait() below so
                                 * account for bytes read here.
                                 */
                                if (!error)
                                        bytes_read += nbytes;
                        } else {
                                nbytes = (size_t)MIN(nbytes, len);

                                /*
                                 * Get the pagelist pointer for this offset
                                 * to be passed to directio_start.
                                 */
                                if (pplist != NULL)
                                        spplist = pplist +
                                            btop((uintptr_t)iov->iov_base -
                                            ((uintptr_t)pglck_base & PAGEMASK));
                                else
                                        spplist = NULL;

                                /*
                                 * Kick off the direct read requests
                                 */
                                directio_start(ufsvfsp, ip, nbytes,
                                    ldbtob(bn), iov->iov_base,
                                    S_WRITE, procp, &tail, spplist);
                        }

                        if (error)
                                break;

                        /*
                         * Adjust pointers and counters
                         */
                        iov->iov_len -= nbytes;
                        iov->iov_base += nbytes;
                        uio->uio_loffset += nbytes;
                        resid -= nbytes;
                        pglck_len -= nbytes;
                }

                /*
                 * Wait for outstanding requests
                 */
                newerror = directio_wait(tail, &bytes_read);

                /*
                 * Release VM resources
                 */
                as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
        }

        /*
         * If error, adjust resid to begin at the first
         * un-read byte.
         */
        if (error == 0)
                error = newerror;
        uio->uio_resid -= bytes_read;
        return (error);
}