/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>           /* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>            /* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void     *ufs_directio_zero_buf;
static int      ufs_directio_zero_len   = 8192;

int     ufs_directio_enabled = 1;       /* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
        kstat_named_t   logical_reads;
        kstat_named_t   phys_reads;
        kstat_named_t   hole_reads;
        kstat_named_t   nread;
        kstat_named_t   logical_writes;
        kstat_named_t   phys_writes;
        kstat_named_t   nwritten;
        kstat_named_t   nflushes;
} ufs_directio_kstats = {
        { "logical_reads",      KSTAT_DATA_UINT64 },
        { "phys_reads",         KSTAT_DATA_UINT64 },
        { "hole_reads",         KSTAT_DATA_UINT64 },
        { "nread",              KSTAT_DATA_UINT64 },
        { "logical_writes",     KSTAT_DATA_UINT64 },
        { "phys_writes",        KSTAT_DATA_UINT64 },
        { "nwritten",           KSTAT_DATA_UINT64 },
        { "nflushes",           KSTAT_DATA_UINT64 },
};

kstat_t *ufs_directio_kstatsp;
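
/*
 * These counters are exported as the named kstat "ufs:0:directio"
 * (see ufs_directio_init() below).  As an illustrative sketch only,
 * a userland consumer could read them with libkstat roughly like:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "ufs", 0, "directio");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "phys_writes");
 *		if (kn != NULL)
 *			(void) printf("phys_writes = %llu\n",
 *			    kn->value.ui64);
 *	}
 *	(void) kstat_close(kc);
 *
 * The same data is visible from the command line via
 * "kstat -m ufs -n directio".
 */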

/*
 * Use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction
 * per request.
 */
struct directio_buf {
        struct directio_buf     *next;
        char            *addr;
        size_t          nbytes;
        struct buf      buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
        bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
        return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
        biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
        directio_buf_cache = kmem_cache_create("directio_buf_cache",
            sizeof (struct directio_buf), 0,
            directio_buf_constructor, directio_buf_destructor,
            NULL, NULL, NULL, 0);
}
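
/*
 * With the constructor/destructor pair above, the embedded buf's
 * semaphores are set up once per cached object rather than once per
 * request; the typical request path is simply:
 *
 *	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
 *	... use dbp->buf for one I/O ...
 *	kmem_cache_free(directio_buf_cache, dbp);
 *
 * as done in directio_start() and directio_wait_one() below.
 */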

void
ufs_directio_init(void)
{
        /*
         * kstats
         */
        ufs_directio_kstatsp = kstat_create("ufs", 0,
            "directio", "ufs", KSTAT_TYPE_NAMED,
            sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
        if (ufs_directio_kstatsp) {
                ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
                kstat_install(ufs_directio_kstatsp);
        }
        /*
         * kzero is broken so we have to use a private buf of zeroes
         */
        ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
        directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
        buf_t   *bp;
        int     error;

        /*
         * Wait for IO to finish
         */
        bp = &dbp->buf;
        error = biowait(bp);

        /*
         * bytes_iop will be used to figure out a resid
         * for the caller. The resid is approximated by reporting
         * the bytes following the first failed IO as the residual.
         *
         * I am cautious about using b_resid because I
         * am not sure how well the disk drivers maintain it.
         */
        if (error)
                if (bp->b_resid)
                        *bytes_iop = bp->b_bcount - bp->b_resid;
                else
                        *bytes_iop = 0;
        else
                *bytes_iop += bp->b_bcount;
        /*
         * Release direct IO resources
         */
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
        kmem_cache_free(directio_buf_cache, dbp);
        return (error);
}
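
/*
 * Worked example of the residual approximation: suppose three 4K
 * requests A (lowest offset), B, C were queued, so the wait list is
 * walked C->B->A (newest first; see directio_wait()).  If C succeeds
 * (*bytes_iop += 4096), B then fails with b_resid == 0 (*bytes_iop
 * reset to 0), and A succeeds (*bytes_iop += 4096), the caller sees
 * 4096 bytes done, i.e. the resid begins at B, the first failed IO
 * by file offset.
 */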

/*
 * Wait for all of the direct IO operations to finish
 */

uint32_t        ufs_directio_drop_kpri = 0;     /* enable kpri hack */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
        int     error = 0, newerror;
        struct directio_buf     *dbp;
        uint_t  kpri_req_save;

        /*
         * The linked list of directio buf structures is maintained
         * in reverse order (tail->last request->penultimate request->...)
         */
        /*
         * This is the t_kpri_req hack. Large numbers of threads
         * sleeping with kernel priority will cause scheduler thrashing
         * on an MP machine. This can be seen running Oracle using
         * directio to ufs files. Sleep at normal priority here to
         * more closely mimic physio to a device partition. This
         * workaround is disabled by default as a niced thread could
         * be starved from running while holding i_rwlock and i_contents.
         */
        if (ufs_directio_drop_kpri) {
                kpri_req_save = curthread->t_kpri_req;
                curthread->t_kpri_req = 0;
        }
        while ((dbp = tail) != NULL) {
                tail = dbp->next;
                newerror = directio_wait_one(dbp, bytes_iop);
                if (error == 0)
                        error = newerror;
        }
        if (ufs_directio_drop_kpri)
                curthread->t_kpri_req = kpri_req_save;
        return (error);
}
/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
        offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
        struct directio_buf **tailp, page_t **pplist)
{
        buf_t *bp;
        struct directio_buf *dbp;

        /*
         * Allocate a directio buf header
         *   Note - list is maintained in reverse order.
         *   directio_wait_one() depends on this fact when
         *   adjusting the ``bytes_iop'' param. bytes_iop
         *   is used to compute a residual in the case of error.
         */
        dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
        dbp->next = *tailp;
        *tailp = dbp;

        /*
         * Initialize buf header
         */
        dbp->addr = addr;
        dbp->nbytes = nbytes;
        bp = &dbp->buf;
        bp->b_edev = ip->i_dev;
        bp->b_lblkno = btodt(offset);
        bp->b_bcount = nbytes;
        bp->b_un.b_addr = addr;
        bp->b_proc = procp;
        bp->b_file = ip->i_vnode;

        /*
         * Note that S_WRITE implies B_READ and vice versa: a read(2)
         * will B_READ data from the filesystem and S_WRITE it into
         * the user's buffer; a write(2) will S_READ data from the
         * user's buffer and B_WRITE it to the filesystem.
         */
        if (rw == S_WRITE) {
                bp->b_flags = B_BUSY | B_PHYS | B_READ;
                ufs_directio_kstats.phys_reads.value.ui64++;
                ufs_directio_kstats.nread.value.ui64 += nbytes;
        } else {
                bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
                ufs_directio_kstats.phys_writes.value.ui64++;
                ufs_directio_kstats.nwritten.value.ui64 += nbytes;
        }
        bp->b_shadow = pplist;
        if (pplist != NULL)
                bp->b_flags |= B_SHADOW;

        /*
         * Issue I/O request.
         */
        ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
        if (ufsvfsp->vfs_snapshot)
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        else
                (void) bdev_strategy(bp);

        if (rw == S_WRITE)
                lwp_stat_update(LWP_STAT_OUBLK, 1);
        else
                lwp_stat_update(LWP_STAT_INBLK, 1);
}

uint32_t        ufs_shared_writes;      /* writes done w/ lock shared */
uint32_t        ufs_cur_writes;         /* # concurrent writes */
uint32_t        ufs_maxcur_writes;      /* high water concurrent writes */
uint32_t        ufs_posix_hits;         /* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t        ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */

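/*
 * Overview of ufs_directio_write(): (1) cheap eligibility checks
 * (feature enabled, file not mmapped, within the resource limit,
 * sector-aligned offset and length, regular non-quota file); (2) if
 * any iovec is misaligned, consolidate the request into a single
 * aligned kernel buffer; (3) for synchronous (FDSYNC) re-writes,
 * upgrade i_contents to writer unless the transfer is a single
 * contiguous request; (4) allocate backing blocks with bmap_write()
 * (skipped for re-writes); (5) invalidate any cached pages; (6) lock
 * down the user pages with as_pagelock(), issue one physio per
 * contiguous extent via directio_start(), and reap the requests with
 * directio_wait().
 */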
int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
        cred_t *cr, int *statusp)
{
        long            resid, bytes_written;
        u_offset_t      size, uoff;
        uio_t           *uio = arg_uio;
        rlim64_t        limit = uio->uio_llimit;
        int             on, n, error, newerror, len, has_holes;
        daddr_t         bn;
        size_t          nbytes;
        struct fs       *fs;
        vnode_t         *vp;
        iovec_t         *iov;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct proc     *procp;
        struct as       *as;
        struct directio_buf     *tail;
        int             exclusive, ncur, bmap_peek;
        uio_t           copy_uio;
        iovec_t         copy_iov;
        char            *copy_base;
        long            copy_resid;

        /*
         * assume that directio isn't possible (normal case)
         */
        *statusp = DIRECTIO_FAILURE;

        /*
         * Don't go direct
         */
        if (ufs_directio_enabled == 0)
                return (0);

        /*
         * mapped file; nevermind
         */
        if (ip->i_mapcnt)
                return (0);

        /*
         * CAN WE DO DIRECT IO?
         */
        uoff = uio->uio_loffset;
        resid = uio->uio_resid;

        /*
         * beyond limit
         */
        if (uoff + resid > limit)
                return (0);

        /*
         * must be sector aligned
         */
        if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
                return (0);
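
        /*
         * With DEV_BSIZE of 512, for example, an 8K write at offset
         * 1024 passes the check above, while the same write at offset
         * 1000 (or a 1000-byte write at any offset) fails it and stays
         * on the buffered path.
         */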

        /*
         * SHOULD WE DO DIRECT IO?
         */
        size = ip->i_size;
        has_holes = -1;

        /*
         * only on regular files; no metadata
         */
        if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
                return (0);

        /*
         * Synchronous, allocating writes run very slowly in Direct-Mode
         *      XXX - can be fixed with bmap_write changes for large writes!!!
         *      XXX - can be fixed for updates to "almost-full" files
         *      XXX - WARNING - system hangs if bmap_write() has to
         *                      allocate lots of pages since pageout
         *                      suspends on locked inode
         */
        if (!rewrite && (ip->i_flag & ISYNC)) {
                if ((uoff + resid) > size)
                        return (0);
                has_holes = bmap_has_holes(ip);
                if (has_holes)
                        return (0);
        }

        /*
         * Each iovec must be short aligned and sector aligned.  If
         * one is not, then kmem_alloc a new buffer and copy all of
         * the smaller buffers into the new buffer.  This new
         * buffer will be short aligned and sector aligned.
         */
        iov = uio->uio_iov;
        nbytes = uio->uio_iovcnt;
        while (nbytes--) {
                if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
                    (intptr_t)(iov->iov_base) & 1) {
                        copy_resid = uio->uio_resid;
                        copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
                        if (copy_base == NULL)
                                return (0);
                        copy_iov.iov_base = copy_base;
                        copy_iov.iov_len = copy_resid;
                        copy_uio.uio_iov = &copy_iov;
                        copy_uio.uio_iovcnt = 1;
                        copy_uio.uio_segflg = UIO_SYSSPACE;
                        copy_uio.uio_extflg = UIO_COPY_DEFAULT;
                        copy_uio.uio_loffset = uio->uio_loffset;
                        copy_uio.uio_resid = uio->uio_resid;
                        copy_uio.uio_llimit = uio->uio_llimit;
                        error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
                        if (error) {
                                kmem_free(copy_base, copy_resid);
                                return (0);
                        }
                        uio = &copy_uio;
                        break;
                }
                iov++;
        }
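
        /*
         * For example, a writev(2) of two 512-byte iovecs where the
         * second base address is odd fails the check above; both
         * iovecs are then uiomove()d into a single aligned 1024-byte
         * kernel buffer, and copy_uio describes that buffer for the
         * rest of this function.
         */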

        /*
         * From here on down, all error exits must go to errout and
         * not simply return a 0.
         */

        /*
         * DIRECTIO
         */

        fs = ip->i_fs;

        /*
         * POSIX check. If attempting a concurrent re-write, make sure
         * that this will be a single request to the driver to meet
         * POSIX synchronous data integrity requirements.
         */
        bmap_peek = 0;
        if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
                int upgrade = 0;

                /* check easy conditions first */
                if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
                        upgrade = 1;
                } else {
                        /* now look for contiguous allocation */
                        len = (ssize_t)blkroundup(fs, resid);
                        error = bmap_read(ip, uoff, &bn, &len);
                        if (error || bn == UFS_HOLE || len == 0)
                                goto errout;
                        /* save a call to bmap_read later */
                        bmap_peek = 1;
                        if (len < resid)
                                upgrade = 1;
                }
                if (upgrade) {
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                        ufs_posix_hits++;
                }
        }

        /*
         * allocate space
         */

        /*
         * If attempting a re-write, there is no allocation to do.
         * bmap_write would trip an ASSERT if i_contents is held shared.
         */
        if (rewrite)
                goto skip_alloc;

        do {
                on = (int)blkoff(fs, uoff);
                n = (int)MIN(fs->fs_bsize - on, resid);
                if ((uoff + n) > ip->i_size) {
                        error = bmap_write(ip, uoff, (int)(on + n),
                            (int)(uoff & (offset_t)MAXBOFFSET) == 0,
                            NULL, cr);
                        /* Caller is responsible for updating i_seq if needed */
                        if (error)
                                break;
                        ip->i_size = uoff + n;
                        ip->i_flag |= IATTCHG;
                } else if (n == MAXBSIZE) {
                        error = bmap_write(ip, uoff, (int)(on + n),
                            BI_ALLOC_ONLY, NULL, cr);
                        /* Caller is responsible for updating i_seq if needed */
                } else {
                        if (has_holes < 0)
                                has_holes = bmap_has_holes(ip);
                        if (has_holes) {
                                uint_t  blk_size;
                                u_offset_t offset;

                                offset = uoff & (offset_t)fs->fs_bmask;
                                blk_size = (int)blksize(fs, ip,
                                    (daddr_t)lblkno(fs, offset));
                                error = bmap_write(ip, uoff, blk_size,
                                    BI_NORMAL, NULL, cr);
                                /*
                                 * Caller is responsible for updating
                                 * i_seq if needed
                                 */
                        } else
                                error = 0;
                }
                if (error)
                        break;
                uoff += n;
                resid -= n;
                /*
                 * if file has grown larger than 2GB, set flag
                 * in superblock if not already set
                 */
                if ((ip->i_size > MAXOFF32_T) &&
                    !(fs->fs_flags & FSLARGEFILES)) {
                        ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
                        mutex_enter(&ufsvfsp->vfs_lock);
                        fs->fs_flags |= FSLARGEFILES;
                        ufs_sbwrite(ufsvfsp);
                        mutex_exit(&ufsvfsp->vfs_lock);
                }
        } while (resid);

        if (error) {
                /*
                 * restore original state
                 */
                if (resid) {
                        if (size == ip->i_size)
                                goto errout;
                        (void) ufs_itrunc(ip, size, 0, cr);
                }
                /*
                 * try non-directio path
                 */
                goto errout;
        }
skip_alloc:

        /*
         * get rid of cached pages
         */
        vp = ITOV(ip);
        exclusive = rw_write_held(&ip->i_contents);
        if (vn_has_cached_data(vp)) {
                if (!exclusive) {
                        /*
                         * Still holding i_rwlock, so no allocations
                         * can happen after dropping contents.
                         */
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                }
                (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    B_INVAL, cr, NULL);
                if (vn_has_cached_data(vp))
                        goto errout;
                if (!exclusive)
                        rw_downgrade(&ip->i_contents);
                ufs_directio_kstats.nflushes.value.ui64++;
        }

        /*
         * Direct Writes
         */

        if (!exclusive) {
                ufs_shared_writes++;
                ncur = atomic_inc_32_nv(&ufs_cur_writes);
                if (ncur > ufs_maxcur_writes)
                        ufs_maxcur_writes = ncur;
        }

        /*
         * proc and as are for VM operations in directio_start()
         */
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                as = procp->p_as;
        } else {
                procp = NULL;
                as = &kas;
        }
        *statusp = DIRECTIO_SUCCESS;
        error = 0;
        newerror = 0;
        resid = uio->uio_resid;
        bytes_written = 0;
        ufs_directio_kstats.logical_writes.value.ui64++;
        while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
                size_t pglck_len, pglck_size;
                caddr_t pglck_base;
                page_t **pplist, **spplist;

                tail = NULL;

                /*
                 * Adjust number of bytes
                 */
                iov = uio->uio_iov;
                pglck_len = (size_t)MIN(iov->iov_len, resid);
                pglck_base = iov->iov_base;
                if (pglck_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }

                /*
                 * Try to lock down the largest chunk of pages possible.
                 */
                pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
                error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

                if (error)
                        break;

                pglck_size = pglck_len;
                while (pglck_len) {

                        nbytes = pglck_len;
                        uoff = uio->uio_loffset;

                        if (!bmap_peek) {

                                /*
                                 * Re-adjust number of bytes to contiguous
                                 * range. May have already called bmap_read
                                 * in the case of a concurrent rewrite.
                                 */
                                len = (ssize_t)blkroundup(fs, nbytes);
                                error = bmap_read(ip, uoff, &bn, &len);
                                if (error)
                                        break;
                                if (bn == UFS_HOLE || len == 0)
                                        break;
                        }
                        nbytes = (size_t)MIN(nbytes, len);
                        bmap_peek = 0;

                        /*
                         * Get the pagelist pointer for this offset to be
                         * passed to directio_start.
                         */

                        if (pplist != NULL)
                                spplist = pplist +
                                    btop((uintptr_t)iov->iov_base -
                                    ((uintptr_t)pglck_base & PAGEMASK));
                        else
                                spplist = NULL;
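
                        /*
                         * Example of the arithmetic above, assuming 8K
                         * pages: if pglck_base is 0x10400 (so the locked
                         * region starts in page 0x10000) and iov->iov_base
                         * has advanced to 0x14400, spplist points
                         * btop(0x14400 - 0x10000) == 2 entries into
                         * pplist, the page_t for the page containing the
                         * current user address.
                         */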

                        /*
                         * Kick off the direct write requests
                         */
                        directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
                            iov->iov_base, S_READ, procp, &tail, spplist);

                        /*
                         * Adjust pointers and counters
                         */
                        iov->iov_len -= nbytes;
                        iov->iov_base += nbytes;
                        uio->uio_loffset += nbytes;
                        resid -= nbytes;
                        pglck_len -= nbytes;
                }

                /*
                 * Wait for outstanding requests
                 */
                newerror = directio_wait(tail, &bytes_written);

                /*
                 * Release VM resources
                 */
                as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
        }

        if (!exclusive) {
                atomic_dec_32(&ufs_cur_writes);
                /*
                 * If this write was done shared, readers may
                 * have pulled in unmodified pages. Get rid of
                 * these potentially stale pages.
                 */
                if (vn_has_cached_data(vp)) {
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                        (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                            B_INVAL, cr, NULL);
                        ufs_directio_kstats.nflushes.value.ui64++;
                        rw_downgrade(&ip->i_contents);
                }
        }

        /*
         * If error, adjust resid to begin at the first
         * un-writable byte.
         */
        if (error == 0)
                error = newerror;
        if (error)
                resid = uio->uio_resid - bytes_written;
        arg_uio->uio_resid = resid;

        if (!rewrite) {
                ip->i_flag |= IUPD | ICHG;
                /* Caller will update i_seq */
                TRANS_INODE(ip->i_ufsvfs, ip);
        }
        /*
         * If there is a residual, adjust the EOF if necessary
         */
        if (resid) {
                if (size != ip->i_size) {
                        if (uio->uio_loffset > size)
                                size = uio->uio_loffset;
                        (void) ufs_itrunc(ip, size, 0, cr);
                }
        }

        if (uio == &copy_uio)
                kmem_free(copy_base, copy_resid);

        return (error);

errout:
        if (uio == &copy_uio)
                kmem_free(copy_base, copy_resid);

        return (0);
}
/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
        int             error = 0, nzero;
        uio_t           phys_uio;
        iovec_t         phys_iov;

        ufs_directio_kstats.hole_reads.value.ui64++;
        ufs_directio_kstats.nread.value.ui64 += nbytes;

        phys_iov.iov_base = uio->uio_iov->iov_base;
        phys_iov.iov_len = nbytes;

        phys_uio.uio_iov = &phys_iov;
        phys_uio.uio_iovcnt = 1;
        phys_uio.uio_resid = phys_iov.iov_len;
        phys_uio.uio_segflg = uio->uio_segflg;
        phys_uio.uio_extflg = uio->uio_extflg;
        while (error == 0 && phys_uio.uio_resid) {
                nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
                error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
                    &phys_uio);
        }
        return (error);
}
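
/*
 * Note that directio_hole() copies from the shared buffer of zeroes
 * in passes of at most ufs_directio_zero_len bytes.  With the default
 * 8K length and an 8K filesystem block size, a hole read is satisfied
 * by a single uiomove(); if the tunable were lowered to, say, 4096, a
 * full-block hole would take two passes of 4096 bytes each.
 */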

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
        ssize_t         resid, bytes_read;
        u_offset_t      size, uoff;
        int             error, newerror, len;
        size_t          nbytes;
        struct fs       *fs;
        vnode_t         *vp;
        daddr_t         bn;
        iovec_t         *iov;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct proc     *procp;
        struct as       *as;
        struct directio_buf     *tail;

        /*
         * assume that directio isn't possible (normal case)
         */
        *statusp = DIRECTIO_FAILURE;

        /*
         * Don't go direct
         */
        if (ufs_directio_enabled == 0)
                return (0);

        /*
         * mapped file; nevermind
         */
        if (ip->i_mapcnt)
                return (0);

        /*
         * CAN WE DO DIRECT IO?
         */
        /*
         * must be sector aligned
         */
        uoff = uio->uio_loffset;
        resid = uio->uio_resid;
        if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
                return (0);
        /*
         * must be short aligned and sector aligned
         */
        iov = uio->uio_iov;
        nbytes = uio->uio_iovcnt;
        while (nbytes--) {
                if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
                        return (0);
                if ((intptr_t)(iov++->iov_base) & 1)
                        return (0);
        }

        /*
         * DIRECTIO
         */
        fs = ip->i_fs;

        /*
         * don't read past EOF
         */
        size = ip->i_size;

        /*
         * The file offset is past EOF so bail out here; we don't want
         * to update uio_resid and make it look like we read something.
         * We say that direct I/O was a success to avoid having rdip()
         * go through the same "read past EOF logic".
         */
        if (uoff >= size) {
                *statusp = DIRECTIO_SUCCESS;
                return (0);
        }

        /*
         * The read would extend past EOF so make it smaller.
         */
        if ((uoff + resid) > size) {
                resid = size - uoff;
                /*
                 * recheck sector alignment
                 */
                if (resid & (DEV_BSIZE - 1))
                        return (0);
        }
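
        /*
         * E.g., with i_size 10240, a 4K read at offset 8192 is trimmed
         * to a resid of 2048 and proceeds; with i_size 10000 the
         * trimmed resid of 1808 is no longer sector aligned, so the
         * read falls back to the buffered path.
         */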

        /*
         * At this point, we know there is some real work to do.
         */
        ASSERT(resid);

        /*
         * get rid of cached pages
         */
        vp = ITOV(ip);
        if (vn_has_cached_data(vp)) {
                rw_exit(&ip->i_contents);
                rw_enter(&ip->i_contents, RW_WRITER);
                (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    B_INVAL, cr, NULL);
                if (vn_has_cached_data(vp))
                        return (0);
                rw_downgrade(&ip->i_contents);
                ufs_directio_kstats.nflushes.value.ui64++;
        }
        /*
         * Direct Reads
         */

        /*
         * proc and as are for VM operations in directio_start()
         */
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                as = procp->p_as;
        } else {
                procp = NULL;
                as = &kas;
        }

        *statusp = DIRECTIO_SUCCESS;
        error = 0;
        newerror = 0;
        bytes_read = 0;
        ufs_directio_kstats.logical_reads.value.ui64++;
        while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
                size_t pglck_len, pglck_size;
                caddr_t pglck_base;
                page_t **pplist, **spplist;

                tail = NULL;

                /*
                 * Adjust number of bytes
                 */
                iov = uio->uio_iov;
                pglck_len = (size_t)MIN(iov->iov_len, resid);
                pglck_base = iov->iov_base;
                if (pglck_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }

                /*
                 * Try to lock down the largest chunk of pages possible.
                 */
                pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
                error = as_pagelock(as, &pplist, pglck_base,
                    pglck_len, S_WRITE);

                if (error)
                        break;

                pglck_size = pglck_len;
                while (pglck_len) {

                        nbytes = pglck_len;
                        uoff = uio->uio_loffset;

                        /*
                         * Re-adjust number of bytes to contiguous range
                         */
                        len = (ssize_t)blkroundup(fs, nbytes);
                        error = bmap_read(ip, uoff, &bn, &len);
                        if (error)
                                break;

                        if (bn == UFS_HOLE) {
                                nbytes = (size_t)MIN(fs->fs_bsize -
                                    (long)blkoff(fs, uoff), nbytes);
                                error = directio_hole(uio, nbytes);
                                /*
                                 * Hole reads are not added to the list
                                 * processed by directio_wait() below so
                                 * account for bytes read here.
                                 */
                                if (!error)
                                        bytes_read += nbytes;
                        } else {
                                nbytes = (size_t)MIN(nbytes, len);

                                /*
                                 * Get the pagelist pointer for this offset
                                 * to be passed to directio_start.
                                 */
                                if (pplist != NULL)
                                        spplist = pplist +
                                            btop((uintptr_t)iov->iov_base -
                                            ((uintptr_t)pglck_base & PAGEMASK));
                                else
                                        spplist = NULL;

                                /*
                                 * Kick off the direct read requests
                                 */
                                directio_start(ufsvfsp, ip, nbytes,
                                    ldbtob(bn), iov->iov_base,
                                    S_WRITE, procp, &tail, spplist);
                        }

                        if (error)
                                break;

                        /*
                         * Adjust pointers and counters
                         */
                        iov->iov_len -= nbytes;
                        iov->iov_base += nbytes;
                        uio->uio_loffset += nbytes;
                        resid -= nbytes;
                        pglck_len -= nbytes;
                }

                /*
                 * Wait for outstanding requests
                 */
                newerror = directio_wait(tail, &bytes_read);
                /*
                 * Release VM resources
                 */
                as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
        }

        /*
         * If error, adjust resid to begin at the first
         * un-read byte.
         */
        if (error == 0)
                error = newerror;
        uio->uio_resid -= bytes_read;
        return (error);
}